In [3]:
import pandas as pd
# Load the data file
file_path = '/Users/anirudhravipudi/Desktop/AI/Practice/ml-100k/u.data'  # <- replace with your actual path
column_names = ['user_id', 'item_id', 'rating', 'timestamp']

ratings_df = pd.read_csv(file_path, sep='\t', names=column_names)

# Check it out
print(ratings_df.head())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [7]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... done
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-macosx_11_0_arm64.whl size=485003 sha256=005e52d560da76d5b5baf0f5a04d66e58a9bc52439eeba1b9fc0a05e3af75d47
  Stored in directory: /Users/anirudhravipudi/Library/Caches/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [9]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

file_path = '/Users/anirudhravipudi/Desktop/AI/Practice/ml-100k/u.data'
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)

# Step 2: Train/Test Split
trainset, testset = train_test_split(data, test_size=0.25)

# Step 3: User-based Collaborative Filtering using Cosine Similarity
sim_options = {
    'name': 'cosine',
    'user_based': True
}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# Step 4: Evaluate on test set
predictions = algo.test(testset)
accuracy.rmse(predictions)

# Step 5: Predict a rating
user_id = str(196)  # User IDs must be strings for surprise
item_id = str(302)  # Same with item IDs
pred = algo.predict(uid=user_id, iid=item_id)
print(f"\nPredicted rating for User {user_id} on Movie {item_id}: {pred.est:.2f}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0219

Predicted rating for User 196 on Movie 302: 4.20


In [13]:
# Load movie titles from u.item
file_path = '/Users/anirudhravipudi/Desktop/AI/Practice/ml-100k/u.data'  # Update path
movie_titles = {}
movie_file = '/Users/anirudhravipudi/Desktop/AI/Practice/ml-100k/u.item'
with open(movie_file, encoding='ISO-8859-1') as f:
    for line in f:
        parts = line.strip().split('|')
        movie_id = parts[0]
        title = parts[1]
        movie_titles[movie_id] = title

# Test it
print(movie_titles['302'])  # Title for movie ID 302

L.A. Confidential (1997)


In [15]:
from collections import defaultdict

def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

# Build full model and anti-testset
trainset = data.build_full_trainset()
algo.fit(trainset)
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
top_n = get_top_n(predictions, n=5)

# Display recommendations for user 196 with movie titles
print(f"\n🎬 Top 5 Recommendations for User 196:")
for movie_id, score in top_n['196']:
    title = movie_titles.get(movie_id, "Unknown Title")
    print(f"{title} (Movie ID {movie_id}) — Predicted Rating: {score:.2f}")

Computing the cosine similarity matrix...
Done computing similarity matrix.

🎬 Top 5 Recommendations for User 196:
Prefontaine (1997) (Movie ID 1189) — Predicted Rating: 5.00
Santa with Muscles (1996) (Movie ID 1500) — Predicted Rating: 5.00
Great Day in Harlem, A (1994) (Movie ID 814) — Predicted Rating: 5.00
Aiqing wansui (1994) (Movie ID 1536) — Predicted Rating: 5.00
Star Kid (1997) (Movie ID 1293) — Predicted Rating: 5.00


In [17]:
from surprise import KNNBasic
from surprise import accuracy

# Set up Item-Based CF with Cosine
sim_options = {
    'name': 'cosine',
    'user_based': False
}
algo_item_cosine = KNNBasic(sim_options=sim_options)
algo_item_cosine.fit(trainset)

# Predict & evaluate
predictions_item_cosine = algo_item_cosine.test(testset)
rmse_item_cosine = accuracy.rmse(predictions_item_cosine)

# Predict rating for User 196 on Movie 302
pred_item_cosine = algo_item_cosine.predict(uid='196', iid='302')
print(f"Predicted rating [Item-Based, Cosine]: {pred_item_cosine.est:.2f}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.4893
Predicted rating [Item-Based, Cosine]: 3.61


In [19]:
# Set up User-Based CF with Pearson
sim_options = {
    'name': 'pearson',
    'user_based': True
}
algo_user_pearson = KNNBasic(sim_options=sim_options)
algo_user_pearson.fit(trainset)

# Predict & evaluate
predictions_user_pearson = algo_user_pearson.test(testset)
rmse_user_pearson = accuracy.rmse(predictions_user_pearson)

# Predict rating for User 196 on Movie 302
pred_user_pearson = algo_user_pearson.predict(uid='196', iid='302')
print(f"Predicted rating [User-Based, Pearson]: {pred_user_pearson.est:.2f}")

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9442
Predicted rating [User-Based, Pearson]: 3.93
