# Importing Libraries

In [52]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split as sklearn_split
from surprise.model_selection import train_test_split as surprise_split
from surprise import Dataset, Reader, SVD
from collections import defaultdict

# Load Data

In [53]:
ratings = pd.read_csv("ml-100k/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
movies = pd.read_csv("ml-100k/u.item", sep="|", encoding="latin-1", header=None, usecols=[0, 1], names=["item_id", "title"])

# User-based Collaborative Filtering

## Train-Test Split (User-wise)

In [54]:
def train_test_split_userwise(df, test_size=0.2):
    train_list = []
    test_list = []
    for user_id, group in df.groupby("user_id"):
        if len(group) >= 5: # split if user rates 5 or more movies (to be meaningful)
            train, test = sklearn_split(group, test_size=test_size, random_state=42)
            train_list.append(train)
            test_list.append(test)
        else:
            train_list.append(group)
    return pd.concat(train_list), pd.concat(test_list) # combine all mini dataframes into one dataset

## Split data

In [55]:
ratings_train, ratings_test = train_test_split_userwise(ratings)

## Build User-Item Matrix

In [56]:
user_item_matrix = ratings_train.pivot_table(index='user_id', columns='item_id', values='rating')
user_item_matrix_filled = user_item_matrix.fillna(0)

## Compute User-User Similarity

In [57]:
user_similarity = cosine_similarity(user_item_matrix_filled)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index) # to make it easy to access and use, using .loc[user1, user2]

## Recommend Movies

In [58]:
def recommend_movies(user_id, k=10, n_recommendations=5):

    # get top-k similar users (removing the user)
    similar_users = user_similarity_df[user_id].drop(user_id)
    top_k_users = similar_users.sort_values(ascending=False).head(k)

    # Compute weighted ratings from top-k users
    top_k_ratings = user_item_matrix.loc[top_k_users.index]
    weighted_ratings = top_k_ratings.T.dot(top_k_users) / top_k_users.sum()

    # remove already watched movies
    watched = user_item_matrix.loc[user_id].dropna().index # go to original user matrix which contains nulls (NOT THE FILLED ONE)
    recommendations = weighted_ratings.drop(watched, errors='ignore').sort_values(ascending=False)

    # return the top n recommended movie titles
    top_items = recommendations.head(n_recommendations).index
    return movies.set_index("item_id").loc[top_items][["title"]] # return titles of the movies and not the id


## Precision@K Evaluation

In [59]:
def precision_at_k(user_id, recommended_items, ratings_test, k=5):
    actual_items = ratings_test[ratings_test["user_id"] == user_id]["item_id"]
    if actual_items.empty or recommended_items.empty:
        return 0.0
    predicted_items = recommended_items.index[:k]
    return len(set(actual_items).intersection(predicted_items)) / k

## Example Run

In [60]:
precisions = []

for user_id in ratings_test["user_id"].unique():
    recs = recommend_movies(user_id, k=20, n_recommendations=5)
    if not recs.empty:
        score = precision_at_k(user_id, recs, ratings_test, k=5)
        precisions.append(score)
        # print top 5 recommended movies for first 3 users as sample
        if len(precisions) <= 3:
            print(f"\n🎬 User {user_id} - Top 5 Recommendations:")
            print(recs)


print(f"\n📊 Evaluated {len(precisions)} users")
print(f"✅ Average Precision@5: {np.mean(precisions):.4f}")


🎬 User 1 - Top 5 Recommendations:
                              title
item_id                            
2                  GoldenEye (1995)
4                 Get Shorty (1995)
17       From Dusk Till Dawn (1996)
23               Taxi Driver (1976)
24       Rumble in the Bronx (1995)

🎬 User 2 - Top 5 Recommendations:
                     title
item_id                   
1         Toy Story (1995)
2         GoldenEye (1995)
3        Four Rooms (1995)
4        Get Shorty (1995)
5           Copycat (1995)

🎬 User 3 - Top 5 Recommendations:
                     title
item_id                   
1         Toy Story (1995)
2         GoldenEye (1995)
3        Four Rooms (1995)
4        Get Shorty (1995)
5           Copycat (1995)

📊 Evaluated 943 users
✅ Average Precision@5: 0.0579


# Surprise: Matrix Factorization (SVD)

## Convert to Surprise format

In [61]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[["user_id", "item_id", "rating"]], reader)

## Split data using Surprise’s method

In [62]:
trainset, testset = surprise_split(data, test_size=0.2, random_state=42)

## Train SVD and Predict

In [63]:
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)

## Format predictions into top-n per user

In [64]:
def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for pred in predictions:
        uid = pred.uid
        iid = pred.iid
        est = pred.est
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

## Precision@K for Surprise SVD

In [65]:
def precision_at_k_surprise(user_id, recs, testset, k=5):
    actual = [iid for uid, iid, _ in testset if uid == user_id]
    if not actual:
        return 0.0
    predicted = [iid for iid, _ in recs[:k]]
    return len(set(actual) & set(predicted)) / k


## Evaluate Surprise model


In [68]:
## Evaluate Surprise model
svd_precisions = []
for uid, recs in top_n.items():
    svd_precisions.append(precision_at_k_surprise(uid, recs, testset, k=5))
    # print top 5 recommended movies for first 3 users as sample
    if len(svd_precisions) <= 3:
        top_movie_ids = [iid for iid, _ in recs]
        top_movie_titles = movies[movies["item_id"].isin(top_movie_ids)][["item_id", "title"]]
        print(f"\n🎬 User {uid} - Top 5 Recommendations (SVD):")
        print(top_movie_titles.to_string(index=False))

print(f"\n🔍 Evaluated {len(svd_precisions)} users (SVD)")
print(f"🎯 Average Precision@5 (SVD): {np.mean(svd_precisions):.4f}")


🎬 User 907 - Top 5 Recommendations (SVD):
 item_id                           title
      79            Fugitive, The (1993)
     143      Sound of Music, The (1965)
     172 Empire Strikes Back, The (1980)
     647                      Ran (1985)
     813    Celluloid Closet, The (1995)

🎬 User 371 - Top 5 Recommendations (SVD):
 item_id                                     title
      55                  Professional, The (1994)
      97                 Dances with Wolves (1990)
     186                Blues Brothers, The (1980)
     210 Indiana Jones and the Last Crusade (1989)
     746                        Real Genius (1985)

🎬 User 218 - Top 5 Recommendations (SVD):
 item_id                      title
      12 Usual Suspects, The (1995)
      42              Clerks (1994)
     164          Abyss, The (1989)
     209  This Is Spinal Tap (1984)
     654           Chinatown (1974)

🔍 Evaluated 940 users (SVD)
🎯 Average Precision@5 (SVD): 0.9626
