In [24]:
# Collaborative Filtering Models

# This notebook implements user-based and item-based collaborative filtering to capture personalized user preferences beyond baseline recommenders.


In [25]:
import pandas as pd
import numpy as np

ratings = pd.read_csv("../data/ratings.csv")
movies = pd.read_csv("../data/movies.csv")


In [26]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42
)


In [27]:
from sklearn.metrics import mean_squared_error
import numpy as np

# -------- Global Average Baseline --------
global_mean = train['rating'].mean()
test['pred_global'] = global_mean

rmse_global = np.sqrt(
    mean_squared_error(test['rating'], test['pred_global'])
)

# -------- Movie Average Baseline --------
movie_means = train.groupby('movieId')['rating'].mean()
test['pred_movie'] = test['movieId'].map(movie_means)
test['pred_movie'] = test['pred_movie'].fillna(global_mean)

rmse_movie = np.sqrt(
    mean_squared_error(test['rating'], test['pred_movie'])
)

rmse_global, rmse_movie


(np.float64(1.0488405992661316), np.float64(0.9827389937822489))

In [28]:
user_item_matrix = train.pivot_table(
    index='userId',
    columns='movieId',
    values='rating'
)

user_item_matrix.head()


movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [29]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_item_matrix.fillna(0))
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index
)

user_similarity_df.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.016314,0.049021,0.165799,0.123392,0.118556,0.112563,0.142135,0.056088,0.012906,...,0.070901,0.152097,0.187324,0.067264,0.151517,0.139042,0.198771,0.232811,0.112174,0.143902
2,0.016314,1.0,0.0,0.004627,0.0,0.013391,0.029067,0.032754,0.0,0.080739,...,0.170123,0.020395,0.014415,0.0,0.0,0.019846,0.016076,0.05561,0.032404,0.07581
3,0.049021,0.0,1.0,0.0,0.00577,0.004833,0.0,0.005911,0.0,0.0,...,0.006401,0.005889,0.015344,0.0,0.012783,0.008884,0.004642,0.009433,0.0,0.031309
4,0.165799,0.004627,0.0,1.0,0.133565,0.090914,0.094497,0.050417,0.0,0.021991,...,0.075828,0.090252,0.241155,0.054366,0.081585,0.162277,0.083074,0.107276,0.02672,0.068325
5,0.123392,0.0,0.00577,0.133565,1.0,0.238812,0.071386,0.393773,0.0,0.006245,...,0.050523,0.343953,0.101064,0.159651,0.111464,0.086797,0.073278,0.09704,0.205395,0.05309


In [30]:
def predict_user_based(user_id, movie_id, k=5):
    if movie_id not in user_item_matrix.columns:
        return np.nan

    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:k+1]
    ratings = user_item_matrix.loc[similar_users.index, movie_id]

    return ratings.mean()


In [31]:
test['pred_user_cf'] = test.apply(
    lambda x: predict_user_based(x['userId'], x['movieId']),
    axis=1
)

global_mean = train['rating'].mean()
test['pred_user_cf'] = test['pred_user_cf'].fillna(global_mean)


In [32]:
from sklearn.metrics import mean_squared_error

rmse_user_cf = np.sqrt(
    mean_squared_error(test['rating'], test['pred_user_cf'])
)

rmse_user_cf

### User-Based Collaborative Filtering
# - Captures similarities between users
# - Introduces personalization
# - Sensitive to data sparsity

np.float64(1.0911685027259228)

In [33]:
item_similarity = cosine_similarity(user_item_matrix.fillna(0).T)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns
)


In [34]:
def predict_item_based(user_id, movie_id, k=5):
    # Movie not in training data
    if movie_id not in user_item_matrix.columns:
        return np.nan

    # Movies the user has rated
    rated_movies = user_item_matrix.loc[user_id].dropna()

    if rated_movies.empty:
        return np.nan

    # Similarities between target movie and movies user rated
    similarities = item_similarity_df.loc[movie_id, rated_movies.index]

    # Take top-k most similar items
    top_k = similarities.sort_values(ascending=False).head(k)

    # If all similarities are zero → fallback
    if top_k.sum() == 0:
        return np.nan

    ratings = rated_movies[top_k.index]

    # Weighted average
    return np.average(ratings, weights=top_k)

In [35]:
test['pred_item_cf'] = test.apply(
    lambda x: predict_item_based(x['userId'], x['movieId']),
    axis=1
)

test['pred_item_cf'] = test['pred_item_cf'].fillna(global_mean)

In [36]:
rmse_item_cf = np.sqrt(
    mean_squared_error(test['rating'], test['pred_item_cf'])
)

rmse_item_cf


np.float64(0.9014048006054212)

In [37]:
pd.DataFrame({
    "Model": [
        "Global Average",
        "Movie Average",
        "User-Based CF",
        "Item-Based CF"
    ],
    "RMSE": [
        rmse_global,
        rmse_movie,
        rmse_user_cf,
        rmse_item_cf
    ]
})

## Collaborative Filtering Insights

# - Collaborative filtering significantly improves personalization.
# - Item-based CF is often more stable than user-based CF.
# - Sparsity and cold-start remain key challenges.
# - These limitations motivate matrix factorization and hybrid models.

Unnamed: 0,Model,RMSE
0,Global Average,1.048841
1,Movie Average,0.982739
2,User-Based CF,1.091169
3,Item-Based CF,0.901405
