Task: Study the various Recommendation Techniques for recommending movies using
movies.csv, ratings.csv datasets

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD, NMF
from math import sqrt

# from surprise import Dataset, Reader, KNNBasic, SVD, NMF
# from surprise.model_selection import cross_validate

import warnings
warnings.filterwarnings('ignore')

# Load movies.csv and ratings.csv dataset


In [2]:
df_movie = pd.read_csv("movies.csv")
df_rating = pd.read_csv("ratings.csv")

## Merge both data frames on movieid


In [3]:
merged_df = pd.merge(df_movie, df_rating, on="movieId")

In [4]:
print(merged_df.head(10))
print(merged_df.info())

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
5        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
6        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
7        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
8        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
9        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating   timestamp  
0       1     4.0   964982703  
1       5     4.0   847434962  
2       7     4.5  1106635946  
3      15     2.5  15105

## Create User-Item Matrix (Hint: Use pandas pivot_table method with index = 'userId',columns = 'title', values = 'rating' )

In [5]:
user_item_matrix = pd.pivot_table(merged_df, index='userId', columns='title', values='rating')
print(user_item_matrix.head())

title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              NaN                                      NaN   
2              NaN                                      NaN   
3              NaN                                      NaN   
4              NaN                                      NaN   
5              NaN                                      NaN   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          NaN                  NaN   
2                          NaN                  NaN   
3                          NaN                  NaN   
4                          NaN                  NaN   
5                          NaN                  NaN   

title   'Til There Was You (1997)  'Tis the Season for Love (2015)  \
userId                                                               
1                             Na

## Perform User-based Collaborative Filtering

### Fill the row-wise NaNs in the User-Item Matrix with the corresponding user's mean ratings, and find the Pearson correlation between users

In [6]:
user_item_filled = user_item_matrix.T.fillna(user_item_matrix.T.mean()).T
print(user_item_filled.head())

title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1         4.366379                                 4.366379   
2         3.948276                                 3.948276   
3         2.435897                                 2.435897   
4         3.555556                                 3.555556   
5         3.636364                                 3.636364   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                     4.366379             4.366379   
2                     3.948276             3.948276   
3                     2.435897             2.435897   
4                     3.555556             3.555556   
5                     3.636364             3.636364   

title   'Til There Was You (1997)  'Tis the Season for Love (2015)  \
userId                                                               
1                        4.36637


### Choose the correlation of all users with only User 1
Sort the User 1 correlation in the descending order

Drop the NaN values generated in the correlation matrix


In [7]:
user_correlation = user_item_filled.T.corr()
user1_corr = user_correlation[1].sort_values(ascending=False)
user1_corr = user1_corr.dropna().drop(1)

### Choose the top 50 users that are highly correlated to User 1


In [8]:
top_50_users = user1_corr[1:51]
print("Top 50 Users similar to User 1 are: ", top_50_users)

Top 50 Users similar to User 1 are:  userId
597    0.102631
414    0.101348
477    0.099217
57     0.099070
369    0.098295
206    0.096852
535    0.096493
590    0.095191
418    0.094153
120    0.092770
75     0.091987
577    0.089396
198    0.088883
160    0.088133
226    0.088068
266    0.086064
312    0.086017
19     0.085249
135    0.084672
484    0.084350
469    0.084184
72     0.083613
593    0.082403
44     0.081400
297    0.080839
434    0.078361
483    0.078085
449    0.077631
552    0.077630
171    0.077241
199    0.076905
45     0.076489
608    0.075224
494    0.073544
116    0.073329
450    0.072014
201    0.071913
387    0.071418
173    0.071317
600    0.069528
513    0.069213
524    0.069208
368    0.069179
555    0.068507
180    0.067516
445    0.067329
20     0.066990
307    0.066782
480    0.066395
178    0.066101
Name: 1, dtype: float64


### Predict the rating that User 1 might give for the movie with movieid 32 based on the top 50 user correlation matrix
(Hint: Predicted rating = sum of [(weights) * (ratings)] / sum of (weights ). Here, weights is the correlation of the corresponding user with the first user). That is, the predicted ratingis calculated as the weighted average of k similar users

In [9]:
Movie_id_to_predict = 32
movie_title = df_movie[df_movie['movieId'] == Movie_id_to_predict]['title'].values[0]


In [10]:
ratings_top_50_users = user_item_filled.loc[top_50_users.index, movie_title]

In [11]:
weights = top_50_users.values
weighted_sum = (ratings_top_50_users * weights).sum()
weights_sum = weights.sum()
predicted_rating = weighted_sum / weights_sum
print("Predicted Rating for the movie: ", predicted_rating)

Predicted Rating for the movie:  3.864651734772927


## Perform Item-based Collaborative Filtering
### Fill the column-wise NaN's in the User-Item Matrix with the corresponding movie's mean ratings, and find Pearson correlation between movies

In [12]:
movie_item_filled = user_item_matrix.apply(lambda col: col.fillna(col.mean()), axis=0)

movie_corr = movie_item_filled.corr()

### Choose the correlation of all movies with the movie Jurassic Park (1993) only


In [13]:
jurassic_corr = movie_corr["Jurassic Park (1993)"].dropna().sort_values(ascending=False)

### Sort the Jurassic Park movie correlation in descending order

### Find 10 movies similar to the movie Jurassic Park (1993)

In [14]:
similar_movies = jurassic_corr.drop(labels=["Jurassic Park (1993)"]).head(10)
print("\nTop 10 movies similar to 'Jurassic Park (1993)':")
print(similar_movies)


Top 10 movies similar to 'Jurassic Park (1993)':
title
Fugitive, The (1993)                           0.324717
Lethal Weapon (1987)                           0.318646
Independence Day (a.k.a. ID4) (1996)           0.263629
Mission: Impossible (1996)                     0.258080
Ghostbusters (a.k.a. Ghost Busters) (1984)     0.256527
Mulan (1998)                                   0.255672
Rise of the Planet of the Apes (2011)          0.248134
Bug's Life, A (1998)                           0.240964
Indiana Jones and the Temple of Doom (1984)    0.239826
Die Hard (1988)                                0.239294
Name: Jurassic Park (1993), dtype: float64


## Perform KNNBasic, SVD, NMF Model-based Collaborative Filtering
### Initialize KNNBasic with similarity configuration as Mean Squared Distance Similarity (msd), 20 neighbors and cross-validate 5 folds against measure RMSE.
(Hint: cross_validate(algo=algo, data=data, measures=['RMSE'], cv=5, verbose=True))


In [15]:
reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df(df_rating[['userId', 'movieId', 'rating']], reader = reader)

NameError: name 'Reader' is not defined

In [None]:
# KNNBasic Model
knn = NearestNeighbors(sim_options={'name': 'msd', 'user_based': True}, k=20)
cross_validate(knn, data, measures=['RMSE'], cv=5, verbose=True)


Average RMSE Scores (5-Fold CV):
KNNBasic (approx MSD): 2.5222
SVD: 3.4011
NMF: 3.3646


### Initialize Singular Value Decomposition (SVD) and cross-validate 5 folds against measure RMSE.


In [None]:
# SVD Model
svd = TruncatedSVD()
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)

### Initialize Non-Negative Matrix Factorization (NMF) and cross-validate 5 folds against measure RMSE.


In [None]:
# NMF Model
nmf = NMF()
cross_validate(nmf, data, measures=['RMSE'], cv=5, verbose=True)

In [None]:
### --- Model-Based Collaborative Filtering (KNN, SVD, NMF) ---
# Fill NaNs with 0 for modeling
user_item_model = user_item_matrix.fillna(0)
X = user_item_model.values
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_knn, rmse_svd, rmse_nmf = [], [], []

In [None]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # KNN Approximate MSD
    knn = NearestNeighbors(n_neighbors=20, metric='minkowski', p=2)
    knn.fit(X_train)
    distances, indices = knn.kneighbors(X_test)
    knn_preds, knn_actuals = [], []
    for i, neighbors in enumerate(indices):
        neighbor_ratings = X_train[neighbors]
        pred = np.where(np.count_nonzero(neighbor_ratings, axis=0) > 0,
                        neighbor_ratings.sum(axis=0) / np.maximum(1, (neighbor_ratings != 0).sum(axis=0)),
                        0)
        actual = X_test[i]
        mask = actual != 0
        if mask.sum() > 0:
            knn_preds.extend(pred[mask])
            knn_actuals.extend(actual[mask])
    rmse_knn.append(np.sqrt(mean_squared_error(knn_actuals, knn_preds)))
        # SVD
    svd_model = TruncatedSVD(n_components=20, random_state=42)
    X_svd = svd_model.fit_transform(X_train)
    X_reconstructed = svd_model.inverse_transform(X_svd)
    svd_preds, svd_actuals = [], []
    for i, user_idx in enumerate(test_index):
        actual = X[user_idx]
        pred = X_reconstructed[i]
        mask = actual != 0
        svd_preds.extend(pred[mask])
        svd_actuals.extend(actual[mask])
    rmse_svd.append(np.sqrt(mean_squared_error(svd_actuals, svd_preds)))
    # NMF
    nmf_model = NMF(n_components=20, init='random', random_state=42, max_iter=200)
    X_train_nmf = np.where(X_train == 0, 1e-5, X_train)
    W = nmf_model.fit_transform(X_train_nmf)
    H = nmf_model.components_
    X_reconstructed = np.dot(W, H)
    nmf_preds, nmf_actuals = [], []
    for i, user_idx in enumerate(test_index):
        actual = X[user_idx]
        pred = X_reconstructed[i]
        mask = actual != 0
        nmf_preds.extend(pred[mask])
        nmf_actuals.extend(actual[mask])
    rmse_nmf.append(np.sqrt(mean_squared_error(nmf_actuals, nmf_preds)))
# Output average RMSE scores
avg_rmse_scores = {
    'KNNBasic (approx MSD)': np.mean(rmse_knn),
    'SVD': np.mean(rmse_svd),
    'NMF': np.mean(rmse_nmf)
}
print("\nAverage RMSE Scores (5-Fold CV):")
for model, score in avg_rmse_scores.items():
    print(f"{model}: {score:.4f}")


Average RMSE Scores (5-Fold CV):
KNNBasic (approx MSD): 2.5222
SVD: 3.4011
NMF: 3.3646


### Print best score and best params from Cross Validate on all the models built.