Task: Study the various Recommendation Techniques for recommending movies using
movies.csv, ratings.csv datasets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD, NMF

# Load movies.csv and ratings.csv dataset


In [None]:
# Load the two CSV files from the working directory, in the same order as before.
movies_df, ratings_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

In [None]:
# info() prints the column summary; the first five rows become the cell output.
movies_df.info()
movies_df.head(n=5)

In [None]:
# info() prints the column summary; the first five rows become the cell output.
ratings_df.info()
ratings_df.head(n=5)

## Merge both data frames on `movieId`


In [None]:
# Join the ratings to the movies on their shared movieId column.
df = ratings_df.merge(movies_df, on='movieId')

# info() prints its summary as a side effect, so it can go first. head() must
# be the last expression of the cell, otherwise its value is computed and
# silently discarded instead of being rendered.
df.info()
df.head()

In [None]:
# Summary statistics of the numeric columns (quartiles spelled out explicitly).
df.describe(percentiles=[0.25, 0.5, 0.75])

Combined dataframe is 100836 rows and 6 columns

### Genres of Movies

In [None]:
def find_genres(genre_strings=None):
    """Count how many movies carry each genre tag.

    Parameters
    ----------
    genre_strings : iterable of str, optional
        Pipe-separated genre strings, one per movie (e.g. ``'A|B'``).
        When omitted, the ``genres`` column of ``df`` is used after keeping a
        single row per ``movieId``: ``df`` has one row per *rating*, so
        counting it directly would produce rating counts, not movie counts.

    Returns
    -------
    dict
        Maps each genre tag to the number of entries containing it, in order
        of first appearance.
    """
    if genre_strings is None:
        genre_strings = df.drop_duplicates(subset='movieId')['genres']

    counts = {}
    for genre in genre_strings:
        for word in genre.split('|'):
            counts[word] = counts.get(word, 0) + 1
    return counts


# A fresh dict on every run keeps the cell idempotent (no double counting).
genres = find_genres()

# replace '(no genres listed)' by 'None' -- only when the placeholder exists,
# so the cell does not raise KeyError on data without it.
if '(no genres listed)' in genres:
    genres['None'] = genres.pop('(no genres listed)')

genres_df = pd.DataFrame(list(genres.items()), columns=['genres', 'count'])
genres_df


20 total Genres

In [None]:
# Bar chart of the per-genre counts held in genres_df.
plt.style.use('seaborn-v0_8')
sns.barplot(data=genres_df, x='genres', y='count')
plt.title('Bar plot of Movie Genres')
plt.xlabel('Movie Genres')
plt.ylabel('Number of Movies')
plt.xticks(rotation=90)  # genre names are long; rotate so they do not overlap
plt.show()

### Most Rated Movies

In [None]:
# Per-title mean rating and number of ratings, computed in one aggregation.
rating_stats = df.groupby('title')['rating'].agg(['mean', 'count'])
df_ratings_agg = rating_stats.rename(
    columns={'mean': 'mean rating', 'count': 'total ratings'}
)

# Show the most-rated titles first.
df_ratings_agg.sort_values('total ratings', ascending=False).head()

In [None]:
# Histogram (with KDE overlay) of how many ratings each title received.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df_ratings_agg['total ratings'], bins=20, kde=True, ax=ax)
ax.set_xlabel('Total Number of Ratings')
plt.show()

The majority of movies have fewer than 50 ratings, and the top 3 movies have over 300 ratings.

In [None]:
# Histogram (with KDE overlay) of the per-title mean rating.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df_ratings_agg['mean rating'], bins='auto', kde=True, ax=ax)
ax.set_title('Distribution of Mean Rating')
ax.set_xlabel('Mean Rating')
plt.show()

## Create User-Item Matrix

In [None]:
# Rows are users, columns are movie titles, cells are ratings (NaN when the
# user has no rating for that title).
user_item_matrix = df.pivot_table(values='rating', index='userId', columns='title')
print(user_item_matrix.head())

## Perform User-based Collaborative Filtering

### Fill the row-wise NaNs in the User-Item Matrix with the corresponding user's mean ratings, and find the Pearson correlation between users

In [None]:
# Fill NaN values with user mean ratings
user_item_matrix_filled = user_item_matrix.T.fillna(user_item_matrix.T.mean()).T

# Display the matrix after filling NaNs
print(user_item_matrix_filled.head(5))

### Choose the correlation of all users with only User 1

In [None]:
# Calculate the Pearson correlation matrix
# corr() correlates columns, so transpose to get a user-by-user matrix.
user_item_matrix_filled.transpose().corr(method='pearson')

User 1 correlation to other users

In [None]:
# Recompute the user-by-user correlation matrix and keep the column for userId 1.
user_corr_matrix = user_item_matrix_filled.T.corr()
user1_corr = user_corr_matrix.loc[:, 1].copy()

#### Sort the User 1 correlation in the descending order

In [None]:
# Highest correlations first; rebinding the name keeps the cell re-runnable.
user1_corr = user1_corr.sort_values(ascending=False)

#### Drop the NaN values generated in the correlation matrix

In [None]:
# Discard users whose correlation with User 1 came out as NaN.
user1_corr = user1_corr.dropna()

### Choose the top 50 users that are highly correlated to User 1


In [None]:
# Remove User 1 by label rather than assuming it sits at position 0 of the
# sorted series: another user tied at the top (or User 1 having been dropped
# as NaN) would make a positional [1:51] slice exclude the wrong user.
# errors='ignore' keeps this working when label 1 is no longer present.
top_50_corr_users = user1_corr.drop(labels=1, errors='ignore').iloc[:50]

In [None]:
# Title of the movie whose movieId is 32 (single .loc lookup, no chained indexing)
movies_df.loc[movies_df['movieId'] == 32, 'title'].values

In [None]:
# Rows of the merged frame that belong to userId 1
user1_rows = df.loc[df['userId'] == 1]
print(user1_rows)

# Rows of the merged frame for movieId 32 (Twelve Monkeys)
movie32_rows = df.loc[df['movieId'] == 32]
print(movie32_rows)

As seen, User 1 has not seen Twelve Monkeys

### Predict the rating that User 1 might give for the movie with movieid 32 based on the top 50 user correlation matrix
(Hint: Predicted rating = sum of [(weights) * (ratings)] / sum of (weights). Here, weights is the correlation of the corresponding user with the first user. That is, the predicted rating is calculated as the weighted average of the ratings of the k most similar users.)

Method 1: Predicting using only users that rated Movie 32

In [None]:
# Mean rating and rating count for Twelve Monkeys; the list-style key keeps
# the result a one-row DataFrame.
twelve_monkeys_title = 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'
df_ratings_agg.loc[[twelve_monkeys_title]]

In [None]:
# userIds of the 50 most-correlated users (the index labels of the series).
top_50_users = top_50_corr_users.keys()

# Build the set of users that rated movie 32 once, instead of re-filtering the
# whole ratings frame for every candidate user. Membership is tested on the
# presence of a rating row, not on the truthiness of the summed rating.
movie_32_raters = set(df.loc[df['movieId'] == 32, 'userId'])

# Keep the similarity ordering of top_50_users.
users = [user for user in top_50_users if user in movie_32_raters]
count = len(users)
print(f'Total users that rated Twelve Monkeys: {count}')

In [None]:
def predict_rating(movie_id=32, neighbours=None, similarities=None, ratings=None):
    """Predict a rating as the similarity-weighted average of neighbour ratings.

    predicted = sum(similarity_u * rating_u) / sum(similarity_u), taken over
    the neighbours that have a rating for ``movie_id``.

    Parameters
    ----------
    movie_id : int, default 32
        ``movieId`` of the movie to predict.
    neighbours : iterable, optional
        userIds to average over. Defaults to the notebook-level ``users``.
    similarities : pandas.Series, optional
        Similarity weights indexed by userId. Defaults to the notebook-level
        ``top_50_corr_users``.
    ratings : pandas.DataFrame, optional
        Frame with ``userId``, ``movieId`` and ``rating`` columns. Defaults to
        the notebook-level ``df``.

    Returns
    -------
    float
        The weighted average, or NaN when no neighbour rated the movie or the
        weights sum to zero (instead of dividing by zero).
    """
    # Fall back to the notebook-level objects so a bare predict_rating() call
    # behaves as before.
    neighbours = users if neighbours is None else neighbours
    similarities = top_50_corr_users if similarities is None else similarities
    ratings = df if ratings is None else ratings

    # Filter the ratings once rather than once per neighbour. Summing per user
    # mirrors the original per-user ``.sum()`` of the filtered ratings.
    movie_rows = ratings.loc[ratings['movieId'] == movie_id]
    movie_ratings = movie_rows.groupby('userId')['rating'].sum()

    sum_similarity = 0.0
    weighted_ratings = 0.0
    for user in neighbours:
        # A neighbour without a rating contributes nothing to either sum.
        if user not in movie_ratings.index:
            continue
        weight = similarities.loc[user]
        weighted_ratings += weight * movie_ratings.loc[user]
        sum_similarity += weight

    if sum_similarity == 0:
        return float('nan')
    return weighted_ratings / sum_similarity
# Call predict_rating() with no arguments and print the value it returns.
print(f'Predicted User 1 rating for Twelve Monkeys {predict_rating()}')

Method 2: Predicting with all top 50 users, using the matrix whose NaNs were filled with user means

In [None]:
# Ratings (real or mean-filled) of the 50 most similar users for Twelve Monkeys,
# selected in the same order as top_50_corr_users.
ratings_top_50_users = user_item_matrix_filled.loc[top_50_corr_users.index,'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)']

# The weights must be the correlations with User 1. The index labels of
# top_50_corr_users are userIds, which would weight users by their ID number.
weights = top_50_corr_users.values

# Weighted average: sum(weight * rating) / sum(weight).
weighted_sum = (ratings_top_50_users * weights).sum()
weights_sum = weights.sum()
predicted_rating = weighted_sum / weights_sum
print("Predicted User 1 rating with filled means for Twelve Monkeys: ", predicted_rating)

## Perform Item-based Collaborative Filtering
### Fill the column-wise NaN's in the User-Item Matrix with the corresponding movie's mean ratings, and find Pearson correlation between movies

In [None]:
# Mean rating of each movie (one value per column of the user-item matrix).
movie_means = user_item_matrix.mean(axis=0)

# fillna with a Series fills each column with the value under its own label.
movie_item_filled = user_item_matrix.fillna(movie_means)

# Movie-by-movie Pearson correlation (corr() works on columns).
movie_corr = movie_item_filled.corr(method='pearson')

### Choose the correlation of all movies with the movie Jurassic Park (1993) only


In [None]:
# Correlation of every movie with Jurassic Park, NaNs removed, strongest first.
jurassic_corr = (
    movie_corr.loc[:, "Jurassic Park (1993)"]
    .dropna()
    .sort_values(ascending=False)
)

### Sort the Jurassic Park movie correlation in descending order

### Find 10 movies similar to the movie Jurassic Park (1993)

In [None]:
# Exclude the movie itself by label, then keep the ten strongest correlations.
other_movies_corr = jurassic_corr.drop(labels=["Jurassic Park (1993)"])
similar_movies = other_movies_corr.iloc[:10]
print("\nTop 10 movies similar to 'Jurassic Park (1993)':")
print(similar_movies)

## Perform KNNBasic, SVD, NMF Model-based Collaborative Filtering
Initialize KNNBasic with the similarity configuration set to Mean Squared Difference similarity (msd) and 20 neighbors, then cross-validate over 5 folds against the RMSE measure.
(Hint: cross_validate(algo=algo, data=data, measures=['RMSE'], cv=5, verbose=True))


In [2]:
from surprise import Reader, Dataset, KNNBasic, SVD, NMF
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from surprise import accuracy


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/cullen/Documents/Python-Projects/.venv/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  F

ImportError: numpy.core.multiarray failed to import (auto-generated because you didn't call 'numpy.import_array()' after cimporting numpy; use '<void>numpy._import_array' to disable if you are certain you don't need it).

In [None]:
# Load the ratings dataset
ratings = pd.read_csv('ratings.csv')  # Adjust the path as necessary

# Take the rating scale from the data instead of hard-coding (1, 5): a wrong
# lower bound would misstate the scale that Reader passes on to the dataset.
rating_min = ratings['rating'].min()
rating_max = ratings['rating'].max()
reader = Reader(rating_scale=(rating_min, rating_max))

# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

## K-Nearest Neighbours (KNN)

In [None]:
# KNNBasic Model
# The task asks for 20 neighbours; without an explicit k the library default
# is used instead, so pass k=20.
knn_algo = KNNBasic(k=20, sim_options={'name': 'msd', 'user_based': True})
knn_cv_results = cross_validate(knn_algo, data, measures=['RMSE'], cv=5, verbose=True)

# Average the per-fold test RMSE into a single score.
knn_best_score = knn_cv_results['test_rmse'].mean()

### Initialize Singular Value Decomposition (SVD) and cross-validate 5 folds against measure RMSE.


In [None]:
# SVD Model: 5-fold cross-validation scored with RMSE
svd_algo = SVD()
svd_cv_results = cross_validate(
    algo=svd_algo, data=data, measures=['RMSE'], cv=5, verbose=True
)

# Average of the per-fold test RMSE values.
svd_best_score = np.mean(svd_cv_results['test_rmse'])

### Initialize Non-Negative Matrix Factorization (NMF) and cross-validate 5 folds against measure RMSE.


In [None]:
# NMF Model: 5-fold cross-validation scored with RMSE
nmf_algo = NMF()
nmf_cv_results = cross_validate(
    algo=nmf_algo, data=data, measures=['RMSE'], cv=5, verbose=True
)

# Average of the per-fold test RMSE values.
nmf_best_score = np.mean(nmf_cv_results['test_rmse'])

### Print best score and best params from Cross Validate on all the models built.

In [None]:
# Collect one summary line per model, then emit them together.
summary_lines = [
    f"KNNBasic Mean RMSE: {knn_best_score:.4f}",
    f"SVD Mean RMSE: {svd_best_score:.4f}",
    f"NMF Mean RMSE: {nmf_best_score:.4f}",
]
print("\n".join(summary_lines))