In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error

import pickle

In [2]:
movies = pd.read_csv("archive/movies.csv")
ratings = pd.read_csv("archive//ratings.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
ratings.shape

(100836, 4)

## Collaborative (User-Based) Filtering

#### Creating the user-movie rating matrix - 
Pivoting the ratings data so that each row represents a user and each column a movie.

In [6]:
user_movie_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
# print(user_movie_matrix)

In [7]:
user_movie_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


### Finding similar users by computing cosine similarity between users.
#### This creates a matrix where each entry (i, j) is the similarity between user i and user j.

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Fill missing values with 0
user_movie_filled = user_movie_matrix.fillna(0)
# Compute cosine similarity between users
user_similarity = cosine_similarity(user_movie_filled)
cosine_sim_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

print("User Similarity Matrix:")
print(cosine_sim_df)


User Similarity Matrix:
userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000  0.027283  0.059720  0.194395  0.129080  0.128152  0.158744   
2       0.027283  1.000000  0.000000  0.003726  0.016614  0.025333  0.027585   
3       0.059720  0.000000  1.000000  0.002251  0.005020  0.003936  0.000000   
4       0.194395  0.003726  0.002251  1.000000  0.128659  0.088491  0.115120   
5       0.129080  0.016614  0.005020  0.128659  1.000000  0.300349  0.108342   
...          ...       ...       ...       ...       ...       ...       ...   
606     0.164191  0.028429  0.012993  0.200395  0.106435  0.102123  0.200035   
607     0.269389  0.012948  0.019247  0.131746  0.152866  0.162182  0.186114   
608     0.291097  0.046211  0.021128  0.149858  0.135535  0.178809  0.323541   
609     0.093572  0.027565  0.000000  0.032198  0.261232  0.214234  0.090840   
610     0.145321

We can also find similar users by **pearson correlation**

In [9]:
# Compute mean rating per user
user_means = user_movie_matrix.mean(axis=1)

# Subtract the mean rating from each user's ratings to center the data
ratings_diff = user_movie_matrix.sub(user_means, axis=0)

# Compute Pearson correlation by taking the correlation of users
# Note: We need to work with the original non-filled matrix to avoid biasing the correlation.
pearson_sim_df = ratings_diff.T.corr()
pearson_sim_df.fillna(0.0, inplace=True)      # now every similarity is a real number

### Making Recommendations -
1. Identifying similar users
2. Predicting ratings for unseen movies
3. Returning top *n* recommendations

In [10]:
def get_top_k_users(user_id, similarity_df, k=2):
    return similarity_df.loc[user_id].drop(user_id).sort_values(ascending=False).head(k)

In [22]:
def get_recommendations(user_id, num_recommendations=5):
    """
    Get top movie recommendations for a user based on user-based collaborative filtering.
    
    Args:
        user_id (int): The target user ID.
        num_recommendations (int): Number of movie recommendations to return.
    
    Returns:
        A DataFrame with recommended movies and predicted ratings.
    """
    # Get the similarity scores for the target user, drop self, and sort descending
    # similar_users = user_similarity_df[user_id].drop(labels=[user_id]).sort_values(ascending=False)
    similar_users = get_top_k_users(user_id, cosine_sim_df)
    # similar_users = get_top_k_users(user_id, pearson_sim_df)
    
    # Get movies already rated by the target user
    user_rated_movies = user_movie_matrix.loc[user_id].dropna().index.tolist()
    
    # Dictionary to store the weighted ratings for movies not seen by the user
    recommendations = {}
    
    # Iterate over all movies in the dataset
    for movie in user_movie_matrix.columns:
        if movie not in user_rated_movies:
            numerator = 0
            denominator = 0
            # Consider similar users who rated this movie
            for other_user, sim_score in similar_users.items():
                other_rating = user_movie_matrix.loc[other_user].get(movie)
                if pd.notna(other_rating):
                    numerator += sim_score * other_rating
                    denominator += sim_score
            if denominator > 0:
                recommendations[movie] = numerator / denominator

    # print(recommendations)
    # Convert the recommendations dictionary to a DataFrame and sort by predicted rating
    recs = pd.DataFrame(list(recommendations.items()), columns=['movieId', 'predicted_rating'])
    recs = recs.sort_values(by='predicted_rating', ascending=False)
    
    # Merge with movies dataframe to add movie titles and genres
    recs = recs.merge(movies, on='movieId', how='left')
    
    return recs.head(num_recommendations)

In [23]:
target_user_id = 1  # Replace with any valid userId from your dataset
recommended_movies = get_recommendations(target_user_id, num_recommendations=5)

print("Top movie recommendations for user {}:".format(target_user_id))
print(recommended_movies)

Top movie recommendations for user 1:
   movieId  predicted_rating                                         title  \
0     3032               5.0                         Omega Man, The (1971)   
1     2890               5.0                            Three Kings (1999)   
2     1266               5.0                             Unforgiven (1992)   
3     2683               5.0  Austin Powers: The Spy Who Shagged Me (1999)   
4     2702               5.0                          Summer of Sam (1999)   

                              genres  
0       Action|Drama|Sci-Fi|Thriller  
1  Action|Adventure|Comedy|Drama|War  
2                      Drama|Western  
3            Action|Adventure|Comedy  
4                              Drama  


#### Evaluation

In [13]:
def predict_rating_cosine(user_id, movie_id):
    # Select similar users (dropping target user)
    similar_users = cosine_sim_df[user_id].drop(labels=[user_id]).sort_values(ascending=False)
    
    # Calculate numerator & denominator for weighted average
    numerator, denominator = 0, 0
    for other_user, sim_score in similar_users.items():
        other_rating = user_movie_matrix.loc[other_user].get(movie_id)
        if pd.notna(other_rating):
            numerator += sim_score * other_rating
            denominator += sim_score
    if denominator == 0:
        # Fallback: if no similar user rated the movie, use the user's mean or global mean
        user_ratings = user_movie_matrix.loc[user_id]
        if user_ratings.count() > 0:
            return user_ratings.mean()
        else:
            return user_movie_matrix.values.flatten().mean()
    return numerator / denominator


In [14]:
def predict_rating_pearson(user_id, movie_id):
    # Mean rating for the target user
    target_mean = user_means.loc[user_id]
    
    # Similarity scores from Pearson correlation
    similar_users = pearson_sim_df[user_id].drop(labels=[user_id]).sort_values(ascending=False)
    
    numerator, denominator = 0, 0
    for other_user, sim_score in similar_users.items():
        other_mean = user_means.loc[other_user]
        other_rating = user_movie_matrix.loc[other_user].get(movie_id)
        if pd.notna(other_rating):
            # Use the deviation from the mean rating
            numerator += sim_score * (other_rating - other_mean)
            denominator += abs(sim_score)
    if denominator == 0:
        # Fallback: if no similar user rated the movie, use the user's mean or global mean
        user_ratings = user_movie_matrix.loc[user_id]
        if user_ratings.count() > 0:
            return user_ratings.mean()
        else:
            return user_movie_matrix.values.flatten().mean()
    return (numerator / denominator)


In [15]:
def leave_one_out_split(ratings_df):
    """
    For every user, hold out the latest rating (by timestamp) for testing.
    If a user only has one rating, we keep it in training.
    Returns:
        train_df: DataFrame for training data.
        test_df: DataFrame for test data.
    """
    # Ensure ratings are sorted by timestamp to simulate time-based splitting
    ratings_df = ratings_df.sort_values(by='timestamp')
    
    train_list = []
    test_list = []
    
    # Group by user
    for user_id, group in ratings_df.groupby('userId'):
        if len(group) > 1:
            # Use the latest rating as test, and the remaining as training
            test_list.append(group.tail(1))
            train_list.append(group.iloc[:-1])
        else:
            # If there's only one rating for the user, include it in training
            train_list.append(group)
    
    train_df = pd.concat(train_list).reset_index(drop=True)
    test_df = pd.concat(test_list).reset_index(drop=True)
    return train_df, test_df

train_df, test_df = leave_one_out_split(ratings)
print("Training set size:", train_df.shape[0])
print("Test set size:", test_df.shape[0])

Training set size: 100226
Test set size: 610


In [16]:
predictions=[]
actuals=[]

predictions_pearson=[]
actuals=[]

In [17]:
for row in test_df.itertuples(index=False):
    user_id = row.userId
    movie_id = row.movieId
    true_rating = row.rating
    
    # Check if the user exists in the training set; if not, fallback to global average
    if user_id in user_movie_matrix.index:
        pred_rating = predict_rating_cosine(user_id, movie_id)
        pred_rating_pearson = predict_rating_pearson(user_id, movie_id)
    else:
        pred_rating = train_df['rating'].mean()  # or any default strategy
        pred_rating_pearson = train_df['rating'].mean()  # or any default strategy
    # print(pred_rating)
    predictions.append(pred_rating)
    predictions_pearson.append(pred_rating_pearson)
    actuals.append(true_rating)


In [19]:
rmse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)

print("Evaluation Metrics:")
print("RMSE: {:.4f}".format(rmse))
print("MAE: {:.4f}".format(mae))

Evaluation Metrics:
RMSE: 1.0619
MAE: 0.8133


In [18]:
rmse = mean_squared_error(actuals, predictions_pearson)
mae = mean_absolute_error(actuals, predictions_pearson)

print("Evaluation Metrics:")
print("RMSE: {:.4f}".format(rmse))
print("MAE: {:.4f}".format(mae))

Evaluation Metrics:
RMSE: 13.6444
MAE: 3.5447
