In [1]:
import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix


## Data Visualization


In [2]:
# Load every MovieLens CSV, in the same order as the names on the left-hand side.
movies_df, ratings_df, tags_df, gscores_df, gtags_df, links_df = map(
    pd.read_csv,
    (
        "dataset/movies.csv",
        "dataset/ratings.csv",
        "dataset/tags.csv",
        "dataset/genome-scores.csv",
        "dataset/genome-tags.csv",
        "dataset/links.csv",
    ),
)


In [3]:
# Drop the timestamp columns; they are not used by this notebook and only consume memory.
# Rebinding the names with errors='ignore' (instead of an in-place drop) keeps this
# cell re-runnable: a second execution no longer raises KeyError for the missing column.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')
tags_df = tags_df.drop(columns='timestamp', errors='ignore')


In [38]:


# Per-column count of missing values in the movies table.
print(movies_df.isna().sum())
# Disabled genre experiments (split into lists / one-hot encode):
# movies_df['genres'] = movies_df['genres'].str.split('|')
# genres_encoded = movies_df['genres'].str.get_dummies(sep='|')

# Preview the first five rows.
movies_df.head(5)

movieId    0
title      0
genres     0
dtype: int64


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Per-column count of missing values in the ratings table, then a five-row preview.
print(ratings_df.isna().sum())
ratings_df.head(5)

userId     0
movieId    0
rating     0
dtype: int64


Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [6]:
# Earlier inspection (kept for reference) reported 16 rows with a null tag:
#print(tags_df.isnull().sum()) #It has 16 null tags
#print(tags_df[tags_df['tag'].isnull()].iloc[0])

# Remove every row that contains a missing value, modifying tags_df itself.
tags_df.dropna(axis=0, how='any', inplace=True)
# Confirm that no missing values remain, then preview the first five rows.
print(tags_df.isna().sum())
tags_df.head(5)

userId     0
movieId    0
tag        0
dtype: int64


Unnamed: 0,userId,movieId,tag
0,3,260,classic
1,3,260,sci-fi
2,4,1732,dark comedy
3,4,1732,great dialogue
4,4,7569,so bad it's good


In [7]:
# Per-column count of missing values in the genome-scores table, then a five-row preview.
print(gscores_df.isna().sum())
gscores_df.head(5)

movieId      0
tagId        0
relevance    0
dtype: int64


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [8]:
# Per-column count of missing values in the genome-tags table, then a five-row preview.
print(gtags_df.isna().sum())
gtags_df.head(5)

tagId    0
tag      0
dtype: int64


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [9]:
# Per-column count of missing values in the links table, then a five-row preview.
print(links_df.isna().sum())
links_df.head(5)

movieId      0
imdbId       0
tmdbId     107
dtype: int64


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [13]:
# Attach movie metadata to every rating; the inner join keeps only movieIds
# that appear in both tables.
movies_rating_user_df = movies_df.merge(ratings_df, on="movieId", how="inner")
movies_rating_user_df.head(5)

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0


In [46]:
# Number of ratings and mean rating per (movieId, title), rounded to one decimal.
# NOTE(review): the rounded mean is what later cells consume; confirm that
# one-decimal precision is intended beyond display purposes.
movies_rating_df = (
    movies_rating_user_df
    .groupby(['movieId', 'title'])['rating']
    .agg(['count', 'mean'])
    .round(1)
)


In [47]:
# Rename the aggregate columns, then order movies from most to least rated.
# Renaming first and sorting on the new name keeps this cell re-runnable:
# rename() ignores labels that no longer exist, whereas the previous in-place
# version failed with KeyError: 'count' on a second execution.
movies_rating_df = (
    movies_rating_df
    .rename(columns={'count': 'Num_ratings', 'mean': 'Average_rating'})
    .sort_values('Num_ratings', ascending=False)
)
movies_rating_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Num_ratings,Average_rating
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),81491,4.0
318,"Shawshank Redemption, The (1994)",81482,4.4
296,Pulp Fiction (1994),79672,4.2
593,"Silence of the Lambs, The (1991)",74127,4.2
2571,"Matrix, The (1999)",72674,4.2
260,Star Wars: Episode IV - A New Hope (1977),68717,4.1
480,Jurassic Park (1993),64144,3.7
527,Schindler's List (1993),60411,4.2
110,Braveheart (1995),59184,4.0
2959,Fight Club (1999),58773,4.2


# Data Preprocessing

In [48]:
# Use a Bayesian average to obtain a more reliable rating.
# A 5.0 based on a single review says very little, while a 4.2 backed by many reviews is far more trustworthy.

def calculate_weighted_rating(df, C, m):
    """
    Calculate the Bayesian weighted rating for each movie in the DataFrame.

    Each movie's average is pulled towards the prior ``C``; the fewer ratings
    a movie has, the stronger the pull:

        Bayesian_rating = (v / (v + m)) * R + (m / (v + m)) * C

    where ``v`` is ``Num_ratings`` and ``R`` is ``Average_rating``.

    Parameters:
    df (DataFrame): Must contain the columns 'Num_ratings' and 'Average_rating'.
    C (float): Prior rating that sparsely rated movies are shrunk towards
        (e.g. the global mean rating).
    m (int): Weight of the prior, expressed as a number of ratings. Must be
        non-negative. A movie with exactly ``m`` ratings ends up halfway
        between its own average and ``C``.

    Returns:
    DataFrame: A new DataFrame equal to ``df`` plus a 'Bayesian_rating' column.
        The input DataFrame is left unmodified. Rows where
        ``Num_ratings + m == 0`` get NaN.

    Raises:
    KeyError: If 'Num_ratings' or 'Average_rating' is missing from ``df``.
    ValueError: If ``m`` is negative.
    """
    if m < 0:
        raise ValueError("m must be non-negative")

    num_ratings = df['Num_ratings']
    denominator = num_ratings + m

    bayesian_rating = (
        (num_ratings / denominator) * df['Average_rating']
        + (m / denominator) * C
    )

    # assign() returns a copy, so the caller's DataFrame is not mutated as a
    # hidden side effect; an existing 'Bayesian_rating' column is replaced.
    return df.assign(Bayesian_rating=bayesian_rating)

# Prior: the mean of all individual ratings, rounded to two decimals.
C = round(ratings_df['rating'].mean(), 2)
# Add the Bayesian rating, using a prior weight of 500 ratings.
movies_rating_df = calculate_weighted_rating(df=movies_rating_df, C=C, m=500)
movies_rating_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Num_ratings,Average_rating,bayesian_rating
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),81491,4.0,3.997134
318,"Shawshank Redemption, The (1994)",81482,4.4,4.394694
296,Pulp Fiction (1994),79672,4.2,4.195821
593,"Silence of the Lambs, The (1991)",74127,4.2,4.195511
2571,"Matrix, The (1999)",72674,4.2,4.195422


In [11]:

# Create a utility matrix

# Count distinct users and movies from the per-rating frame. The aggregated
# movies_rating_df has no 'userId' column and keeps movieId as an index level,
# so looking those columns up there raises KeyError on a fresh run.
users_U = movies_rating_user_df['userId'].nunique()
movies_U = movies_rating_user_df['movieId'].nunique()

# Show both counts, matching the output recorded for this cell.
print(users_U)
print(movies_U)

# def create_utility_matrix(df):
    


162541
59047


# Recommendation System



## Content-based approach


In [30]:
# NOTE(review): this whole cell is disabled (commented out); the output recorded
# below it presumably comes from an earlier run. Issues to resolve before re-enabling:
#   * MultiLabelBinarizer is not imported in the notebook's import cell.
#   * Inside the function, `genres_encoded` is rebuilt from `user_ratings` only, so
#     the cosine similarity is computed against the user's own rated rows rather
#     than against all movies.
#   * `sorted_indices` are positions within that similarity array, yet they are
#     compared with `user_ratings.index` labels and then used with `movies_df.iloc`.
#   * `user_profile` is a plain mean of the genre indicators; the rating values
#     themselves are not used.
# mlb = MultiLabelBinarizer()
# genres_encoded = pd.DataFrame(mlb.fit_transform(movies_df['genres'].str.split('|')), columns=mlb.classes_, index=movies_df.index)


# #this will work for a specific user
# User_id = 1
# N = 5 # number of recommendations

# def return_user_recommendations(User_id: int, N: int):
#    # Filter DataFrame to include only ratings made by the specified user
   
#    user_ratings = movies_rating_user_df[movies_rating_user_df['userId'] == User_id]

#    # Convert genres into binary features using one-hot encoding
#    genres_encoded = pd.DataFrame(mlb.transform(user_ratings['genres'].str.split('|')), columns=mlb.classes_, index=user_ratings.index)

#    # Create user profile based on their ratings for movies with the same genres
#    user_profile = genres_encoded.mean()

#    # Calculate cosine similarity between user profile and all movies
#    similarity_scores = cosine_similarity([user_profile], genres_encoded)

#    # Get indices of movies sorted by similarity score in descending order
#    sorted_indices = similarity_scores.argsort(axis=1)[:, ::-1].flatten()

#    # Exclude movies the user has already rated
#    unrated_indices = [index for index in sorted_indices if index not in user_ratings.index]

#    # Recommend top N unrated movies
#    top_n_recommendations = unrated_indices[:N]

#    # Print recommendations
#    print(f"Recommendations for User {User_id}:")
#    for index in top_n_recommendations:
#       movie_title = movies_df.iloc[index]['title']
#       print(f"- {movie_title}")

# return_user_recommendations(User_id, N)


Recommendations for User 1:
- From Dusk Till Dawn (1996)
- Clueless (1995)
- Dead Presidents (1995)
- Sabrina (1995)
- Assassins (1995)


## Collaborative/Social


In [None]:
# NOTE(review): this whole cell is disabled (commented out). Issues to resolve
# before re-enabling:
#   * `movies_rating_df`, as built earlier in this notebook, is aggregated per
#     (movieId, title) and has no 'userId' or 'rating' columns; the per-user
#     ratings live in `movies_rating_user_df`.
#   * `pearsonr` is not imported in the notebook's import cell.
#   * `pearsonr` is called with a Series and a single scalar `rating`; it
#     presumably needs two equal-length rating vectors -- confirm against the
#     SciPy documentation.
#   * `similar_users_indices` holds row index values from `iterrows()`, but is
#     then matched against 'userId' with `.isin`.
#   * `N` is used both as the number of similar users and as the number of
#     recommendations returned.
# def collaborative_filtering_recommendation(user_id, N):
#     # Filter DataFrame to include ratings of the target user
#     target_user_ratings = movies_rating_df[movies_rating_df['userId'] == user_id]

#     # Calculate user-user similarity matrix using Pearson correlation
#     user_similarity_matrix = []
#     for index, row in movies_rating_df.iterrows():
#         if row['userId'] != user_id:
#             rating = row['rating']
#             if not pd.isnull(rating):  # Check if rating is not NaN
#                 similarity = pearsonr(target_user_ratings['rating'], rating)[0]
#                 user_similarity_matrix.append((index, similarity))

#     # Sort the user_similarity_matrix based on similarity value
#     user_similarity_matrix.sort(key=lambda x: x[1], reverse=True)

#     # Find top-k most similar users
#     similar_users_indices = [index for index, _ in user_similarity_matrix[:N]]

#     # Get ratings of similar users for items not rated by the target user
#     similar_users_ratings = movies_rating_df[movies_rating_df['userId'].isin(similar_users_indices)]

#     # Group by movieId and calculate mean rating of similar users
#     similar_users_ratings_grouped = similar_users_ratings.groupby('movieId')['rating'].mean().reset_index()
#     similar_users_ratings_grouped = pd.merge(movies_df, similar_users_ratings_grouped, on="movieId", how="inner")
#     similar_users_ratings_grouped.drop('genres', axis=1, inplace=True)

#     # Exclude movies already rated by the target user
#     unrated_movies = similar_users_ratings_grouped[~similar_users_ratings_grouped['movieId'].isin(target_user_ratings['movieId'])]

#     # Sort by predicted rating and recommend top N movies
#     top_n_recommendations = unrated_movies.sort_values(by='rating', ascending=False).head(N)

#     return top_n_recommendations

# # Example usage:
# user_id = 1
# N = 5  # Number of recommendations
# recommendations = collaborative_filtering_recommendation(user_id, N)
# print(f"Recommendations for User {user_id}:")
# print(recommendations)