In [1]:
import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer



## Data Preparations


In [2]:
#read csv(s)
movies_df = pd.read_csv("dataset/movies_full.csv")
ratings_df = pd.read_csv("dataset/ratings.csv")
tags_df = pd.read_csv("dataset/tags.csv")
gscores_df = pd.read_csv("dataset/genome-scores.csv")
gtags_df = pd.read_csv("dataset/genome-tags.csv")
links_df = pd.read_csv("dataset/links.csv")


In [3]:
# print(ratings_df.duplicated)
# print(ratings_df.shape)
# print(ratings_df['userId'].nunique())

In [4]:
#drop timestamp as it is only consuming memory
ratings_df.drop('timestamp', axis=1, inplace=True)
tags_df.drop('timestamp', axis=1, inplace=True)
movies_df.drop(columns=(['imdbId', 'url', 'titleLower']), inplace=True)


In [5]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [6]:


movies_2 = movies_df.copy()




movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df['genres'] = movies_df['genres'].apply(lambda x: ', '.join(x)) 
# genres_encoded = movies_df['genres'].str.get_dummies(sep='|')
# movies_df['year'] = movies_df['title'].str.extract('\((\d{4})\)')


# movies_df['year'] = movies_df['year'].astype(int)
movies_df['decade'] = (movies_df['year'] // 10) * 10

movies_df['title'].replace('\((\d{4})\)', '', regex=True, inplace=True)
movies_df.head()


# Flatten the list of genres
movies_2['genres'] = movies_2['genres'].str.split('|')
all_genres = [genre for sublist in movies_2['genres'] for genre in sublist]

# Extract unique genres
unique_genres = list(set(all_genres))

# Sort the unique genres for better readability (optional)
unique_genres.sort()

# Display the unique genres
# print(unique_genres)

del all_genres
del movies_2



# print(len(unique_genres))


In [7]:
print(ratings_df.isnull().sum())
ratings_df.head()

userId     0
movieId    0
rating     0
dtype: int64


Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [8]:
#print(tags_df.isnull().sum()) #It has 16 null tags
#print(tags_df[tags_df['tag'].isnull()].iloc[0])
tags_df.dropna(inplace=True)
print(tags_df.isnull().sum())
tags_df.head()

userId     0
movieId    0
tag        0
dtype: int64


Unnamed: 0,userId,movieId,tag
0,3,260,classic
1,3,260,sci-fi
2,4,1732,dark comedy
3,4,1732,great dialogue
4,4,7569,so bad it's good


In [9]:
print(gscores_df.isnull().sum())
gscores_df.head()

movieId      0
tagId        0
relevance    0
dtype: int64


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [10]:
print(gtags_df.isnull().sum())
gtags_df.head()

tagId    0
tag      0
dtype: int64


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [11]:
print(links_df.isnull().sum())
links_df.head()

movieId      0
imdbId       0
tmdbId     107
dtype: int64


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [12]:
movies_rating_user_df = pd.merge(movies_df, ratings_df, on="movieId", how="inner")
movies_rating_user_df.head()

Unnamed: 0,movieId,title,genres,year,decade,userId,rating
0,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",1995.0,1990.0,2,3.5
1,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",1995.0,1990.0,3,4.0
2,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",1995.0,1990.0,4,3.0
3,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",1995.0,1990.0,5,4.0
4,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",1995.0,1990.0,8,4.0


In [13]:

movies_rating_df = movies_rating_user_df[['movieId', 'title', 'rating', 'genres', 'year', 'decade']].groupby(['movieId', 'title', 'genres', 'year', 'decade'])['rating'].agg(['count', 'mean']).round(1)
movies_rating_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,mean
movieId,title,genres,year,decade,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",1995.0,1990.0,57309,3.9
2,Jumanji,"Adventure, Children, Fantasy",1995.0,1990.0,24228,3.3
3,Grumpier Old Men,"Comedy, Romance",1995.0,1990.0,11804,3.1
4,Waiting to Exhale,"Comedy, Drama, Romance",1995.0,1990.0,2523,2.9
5,Father of the Bride Part II,Comedy,1995.0,1990.0,11714,3.1


In [14]:
movies_rating_df.sort_values('count', ascending=False, inplace=True)
movies_rating_df.rename(columns={'count' : 'Num_ratings', 'mean': 'Average_rating'}, inplace=True)
movies_rating_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Num_ratings,Average_rating
movieId,title,genres,year,decade,Unnamed: 5_level_1,Unnamed: 6_level_1
356,Forrest Gump,"Comedy, Drama, Romance, War",1994.0,1990.0,81491,4.0
318,"Shawshank Redemption, The","Crime, Drama",1994.0,1990.0,81482,4.4
296,Pulp Fiction,"Comedy, Crime, Drama, Thriller",1994.0,1990.0,79672,4.2
593,"Silence of the Lambs, The","Crime, Horror, Thriller",1991.0,1990.0,74127,4.2
2571,"Matrix, The","Action, Sci-Fi, Thriller",1999.0,1990.0,72674,4.2
260,Star Wars: Episode IV - A New Hope,"Action, Adventure, Sci-Fi",1977.0,1970.0,68717,4.1
480,Jurassic Park,"Action, Adventure, Sci-Fi, Thriller",1993.0,1990.0,64144,3.7
527,Schindler's List,"Drama, War",1993.0,1990.0,60411,4.2
110,Braveheart,"Action, Drama, War",1995.0,1990.0,59184,4.0
2959,Fight Club,"Action, Crime, Drama, Thriller",1999.0,1990.0,58773,4.2


# Data Preprocessing

In [15]:
#let's use the beysian average to calculate a more accurate rating
#this is because a review of 5 with only 1 review is worthless, while a 4.2 with multiple reviews is more reliable

def calculate_weighted_rating(df, C, m):
    """
    Calculate Bayesian weighted rating for each movie in the DataFrame.

    Parameters:
    df (DataFrame): DataFrame containing movie ratings.
    C (float): Average rating across all movies (prior assumption).
    m (int): Minimum number of ratings required to be considered.

    Returns:
    DataFrame: DataFrame with Bayesian weighted rating column added.
    """
    
    # Add the Bayesian weighted rating as a new column in the DataFrame
    df['Bayesian_rating'] = (df['Num_ratings'] / (df['Num_ratings'] + m)) * df['Average_rating'] + (m / (df['Num_ratings'] + m)) * C

    return df

C = round(ratings_df['rating'].mean(), 2)
movies_rating_df = calculate_weighted_rating(movies_rating_df, C, 500)
movies_rating_df.drop(columns='Average_rating', inplace=True)
movies_rating_df.sort_values(by='Bayesian_rating', ascending=False, inplace=True)
movies_rating_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Num_ratings,Bayesian_rating
movieId,title,genres,year,decade,Unnamed: 5_level_1,Unnamed: 6_level_1
318,"Shawshank Redemption, The","Crime, Drama",1994.0,1990.0,81482,4.394694
50,"Usual Suspects, The","Crime, Mystery, Thriller",1995.0,1990.0,55366,4.293109
858,"Godfather, The","Crime, Drama",1972.0,1970.0,52498,4.292736
1221,"Godfather: Part II, The","Crime, Drama",1974.0,1970.0,34188,4.288901
159817,Planet Earth,Documentary,2006.0,2000.0,1747,4.284157


# Non-personalized Recommendation System



### This will be applied to the website if there isn't any user information, aka, one is not logged in.

Despite not being logged in, recommendations will be provided such as:
- Best movies overall
- Best movies for each genre
- Spoting trends
- etc

In [None]:
# let's recommend the best movies of all time (doesn't matter the genre, only it's popularity)
def non_personalized_recommendations_overall_rating(num_movies, df):
    movies_best = df.copy()
    movies_best = movies_best.iloc[:num_movies]
    movies_best.reset_index(inplace=True)
    
    return movies_best[['movieId', 'title']]

print(non_personalized_recommendations_overall_rating(10, movies_rating_df))

In [None]:


#unique_genres is a list with all available genres

# Function to find the best movie for each genre
def find_best_movies_for_genres(df, possible_genres, num_of_movies):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True)
    df_copy['genres'] = df_copy['genres'].str.split(', ')
    best_movies_for_genres = {}
    for genre in possible_genres:
        # Filter DataFrame by genre

        genre_df = df_copy[df_copy['genres'].apply(lambda x: genre in x)]
        if not genre_df.empty:
           
            # Select the top movie for the genre
            best_movie = genre_df[['movieId', 'title']].iloc[:num_of_movies]
            best_movies_for_genres[genre] = best_movie
    del df_copy
    del genre_df
    return best_movies_for_genres

# Find the best movie for each genre
best_movies_for_genres = find_best_movies_for_genres(movies_rating_df, unique_genres, 10)

# Display the best movies for each genre
for genre, movie in best_movies_for_genres.items():
    print(f"Best movie for genre '{genre}':")
    print(movie[['movieId', 'title']])
    print()


In [25]:
#best movies of a given year

def recommend_movies_best_year(df, year, num_movies):
    movies_year = df.copy()
    movies_year.reset_index(inplace=True)
    movies_year = movies_year[movies_year['year'] == year]
    movies_year_best = movies_year.iloc[:num_movies]
    del movies_year 
    
    
    return movies_year_best[['movieId', 'title', 'year']]

    
best_year_movies = recommend_movies_best_year(movies_rating_df, 2010, 10)
print(best_year_movies)


      movieId                                              title    year
13      79132                                          Inception  2010.0
174     74458                                     Shutter Island  2010.0
317     80906                                         Inside Job  2010.0
345     86345                              Louis C.K.: Hilarious  2010.0
376     85774                                              Senna  2010.0
390     81845                                 King's Speech, The  2010.0
391     78499                                        Toy Story 3  2010.0
394     81834       Harry Potter and the Deathly Hallows: Part 1  2010.0
396     76093                           How to Train Your Dragon  2010.0
402     85342  Elite Squad: The Enemy Within (Tropa de Elite ...  2010.0
474     86781                                          Incendies  2010.0
538     77455                         Exit Through the Gift Shop  2010.0
733     80463                                Social

In [30]:
#recommend the bets movies of a given decade

def recommend_movies_best_decade(df, decade, num_movies):
    movies_decade = df.copy()
    movies_decade.reset_index(inplace=True)
    movies_decade = movies_decade[movies_decade['decade'] == decade]
    movies_decade_best = movies_decade.iloc[:num_movies]
    del movies_decade 
    
    
    return movies_decade_best[['movieId', 'title']]#'decade','year',  'Bayesian_rating']]

    
best_year_movies = recommend_movies_best_decade(movies_rating_df, 2010, 10)
print(best_year_movies)

     movieId                              title
6     171011                    Planet Earth II
13     79132                          Inception
65    109487                       Interstellar
77     92259                       Intouchables
84    112552                           Whiplash
102   142488                          Spotlight
123   195159  Spider-Man: Into the Spider-Verse
124    96829                 Hunt, The (Jagten)
131    98491                           Paperman
167    99114                   Django Unchained
