In [212]:
import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
import matplotlib.pyplot as plt

In [213]:
# Load the four CSV tables of the small dataset into DataFrames.
movies_df, ratings_df, tags_df, links_df = map(
    pd.read_csv,
    (
        "dataset/small_dataset/movies_full_2.csv",
        "dataset/small_dataset/ratings.csv",
        "dataset/small_dataset/tags.csv",
        "dataset/small_dataset/links.csv",
    ),
)


In [214]:


# Build the alphabetically sorted list of distinct genre labels.
# Each movie stores its genres as one '|'-separated string, so split every
# entry and pool the individual labels into a set to de-duplicate them.
unique_genres = sorted({
    genre
    for genre_list in movies_df['genres'].str.split('|')
    for genre in genre_list
})

print(unique_genres)


['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [215]:
# Pair each rating with the metadata of the movie it refers to.
# An inner join keeps only movieIds present in both tables.
movies_rating_user_df = movies_df.merge(ratings_df, on="movieId", how="inner")
movies_rating_user_df.head()

Unnamed: 0,movieId,title,genres,imdbId,year,url,titleLower,userId,rating,timestamp
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,1,4.0,964982703
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,5,4.0,847434962
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,7,4.5,1106635946
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,15,2.5,1510577970
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,17,4.5,1305696483


In [216]:

# Collapse the per-user ratings into one row per movie, holding the number of
# ratings ('count') and their average ('mean'), both rounded to one decimal.
# NOTE(review): groupby drops rows whose key columns contain NaN by default,
# so movies lacking a 'year' or 'url' would vanish here - confirm this is intended.
movies_rating_df = (
    movies_rating_user_df
    .groupby(['movieId', 'title', 'genres', 'year', 'url'])['rating']
    .agg(['count', 'mean'])
    .round(1)
)
movies_rating_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean
movieId,title,genres,year,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0,215,3.9
2,Jumanji,Adventure|Children|Fantasy,1995.0,110,3.4
3,Grumpier Old Men,Comedy|Romance,1995.0,52,3.3
4,Waiting to Exhale,Comedy|Drama|Romance,1995.0,7,2.4
5,Father of the Bride Part II,Comedy,1995.0,49,3.1


In [217]:
# Rank movies from most to least rated, then give the aggregate columns
# descriptive names for the weighting step that follows.
movies_rating_df = (
    movies_rating_df
    .sort_values('count', ascending=False)
    .rename(columns={'count' : 'Num_ratings', 'mean': 'Average_rating'})
)
movies_rating_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Num_ratings,Average_rating
movieId,title,genres,year,Unnamed: 4_level_1,Unnamed: 5_level_1
356,Forrest Gump,Comedy|Drama|Romance|War,1994.0,329,4.2
318,"Shawshank Redemption, The",Crime|Drama,1994.0,317,4.4
296,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994.0,307,4.2
593,"Silence of the Lambs, The",Crime|Horror|Thriller,1991.0,279,4.2
2571,"Matrix, The",Action|Sci-Fi|Thriller,1999.0,278,4.2
260,Star Wars: Episode IV - A New Hope,Action|Adventure|Sci-Fi,1977.0,251,4.2
480,Jurassic Park,Action|Adventure|Sci-Fi|Thriller,1993.0,238,3.8
110,Braveheart,Action|Drama|War,1995.0,237,4.0
589,Terminator 2: Judgment Day,Action|Sci-Fi,1991.0,224,4.0
527,Schindler's List,Drama|War,1993.0,220,4.2


In [218]:
# Use a Bayesian average to obtain a more reliable rating.
# A 5.0 backed by a single review tells us very little, while a 4.2 backed by many reviews is far more trustworthy.

def calculate_weighted_rating(df, C, m):
    """
    Add a Bayesian weighted rating column to a movie ratings DataFrame.

    Each movie's own average is blended with the prior C: the movie's average
    is weighted by its number of ratings and the prior is weighted by m.

    Parameters:
    df (DataFrame): Must contain 'Num_ratings' and 'Average_rating' columns.
        It is modified in place.
    C (float): Prior rating that movies with few ratings are pulled towards.
    m (int): Weight of the prior, expressed as a number of ratings.

    Returns:
    DataFrame: The same df object, with a 'Bayesian_rating' column added.
    """
    votes = df['Num_ratings']
    total_weight = votes + m

    # Share of the final score owed to the movie's own ratings vs. the prior.
    own_share = votes / total_weight
    prior_share = m / total_weight

    df['Bayesian_rating'] = own_share * df['Average_rating'] + prior_share * C
    return df

# Prior for the Bayesian average: the mean of all individual ratings, rounded to 2 decimals.
C = round(ratings_df['rating'].mean(), 2)

# Score every movie (500 is the prior weight m), keep only the weighted score,
# rank by it, and turn the group-key index levels back into regular columns.
movies_rating_df = (
    calculate_weighted_rating(movies_rating_df, C, 500)
    .drop(columns='Average_rating')
    .sort_values(by='Bayesian_rating', ascending=False)
    .rename(columns={'Num_ratings' : 'count', 'Bayesian_rating' : 'weighted_rating'})
    .reset_index()
)

# Store genres as a list of labels instead of one '|'-separated string.
movies_rating_df['genres'] = movies_rating_df['genres'].str.split('|')
movies_rating_df.head()

Unnamed: 0,movieId,title,genres,year,count,weighted_rating
0,318,"Shawshank Redemption, The","[Crime, Drama]",1994.0,317,3.849204
1,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994.0,329,3.777805
2,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994.0,307,3.766295
3,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",1991.0,279,3.750706
4,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999.0,278,3.750129


In [219]:
# Display the ranked table (first and last rows) to check the final columns and row count.
movies_rating_df

Unnamed: 0,movieId,title,genres,year,count,weighted_rating
0,318,"Shawshank Redemption, The","[Crime, Drama]",1994.0,317,3.849204
1,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994.0,329,3.777805
2,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994.0,307,3.766295
3,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",1991.0,279,3.750706
4,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999.0,278,3.750129
...,...,...,...,...,...,...
9431,19,Ace Ventura: When Nature Calls,[Comedy],1995.0,88,3.380272
9432,344,Ace Ventura: Pet Detective,[Comedy],1994.0,161,3.378215
9433,435,Coneheads,"[Comedy, Sci-Fi]",1993.0,63,3.376909
9434,2701,Wild Wild West,"[Action, Comedy, Sci-Fi, Western]",1999.0,53,3.375407


In [220]:
# Inspect the row at position 41 of the ranked table; in the output below this is 'Toy Story'.
print(movies_rating_df.iloc[41])

movieId                                                            1
title                                                      Toy Story
genres             [Adventure, Animation, Children, Comedy, Fantasy]
year                                                          1995.0
count                                                            215
weighted_rating                                              3.62028
Name: 41, dtype: object


In [226]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


# Function to recommend movies
def find_movie_indices(df, title, top_n=19):
    """
    Find the movies whose genres are most similar to those of a given movie.

    Genres are turned into bag-of-words count vectors and compared with cosine
    similarity. The queried movie itself is never part of the result.

    Parameters:
    df (DataFrame): Movie table with a 'title' column and a list-valued
        'genres' column. It is not modified.
    title (str): Exact title of the reference movie. If several rows share
        the title, the first one is used.
    top_n (int): Maximum number of similar movies to return (default 19).

    Returns:
    list: Index labels of df (usable with df.loc) for the most similar movies,
        ordered from most to least similar. Ties keep the row order of df.

    Raises:
    IndexError: If no row of df has the requested title.
    """
    # Locate the reference movie by position, so the lookup does not depend
    # on df having a default 0..n-1 index.
    title_mask = (df['title'] == title).to_numpy()
    if not title_mask.any():
        raise IndexError(f"no movie titled {title!r} in the DataFrame")
    query_pos = int(title_mask.argmax())

    # One space-separated genre string per movie, built without copying df.
    # NOTE: CountVectorizer's default tokenizer treats punctuation as a
    # separator, so a label like 'Sci-Fi' contributes more than one token.
    genres_text = df['genres'].apply(' '.join)
    genre_matrix = CountVectorizer().fit_transform(genres_text)

    # Only the reference movie's row of the similarity matrix is needed;
    # computing the full n x n matrix would cost far more time and memory.
    sim_row = cosine_similarity(genre_matrix[query_pos:query_pos + 1], genre_matrix)[0]

    # Stable sort by descending similarity.
    ranked = sorted(enumerate(sim_row), key=lambda pair: pair[1], reverse=True)

    # Exclude the reference movie explicitly: movies with identical genres tie
    # at similarity 1.0, so the reference is not guaranteed to be ranked first.
    neighbour_positions = [pos for pos, _ in ranked if pos != query_pos][:top_n]

    return [df.index[pos] for pos in neighbour_positions]

    
def recommend_movies(df, movie_indices, preferred_genres=None, disliked_genres=None, max_results=10):
    """
    Filter candidate movies by genre preferences and return the first matches.

    A candidate is kept when it has none of the disliked genres and, if any
    preferred genres are given, at least one of them.

    Parameters:
    df (DataFrame): Movie table with a list-valued 'genres' column and
        'title', 'year', 'url', 'count' and 'weighted_rating' columns.
    movie_indices (iterable): Index labels of the candidate rows in df, in the
        order in which they should be considered.
    preferred_genres (list, optional): Keep only movies having at least one of
        these genres. None or empty disables this filter.
    disliked_genres (list, optional): Discard movies having any of these
        genres. None or empty disables this filter.
    max_results (int): Maximum number of movies to return (default 10).

    Returns:
    DataFrame: The kept rows, in candidate order, restricted to the columns
        'title', 'year', 'url', 'count' and 'weighted_rating'.
    """
    # Build the preference sets once instead of on every loop iteration.
    preferred = set(preferred_genres) if preferred_genres else set()
    disliked = set(disliked_genres) if disliked_genres else set()

    recommended_movies_ids = []
    for i in movie_indices:
        # Stop once enough movies were collected. This must measure the local
        # list; reading any other name would depend on leftover global state.
        if len(recommended_movies_ids) >= max_results:
            break

        movie_genres = set(df.loc[i, 'genres'])

        # Reject movies that carry any disliked genre.
        if disliked & movie_genres:
            continue

        # When preferences are given, require at least one preferred genre.
        if preferred and not (preferred & movie_genres):
            continue

        recommended_movies_ids.append(i)

    recommended_movies_df = df.loc[recommended_movies_ids]
    return recommended_movies_df[['title', 'year', 'url', 'count', 'weighted_rating']]


# Example usage
title = 'Toy Story'
# Genre labels must match the dataset's spelling exactly (see unique_genres),
# otherwise a preference silently never matches any movie.
preferred_genres = ['Animation', 'Drama', 'Mystery']
disliked_genres = ['Action']

movie_indices_list = find_movie_indices(movies_rating_df, title)
recommended_movies = recommend_movies(movies_rating_df, movie_indices_list, preferred_genres, disliked_genres)
print(recommended_movies)

    movieId           title  \
76     4886  Monsters, Inc.   

                                               genres    year  count  \
76  [Adventure, Animation, Children, Comedy, Fantasy]  2001.0    132   

    weighted_rating  
76         3.583544  
