In [20]:
import sys
import os
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Add the parent of the current working directory to sys.path so that modules
# located there can be imported (presumably where helper_functions lives —
# TODO confirm).
# NOTE(review): this is resolved from the kernel's working directory
# (os.getcwd()), not from the notebook file's location, so the import below
# only works when the kernel is started from the expected folder.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

from helper_functions import DataTransformer, get_top_ten_similar_movies, get_movie_id_from_title

# Data import

In [4]:
# Create the project's data helper and load the movies table; the preview
# below shows one row per movie with movieId, title and year columns.
dt = DataTransformer()
movies_df = dt.get_movies_df()
# Show the first rows as a quick sanity check of the load.
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [5]:
# Load the genre table: per the preview below it has a movieId column plus one
# 0/1 indicator column per genre, including an explicit "(no genres listed)"
# column.
genres_encoded_df = dt.get_binary_encoded_genres_df()
genres_encoded_df.head()

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Calculate genre-based cosine similarity between movies

In [26]:
# Separate the identifier column from the genre indicator columns so that only
# the indicators take part in the similarity computation.
movie_ids = genres_encoded_df['movieId']
genres_encoded_matrix = genres_encoded_df.drop(columns=['movieId'])

# Pairwise cosine similarity between every pair of movies, wrapped in a
# DataFrame whose rows and columns are both labelled by movieId so that a
# movie's scores can be looked up directly by its id.
genres_cosine_similarity_matrix = pd.DataFrame(
    cosine_similarity(genres_encoded_matrix, genres_encoded_matrix),
    index=movie_ids,
    columns=movie_ids,
)
genres_cosine_similarity_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.774597,0.316228,0.258199,0.447214,0.0,0.316228,0.632456,0.0,0.258199,...,0.447214,0.316228,0.316228,0.447214,0.0,0.67082,0.774597,0.0,0.316228,0.447214
2,0.774597,1.0,0.0,0.0,0.0,0.0,0.0,0.816497,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.288675,0.333333,0.0,0.0,0.0
3,0.316228,0.0,1.0,0.816497,0.707107,0.0,1.0,0.0,0.0,0.0,...,0.353553,0.0,0.5,0.0,0.0,0.353553,0.408248,0.0,0.0,0.707107
4,0.258199,0.0,0.816497,1.0,0.57735,0.0,0.816497,0.0,0.0,0.0,...,0.288675,0.408248,0.816497,0.0,0.0,0.288675,0.333333,0.57735,0.0,0.57735
5,0.447214,0.0,0.707107,0.57735,1.0,0.0,0.707107,0.0,0.0,0.0,...,0.5,0.0,0.707107,0.0,0.0,0.5,0.57735,0.0,0.0,1.0


In [19]:
def return_cosine_similar_movies(movie_id, cosine_similarity_matrix):
    """Return the similarity scores of every other movie to ``movie_id``.

    Parameters
    ----------
    movie_id
        Column label to look up in ``cosine_similarity_matrix``.
    cosine_similarity_matrix : pandas.DataFrame
        Similarity scores indexable by ``movie_id`` as a column label; the
        row index holds the ids of the movies being compared against.

    Returns
    -------
    pandas.DataFrame
        The selected scores with every row whose index equals ``movie_id``
        removed. The column is not renamed here, so it still carries the
        ``movie_id`` label; callers rename it as needed.
    """
    # Select the scores for the requested movie and wrap them in a frame.
    scores = pd.DataFrame(cosine_similarity_matrix[movie_id])
    # A movie is always perfectly similar to itself, so exclude that row.
    is_other_movie = scores.index != movie_id
    return scores.loc[is_other_movie]

In [43]:
# Title of the movie to find similar movies for.
# NOTE(review): the title is given in lowercase, so get_movie_id_from_title
# presumably matches titles case-insensitively — confirm in helper_functions.
movie_title = 'pulp fiction'
movie_id = get_movie_id_from_title(movie_title, movies_df)


In [44]:
# Genre-based similarity scores of every other movie to the selected one.
cosine_similar_movies_by_genre = return_cosine_similar_movies(
    movie_id, genres_cosine_similarity_matrix
)

# Pass the scores to the project helper, then replace the raw movie_id column
# label with a readable "<title> - similarity" header for display.
top_ten = get_top_ten_similar_movies(
    cosine_similar_movies_by_genre, movies_df, movie_id
).rename(columns={movie_id: movie_title + ' - similarity'})
top_ten

Unnamed: 0_level_0,pulp fiction - similarity,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1034,1.0,Freeway
4242,1.0,Beautiful Creatures
6003,1.0,Confessions of a Dangerous Mind
3266,1.0,Man Bites Dog (C'est arrivé près de chez vous)
6705,1.0,Party Monster
75813,1.0,Leaves of Grass
608,1.0,Fargo
71211,1.0,"Informant!, The"
57669,1.0,In Bruges
145,0.894427,Bad Boys
