# Import Libraries

In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Import Data

In [2]:
# Read the user ratings and the movie catalogue from the local Data/ folder.
ratings, movies = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/ratings.csv", "Data/movies.csv")
)

In [3]:
# Attach title/genres to every rating row by joining on the shared movieId key
# (default inner join), then preview the first rows.
movie_data = ratings.merge(movies, on='movieId')
movie_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


# Data Processing

In [4]:
# Sanity check: (rows, columns) of the merged ratings + movie metadata frame.
movie_data.shape

(100836, 6)

In [5]:
# Inspect the raw titles; the release year is embedded as a trailing "(YYYY)".
movies['title']

0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object

In [6]:
# Pull the first four-digit number wrapped in parentheses out of each title.
# Titles without such a group get NaN. The extracted values are strings.
movies["year"] = movies["title"].str.extract(r'\((\d{4})\)', expand=False)

In [7]:
# Count the titles for which no "(YYYY)" year could be extracted.
movies['year'].isna().sum()

13

In [8]:
# The extracted years are strings (NaN where the title had no "(YYYY)" group).
# Convert them to numbers first: taking the median of string values raises
# TypeError on pandas >= 2.0.
movies['year'] = pd.to_numeric(movies['year'], errors='coerce')

# Calculate median year
median_year = movies['year'].median()

# Fill NaN values in 'year' column with median year.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which can act on a temporary object and leave `movies` unchanged.
# The cast to int truncates a fractional median; the cell is safe to re-run.
movies['year'] = movies['year'].fillna(median_year).astype(int)

In [9]:
# Bucket release years into eras. pd.cut intervals are right-closed by default,
# so a boundary year such as 1990 falls into the "1970-1990" bucket.
movies["year_cat"] = pd.cut(
    movies["year"].astype(int),
    bins=[1900, 1970, 1990, 2000, 2010, 2020],
    labels=["1900-1970", "1970-1990", "1990-2000", "2000-2010", "2010-2020"],
)

# One-hot encode the era (0/1 integers) and carry movieId along as a join key.
year_cat = pd.get_dummies(movies["year_cat"], dtype=int).assign(
    movieId=movies["movieId"]
)

In [10]:
# One indicator column per genre, indexed by movieId; the raw genre string is
# a '|'-separated list.
genres = (
    movies
    .set_index('movieId')['genres']
    .str.get_dummies(sep='|')
)

In [11]:
# Movie x user rating matrix: one row per movieId, one column per userId.
# Movie/user pairs without a rating are NaN at this point.
utility_matrix = movie_data.pivot_table(
    values='rating',
    index='movieId',
    columns='userId',
)

In [12]:
# Treat "not rated" as 0 so every movie row is a dense numeric vector.
utility_matrix = utility_matrix.fillna(0)

In [13]:
# Build the per-movie feature matrix: user ratings + genre indicators + era
# indicators. Inner joins keep only movies present in all three tables.
movie_features = (
    utility_matrix
    .merge(genres, on='movieId', how='inner')
    .merge(year_cat, on='movieId', how='inner')
    .set_index('movieId')
)

In [14]:
# Pairwise cosine similarity between movie feature rows, wrapped in a frame
# labelled by movieId on both axes so it can be looked up by id.
similarity = pd.DataFrame(
    cosine_similarity(movie_features),
    index=movie_features.index,
    columns=movie_features.index,
    dtype=float,
)

In [23]:
## Persist the item-item similarity frame to disk as a pickle.
with open('item_sim.pkl', 'wb') as pkl_out:
    pickle.dump(similarity, pkl_out)

In [15]:
def getsimilarity(movieid, topn=10, similarity_matrix=None, movie_info=None):
    """Return the `topn` movies most similar to `movieid`, with their metadata.

    Parameters
    ----------
    movieid : hashable
        Column label of the query movie in the similarity matrix.
    topn : int, default 10
        Maximum number of neighbours to return.
    similarity_matrix : pandas.DataFrame, optional
        Square item-item similarity frame labelled by movieId on both axes.
        Defaults to the notebook-level `similarity` frame.
    movie_info : pandas.DataFrame, optional
        Movie metadata with a `movieId` column. Defaults to the notebook-level
        `movies` frame.

    Returns
    -------
    pandas.DataFrame
        One row per neighbour, ordered by descending similarity, with a
        `similarity` column followed by the metadata columns.

    Raises
    ------
    KeyError
        If `movieid` is not a column of the similarity matrix.
    """
    sim = similarity if similarity_matrix is None else similarity_matrix
    info = movies if movie_info is None else movie_info

    # Drop the query movie by label rather than assuming it sorts first:
    # another movie can tie with it at similarity 1.0, in which case slicing
    # off position 0 could discard a real neighbour and keep the query itself.
    scores = sim[movieid].drop(labels=movieid, errors="ignore")

    # nlargest selects and orders the top scores in a single pass.
    top = (
        scores.nlargest(topn)
        .rename("similarity")
        .to_frame()
        .rename_axis("movieId")
    )
    return top.merge(info.set_index("movieId"), on="movieId")

In [16]:
# Ten nearest neighbours of movieId 1 (Toy Story). getsimilarity already
# attaches the movie metadata, so merging it a second time here is what
# produces the duplicated *_x / *_y column pairs in the output below.
df = getsimilarity(movieid=1, topn=10)
df.merge(movies.set_index("movieId"), on="movieId")

Unnamed: 0_level_0,similarity,title_x,genres_x,year_x,year_cat_x,title_y,genres_y,year_y,year_cat_y
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3114,0.573591,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,1999,1990-2000,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,1999,1990-2000
480,0.565318,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,1993,1990-2000,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,1993,1990-2000
780,0.563897,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,1996,1990-2000,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,1996,1990-2000
260,0.556915,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977,1970-1990,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977,1970-1990
356,0.546832,Forrest Gump (1994),Comedy|Drama|Romance|War,1994,1990-2000,Forrest Gump (1994),Comedy|Drama|Romance|War,1994,1990-2000
364,0.541285,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,1994,1990-2000,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,1994,1990-2000
1210,0.540599,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,1983,1970-1990,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,1983,1970-1990
648,0.538554,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,1996,1990-2000,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,1996,1990-2000
1265,0.534304,Groundhog Day (1993),Comedy|Fantasy|Romance,1993,1990-2000,Groundhog Day (1993),Comedy|Fantasy|Romance,1993,1990-2000
1270,0.530187,Back to the Future (1985),Adventure|Comedy|Sci-Fi,1985,1970-1990,Back to the Future (1985),Adventure|Comedy|Sci-Fi,1985,1970-1990
