
###  Predictive Analytics
###  Movie Recommender System


In [1]:
# Getting libraries
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
# Load the movie catalogue (movieId, title, genres) from disk
df_movie = pd.read_csv('movies.csv')

In [3]:
# Checking top rows
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Checking size
df_movie.shape

(9742, 3)

In [5]:
# Load the user ratings (userId, movieId, rating, timestamp) from disk
df_rate = pd.read_csv('ratings.csv')

In [6]:
# Check top rows
df_rate.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Checking Shape
df_rate.shape

(100836, 4)

In [8]:
# Both tables share the movieId column, so attach each rating to its
# movie's title and genres (inner join: only movieIds present in both tables are kept)
df = pd.merge(left=df_movie, right=df_rate, how='inner', on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [9]:
# Build a movie-by-user rating matrix: one row per title, one column per userId
# NOTE(review): pivot_table averages (default aggfunc) every rating that lands in the
# same title/userId cell, so distinct movies sharing a title would be merged into
# one row — confirm that titles are unique per movieId
df_reco = df.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)
df_reco.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,


In [10]:
# A movie the user never rated appears as NaN in the pivot table;
# encode "not rated" as 0 so the matrix is fully numeric for the model
df_reco = df_reco.fillna(value=0)
df_reco.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Store the mostly-zero rating matrix in compressed sparse row (CSR) form
df_matrix = csr_matrix(df_reco.to_numpy())


In [12]:
# Fitting the model
# A k-nearest-neighbours (KNN) model is used to find the movies most similar to a given movie

In [13]:
# Movies are compared by the cosine distance between their rating vectors,
# using an exhaustive (brute-force) neighbour search
model = NearestNeighbors(algorithm='brute', metric='cosine')
# Fit the model on the sparse movie-by-user matrix
model.fit(X=df_matrix)


NearestNeighbors(algorithm='brute', metric='cosine')

In [14]:
# Defining the recommendation function
def get_recommendation(movie_name, n_recommendations=10):
    """Print the movies whose rating vectors are nearest to ``movie_name``.

    The movie's row is looked up in the module-level ``df_reco`` pivot table
    and passed to the fitted module-level ``model``; the nearest rows are then
    printed as a numbered list under a header line.

    Parameters
    ----------
    movie_name : str
        Title exactly as it appears in ``df_reco.index``.
    n_recommendations : int, default 10
        Maximum number of similar movies to print. Fewer are printed when the
        table does not contain that many other movies.

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    KeyError
        If ``movie_name`` is not present in ``df_reco.index``.
    ValueError
        If ``n_recommendations`` is smaller than 1.
    """
    if n_recommendations < 1:
        raise ValueError('n_recommendations must be at least 1')

    # Row position of the requested movie in the pivot table
    movie_idx = df_reco.index.get_loc(movie_name)
    query_row = df_reco.iloc[movie_idx, :].values.reshape(1, -1)

    # Ask for one extra neighbour because the movie itself is normally among
    # its own nearest neighbours; never ask for more rows than the table has.
    n_neighbors = min(n_recommendations + 1, df_reco.shape[0])
    _, indices = model.kneighbors(query_row, n_neighbors=n_neighbors)

    # Drop the queried movie by position instead of assuming it is returned
    # first: when another movie is at the same distance (e.g. cosine distance 0)
    # the query is not guaranteed to be neighbour 0.
    similar_rows = [row for row in indices.flatten() if row != movie_idx]
    similar_rows = similar_rows[:n_recommendations]

    print('Recommendations for {0}:\n'.format(df_reco.index[movie_idx]))
    for rank, row in enumerate(similar_rows, start=1):
        print('{0}: {1}:'.format(rank, df_reco.index[row]))


In [15]:
# Testing 1
get_recommendation('xXx: State of the Union (2005)')

Recommendations for xXx: State of the Union (2005):

1: Taxi (2004):
2: Pulse (2006):
3: Lakeview Terrace (2008):
4: Darkness Falls (2003):
5: Hills Have Eyes II, The (2007):
6: Haunting in Connecticut, The (2009):
7: RV (2006):
8: Fat Albert (2004):
9: Anacondas: The Hunt for the Blood Orchid (2004):
10: Stay Alive (2006):


In [16]:
# Testing 2
get_recommendation('xXx (2002)')

Recommendations for xXx (2002):

1: Die Another Day (2002):
2: Rundown, The (2003):
3: S.W.A.T. (2003):
4: Charlie's Angels: Full Throttle (2003):
5: Evolution (2001):
6: 2 Fast 2 Furious (Fast and the Furious 2, The) (2003):
7: Resident Evil (2002):
8: Hostage (2005):
9: Reign of Fire (2002):
10: Snakes on a Plane (2006):


#### Reference
https://www.analyticsvidhya.com/blog/2020/11/create-your-own-movie-movie-recommendation-system/
