# Movie Recommendation System

Dataset Link - https://grouplens.org/datasets/movielens/latest/

In [1]:
import numpy as np
import pandas as pd

Importing the dataset containing the user ratings

In [2]:
# Load the user ratings file and preview the first rows.
df = pd.read_csv("ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [3]:
# Drop the timestamp column; it is not used by the recommender.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this cell
# re-runnable: running it a second time no longer raises KeyError for the missing column.
df = df.drop(columns=['timestamp'], errors='ignore')
df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


Importing the dataset containing the movie titles

In [4]:
# Load the movie metadata file and preview the first rows.
df1 = pd.read_csv("movies.csv")
df1.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Merging the two dataframes based on the common column 'movieId'

In [5]:
# Attach the movie columns to every rating row via the shared 'movieId' key
# (default inner join, same as pd.merge).
df2 = df.merge(df1, on='movieId')
df2.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


In [6]:
# Number of distinct values in each column of the merged frame.
# NOTE(review): the recorded output shows fewer distinct titles than movieIds,
# so some titles are shared by more than one movieId — keep this in mind where
# later cells group or pivot by 'title'.
df2.nunique()

userId     64883
movieId    46435
rating        10
title      46372
genres      1551
dtype: int64

In [7]:
# Count of missing values in each column of the merged frame.
df2.isnull().sum()

userId     0
movieId    0
rating     0
title      0
genres     0
dtype: int64

Getting the rating count for each movie

In [8]:
# Count the ratings each title received; result has columns 'title' and 'RatingCount'.
rc = (
    df2.groupby('title')['rating']
    .count()
    .rename('RatingCount')
    .reset_index()
)
rc.head()

Unnamed: 0,title,RatingCount
0,"""Great Performances"" Cats (1998)",70
1,#1 Cheerleader Camp (2010),4
2,#Female Pleasure (2018),1
3,#FollowMe (2019),2
4,#Horror (2015),8


Merging the rating counts back into the merged ratings DataFrame on the common column 'title'

In [9]:
# Give every rating row the RatingCount of its title (default inner join on 'title').
rc1 = df2.merge(rc, on='title')
rc1.head()

Unnamed: 0,userId,movieId,rating,title,genres,RatingCount
0,1,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796
1,3,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796
2,4,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796
3,5,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796
4,7,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796


Keeping only those movies which have a rating count of at least 100

In [10]:
# Keep only the rows whose title has 100 or more ratings.
rc2 = rc1.loc[rc1['RatingCount'] >= 100]
rc2.head()

Unnamed: 0,userId,movieId,rating,title,genres,RatingCount
0,1,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796
1,3,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796
2,4,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796
3,5,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796
4,7,296,4.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,31796


Creating a pivot matrix to see how a particular user rated a particular movie

In [11]:
# Title x userId rating matrix: one row per title, one column per userId.
# Combinations with no rating are filled with 0.
df3 = (
    rc2
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
df3.head()

userId,1,2,3,4,5,6,7,8,9,10,...,64874,64875,64876,64877,64878,64879,64880,64881,64882,64883
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'night Mother (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Converting the pivot table to a sparse matrix and using the cosine similarity method to find the closest (most similar) movies

Explanation on cosine similarity - https://dzone.com/articles/machinex-cosine-similarity-for-item-based-collabor

In [12]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors as NN

# Hold the rating matrix in compressed sparse row form.
df4 = csr_matrix(df3.values)

# Unsupervised nearest-neighbour model: brute-force search using the cosine metric.
m_knn = NN(algorithm='brute', metric='cosine')
m_knn.fit(df4)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

Creating a function which takes the input

In [13]:
def inp(titles=None, ask=input):
    """Ask the user for a movie and return its row position among the titles.

    The entered text is matched against the titles as a literal,
    case-insensitive substring. The matching titles are listed with numbers
    and the user picks one of them.

    Parameters
    ----------
    titles : pandas.Index of str, optional
        Titles to search. Defaults to ``df3.index`` (the rows of the pivot table).
    ask : callable, optional
        Function used to read a line from the user; it is called like the
        built-in ``input``, which is also the default.

    Returns
    -------
    int or None
        Position of the chosen title within ``titles``, or ``None`` when the
        query is empty, nothing matches, or the choice is not one of the
        listed numbers.
    """
    if titles is None:
        titles = df3.index

    query = ask('Name of the movie for which you want recommendations: ').strip()
    if not query:
        # An empty pattern is a substring of every title and would list them all.
        print('Movie Not Found!')
        return None

    # regex=False: titles and queries contain characters such as "(" and "?" that
    # must be matched literally, not interpreted as regular-expression syntax.
    # case=False: capitalisation of the query does not matter, so
    # "lord of the rings" still finds "Lord of the Rings, ...".
    mask = titles.str.contains(query, case=False, regex=False, na=False)
    matches = list(titles[mask])

    if not matches:
        print('Movie Not Found!')
        return None

    # Only ask the follow-up question once there is something to choose from.
    print('Which one did you mean (give the number) : \n')
    for number, title in enumerate(matches, start=1):
        print(f'{number}. {title}')

    try:
        choice = int(ask())
    except ValueError:
        choice = 0

    # Reject 0 and negative numbers explicitly: matches[choice - 1] would otherwise
    # wrap around and silently select a title from the end of the list.
    if not 1 <= choice <= len(matches):
        print('Invalid selection!')
        return None

    return titles.get_loc(matches[choice - 1])

Creating a function which produces the output

In [14]:
def out(movie_pos=None, n_recommendations=5, model=None, ratings=None):
    """Print the movies most similar to the selected one.

    Parameters
    ----------
    movie_pos : int, optional
        Row position of the query movie in ``ratings``. Defaults to the
        notebook-level variable ``m``.
    n_recommendations : int, optional
        Maximum number of recommendations to print (default 5).
    model : object with a ``kneighbors`` method, optional
        Fitted nearest-neighbour model. Defaults to ``m_knn``. It is expected
        to have been fitted on the rows of ``ratings``.
    ratings : pandas.DataFrame, optional
        Rating matrix with one row per title, titles as the index. Defaults
        to ``df3``.

    Returns
    -------
    None
        The recommendations are printed, one per line.
    """
    if movie_pos is None:
        movie_pos = m
    if model is None:
        model = m_knn
    if ratings is None:
        ratings = df3

    query = ratings.iloc[movie_pos, :].values.reshape(1, -1)

    # Request one extra neighbour because the query movie itself is among the
    # candidate rows, but never more neighbours than there are rows.
    k = min(n_recommendations + 1, len(ratings))
    distances, indices = model.kneighbors(query, n_neighbors=k)

    print(f'Recommendations for {ratings.index[movie_pos]} : \n')

    # Remove the query movie by its index instead of assuming it is the first
    # neighbour: with tied distances it can be returned at a later position.
    neighbours = [
        (idx, dist)
        for dist, idx in zip(distances.flatten(), indices.flatten())
        if idx != movie_pos
    ]

    for rank, (idx, dist) in enumerate(neighbours[:n_recommendations], start=1):
        print(f'{rank}: {ratings.index[idx]}, with distances of {dist}')

In [None]:
# Ask for a movie; only print recommendations when a title was actually selected.
m = inp()
if m is not None:
    out()