In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [80]:
# Load the movie catalogue. Only movieId and title are needed downstream,
# so the genres column is discarded.
movies_df = pd.read_csv('movies.csv').drop(columns=['genres'])
# Strip a trailing "(YYYY)" (or "(YYYY-YYYY)") release-year suffix from the title.
# An anchored regex is used instead of chopping a fixed 7 characters, so titles
# that have no year suffix, or that carry trailing whitespace after it, are not
# truncated.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\s*\(\d{4}(?:[-–]\d{4})?\)\s*$', '', regex=True)
    .str.strip()
)
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
2,3,Grumpier Old Men
3,4,Waiting to Exhale
4,5,Father of the Bride Part II


In [81]:
# Load every user's ratings into a DataFrame. The timestamp column is dropped
# right after reading; the remaining columns are userId, movieId and rating.
ratings_df = pd.read_csv('ratings.csv').drop(columns=['timestamp'])
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [82]:
# Ratings supplied by the target user, keyed by movie title.
userInput = [
    {'title': 'Breakfast Club, The', 'rating': 5},
    {'title': 'Toy Story', 'rating': 3.5},
    {'title': 'Jumanji', 'rating': 2},
    {'title': 'Pulp Fiction', 'rating': 5},
    {'title': 'Akira', 'rating': 4.5},
]
# Look up each title's movieId in the catalogue (inner join on title) and
# order the result by movieId.
inputMovies = (
    pd.DataFrame(userInput)
    .merge(movies_df, on='title')
    .sort_values(['movieId'], ascending=True)
)
inputMovies.head()

Unnamed: 0,rating,title,movieId
1,3.5,Toy Story,1
2,2.0,Jumanji,2
3,5.0,Pulp Fiction,296
4,4.5,Akira,1274
0,5.0,"Breakfast Club, The",1968


In [83]:
# Keep only the ratings of movies that the target user has also rated.
rated_by_target = ratings_df['movieId'].isin(inputMovies['movieId'])
subset_rating = ratings_df[rated_by_target]
subset_rating.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [84]:
# Group the overlapping ratings per user. The grouping key is passed as a
# scalar label (not a one-element list) so that iterating the groups yields the
# plain userId; with a list, pandas >= 2.0 yields 1-tuples such as (75,), which
# would later end up in the 'userId' column and break the merge on userId.
group_subset = subset_rating.groupby('userId')
# Rank users by how many of the target user's movies they have rated
# (largest overlap first) and keep the 100 users with the most overlap.
group_subset = sorted(group_subset, key=lambda pair: len(pair[1]), reverse=True)
group_subset = group_subset[:100]
print(group_subset[0:5])

[(75,       userId  movieId  rating
7507      75        1     5.0
7508      75        2     3.5
7540      75      296     5.0
7633      75     1274     4.5
7673      75     1968     5.0), (106,       userId  movieId  rating
9083     106        1     2.5
9084     106        2     3.0
9115     106      296     3.5
9198     106     1274     3.0
9238     106     1968     3.5), (686,        userId  movieId  rating
61336     686        1     4.0
61337     686        2     3.0
61377     686      296     4.0
61478     686     1274     4.0
61569     686     1968     5.0), (815,        userId  movieId  rating
73747     815        1     4.5
73748     815        2     3.0
73922     815      296     5.0
74362     815     1274     3.0
74678     815     1968     4.5), (1040,        userId  movieId  rating
96689    1040        1     3.0
96690    1040        2     1.5
96733    1040      296     3.5
96859    1040     1274     3.0
96922    1040     1968     4.0)]


In [85]:
# Pearson correlation between the target user's ratings and each candidate
# user's ratings over the movies both have rated.
from scipy.stats import pearsonr

userId = []
pearson = []
target_ratings = inputMovies[['movieId', 'rating']]
for group in group_subset:
    # Join on movieId so the two rating vectors are paired movie-by-movie,
    # rather than relying on both frames happening to share the same row order
    # and the same number of rows.
    paired = group[1].merge(target_ratings, on='movieId',
                            suffixes=('_user', '_target'))
    # pearsonr needs at least two points and is undefined (NaN) when either
    # vector is constant; such users carry no usable similarity, so skip them.
    if (len(paired) < 2
            or paired['rating_user'].nunique() < 2
            or paired['rating_target'].nunique() < 2):
        continue
    userId.append(group[0])
    pearson.append(pearsonr(paired['rating_target'], paired['rating_user'])[0])

# Keep the 50 users with the highest correlation to the target user.
similarity_index = pd.DataFrame({'userId': userId, 'similarity': pearson})
similarity_index = similarity_index.sort_values(['similarity'], ascending=False)
similarity_index = similarity_index[:50]
similarity_index.head()

Unnamed: 0,similarity,userId
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040


In [86]:
# Attach every rating made by the most similar users (inner join on userId).
score_matrix = similarity_index.merge(ratings_df, on='userId', how='inner')
# Drop the movies the target user has already rated; only unseen movies are
# candidates for recommendation. The mask is negated vectorially with ~, and
# the result is an explicit copy so that adding columns to it later does not
# raise SettingWithCopyWarning.
already_rated = score_matrix['movieId'].isin(inputMovies['movieId'])
score_matrix = score_matrix[~already_rated].copy()
score_matrix.head(10)

Unnamed: 0,similarity,userId,movieId,rating
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5
5,0.961678,12325,7,3.0
6,0.961678,12325,10,3.0
7,0.961678,12325,11,2.5
8,0.961678,12325,17,4.0
9,0.961678,12325,19,1.0
10,0.961678,12325,21,3.5
11,0.961678,12325,31,2.0


In [87]:
# Weight each neighbour's rating by that neighbour's similarity to the target.
# .assign builds a new frame instead of writing a column into a filtered slice,
# which avoids SettingWithCopyWarning.
score_matrix = score_matrix.assign(
    SimXrating=score_matrix.similarity * score_matrix.rating
)
# Sum per movie. Note that this sums every column, so the resulting 'userId'
# and 'rating' columns are plain sums and carry no meaning on their own.
group_score_matrix = score_matrix.groupby('movieId').sum()
print(group_score_matrix.head())
# final_score is the similarity-weighted average rating:
#   sum(similarity * rating) / sum(similarity)
# It is only well defined when the total weight is positive; a zero or negative
# total would produce inf/NaN or a sign-flipped score that could sort to the
# top, so those movies are excluded before ranking.
group_score_matrix = (
    group_score_matrix[group_score_matrix.similarity > 0]
    .assign(final_score=lambda g: g.SimXrating / g.similarity)
    .sort_values(['final_score'], ascending=False)
)

         similarity  userId  rating  SimXrating
movieId                                        
3         10.253981  151444    35.0   27.254477
4          0.929294   12120     3.0    2.787882
5         11.723262  142707    36.0   27.151751
6         23.072783  330367   116.0   86.609760
7          9.577335  126797    34.0   27.321140


In [88]:
# Bring the movie titles back in by joining the scored movies with the
# catalogue on movieId, then show the ten highest-ranked rows.
group_score_matrix = pd.merge(group_score_matrix, movies_df, on='movieId')
group_score_matrix.head(10)

Unnamed: 0,movieId,similarity,userId,rating,SimXrating,final_score,title
0,6660,0.602018,10863,5.0,3.010092,5.0,"Red Shoes, The"
1,7122,0.692179,9358,5.0,3.460894,5.0,King of Hearts
2,98981,0.943456,1040,5.0,4.717282,5.0,"Arrival of a Train, The"
3,26094,0.716115,16456,5.0,3.580574,5.0,"Eclisse, L' (Eclipse)"
4,9018,0.877058,17897,5.0,4.38529,5.0,Control Room
5,8684,0.602018,10863,5.0,3.010092,5.0,"Man Escaped, A (Un condamné à mort s'est écha..."
6,1695,0.586009,12916,5.0,2.930045,5.0,Artemisia
7,99917,0.716115,14984,5.0,3.580574,5.0,Upstream Color
8,8338,0.602018,10863,5.0,3.010092,5.0,Black Narcissus
9,100106,0.602018,10863,5.0,3.010092,5.0,"Pervert's Guide to Ideology, The"
