In [153]:
import pandas as pd
from math import sqrt
import numpy as np

In [154]:
movies_df = pd.read_csv('data2/movies.csv')
ratings_df = pd.read_csv('data2/ratings.csv')
tags_df = pd.read_csv('data2/tags.csv')
print(movies_df.info())
print(ratings_df.info())
print(tags_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    36

In [155]:
userInput = [{'title':'Jumanji (1995)', 'rating':3},
             {'title':'Sabrina (1995)','rating':3.5},
             {'title':'Ace Ventura: When Nature Calls (1995)','rating':4},
             {'title':'Clueless (1995)','rating':4.5},
             {'title':'Mortal Kombat (1995)','rating':4}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                   title  rating
0                         Jumanji (1995)     3.0
1                         Sabrina (1995)     3.5
2  Ace Ventura: When Nature Calls (1995)     4.0
3                        Clueless (1995)     4.5
4                   Mortal Kombat (1995)     4.0


In [156]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                                  title  rating
0        2                         Jumanji (1995)     3.0
1        7                         Sabrina (1995)     3.5
2       19  Ace Ventura: When Nature Calls (1995)     4.0
3       39                        Clueless (1995)     4.5
4       44                   Mortal Kombat (1995)     4.0


In [157]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
2           110     110        110
7            54      54         54
19           88      88         88
39          104     104        104
44           46      46         46


In [158]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[((68,),        userId  movieId  rating   timestamp
10361      68        2     2.5  1158532776
10365      68        7     2.0  1230498124
10371      68       19     1.5  1158532448
10376      68       39     4.0  1158532000
10377      68       44     3.0  1158534993), ((117,),        userId  movieId  rating  timestamp
18349     117        2     3.0  844163002
18353     117        7     4.0  844163615
18357     117       19     2.0  844162892
18364     117       39     3.0  844162955
18366     117       44     2.0  844163037), ((599,),        userId  movieId  rating   timestamp
92624     599        2     2.5  1498514085
92627     599        7     2.5  1498514161
92636     599       19     3.0  1498524930
92645     599       39     3.0  1498525783
92649     599       44     2.5  1498517161), ((19,),       userId  movieId  rating  timestamp
2275      19        2     3.0  965704331
2277      19        7     2.0  965706657
2282      19       19     2.0  965708339
2285      19       44     3

In [159]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    

In [160]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF['userId'] = pearsonDF['userId'].apply(lambda x: int(str(x)[1:-2]))
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.501557      68
1        -0.366900     117
2         0.720577     599
3        -0.301511      19
4         0.000000      58


In [161]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
28              1.0     274
82              1.0     434
67              1.0     230
66              1.0     222
64              1.0     200


In [162]:
print(topUsers['userId'].dtype)
print(ratings_df['userId'].dtype)

int64
int64


In [163]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     274        1     4.0  1171410158
1               1.0     274        2     3.5  1171934785
2               1.0     274        6     4.0  1197022122
3               1.0     274        8     3.0  1172030892
4               1.0     274       10     4.0  1171428459
..              ...     ...      ...     ...         ...
95              1.0     274      520     3.5  1171943669
96              1.0     274      527     4.0  1171758666
97              1.0     274      541     2.5  1171409458
98              1.0     274      542     3.5  1172023905
99              1.0     274      543     2.5  1171511510

[100 rows x 5 columns]


In [164]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     274        1     4.0  1171410158             4.0
1              1.0     274        2     3.5  1171934785             3.5
2              1.0     274        6     4.0  1197022122             4.0
3              1.0     274        8     3.0  1172030892             3.0
4              1.0     274       10     4.0  1171428459             4.0


In [165]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  14.251271           55.315059
2                  15.590638           44.625323
3                   3.992649           13.101590
4                   0.000000            0.000000
5                   4.849028           13.170512


In [166]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.881412        1
2                                     2.862315        2
3                                     3.281428        3
4                                          NaN        4
5                                     2.716114        5
6                                     3.728473        6
7                                     2.626354        7
8                                     3.000000        8
9                                     1.500000        9
10                                    3.496012       10


In [167]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
5292                                       5.0     5292
1238                                       5.0     1238
26528                                      5.0    26528
93008                                      5.0    93008
93320                                      5.0    93320
...                                        ...      ...
182715                                     NaN   182715
184245                                     NaN   184245
188675                                     NaN   188675
188833                                     NaN   188833
189381                                     NaN   189381

[6085 rows x 2 columns]


In [168]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                      title  \
0           1                           Toy Story (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
5           6                                Heat (1995)   
...       ...                                        ...   
9695   184791  Fred Armisen: Standup for Drummers (2018)   
9710   187595             Solo: A Star Wars Story (2018)   
9714   188675                              Dogman (2018)   
9717   188833      The Man Who Killed Don Quixote (2018)   
9721   189381                            SuperFly (2018)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                                          Comedy  
5                  