In [99]:
import pandas as pd
from math import sqrt
import numpy as np

In [100]:
movies_df = pd.read_csv('Netflix_Dataset_Movie.csv')
Ratings_df = pd.read_csv('Netflix_Dataset_Rating.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Movie_ID  17770 non-null  int64 
 1   Year      17770 non-null  int64 
 2   Name      17770 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.6+ KB
None


In [101]:
userInput = [{'Name':'Dragonheart', 'Rating':5},
             {'Name':'Sea of Love', 'Rating':1},
             {'Name':'The Color of Money', 'Rating':1},
             {'Name':'Superstar', 'Rating':5},
             {'Name':'Scandal', 'Rating':4.5},
             {'Name':'Jumanji', 'Rating':1}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)


                 Name  Rating
0         Dragonheart     5.0
1         Sea of Love     1.0
2  The Color of Money     1.0
3           Superstar     5.0
4             Scandal     4.5
5             Jumanji     1.0


In [102]:

inputId = movies_df[movies_df['Name'].isin(inputMovies['Name'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('Year', 1)
inputMovies = inputMovies[['Movie_ID','Name','Rating']]
print(inputMovies)

   Movie_ID                Name  Rating
0        58         Dragonheart     5.0
1       110             Scandal     4.5
2       599         Sea of Love     1.0
3      1502           Superstar     5.0
4      2139  The Color of Money     1.0
5     15078             Jumanji     1.0


  inputMovies = inputMovies.drop('Year', 1)


In [103]:
userSubset = Ratings_df[Ratings_df['Movie_ID'].isin(inputMovies['Movie_ID'].tolist())]
print(userSubset.groupby('Movie_ID').count())

          User_ID  Rating
Movie_ID                 
58          16116   16116
110          1863    1863
599          8928    8928
1502         8736    8736
2139        20496   20496


In [104]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['User_ID'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


[(16272,          User_ID  Rating  Movie_ID
188089     16272       4        58
263599     16272       3       110
2310897    16272       3       599
5386753    16272       1      1502
7971646    16272       3      2139), (57633,          User_ID  Rating  Movie_ID
193076     57633       2        58
264195     57633       3       110
2313649    57633       3       599
5389398    57633       2      1502
7977980    57633       3      2139), (64765,          User_ID  Rating  Movie_ID
188600     64765       2        58
263643     64765       2       110
2311126    64765       4       599
5386984    64765       2      1502
7972174    64765       4      2139), (74441,          User_ID  Rating  Movie_ID
189146     74441       2        58
263715     74441       3       110
2311488    74441       5       599
5387315    74441       2      1502
7972961    74441       4      2139), (109177,          User_ID  Rating  Movie_ID
191501    109177       3        58
264011    109177       5       110
23127

In [105]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='Movie_ID')
    inputMovies = inputMovies.sort_values(by='Movie_ID')

    #Get the N for the formula
    nRatings = len(group)

    # #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['Movie_ID'].isin(group['Movie_ID'].tolist())]

    # #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['Rating'].tolist()
   
    # #Let's also put the current user group reviews in a list format
    tempGroupList = group['Rating'].tolist()
   
    
    # #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [106]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['User_ID'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())


   similarityIndex  User_ID
0        -0.183915    16272
1        -0.735662    57633
2        -0.995307    64765
3        -0.936209    74441
4        -0.145750   109177


In [107]:

topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())



    similarityIndex  User_ID
10         0.711111   387418
16         0.670402   722591
37         0.627476  1784150
58         0.609499  2485642
59         0.586601  2499884


In [108]:

topUsersRating=topUsers.merge(Ratings_df, left_on='User_ID', right_on='User_ID', how='inner')
print(topUsersRating.head(100))

    similarityIndex  User_ID  Rating  Movie_ID
0          0.711111   387418       2         3
1          0.711111   387418       1         8
2          0.711111   387418       2        16
3          0.711111   387418       3        17
4          0.711111   387418       2        18
..              ...      ...     ...       ...
95         0.711111   387418       2       313
96         0.711111   387418       1       316
97         0.711111   387418       3       329
98         0.711111   387418       2       330
99         0.711111   387418       3       331

[100 rows x 4 columns]


In [109]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['Rating']
print(topUsersRating.head())


   similarityIndex  User_ID  Rating  Movie_ID  weightedRating
0         0.711111   387418       2         3        1.422222
1         0.711111   387418       1         8        0.711111
2         0.711111   387418       2        16        1.422222
3         0.711111   387418       3        17        2.133333
4         0.711111   387418       2        18        1.422222


In [110]:

#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('Movie_ID').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())


          sum_similarityIndex  sum_weightedRating
Movie_ID                                         
3                    0.272229            0.431594
8                    3.066015            8.091749
16                   2.041695            6.836447
17                   0.047184            1.248484
18                   2.357068            9.551250


In [128]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['Movie_ID'] = tempTopUsersRating.index

print(recommendation_df.head(10))

          weighted average recommendation score  Movie_ID
Movie_ID                                                 
3                                      1.585407         3
8                                      2.639174         8
16                                     3.348418        16
17                                    26.459820        17
18                                     4.052174        18
26                                     1.413066        26
28                                     3.229077        28
30                                     4.019718        30
32                                     1.538846        32
33                                     0.193320        33


In [123]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df = recommendation_df.head(-1)
print(recommendation_df)


          weighted average recommendation score  Movie_ID
Movie_ID                                                 
4155                                 192.788832      4155
3404                                  83.851823      3404
1234                                  44.068157      1234
1851                                  35.218273      1851
17                                    26.459820        17
...                                         ...       ...
1111                                 -11.700939      1111
2262                                 -11.804498      2262
2331                                 -15.920065      2331
431                                  -17.853603       431
3812                                 -18.856090      3812

[1343 rows x 2 columns]


In [121]:
recommended_movie=movies_df.loc[movies_df['Movie_ID'].isin(recommendation_df['Movie_ID'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.Movie_ID.isin(userSubset['Movie_ID'])]

print(recommended_movie)

      Movie_ID  Year                        Name
2            3  1997                   Character
7            8  2004  What the #$*! Do We Know!?
15          16  1996                   Screamers
16          17  2005                   7 Seconds
17          18  1994            Immortal Beloved
...        ...   ...                         ...
4487      4488  2000                 Wonder Boys
4489      4490  2004                   Ned Kelly
4491      4492  2004                  Club Dread
4492      4493  2003           Ju-on: The Grudge
4495      4496  1993       Farewell My Concubine

[1339 rows x 3 columns]


In [126]:
#Check wether the reccomended movie is wrong or no
#if the the movie id is nan in the reccomended has value in the reccomended_movie. It means it is wrong (Stored all)
print(recommended_movie[recommended_movie['Movie_ID'] == 1111]) #NO
print()
print(recommended_movie[recommended_movie['Movie_ID'] == 1189]) #NO
print()
print(recommended_movie[recommended_movie['Movie_ID'] == 4489]) #NO
print()
print(recommended_movie[recommended_movie['Movie_ID'] == 4490]) #CORRECT

#Just checking
#Not really important

      Movie_ID  Year                Name
1110      1111  1972  Cries and Whispers

Empty DataFrame
Columns: [Movie_ID, Year, Name]
Index: []

Empty DataFrame
Columns: [Movie_ID, Year, Name]
Index: []

      Movie_ID  Year       Name
4489      4490  2004  Ned Kelly
