In [162]:
import pandas as pd
from math import sqrt
import numpy as np

In [163]:
movies_df = pd.read_csv('Netflix_Dataset_Movie.csv')
Ratings_df = pd.read_csv('Netflix_Dataset_Rating.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Movie_ID  17770 non-null  int64 
 1   Year      17770 non-null  int64 
 2   Name      17770 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.6+ KB
None


In [164]:
userInput = [{'Name':'Dragonheart', 'Rating':5},
             {'Name':'Sea of Love', 'Rating':1},
             {'Name':'The Color of Money', 'Rating':1},
             {'Name':'Superstar', 'Rating':5},
             {'Name':'Scandal', 'Rating':4.5},
             {'Name':"National Lampoon's Van Wilder", 'Rating':1}]

inputMovies = pd.DataFrame(userInput)
print(inputMovies)


                            Name  Rating
0                    Dragonheart     5.0
1                    Sea of Love     1.0
2             The Color of Money     1.0
3                      Superstar     5.0
4                        Scandal     4.5
5  National Lampoon's Van Wilder     1.0


In [165]:

inputId = movies_df[movies_df['Name'].isin(inputMovies['Name'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('Year', 1)
inputMovies = inputMovies[['Movie_ID','Name','Rating']]
print(inputMovies)

   Movie_ID                           Name  Rating
0        58                    Dragonheart     5.0
1       110                        Scandal     4.5
2       599                    Sea of Love     1.0
3      1502                      Superstar     5.0
4      1509  National Lampoon's Van Wilder     1.0
5      2139             The Color of Money     1.0


  inputMovies = inputMovies.drop('Year', 1)


In [166]:
userSubset = Ratings_df[Ratings_df['Movie_ID'].isin(inputMovies['Movie_ID'].tolist())]
print(userSubset.groupby('Movie_ID').count())

          User_ID  Rating
Movie_ID                 
58          16116   16116
110          1863    1863
599          8928    8928
1502         8736    8736
1509        32975   32975
2139        20496   20496


In [167]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['User_ID'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


[(16272,          User_ID  Rating  Movie_ID
188089     16272       4        58
263599     16272       3       110
2310897    16272       3       599
5386753    16272       1      1502
5414831    16272       4      1509
7971646    16272       3      2139), (64765,          User_ID  Rating  Movie_ID
188600     64765       2        58
263643     64765       2       110
2311126    64765       4       599
5386984    64765       2      1502
5415562    64765       1      1509
7972174    64765       4      2139), (151004,          User_ID  Rating  Movie_ID
192823    151004       4        58
264164    151004       3       110
2313496   151004       4       599
5389269   151004       1      1502
5424346   151004       5      1509
7977645   151004       3      2139), (303948,          User_ID  Rating  Movie_ID
179034    303948       4        58
262593    303948       3       110
2305911   303948       4       599
5381809   303948       4      1502
5396274   303948       4      1509
7960232   3039

In [168]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='Movie_ID')
    inputMovies = inputMovies.sort_values(by='Movie_ID')

    #Get the N for the formula
    nRatings = len(group)

    # #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['Movie_ID'].isin(group['Movie_ID'].tolist())]

    # #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['Rating'].tolist()
   
    # #Let's also put the current user group reviews in a list format
    tempGroupList = group['Rating'].tolist()
   
    
    # #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [169]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['User_ID'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())


   similarityIndex  User_ID
0        -0.346518    16272
1        -0.445532    64765
2        -0.544089   151004
3        -0.368048   303948
4        -0.607097   305344


In [170]:

topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())



    similarityIndex  User_ID
54         0.666667    44434
70         0.645497   166041
85         0.612372   290916
96         0.612372   394895
82         0.612372   267378


In [171]:

topUsersRating=topUsers.merge(Ratings_df, left_on='User_ID', right_on='User_ID', how='inner')
print(topUsersRating.head(100))

    similarityIndex  User_ID  Rating  Movie_ID
0          0.666667    44434       1        26
1          0.666667    44434       2        30
2          0.666667    44434       2        46
3          0.666667    44434       2        58
4          0.666667    44434       2        78
..              ...      ...     ...       ...
95         0.666667    44434       2      1236
96         0.666667    44434       3      1255
97         0.666667    44434       1      1289
98         0.666667    44434       4      1300
99         0.666667    44434       2      1305

[100 rows x 4 columns]


In [172]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['Rating']
print(topUsersRating.head())


   similarityIndex  User_ID  Rating  Movie_ID  weightedRating
0         0.666667    44434       1        26        0.666667
1         0.666667    44434       2        30        1.333333
2         0.666667    44434       2        46        1.333333
3         0.666667    44434       2        58        1.333333
4         0.666667    44434       2        78        1.333333


In [173]:

#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('Movie_ID').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())


          sum_similarityIndex  sum_weightedRating
Movie_ID                                         
3                    0.292617            0.710645
8                    1.895848            4.929304
16                   2.121550            7.623800
17                   1.062291            4.215310
18                   4.957048           18.969511


In [174]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['Movie_ID'] = tempTopUsersRating.index

print(recommendation_df.head(10))

          weighted average recommendation score  Movie_ID
Movie_ID                                                 
3                                      2.428589         3
8                                      2.600053         8
16                                     3.593505        16
17                                     3.968133        17
18                                     3.826775        18
26                                     2.283243        26
28                                     4.021103        28
30                                     3.980418        30
32                                     4.316373        32
33                                     1.907859        33


In [175]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df = recommendation_df.head(-1)
print(recommendation_df)


          weighted average recommendation score  Movie_ID
Movie_ID                                                 
2423                                   8.133141      2423
3529                                   7.844890      3529
1481                                   7.518619      1481
1476                                   5.186384      1476
3456                                   5.049442      3456
...                                         ...       ...
4237                                  -0.631917      4237
1604                                  -1.249197      1604
2889                                  -1.249197      2889
4387                                  -1.837474      4387
2944                                  -3.185202      2944

[1349 rows x 2 columns]


In [176]:
recommended_movie=movies_df.loc[movies_df['Movie_ID'].isin(recommendation_df['Movie_ID'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.Movie_ID.isin(userSubset['Movie_ID'])]

print(recommended_movie)

      Movie_ID  Year                        Name
2            3  1997                   Character
7            8  2004  What the #$*! Do We Know!?
15          16  1996                   Screamers
16          17  2005                   7 Seconds
17          18  1994            Immortal Beloved
...        ...   ...                         ...
4487      4488  2000                 Wonder Boys
4489      4490  2004                   Ned Kelly
4491      4492  2004                  Club Dread
4492      4493  2003           Ju-on: The Grudge
4495      4496  1993       Farewell My Concubine

[1343 rows x 3 columns]


In [177]:
#Check wether the reccomended movie is wrong or no
#if the the movie id is nan in the reccomended has value in the reccomended_movie. It means it is wrong (Stored all)
print(recommended_movie[recommended_movie['Movie_ID'] == 1111]) #CORRECT
print()
print(recommended_movie[recommended_movie['Movie_ID'] == 1189]) #NO
print()
print(recommended_movie[recommended_movie['Movie_ID'] == 4489]) #NO
print()
print(recommended_movie[recommended_movie['Movie_ID'] == 4490]) #CORRECT

#Just checking
#Not really important

      Movie_ID  Year                Name
1110      1111  1972  Cries and Whispers

Empty DataFrame
Columns: [Movie_ID, Year, Name]
Index: []

Empty DataFrame
Columns: [Movie_ID, Year, Name]
Index: []

      Movie_ID  Year       Name
4489      4490  2004  Ned Kelly
