In [2]:
import pandas as pd
from math import sqrt
import numpy as np

In [4]:
# Read csv files
movies_df = pd.read_csv('ml-latest-small/movies.csv')
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')

In [13]:
movies_df.head(20)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [8]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [10]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [17]:
# Added userInput of movie ratings
userInput = [{'title':'Toy Story (1995)', 'rating':4},
             {'title':'Jumanji (1995)', 'rating':3},
             {'title':'GoldenEye (1995)', 'rating':5},
             {'title':'Money Train (1995)', 'rating':4},
             {'title':'Casino (1995)', 'rating':4.5}]

inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                title  rating
0    Toy Story (1995)     4.0
1      Jumanji (1995)     3.0
2    GoldenEye (1995)     5.0
3  Money Train (1995)     4.0
4       Casino (1995)     4.5


In [21]:
# Added movieID column
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]

print(inputMovies)

   movieId               title  rating
0        1    Toy Story (1995)     4.0
1        2      Jumanji (1995)     3.0
2       10    GoldenEye (1995)     5.0
3       16       Casino (1995)     4.5
4       20  Money Train (1995)     4.0


In [22]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]

print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1           215     215        215
2           110     110        110
10          132     132        132
16           82      82         82
20           15      15         15


In [23]:
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(274,        userId  movieId  rating   timestamp
39229     274        1     4.0  1171410158
39230     274        2     3.5  1171934785
39233     274       10     4.0  1171428459
39235     274       16     4.5  1171493420
39237     274       20     3.5  1171830022), (448,        userId  movieId  rating   timestamp
68655     448        1     5.0  1019126661
68656     448        2     3.0  1019125424
68659     448       10     4.0  1019124400
68661     448       16     5.0  1019138531
68663     448       20     3.0  1019124922), (599,        userId  movieId  rating   timestamp
92623     599        1     3.0  1498524204
92624     599        2     2.5  1498514085
92629     599       10     3.5  1498500281
92633     599       16     3.0  1498523389
92637     599       20     1.5  1498504813), (68,        userId  movieId  rating   timestamp
10360      68        1     2.5  1158531426
10361      68        2     2.5  1158532776
10366      68       10     4.5  1158531612
10368      68       16  

  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [24]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    

In [25]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))

print(pearsonDF.head())

   similarityIndex  userId
0         0.644658     274
1         0.505650     448
2         0.511237     599
3         0.866400      68
4         0.529150      91


In [26]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]

print(topUsers.head())

    similarityIndex  userId
99              1.0     305
95              1.0     266
91              1.0     200
38              1.0     322
86              1.0     169


In [27]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')

print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     305        2     3.5  1460136227
1               1.0     305        6     3.5  1460222104
2               1.0     305       16     4.5  1460136042
3               1.0     305       25     3.0  1518197993
4               1.0     305       32     5.0  1460222250
..              ...     ...      ...     ...         ...
95              1.0     305     1250     5.0  1460305704
96              1.0     305     1252     4.0  1461700582
97              1.0     305     1253     5.0  1460366849
98              1.0     305     1258     5.0  1460135285
99              1.0     305     1259     4.5  1460135385

[100 rows x 5 columns]


In [28]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']

print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     305        2     3.5  1460136227             3.5
1              1.0     305        6     3.5  1460222104             3.5
2              1.0     305       16     4.5  1460136042             4.5
3              1.0     305       25     3.0  1518197993             3.0
4              1.0     305       32     5.0  1460222250             5.0


In [29]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']

print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  28.117221           98.789180
2                  22.052298           60.072981
3                  10.637202           31.698284
5                   7.473909           22.841875
6                  17.566764           70.379313


In [32]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted_average_recommendation_score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index

print(recommendation_df.head(10))

         weighted_average_recommendation_score  movieId
movieId                                                
1                                     3.513476        1
2                                     2.724114        2
3                                     2.979946        3
5                                     3.056215        5
6                                     4.006390        6
7                                     3.198005        7
8                                     3.000000        8
9                                     2.370575        9
10                                    3.898333       10
11                                    3.592875       11


In [34]:
recommendation_df = recommendation_df.sort_values(by='weighted_average_recommendation_score', ascending=False)

print(recommendation_df)

         weighted_average_recommendation_score  movieId
movieId                                                
6345                                       5.0     6345
158027                                     5.0   158027
906                                        5.0      906
55363                                      5.0    55363
55908                                      5.0    55908
...                                        ...      ...
5357                                       NaN     5357
5493                                       NaN     5493
5499                                       NaN     5499
46347                                      NaN    46347
52712                                      NaN    52712

[6373 rows x 2 columns]


In [37]:
recommended_movies = movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

# To prevent recommending the same movie
recommended_movies = recommended_movies.loc[~recommended_movies.movieId.isin(userSubset['movieId'])]

recommended_movies.head()

Unnamed: 0,movieId,title,genres
2,3,Grumpier Old Men (1995),Comedy|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
