## COLLABORATIVE FILTERING USING PEARSON SIMILARITY

In [None]:
# import the necessary libraries
import numpy as np
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt

In [None]:
# Load the movies table and the ratings table from their CSV files
movie = pd.read_csv(filepath_or_buffer='/content/movies.csv')
rating = pd.read_csv(filepath_or_buffer='/content/ratings.csv')

In [None]:
# Preview the first rows of the movies table (movieId, title, genres)
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp)
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
import re

In [None]:
# Helper used to strip the "(year)" suffix from a movie title
def remove_year_from_title(title):
    """Return ``title`` with each whitespace-prefixed ``(digits)`` group removed.

    Every occurrence of one whitespace character followed by a parenthesised
    run of digits is deleted, wherever it appears in the string and however
    many digits it contains. A parenthesised number with no whitespace in
    front of it (for example at the very start of the title) is kept.
    """
    parenthesised_number = re.compile(r'\s\(\d+\)')
    return parenthesised_number.sub('', title)

In [None]:
# Pull the four-digit release year out of the title, e.g. "Toy Story (1995)" -> "1995".
# The pattern is a raw string so the backslashes reach the regex engine untouched
# (non-raw '\(' and '\d' are invalid string escapes and trigger warnings on recent
# Python versions). The capture group sits inside the parentheses, so a single
# extract yields the bare digits and no second pass is needed.
movie['year'] = movie['title'].str.extract(r'\((\d{4})\)', expand=False)

#Removing the years from the 'title' column
movie['title'] = movie['title'].apply(remove_year_from_title)
#Stripping any leading/trailing whitespace that may be left over after the removal
movie['title'] = movie['title'].str.strip()

In [None]:
# Preview the movies table again: 'title' no longer carries the year,
# which now lives in its own 'year' column
movie.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [None]:
# The genres column is not used by the recommender, so remove it in place
movie.drop('genres', axis=1, inplace=True)

In [None]:
# The timestamp column is not used by the recommender, so remove it in place
rating.drop(labels=['timestamp'], axis='columns', inplace=True)

In [None]:
# Preview the ratings table after the timestamp column has been dropped
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
# Ratings supplied by the person we want recommendations for, one entry per movie.
# Titles are written without the year so they match the cleaned 'title' column.
user = [
    {'title': movieTitle, 'rating': score}
    for movieTitle, score in (
        ('Breakfast Club, The', 4),
        ('Toy Story', 2.5),
        ('Jumanji', 3),
        ("Pulp Fiction", 4.5),
        ('Akira', 5),
    )
]
inputMovie = pd.DataFrame(data=user)

In [None]:
# Show the input user's ratings as a DataFrame
inputMovie

Unnamed: 0,title,rating
0,"Breakfast Club, The",4.0
1,Toy Story,2.5
2,Jumanji,3.0
3,Pulp Fiction,4.5
4,Akira,5.0


In [None]:
# Catalogue rows whose title matches one of the titles the input user rated
Id = movie.loc[movie['title'].isin(inputMovie['title'])]
# Join those rows with the input ratings to pick up each movieId. No key is
# given, so pandas joins on the columns the two frames have in common; the
# 'year' column is not needed afterwards and is dropped straight away.
inputMovie = Id.merge(inputMovie).drop("year", axis=1)

In [None]:
# Keep only the ratings given to movies that the input user has rated as well
users = rating.loc[rating['movieId'].isin(inputMovie['movieId'])]
users.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0


In [None]:
# (rows, columns) of the filtered ratings table
users.shape

(784, 3)

In [None]:
# groupby splits the filtered ratings into one sub-DataFrame per userId.
# NOTE: the key is passed as a one-element list. The group names shown in the
# output further down are 1-tuples such as (91,), and the later merge step
# unwraps them with x[0], so do not change this to a plain string key without
# updating that step too.
userSubsetGroup = users.groupby(['userId'])

In [None]:
#showing one such group example by getting all the rows of a particular userId
# userSubsetGroup.get_group(1130)

In [None]:
# Order the (group key, ratings) pairs by number of rated rows, largest first,
# so users sharing the most movies with the input user come first
userSubsetGroup = sorted(
    userSubsetGroup,
    key=lambda entry: entry[1].shape[0],
    reverse=True,
)

In [None]:
# Peek at the three groups with the most movies in common
userSubsetGroup[:3]

[((91,),
         userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0),
 ((177,),
         userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5),
 ((219,),
         userId  movieId  rating
  31524     219        1     3.5
  31525     219        2     2.5
  31554     219      296     4.0
  31628     219     1274     2.5
  31680     219     1968     3.0)]

In [None]:
# Only the first 100 groups are compared against the input user
userSubsetGroup = userSubsetGroup[:100]

In [None]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorDict = {}

# Sort the input ratings once, up front: the ordering does not depend on the
# user being compared, so it does not need to be redone on every iteration.
inputMovie = inputMovie.sort_values(by='movieId')
inputRatings = inputMovie[['movieId', 'rating']]

#For every user group in our subset
for name, group in userSubsetGroup:
    # Pair the two users' ratings explicitly by movieId. An inner merge keeps
    # only movies rated by both and guarantees each input rating sits next to
    # the matching user rating, instead of relying on two independently built
    # lists happening to have the same length and order.
    paired = group.sort_values(by='movieId')[['movieId', 'rating']].merge(
        inputRatings, on='movieId', suffixes=('_user', '_input')
    )
    #Get the N for the formula: the number of movies rated by both users
    n = len(paired)
    if n == 0:
        # Nothing in common, so there is nothing to correlate
        pearsonCorDict[name] = 0
        continue
    #The input user's ratings (x) and the current user's ratings (y), aligned row by row
    tempRatingList = paired['rating_input'].tolist()
    tempGroupList = paired['rating_user'].tolist()
    #calculating the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(n)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(n)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(n)

    #Only divide when both variances are strictly positive: a zero variance means
    #the correlation is undefined, and a tiny negative value produced by rounding
    #would otherwise make sqrt() fail. In both cases fall back to 0 correlation.
    if Sxx > 0 and Syy > 0:
        pearsonCorDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorDict[name] = 0


In [None]:
# Inspect the computed (group key, correlation) pairs
pearsonCorDict.items()

dict_items([((91,), 0.5796011559684829), ((177,), -0.5187513759338123), ((219,), -0.05547950410914763), ((274,), 0.48424799847909017), ((298,), 0.8329565184432136), ((414,), 0.48993185504860093), ((474,), -0.48993185504860093), ((477,), 0.835703992326648), ((480,), 0.9644856443408245), ((483,), 0.0), ((599,), 0.9007334537569819), ((608,), 0.8367179328930429), ((50,), 0.6172133998483676), ((57,), -0.9669875568304563), ((68,), -0.22360679774997896), ((103,), 0.7302967433402214), ((135,), 0.3651483716701107), ((182,), 0.9258200997725514), ((202,), 0.3651483716701107), ((217,), -0.31622776601683794), ((226,), 0.848528137423857), ((288,), 0.26519741765271837), ((307,), 0.5786913866044946), ((318,), 0.8783100656536799), ((322,), 0.5786913866044946), ((330,), 0.42799248836102016), ((357,), 0.0), ((434,), 0.7407610636824496), ((448,), 0.0), ((469,), 0.2672612419124244), ((561,), 0.7302967433402214), ((600,), 0.5329480400990121), ((606,), 0.8233293074216317), ((610,), 0.0), ((18,), 0.7205766921

In [None]:
# Build a DataFrame holding the similarity index and the user id of every compared user.
# orient='index' makes each dictionary key a row label and its value the single column.
pearsonDF = pd.DataFrame.from_dict(pearsonCorDict, orient='index')
pearsonDF.columns = ['similarityIndex']
# Copy the row labels (the dictionary keys) into a regular 'userId' column.
# NOTE: as the output below shows, these values are still 1-tuples such as (91,),
# not plain integers.
pearsonDF['userId'] = pearsonDF.index
# Replace the row labels with a plain 0..n-1 range
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.579601,"(91,)"
1,-0.518751,"(177,)"
2,-0.05548,"(219,)"
3,0.484248,"(274,)"
4,0.832957,"(298,)"


In [None]:
# Rank users by similarity (highest first) and keep the first 50
topUsers = pearsonDF.sort_values('similarityIndex', ascending=False).iloc[:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
72,1.0,"(381,)"
54,0.995871,"(232,)"
42,0.981981,"(122,)"
76,0.981981,"(425,)"
70,0.970725,"(373,)"


In [None]:
# topUsers was obtained by slicing another DataFrame; take an explicit copy so
# the column assignment below modifies our own data and does not trigger
# pandas' SettingWithCopyWarning.
topUsers = topUsers.copy()
# The user ids recorded so far are 1-tuples such as (91,), while rating['userId']
# holds plain integers. Unwrap the tuples so the two columns can be joined;
# values that are already scalars pass through unchanged.
topUsers["userId"] = topUsers["userId"].apply(lambda x: x[0] if isinstance(x, tuple) else x)

# merging the dataframes
print(topUsers.shape)
# Inner join: keep only ratings made by the selected top users. An outer join
# would also pull in every rating from every other user, with a missing
# similarityIndex, which bloats the frame and yields NaN scores for movies that
# none of the top users rated.
topUsersRating = pd.merge(topUsers, rating, on='userId', how="inner")

(50, 2)


In [None]:
# Weight every rating by how similar its author is to the input user
topUsersRating['weightedRating'] = topUsersRating['rating'].mul(topUsersRating['similarityIndex'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,381,1,3.5,3.5
1,1.0,381,2,4.0,4.0
2,1.0,381,19,2.5,2.5
3,1.0,381,32,3.5,3.5
4,1.0,381,34,3.5,3.5


In [None]:
# For each movieId, total the similarity weights and the similarity-weighted ratings
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30.995236,102.602205
2,26.43198,83.750962
3,7.718315,20.176568
4,0.532948,0.799422
5,6.899552,20.812054


In [None]:
# Similarity-weighted average rating per movie:
# (sum of weighted ratings) / (sum of similarity weights)
recommendation_df = (
    tempTopUsersRating['sum_weightedRating']
    .div(tempTopUsersRating['sum_similarityIndex'])
    .to_frame('weighted average recommendation score')
)
# Expose the movieId (currently the row label) as a regular column too
recommendation_df['movieId'] = recommendation_df.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.310257,1
2,3.168547,2
3,2.614116,3
4,1.5,4
5,3.016435,5


In [None]:
# Order the movies by weighted average recommendation score, best first
recommendation_df = recommendation_df.sort_values(
    'weighted average recommendation score',
    ascending=False,
)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4117,5.0,4117
3951,5.0,3951
6442,5.0,6442
2457,5.0,2457
290,5.0,290
3606,5.0,3606
5485,5.0,5485
6159,5.0,6159
5416,5.0,5416
5328,5.0,5328


In [None]:
# Look up the catalogue rows of the ten highest-scoring movies.
# The rows come back in the order they appear in `movie`, not in score order.
movie[movie['movieId'].isin(recommendation_df['movieId'].head(10))]

Unnamed: 0,movieId,title,year
251,290,Once Were Warriors,1994
1848,2457,Running Scared,1986
2690,3606,On the Town,1949
2947,3951,Two Family House,2000
3068,4117,Hope and Glory,1987
3807,5328,Rain,2001
3852,5416,Cherish,2002
3905,5485,Tadpole,2002
4231,6159,All the Real Girls,2003
4390,6442,Belle époque,1992
