In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt


In [2]:
# Read the six CSV files from the working directory, in the order listed,
# binding each resulting DataFrame to its own module-level name.
(genome_scores, genome_tags, links, movies, ratings, tags) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'genome_scores.csv',
        'genome_tags.csv',
        'link.csv',
        'movie.csv',
        'rating.csv',
        'tag.csv',
    )
)


In [3]:
# Preview the first five rows of the movie table as loaded.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first five rows of the ratings table as loaded.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
# Pull the four-digit year out of the first "(YYYY)" found in each title.
# A raw string keeps `\(` and `\d` from being parsed as (invalid) Python string
# escapes, and the inner capture group returns just the digits, so a second
# extraction pass is unnecessary. Titles without a "(YYYY)" get NaN.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)


In [6]:
# Remove every "(YYYY)" from the title and trim surrounding whitespace.
# `regex=True` is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the titles untouched. `.str.strip()` passes
# missing titles through as NaN instead of raising like a plain `x.strip()`.
movies['title'] = (
    movies['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)


In [7]:
# Preview the first five rows to check the cleaned titles and the `year` column.
movies.head(n=5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [8]:
# Drop the `genres` column. Rebinding (rather than `inplace=True`) together with
# `errors='ignore'` makes the cell safe to re-run: a second execution is a
# no-op instead of a KeyError.
movies = movies.drop(columns=['genres'], errors='ignore')

In [9]:
# Preview the first five rows after removing `genres`.
movies.head(n=5)

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [10]:
# Drop the `timestamp` column. Rebinding (rather than `inplace=True`) together
# with `errors='ignore'` makes the cell safe to re-run: a second execution is a
# no-op instead of a KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')

In [11]:
# Preview the first five rows after removing `timestamp`.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [12]:
# Hand-entered ratings for the user we want recommendations for.
# Each entry becomes a {'title': ..., 'rating': ...} record.
userInput = [
    dict(title=name, rating=score)
    for name, score in (
        ('Breakfast Club, The', 5),
        ('Toy Story', 3.5),
        ('Jumanji', 2),
        ('Pulp Fiction', 5),
        ('Akira', 4.5),
    )
]

In [13]:
# Tabulate the hand-entered ratings: one row per record in userInput.
inputMovies = pd.DataFrame(data=userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [14]:
# Catalogue rows whose title exactly matches one of the titles the user rated.
Id = movies[movies['title'].isin(inputMovies['title'].tolist())]
# Attach movieId to each input rating. The join key is spelled out, and only
# title/rating are taken from the input side, so re-running this cell (when
# inputMovies already carries a movieId column) still gives the same result.
inputMovies = pd.merge(Id, inputMovies[['title', 'rating']], on='title')
# `drop('year', 1)` passed the axis positionally, which pandas 2.0 no longer
# accepts; name the column axis explicitly instead.
inputMovies = inputMovies.drop(columns='year')
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [15]:
# Keep every rating row (from any user) whose movie is one of the input movies.
users = ratings.loc[ratings['movieId'].isin(inputMovies['movieId'].tolist())]
users.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
11,1,296,4.0
236,3,1,4.0
451,5,2,3.0
517,6,1,5.0


In [16]:
# (row count, column count) of the filtered ratings.
len(users.index), len(users.columns)

(168730, 3)

In [17]:
# Group the filtered ratings per user. The key is passed as a bare column name,
# not a one-element list: with `['userId']`, pandas >= 2.0 yields tuple keys
# such as `(91,)` when the groups are iterated, and those tuples would then be
# stored as user ids downstream.
userSubsetGroup = users.groupby('userId')
# Spot-check a single user's group.
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
166633,1130,1968,4.0


In [20]:
# Materialise the (userId, ratings-frame) pairs and order them so that users
# with the most rows (i.e. most input movies rated) come first. The sort is
# stable, so equally sized groups keep their original relative order.
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: pair[1].shape[0], reverse=True)


In [19]:
# Show the first three (userId, ratings-frame) pairs.
userSubsetGroup[:3]

[(91,       userId  movieId  rating
  9621      91        1     4.0
  9622      91        2     3.5
  9669      91      296     3.5
  9826      91     1274     2.5
  9903      91     1968     4.0), (294,        userId  movieId  rating
  37452     294        1     4.5
  37453     294        2     4.5
  37504     294      296     4.5
  37648     294     1274     4.5
  37731     294     1968     5.0), (586,        userId  movieId  rating
  81164     586        1     2.5
  81165     586        2     3.0
  81226     586      296     5.0
  81390     586     1274     4.0
  81499     586     1968     3.0)]

In [21]:
# Keep at most the first 100 (userId, ratings-frame) pairs for the similarity step.
userSubsetGroup = userSubsetGroup[:100]

In [40]:
def pearson_similarity(xs, ys):
    """Return the Pearson correlation between two equal-length rating lists.

    Uses the sum-of-squares form ``r = Sxy / sqrt(Sxx * Syy)``.

    Returns 0 when the lists are empty or when the correlation is undefined
    because at least one list has no variance.
    """
    n = len(xs)
    if n == 0:
        return 0
    sum_x, sum_y = sum(xs), sum(ys)
    Sxx = sum(x**2 for x in xs) - pow(sum_x, 2)/float(n)
    Syy = sum(y**2 for y in ys) - pow(sum_y, 2)/float(n)
    Sxy = sum(x*y for x, y in zip(xs, ys)) - sum_x*sum_y/float(n)
    # Floating-point rounding can leave a zero variance as a tiny +/- residue.
    # Requiring a strictly positive product keeps sqrt() from raising on a
    # negative argument and avoids dividing by zero.
    if Sxx*Syy <= 0:
        return 0
    return Sxy/sqrt(Sxx*Syy)


# Sorting the input ratings does not depend on the loop variable; do it once.
inputMovies = inputMovies.sort_values(by='movieId')

# userId -> Pearson similarity between that user's ratings and the input ratings.
pearsonCorDict = {}

for name, group in userSubsetGroup:
    # Pair ratings by movieId explicitly instead of relying on two separately
    # sorted lists lining up position by position (zip would silently truncate
    # if they ever differed in length).
    paired = group.merge(
        inputMovies[['movieId', 'rating']],
        on='movieId',
        suffixes=('_user', '_input'),
    ).sort_values(by='movieId')
    pearsonCorDict[name] = pearson_similarity(
        paired['rating_input'].tolist(),
        paired['rating_user'].tolist(),
    )


In [41]:
# Display every (userId, Pearson similarity) entry collected in pearsonCorDict.
pearsonCorDict.items()

dict_items([(91, -0.08006407690254357), (294, 0.4385290096535115), (586, 0.5393193716300061), (648, 0.6880209161537812), (775, 0.8362420100070908), (812, 0.6016568375961869), (869, 0.1860521018838127), (903, -0.17902871850985827), (1200, 0.5370861555295743), (1244, 0.10963225241337883), (1715, 0.8951435925492911), (1748, 0.8320502943378437), (1763, -0.268543077764787), (1810, 0.8594395636904102), (1813, 0.8347371386380908), (1849, 0.626600514784503), (1864, 0.8320502943378437), (1942, 0.774023530673004), (1984, -0.31803907173309875), (2047, 0.8976095575314932), (2099, -0.4385290096535115), (2367, 0.49334513586020373), (2397, 0), (2515, 0.8951435925492914), (2661, 0.4385290096535153), (2757, 0.7844645405527362), (2959, 0.11720180773462363), (2988, 0.7197795937681559), (3179, 0.29417420270727607), (3218, 0.8503864129218268), (3268, 0.8204126541423654), (3269, 0.8648817040445187), (3318, 0.8790135580096794), (3397, 0.711233325153824), (3487, 0.36544084137792915), (3576, 0.5967623950328603

In [42]:
# Turn the {userId: similarity} mapping into a two-column frame
# (SimilarityIndex, userId) with a fresh 0..n-1 row index.
pearsonDf = (
    pd.DataFrame.from_dict(pearsonCorDict, orient='index')
    .rename(columns={0: 'SimilarityIndex'})
    .rename_axis('userId')
    .reset_index()
    [['SimilarityIndex', 'userId']]
)
pearsonDf.head(n=5)

Unnamed: 0,SimilarityIndex,userId
0,-0.080064,91
1,0.438529,294
2,0.539319,586
3,0.688021,648
4,0.836242,775


In [43]:
# Rank users from most to least similar and keep the first 50 rows.
topUsers = pearsonDf.sort_values('SimilarityIndex', ascending=False).iloc[:50]
topUsers.head(n=5)

Unnamed: 0,SimilarityIndex,userId
89,0.946029,10387
19,0.89761,2047
81,0.895144,9772
23,0.895144,2515
10,0.895144,1715


In [46]:
# Attach every rating made by each of the top users (inner join on userId).
topUserRating = pd.merge(topUsers, ratings, how='inner', on='userId')
topUserRating.head(n=5)


Unnamed: 0,SimilarityIndex,userId,movieId,rating
0,0.946029,10387,1,4.0
1,0.946029,10387,2,3.5
2,0.946029,10387,10,3.0
3,0.946029,10387,11,3.0
4,0.946029,10387,17,3.0


In [49]:
# Weight each rating by the similarity of the user who gave it.
topUserRating['weightedRating'] = topUserRating['rating'].mul(topUserRating['SimilarityIndex'])
topUserRating.head(n=5)

Unnamed: 0,SimilarityIndex,userId,movieId,rating,weightedRating
0,0.946029,10387,1,4.0,3.784115
1,0.946029,10387,2,3.5,3.311101
2,0.946029,10387,10,3.0,2.838086
3,0.946029,10387,11,3.0,2.838086
4,0.946029,10387,17,3.0,2.838086


In [53]:
# Per movie: total similarity of the users who rated it, and the total of
# their similarity-weighted ratings.
tempTopUserRating = (
    topUserRating
    .groupby('movieId')[['SimilarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'SimilarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUserRating.head(n=5)

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.821238,146.424613
2,38.821238,101.191887
3,13.674659,35.392039
4,3.586101,9.326486
5,9.194413,23.109653


In [56]:
# Recommendation score per movie = summed weighted ratings / summed similarity.
# The frame stays indexed by movieId and also carries movieId as a column.
recomm_df = (
    tempTopUserRating['sum_weightedRating']
    .div(tempTopUserRating['sum_similarityIndex'])
    .to_frame('weightedAverageRecommScore')
)
recomm_df['movieId'] = recomm_df.index
recomm_df.head(n=5)

Unnamed: 0_level_0,weightedAverageRecommScore,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.771766,1
2,2.606612,2
3,2.588148,3
4,2.600732,4
5,2.513445,5


In [57]:
# Order movies from highest to lowest recommendation score and show the top ten.
recomm_df = recomm_df.sort_values('weightedAverageRecommScore', ascending=False)
recomm_df.iloc[:10]

Unnamed: 0_level_0,weightedAverageRecommScore,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4278,5.0,4278
8836,5.0,8836
50742,5.0,50742
1913,5.0,1913
5056,5.0,5056
59684,5.0,59684
8516,5.0,8516
1929,5.0,1929
5289,5.0,5289
1934,5.0,1934


In [59]:
# Look up title/year for the ten highest-scoring movieIds.
# Rows come back in `movies` order, not in score order.
movies[movies['movieId'].isin(recomm_df['movieId'].iloc[:10].tolist())]

Unnamed: 0,movieId,title,year
1829,1913,Picnic at Hanging Rock,1975
1845,1929,Grand Hotel,1932
1850,1934,You Can't Take It with You,1938
4183,4278,Triumph of the Will (Triumph des Willens),1934
4960,5056,"Enigma of Kaspar Hauser, The (a.k.a. Mystery o...",1974
5192,5289,Body and Soul,1947
7863,8516,"Matter of Life and Death, A (Stairway to Heaven)",1946
8153,8836,Wicker Park,2004
11598,50742,7 Plus Seven,1970
12679,59684,Lake of Fire,2006
