In [104]:
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

In [105]:
# Load the movie catalogue and the user ratings from CSV files addressed by
# relative path (resolved against the current working directory).
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('rating.csv')

In [106]:
# Preview the first rows of the movies table.
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [107]:
# Display the ratings table (pandas truncates the rendering of large frames).
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


# Preprocessing Data

In [108]:
# Split the release year out of the title, e.g. "Toy Story (1995)" becomes
# title "Toy Story" and year "1995". Raw strings keep the regex escapes
# (\( and \d) from being interpreted as (invalid) string escapes.
# The capture group returns the four digits without the parentheses; the year
# stays a string and is NaN where the title has no "(dddd)" part.
df_movies['year'] = df_movies['title'].str.extract(r'\((\d{4})\)', expand=False)
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the "(1995)" suffix in place.
df_movies['title'] = df_movies['title'].str.replace(r'\(\d{4}\)', '', regex=True)
# Vectorised strip; unlike apply(lambda x: x.strip()) it tolerates missing titles.
df_movies['title'] = df_movies['title'].str.strip()
df_movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
34203,151697,Grand Slam,Thriller,1967
34204,151701,Bloodmoney,(no genres listed),2010
34205,151703,The Butterfly Circus,Drama,2009
34206,151709,Zero,Drama|Sci-Fi,2015


In [109]:
# Turn the pipe-delimited genre string of each movie into a list of genre names.
df_movies = df_movies.assign(genres=df_movies['genres'].str.split('|'))
df_movies

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
34203,151697,Grand Slam,[Thriller],1967
34204,151701,Bloodmoney,[(no genres listed)],2010
34205,151703,The Butterfly Circus,[Drama],2009
34206,151709,Zero,"[Drama, Sci-Fi]",2015


In [110]:
# Work on a copy so the per-genre indicator columns added below do not end up
# in df_movies itself.
moviesWithGenres_df = df_movies.copy()
moviesWithGenres_df

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
34203,151697,Grand Slam,[Thriller],1967
34204,151701,Bloodmoney,[(no genres listed)],2010
34205,151703,The Butterfly Circus,[Drama],2009
34206,151709,Zero,"[Drama, Sci-Fi]",2015


In [111]:
# Flag every genre a movie belongs to: one column per genre, set to 1 on the
# rows of the movies that list it (cells never written stay NaN for now).
for index, genre_list in df_movies['genres'].items():
    for genre in genre_list:
        moviesWithGenres_df.at[index, genre] = 1

In [112]:
# Inspect the frame after the genre indicator columns were added.
moviesWithGenres_df

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,,1.0,,1.0,,...,,,,,,,,,,
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,,,,1.0,,1.0,...,,,,,,,,,,
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,,,,1.0,,1.0,...,,,,,,,,,,
4,5,Father of the Bride Part II,[Comedy],1995,,,,1.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34203,151697,Grand Slam,[Thriller],1967,,,,,,,...,,,,,,,,,,
34204,151701,Bloodmoney,[(no genres listed)],2010,,,,,,,...,,,,,,,,,,1.0
34205,151703,The Butterfly Circus,[Drama],2009,,,,,,,...,,,,,,,,,,
34206,151709,Zero,"[Drama, Sci-Fi]",2015,,,,,,,...,,,1.0,,,,,,,


In [113]:
# Replace every NaN with 0 so that a genre a movie does not belong to reads as 0.
# NOTE(review): fillna(0) is applied to all columns, not only the genre flags;
# presumably a missing 'year' would also become 0 - confirm this is intended.
moviesWithGenres_df=moviesWithGenres_df.fillna(0)

In [114]:
# Count the remaining missing values per column.
moviesWithGenres_df.isnull().sum()

movieId               0
title                 0
genres                0
year                  0
Adventure             0
Animation             0
Children              0
Comedy                0
Fantasy               0
Romance               0
Drama                 0
Action                0
Crime                 0
Thriller              0
Horror                0
Mystery               0
Sci-Fi                0
IMAX                  0
Documentary           0
War                   0
Musical               0
Western               0
Film-Noir             0
(no genres listed)    0
dtype: int64

# Visualizing the rating dataframe

In [115]:
# Preview the first rows of the ratings table.
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [116]:
# The timestamp is not used by any of the steps below, so drop that column.
# The column is named by keyword: the positional-axis form drop('timestamp', 1)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
df_ratings = df_ratings.drop(columns='timestamp')
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


# Collaborative Filtering

This technique uses other users to recommend movies to the input user. It attempts to find users that have similar preferences and opinions to the input user and then recommends items that they have liked to the input user.

In [117]:
# Movies rated by the person we want recommendations for (title -> rating).
userInput = [
    dict(title='Breakfast Club, The', rating=5),
    dict(title='Toy Story', rating=3.5),
    dict(title='Jumanji', rating=2),
    dict(title='Akira', rating=4.5),
    dict(title='Pulp Fiction', rating=5),
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Akira,4.5
4,Pulp Fiction,5.0


# Add movie id to input user

In [118]:
# Keep only the catalogue rows whose title the input user rated, then attach
# the ratings (the merge joins on the columns both frames have in common).
inputId = df_movies.loc[df_movies['title'].isin(list(inputMovies['title']))]
inputMovies = inputId.merge(inputMovies)

In [119]:
# Drop the release year from the input user's frame.
# The column is named by keyword: the positional-axis form drop('year', 1)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
inputMovies = inputMovies.drop(columns='year')
inputMovies

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",3.5
1,2,Jumanji,"[Adventure, Children, Fantasy]",2.0
2,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",5.0
3,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",4.5
4,1968,"Breakfast Club, The","[Comedy, Drama]",5.0


# Finding the nearest neighbours

In [120]:
# Keep only the ratings given (by any user) to movies the input user has rated.
userSubset = df_ratings.loc[df_ratings['movieId'].isin(inputMovies['movieId'])]
userSubset

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
11,1,296,4.0
236,3,1,4.0
451,5,2,3.0
517,6,1,5.0
...,...,...,...
19999786,138491,1,2.0
19999838,138492,1968,5.0
19999890,138493,1,3.5
19999891,138493,2,4.0


In [121]:
# Sanity check: list the distinct movie ids present in the subset.
userSubset['movieId'].unique()

array([   2,  296,    1, 1274, 1968], dtype=int64)

We now group up the rows by userId

In [122]:
# One group per user, holding that user's ratings from the subset.
userSubsetGroup = userSubset.groupby('userId')

In [123]:
# Displaying the groupby object only shows its repr, not the grouped rows.
userSubsetGroup

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F57577FEE0>

In [124]:
# Look at the rows grouped under a single user (userId 1130) as an example.
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
166633,1130,1968,4.0


In [125]:
# Materialise the (userId, ratings frame) pairs and order them so that users
# with the most ratings in the subset come first.
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: pair[1].shape[0], reverse=True)
userSubsetGroup

[(91,
        userId  movieId  rating
  9621      91        1     4.0
  9622      91        2     3.5
  9669      91      296     3.5
  9826      91     1274     2.5
  9903      91     1968     4.0),
 (294,
         userId  movieId  rating
  37452     294        1     4.5
  37453     294        2     4.5
  37504     294      296     4.5
  37648     294     1274     4.5
  37731     294     1968     5.0),
 (586,
         userId  movieId  rating
  81164     586        1     2.5
  81165     586        2     3.0
  81226     586      296     5.0
  81390     586     1274     4.0
  81499     586     1968     3.0),
 (648,
         userId  movieId  rating
  92885     648        1     4.0
  92886     648        2     2.0
  92937     648      296     5.0
  93124     648     1274     4.0
  93237     648     1968     3.0),
 (775,
          userId  movieId  rating
  113566     775        1     4.5
  113567     775        2     2.0
  113695     775      296     5.0
  114057     775     1274     4.5
  

In [126]:
# First entry of the sorted list: a (userId, ratings frame) pair.
userSubsetGroup[0]

(91,
       userId  movieId  rating
 9621      91        1     4.0
 9622      91        2     3.5
 9669      91      296     3.5
 9826      91     1274     2.5
 9903      91     1968     4.0)

# Similarity of users to input users

We are going to find out how similar each user is to the input user using the Pearson correlation coefficient. In our case, 1 means two users have the same preferences and -1 means they have opposite preferences.

In [127]:
# Only keep the first 100 entries of the sorted list for the similarity step.
userSubsetGroup = userSubsetGroup[:100]

In [128]:
# Dump each retained user: id, ratings frame and number of ratings, one per line.
for name, group in userSubsetGroup:
    print(name, group, len(group), sep='\n')

91
      userId  movieId  rating
9621      91        1     4.0
9622      91        2     3.5
9669      91      296     3.5
9826      91     1274     2.5
9903      91     1968     4.0
5
294
       userId  movieId  rating
37452     294        1     4.5
37453     294        2     4.5
37504     294      296     4.5
37648     294     1274     4.5
37731     294     1968     5.0
5
586
       userId  movieId  rating
81164     586        1     2.5
81165     586        2     3.0
81226     586      296     5.0
81390     586     1274     4.0
81499     586     1968     3.0
5
648
       userId  movieId  rating
92885     648        1     4.0
92886     648        2     2.0
92937     648      296     5.0
93124     648     1274     4.0
93237     648     1968     3.0
5
775
        userId  movieId  rating
113566     775        1     4.5
113567     775        2     2.0
113695     775      296     5.0
114057     775     1274     4.5
114255     775     1968     4.0
5
812
        userId  movieId  rating
12050

In [129]:
#Store the pearson correlation in a dictionary, where the key will be userID and value will be pearson coefficient
pearsonCorrelationDict = {}
from scipy.stats import pearsonr

#For every user group in our subset
for name,group in userSubsetGroup:
    #Lets start by sorting the input and current user group
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the n for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And store them in temporary variable in a list format to facilitate for future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Lets also put current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    corr, p_value = pearsonr(tempRatingList,tempGroupList)
    pearsonCorrelationDict[name] = corr



In [130]:
# Inspect the (userId, correlation) pairs computed above.
pearsonCorrelationDict.items()

dict_items([(91, -0.08006407690254357), (294, 0.43852900965351466), (586, 0.5393193716300062), (648, 0.6880209161537815), (775, 0.8362420100070908), (812, 0.6016568375961869), (869, 0.18605210188381266), (903, -0.17902871850985821), (1200, 0.5370861555295747), (1244, 0.10963225241337866), (1715, 0.895143592549291), (1748, 0.8320502943378437), (1763, -0.26854307776478736), (1810, 0.8594395636904107), (1813, 0.8347371386380907), (1849, 0.6266005147845037), (1864, 0.8320502943378437), (1942, 0.774023530673004), (1984, -0.3180390717330988), (2047, 0.8976095575314936), (2099, -0.43852900965351466), (2367, 0.49334513586020395), (2397, nan), (2515, 0.8951435925492912), (2661, 0.4385290096535146), (2757, 0.7844645405527362), (2959, 0.11720180773462385), (2988, 0.719779593768156), (3179, 0.29417420270727607), (3218, 0.8503864129218267), (3268, 0.8204126541423671), (3269, 0.8648817040445188), (3318, 0.8790135580096791), (3397, 0.7112333251538238), (3487, 0.3654408413779288), (3576, 0.59676239503

In [131]:
# Turn the {userId: correlation} dictionary into a two-column frame
# (SimilarityIndex, userId) with a plain 0..n-1 row index.
pearsonDF = (
    pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    .set_axis(['SimilarityIndex'], axis=1)
    .assign(userId=lambda frame: frame.index)
    .reset_index(drop=True)
)

In [132]:
# Inspect the similarity table.
pearsonDF

Unnamed: 0,SimilarityIndex,userId
0,-0.080064,91
1,0.438529,294
2,0.539319,586
3,0.688021,648
4,0.836242,775
...,...,...
95,0.451243,10621
96,0.298381,10670
97,0.225621,10821
98,0.045231,10822


# Finding the top similar users 

In [133]:
# The 50 users most similar to the input user (highest SimilarityIndex first).
topUsers = pearsonDF.sort_values('SimilarityIndex', ascending=False).head(50)

In [134]:
# Inspect the selected neighbours.
topUsers

Unnamed: 0,SimilarityIndex,userId
89,0.946029,10387
19,0.89761,2047
23,0.895144,2515
78,0.895144,9410
81,0.895144,9772
10,0.895144,1715
47,0.880705,5576
36,0.880705,3629
32,0.879014,3318
94,0.871312,10560


# Rating of selected users to all movies

Take the weighted average of the ratings of the movies. To do this, we first need to get the movies watched by the users in our pearsonDF from the ratings dataframe, with their correlation stored in a column called SimilarityIndex. This is achieved by merging the two tables below.

In [135]:
# Attach every rating made by the selected neighbours to their similarity score.
topUserRating = pd.merge(topUsers, df_ratings, on='userId', how='inner')
topUserRating.head(20)

Unnamed: 0,SimilarityIndex,userId,movieId,rating
0,0.946029,10387,1,4.0
1,0.946029,10387,2,3.5
2,0.946029,10387,10,3.0
3,0.946029,10387,11,3.0
4,0.946029,10387,17,3.0
5,0.946029,10387,21,3.5
6,0.946029,10387,32,4.0
7,0.946029,10387,39,3.5
8,0.946029,10387,47,4.0
9,0.946029,10387,50,5.0


Now simply multiply the rating by the SimilarityIndex, then sum up the new ratings and divide by the sum of the weights.

In [136]:
# Weight each rating by the similarity of the user who gave it.
topUserRating['weightedRating'] = topUserRating['rating'].mul(topUserRating['SimilarityIndex'])
topUserRating.head(20)

Unnamed: 0,SimilarityIndex,userId,movieId,rating,weightedRating
0,0.946029,10387,1,4.0,3.784115
1,0.946029,10387,2,3.5,3.311101
2,0.946029,10387,10,3.0,2.838086
3,0.946029,10387,11,3.0,2.838086
4,0.946029,10387,17,3.0,2.838086
5,0.946029,10387,21,3.5,3.311101
6,0.946029,10387,32,4.0,3.784115
7,0.946029,10387,39,3.5,3.311101
8,0.946029,10387,47,4.0,3.784115
9,0.946029,10387,50,5.0,4.730144


In [137]:
# Per movie: total similarity of the users who rated it and total weighted rating.
tempTopUserRating = topUserRating.groupby('movieId')[['SimilarityIndex', 'weightedRating']].sum()

In [138]:
# Check the dtype of the column labels.
tempTopUserRating.columns.dtype

dtype('O')

In [139]:
# Rename the two aggregated columns to make clear that they hold per-movie sums.
tempTopUserRating.columns = ['sum_SimilarityIndex','sum_weightedRating']

In [140]:
# Preview the per-movie sums.
tempTopUserRating.head()

Unnamed: 0_level_0,sum_SimilarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.821238,146.424613
2,38.821238,101.191887
3,13.674659,35.392039
4,3.586101,9.326486
5,9.194413,23.109653


In [141]:
# Weighted average per movie = sum of weighted ratings / sum of similarity weights.
recommendation_df = (
    tempTopUserRating['sum_weightedRating']
    .div(tempTopUserRating['sum_SimilarityIndex'])
    .to_frame('weighted average recommendation score')
)
# Keep the movie id as a regular column too (the index already carries it).
recommendation_df['movieIds'] = recommendation_df.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieIds
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.771766,1
2,2.606612,2
3,2.588148,3
4,2.600732,4
5,2.513445,5


In [142]:
# Rank the movies from the highest to the lowest recommendation score.
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score',ascending=False)
recommendation_df.head(20)

Unnamed: 0_level_0,weighted average recommendation score,movieIds
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4278,5.0,4278
27002,5.0,27002
42422,5.0,42422
5632,5.0,5632
7502,5.0,7502
53883,5.0,53883
1788,5.0,1788
77658,5.0,77658
5496,5.0,5496
59684,5.0,59684


In [143]:
# Distinct values taken by the recommendation score.
recommendation_df['weighted average recommendation score'].unique()

array([5.        , 5.        , 5.        , ..., 0.73746859, 0.72913449,
       0.5       ])

In [157]:
# Attach title, genres and year to every recommended movie.
inputId = df_movies.loc[df_movies['movieId'].isin(recommendation_df['movieIds'].tolist())]
recommendation_df = recommendation_df.merge(inputId, left_on='movieIds', right_on='movieId', how='inner')
# 'movieIds' duplicates the 'movieId' column brought in by the merge, so drop it.
# The column is named by keyword: the positional-axis form drop('movieIds', 1)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
recommendation_df = recommendation_df.drop(columns='movieIds')
recommendation_df.head(100)

Unnamed: 0,weighted average recommendation score,movieId,title,genres,year
0,5.0,4278,Triumph of the Will (Triumph des Willens),[Documentary],1934
1,5.0,27002,From the Earth to the Moon,"[Action, Documentary, Drama, Thriller]",1998
2,5.0,42422,Voices of a Distant Star (Hoshi no koe),"[Animation, Drama, Romance, Sci-Fi]",2003
3,5.0,5632,Bloody Sunday,[Drama],2002
4,5.0,7502,Band of Brothers,"[Action, Drama, War]",2001
...,...,...,...,...,...
95,5.0,31934,"Four Feathers, The","[Adventure, War]",1939
96,5.0,31770,Night and the City,"[Film-Noir, Thriller]",1950
97,5.0,25753,Greed,[Drama],1924
98,5.0,25755,"Phantom of the Opera, The","[Drama, Horror]",1925
