In [171]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [172]:
# Read the movie catalogue and the user ratings from the working directory.
movie_df, rating_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

# Preview the first rows of the catalogue.
movie_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [173]:
# The timestamp is not used by the recommender. Rebind instead of mutating in
# place, and ignore a missing column, so this cell can be re-run safely.
rating_df = rating_df.drop(columns='timestamp', errors='ignore')
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [174]:
# Preview the first five rating rows.
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [175]:
# Pull the four-digit release year out of the "(YYYY)" suffix of the title.
# A raw string keeps the backslashes intact for the regex engine, and the
# capture group covers only the digits, so `year` holds e.g. '1995'.
movie_df['year'] = movie_df['title'].str.extract(r'\((\d{4})\)', expand=False)

In [176]:
# Keep only the four digits of the year (drops any stray parenthesis).
# Raw string: '\d' in a normal literal is an invalid escape sequence.
movie_df['year'] = movie_df['year'].str.extract(r'(\d{4})', expand=False)

In [177]:
# Remove the "(YYYY)" suffix from the title. regex=True is required on
# pandas 2.x, where str.replace treats the pattern as a literal by default
# and would leave the year in place. The raw string avoids invalid escapes.
movie_df['title'] = movie_df['title'].str.replace(r'\(\d{4}\)', " ", regex=True)

In [178]:
# Trim leading and trailing whitespace from every title.
movie_df = movie_df.assign(title=movie_df['title'].str.strip())

In [179]:
# Genres are not used by the recommender. Rebind instead of mutating in place,
# and ignore a missing column, so this cell can be re-run safely.
movie_df = movie_df.drop(columns='genres', errors='ignore')

In [180]:
# Check the cleaned catalogue: title without year, year in its own column.
movie_df.head(n=5)

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [181]:
# Movies the input user has rated, as (title, rating) pairs.
rated_titles = [
    ('Breakfast Club, The', 2.5),
    ('Toy Story', 4.5),
    ('Jumanji', 2.5),
    ("Pulp Fiction", 5),
    ('Akira', 4.5),
]

# Same data as a list of records and as a DataFrame.
user_input = [{'title': title, 'rating': rating} for title, rating in rated_titles]
user_input_df = pd.DataFrame(user_input)

In [182]:
# Keep only the catalogue rows whose title appears in the user's input.
rated_title_mask = movie_df['title'].isin(user_input_df['title'].tolist())
movie_input_df = movie_df.loc[rated_title_mask]

In [183]:
# Preview the catalogue rows matched to the user's titles.
movie_input_df.head(n=5)

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
257,296,Pulp Fiction,1994
973,1274,Akira,1988
1445,1968,"Breakfast Club, The",1985


In [184]:
# Attach movieId (and year) to each rated title; 'title' is the only column
# the two frames share, so it is the join key.
user_movie_df = user_input_df.merge(movie_input_df, on='title', how='inner')
user_movie_df.head(n=5)

Unnamed: 0,title,rating,movieId,year
0,"Breakfast Club, The",2.5,1968,1985
1,Toy Story,4.5,1,1995
2,Jumanji,2.5,2,1995
3,Pulp Fiction,5.0,296,1994
4,Akira,4.5,1274,1988


In [185]:
# The year is not needed from here on. Name the axis through `columns=`
# (pandas 2.x rejects the positional `axis` argument) and ignore a missing
# column so the cell can be re-run safely.
user_movie_df = user_movie_df.drop(columns='year', errors='ignore')

In [186]:
# All ratings, by any user, for the movies the input user has rated.
rated_movie_ids = user_movie_df['movieId'].tolist()
usersubset = rating_df.loc[rating_df['movieId'].isin(rated_movie_ids)]

In [187]:
# Preview the ratings that overlap with the input user's movies.
usersubset.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0


In [188]:
# One group of overlapping ratings per user.
usersubset_group = usersubset.groupby(by='userId')

In [189]:
# Materialise the (userId, ratings) pairs and order them so users sharing the
# most movies with the input user come first.
usersubset_group = list(usersubset_group)
usersubset_group.sort(key=lambda pair: len(pair[1]), reverse=True)

In [190]:
# Show the two users with the largest overlap.
usersubset_group[0:2]

[(91,        userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0), (177,        userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5)]

**Use the formula below (the Pearson correlation coefficient) to measure how similar each user is to the input user**

![Pearson correlation formula](https://wikimedia.org/api/rest_v1/media/math/render/svg/bd1ccc2979b0fd1c1aec96e386f686ae874f9ec0 "Pearson Correlation")

In [191]:
# Only compare against the first 100 users of the sorted list.
MAX_CANDIDATE_USERS = 100
usersubset_group = usersubset_group[:MAX_CANDIDATE_USERS]

In [192]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
import math


def _pearson_correlation(x_ratings, y_ratings, n_ratings):
    """Return the Pearson correlation between two aligned rating lists.

    x_ratings and y_ratings hold two users' ratings for the same movies in the
    same order; n_ratings is the N of the formula. Returns 0 when the
    coefficient is undefined (no ratings, or either list has no variance).
    """
    if n_ratings == 0:
        return 0
    n = float(n_ratings)
    sum_x = sum(x_ratings)
    sum_y = sum(y_ratings)
    Sxx = sum(i**2 for i in x_ratings) - pow(sum_x, 2) / n
    Syy = sum(j**2 for j in y_ratings) - pow(sum_y, 2) / n
    Sxy = sum(i*j for i, j in zip(x_ratings, y_ratings)) - sum_x * sum_y / n
    #Rounding can leave Sxx or Syy slightly below zero, and math.sqrt raises
    #ValueError on a negative product, so require strictly positive values
    #rather than merely non-zero ones.
    if Sxx > 0 and Syy > 0:
        return Sxy / math.sqrt(Sxx * Syy)
    return 0


pearsonCorrelationDict = {}

#The input user's ratings do not change between iterations, so sort them once
#here instead of on every pass through the loop.
user_movie_df = user_movie_df.sort_values(by='movieId')

#For every user group in our subset
for name, group in usersubset_group:
    #Sort the current user's ratings by movieId too, so both lists line up
    group = group.sort_values(by='movieId')
    #Input-user ratings for the movies that this user has also rated
    temp_df = user_movie_df[user_movie_df['movieId'].isin(group['movieId'].tolist())]
    pearsonCorrelationDict[name] = _pearson_correlation(
        temp_df['rating'].tolist(), group['rating'].tolist(), len(group))


In [193]:
# One row per user id, with the coefficient in a single unnamed column.
pearsonCorrelation_df = pd.DataFrame.from_dict(
    data=pearsonCorrelationDict,
    orient='index',
)

In [194]:
# Name the coefficient column; later cells look it up as 'simlar_index'.
similarity_columns = ['simlar_index']
pearsonCorrelation_df.columns = similarity_columns

In [195]:
# Copy the user ids from the index into a regular column.
pearsonCorrelation_df = pearsonCorrelation_df.assign(userId=pearsonCorrelation_df.index)

In [196]:
# Replace the user-id index with plain 0..n-1 row numbers.
row_count = len(pearsonCorrelation_df)
pearsonCorrelation_df.index = range(row_count)

In [197]:
# Preview the similarity score computed for each user.
pearsonCorrelation_df.head(n=5)

Unnamed: 0,simlar_index,userId
0,0.905265,91
1,0.297775,177
2,0.58916,219
3,0.701241,274
4,0.55853,298


In [198]:
# Keep the 60 users with the highest similarity score.
ranked_by_similarity = pearsonCorrelation_df.sort_values(by='simlar_index', ascending=False)
similar_person = ranked_by_similarity.iloc[:60]

In [199]:
# Bring in every rating made by the selected similar users.
similar_person_rating = pd.merge(similar_person, rating_df, on='userId')

In [200]:
# Preview the similar users' ratings.
similar_person_rating.head(n=5)

Unnamed: 0,simlar_index,userId,movieId,rating
0,1.0,294,2,3.0
1,1.0,294,3,1.0
2,1.0,294,6,3.0
3,1.0,294,10,3.0
4,1.0,294,12,1.0


In [201]:
# Weight each rating by the similarity score of the user who gave it.
similarity_weight = similar_person_rating['simlar_index']
similar_person_rating['weighted_rating'] = similarity_weight.mul(similar_person_rating['rating'])

In [202]:
# Per movie, total the similarity scores and the weighted ratings.
per_movie_totals = similar_person_rating.groupby('movieId')[['simlar_index', 'weighted_rating']].sum()
temp_similar_person_rating = per_movie_totals.rename(
    columns={
        'simlar_index': 'sum_similarityIndex',
        'weighted_rating': 'sum_weightedRating',
    }
)
temp_similar_person_rating.head(n=5)

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.046941,152.690654
2,29.892832,94.373904
3,9.957228,28.058114
4,0.981981,1.963961
5,8.006719,23.03513


In [203]:
# Score = sum of weighted ratings divided by sum of similarity scores.
weighted_average = (
    temp_similar_person_rating['sum_weightedRating']
    / temp_similar_person_rating['sum_similarityIndex']
)
recomendation_df = pd.DataFrame({'weighted average recommendation score': weighted_average})
# Also expose the movie id (currently the index) as a regular column.
recomendation_df['movieId'] = temp_similar_person_rating.index
recomendation_df.head(n=5)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.013218,1
2,3.157075,2
3,2.817864,3
4,2.0,4
5,2.876975,5


In [204]:
# Highest recommendation score first.
score_column = 'weighted average recommendation score'
recomendation_df = recomendation_df.sort_values(score_column, ascending=False)

In [205]:
# Preview the best-scoring movie ids.
recomendation_df.head(n=5)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
5747,5.0,5747
6460,5.0,6460
26169,5.0,26169
1699,5.0,1699
5490,5.0,5490


In [206]:
# Look up the catalogue rows for the ten best-scoring movie ids.
top_movie_ids = recomendation_df.head(10)['movieId'].tolist()
movie_df.loc[movie_df['movieId'].isin(top_movie_ids)]

Unnamed: 0,movieId,title,year
717,936,Ninotchka,1939
1280,1699,"Butcher Boy, The",1997
1977,2624,After Life (Wandafuru raifu),1998
2711,3637,Vagabond (Sans toit ni loi),1985
2723,3655,Blow-Out (La grande bouffe),1973
3908,5490,The Big Bus,1976
4046,5747,Gallipoli,1981
4396,6460,"Trial, The (Procès, Le)",1962
5466,26169,Branded to Kill (Koroshi no rakuin),1967
9284,158027,SORI: Voice from the Heart,2016
