# Collaborative Filtering
## user - user based Filtering

In [1]:
import pandas as pd
import numpy as np

### Reading Data

In [2]:
movies_df = pd.read_csv('../data/movies/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df = pd.read_csv('../data/movies/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


### Picking a Random Test User

In [4]:
np.random.seed(2)
userId = np.random.randint(1, np.max(np.unique(ratings_df['userId'])))
userId

7337

### Creating Test user Ratings for The movies he/she Watched

In [5]:
test_user_ratings_df = ratings_df[ratings_df['userId'] == userId]
test_user_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
677286,7337,480,4.0,949305508
677287,7337,802,4.0,949305818
677288,7337,858,5.0,949305645
677289,7337,1210,4.0,949305548
677290,7337,1834,4.0,949305645


Formating the table for just MovieId and Ratings

In [6]:
test_user_ratings_df = test_user_ratings_df.drop(['userId', 'timestamp'], axis=1)
test_user_ratings_df = test_user_ratings_df.reset_index(drop=True)
test_user_ratings_df.head()

Unnamed: 0,movieId,rating
0,480,4.0
1,802,4.0
2,858,5.0
3,1210,4.0
4,1834,4.0


### Get other users who have rated 1 or more movies similar to Test user

In [7]:
other_user_ratings_df = ratings_df[ratings_df['movieId'].isin(test_user_ratings_df['movieId'].tolist())]
other_user_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
27,4,480,2.0,1037737218
91,4,1834,5.0,1037739516
120,4,2997,4.0,1037737504
232,6,3082,4.0,1275904986
241,7,858,3.5,1451817833


#### Grouping other_user_ratings_df wrt to userId

In [8]:
other_user_ratings_groups = other_user_ratings_df.groupby('userId')
other_user_ratings_groups.get_group(4)

Unnamed: 0,userId,movieId,rating,timestamp
27,4,480,2.0,1037737218
91,4,1834,5.0,1037739516
120,4,2997,4.0,1037737504


#### Sorting the other_user_ratings_groups with most common movies rated by users

In [9]:
#Sorting it so users with movie most in common with the input will have priority
other_user_ratings_groups = sorted(other_user_ratings_groups,  key=lambda x: len(x[1]), reverse=True)
# other_user_ratings_groups[:2]

selecting top 100 users to use for recommendations

In [10]:
other_user_ratings_groups = other_user_ratings_groups[:100] 

### Similarity Matrix of Users by Pearson  Correlation

In [11]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

for userId, group in other_user_ratings_groups:

    group = group.sort_values(by='movieId')   # Sorted groups wrt movieId
    
    test_user_ratings_df = test_user_ratings_df.sort_values(by='movieId')
    nRatings = len(group)   # Get the N for the formula
    #Get the review scores for the movies that they both have in common
    temp_df = test_user_ratings_df[test_user_ratings_df['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[userId] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[userId] = 0

#### Creating data Frame for the Similarity indeics from pearpearsonCorrelationDict

In [12]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.185098,3388
1,0.134779,4733
2,-0.048828,5133
3,0.563492,6530
4,1.0,7337


### Selecting top 50 Users that are most similar 

In [13]:
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
4,1.0,7337
76,0.801901,1361
66,0.790569,241
90,0.733333,2711
77,0.679366,1420


#### Getting all the Candidate movie list for all the users in topUsers

In [14]:
topUsersRating = topUsers.merge(ratings_df, left_on = 'userId', right_on = 'userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp
0,1.0,7337,480,4.0,949305508
1,1.0,7337,802,4.0,949305818
2,1.0,7337,858,5.0,949305645
3,1.0,7337,1210,4.0,949305548
4,1.0,7337,1834,4.0,949305645


### Creating Weighted Movie Rating for Each movie rated by the user by multiplying ratings witth similarityIndex

In [15]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp,weightedRating
0,1.0,7337,480,4.0,949305508,4.0
1,1.0,7337,802,4.0,949305818,4.0
2,1.0,7337,858,5.0,949305645,5.0
3,1.0,7337,1210,4.0,949305548,4.0
4,1.0,7337,1834,4.0,949305645,4.0


### Finding the Sum of similarity Index and sum of weighted Rating grouped by movies

In [16]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,15.691113,66.843985
2,9.864473,29.823282
3,5.075636,15.305973
4,1.362816,3.646632
5,6.453337,15.883624


### Dividing sum_weightedRating by sum_similarityIndex

In [17]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.25999,1
2,3.023302,2
3,3.015577,3
4,2.675806,4
5,2.461304,5


#### Sorting the recommendation DF by recommendation score

In [18]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
78895,5.0,78895
4884,5.0,4884
95311,5.0,95311
72104,5.0,72104
51380,5.0,51380
8426,5.0,8426
50619,5.0,50619
72649,5.0,72649
50477,5.0,50477
50245,5.0,50245


### Displaying Top 10 Movie Recommendations

In [19]:
recommendation_df = movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]
recommendation_df = recommendation_df.reset_index(drop=True)
recommendation_df

Unnamed: 0,movieId,title,genres
0,4884,Trembling Before G-d (2001),Documentary
1,8426,Robot Carnival (Roboto kânibauru) (1987),Animation|Comedy|Drama|Fantasy|Sci-Fi
2,50245,Alice in the Cities (Alice in den Stadten) (1974),Drama
3,50477,"Testament of Orpheus, The (Testament d'Orphée)...",Drama
4,50619,"Blood of a Poet, The (Sang d'un poète, Le) (1930)",Drama|Fantasy
5,51380,"Canterbury Tales, The (I racconti di Canterbur...",Comedy|Drama
6,72104,Balance (1989),Animation|Drama|Mystery|Sci-Fi|Thriller
7,72649,"Ceremony, The (Gishiki) (1971)",Comedy|Drama
8,78895,When Father Was Away on Business (Otac na sluz...,Drama
9,95311,Presto (2008),Animation|Children|Comedy|Fantasy
