## User based similarity

In [3]:
#importing the libraries
import pandas as pd
import numpy as np
import scipy as sp


In [4]:
rating_df = pd.read_csv("ratings.csv" )

In [3]:
rating_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
rating_df.drop( 'timestamp', axis = 1, inplace = True )

In [5]:
len( rating_df.userId.unique() )

610

In [6]:
len( rating_df.movieId.unique() )

9724

In [7]:
user_movies_df = rating_df.pivot( index='userId',columns='movieId',values = "rating" ).reset_index(drop=True)
user_movies_df.index = rating_df.userId.unique()


In [8]:
user_movies_df.iloc[0:5, 0:15]


movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,4.0,,4.0,,,4.0,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,


In [9]:
user_movies_df.fillna( 0, inplace = True )
user_movies_df.iloc[0:5, 0:10]

movieId,1,2,3,4,5,6,7,8,9,10
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculating Cosine Similarity between users

In [10]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
user_sim = 1 - pairwise_distances( user_movies_df.values, metric="cosine" )
#Store the results in a dataframe
user_sim_df = pd.DataFrame( user_sim )
# set the index and column names to user ids (0 to 610)
user_sim_df.index = rating_df.userId.unique()
user_sim_df.columns = rating_df.userId.unique()


In [11]:
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,1.0,0.027283,0.05972,0.194395,0.12908
2,0.027283,1.0,0.0,0.003726,0.016614
3,0.05972,0.0,1.0,0.002251,0.00502
4,0.194395,0.003726,0.002251,1.0,0.128659
5,0.12908,0.016614,0.00502,0.128659,1.0


In [12]:
user_sim_df.shape


(610, 610)

In [13]:
np.fill_diagonal( user_sim, 0 )
user_sim_df.iloc[0:5, 0:5]


Unnamed: 0,1,2,3,4,5
1,0.0,0.027283,0.05972,0.194395,0.12908
2,0.027283,0.0,0.0,0.003726,0.016614
3,0.05972,0.0,0.0,0.002251,0.00502
4,0.194395,0.003726,0.002251,0.0,0.128659
5,0.12908,0.016614,0.00502,0.128659,0.0


## Filtering Similar User

In [14]:
user_sim_df.idxmax(axis=1)[0:5]


1    266
2    366
3    313
4    391
5    470
dtype: int64

In [15]:
user_sim_df.iloc[1:2, 330:340]

Unnamed: 0,331,332,333,334,335,336,337,338,339,340
2,0.199366,0.073652,0.050674,0.053668,0.073991,0.046544,0.018408,0.074145,0.111447,0.03063


## Loading the movies dataset

In [5]:
movies_df = pd.read_csv( "movies.csv" )

In [6]:
movies_df[0:5]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
movies_df.drop( 'genres', axis = 1, inplace = True )

## Finding common movies of similar users

In [19]:
def get_user_similar_movies( user1, user2 ):
# Inner join between movies watched between two users will give the common movies watched.
    common_movies = rating_df[rating_df.userId == user1].merge(rating_df[rating_df.userId == user2],on = "movieId",how = "inner" )
# join the above result set with movies details
    return common_movies.merge( movies_df, on = 'movieId' )

In [20]:
common_movies = get_user_similar_movies( 2, 330 )

In [21]:
common_movies[(common_movies.rating_x >= 4.0) & ((common_movies.rating_y >= 4.0))]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
1,2,3578,4.0,330,4.0,Gladiator (2000)
2,2,6874,4.0,330,5.0,Kill Bill: Vol. 1 (2003)
3,2,48516,4.0,330,4.5,"Departed, The (2006)"
4,2,58559,4.5,330,5.0,"Dark Knight, The (2008)"
5,2,68157,4.5,330,5.0,Inglourious Basterds (2009)
7,2,79132,4.0,330,5.0,Inception (2010)


In [22]:
common_movies = get_user_similar_movies( 2, 338 )
common_movies

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,2,318,3.0,338,5.0,"Shawshank Redemption, The (1994)"
1,2,6874,4.0,338,4.5,Kill Bill: Vol. 1 (2003)


## Item based similarity

## Calculating Cosine Similarity between movies

In [23]:
rating_mat = rating_df.pivot( index='movieId',columns='userId',values = "rating" ).reset_index(drop=True)
# fill all NaNs with 0
rating_mat.fillna( 0, inplace = True )
# Find the correlation between movies
movie_sim = 1 - pairwise_distances( rating_mat.values,metric="correlation" )
# Fill the diagonal with 0, as it repreresent the auto-correlation of movies
movie_sim_df = pd.DataFrame( movie_sim )

In [24]:
movie_sim_df.iloc[0:5, 0:5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.231327,0.173213,-0.028917,0.192474
1,0.231327,1.0,0.191945,0.071269,0.200526
2,0.173213,0.191945,1.0,0.067143,0.370171
3,-0.028917,0.071269,0.067143,1.0,0.16791
4,0.192474,0.200526,0.370171,0.16791,1.0


In [25]:
movie_sim_df.shape


(9724, 9724)

## Finding most similar movies

In [26]:
def get_similar_movies( movieid, topN = 5 ):
    movieidx = movies_df[movies_df.movieId == movieid].index[0]
    movies_df['similarity'] = movie_sim_df.iloc[movieidx]
    top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]
    return top_n

## Finding similar movies to Godfather

In [27]:
movies_df[movies_df.movieId == 858]

Unnamed: 0,movieId,title
659,858,"Godfather, The (1972)"


In [28]:
get_similar_movies(858)

Unnamed: 0,movieId,title,similarity
659,858,"Godfather, The (1972)",1.0
921,1220,"Blues Brothers, The (1980)",0.76939
913,1212,"Third Man, The (1949)",0.560246
895,1192,Paris Is Burning (1990),0.496048
827,1088,Dirty Dancing (1987),0.442128


## Finding similar movies to Dumb & Dumber

In [29]:
movies_df[movies_df.movieId == 231]


Unnamed: 0,movieId,title,similarity
197,231,Dumb & Dumber (Dumb and Dumber) (1994),0.095286


In [30]:
get_similar_movies(231)


Unnamed: 0,movieId,title,similarity
197,231,Dumb & Dumber (Dumb and Dumber) (1994),1.0
302,344,Ace Ventura: Pet Detective (1994),0.582137
138,165,Die Hard: With a Vengeance (1995),0.465081
291,333,Tommy Boy (1995),0.464892
126,153,Batman Forever (1995),0.432095


## Using Surprise Library - pip install scikit-surprise

In [31]:
from surprise import Dataset, Reader, KNNBasic,accuracy

In [32]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate


In [33]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(rating_df[['userId','movieId','rating']], reader=reader)

## Create user based similiarity algorithm

In [34]:
item_based_cosine_sim = {'name': 'pearson','user_based': True}
knn = KNNBasic(k= 20,min_k = 5,sim_options = item_based_cosine_sim)


In [35]:
from surprise.model_selection import cross_validate
cv_results = cross_validate(knn,data,measures=['RMSE'],cv=5,verbose=False)


Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [36]:
np.mean( cv_results.get('test_rmse') )

0.9716388647479868

## Finding Best Model

In [37]:
from surprise.model_selection.search import GridSearchCV

In [38]:
param_grid = {'k': [10, 20],
'sim_options': {'name': ['cosine', 'pearson'],
'user_based': [True, False]}
}
grid_cv = GridSearchCV(KNNBasic,
param_grid,
measures=['rmse'],
cv=5,
refit=True)
grid_cv.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing

In [39]:
# best RMSE score
print(grid_cv.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(grid_cv.best_params['rmse'])

0.9745239078368779
{'k': 20, 'sim_options': {'name': 'cosine', 'user_based': True}}


In [40]:
results_df = pd.DataFrame.from_dict(grid_cv.cv_results)
results_df[['param_k', 'param_sim_options', 'mean_test_rmse', 'rank_test_rmse']]

Unnamed: 0,param_k,param_sim_options,mean_test_rmse,rank_test_rmse
0,10,"{'name': 'cosine', 'user_based': True}",0.987105,4
1,10,"{'name': 'cosine', 'user_based': False}",1.022934,8
2,10,"{'name': 'pearson', 'user_based': True}",0.985799,3
3,10,"{'name': 'pearson', 'user_based': False}",1.014475,7
4,20,"{'name': 'cosine', 'user_based': True}",0.974524,1
5,20,"{'name': 'cosine', 'user_based': False}",0.99522,6
6,20,"{'name': 'pearson', 'user_based': True}",0.975084,2
7,20,"{'name': 'pearson', 'user_based': False}",0.987902,5


## Making Predictions

In [41]:
grid_cv.predict( 1, 2 )

Prediction(uid=1, iid=2, r_ui=None, est=3.7233653693643416, details={'actual_k': 20, 'was_impossible': False})

## Matrix Factorization

In [42]:
from surprise import SVD
# Use 10 factors for building the model
svd = SVD( n_factors = 5 )

In [43]:
cv_results = cross_validate(svd,
data,
measures=['RMSE'],
cv=5,
verbose=True)
# Setting verbose to True in the configuration will result 
# in the service generating more output (will show you both WARNING and INFO log levels), 
# normally you will only see WARNING or higher (ERROR for example).

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8665  0.8721  0.8742  0.8666  0.8712  0.8701  0.0031  
Fit time          2.60    2.64    2.45    2.50    2.53    2.54    0.07    
Test time         0.79    0.47    0.23    0.37    0.40    0.45    0.19    
