In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#from sklearn import cross_validation as cv

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, SVD, NMF
import warnings

In [11]:
# read master data
master = pd.read_csv('Data/master.csv')

# Drop unnecessary columns
master_1 = master.loc[:, ['movieId', 'userId', 'rating']]
master_1.drop_duplicates(inplace=True)

## Collaborative filtering 

1. KNN with scipy sparse matrix 
2. KNN-KNNBasic, KNNMeans, KNNBasline (with surprise)
3. Matrix Factorization- SVD (with surpise)


## Memory-Based Methods (Neighborhood-Based)
## 1. KNN with scipy sparse matrix 
Because we have fewer items than users, we calculate item-item similarity. 

### Movie-user matrices (rows=movieId, columns=userId)

In [19]:
# pivot and create movie-user matrix and fill missing observations with 0s
movie_to_user_df = master_1.pivot(
    index='movieId',
     columns='userId',
      values='rating').fillna(0)

movie_to_user_df.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# transform matrix to scipy sparse matrix

movie_to_user_sparse_df = csr_matrix(movie_to_user_df.values)
movie_to_user_sparse_df

# Check the shape
print(movie_to_user_sparse_df.shape)


(9724, 610)


###  Check sparsity of the movie-user matrix 

In [38]:
# calcuate total number of entries in the movie-user matrix
num_entries = movie_to_user_df.shape[0] * movie_to_user_df.shape[1]
# calculate total number of entries with zero values
num_zeros = (movie_to_user_df==0).sum(axis=1).sum()
# calculate ratio of number of zeros to number of entries
ratio_zeros = num_zeros / num_entries
print('There is about {:.2%} of ratings in our data is missing'.format(ratio_zeros))

There is about 98.30% of ratings in our data is missing


Insights: The majority of entries is zero. With too many zeros, the distance between similar items in KNN model will be very large.

### Fitting K-Nearest Neighbours model to the scipy sparse matrix

In [24]:
# Fiting the model 

knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

knn_model.fit(movie_to_user_sparse_df)



NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

### Using the trained model to make movie recommendations

In [25]:
movies_list = list(movie_to_user_df.index)
# Creating a dictionary with movie name as key and its index from the list as value 
movie_dict = {movie : index for index, movie in enumerate(movies_list)}

In [35]:
## function to find top n similar movies of the given movie title. 

def get_similar_movies(title, n = 5):
    ## get movieId from title
    movie_id= master.loc[master.title==title, 'movieId'][0]
    ## input to this function is the movie and number of top similar movies you want.
    index = movie_dict[movie_id]
    knn_input = np.asarray([movie_to_user_df.values[index]])
    n = min(len(movies_list)-1,n)
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
    print("Top",n,"movies which are very much similar to the Movie-",title, "are: ")
    print(" ")
    for i in range(1,len(distances[0])):
        print(master.loc[master.movieId==movies_list[indices[0][i]], 'title'][:1])
        #print(movies_list[indices[0][i]], master.loc[master.movieId==movies_list[indices[0][i]], 'title'][:1])
        
  

In [36]:
from pprint import pprint
get_similar_movies('Toy Story (1995)',5)

Top 5 movies which are very much similar to the Movie- Toy Story (1995) are: 
 
52421    Toy Story 2 (1999)
Name: title, dtype: object
13387    Jurassic Park (1993)
Name: title, dtype: object
19177    Independence Day (a.k.a. ID4) (1996)
Name: title, dtype: object
6905    Star Wars: Episode IV - A New Hope (1977)
Name: title, dtype: object
10305    Forrest Gump (1994)
Name: title, dtype: object


## 2. KNN with surprise

In [39]:
# read new_df as Surprise dataset 
# specify the range of rating 0.5-5 (defalt setting is 1-5)
reader = Reader(rating_scale =(0.5, 5) ) 
df = Dataset.load_from_df(master_1,reader)

In [40]:
# report how many users and items we have in our dataset
dataset = df.build_full_trainset()

print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items, '\n')

Number of users:  9724 

Number of items:  610 



In [41]:
# the range of ratings 
print('Min rating:', master_1.rating.min())
print('Max rating:', master_1.rating.max())

Min rating: 0.5
Max rating: 5.0


In [42]:
#Train test split with test size of 20% 

trainset, testset = train_test_split(df, test_size=0.2)


### KNNBasic
A basic collaborative filtering algorithm.

In [72]:
# Two similarity options 

sim_cos = {'name':'cosine', 'user_based':False}
sim_pearson = {'name':'pearson', 'user_based':False}

sim_options = [sim_cos, sim_pearson]

# Ks 
list_of_ks = [10,20,40]

In [77]:
# Hyperparameter Tuning
# KNNBasic 
for sim in sim_options:

    for k in list_of_ks:
        
        print(
            'Calculating sim_option = ' + str(sim['name']) + \
            ' and k = ' + str(k) + ':' )        
        algo = KNNBasic(k = k, sim_options = sim)
        results = cross_validate(algo, df, measures=['RMSE'], cv=3, return_train_measures=True);
        print('RMSE', np.mean(results['test_rmse']))


Calculating sim_option = cosine and k = 10:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.994164707209045
Calculating sim_option = cosine and k = 20:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9821544120129753
Calculating sim_option = cosine and k = 40:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.978840977526117
Calculating sim_option = pearson and k = 10:
Computing the pearson similarity matrix...
Done computing similarity 

Insights: For KNN Basic, pick {sim_option = cosine and k = 40} , RMSE 0.9808902092915393

In [60]:
# cross validating with KNNBasic, item-based approach 

sim_cos = {'name':'cosine', 'user_based':False}

basic = KNNBasic(sim_options=sim_cos)
cv_knn_basic = cross_validate(basic, trainset, measures=['RMSE'], cv=3, n_jobs = -1)

# n_jobs =-1 to ensures that all of the cores will be used to process fitting and evaluating. 


In [61]:
# print out the average RMSE score for the test set
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))



('test_rmse', array([0.98314383, 0.98263915, 0.97586189]))
('fit_time', (0.1603069305419922, 0.15911293029785156, 0.15041875839233398))
('test_time', (1.33148193359375, 1.3358078002929688, 1.3139100074768066))
-----------------------
0.980548287552845


### KNNMean
This is the same thing as the basic KNN model, except it takes into account the mean rating of each item.

In [64]:
# Hyperparameter Tuning
# KNNMeans 
for sim in sim_options:

    for k in list_of_ks:
        
        print(
            'Calculating sim_option = ' + str(sim['name']) + \
            ' and k = ' + str(k) + ':' )        
        algo = KNNWithMeans(k = k, sim_options = sim)
        results = cross_validate(algo, df, measures=['RMSE'], cv=3, return_train_measures=True);
        print('RMSE', np.mean(results['test_rmse']))


Calculating sim_option = cosine and k = 10:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9208576776199479
Calculating sim_option = cosine and k = 20:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9106196853698317
Calculating sim_option = cosine and k = 40:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9090380269766835
Calculating sim_option = pearson and k = 10:
Computing the pearson similarity matrix...
Done computing similarit

Insights: For KNN Means, pick {sim_option = pearson and k = 40} , RMSE 0.9079785175984204

In [65]:
# cross validating with KNNMean
mean = KNNWithMeans(sim_options=sim_cos)
cv_knn_mean = cross_validate(mean, df, n_jobs = -1)

In [66]:
# print out the average RMSE score for the test set
for i in cv_knn_mean.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_mean['test_rmse']))



('test_rmse', array([0.90727953, 0.89911284, 0.909075  , 0.89607843, 0.90052929]))
('test_mae', array([0.6935493 , 0.68596303, 0.69642293, 0.68642909, 0.6873128 ]))
('fit_time', (0.220444917678833, 0.2231001853942871, 0.23791122436523438, 0.21805715560913086, 0.21230316162109375))
('test_time', (1.1647169589996338, 1.1703667640686035, 1.1610219478607178, 1.1665871143341064, 1.1365859508514404))
-----------------------
0.9024150170275282


### KNNBaseline
It takes into account a baseline rating. It adds biases for items and users. 

In [69]:
# Hyperparameter Tuning
# KNNBaseline 
for sim in sim_options:

    for k in list_of_ks:
        
        print(
            'Calculating sim_option = ' + str(sim['name']) + \
            ' and k = ' + str(k) + ':' )        
        algo = KNNBaseline(k = k, sim_options = sim)
        results = cross_validate(algo, df, measures=['RMSE'], cv=3, return_train_measures=True);
        print('RMSE', np.mean(results['test_rmse']))


Calculating sim_option = cosine and k = 10:
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.8952920093005599
Calculating sim_option = cosine and k = 20:
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.8883825238225258
Calculating sim_option = cosine and k = 40:
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix..

Insights: For KNN Baseline, pick {sim_option = cosine and k = 40} , RMSE 0.8868368800819563

In [67]:
# cross validating with KNNBaseline
baseline = KNNBaseline(sim_options=sim_cos)
cv_knn_baseline = cross_validate(baseline, df, n_jobs = -1)

In [68]:
# print out the average score for the test set
for i in cv_knn_baseline.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_baseline['test_rmse']))


('test_rmse', array([0.88192946, 0.87554556, 0.8668803 , 0.885553  , 0.88569512]))
('test_mae', array([0.67470494, 0.67041945, 0.66629289, 0.67539273, 0.67660623]))
('fit_time', (0.27367234230041504, 0.27748608589172363, 0.27275800704956055, 0.24278593063354492, 0.25119614601135254))
('test_time', (1.4932341575622559, 1.4593956470489502, 1.4546098709106445, 1.4698169231414795, 1.439692735671997))
-----------------------
0.879120687193384


## Model-Based Methods 
## 3. Matrix Factorization with surprise
### SVD

In [100]:
## Perform a gridsearch with SVD

param_grid = {'n_factors':[20, 50, 100],'n_epochs': [5, 10, 15], 'lr_all': [0.002, 0.005, 0.01],
               'reg_all': [0.04, 0.06]}
gs_model = GridSearchCV(SVD, param_grid, cv=3, n_jobs = -1, joblib_verbose=3)
gs_model.fit(df)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   42.8s finished


In [101]:
# print out optimal parameters for SVD after GridSearch
print(gs_model.best_score['rmse'])
print(gs_model.best_params['rmse'])

0.8776272643241537
{'n_factors': 100, 'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.06}


In [102]:
# SVD with best params 
svd = SVD(n_factors=100, n_epochs=15, lr_all=0.01, reg_all=0.06)
cv_svd = cross_validate(svd, df, cv=3, n_jobs = -1)

In [103]:
for i in cv_svd.items():
    print(i)
print('-----------------------')
print(np.mean(cv_svd['test_rmse']))


('test_rmse', array([0.87349157, 0.87693559, 0.88158069]))
('test_mae', array([0.67147149, 0.67725865, 0.67718558]))
('fit_time', (2.3285319805145264, 2.3774940967559814, 2.8599600791931152))
('test_time', (0.1505889892578125, 0.15070605278015137, 0.1367800235748291))
-----------------------
0.8773359498086899


## Making Recommendations
### Making simple predictions

In [104]:
# make a prediction for an individual user and item using the SVD model above
model = SVD(n_factors=100, n_epochs=15, lr_all=0.01, reg_all=0.06)
model.fit(trainset)

# a prediction for user 50 and item 4 
pred = model.predict(uid = 5, iid=1)
score = pred.est
print('Estimated rating for user=2 item=4: ', round(score, 2))

Estimated rating for user=2 item=4:  3.87


## Make recommendations to an existing user (uid =50)

### reference https://blog.cambridgespark.com/tutorial-practical-introduction-to-recommender-systems-dbe22848392b

In [105]:
# find the movie ids that user 50 didn’t rate 

# get a list of all movie titles 
iids = master['title'].unique()
# Get a list of movie ids that uid 50 has rated 
iid50 = master.loc[master['userId']==50, 'title']
# Remove the iids that uid50 has rated from the lisf of iids
iids_to_pred = np.setdiff1d(iids, iid50)  # setdiff1d = Find the set difference of two arrays.



In [106]:
# Predict the score of each of the movie ids that user 50 didn’t rate, and find the best one.

# create another dataset with the iids
# arbitrarily set all the ratings of this test set to 4

testset = [[50, iid, 4.] for iid in iids_to_pred]
predictions = model.test(testset)
predictions[0]



Prediction(uid=50, iid="'71 (2014)", r_ui=4.0, est=4.201787625891698, details={'was_impossible': False})

In [110]:
# Predict the score of each of the movie ids that user 50 didn’t rate, and find the top one.

pred_ratings = np.array([pred.est for pred in predictions])

# Find the index of the max predicted rating 
#i_max = pred_ratings.argmax()

# Use this to find the corresponding movie title to recommend 
#iid = iids_to_pred[i_max]

#print('Top item for user 50 is {} with predicted rating {}'
#      .format(iid, pred_ratings[i_max]))


# Find the index of top 5 predicted ratings
i_max_5 = pred_ratings.argpartition(-5)[-5:]

iid_5 = iids_to_pred[i_max_5]

print('Top 5 item for user 50 is {}'
      .format(iid_5))


Top 5 item for user 50 is ['Fried Green Tomatoes (1991)'
 'Friday the 13th Part VI: Jason Lives (1986)'
 'Friday the 13th Part 3: 3D (1982)'
 'Friday the 13th Part IV: The Final Chapter (1984)'
 'À nous la liberté (Freedom for Us) (1931)']
