In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#from sklearn import cross_validation as cv

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, SVD, NMF
import warnings

In [2]:
# read master data
master = pd.read_csv('Data/master.csv')

# Drop unnecessary columns
master_1 = master.loc[:, ['movieId', 'userId', 'rating']]
master_1.drop_duplicates(inplace=True)

## Collaborative filtering 

1. KNN with scipy sparse matrix 
2. KNN-KNNBasic, KNNMeans, KNNBasline (with surprise)
3. Matrix Factorization- SVD (with surpise)


## Memory-Based Methods (Neighborhood-Based)
## 1. KNN with scipy sparse matrix 
Because we have fewer items than users, we calculate item-item similarity. 

### Movie-user matrices (rows=movieId, columns=userId)

In [3]:
# pivot and create movie-user matrix and fill missing observations with 0s
movie_to_user_df = master_1.pivot(
    index='movieId',
     columns='userId',
      values='rating').fillna(0)

movie_to_user_df.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# transform matrix to scipy sparse matrix

movie_to_user_sparse_df = csr_matrix(movie_to_user_df.values)
movie_to_user_sparse_df

# Check the shape
print(movie_to_user_sparse_df.shape)


(9724, 610)


###  Check sparsity of the movie-user matrix 

In [5]:
# calcuate total number of entries in the movie-user matrix
num_entries = movie_to_user_df.shape[0] * movie_to_user_df.shape[1]
# calculate total number of entries with zero values
num_zeros = (movie_to_user_df==0).sum(axis=1).sum()
# calculate ratio of number of zeros to number of entries
ratio_zeros = num_zeros / num_entries
print('There is about {:.2%} of ratings in our data is missing'.format(ratio_zeros))

There is about 98.30% of ratings in our data is missing


Insights: The majority of entries is zero. With too many zeros, the distance between similar items in KNN model will be very large.

### Fitting K-Nearest Neighbours model to the scipy sparse matrix

In [6]:
# Fiting the model 

knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

knn_model.fit(movie_to_user_sparse_df)



NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

### Using the trained model to make movie recommendations

In [7]:
movies_list = list(movie_to_user_df.index)
# Creating a dictionary with movie name as key and its index from the list as value 
movie_dict = {movie : index for index, movie in enumerate(movies_list)}

In [8]:
## function to find top n similar movies of the given movie title. 

def get_similar_movies(title, n = 5):
    ## get movieId from title
    movie_id= master.loc[master.title==title, 'movieId'][0]
    ## input to this function is the movie and number of top similar movies you want.
    index = movie_dict[movie_id]
    knn_input = np.asarray([movie_to_user_df.values[index]])
    n = min(len(movies_list)-1,n)
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
    print("Top",n,"movies which are very much similar to the Movie-",title, "are: ")
    print(" ")
    for i in range(1,len(distances[0])):
        print(master.loc[master.movieId==movies_list[indices[0][i]], 'title'][:1])
        #print(movies_list[indices[0][i]], master.loc[master.movieId==movies_list[indices[0][i]], 'title'][:1])
        
  

In [9]:
from pprint import pprint
get_similar_movies('Toy Story (1995)',5)

Top 5 movies which are very much similar to the Movie- Toy Story (1995) are: 
 
52421    Toy Story 2 (1999)
Name: title, dtype: object
13387    Jurassic Park (1993)
Name: title, dtype: object
19177    Independence Day (a.k.a. ID4) (1996)
Name: title, dtype: object
6905    Star Wars: Episode IV - A New Hope (1977)
Name: title, dtype: object
10305    Forrest Gump (1994)
Name: title, dtype: object


## 2. KNN with surprise

In [10]:
# read new_df as Surprise dataset 
# specify the range of rating 0.5-5 (defalt setting is 1-5)
reader = Reader(rating_scale =(0.5, 5) ) 
df = Dataset.load_from_df(master_1,reader)

In [11]:
# report how many users and items we have in our dataset
dataset = df.build_full_trainset()

print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items, '\n')

Number of users:  9724 

Number of items:  610 



In [12]:
# the range of ratings 
print('Min rating:', master_1.rating.min())
print('Max rating:', master_1.rating.max())

Min rating: 0.5
Max rating: 5.0


In [13]:
#Train test split with test size of 20% 

trainset, testset = train_test_split(df, test_size=0.2)


### KNNBasic
A basic collaborative filtering algorithm.

In [14]:
# Two similarity options 

sim_cos = {'name':'cosine', 'user_based':False}
sim_pearson = {'name':'pearson', 'user_based':False}

sim_options = [sim_cos, sim_pearson]

# Ks 
list_of_ks = [10,20,40]

In [15]:
# Hyperparameter Tuning
# KNNBasic 
for sim in sim_options:

    for k in list_of_ks:
        
        print(
            'Calculating sim_option = ' + str(sim['name']) + \
            ' and k = ' + str(k) + ':' )        
        algo = KNNBasic(k = k, sim_options = sim)
        results = cross_validate(algo, df, measures=['RMSE'], cv=3, return_train_measures=True);
        print('RMSE', np.mean(results['test_rmse']))


Calculating sim_option = cosine and k = 10:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9953967464817491
Calculating sim_option = cosine and k = 20:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9828086133897894
Calculating sim_option = cosine and k = 40:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9810974946298798
Calculating sim_option = pearson and k = 10:
Computing the pearson similarity matrix...
Done computing similarit

Insights: For KNN Basic, pick {sim_option = cosine and k = 40} , RMSE 0.9808902092915393

In [17]:
# cross validating with KNNBasic, item-based approach 

sim_cos = {'name':'cosine', 'user_based':False}

basic = KNNBasic(sim_options=sim_cos)
cv_knn_basic = cross_validate(basic, df, measures=['RMSE'], cv=3, n_jobs = -1)

# n_jobs =-1 to ensures that all of the cores will be used to process fitting and evaluating. 


In [18]:
# print out the average RMSE score for the test set
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))



('test_rmse', array([0.98523485, 0.97086065, 0.98041521]))
('fit_time', (0.16493916511535645, 0.1630570888519287, 0.16947031021118164))
('test_time', (1.340721845626831, 1.327664852142334, 1.3435888290405273))
-----------------------
0.9788369044334124


### KNNMean
This is the same thing as the basic KNN model, except it takes into account the mean rating of each item.

In [19]:
# Hyperparameter Tuning
# KNNMeans 
for sim in sim_options:

    for k in list_of_ks:
        
        print(
            'Calculating sim_option = ' + str(sim['name']) + \
            ' and k = ' + str(k) + ':' )        
        algo = KNNWithMeans(k = k, sim_options = sim)
        results = cross_validate(algo, df, measures=['RMSE'], cv=3, return_train_measures=True);
        print('RMSE', np.mean(results['test_rmse']))


Calculating sim_option = cosine and k = 10:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9214135833353835
Calculating sim_option = cosine and k = 20:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9119544160755719
Calculating sim_option = cosine and k = 40:
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.9093085234394095
Calculating sim_option = pearson and k = 10:
Computing the pearson similarity matrix...
Done computing similarit

Insights: For KNN Means, pick {sim_option = pearson and k = 40} , RMSE 0.9079785175984204

In [20]:
# cross validating with KNNMean
mean = KNNWithMeans(sim_options=sim_cos)
cv_knn_mean = cross_validate(mean, df, n_jobs = -1)

In [21]:
# print out the average RMSE score for the test set
for i in cv_knn_mean.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_mean['test_rmse']))



('test_rmse', array([0.90628256, 0.90321849, 0.90417636, 0.8927588 , 0.90459365]))
('test_mae', array([0.69222423, 0.69158727, 0.6905144 , 0.68378511, 0.69215994]))
('fit_time', (0.2415471076965332, 0.23057103157043457, 0.2398841381072998, 0.2309551239013672, 0.24585795402526855))
('test_time', (1.1737420558929443, 1.2422337532043457, 1.1536052227020264, 1.200530767440796, 1.1927309036254883))
-----------------------
0.9022059721784249


### KNNBaseline
It takes into account a baseline rating. It adds biases for items and users. 

In [22]:
# Hyperparameter Tuning
# KNNBaseline 
for sim in sim_options:

    for k in list_of_ks:
        
        print(
            'Calculating sim_option = ' + str(sim['name']) + \
            ' and k = ' + str(k) + ':' )        
        algo = KNNBaseline(k = k, sim_options = sim)
        results = cross_validate(algo, df, measures=['RMSE'], cv=3, return_train_measures=True);
        print('RMSE', np.mean(results['test_rmse']))


Calculating sim_option = cosine and k = 10:
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.8947970449069572
Calculating sim_option = cosine and k = 20:
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE 0.8881197537445793
Calculating sim_option = cosine and k = 40:
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix..

Insights: For KNN Baseline, pick {sim_option = cosine and k = 40} , RMSE 0.8868368800819563

In [23]:
# cross validating with KNNBaseline
baseline = KNNBaseline(sim_options=sim_cos)
cv_knn_baseline = cross_validate(baseline, df, n_jobs = -1)

In [24]:
# print out the average score for the test set
for i in cv_knn_baseline.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_baseline['test_rmse']))


('test_rmse', array([0.87866426, 0.87817519, 0.8755136 , 0.8815892 , 0.88021499]))
('test_mae', array([0.67234037, 0.67367921, 0.66637112, 0.67430384, 0.67492312]))
('fit_time', (0.2752220630645752, 0.2796192169189453, 0.3004002571105957, 0.2790038585662842, 0.2596869468688965))
('test_time', (1.4980831146240234, 1.505275011062622, 1.482499122619629, 1.5172221660614014, 1.475606918334961))
-----------------------
0.8788314472303774


## Model-Based Methods 
## 3. Matrix Factorization with surprise
### SVD

In [25]:
## Perform a gridsearch with SVD

param_grid = {'n_factors':[20, 50, 100],'n_epochs': [5, 10, 15], 'lr_all': [0.002, 0.005, 0.01],
               'reg_all': [0.04, 0.06]}
gs_model = GridSearchCV(SVD, param_grid, cv=3, n_jobs = -1, joblib_verbose=3)
gs_model.fit(df)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   48.9s finished


In [26]:
# print out optimal parameters for SVD after GridSearch
print(gs_model.best_score['rmse'])
print(gs_model.best_params['rmse'])

0.8769195312018185
{'n_factors': 100, 'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.06}


In [27]:
# SVD with best params 
svd = SVD(n_factors=100, n_epochs=15, lr_all=0.01, reg_all=0.06)
cv_svd = cross_validate(svd, df, cv=3, n_jobs = -1)

In [28]:
for i in cv_svd.items():
    print(i)
print('-----------------------')
print(np.mean(cv_svd['test_rmse']))


('test_rmse', array([0.87070164, 0.88113095, 0.88073159]))
('test_mae', array([0.67301447, 0.67659699, 0.67820408]))
('fit_time', (3.17297101020813, 2.837721109390259, 3.1856589317321777))
('test_time', (0.16179895401000977, 0.17902803421020508, 0.15520310401916504))
-----------------------
0.877521391025725


## Making Recommendations
### Making simple predictions

In [36]:
# make a prediction for an individual user and item using the SVD model above
model = SVD(n_factors=100, n_epochs=15, lr_all=0.01, reg_all=0.06)
model.fit(trainset)

# a prediction for user 50 and item 4 
pred = model.predict(uid = 5, iid=1)
score = pred.est
print('Estimated rating for user=2 item=4: ', round(score, 2))

Estimated rating for user=2 item=4:  3.73


## Make recommendations to an existing user (uid =50)

### reference https://blog.cambridgespark.com/tutorial-practical-introduction-to-recommender-systems-dbe22848392b

In [41]:
# find the movie ids that user_id=50 didn’t rate 

# get a list of all movie titles 
iids = master['title'].unique()
# Get a list of movie ids that uid 50 has rated 
iid50 = master.loc[master['userId']==50, 'title']
# Remove the iids that uid50 has rated from the lisf of iids
iids_to_pred = np.setdiff1d(iids, iid50)  # setdiff1d = Find the set difference of two arrays.



In [42]:
# Predict the score of each of the movie ids that user 50 didn’t rate, and find the best one.

# create another dataset with the iids
# arbitrarily set all the ratings of this test set to 4

testset = [[50, iid, 4.] for iid in iids_to_pred]
predictions = model.test(testset)
predictions[0]



Prediction(uid=50, iid="'71 (2014)", r_ui=4.0, est=4.192082135941737, details={'was_impossible': False})

In [104]:
# Predict the score of each of the movie ids that user 50 didn’t rate, and find the top one.

pred_ratings = np.array([pred.est for pred in predictions])


# Find the index of top 5 predicted ratings
i_max_5 = pred_ratings.argpartition(-5)[-5:]

# Use this to find the corresponding movie title to recommend 
iid_5 = iids_to_pred[i_max_5]

print('Top 5 item for user 50 is {}'
      .format(iid_5))


Top 5 item for user 50 is ['Fried Green Tomatoes (1991)'
 'Friday the 13th Part VI: Jason Lives (1986)'
 'Friday the 13th Part 3: 3D (1982)'
 'Friday the 13th Part IV: The Final Chapter (1984)'
 'À nous la liberté (Freedom for Us) (1931)']
Predicted ratings [4.19208214 4.19208214 4.19208214 4.19208214 4.19208214]


In [67]:
def actual_top5(user_id, master_data):
    # return top5 movie titles and ratings that user actually rated 
    rated = master.loc[master_data['userId']==user_id, ['title', 'rating']]
    rated = rated.sort_values('rating', ascending=False)[:5]
    return rated
    

In [68]:
actual_top5(50, master)

Unnamed: 0,title,rating
21443,2001: A Space Odyssey (1968),4.5
25969,Lawrence of Arabia (1962),4.5
26200,Apocalypse Now (1979),4.5
28272,8 1/2 (8½) (1963),4.5
75595,Persona (1966),4.0


## Make recommendations to an existing user

In [62]:
# Function to provide top 5 ratings for user i 

def top5(user_id, master_data):
    
    '''
    Step 1 find the movies that user i didn’t rate 
    
    Step 2 Predict the score of each of the movie that user 50 didn’t rate, and find top 5.
    '''
    
    # step 1 find the movies that user i didn’t rate 
    # get a list of all movie titles 
    mids = master_data['title'].unique()
    # Get a list of movie ids that user i has rated 
    mid_i = master_data.loc[master_data['userId']==user_id, 'title']
    # from the list of all movie, remove titles user i has rated
    mids_to_pred = np.setdiff1d(mids, mid_i)  
    
    # step 2 Predict the score of each of the movie that user 50 didn’t rate, and find the best one.
    # create another dataset with the mids (movie titles). arbitrarily set all the ratings of this test set to 4
    testset = [[user_id, mid, 4.] for mid in mids_to_pred]
    # Fit the testdata to our model and get predicted ratings
    predictions = model.test(testset)
    pred_ratings = np.array([pred.est for pred in predictions])
    
    # Find the index of top 5 predicted ratings
    top_5 = pred_ratings.argpartition(-5)[-5:]
    # Find the corresponding movie title to recommend 
    top_5_title = mids_to_pred[i_max_5]
    
    return print('Top 5 item for user {} is {}'.format(user_id, top_5_title))




In [98]:
# Function which returns top10 movie titles with ratings that user i actually rated 
def actual_top10(user_id, master_data):
    
    rated = master.loc[master_data['userId']==user_id, ['title', 'rating']]
    rated = rated.sort_values('rating', ascending=False)[:10]
    return rated
    

## Evaluating our recommendation system

### A case of User Id = 50

In [99]:
# Test the function with user_id 50 
top5(50, master)

Top 5 item for user 50 is ['Fried Green Tomatoes (1991)'
 'Friday the 13th Part VI: Jason Lives (1986)'
 'Friday the 13th Part 3: 3D (1982)'
 'Friday the 13th Part IV: The Final Chapter (1984)'
 'À nous la liberté (Freedom for Us) (1931)']


In [100]:
# Check the actual rating by use 50
actual_top10(50, master)

Unnamed: 0,title,rating
21443,2001: A Space Odyssey (1968),4.5
25969,Lawrence of Arabia (1962),4.5
26200,Apocalypse Now (1979),4.5
28272,8 1/2 (8½) (1963),4.5
75595,Persona (1966),4.0
24445,Monty Python and the Holy Grail (1975),4.0
25644,Brazil (1985),4.0
25831,"Good, the Bad and the Ugly, The (Buono, il bru...",4.0
27587,Stalker (1979),4.0
28291,Chinatown (1974),4.0


### A case of User Id 100

In [101]:
# Test the function with user_id 100
top5(100, master)

Top 5 item for user 100 is ['Fraternity Vacation (1985)' 'Frankie and Johnny (1966)'
 'Frankenstein Must Be Destroyed (1969)' 'Frankenstein Unbound (1990)'
 'Woman in Gold (2015)']


In [102]:
# Check the actual rating by user 100
actual_top10(100, master)

Unnamed: 0,title,rating
37674,Terms of Endearment (1983),5.0
44261,Christmas Vacation (National Lampoon's Christm...,5.0
68977,Sweet Home Alabama (2002),5.0
24100,Top Gun (1986),5.0
60795,"Officer and a Gentleman, An (1982)",5.0
30277,When Harry Met Sally... (1989),4.5
36972,Out of Sight (1998),4.5
36098,"Wedding Singer, The (1998)",4.5
34950,Sliding Doors (1998),4.5
34939,"Joy Luck Club, The (1993)",4.5


### A case of User Id 150

In [105]:
# Test the function with user_id 150
top5(150, master)

Top 5 item for user 150 is ['Foreign Correspondent (1940)' 'Forbidden Planet (1956)'
 'For the Love of Benji (1977)' 'Forbidden Games (Jeux interdits) (1952)'
 'Who Framed Roger Rabbit? (1988)']


In [106]:
# Check the actual rating by user 150
actual_top10(150, master)

Unnamed: 0,title,rating
30758,Star Trek: First Contact (1996),5.0
1553,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),5.0
3943,"Birdcage, The (1996)",5.0
17772,Mission: Impossible (1996),4.0
462,Heat (1995),4.0
1339,Leaving Las Vegas (1995),4.0
1842,Dead Man Walking (1995),4.0
2617,Mighty Aphrodite (1995),4.0
19818,"Time to Kill, A (1996)",4.0
19228,Independence Day (a.k.a. ID4) (1996),4.0
