In [29]:
# import libraries
import os

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Dataset, Reader, SVD, NMF 
from surprise.model_selection import cross_validate, GridSearchCV

In [5]:
# read dataset
data_path = Path("data")
ratings_data_file = os.path.join(data_path, 'ratings.csv')

ratings_data = pd.read_csv(ratings_data_file)
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [25]:
# read dataset
movies_data_file = os.path.join(data_path, 'movies.csv')

movies_data = pd.read_csv(movies_data_file)
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Get minimum and maximum rating from the dataset
min_rating = ratings_data.rating.min()
max_rating = ratings_data.rating.max()
 
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)

In [9]:
svd = SVD(n_epochs=10)
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8814  0.8830  0.8708  0.8810  0.8771  0.8670  0.8741  0.8905  0.8842  0.8711  0.8780  0.0069  
MAE (testset)     0.6805  0.6775  0.6730  0.6757  0.6747  0.6689  0.6713  0.6861  0.6829  0.6732  0.6764  0.0051  
Fit time          0.34    0.31    0.33    0.33    0.32    0.33    0.32    0.31    0.42    0.36    0.34    0.03    
Test time         0.05    0.04    0.17    0.04    0.04    0.04    0.04    0.04    0.04    0.11    0.06    0.04    


In [10]:
print("Average MAE: ", np.average(results["test_mae"]))
print("Average RMSE: ", np.average(results["test_rmse"]))

Average MAE:  0.6763766722411801
Average RMSE:  0.8780216504973465


#### Hyperparameter Tuning

In [13]:
param_grid = {
  'n_factors': [20, 50, 100],
  'n_epochs': [5, 10, 20]
}
 
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)
 
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8653184403785428
{'n_factors': 20, 'n_epochs': 20}


In [14]:
from surprise.model_selection import train_test_split
 
# best hyperparameters
best_factor = gs.best_params['rmse']['n_factors']
best_epoch = gs.best_params['rmse']['n_epochs']
 
# sample random trainset and testset
# test set is made of 20% of the ratings.
trainset, testset = train_test_split(data, test_size=.20)
 
# We'll use the famous SVD algorithm.
svd = SVD(n_factors=best_factor, n_epochs=best_epoch)
 
# Train the algorithm on the trainset
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc009f484c0>

In [17]:
ratings_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [26]:
def generate_recommendation(model, user_id, ratings_df, movies_df, n_items):
   # Get a list of all movie IDs from dataset
   movie_ids = ratings_df["movieId"].unique()
 
   # Get a list of all movie IDs that have been watched by user
   movie_ids_user = ratings_df.loc[ratings_df["userId"] == user_id, "movieId"]
    # Get a list off all movie IDS that that have not been watched by user
   movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user)
 
   # Apply a rating of 4 to all interactions (only to match the Surprise dataset format)
   test_set = [[user_id, movie_id, 4] for movie_id in movie_ids_to_pred]
 
   # Predict the ratings and generate recommendations
   predictions = model.test(test_set)
   pred_ratings = np.array([pred.est for pred in predictions])
   print("Top {0} item recommendations for user {1}:".format(n_items, user_id))
   # Rank top-n movies based on the predicted ratings
   index_max = (-pred_ratings).argsort()[:n_items]
   for i in index_max:
       movie_id = movie_ids_to_pred[i]
       print(movies_df[movies_df["movieId"]==movie_id]["title"].values[0], pred_ratings[i])
 
 
# define which user ID that we want to give recommendation
userID = 23
# define how many top-n movies that we want to recommend
n_items = 10
# generate recommendation using the model that we have trained
generate_recommendation(svd,userID,ratings_data,movies_data,n_items)

Top 10 item recommendations for user 23:
Shawshank Redemption, The (1994) 4.166106712301013
Lawrence of Arabia (1962) 4.149825067525008
Philadelphia Story, The (1940) 4.026330344089726
Boondock Saints, The (2000) 4.015354749023998
Serenity (2005) 4.008519613999052
Streetcar Named Desire, A (1951) 4.008508035884578
Rosemary's Baby (1968) 3.9998930793252967
Three Billboards Outside Ebbing, Missouri (2017) 3.981167142285495
Eternal Sunshine of the Spotless Mind (2004) 3.9763664403318697
North by Northwest (1959) 3.974732627590121
