In [1]:
import pandas as pd
import numpy as np
import statistics
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import cross_validate, KFold, GridSearchCV

import warnings
warnings.filterwarnings("ignore")

# Introduction
This model exploits the similarities between the preferences of the users and the past ratings given by a user to generate recommendations.

In [2]:
df = pd.read_csv('user.csv')
df.shape

(11458900, 5)

#### Notes: We only run 25% of the dataset due to lack of time 

In [3]:
df = df.drop(columns = ['Unnamed: 0'])
df = df.sample(frac=0.025, replace=True, random_state=1)
display(df.head(2))
print(df.shape)

Unnamed: 0,movieId,original_title,userId,rating
4686059,3052,Don't Make Waves,185614,4.5
6762380,1959,Swept from the Sea,129519,4.0


(286472, 4)


In [4]:
df_movie = df[['movieId','original_title']]
display(df_movie.head(5))
print(df_movie.shape)

Unnamed: 0,movieId,original_title
4686059,3052,Don't Make Waves
6762380,1959,Swept from the Sea
491263,260,The 39 Steps
6662859,1611,Das Wunder von Bern
9554767,558,Spider-Man 2


(286472, 2)


In [5]:
df_movie_name = df_movie.groupby('movieId').nth([0]).reset_index()
display(df_movie_name.head(5))
print(df_movie_name.shape)

Unnamed: 0,movieId,original_title
0,2,Ariel
1,3,Varjoja paratiisissa
2,5,Four Rooms
3,6,Judgment Night
4,11,Star Wars


(4405, 2)


In [6]:
model_df = pd.DataFrame({'movie': df['movieId'], 'user_id':df['userId'], 'rating':df['rating']})
display(model_df.head(5))
print(model_df.shape)

Unnamed: 0,movie,user_id,rating
4686059,3052,185614,4.5
6762380,1959,129519,4.0
491263,260,57380,3.0
6662859,1611,246589,3.5
9554767,558,90607,3.0


(286472, 3)


To Build the recommendation system, we need to use a Scikit-learn library called Surprise. This library is very useful for build and analyze recommendation system with rating dataset.

Since, not every users rated all the movies is hard to evaluate the accuracy. Luckily, the Surprise library has built-in features that can calculate the accuracy of various machine learning models like SVD,KNN and etc.

In [7]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(model_df[['user_id', 'movie', 'rating']], reader)

It's time to do machine learning. Let's fit a Singular Value Decomposition model (SVD). 
The Singular Value Decomposition model will separate the user-item rating matrix into two smaller matrices containing user and movie features. Then, it will predicts the ratings based on the dot product of the two matrices.

Next, we need to do train test split. I am using the built in three-fold cross validation function to optimize the hyperparameters.

In [8]:
algo = SVD(random_state=10)

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True);

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9520  0.9552  0.9523  0.9532  0.0015  
MAE (testset)     0.7401  0.7413  0.7392  0.7402  0.0009  
Fit time          10.85   10.96   10.79   10.87   0.07    
Test time         0.83    0.71    0.71    0.75    0.06    


The table shows that Fold 2 has the lowest RMSE error. <br>
Let's try to fit into the SVD parameter and reducing the RMSE error.

In [9]:
kf = KFold(n_splits = 3)

algo = SVD()

all_acc = []
for trainset, testset in kf.split(data):

    algo.fit(trainset)
    predictions = algo.test(testset)
   
    acc = accuracy.rmse(predictions, verbose=True);
    all_acc.append(acc)
    

RMSE: 0.9508
RMSE: 0.9531
RMSE: 0.9542


In [10]:
statistics.mean(all_acc)

0.9527214184603517

Using Grid search to finetune the hyperparameters and minimize the error term using stochastic gradient descent algorithm:

In [11]:
param_grid = {'n_epochs': [15, 25], 'lr_all': [0.002, 0.004],
              'reg_all': [0.2, 0.4]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)

print(f"RMSE Accuary Score: {grid_search.best_score['rmse']}")
print(grid_search.best_params['rmse'])

RMSE Accuary Score: 0.9489231651015894
{'n_epochs': 25, 'lr_all': 0.004, 'reg_all': 0.2}


In [12]:
param_grid2 = {'n_epochs': [25, 35], 'lr_all': [0.003, 0.005],
              'reg_all': [0.1, 0.3]}
grid_search2 = GridSearchCV(SVD, param_grid2, measures=['rmse'], cv=3)
grid_search2.fit(data)

print(f"RMSE Accuary Score: {grid_search2.best_score['rmse']}")
print(grid_search2.best_params['rmse'])

RMSE Accuary Score: 0.9436076004911383
{'n_epochs': 35, 'lr_all': 0.005, 'reg_all': 0.1}


As the result, the GridSearchCV shows that the model 2 get the lowest RMSE accuary. Then, we will apply the result into our model.

In [13]:
trainset = data.build_full_trainset()

model = SVD(n_epochs = 35, lr_all = 0.005, reg_all = 0.1)

model.fit(trainset);

# Movie Collaborative Filtering Function 

This Movie Collaborative Filtering Function helps user find the movies that they haven't rate (WATCH).

In [14]:
def movie_collaborative_fitering(user_id):

    movie = model_df['movie'].unique()
    rated = model_df.loc[model_df['user_id']== user_id, 'movie']
    unrated = np.setdiff1d(movie, rated)
    
    test_set = [[user_id, item_id, 4] for item_id in unrated]
    predictions = model.test(test_set)
    pred_ratings = np.array([pred.est for pred in predictions])
    
    top_item = pred_ratings.argsort()[-5:][::-1]
    item_id = unrated[top_item]
    
    x = pd.DataFrame()
    for movie_id in item_id:
        recommended_movies = [(df_movie_name[df_movie_name['movieId'] == movie_id])]
        x = x.append(recommended_movies)
    print('Recommended movies:')
    
    return display(x)

In [15]:
user_test = model_df['user_id']
user_test = user_test.reset_index(drop= True)
user_test.head()

0    185614
1    129519
2     57380
3    246589
4     90607
Name: user_id, dtype: int64

In [16]:
movie_collaborative_fitering(user_test[0])

Recommended movies:


Unnamed: 0,movieId,original_title
2765,26366,Ellen DeGeneres: The Beginning
3917,84273,Потомок Чингисхана
1084,1900,Traffic
1592,3086,The Lady Eve
57,96,Beverly Hills Cop II


In [17]:
movie_collaborative_fitering(user_test[1])

Recommended movies:


Unnamed: 0,movieId,original_title
2765,26366,Ellen DeGeneres: The Beginning
1084,1900,Traffic
3917,84273,Потомок Чингисхана
255,326,Snakes on a Plane
1592,3086,The Lady Eve


In [18]:
movie_collaborative_fitering(user_test[2])

Recommended movies:


Unnamed: 0,movieId,original_title
1084,1900,Traffic
2765,26366,Ellen DeGeneres: The Beginning
1466,2757,Adaptation.
4378,160718,I Spy Returns
1353,2330,Taxi


The Movie Collaborative Filtering can provide recommendations on movie we haven't rate (WATCH). Lastly, we should run all the dataset instead of 25% of the dataset to see how accurate the result. 