In [1]:
import os
import random

import numpy as np
import pandas as pd

from surprise import Reader, Dataset
from surprise import SVD, SVDpp, NMF
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

In [2]:
# Folder that holds the dataset files and the extension of the files to load.
data_folder = 'ml-latest-small'
file_format = 'csv'

# Seed both RNGs once, up front, so that Restart & Run All reproduces the same
# cross-validation folds and model initialisation (and therefore the same scores).
# The Surprise FAQ recommends seeding `random` and NumPy for reproducible experiments.
# Re-running individual cells out of order will still change the results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# os.listdir() returns entries in arbitrary order, so sort them to keep this list
# (and any code that indexes into it) stable across platforms and runs.
# Matching on the dotted extension skips names that merely end in the same letters.
paths_to_data_files = sorted(
    os.path.join(data_folder, file_name)
    for file_name in os.listdir(data_folder)
    if file_name.endswith('.' + file_format)
)

In [4]:
# Display the collected paths to confirm which data files were found.
paths_to_data_files

['ml-latest-small\\links.csv',
 'ml-latest-small\\movies.csv',
 'ml-latest-small\\ratings.csv',
 'ml-latest-small\\tags.csv']

In [5]:
# Load every table keyed by its file stem (e.g. "ratings" for ratings.csv) instead of
# by its position in the listing, so an extra CSV in the folder or a different
# listing order cannot bind a frame to the wrong variable.
frames_by_name = {
    os.path.splitext(os.path.basename(csv_file))[0]: pd.read_csv(csv_file)
    for csv_file in paths_to_data_files
}

# A missing file fails right here with a KeyError naming the table, rather than
# later with a NameError on the first use of a variable that was never assigned.
links_df = frames_by_name['links']
movies_df = frames_by_name['movies']
ratings_df = frames_by_name['ratings']
tags_df = frames_by_name['tags']

In [6]:
# Preview the first rows of the links table.
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
# Preview the first rows of the movies table.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first rows of the ratings table (the only table used for modelling below).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Preview the first rows of the tags table.
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [10]:
# Take the rating scale from the data itself: the smallest and largest observed
# rating become the bounds passed to the Reader.
rating_values = ratings_df['rating']
min_rating, max_rating = rating_values.min(), rating_values.max()
reader = Reader(rating_scale=(min_rating, max_rating))

In [11]:
# Build the Surprise dataset from the user, movie and rating columns only
# (the timestamp column is left out).
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings_df[rating_columns], reader=reader)

In [12]:
# Exhaustive search over (n_epochs, n_factors) for SVD, scored by RMSE and MAE
# with 10-fold cross-validation.
# NOTE(review): the saved output selects n_epochs=30 and n_factors=20, both on the
# edge of this grid, so a wider grid may find a better setting.
grid_param = dict(
    n_epochs=[5, 10, 20, 30],
    n_factors=[20, 50, 100],
)
grid_search = GridSearchCV(SVD, grid_param, measures=['rmse', 'mae'], cv=10)
grid_search.fit(data)

In [13]:
# Report the best cross-validated score per measure and the parameters that achieved it.
best_scores = grid_search.best_score
print("Best RMSE score:", best_scores['rmse'])
print("Best MAE score:", best_scores['mae'])
print("Best parameters:", grid_search.best_params)

Best RMSE score: 0.8628783942193377
Best MAE score: 0.6610466915687149
Best parameters: {'rmse': {'n_epochs': 30, 'n_factors': 20}, 'mae': {'n_epochs': 30, 'n_factors': 20}}


In [14]:
# Keep the parameter combination that gave the best RMSE for the final SVD model.
rmse_best_params = grid_search.best_params['rmse']
best_epochs = rmse_best_params['n_epochs']
best_factors = rmse_best_params['n_factors']

In [15]:
# Evaluate SVD with the grid-search winners using 10-fold cross-validation;
# verbose=True prints the per-fold table.
svd_model = SVD(n_factors=best_factors, n_epochs=best_epochs)
results_svd = cross_validate(
    algo=svd_model,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=10,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8502  0.8524  0.8666  0.8586  0.8730  0.8649  0.8605  0.8664  0.8782  0.8681  0.8639  0.0082  
MAE (testset)     0.6530  0.6570  0.6653  0.6607  0.6678  0.6587  0.6607  0.6665  0.6696  0.6613  0.6620  0.0049  
Fit time          0.67    0.71    0.68    0.61    0.59    0.62    0.59    0.66    0.64    0.62    0.64    0.04    
Test time         0.03    0.08    0.03    0.07    0.03    0.07    0.03    0.03    0.03    0.08    0.05    0.02    


In [16]:
# Average the per-fold test metrics returned by cross_validate for SVD.
m_rmse_svd, m_mae_svd = (results_svd[metric].mean() for metric in ('test_rmse', 'test_mae'))

print("Mean RMSE score:", m_rmse_svd)
print("Mean MAE score:", m_mae_svd)

Mean RMSE score: 0.8639040024801347
Mean MAE score: 0.6620492626244177


In [17]:
# SVDpp is created with no arguments, i.e. without the hyper-parameter search
# that was run for SVD.
svdpp_model = SVDpp()

In [18]:
# 5-fold cross-validation of the SVDpp model; verbose=True prints the per-fold table.
# NOTE(review): SVD was evaluated with cv=10, so the train/test split sizes differ
# between the models being compared.
results_svdpp = cross_validate(
    algo=svdpp_model,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8567  0.8664  0.8635  0.8596  0.8644  0.8621  0.0035  
MAE (testset)     0.6552  0.6659  0.6634  0.6601  0.6604  0.6610  0.0036  
Fit time          39.00   37.61   36.58   37.03   36.58   37.36   0.90    
Test time         6.64    6.65    5.85    5.80    5.80    6.15    0.41    


In [19]:
# Average the per-fold test metrics returned by cross_validate for SVDpp.
m_rmse_svdpp, m_mae_svdpp = (results_svdpp[metric].mean() for metric in ('test_rmse', 'test_mae'))

print("Mean RMSE score:", m_rmse_svdpp)
print("Mean MAE score:", m_mae_svdpp)

Mean RMSE score: 0.8620958295638201
Mean MAE score: 0.661006907980639


In [20]:
# NMF is created with no arguments, i.e. without the hyper-parameter search
# that was run for SVD.
nmf_model = NMF()

In [23]:
# 5-fold cross-validation of the NMF model; verbose=True prints the per-fold table.
# NOTE(review): SVD was evaluated with cv=10, so the train/test split sizes differ
# between the models being compared.
results_nmf = cross_validate(
    algo=nmf_model,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9240  0.9143  0.9158  0.9267  0.9297  0.9221  0.0060  
MAE (testset)     0.7061  0.7007  0.6999  0.7077  0.7140  0.7057  0.0051  
Fit time          1.31    1.22    1.29    1.36    1.27    1.29    0.05    
Test time         0.05    0.11    0.12    0.05    0.11    0.09    0.03    


In [22]:
# Average the per-fold test metrics returned by cross_validate for NMF.
# NOTE(review): this cell's saved output predates the cross-validation run above
# (execution count 22 vs 23) and its means do not match that table; re-run this
# cell before quoting the numbers.
m_rmse_nmf, m_mae_nmf = (results_nmf[metric].mean() for metric in ('test_rmse', 'test_mae'))

print("Mean RMSE score:", m_rmse_nmf)
print("Mean MAE score:", m_mae_nmf)

Mean RMSE score: 0.9128912823660974
Mean MAE score: 0.6988577623078528


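Найкращі результати показала модель SVDpp, найгірші — модель NMF, тоді як SVD лише трохи поступається SVDpp. Водночас різниця між SVD та SVDpp (приблизно 0.002 за RMSE) менша за стандартне відхилення між фолдами (0.0082 для SVD), тому її не варто вважати суттєвою.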

SVD: 
Mean RMSE score: 0.8639040024801347
Mean MAE score: 0.6620492626244177

SVDpp:
Mean RMSE score: 0.8620958295638201
Mean MAE score: 0.661006907980639

NMF:
Mean RMSE score: 0.9128912823660974
Mean MAE score: 0.6988577623078528