In [94]:
from surprise import KNNWithMeans, KNNBasic, SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNWithZScore, BaselineOnly, CoClustering
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
import pandas as pd

In [66]:
# Read the three source tables from CSV files in the working directory.
ratings, movies, tags = map(pd.read_csv, ('ratings.csv', 'movies.csv', 'tags.csv'))

In [67]:
# Attach every rating to its movie row (join on movieId), renumber the rows,
# then drop any row that still contains a missing value.
mov_rat = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)
mov_rat.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [112]:
# Build the dataset: a three-column frame (user, item, rating) in the column
# order that Dataset.load_from_df reads.
# NOTE(review): the movie title is used as the item id; if two different
# movieIds share a title their ratings would be merged — confirm this is intended.
dataset = (
    mov_rat[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

# Ratings are declared to lie on a 0.5–5.0 scale.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [113]:
# Hold out 15% of the ratings as a test set. The fixed random_state makes the
# shuffle deterministic, so the same split is produced on every kernel restart.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [123]:
# Estimate RMSE for a range of models.
#
# Every algorithm is scored with 5-fold cross-validation on `data`; the mean
# test RMSE over the folds is stored together with the algorithm's class name.
# The records are plain dicts so the summary frame can be built in one step
# (Series.append and set_axis(inplace=True) no longer exist in pandas >= 2.0).

benchmark = []
for algorithm in [
    SVD(),
    SlopeOne(),
    NormalPredictor(),
    KNNWithMeans(),
    NMF(),
    CoClustering(),
    BaselineOnly(),
    KNNWithZScore(),
    KNNBaseline(),
]:

    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=True)

    benchmark.append({
        # The class name is taken directly instead of parsing the object's
        # repr string, which depends on the default repr format.
        'Algorithm': type(algorithm).__name__,
        # Mean of the per-fold test RMSE values.
        'test_rmse': pd.Series(results['test_rmse']).mean(),
    })

# One row per algorithm, indexed by name, with a single 'test_rmse' column.
df = pd.DataFrame(benchmark).set_index('Algorithm')

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8711  0.8830  0.8740  0.8719  0.8712  0.8742  0.0045  
Fit time          5.08    5.04    5.02    4.99    5.04    5.03    0.03    
Test time         0.15    0.16    0.26    0.15    0.15    0.17    0.04    
Evaluating RMSE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9044  0.8966  0.9092  0.8980  0.8996  0.9015  0.0046  
Fit time          5.33    5.85    5.68    5.48    5.88    5.64    0.21    
Test time         8.71    8.73    8.75    8.92    9.36    8.89    0.24    
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4242  1.4245  1.4254  1.4305  1.4274  1.4264  0.0023  
Fit time          0.19    0.19    0.20    0.19    0.18    0.19    0.01    
Test time  

In [119]:
# Rank the algorithms from lowest to highest mean test RMSE.
df.sort_values(by='test_rmse', ascending=True)

Unnamed: 0_level_0,test_rmse
Algorithm,Unnamed: 1_level_1
BaselineOnly,0.872502
KNNBaseline,0.873928
SVD,0.87409
KNNWithZScore,0.895938
KNNWithMeans,0.897458
SlopeOne,0.901739
NMF,0.921948
CoClustering,0.946494
NormalPredictor,1.425703
