### Задание 
Используя данные MovieLens 1M, получите RMSE на тестовом сете <= 0.87

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [2]:
df_ratings = pd.read_csv('data/ratings.csv')
df_movies = pd.read_csv('data/movies.csv')

In [3]:
df = pd.merge(df_ratings, df_movies, on='movieId')

In [5]:
#!pip install surprise
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

In [6]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [7]:
RATING_MIN, RATING_MAX = df_ratings.rating.min(), df_ratings.rating.max()

In [8]:
df_for_surpise = df_ratings[['userId', 'movieId', 'rating']]
df_for_surpise.columns = ['uid', 'iid', 'rating']
df_for_surpise.head()

Unnamed: 0,uid,iid,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [9]:
reader = Reader(rating_scale=(RATING_MIN, RATING_MAX))
dataset = Dataset.load_from_df(df_for_surpise, reader)
trainset, testset = train_test_split(dataset, test_size=0.2)

### Попробуем KNNBasic в качестве baseline модели

In [37]:
from surprise import KNNBasic

In [39]:
scores = {}
for i in [5, 10, 20, 30, 50]:
    algo = KNNBasic(k=i, sim_options={'name': 'cosine', 'user_based': True})
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse_score = accuracy.rmse(predictions)
    print(f'For KNNBasic with k={i} rmse : {rmse_score}')
    scores[i] = rmse_score

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0221
For KNNBasic with k=5 rmse : 1.0220953922551823
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9902
For KNNBasic with k=10 rmse : 0.9901560798034851
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9783
For KNNBasic with k=20 rmse : 0.9783223706250015
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9767
For KNNBasic with k=30 rmse : 0.9767144812519929
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9765
For KNNBasic with k=50 rmse : 0.976505791834966
{5: 1.0220953922551823, 10: 0.9901560798034851, 20: 0.9783223706250015, 30: 0.9767144812519929, 50: 0.976505791834966}


In [44]:
sorted(scores.items(),key= lambda x : x[1])

[(50, 0.976505791834966),
 (30, 0.9767144812519929),
 (20, 0.9783223706250015),
 (10, 0.9901560798034851),
 (5, 1.0220953922551823)]

### Попробуем SVD и SVDpp

In [10]:
from surprise import SVD, SVDpp
from surprise.model_selection import cross_validate

In [14]:
algo = SVD()
cross_validate(algo, dataset, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8737  0.8732  0.8705  0.8772  0.8693  0.8728  0.0028  
Fit time          4.11    4.05    4.00    3.95    4.02    4.02    0.05    
Test time         0.10    0.09    0.17    0.16    0.10    0.12    0.03    


{'test_rmse': array([0.8737297 , 0.87322649, 0.87046911, 0.87723877, 0.86934912]),
 'fit_time': (4.105120897293091,
  4.047452688217163,
  3.9967939853668213,
  3.952303409576416,
  4.0171332359313965),
 'test_time': (0.09999656677246094,
  0.09276461601257324,
  0.16945385932922363,
  0.16246986389160156,
  0.10017848014831543)}

In [13]:
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)
print(f'For SVD rmse : {accuracy.rmse(predictions)}')

RMSE: 0.8758
For SVD rmse : 0.8757988510094677


In [27]:
from surprise.model_selection import GridSearchCV, RandomizedSearchCV

In [47]:
param_grid = { 'n_factors' : [50, 100],
               'n_epochs': [100, 150], 
               'lr_all': [0.005],
               'reg_all': [0.2]}
gs = RandomizedSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_iter=4, n_jobs=-1)
gs.fit(dataset)
print('Best score : {}'.format(gs.best_score['rmse']))
print('Best params : {}'.format(gs.best_params['rmse']))

Best score : 0.8646182720807658
Best params : {'n_factors': 100, 'n_epochs': 150, 'lr_all': 0.005, 'reg_all': 0.2}


In [49]:
gs.best_params['rmse']

{'n_factors': 100, 'n_epochs': 150, 'lr_all': 0.005, 'reg_all': 0.2}

In [50]:
algo = SVD(n_factors=100, n_epochs=100, lr_all=0.005, reg_all=0.2)
algo.fit(trainset)
predictions = algo.test(testset)
print(f'For SVD with best gridsearched params rmse : {accuracy.rmse(predictions)}')

RMSE: 0.8696
For SVD with best gridsearched params rmse : 0.8695976670813423


In [33]:
algo = SVDpp()
algo.fit(trainset)
predictions = algo.test(testset)
print(f'For SVD++ rmse : {accuracy.rmse(predictions)}')

RMSE: 0.8607
For SVD++ rmse : 0.8606890479977599


In [51]:
# SVD++ дольше всех считался, но выдал наилучший результат из всех использованных моделей