# Задание 
### ПАКЕТ SURPRISE

1. используйте данные MovieLens 1M (можно использовать любые модели из пакета)
2. получите RMSE на тестовом сете 0.87 и ниже
**Комментарий преподавателя :**
В ДЗ на датасет 1М может не хватить RAM. Можно сделать на 100K. Качество RMSE предлагаю считать на основе CrossValidation (5 фолдов), а не отложенном датасете.

In [1]:
!pip install scikit-surprise
from surprise import KNNWithMeans, KNNBasic, SVD, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd
import numpy as np

from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.3 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633720 sha256=5e05f0d812342493410fcbe948acaca295a64f13c546df7114ca595484bd5d68
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [6]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [7]:
movies_with_ratings[movies_with_ratings.userId == 1.0].title.unique()

array(['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)',
       'Seven (a.k.a. Se7en) (1995)', 'Usual Suspects, The (1995)',
       'From Dusk Till Dawn (1996)', 'Bottle Rocket (1996)',
       'Braveheart (1995)', 'Rob Roy (1995)', 'Canadian Bacon (1995)',
       'Desperado (1995)', 'Billy Madison (1995)', 'Clerks (1994)',
       'Dumb & Dumber (Dumb and Dumber) (1994)', 'Ed Wood (1994)',
       'Star Wars: Episode IV - A New Hope (1977)', 'Pulp Fiction (1994)',
       'Stargate (1994)', 'Tommy Boy (1995)',
       'Clear and Present Danger (1994)', 'Forrest Gump (1994)',
       'Jungle Book, The (1994)', 'Mask, The (1994)', 'Blown Away (1994)',
       'Dazed and Confused (1993)', 'Fugitive, The (1993)',
       'Jurassic Park (1993)', 'Mrs. Doubtfire (1993)',
       "Schindler's List (1993)", 'So I Married an Axe Murderer (1993)',
       'Three Musketeers, The (1993)', 'Tombstone (1993)',
       'Dances with Wolves (1990)', 'Batman (1989)',
       'Silence of the Lambs, The 

In [8]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [9]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [10]:
dataset.shape

(100836, 3)

In [11]:
dataset.isnull().any()

uid       False
iid       False
rating    False
dtype: bool

In [12]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [13]:
data

<surprise.dataset.DatasetAutoFolds at 0x7fe5d96d34d0>

In [14]:
trainset, testset = train_test_split(data, test_size=.15)

## KNNWithMeans

In [15]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8988


0.8987819808815495

In [16]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8972  0.8991  0.8957  0.8967  0.8966  0.8971  0.0011  
MAE (testset)     0.6784  0.6825  0.6796  0.6803  0.6823  0.6806  0.0016  
Fit time          0.51    0.54    0.99    1.22    0.78    0.81    0.27    
Test time         1.43    2.16    3.14    3.91    1.

{'fit_time': (0.510188102722168,
  0.5400173664093018,
  0.9922242164611816,
  1.2151813507080078,
  0.7837796211242676),
 'test_mae': array([0.67837329, 0.68250092, 0.67960744, 0.68029018, 0.68232725]),
 'test_rmse': array([0.89722818, 0.89905537, 0.89566658, 0.89673953, 0.89660247]),
 'test_time': (1.432265043258667,
  2.164642095565796,
  3.142317295074463,
  3.912307024002075,
  1.1868445873260498)}

## KNNBasic

In [17]:
algo_basic = KNNBasic(k=50)
algo_basic.fit(trainset)

test_pred = algo_basic.test(testset)
accuracy.rmse(test_pred, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9536


0.953593430780949

In [18]:
cross_validate(algo_basic, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9514  0.9511  0.9411  0.9576  0.9501  0.9502  0.0053  
MAE (testset)     0.7295  0.7281  0.7221  0.7361  0.7297  0.7291  0.0045  
Fit time          0.09    0.12    0.12    0.14    0.12    0.12    0.02    
Test time         1.24    1.28    1.19    1.33    1.29    1.27    0.05    


{'fit_time': (0.08998274803161621,
  0.12181663513183594,
  0.1196441650390625,
  0.1379406452178955,
  0.12157416343688965),
 'test_mae': array([0.72949687, 0.72813985, 0.72209537, 0.73606826, 0.72973867]),
 'test_rmse': array([0.95141271, 0.9510972 , 0.94106582, 0.95755277, 0.95005314]),
 'test_time': (1.2412967681884766,
  1.279785394668579,
  1.193119764328003,
  1.334944486618042,
  1.2863647937774658)}

## SVD

In [19]:
algo_svd = SVD()
algo_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe5d3dd2510>

In [20]:
algo_svd = SVD()
algo_svd.fit(trainset)

test_pred = algo_svd.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8765


0.8764526688876144

In [21]:
cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8779  0.8661  0.8780  0.8628  0.8771  0.8724  0.0066  
MAE (testset)     0.6765  0.6645  0.6765  0.6616  0.6727  0.6704  0.0062  
Fit time          4.61    4.62    4.62    4.54    4.53    4.58    0.04    
Test time         0.14    0.13    0.13    0.13    0.27    0.16    0.06    


{'fit_time': (4.612585067749023,
  4.616160154342651,
  4.616073846817017,
  4.540902376174927,
  4.528967618942261),
 'test_mae': array([0.6764818 , 0.6645287 , 0.67652243, 0.66163278, 0.67273122]),
 'test_rmse': array([0.87790744, 0.86607469, 0.87798988, 0.8628434 , 0.87712468]),
 'test_time': (0.13605070114135742,
  0.13466358184814453,
  0.12547922134399414,
  0.1258544921875,
  0.2748100757598877)}

## KNNBaseline

In [22]:
algo_KNNBaseline = KNNBaseline(k=50)
algo_KNNBaseline.fit(trainset)

test_pred = algo_KNNBaseline.test(testset)
accuracy.rmse(test_pred, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8812


0.8811566171867811

In [23]:
cross_validate(algo_KNNBaseline, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8745  0.8804  0.8710  0.8719  0.8726  0.8741  0.0034  
MAE (testset)     0.6671  0.6714  0.6673  0.6670  0.6679  0.6681  0.0017  
Fit time          0.32    0.32    0.33    0.32    0.35    0.33    0.01    
Test time         1.58    1.52    1.67    1.51    1.66    1.59    0.07    


{'fit_time': (0.3211331367492676,
  0.32021331787109375,
  0.3259744644165039,
  0.3191642761230469,
  0.3461925983428955),
 'test_mae': array([0.66708784, 0.67141276, 0.66726261, 0.6670263 , 0.66794042]),
 'test_rmse': array([0.87445426, 0.8803876 , 0.87102861, 0.87185571, 0.87256762]),
 'test_time': (1.5755512714385986,
  1.5175766944885254,
  1.672590970993042,
  1.5104899406433105,
  1.6588513851165771)}

## GridSearchCV + KNNBaseline

In [24]:
params = {'bsl_options': {'method': ['als', 'sgd'],
                              'reg': [1, 2]},
              'k':[2,10,50,60,80,100,150,200],
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [1, 5],
                              'user_based': [False]}
}

grid_search = GridSearchCV(KNNBaseline, params, measures=['rmse'], cv=3)
grid_search.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine sim

In [25]:
algo_grid = grid_search.best_estimator['rmse']

In [26]:
trainset = data.build_full_trainset()
algo_grid.fit(trainset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7fe5d9be8f90>

In [27]:
predictions = algo_grid.test(trainset.build_testset())
accuracy.rmse(predictions)

RMSE: 0.5883


0.5883307061109653

In [28]:
cross_validate(algo_grid, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8609  0.8559  0.8614  0.8602  0.8663  0.8609  0.0033  
MAE (testset)     0.6578  0.6535  0.6608  0.6603  0.6644  0.6594  0.0036  
Fit time          4.75    5.00    4.72    4.77    4.64    4.78    0.12    
Test time         10.81   10.71   10.59   10.43   10.43   10.59   0.15    


{'fit_time': (4.747292518615723,
  5.004809379577637,
  4.718553781509399,
  4.774207830429077,
  4.643995046615601),
 'test_mae': array([0.65777491, 0.65354556, 0.66075496, 0.66033452, 0.66444164]),
 'test_rmse': array([0.86086515, 0.85590967, 0.86143997, 0.8601887 , 0.86626664]),
 'test_time': (10.812444925308228,
  10.708088397979736,
  10.588817358016968,
  10.426898002624512,
  10.434215068817139)}

## GridSearchCV + SVD

In [32]:
param_grid = {'n_epochs': [5, 10], 
              'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

algo_gs = gs.best_estimator['rmse']

In [33]:
trainset_ = data.build_full_trainset()
algo_gs.fit(trainset_)

predictions = algo_gs.test(trainset_.build_testset())
accuracy.rmse(predictions)

RMSE: 0.8642


0.8642076566992575

In [34]:
cross_validate(algo_gs, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8944  0.8981  0.8850  0.8875  0.8866  0.8903  0.0050  
MAE (testset)     0.6933  0.6957  0.6850  0.6853  0.6855  0.6890  0.0046  
Fit time          2.29    2.30    2.34    2.35    2.31    2.32    0.02    
Test time         0.13    0.39    0.12    0.17    0.37    0.24    0.12    


{'fit_time': (2.2896668910980225,
  2.2966415882110596,
  2.339323043823242,
  2.345677614212036,
  2.3066117763519287),
 'test_mae': array([0.69325626, 0.6957054 , 0.68499597, 0.68529619, 0.6855185 ]),
 'test_rmse': array([0.89435974, 0.89812786, 0.88501255, 0.88754801, 0.88658368]),
 'test_time': (0.12536263465881348,
  0.38886046409606934,
  0.11999106407165527,
  0.17408061027526855,
  0.37400078773498535)}