# Задание по теме "Коллаборативная фильтрация"

Пакет SURPRISE:
- используйте данные MovieLens 1M,
- можно использовать любые модели из пакета,
- получите RMSE на тестовом сете 0,87 и ниже.

In [61]:
import pandas as pd
import numpy as np

from surprise import Dataset, accuracy
from surprise import KNNWithMeans, KNNBasic, SVD, BaselineOnly
from surprise.model_selection import train_test_split, KFold, cross_validate, LeaveOneOut, PredefinedKFold

## Загрузка данных

In [14]:
# Load the built-in MovieLens 1M ("ml-1m") ratings dataset shipped with Surprise.
data = Dataset.load_builtin("ml-1m")

In [15]:
# Echo the dataset object so the notebook shows its type.
data

<surprise.dataset.DatasetAutoFolds at 0x7f0982fd58a0>

## Обучение моделей

### Использование train_test_split

In [16]:
# Hold out 20% of the ratings as a test set; the fixed seed keeps the split reproducible.
trainset, testset = train_test_split(data, test_size=0.2, random_state=51)

#### KNN as a prediction algorithm

In [17]:
# User-based neighbourhood model: 50 neighbours, cosine similarity between users.
algo_01 = KNNWithMeans(
    k=50,
    sim_options=dict(name='cosine', user_based=True),
)
# Kept as the last expression so the notebook still echoes the fitted model.
algo_01.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f09837e7d00>

In [18]:
# Predict the held-out test ratings with the user-based KNN model.
test_pred_01 = algo_01.test(testset)

In [19]:
# RMSE of the user-based KNN predictions on the held-out test set.
accuracy.rmse(test_pred_01, verbose=True)

RMSE: 0.9353


0.9352676165031188

In [26]:
# Item-based neighbourhood model: 50 neighbours, cosine similarity between items.
algo_02 = KNNWithMeans(
    k=50,
    sim_options=dict(name='cosine', user_based=False),
)
# Kept as the last expression so the notebook still echoes the fitted model.
algo_02.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f0982fd7fd0>

In [27]:
# Predict the held-out test ratings with the item-based KNN model.
test_pred_02 = algo_02.test(testset)

In [28]:
# RMSE of the item-based KNN predictions on the held-out test set.
accuracy.rmse(test_pred_02, verbose=True)

RMSE: 0.8912


0.8911681978982693

#### SVD as a prediction algorithm

In [29]:
# SVD model constructed with the library's default hyper-parameters.
algo_03 = SVD()
algo_03.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f098379a020>

In [30]:
# Predict the held-out test ratings with the default SVD model.
test_pred_03 = algo_03.test(testset)

In [32]:
# RMSE of the default SVD predictions on the held-out test set.
accuracy.rmse(test_pred_03, verbose=True)

RMSE: 0.8734


0.8733936344274785

#### BaselineOnly as a prediction algorithm

In [41]:
# Baseline-only model whose biases are estimated with ALS:
# 5 epochs, reg_u=12, reg_i=5.
bsl_options = dict(method="als", n_epochs=5, reg_u=12, reg_i=5)
algo_04 = BaselineOnly(bsl_options=bsl_options)
# Kept as the last expression so the notebook still echoes the fitted model.
algo_04.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7f096351e260>

In [45]:
# Predict the held-out test ratings with the ALS baseline model.
test_pred_04 = algo_04.test(testset)

In [46]:
# RMSE of the ALS baseline predictions on the held-out test set.
accuracy.rmse(test_pred_04, verbose=True)

RMSE: 0.9069


0.9068589008532121

In [50]:
# Baseline-only model whose biases are estimated with SGD instead of ALS.
# The SGD solver reads only `reg`, `learning_rate` and `n_epochs`; the ALS-only
# keys `reg_u` / `reg_i` were silently ignored. They are replaced by the SGD
# options written out at their library defaults, so the fitted model is
# unchanged while the configuration now shows what is actually applied.
bsl_options = {"method": "sgd", "n_epochs": 5, "reg": 0.02, "learning_rate": 0.005}
algo_05 = BaselineOnly(bsl_options=bsl_options)
algo_05.fit(trainset)

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7f09935aef80>

In [51]:
# Predict the held-out test ratings with the SGD baseline model.
test_pred_05 = algo_05.test(testset)

In [52]:
# RMSE of the SGD baseline predictions on the held-out test set.
accuracy.rmse(test_pred_05, verbose=True)

RMSE: 0.9115


0.9114701689812353

### Кроссвалидация

#### Использование метода cross_validate()

In [53]:
# 5-fold cross-validation of KNNWithMeans with default settings, reporting RMSE and MAE.
knn_means = KNNWithMeans()
cross_validate(
    algo=knn_means,
    data=data,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9304  0.9284  0.9307  0.9285  0.9290  0.9294  0.0010  
MAE (testset)     0.7397  0.7376  0.7399  0.7377  0.7383  0.7386  0.0010  
Fit time          36.87   36.13   37.30   38.53   41.97   38.16   2.06    
Test time         102.57  97.29   114.32  104.61  108.37  105.43  5.70    


{'test_rmse': array([0.93042074, 0.92837105, 0.9306683 , 0.92850815, 0.92897945]),
 'test_mae': array([0.73970016, 0.73764692, 0.73990323, 0.73767794, 0.73825834]),
 'fit_time': (36.866288900375366,
  36.12835955619812,
  37.30366539955139,
  38.52669978141785,
  41.97278118133545),
 'test_time': (102.57249021530151,
  97.29402303695679,
  114.31759595870972,
  104.61051774024963,
  108.37229776382446)}

In [54]:
# 5-fold cross-validation of SVD with default settings, reporting RMSE and MAE.
svd = SVD()
cross_validate(
    algo=svd,
    data=data,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8745  0.8728  0.8730  0.8733  0.8744  0.8736  0.0007  
MAE (testset)     0.6853  0.6857  0.6851  0.6859  0.6861  0.6856  0.0003  
Fit time          8.09    8.36    7.95    8.48    8.36    8.25    0.20    
Test time         2.61    1.00    2.34    0.98    1.07    1.60    0.72    


{'test_rmse': array([0.87447066, 0.87280124, 0.87302397, 0.87331274, 0.87444944]),
 'test_mae': array([0.6852809 , 0.68567578, 0.68513323, 0.68585453, 0.68605341]),
 'fit_time': (8.091882228851318,
  8.355844497680664,
  7.9458887577056885,
  8.484724283218384,
  8.35508680343628),
 'test_time': (2.605236768722534,
  1.0009913444519043,
  2.338616371154785,
  0.9836599826812744,
  1.0721631050109863)}

In [49]:
# 5-fold cross-validation of BaselineOnly with default settings, reporting RMSE and MAE.
baseline_only = BaselineOnly()
cross_validate(
    algo=baseline_only,
    data=data,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=True,
)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9111  0.9100  0.9081  0.9060  0.9082  0.9087  0.0018  
MAE (testset)     0.7216  0.7202  0.7196  0.7171  0.7190  0.7195  0.0015  
Fit time          4.19    4.45    4.98    4.63    4.69    4.59    0.26    
Test time         1.87    0.70    1.72    0.65    1.90    1.37    0.57    


{'test_rmse': array([0.91110685, 0.91004077, 0.90814235, 0.90596624, 0.90821457]),
 'test_mae': array([0.72160352, 0.7201665 , 0.71959864, 0.71707701, 0.71903615]),
 'fit_time': (4.189107418060303,
  4.445256471633911,
  4.98059606552124,
  4.629040002822876,
  4.68644642829895),
 'test_time': (1.8654606342315674,
  0.7027554512023926,
  1.7205188274383545,
  0.6496963500976562,
  1.9026248455047607)}

#### Use KFold()

In [55]:
# Shared 5-fold splitter for the manual cross-validation loops in this notebook.
# A fixed seed makes every kf.split(data) call yield the same folds, so models
# with different hyper-parameters are compared on identical data instead of on
# freshly shuffled splits. Seed 51 matches the one used for train_test_split.
# NOTE(review): RMSE values recorded before this change came from unseeded folds
# and will differ slightly after a re-run.
kf = KFold(n_splits=5, random_state=51, shuffle=True)

In [64]:
# SVD with default hyper-parameters; the manual cross-validation loops fit this instance.
algo_06 = SVD()

In [68]:
# Manual K-fold evaluation: fit on each training fold, then print that fold's test RMSE.
for trainset, testset in kf.split(data):
    # fit() returns the algorithm itself, so fitting and predicting can be chained.
    predictions_06 = algo_06.fit(trainset).test(testset)
    accuracy.rmse(predictions_06, verbose=True)

RMSE: 0.8748
RMSE: 0.8733
RMSE: 0.8741
RMSE: 0.8730
RMSE: 0.8741


#### Use LeaveOneOut()

In [62]:
# Leave-one-out cross-validation iterator configured to produce 5 splits.
loo = LeaveOneOut(n_splits=5)

In [69]:
# Evaluate the same SVD instance on each leave-one-out split and print its test RMSE.
for trainset, testset in loo.split(data):
    # fit() returns the algorithm itself, so fitting and predicting can be chained.
    predictions_07 = algo_06.fit(trainset).test(testset)
    accuracy.rmse(predictions_07, verbose=True)

RMSE: 0.9035
RMSE: 0.8774
RMSE: 0.8928
RMSE: 0.8921
RMSE: 0.8998


**Вывод:**
- Лучшие результаты достигнуты при использовании алгоритма SVD
- Попробуем улучшить модель, изменив параметры регуляризации

### Регуляризация

In [111]:
# Default parameters: all four regularisation terms written out explicitly as 0.02
# to serve as the reference point for the tuning runs.
algo_07 = SVD(reg_bu=0.02, reg_bi=0.02, reg_pu=0.02, reg_qi=0.02)

In [94]:
# K-fold evaluation of the current algo_07 configuration: one RMSE line per fold.
for trainset, testset in kf.split(data):
    # fit() returns the algorithm itself, so fitting and predicting can be chained.
    predictions_08 = algo_07.fit(trainset).test(testset)
    accuracy.rmse(predictions_08, verbose=True)

RMSE: 0.8748
RMSE: 0.8731
RMSE: 0.8746
RMSE: 0.8744
RMSE: 0.8714


In [109]:
# Regularisation trial: reg_bu and reg_bi raised to 0.05, reg_pu to 0.1, reg_qi kept at 0.02.
algo_07 = SVD(reg_bu=0.05, reg_bi=0.05, reg_pu=0.1, reg_qi=0.02)

In [110]:
# K-fold evaluation of the current algo_07 configuration: one RMSE line per fold.
for trainset, testset in kf.split(data):
    # fit() returns the algorithm itself, so fitting and predicting can be chained.
    predictions_08 = algo_07.fit(trainset).test(testset)
    accuracy.rmse(predictions_08, verbose=True)

RMSE: 0.8676
RMSE: 0.8683
RMSE: 0.8687
RMSE: 0.8691
RMSE: 0.8671


In [114]:
# Regularisation trial: reg_bu and reg_pu raised to 0.05, reg_bi and reg_qi kept at 0.02.
algo_07 = SVD(
    reg_bu=0.05,
    reg_bi=0.02,
    reg_pu=0.05,
    reg_qi=0.02,
)
# K-fold evaluation of this configuration: one RMSE line per fold.
for trainset, testset in kf.split(data):
    # fit() returns the algorithm itself, so fitting and predicting can be chained.
    predictions_08 = algo_07.fit(trainset).test(testset)
    accuracy.rmse(predictions_08, verbose=True)

RMSE: 0.8675
RMSE: 0.8630
RMSE: 0.8661
RMSE: 0.8690
RMSE: 0.8664


In [115]:
# Regularisation trial: reg_bu, reg_bi and reg_pu raised to 0.05, reg_qi kept at 0.02.
algo_07 = SVD(
    reg_bu=0.05,
    reg_bi=0.05,
    reg_pu=0.05,
    reg_qi=0.02,
)
# K-fold evaluation of this configuration: one RMSE line per fold.
for trainset, testset in kf.split(data):
    # fit() returns the algorithm itself, so fitting and predicting can be chained.
    predictions_08 = algo_07.fit(trainset).test(testset)
    accuracy.rmse(predictions_08, verbose=True)

RMSE: 0.8634
RMSE: 0.8652
RMSE: 0.8650
RMSE: 0.8690
RMSE: 0.8673


**Вывод:**
- Изменение параметров регуляризации позволило снизить значение RMSE на тестовом сете до значения < 0.87