In [5]:
import surprise
import pandas as pd
import numpy as np

from tqdm.notebook import trange
from surprise import Dataset, Reader, SVD, SVDpp, NMF, accuracy
from surprise.model_selection import  cross_validate, train_test_split, GridSearchCV
from surprise.dataset import DatasetAutoFolds

In [8]:
data = Dataset.load_builtin('ml-100k')

In [9]:
reader = Reader(rating_scale=(1, 5))
raw_ratings = data.raw_ratings

In [10]:
trainset, testset = train_test_split(data, test_size=0.2)

# Огляд даних

##### DF

In [11]:
df = pd.DataFrame(raw_ratings, columns=['userId', 'itemId', 'rating', 'created']).drop('created', axis=1)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userId  100000 non-null  object 
 1   itemId  100000 non-null  object 
 2   rating  100000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.3+ MB


In [13]:
df.describe()

Unnamed: 0,rating
count,100000.0
mean,3.52986
std,1.125674
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [14]:
df.head()

Unnamed: 0,userId,itemId,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0


In [15]:
len(df)

100000

##### Train DataSet

In [16]:
trainset

<surprise.trainset.Trainset at 0x17675cfc610>

In [17]:
train_data = trainset.build_testset()
train_df = pd.DataFrame(train_data, columns=['userId', 'itemId', 'rating'])

In [18]:
train_df

Unnamed: 0,userId,itemId,rating
0,28,56,5.0
1,28,217,3.0
2,28,588,3.0
3,28,859,3.0
4,28,201,3.0
...,...,...,...
79995,794,14,5.0
79996,794,248,4.0
79997,794,109,4.0
79998,794,257,4.0


In [19]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userId  80000 non-null  object 
 1   itemId  80000 non-null  object 
 2   rating  80000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.8+ MB


In [20]:
train_df.describe()

Unnamed: 0,rating
count,80000.0
mean,3.529375
std,1.125468
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [21]:
len(train_df['userId'].unique())

943

In [22]:
len(train_df['itemId'].unique())

1654

##### Test DataSet

In [23]:
testset

[('532', '215', 5.0),
 ('222', '549', 4.0),
 ('404', '323', 3.0),
 ('869', '122', 3.0),
 ('631', '288', 3.0),
 ('537', '150', 3.0),
 ('244', '953', 4.0),
 ('246', '385', 1.0),
 ('25', '176', 4.0),
 ('85', '319', 4.0),
 ('715', '376', 2.0),
 ('345', '284', 4.0),
 ('715', '98', 5.0),
 ('308', '965', 4.0),
 ('393', '659', 4.0),
 ('216', '216', 4.0),
 ('32', '307', 2.0),
 ('474', '410', 2.0),
 ('40', '333', 4.0),
 ('386', '24', 4.0),
 ('699', '886', 3.0),
 ('551', '3', 5.0),
 ('320', '1081', 4.0),
 ('648', '797', 3.0),
 ('197', '227', 3.0),
 ('70', '511', 5.0),
 ('711', '312', 5.0),
 ('705', '597', 4.0),
 ('41', '1', 4.0),
 ('864', '98', 5.0),
 ('496', '174', 4.0),
 ('201', '42', 4.0),
 ('894', '271', 2.0),
 ('6', '269', 4.0),
 ('308', '87', 4.0),
 ('889', '654', 3.0),
 ('121', '937', 4.0),
 ('299', '962', 4.0),
 ('43', '289', 4.0),
 ('830', '241', 4.0),
 ('922', '80', 3.0),
 ('20', '678', 4.0),
 ('666', '180', 4.0),
 ('450', '223', 3.0),
 ('669', '483', 3.0),
 ('757', '742', 4.0),
 ('417'

In [24]:
test_df = pd.DataFrame(testset, columns=['userId', 'itemId', 'rating'])

In [25]:
test_df

Unnamed: 0,userId,itemId,rating
0,532,215,5.0
1,222,549,4.0
2,404,323,3.0
3,869,122,3.0
4,631,288,3.0
...,...,...,...
19995,316,730,4.0
19996,472,100,5.0
19997,847,89,2.0
19998,566,523,4.0


In [26]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userId  20000 non-null  object 
 1   itemId  20000 non-null  object 
 2   rating  20000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 468.9+ KB


In [27]:
test_df.describe()

Unnamed: 0,rating
count,20000.0
mean,3.5318
std,1.126522
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [28]:
len(test_df['userId'].unique())

940

In [29]:
len(test_df['itemId'].unique())

1407

In [30]:
skf = StratifiedKFold(n_splits=10)

In [31]:
skf.get_n_splits(data)

10

# CrossValidatuion

##### SVD

In [32]:
algo = SVD()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9330  0.9423  0.9325  0.9365  0.9412  0.9371  0.0041  
MAE (testset)     0.7349  0.7399  0.7378  0.7382  0.7447  0.7391  0.0032  
Fit time          3.19    2.75    2.62    2.74    2.71    2.80    0.20    
Test time         0.43    0.52    0.39    0.33    0.49    0.43    0.07    


{'test_rmse': array([0.93299526, 0.94228004, 0.93248244, 0.93649278, 0.94121464]),
 'test_mae': array([0.7349332 , 0.73985239, 0.73781019, 0.73824257, 0.74467361]),
 'fit_time': (3.1932382583618164,
  2.7489240169525146,
  2.6160340309143066,
  2.7431483268737793,
  2.705244302749634),
 'test_time': (0.4259927272796631,
  0.5199122428894043,
  0.38695788383483887,
  0.3340020179748535,
  0.489959716796875)}

##### SVD++

In [33]:
algo = SVDpp()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9235  0.9185  0.9220  0.9104  0.9212  0.9191  0.0047  
MAE (testset)     0.7237  0.7221  0.7251  0.7153  0.7184  0.7209  0.0036  
Fit time          66.37   57.86   62.90   62.68   57.92   61.54   3.26    
Test time         13.30   11.33   12.71   11.22   15.99   12.91   1.73    


{'test_rmse': array([0.92347784, 0.91850277, 0.92198187, 0.91039763, 0.92124925]),
 'test_mae': array([0.72367634, 0.72207942, 0.7251376 , 0.71534139, 0.71842391]),
 'fit_time': (66.3666479587555,
  57.85632014274597,
  62.896240234375,
  62.679577589035034,
  57.92180681228638),
 'test_time': (13.304041862487793,
  11.325393199920654,
  12.709648132324219,
  11.215431928634644,
  15.985170841217041)}

##### MNF

In [34]:
algo = NMF()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9601  0.9587  0.9605  0.9605  0.9664  0.9613  0.0026  
MAE (testset)     0.7531  0.7553  0.7546  0.7559  0.7588  0.7555  0.0019  
Fit time          5.14    4.97    4.49    4.31    7.12    5.21    1.00    
Test time         0.50    0.34    0.35    0.42    0.57    0.44    0.09    


{'test_rmse': array([0.96012636, 0.95872598, 0.96051299, 0.96051929, 0.9663786 ]),
 'test_mae': array([0.7530903 , 0.75532507, 0.75462731, 0.75586459, 0.75878516]),
 'fit_time': (5.139596462249756,
  4.967195749282837,
  4.494403600692749,
  4.307634592056274,
  7.119093894958496),
 'test_time': (0.5018303394317627,
  0.3395254611968994,
  0.35100221633911133,
  0.41699957847595215,
  0.5720994472503662)}

# Sampling

In [42]:
all_ratings = data.raw_ratings
print(f"Total number of ratings in the dataset: {len(all_ratings)}")

Total number of ratings in the dataset: 100000


In [48]:
slice_ratings = raw_ratings[:1000]

ratings_df = pd.DataFrame(slice_ratings, columns=['user', 'item', 'rating', 'timestamp']).drop(columns=['timestamp'])


# reader = Reader(rating_scale=(1, 5))
sample = DatasetAutoFolds(df=ratings_df, reader=reader)

# Печатаем информацию о новом наборе данных для проверки
print(f"Total number of ratings in the new dataset: {len(sample.raw_ratings)}")


Total number of ratings in the new dataset: 1000


# GridSearch

##### SVD

In [49]:
param_grid = {
    'n_factors': [10,20],
    'n_epochs': [10, 20],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.05, 0.1]
}

In [50]:
gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_svd.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [51]:
best_params_svd = gs_svd.best_params['rmse']


##### SVD++

In [52]:
gs_svdpp = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_svd.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [53]:
best_params_svdpp = gs_svdpp.best_params['rmse']

AttributeError: 'GridSearchCV' object has no attribute 'best_params'

##### NMF

In [56]:
param_grid_nmf = {
    'n_factors': [10, 20],
    'n_epochs': [10, 20],
    'lr_bu': [ 0.005, 0.01],
    'lr_bi': [ 0.005, 0.01],
    'reg_bu': [0.05, 0.1],
    'reg_bi': [0.05, 0.1]
}

In [57]:
gs_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_nmf.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [58]:
best_params_nmf = gs_nmf.best_params['rmse']


In [59]:
print(f"Best parameters for SVD: {best_params_svd}")
# print(f"Best parameters for SVD++: {best_params_svdpp}")
print(f"Best parameters for NMF: {best_params_nmf}")

Best parameters for SVD: {'n_factors': 10, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.05}
Best parameters for NMF: {'n_factors': 20, 'n_epochs': 20, 'lr_bu': 0.01, 'lr_bi': 0.01, 'reg_bu': 0.1, 'reg_bi': 0.1}


# Result

In [60]:
algo_svd = SVD(**best_params_svd)
algo_svdpp = SVDpp(**best_params_svd)
algo_nmf = NMF(**best_params_nmf)

In [62]:
cv_results_svd = cross_validate(algo_svd, data, measures=['rmse'], cv=5, verbose=True)


Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9454  0.9432  0.9436  0.9326  0.9357  0.9401  0.0050  
Fit time          1.50    1.22    1.25    4.63    1.28    1.98    1.33    
Test time         0.46    0.78    0.95    0.75    0.73    0.73    0.16    


In [63]:
cv_results_svdpp = cross_validate(algo_svdpp, data, measures=['rmse'], cv=5, verbose=True)


KeyboardInterrupt: 

In [64]:
cv_results_nmf = cross_validate(algo_nmf, data, measures=['rmse'], cv=5, verbose=True)

Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1205  1.1060  1.1066  1.1036  1.1060  1.1085  0.0061  
Fit time          2.99    2.90    2.44    3.43    4.36    3.23    0.65    
Test time         0.33    0.37    0.93    0.48    1.01    0.62    0.29    


In [65]:
print(f"Mean RMSE for SVD: {cv_results_svd['test_rmse'].mean()}")
# print(f"Mean RMSE for SVD++: {cv_results_svdpp['test_rmse'].mean()}")
print(f"Mean RMSE for NMF: {cv_results_nmf['test_rmse'].mean()}")

Mean RMSE for SVD: 0.9400970174401412
Mean RMSE for NMF: 1.1085275340546865


# Fit

In [69]:
for i in trange(100):
    algo_svd.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

In [70]:
for i in trange(100):
    algo_svdpp.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

In [71]:
for i in trange(100):
    algo_nmf.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

In [72]:
data

<surprise.dataset.DatasetAutoFolds at 0x176723de590>

# Test

In [91]:
userId = 794

In [92]:
items_to_recommend = [238]

In [98]:
for itemId in items_to_recommend:
    prediction = algo_svd.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 794 та об'єкта 238: 3.52986


In [94]:
for itemId in items_to_recommend:
    prediction = algo_nmf.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 794 та об'єкта 238: 3.52986


In [97]:
for itemId in items_to_recommend:
    prediction = algo_svdpp.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 794 та об'єкта 238: 3.52986
