In [1]:
import surprise
import pandas as pd
import numpy as np

from tqdm.notebook import trange
from surprise import Dataset, Reader, SVD, SVDpp, NMF, accuracy
from surprise.model_selection import  cross_validate, train_test_split, GridSearchCV
from surprise.dataset import DatasetAutoFolds

In [2]:
data = Dataset.load_builtin('ml-100k')

In [3]:
reader = Reader(rating_scale=(1, 5))
raw_ratings = data.raw_ratings

In [4]:
trainset, testset = train_test_split(data, test_size=0.2)

# Огляд даних

##### DF

In [5]:
df = pd.DataFrame(raw_ratings, columns=['userId', 'itemId', 'rating', 'created']).drop('created', axis=1)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userId  100000 non-null  object 
 1   itemId  100000 non-null  object 
 2   rating  100000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.3+ MB


In [7]:
df.describe()

Unnamed: 0,rating
count,100000.0
mean,3.52986
std,1.125674
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [8]:
df.head()

Unnamed: 0,userId,itemId,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0


In [9]:
len(df)

100000

##### Train DataSet

In [10]:
trainset

<surprise.trainset.Trainset at 0x2c05632aa90>

In [11]:
train_data = trainset.build_testset()
train_df = pd.DataFrame(train_data, columns=['userId', 'itemId', 'rating'])

In [12]:
train_df

Unnamed: 0,userId,itemId,rating
0,930,286,3.0
1,930,137,2.0
2,930,238,4.0
3,930,100,3.0
4,930,165,5.0
...,...,...,...
79995,513,435,5.0
79996,513,117,5.0
79997,513,265,5.0
79998,513,252,5.0


In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userId  80000 non-null  object 
 1   itemId  80000 non-null  object 
 2   rating  80000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.8+ MB


In [14]:
train_df.describe()

Unnamed: 0,rating
count,80000.0
mean,3.531763
std,1.126602
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [15]:
len(train_df['userId'].unique())

943

In [16]:
len(train_df['itemId'].unique())

1646

##### Test DataSet

In [17]:
testset

[('897', '566', 2.0),
 ('870', '443', 3.0),
 ('889', '544', 3.0),
 ('299', '514', 5.0),
 ('163', '316', 5.0),
 ('230', '504', 3.0),
 ('812', '682', 4.0),
 ('923', '100', 5.0),
 ('921', '66', 5.0),
 ('88', '315', 4.0),
 ('639', '604', 4.0),
 ('545', '254', 4.0),
 ('637', '124', 3.0),
 ('757', '206', 4.0),
 ('665', '216', 4.0),
 ('660', '196', 4.0),
 ('498', '423', 3.0),
 ('655', '1186', 3.0),
 ('741', '790', 3.0),
 ('603', '1483', 5.0),
 ('716', '430', 5.0),
 ('407', '229', 3.0),
 ('194', '523', 5.0),
 ('749', '526', 5.0),
 ('456', '662', 4.0),
 ('506', '54', 4.0),
 ('626', '678', 1.0),
 ('498', '212', 3.0),
 ('410', '269', 5.0),
 ('467', '340', 3.0),
 ('870', '715', 3.0),
 ('561', '65', 3.0),
 ('95', '432', 3.0),
 ('588', '69', 2.0),
 ('821', '435', 4.0),
 ('669', '490', 5.0),
 ('454', '194', 3.0),
 ('276', '334', 4.0),
 ('679', '527', 4.0),
 ('921', '762', 2.0),
 ('802', '396', 2.0),
 ('115', '174', 5.0),
 ('897', '616', 5.0),
 ('301', '418', 3.0),
 ('144', '815', 1.0),
 ('52', '762',

In [18]:
test_df = pd.DataFrame(testset, columns=['userId', 'itemId', 'rating'])

In [19]:
test_df

Unnamed: 0,userId,itemId,rating
0,897,566,2.0
1,870,443,3.0
2,889,544,3.0
3,299,514,5.0
4,163,316,5.0
...,...,...,...
19995,286,139,3.0
19996,288,528,4.0
19997,85,647,4.0
19998,592,1226,4.0


In [20]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userId  20000 non-null  object 
 1   itemId  20000 non-null  object 
 2   rating  20000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 468.9+ KB


In [21]:
test_df.describe()

Unnamed: 0,rating
count,20000.0
mean,3.52225
std,1.121948
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [22]:
len(test_df['userId'].unique())

943

In [23]:
len(test_df['itemId'].unique())

1413

# CrossValidatuion

##### SVD

In [24]:
algo = SVD()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9394  0.9381  0.9269  0.9382  0.9355  0.9356  0.0045  
MAE (testset)     0.7408  0.7380  0.7319  0.7408  0.7365  0.7376  0.0033  
Fit time          4.15    2.44    2.61    3.15    3.10    3.09    0.60    
Test time         0.33    0.47    0.69    0.50    0.35    0.47    0.13    


{'test_rmse': array([0.93936253, 0.93812371, 0.9268947 , 0.93815311, 0.93554361]),
 'test_mae': array([0.74075098, 0.73802859, 0.73190985, 0.74077482, 0.73650508]),
 'fit_time': (4.15097188949585,
  2.4405150413513184,
  2.608506679534912,
  3.152235269546509,
  3.0971479415893555),
 'test_time': (0.3269984722137451,
  0.47341060638427734,
  0.685002326965332,
  0.5006210803985596,
  0.35300159454345703)}

##### SVD++

In [25]:
algo = SVDpp()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9141  0.9171  0.9214  0.9221  0.9176  0.9185  0.0030  
MAE (testset)     0.7172  0.7204  0.7228  0.7254  0.7192  0.7210  0.0028  
Fit time          48.10   51.64   58.48   71.17   64.43   58.76   8.37    
Test time         8.08    8.03    13.94   13.34   10.38   10.76   2.51    


{'test_rmse': array([0.91411337, 0.91710188, 0.9214367 , 0.92213018, 0.91764714]),
 'test_mae': array([0.71724127, 0.72035973, 0.72281368, 0.72542382, 0.71921812]),
 'fit_time': (48.09974908828735,
  51.64259099960327,
  58.47835350036621,
  71.16626596450806,
  64.42764973640442),
 'test_time': (8.08381724357605,
  8.029973983764648,
  13.942404747009277,
  13.34429383277893,
  10.379296064376831)}

##### MNF

In [51]:
algo = NMF()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9598  0.9624  0.9593  0.9660  0.9704  0.9636  0.0042  
MAE (testset)     0.7566  0.7538  0.7556  0.7599  0.7616  0.7575  0.0028  
Fit time          4.44    3.82    3.66    3.64    3.61    3.83    0.31    
Test time         0.21    0.19    0.38    0.19    0.35    0.27    0.09    


{'test_rmse': array([0.95981096, 0.96243456, 0.95929068, 0.96604884, 0.97041046]),
 'test_mae': array([0.75663731, 0.75384489, 0.7556086 , 0.7598991 , 0.76157362]),
 'fit_time': (4.442980527877808,
  3.8160181045532227,
  3.657900333404541,
  3.637640953063965,
  3.605818510055542),
 'test_time': (0.21396350860595703,
  0.19103741645812988,
  0.38499903678894043,
  0.18599772453308105,
  0.3524448871612549)}

# Sampling

In [27]:
all_ratings = data.raw_ratings
print(f"Total number of ratings in the dataset: {len(all_ratings)}")

Total number of ratings in the dataset: 100000


In [28]:
slice_ratings = raw_ratings[:1000]

ratings_df = pd.DataFrame(slice_ratings, columns=['user', 'item', 'rating', 'timestamp']).drop(columns=['timestamp'])


# reader = Reader(rating_scale=(1, 5))
sample = DatasetAutoFolds(df=ratings_df, reader=reader)

# Печатаем информацию о новом наборе данных для проверки
print(f"Total number of ratings in the new dataset: {len(sample.raw_ratings)}")


Total number of ratings in the new dataset: 1000


# GridSearch

##### SVD

In [29]:
param_grid = {
    'n_factors': [10,20],
    'n_epochs': [10, 20],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.05, 0.1]
}

In [30]:
gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_svd.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [31]:
best_params_svd = gs_svd.best_params['rmse']


##### SVD++

In [32]:
gs_svdpp = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_svd.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [33]:
# best_params_svdpp = gs_svdpp.best_params['rmse']

##### NMF

In [34]:
param_grid_nmf = {
    'n_factors': [10, 20],
    'n_epochs': [10, 20],
    'lr_bu': [ 0.005, 0.01],
    'lr_bi': [ 0.005, 0.01],
    'reg_bu': [0.05, 0.1],
    'reg_bi': [0.05, 0.1]
}

In [35]:
gs_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_nmf.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [36]:
best_params_nmf = gs_nmf.best_params['rmse']


In [37]:
print(f"Best parameters for SVD: {best_params_svd}")
# print(f"Best parameters for SVD++: {best_params_svdpp}")
print(f"Best parameters for NMF: {best_params_nmf}")

Best parameters for SVD: {'n_factors': 10, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.05}
Best parameters for NMF: {'n_factors': 20, 'n_epochs': 20, 'lr_bu': 0.01, 'lr_bi': 0.01, 'reg_bu': 0.1, 'reg_bi': 0.1}


# Result

In [38]:
algo_svd = SVD(**best_params_svd)
algo_svdpp = SVDpp(**best_params_svd)
algo_nmf = NMF(**best_params_nmf)

In [39]:
cv_results_svd = cross_validate(algo_svd, data, measures=['rmse'], cv=5, verbose=True)


Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9403  0.9403  0.9390  0.9365  0.9424  0.9397  0.0019  
Fit time          1.05    1.21    1.13    1.18    1.23    1.16    0.06    
Test time         0.83    0.55    0.90    0.57    0.70    0.71    0.14    


In [40]:
cv_results_svdpp = cross_validate(algo_svdpp, data, measures=['rmse'], cv=5, verbose=True)


Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9299  0.9301  0.9266  0.9305  0.9284  0.9291  0.0014  
Fit time          17.46   16.60   30.23   26.89   24.89   23.22   5.34    
Test time         10.92   8.02    21.65   17.15   14.68   14.48   4.75    


In [41]:
cv_results_nmf = cross_validate(algo_nmf, data, measures=['rmse'], cv=5, verbose=True)

Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1096  1.0926  1.1129  1.1033  1.0969  1.1031  0.0076  
Fit time          2.78    2.69    2.84    2.73    3.44    2.89    0.28    
Test time         0.76    0.47    0.75    0.55    1.30    0.77    0.29    


In [42]:
print(f"Mean RMSE for SVD: {cv_results_svd['test_rmse'].mean()}")
# print(f"Mean RMSE for SVD++: {cv_results_svdpp['test_rmse'].mean()}")
print(f"Mean RMSE for NMF: {cv_results_nmf['test_rmse'].mean()}")

Mean RMSE for SVD: 0.9396941573045637
Mean RMSE for NMF: 1.103050744860082


# Fit

In [43]:
for i in trange(100):
    algo_svd.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

In [44]:
for i in trange(100):
    algo_svdpp.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

In [45]:
for i in trange(100):
    algo_nmf.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

# Test

In [46]:
userId = 794

In [47]:
items_to_recommend = [238]

In [48]:
for itemId in items_to_recommend:
    prediction = algo_svd.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 794 та об'єкта 238: 3.5317625


In [49]:
for itemId in items_to_recommend:
    prediction = algo_nmf.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 794 та об'єкта 238: 3.5317625


In [50]:
for itemId in items_to_recommend:
    prediction = algo_svdpp.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 794 та об'єкта 238: 3.5317625
