In [1]:
import surprise
import pandas as pd
import numpy as np

from tqdm.notebook import trange
from surprise import Dataset, Reader, SVD, SVDpp, NMF, accuracy
from surprise.model_selection import  cross_validate, train_test_split, GridSearchCV
from surprise.dataset import DatasetAutoFolds

In [2]:
data = Dataset.load_builtin('ml-100k')

In [3]:
reader = Reader(rating_scale=(1, 5))
raw_ratings = data.raw_ratings

In [4]:
trainset, testset = train_test_split(data, test_size=0.2)

# Огляд даних

##### DF

In [5]:
df = pd.DataFrame(raw_ratings, columns=['userId', 'itemId', 'rating', 'created']).drop('created', axis=1)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userId  100000 non-null  object 
 1   itemId  100000 non-null  object 
 2   rating  100000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.3+ MB


In [7]:
df.describe()

Unnamed: 0,rating
count,100000.0
mean,3.52986
std,1.125674
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [8]:
df.head()

Unnamed: 0,userId,itemId,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0


In [9]:
len(df)

100000

##### Train DataSet

In [10]:
trainset

<surprise.trainset.Trainset at 0x1574ae3da50>

In [11]:
train_data = trainset.build_testset()
train_df = pd.DataFrame(train_data, columns=['userId', 'itemId', 'rating'])

In [12]:
train_df

Unnamed: 0,userId,itemId,rating
0,639,638,4.0
1,639,714,2.0
2,639,286,4.0
3,639,242,4.0
4,639,1020,4.0
...,...,...,...
79995,245,597,4.0
79996,245,94,2.0
79997,245,756,3.0
79998,245,1047,3.0


In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userId  80000 non-null  object 
 1   itemId  80000 non-null  object 
 2   rating  80000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.8+ MB


In [14]:
train_df.describe()

Unnamed: 0,rating
count,80000.0
mean,3.532075
std,1.124172
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [15]:
len(train_df['userId'].unique())

943

In [16]:
len(train_df['itemId'].unique())

1655

##### Test DataSet

In [17]:
testset

[('676', '344', 5.0),
 ('871', '315', 3.0),
 ('393', '497', 4.0),
 ('14', '238', 5.0),
 ('445', '221', 1.0),
 ('345', '172', 4.0),
 ('435', '2', 4.0),
 ('838', '311', 4.0),
 ('508', '121', 2.0),
 ('23', '189', 5.0),
 ('472', '633', 4.0),
 ('393', '483', 4.0),
 ('601', '820', 1.0),
 ('828', '271', 2.0),
 ('379', '63', 2.0),
 ('280', '379', 5.0),
 ('595', '324', 3.0),
 ('756', '568', 3.0),
 ('378', '692', 4.0),
 ('291', '1253', 3.0),
 ('385', '240', 4.0),
 ('222', '328', 5.0),
 ('648', '448', 3.0),
 ('364', '289', 3.0),
 ('326', '651', 4.0),
 ('248', '1', 3.0),
 ('346', '147', 4.0),
 ('264', '659', 5.0),
 ('648', '1041', 3.0),
 ('480', '208', 2.0),
 ('279', '425', 4.0),
 ('13', '32', 4.0),
 ('36', '682', 1.0),
 ('416', '329', 3.0),
 ('67', '125', 4.0),
 ('345', '550', 3.0),
 ('868', '232', 1.0),
 ('830', '510', 4.0),
 ('91', '520', 4.0),
 ('896', '557', 3.0),
 ('561', '228', 3.0),
 ('784', '302', 5.0),
 ('175', '88', 4.0),
 ('830', '310', 4.0),
 ('936', '327', 4.0),
 ('374', '222', 4.0),

In [18]:
test_df = pd.DataFrame(testset, columns=['userId', 'itemId', 'rating'])

In [19]:
test_df

Unnamed: 0,userId,itemId,rating
0,676,344,5.0
1,871,315,3.0
2,393,497,4.0
3,14,238,5.0
4,445,221,1.0
...,...,...,...
19995,83,111,3.0
19996,796,218,3.0
19997,94,823,3.0
19998,234,477,1.0


In [20]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userId  20000 non-null  object 
 1   itemId  20000 non-null  object 
 2   rating  20000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 468.9+ KB


In [21]:
test_df.describe()

Unnamed: 0,rating
count,20000.0
mean,3.521
std,1.131646
min,1.0
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [22]:
len(test_df['userId'].unique())

942

In [23]:
len(test_df['itemId'].unique())

1406

# CrossValidatuion

##### SVD

In [24]:
algo = SVD()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9365  0.9361  0.9386  0.9323  0.9378  0.9363  0.0022  
MAE (testset)     0.7391  0.7399  0.7388  0.7331  0.7392  0.7380  0.0025  
Fit time          8.16    3.75    3.99    3.04    3.40    4.47    1.87    
Test time         0.94    0.74    0.60    0.29    0.64    0.64    0.21    


{'test_rmse': array([0.93654254, 0.93610777, 0.93863517, 0.93227304, 0.93779052]),
 'test_mae': array([0.7390861 , 0.73994394, 0.73877459, 0.73313466, 0.73920422]),
 'fit_time': (8.159401893615723,
  3.7514944076538086,
  3.9875235557556152,
  3.0390899181365967,
  3.4046804904937744),
 'test_time': (0.943295955657959,
  0.7402687072753906,
  0.5955240726470947,
  0.28809189796447754,
  0.6407034397125244)}

##### SVD++

In [25]:
algo = SVDpp()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9115  0.9215  0.9236  0.9084  0.9269  0.9184  0.0072  
MAE (testset)     0.7155  0.7195  0.7259  0.7129  0.7280  0.7204  0.0058  
Fit time          67.01   54.30   74.09   84.29   73.19   70.58   9.85    
Test time         11.69   9.57    18.62   16.08   11.69   13.53   3.31    


{'test_rmse': array([0.91147632, 0.9214624 , 0.92358394, 0.90844713, 0.92693357]),
 'test_mae': array([0.71550953, 0.71949987, 0.72585637, 0.71293785, 0.72798588]),
 'fit_time': (67.01326847076416,
  54.303995847702026,
  74.09488081932068,
  84.2902283668518,
  73.1886875629425),
 'test_time': (11.685973167419434,
  9.571140766143799,
  18.617234468460083,
  16.080275058746338,
  11.689474821090698)}

##### MNF

In [26]:
algo = NMF()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9686  0.9560  0.9623  0.9601  0.9712  0.9636  0.0056  
MAE (testset)     0.7600  0.7526  0.7582  0.7560  0.7645  0.7583  0.0040  
Fit time          7.58    6.78    5.08    5.32    6.06    6.16    0.93    
Test time         0.70    0.60    0.24    0.41    0.50    0.49    0.16    


{'test_rmse': array([0.96860761, 0.9559885 , 0.96228238, 0.96008779, 0.97119056]),
 'test_mae': array([0.7600141 , 0.75261206, 0.75820644, 0.75604971, 0.76453137]),
 'fit_time': (7.584118366241455,
  6.7754669189453125,
  5.082669019699097,
  5.31873083114624,
  6.055077075958252),
 'test_time': (0.6999974250793457,
  0.5999953746795654,
  0.24100255966186523,
  0.4050009250640869,
  0.49965596199035645)}

# Sampling

In [27]:
all_ratings = data.raw_ratings
print(f"Total number of ratings in the dataset: {len(all_ratings)}")

Total number of ratings in the dataset: 100000


In [28]:
slice_ratings = raw_ratings[:1000]

ratings_df = pd.DataFrame(slice_ratings, columns=['user', 'item', 'rating', 'timestamp']).drop(columns=['timestamp'])


# reader = Reader(rating_scale=(1, 5))
sample = DatasetAutoFolds(df=ratings_df, reader=reader)

# Печатаем информацию о новом наборе данных для проверки
print(f"Total number of ratings in the new dataset: {len(sample.raw_ratings)}")


Total number of ratings in the new dataset: 1000


# GridSearch

##### SVD

In [29]:
param_grid = {
    'n_factors': [10,20],
    'n_epochs': [10, 20],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.05, 0.1]
}

In [30]:
gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_svd.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [31]:
best_params_svd = gs_svd.best_params['rmse']


##### SVD++

In [32]:
gs_svdpp = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_svd.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [33]:
# best_params_svdpp = gs_svdpp.best_params['rmse']

AttributeError: 'GridSearchCV' object has no attribute 'best_params'

##### NMF

In [40]:
param_grid_nmf = {
    'n_factors': [10, 20],
    'n_epochs': [10, 20],
    'lr_bu': [ 0.005, 0.01],
    'lr_bi': [ 0.005, 0.01],
    'reg_bu': [0.05, 0.1],
    'reg_bi': [0.05, 0.1]
}

In [41]:
gs_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse'], cv=5, n_jobs=-1)
for i in trange(100):
  gs_nmf.fit(sample)

  0%|          | 0/100 [00:00<?, ?it/s]

In [42]:
best_params_nmf = gs_nmf.best_params['rmse']


In [43]:
print(f"Best parameters for SVD: {best_params_svd}")
# print(f"Best parameters for SVD++: {best_params_svdpp}")
print(f"Best parameters for NMF: {best_params_nmf}")

Best parameters for SVD: {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.05}
Best parameters for NMF: {'n_factors': 20, 'n_epochs': 20, 'lr_bu': 0.01, 'lr_bi': 0.01, 'reg_bu': 0.05, 'reg_bi': 0.1}


# Result

In [44]:
algo_svd = SVD(**best_params_svd)
algo_svdpp = SVDpp(**best_params_svd)
algo_nmf = NMF(**best_params_nmf)

In [45]:
cv_results_svd = cross_validate(algo_svd, data, measures=['rmse'], cv=5, verbose=True)


Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9406  0.9431  0.9364  0.9361  0.9284  0.9369  0.0050  
Fit time          2.76    2.67    2.65    2.55    2.51    2.63    0.09    
Test time         0.68    0.75    0.88    0.72    0.64    0.73    0.08    


In [46]:
cv_results_svdpp = cross_validate(algo_svdpp, data, measures=['rmse'], cv=5, verbose=True)


Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9317  0.9230  0.9280  0.9282  0.9183  0.9258  0.0047  
Fit time          66.37   65.33   62.67   64.93   64.56   64.77   1.21    
Test time         11.39   12.79   12.87   11.78   12.02   12.17   0.58    


In [47]:
cv_results_nmf = cross_validate(algo_nmf, data, measures=['rmse'], cv=5, verbose=True)

Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1068  1.0979  1.1085  1.1055  1.1004  1.1038  0.0040  
Fit time          2.77    2.80    2.61    2.41    2.17    2.55    0.24    
Test time         0.46    0.29    0.46    0.22    0.41    0.37    0.10    


In [48]:
print(f"Mean RMSE for SVD: {cv_results_svd['test_rmse'].mean()}")
# print(f"Mean RMSE for SVD++: {cv_results_svdpp['test_rmse'].mean()}")
print(f"Mean RMSE for NMF: {cv_results_nmf['test_rmse'].mean()}")

Mean RMSE for SVD: 0.9369246942986729
Mean RMSE for NMF: 1.1038250786555426


# Fit

In [37]:
for i in trange(100):
    algo_svd.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

In [49]:
for i in trange(100):
    algo_svdpp.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

In [50]:
for i in trange(100):
    algo_nmf.fit(trainset)

  0%|          | 0/100 [00:00<?, ?it/s]

# Test

In [51]:
userId = 930

In [52]:
items_to_recommend = [117, 137, 252]

In [53]:
for itemId in items_to_recommend:
    prediction = algo_svd.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 930 та об'єкта 117: 3.529125
Рекомендований рейтинг для користувача 930 та об'єкта 137: 3.529125
Рекомендований рейтинг для користувача 930 та об'єкта 252: 3.529125


In [54]:
for itemId in items_to_recommend:
    prediction = algo_nmf.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 930 та об'єкта 117: 3.532075
Рекомендований рейтинг для користувача 930 та об'єкта 137: 3.532075
Рекомендований рейтинг для користувача 930 та об'єкта 252: 3.532075


In [55]:
for itemId in items_to_recommend:
    prediction = algo_svdpp.predict(userId, itemId)
    print(f"Рекомендований рейтинг для користувача {userId} та об'єкта {itemId}: {prediction.est}")

Рекомендований рейтинг для користувача 930 та об'єкта 117: 3.532075
Рекомендований рейтинг для користувача 930 та об'єкта 137: 3.532075
Рекомендований рейтинг для користувача 930 та об'єкта 252: 3.532075


In [38]:
predictions_svd = algo_svd.test(testset)

In [39]:
predictions_svd[0:5]

[Prediction(uid='676', iid='344', r_ui=5.0, est=3.885716768208556, details={'was_impossible': False}),
 Prediction(uid='871', iid='315', r_ui=3.0, est=3.8863255321579517, details={'was_impossible': False}),
 Prediction(uid='393', iid='497', r_ui=4.0, est=3.9172185167122495, details={'was_impossible': False}),
 Prediction(uid='14', iid='238', r_ui=5.0, est=4.027141193104152, details={'was_impossible': False}),
 Prediction(uid='445', iid='221', r_ui=1.0, est=2.351745401808631, details={'was_impossible': False})]

In [56]:
predictions_svdpp = algo_svdpp.test(testset)

In [57]:
predictions_svdpp[0:5]

[Prediction(uid='676', iid='344', r_ui=5.0, est=3.9985104947206866, details={'was_impossible': False}),
 Prediction(uid='871', iid='315', r_ui=3.0, est=3.7843470532260213, details={'was_impossible': False}),
 Prediction(uid='393', iid='497', r_ui=4.0, est=3.732389446233453, details={'was_impossible': False}),
 Prediction(uid='14', iid='238', r_ui=5.0, est=4.045462211277159, details={'was_impossible': False}),
 Prediction(uid='445', iid='221', r_ui=1.0, est=2.2832722647921164, details={'was_impossible': False})]

In [58]:
predictions_nmf = algo_svdpp.test(testset)

In [59]:
predictions_nmf[0:5]

[Prediction(uid='676', iid='344', r_ui=5.0, est=3.9985104947206866, details={'was_impossible': False}),
 Prediction(uid='871', iid='315', r_ui=3.0, est=3.7843470532260213, details={'was_impossible': False}),
 Prediction(uid='393', iid='497', r_ui=4.0, est=3.732389446233453, details={'was_impossible': False}),
 Prediction(uid='14', iid='238', r_ui=5.0, est=4.045462211277159, details={'was_impossible': False}),
 Prediction(uid='445', iid='221', r_ui=1.0, est=2.2832722647921164, details={'was_impossible': False})]