# **COLLABORATIVE FILTERING**

In [None]:
# !pip install scikit-surprise

In [3]:
import numpy as np 
import pandas as pd 

from surprise import Reader, Dataset

from surprise import SVD, BaselineOnly
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

### Load Dataset

In [16]:
col_names = ['user_id', 'item_id', 'rating', 'timestamp']

df = pd.read_csv('u.data', sep='\t', names=col_names)
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742
...,...,...,...,...
99998,880,476,3,880175444
99999,716,204,5,879795543
100000,276,1090,1,874795795
100001,13,225,2,882399156


In [17]:
df = df.iloc[: , :-1]
df

Unnamed: 0,user_id,item_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3
...,...,...,...
99998,880,476,3
99999,716,204,5
100000,276,1090,1
100001,13,225,2


In [9]:
df['rating'].value_counts()

4    34174
3    27145
5    21203
2    11370
1     6111
Name: rating, dtype: int64

In [12]:
# User Item Matrix (with Rating)

user_item_matrix = df.pivot_table(values='rating', columns='item_id', index='user_id')
user_item_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


### Dataset for surprise

In [19]:
# define reader
reader = Reader(rating_scale=(0,5))

# Dataset for surprise
data = Dataset.load_from_df(df, reader)
data

<surprise.dataset.DatasetAutoFolds at 0x19ed1f6ee48>

In [21]:
# data.df

### Data Splitting

In [23]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=10)

### Modeling

**SVD**

In [26]:
# define algorithm
algo = SVD()

# fitting
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x19ed902e608>

In [28]:
# predict
prediction = algo.test(testset)

In [29]:
# accuracy
accuracy.rmse(prediction)

RMSE: 0.9406


0.9406262420198763

**ALS** 

In [30]:
bsl_options = {
    'method':'als',     # algorithm
    'n_epoch':5,        # jumlah iterasi
    'reg_u':12,         # regularization untuk user
    'reg_i':5,          # regularization untuk item
}

algo = BaselineOnly(bsl_options=bsl_options)

algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x19ed9030308>

In [32]:
prediction = algo.test(testset)

accuracy.rmse(prediction)

RMSE: 0.9427


0.9426653045993448

Berdasarkan nilai RMSE, metode SVD lebih baik (RMSE lebih kecil) dari pada ALS

### Cross Validation

In [33]:
algo = SVD()

cv_svd = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
cv_svd

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9278  0.9369  0.9440  0.9374  0.9414  0.9375  0.0055  
MAE (testset)     0.7314  0.7379  0.7444  0.7400  0.7412  0.7390  0.0043  
Fit time          34.27   26.40   23.77   23.30   27.23   26.99   3.93    
Test time         1.08    0.91    1.01    0.95    2.42    1.27    0.58    


{'test_rmse': array([0.92784649, 0.93687515, 0.94399573, 0.93739245, 0.94140266]),
 'test_mae': array([0.73143666, 0.73789723, 0.7443945 , 0.74001502, 0.74116472]),
 'fit_time': (34.267773389816284,
  26.403361082077026,
  23.772446870803833,
  23.298287630081177,
  27.23083758354187),
 'test_time': (1.0830810070037842,
  0.9079990386962891,
  1.0089945793151855,
  0.9460029602050781,
  2.422436475753784)}

In [36]:
print('Cross Validation RMSE mean', cv_svd['test_rmse'].mean())

Cross Validation RMSE mean 0.9375024985468258


### Hyperparameter Tuning

In [38]:
# ?SVD

In [42]:
hyperparam_space = {
    'n_epochs':[5,10,20],
    'lr_all':[0.002, 0.005],
    'reg_all':[0.02, 0.04, 0.06]
}

# define gridsearch
grid_svd = GridSearchCV(
    SVD,
    hyperparam_space,
    measures=['rmse', 'mae'],
    cv=3
)

# fit
grid_svd.fit(data)

In [None]:
# Best score and hypeparameter
print(grid_svd.best_score['rmse'])
print(grid_svd.best_params['rmse'])