In [1]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, CoClustering, BaselineOnly
from surprise.model_selection.validation import cross_validate

In [2]:
from surprise import accuracy
from surprise.model_selection import KFold

## Provided data

In [None]:
df = pd.read_csv('data/ReviewClean.csv', index_col=0)

In [None]:
df.head()

Unnamed: 0,customer_id,product_id,rating
0,709310,10001012,3
1,10701688,10001012,5
2,11763074,10001012,5
3,9909549,10001012,5
4,1827148,10001012,5


In [None]:
df.shape

(359540, 3)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359540 entries, 0 to 359539
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   customer_id  359540 non-null  int64
 1   product_id   359540 non-null  int64
 2   rating       359540 non-null  int64
dtypes: int64(3)
memory usage: 11.0 MB


In [None]:
n_rating = len(df)
n_product = len(df.product_id.unique())
n_customer = len(df.customer_id.unique())
print(n_rating, n_product, n_customer)

359540 4211 251146


In [None]:
reader = Reader()
data = Dataset.load_from_df(df[['customer_id', 'product_id', 'rating']], reader)

In [None]:
algos = [SVD(), SVDpp(), NMF(), SlopeOne(), CoClustering(), BaselineOnly()]

names = []
results = []
for algo in algos:
    results.append(cross_validate(algo, data, measures=['RMSE','MSE'],cv=5, verbose=True))
    names.append(algo.__class__.__name__)

Evaluating RMSE, MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9763  0.9764  0.9696  0.9737  0.9810  0.9754  0.0037  
MSE (testset)     0.9531  0.9534  0.9402  0.9482  0.9624  0.9515  0.0073  
Fit time          9.27    9.04    9.14    9.16    9.07    9.13    0.08    
Test time         0.31    0.21    0.22    0.31    0.21    0.25    0.05    
Evaluating RMSE, MSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9895  0.9914  0.9792  0.9870  0.9899  0.9874  0.0043  
MSE (testset)     0.9792  0.9829  0.9588  0.9741  0.9800  0.9750  0.0086  
Fit time          14.80   15.01   14.82   15.03   15.16   14.96   0.14    
Test time         0.48    0.47    0.35    0.35    0.35    0.40    0.06    
Evaluating RMSE, MSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (tests

BaselineOnly, SVD và SVDpp cho kết quả thấp nhất trong các thuật toán tuy nhiên BaselineOnly có tốc độ nhanh hơn hẳn 2 thuật toán còn lại, nên ta sẽ lựa chọn thuật toán này để xây dựng Recommendation System - Collaborative Filtering

## NewData: scraping data

In [3]:
df = pd.read_csv('data/ReviewClean_scraping.csv', index_col=0)

In [4]:
df.shape

(7172, 3)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7172 entries, 0 to 7171
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   customer_id  7172 non-null   int64  
 1   product_id   7172 non-null   float64
 2   rating       7172 non-null   int64  
dtypes: float64(1), int64(2)
memory usage: 224.1 KB


In [6]:
df.head()

Unnamed: 0,customer_id,product_id,rating
0,5546,72748642.0,5
1,5875,72748642.0,5
2,188,72748642.0,5
3,4568,72748642.0,5
4,4155,72748642.0,5


In [7]:
reader = Reader()
data = Dataset.load_from_df(df[['customer_id', 'product_id', 'rating']], reader)

In [8]:
algos = [SVD(), SVDpp(), NMF(), SlopeOne(), CoClustering(), BaselineOnly()]

names = []
results = []
for algo in algos:
    results.append(cross_validate(algo, data, measures=['RMSE','MSE'],cv=5, verbose=True))
    names.append(algo.__class__.__name__)

Evaluating RMSE, MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8636  0.8516  0.9148  0.8623  0.8714  0.8727  0.0219  
MSE (testset)     0.7459  0.7253  0.8368  0.7435  0.7593  0.7621  0.0389  
Fit time          0.19    0.16    0.16    0.16    0.16    0.17    0.01    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Evaluating RMSE, MSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8375  0.9281  0.8469  0.9076  0.8475  0.8735  0.0370  
MSE (testset)     0.7014  0.8615  0.7172  0.8237  0.7183  0.7644  0.0652  
Fit time          0.24    0.25    0.24    0.24    0.24    0.24    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Evaluating RMSE, MSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (tests

BaselineOnly, SVD và SVDpp cho kết quả thấp nhất trong các thuật toán tuy nhiên BaselineOnly có tốc độ nhanh hơn hẳn 2 thuật toán còn lại, nên ta sẽ lựa chọn thuật toán này để xây dựng Recommendation System - Collaborative Filtering