In [70]:
import os
from pathlib import Path
from tempfile import gettempdir

import kaggle.api as kaggle
import numpy as np
import pandas as pd
from surprise import (
    CoClustering,
    Dataset,
    KNNBasic,
    NormalPredictor,
    Reader,
    SlopeOne,
    SVD,
    accuracy,
)
from surprise.model_selection import cross_validate, train_test_split

# Data


In [43]:
# Fetch the MovieLens-100k archive from Kaggle into the system temp directory.
# The download (and the credential check done by authenticate()) is skipped when
# the unpacked ratings file is already present, so re-running the notebook does
# not hit the network again.
data_dir_path = Path(gettempdir()) / 'movielens-100k'
file_path = data_dir_path / 'ml-100k' / 'u.data'
if not file_path.exists():
    kaggle.authenticate()
    kaggle.dataset_download_files('prajitdatta/movielens-100k-dataset',
                                  data_dir_path,
                                  unzip=True)

# Load the tab-separated ratings file as a DataFrame. The file has no header row,
# so column names are supplied here; the timestamp column is not needed and is
# left out at read time.
data = pd.read_csv(file_path,
                   sep='\t',
                   header=None,
                   names=['user_id', 'item_id', 'rating', 'timestamp'],
                   usecols=['user_id', 'item_id', 'rating'])

# Guard against a corrupt or unexpected file before handing it to surprise:
# the Reader below declares ratings to lie in [1, 5].
assert data['rating'].between(1, 5).all(), 'rating outside the declared 1-5 scale'

# Wrap the DataFrame as a surprise Dataset (columns must be user, item, rating).
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data, reader=reader)

print('data:', data.shape)
display(data.head())
# Ratings-per-user summary: total ratings, number of users, mean and median per user.
display(data.groupby('user_id')['item_id'].count().agg(['sum', 'count', 'mean', 'median']))

data: (100000, 3)


Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


sum       100000.000000
count        943.000000
mean         106.044539
median        65.000000
Name: item_id, dtype: float64

# Basic Algorithm

training set의 rating이 정규분포를 따른다고 가정하고, 그 분포에서 무작위로 샘플링한 값을 예측값으로 사용하는 베이스라인 모델. <br>
정규분포의 평균과 표준편차는 Maximum Likelihood Estimation으로 추정한다.

- $ \mathcal{N}(\hat{\mu}, \hat{\sigma}^2) $ 에서 평균 $ \hat{\mu} $ 와 분산 $ \hat{\sigma}^2 $ 을 maximum likelihood로 추정

즉 다음과 같이 계산을 해서 구한다

$$ 
\begin{split}\hat{\mu} &= \frac{1}{|R_{train}|} \sum_{r_{ui} \in R_{train}}
r_{ui}\\\\        \hat{\sigma} &= \sqrt{\sum_{r_{ui} \in R_{train}}
\frac{(r_{ui} - \hat{\mu})^2}{|R_{train}|}}\end{split}
$$

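- $ r_{ui} $ : 유저(u)가 특정 제품(i)에 매긴 ground truth rating value
- $ R_{train} $ : training 데이터에 있는 rating 전체의 집합 ($ |R_{train}| $ 은 rating의 개수)
- 첫번째 수식 ($ \hat{\mu} $): training 데이터 전체 rating의 평균
- 두번째 수식 ($ \hat{\sigma} $): 위에서 구한 $ \hat{\mu} $ 를 사용해 계산한 training 데이터 전체 rating의 표준편차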

In [138]:
# Hold-out evaluation of the random baseline (NormalPredictor).
# A fixed random_state makes the 75/25 split, and therefore the reported
# metrics, repeatable across runs.
x_train, x_test = train_test_split(dataset, test_size=.25, random_state=42)

# NormalPredictor samples every prediction from NumPy's global random generator,
# so that generator is seeded as well; without it RMSE/MAE change on each run.
np.random.seed(42)
model = NormalPredictor().fit(x_train)
y_pred = model.test(x_test)

# Both helpers print the metric (verbose=True) and also return it; the trailing
# semicolon keeps the returned float from being echoed as the cell output.
accuracy.rmse(y_pred, verbose=True)
accuracy.mae(y_pred, verbose=True);

RMSE: 1.5206
MAE:  1.2232



In [109]:
# 2-fold cross-validation of the random baseline, run on two worker processes.
# Train-set metrics are requested as well so train/test error can be compared.
model = NormalPredictor()

_r = cross_validate(
    algo=model,
    data=dataset,
    measures=['RMSE', 'MAE'],
    cv=2,
    n_jobs=2,
    return_train_measures=True,
)
# One row per fold, one column per metric / timing.
pd.DataFrame(data=_r)

Unnamed: 0,test_rmse,train_rmse,test_mae,train_mae,fit_time,test_time
0,1.514802,1.527065,1.217445,1.22626,0.050855,0.476726
1,1.521616,1.520601,1.221777,1.220073,0.047179,0.439135


# K-NN

In [143]:
# Hold-out evaluation of KNNBasic with its default settings.
# A fixed random_state makes the 75/25 split, and therefore the reported
# metrics, repeatable across runs.
x_train, x_test = train_test_split(dataset, test_size=.25, random_state=42)

model = KNNBasic().fit(x_train)
y_pred = model.test(x_test)

# Both helpers print the metric (verbose=True) and also return it; the trailing
# semicolon keeps the returned float from being echoed as the cell output.
accuracy.rmse(y_pred, verbose=True)
accuracy.mae(y_pred, verbose=True);

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9814
MAE:  0.7734



In [146]:
# Item-based KNN with Pearson similarity: user_based=False makes surprise
# compute similarities between items instead of between users.
sim_options = dict(name='pearson', user_based=False)
model = KNNBasic(k=40, min_k=3, sim_options=sim_options)

# 2-fold cross-validation on two worker processes, also reporting train metrics.
_r = cross_validate(
    algo=model,
    data=dataset,
    measures=['RMSE', 'MAE'],
    cv=2,
    n_jobs=2,
    return_train_measures=True,
)
# One row per fold, one column per metric / timing.
pd.DataFrame(data=_r)

Unnamed: 0,test_rmse,train_rmse,test_mae,train_mae,fit_time,test_time
0,1.061437,0.747456,0.847455,0.599355,1.749547,4.855957
1,1.059699,0.754712,0.846543,0.606201,1.737005,5.166402


# SVD

In [149]:
# Hold-out evaluation of SVD matrix factorisation with default hyper-parameters.
# A fixed random_state makes the 75/25 split, and therefore the reported
# metrics, repeatable across runs.
x_train, x_test = train_test_split(dataset, test_size=.25, random_state=42)

# SVD initialises its factors randomly; seeding the model keeps the fitted
# factors, and hence the predictions, identical between runs.
model = SVD(random_state=42).fit(x_train)
y_pred = model.test(x_test)

# Both helpers print the metric (verbose=True) and also return it; the trailing
# semicolon keeps the returned float from being echoed as the cell output.
accuracy.rmse(y_pred, verbose=True)
accuracy.mae(y_pred, verbose=True);

RMSE: 0.9407
MAE:  0.7407



In [152]:
# SVD trained for 40 epochs, evaluated with 2-fold cross-validation on two
# worker processes; train-set metrics are reported alongside test metrics.
model = SVD(n_epochs=40)

_r = cross_validate(
    algo=model,
    data=dataset,
    measures=['RMSE', 'MAE'],
    cv=2,
    n_jobs=2,
    return_train_measures=True,
)
# One row per fold, one column per metric / timing.
pd.DataFrame(data=_r)

Unnamed: 0,test_rmse,train_rmse,test_mae,train_mae,fit_time,test_time
0,0.972803,0.421521,0.7646,0.333988,5.122574,0.452387
1,0.973669,0.4229,0.767101,0.33323,5.229807,0.458669


# Slope One

In [154]:
# Hold-out evaluation of the SlopeOne algorithm.
# A fixed random_state makes the 75/25 split, and therefore the reported
# metrics, repeatable across runs.
x_train, x_test = train_test_split(dataset, test_size=.25, random_state=42)

model = SlopeOne().fit(x_train)
y_pred = model.test(x_test)

# Both helpers print the metric (verbose=True) and also return it; the trailing
# semicolon keeps the returned float from being echoed as the cell output.
accuracy.rmse(y_pred, verbose=True)
accuracy.mae(y_pred, verbose=True);

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  model = model.fit(x_train)


RMSE: 0.9525
MAE:  0.7485



In [155]:
# 2-fold cross-validation of SlopeOne on two worker processes, with train-set
# metrics reported alongside test metrics.
model = SlopeOne()

_r = cross_validate(
    algo=model,
    data=dataset,
    measures=['RMSE', 'MAE'],
    cv=2,
    n_jobs=2,
    return_train_measures=True,
)
# One row per fold, one column per metric / timing.
pd.DataFrame(data=_r)

Unnamed: 0,test_rmse,train_rmse,test_mae,train_mae,fit_time,test_time
0,0.958676,0.745088,0.752613,0.577662,0.270145,3.255095
1,0.959583,0.747954,0.753729,0.580702,0.267071,3.316535


# Co-Clustering

In [165]:
# Hold-out evaluation of CoClustering with 3 user clusters, 3 item clusters
# and 50 optimisation epochs.
# A fixed random_state makes the 75/25 split, and therefore the reported
# metrics, repeatable across runs.
x_train, x_test = train_test_split(dataset, test_size=.25, random_state=42)

# Cluster assignments start from a random initialisation; seeding the model
# keeps the fitted clusters, and hence the predictions, identical between runs.
model = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=50, random_state=42).fit(x_train)
y_pred = model.test(x_test)

# Both helpers print the metric (verbose=True) and also return it; the trailing
# semicolon keeps the returned float from being echoed as the cell output.
accuracy.rmse(y_pred, verbose=True)
accuracy.mae(y_pred, verbose=True);

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  model = model.fit(x_train)


RMSE: 0.9733
MAE:  0.7634



In [166]:
# 2-fold cross-validation of CoClustering (default hyper-parameters) on two
# worker processes, with train-set metrics reported alongside test metrics.
model = CoClustering()

_r = cross_validate(
    algo=model,
    data=dataset,
    measures=['RMSE', 'MAE'],
    cv=2,
    n_jobs=2,
    return_train_measures=True,
)
# One row per fold, one column per metric / timing.
pd.DataFrame(data=_r)

Unnamed: 0,test_rmse,train_rmse,test_mae,train_mae,fit_time,test_time
0,0.986838,0.90722,0.775396,0.709415,0.629055,0.287011
1,0.998842,0.897763,0.780661,0.703024,0.620002,0.303814
