In [25]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt # https://matplotlib.org/stable/api/pyplot_summary.html#module-matplotlib.pyplot
import sklearn

# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

# Previous way of requesting retina output, kept for reference only:
# from IPython.display import set_matplotlib_formats
# set_matplotlib_formats("retina") 
# Ask the inline backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format='retina'

import warnings
# NOTE(review): with no category or module argument this silences every warning
# for the rest of the session — consider narrowing the filter so that
# deprecation notices from the libraries used below stay visible.
warnings.filterwarnings('ignore')

In [26]:
from surprise import Dataset, accuracy, Reader
from surprise.prediction_algorithms import BaselineOnly, SVD, SVDpp, KNNBasic, KNNWithMeans, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

# Fixed seed so the train/test partition (and therefore every RMSE computed
# from it in the cells below) is the same on every run of the notebook.
SEED = 42

# Load the built-in 'ml-100k' ratings dataset.
data = Dataset.load_builtin('ml-100k')

# Hold out 25% of the ratings for evaluation; without random_state the split
# is drawn from the global RNG and changes on each execution.
trainset, testset = train_test_split(data, test_size=0.25, random_state=SEED)

## 기본적 사용

In [23]:
# Candidate algorithm classes; each one is instantiated with its default
# hyperparameters, trained on `trainset` and scored on `testset`.
algos = [BaselineOnly, KNNWithMeans, SVD, SVDpp]

# Class names, in the same order as the scores collected below.
names = [algo_cls.__name__ for algo_cls in algos]
results = []

for algo_cls in algos:
    algo = algo_cls()
    algo.fit(trainset)
    predictions = algo.test(testset)

    # accuracy.rmse prints the score and also returns it.
    rmse = accuracy.rmse(predictions)
    results.append(rmse)

Estimating biases using als...
RMSE: 0.9416
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9496
RMSE: 0.9356
RMSE: 0.9222


In [24]:
# Report each model's test RMSE next to its class name, in evaluation order.
for model_name, model_rmse in zip(names, results):
    print(f"{model_name} model에 대한 RMSE: {model_rmse:.4f}")

BaselineOnly model에 대한 RMSE: 0.9416
KNNWithMeans model에 대한 RMSE: 0.9496
SVD model에 대한 RMSE: 0.9356
SVDpp model에 대한 RMSE: 0.9222


## With options

In [31]:
# https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measures-configuration
sim_options = {
    "name": "pearson_baseline", # https://surprise.readthedocs.io/en/stable/similarities.html#module-surprise.similarities
    "user_based": True # This has a huge impact on the performance of a prediction algorithm
}
algo = KNNWithMeans(k=30, sim_options=sim_options)

algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9423


0.9423139079640078

In [32]:
# https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measures-configuration
sim_options = {
    "name": "pearson_baseline", # https://surprise.readthedocs.io/en/stable/similarities.html#module-surprise.similarities
    "user_based": True # This has a huge impact on the performance of a prediction algorithm
}
algo = KNNWithMeans(k=30, sim_options=sim_options)

algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9408


0.9407977671925994

### 하이퍼파라미터 탐색 (GridSearchCV)

In [35]:
# Manual approach: sweep the neighbourhood size k by hand (no GridSearchCV).

# https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measures-configuration
sim_options = {
    "name": "pearson_baseline", # https://surprise.readthedocs.io/en/stable/similarities.html#module-surprise.similarities
    "user_based": True # This has a huge impact on the performance of a prediction algorithm
}

# Stored under its own name: the model-comparison cells keep plain float scores
# in `results`, and overwriting that list with (k, rmse) tuples makes their
# report loop fail on the `:.4f` format when it is re-run after this cell.
k_sweep_results: list[tuple[int, float]] = []

# In the recorded run k=40 gave the lowest RMSE, with only small differences
# from k=30 upwards.
for n_size in range(10, 60, 10):
    algo = KNNWithMeans(k=n_size, sim_options=sim_options)

    algo.fit(trainset)
    predictions = algo.test(testset)
    # accuracy.rmse prints the score and returns it; keep it paired with its k.
    k_sweep_results.append((n_size, accuracy.rmse(predictions)))

# One summary line per neighbourhood size, in sweep order.
for size, error_rmse in k_sweep_results:
    print(f"{size}개의 이웃에 대한 RMSE: {error_rmse:.4f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9577
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9447
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9423
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9420
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9422
10개의 이웃에 대한 RMSE: 0.9577
20개의 이웃에 대한 RMSE: 0.9447
30개의 이웃에 대한 RMSE: 0.9423
40개의 이웃에 대한 RMSE: 0.9420
50개의 이웃에 대한 RMSE: 0.9422


In [37]:
from surprise.model_selection import GridSearchCV, KFold
# https://surprise.readthedocs.io/en/stable/model_selection.html?highlight=GridSearchCV#

# Fixed seed for the cross-validation folds, so that the best score/params
# reported after the search are reproducible across runs.
SEED = 42

# Every combination of k, similarity measure and user/item orientation is
# evaluated (4 * 2 * 2 = 16 candidates).
param_grid = {
    "k": [5, 10, 15, 20],
    "sim_options": {
        "name": ["pearson_baseline", "cosine"],
        "user_based": [True, False]
    }
}

# An integer `cv` builds shuffled folds from the unseeded global RNG; passing
# an explicit seeded KFold keeps the same 4-fold scheme but makes it repeatable.
gs = GridSearchCV(KNNWithMeans,
                param_grid,
                measures=["rmse"],
                cv=KFold(n_splits=4, random_state=SEED, shuffle=True))

gs.fit(data)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similar

In [38]:
# Best cross-validated RMSE found by the grid search, followed by the
# parameter combination that produced it.
best_rmse = gs.best_score["rmse"]
best_params = gs.best_params["rmse"]
print(best_rmse)
print(best_params)

0.9264604546223303
{'k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}


In [39]:
from surprise import SVD
from surprise.model_selection import GridSearchCV, KFold

# Fixed seed shared by the fold generator and the SVD initialisation, so the
# reported best score/params do not change from one run to the next.
SEED = 42

# 3 * 3 * 3 = 27 candidates. `random_state` is a single-valued entry: it does
# not enlarge the search, it only pins the SVD factor initialisation (and will
# appear as an extra key in the printed best params).
param_grid = {
    "n_epochs": [70, 80, 90,],
    "lr_all": [0.005, 0.006, 0.007],
    "reg_all": [0.05, 0.07, 0.1],
    "random_state": [SEED]
}

# Seeded 4-fold split instead of `cv=4`, which would shuffle with the global RNG.
gs = GridSearchCV(SVD,
                  param_grid=param_grid,
                  measures=["rmse"],
                  cv=KFold(n_splits=4, random_state=SEED, shuffle=True))

gs.fit(data)

# Best mean RMSE across folds and the parameters that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])