# Using Scikit-Surprise

Surprise!

It's a well-developed collaborative filtering (CF) wrapper that handles what we've just discovered, and more.

Essentially it has created everything we needed.

## Approach

If you recall, we don't use the full dataset because it is extremely large.
Instead, by taking a small representative sample, we can run **quick** tests that estimate the population behaviour.

Then, we narrow down the best parameters after we select the best few algorithms.

## Data Preparation

As usual, we take a representative sample from the set.

We also alter it to make it suitable for `surprise`.

In [1]:
import warnings
from pathlib import Path

import pandas as pd
from torch.utils.tensorboard import SummaryWriter

from opal.score.dataset import Dataset as ODataset
from opal.score.preprocessing_dynamic import PreprocessingDynamic

# Suppress library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

data_path = Path("../../data/osu/scores/")

# Joined and filtered score table of the "top10k" dataset.
df_raw = ODataset(data_path, "top10k").joined_filtered_df

# Run the dynamic preprocessing with every optional threshold/filter passed as
# None and `sr_min_thres=0`. `calc_acc=True` presumably adds the `accuracy`
# column that is renamed below -- confirm against PreprocessingDynamic.
df: pd.DataFrame = PreprocessingDynamic(
    df_raw,
    unpopular_maps_thres=None,
    unpopular_plays_thres=None,
    sr_min_thres=0,
    acc_filter=None,
    score_filter=None,
).filter(calc_acc=True)

# Short column names used by the rest of the notebook.
df = df.rename(columns={'accuracy': 'acc', 'map_id': 'mid'})

# Disabled quantile transform of `acc`, kept for reference:
# qt = QuantileTransformer()
# df[['acc_qt']] = qt.fit_transform(df['acc'].to_numpy().reshape(-1, 1))

# A "user" here is a (user_id, year) pair, encoded as "<user_id>/<year>".
df['uid'] = df['user_id'].astype(str) + "/" + df['year'].astype(str)

# Keep only the three modelling columns and average the accuracy of repeated
# (uid, mid) pairs so each pair appears exactly once.
df = (
    df[['uid', 'mid', 'acc']]
    .reset_index(drop=True)
    .groupby(['uid', 'mid'])
    .agg('mean')
    .reset_index()
)

# Both identifiers are handled as strings from here on.
df = df.astype({'uid': str, 'mid': str})

by_score_year 6021393 -> 3913111
by_sr 3913111 -> 3913111
by_remove_mod 3913111 -> 3137673
Users Left: 9890 | Beatmaps Left: 6622


In [2]:
import torch.nn as nn
import torch
import pytorch_lightning as pl


class Net(nn.Module):
    """Embedding-based regressor over a (user index, map index) pair.

    Each index is looked up in its own embedding table, the two embeddings are
    concatenated along the last axis, and the result is passed through a fully
    connected stack (2 * emb_dims -> 1024 -> 512 -> 256 -> 128 -> 16 -> 1).
    Every hidden linear layer is followed by ``Dropout(0.2)`` and an
    activation: ``Tanh`` for the first four, ``Sigmoid`` for the 16-unit one.

    Args:
        uid_no: Number of rows in the user embedding table.
        mid_no: Number of rows in the map embedding table.
        emb_dims: Width of each embedding vector.
    """

    def __init__(self, uid_no: int, mid_no: int, emb_dims: int):
        super().__init__()
        self.uid = nn.Embedding(uid_no, emb_dims)
        self.mid = nn.Embedding(mid_no, emb_dims)
        # NOTE: the layer order is kept exactly as before so that state_dict
        # keys ("net.0.weight", ...) of existing checkpoints still match.
        self.net = nn.Sequential(
            nn.Linear(emb_dims * 2, 1024),
            nn.Dropout(0.2),
            nn.Tanh(),
            nn.Linear(1024, 512),
            nn.Dropout(0.2),
            nn.Tanh(),
            nn.Linear(512, 256),
            nn.Dropout(0.2),
            nn.Tanh(),
            nn.Linear(256, 128),
            nn.Dropout(0.2),
            nn.Tanh(),
            nn.Linear(128, 16),
            nn.Dropout(0.2),
            nn.Sigmoid(),
            nn.Linear(16, 1),
        )

    def forward(self, uid: torch.Tensor, mid: torch.Tensor) -> torch.Tensor:
        """Predict one value per (uid, mid) index pair.

        Args:
            uid: Integer tensor of user indices, any shape ``S``.
            mid: Integer tensor of map indices, same shape as ``uid``.

        Returns:
            Tensor of shape ``S + (1,)``; e.g. ``(batch, 1)`` indices give a
            ``(batch, 1, 1)`` output, ``(batch,)`` indices give ``(batch, 1)``.
        """
        # Concatenate on the last (embedding) axis instead of a hard-coded
        # axis 2: identical for (batch, 1) indices, and it no longer fails for
        # flat (batch,) index tensors.
        concat = torch.cat([self.uid(uid), self.mid(mid)], dim=-1)
        return self.net(concat)


In [10]:

class LitNet(pl.LightningModule):
    """Lightning module that trains ``Net`` with an RMSE objective.

    Besides Lightning's own logging, a separate TensorBoard ``SummaryWriter``
    (writing to ``lightning_logs/``) records the training RMSE restricted to
    samples whose target exceeds each threshold in ``_ACC_BUCKETS``, all drawn
    on one multi-line chart.

    Args:
        uid_no: Number of distinct user indices (embedding table size).
        mid_no: Number of distinct map indices (embedding table size).
        emb_dims: Width of each embedding vector.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
    """

    # TensorBoard tag -> lower bound on the target `y` for that curve.
    _ACC_BUCKETS = {
        "loss/99": 0.99,
        "loss/98": 0.98,
        "loss/97": 0.97,
        "loss/95": 0.95,
        "loss/92": 0.92,
        "loss/90": 0.90,
        "loss/85": 0.85,
    }

    def __init__(self, uid_no, mid_no, emb_dims, lr=0.0005, weight_decay=0.001):
        super().__init__()
        self.model = Net(uid_no, mid_no, emb_dims)
        self.lr = lr
        self.weight_decay = weight_decay
        self.writer = SummaryWriter("lightning_logs/")

        # Group every bucket curve into a single "loss" multi-line chart.
        layout = {
            "model": {
                "loss": ["Multiline", list(self._ACC_BUCKETS)],
            },
        }
        self.writer.add_custom_scalars(layout)

    def forward(self, uid, mid):
        """Delegate to the wrapped ``Net``."""
        return self.model(uid, mid)

    def _predict(self, x_uid, x_mid, y):
        """Return predictions reshaped to ``y``'s shape.

        ``Net`` appends a trailing axis to the index shape, so for ``(B, 1)``
        indices it returns ``(B, 1, 1)``. Subtracting a ``(B, 1)`` target from
        that would broadcast to ``(B, B, 1)`` and compare every prediction
        with every target; reshaping keeps the error element-wise.
        """
        return self(x_uid, x_mid).reshape(y.shape)

    def training_step(self, batch, batch_idx):
        """Compute the batch RMSE and log per-bucket RMSE to TensorBoard.

        Args:
            batch: ``(x_uid, x_mid, y)`` tensors.
            batch_idx: Index of the batch within the epoch (unused).

        Returns:
            The RMSE over the whole batch, used as the training loss.
        """
        x_uid, x_mid, y = batch
        sq_err = (self._predict(x_uid, x_mid, y) - y) ** 2
        loss = torch.sqrt(sq_err.mean())

        # `global_step` keeps increasing across epochs, whereas `batch_idx`
        # restarts at 0 every epoch and would make the curves fold back.
        step = self.global_step
        sq_err = sq_err.detach()
        for tag, thres in self._ACC_BUCKETS.items():
            in_bucket = y > thres
            # Skip buckets with no sample in this batch (mean would be NaN).
            if in_bucket.any():
                bucket_rmse = torch.sqrt(sq_err[in_bucket].mean())
                self.writer.add_scalar(tag, bucket_rmse.item() * 100, step)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log RMSE and MAE (both scaled by 100) for one validation batch."""
        x_uid, x_mid, y = batch
        err = self._predict(x_uid, x_mid, y) - y
        self.log("val_rmse", torch.sqrt((err ** 2).mean()) * 100)
        self.log("val_mae", err.abs().mean() * 100)

    def on_train_end(self):
        """Flush buffered TensorBoard events once training has finished."""
        self.writer.flush()

    def configure_optimizers(self):
        """Return the Adam optimizer over all parameters."""
        return torch.optim.Adam(
            self.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

In [11]:
# Distinct user and map identifiers; their counts size the embedding tables.
uids, mids = df['uid'].unique(), df['mid'].unique()
uid_no, mid_no = len(uids), len(mids)

In [12]:
from sklearn import preprocessing

# One encoder per identifier column, kept so that integer codes can be mapped
# back to the original string ids later.
uid_le = preprocessing.LabelEncoder()
mid_le = preprocessing.LabelEncoder()

# Fit each encoder on its column and store the resulting integer codes
# alongside the original ids.
for src_col, dst_col, encoder in (('uid', 'uid_le', uid_le),
                                  ('mid', 'mid_le', mid_le)):
    df[dst_col] = encoder.fit_transform(df[src_col])

In [13]:
# Build the Lightning model with embedding vectors of width 512.
net = LitNet(uid_no=uid_no, mid_no=mid_no, emb_dims=512)

In [14]:
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch import Tensor

# Column tensors of shape (n_rows, 1). The integer codes are created directly
# as int64 instead of going through a float32 tensor first, which would lose
# precision for codes above 2**24. (`torch` is imported earlier in the file.)
x_uid = torch.tensor(df[['uid_le']].values, dtype=torch.long)
x_mid = torch.tensor(df[['mid_le']].values, dtype=torch.long)
y = torch.tensor(df[['acc']].values, dtype=torch.float32)
ds = TensorDataset(x_uid, x_mid, y)

# 70 % train / 20 % validation / remainder test; the test size is computed by
# subtraction so the three parts always sum to len(ds).
train_size = int(len(ds) * 0.7)
val_size = int(len(ds) * 0.2)
test_size = len(ds) - train_size - val_size
train_set, val_set, test_set = random_split(ds, [train_size, val_size, test_size])

train_dl = DataLoader(train_set, batch_size=256, shuffle=True)
val_dl = DataLoader(val_set, batch_size=256)
# Wrap the test *subset*; the previous code passed the integer `test_size`,
# which produced a loader that could not be iterated.
test_dl = DataLoader(test_set, batch_size=256)

In [15]:
# Early stopping is currently disabled. To enable it, import EarlyStopping
# from pytorch_lightning.callbacks and monitor a metric that validation_step
# actually logs ("val_rmse" or "val_mae"); "val_mse" is never logged.
# early_stop_callback = EarlyStopping(monitor="val_rmse", min_delta=0.00, patience=4, verbose=False, mode="min")
trainer = pl.Trainer(
    max_epochs=40,
    # 'auto' selects the GPU when one is available and otherwise falls back,
    # instead of failing outright on machines without CUDA.
    accelerator='auto',
    # callbacks=[early_stop_callback]
)
trainer.fit(
    net,
    train_dataloaders=train_dl,
    val_dataloaders=val_dl,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type | Params
-------------------------------
0 | model | Net  | 17.0 M
-------------------------------
17.0 M    Trainable params
0         Non-trainable params
17.0 M    Total params
68.026    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]