In [1]:
import warnings
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import QuantileTransformer

from opal.score.dataset import Dataset
from opal.score.preprocessing_dynamic import PreprocessingDynamic

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

data_path = Path("../../data/osu/scores/")

# Load the "top1k" score dump and run the project's dynamic filters on it.
# Only the accuracy window (0.8 .. 1) and a star-rating floor of 0 are set here;
# the popularity and score filters are disabled (None).
df: pd.DataFrame = PreprocessingDynamic(
    Dataset(data_path, "top1k").joined_filtered_df,
    unpopular_maps_thres=None,
    unpopular_plays_thres=None,
    sr_min_thres=0,
    acc_filter=(0.8, 1),
    score_filter=None
).filter(calc_acc=True)

qt = QuantileTransformer()

# Reduce the frame to one row per (uid, mid) pair:
#   * a "user" is a player in a given year, i.e. uid = "<user_id>/<year>"
#   * repeated plays of the same map by the same uid are averaged
df = (
    df
    .rename(columns={'accuracy': 'acc', 'map_id': 'mid'})
    .assign(uid=lambda d: d['user_id'].astype(str) + "/" + d['year'].astype(str))
    .loc[:, ['uid', 'mid', 'acc']]
    .reset_index(drop=True)
    .groupby(['uid', 'mid'])
    .mean()
    .reset_index()
)

by_score_year 887452 -> 541019
by_sr 541019 -> 541019
by_acc_filter 541019 -> 540947
by_remove_mod 540947 -> 391549
Users Left: 992 | Beatmaps Left: 6545


In [2]:
print(len(df))
# Disabled score floor, kept for reference:
# df = df[df['score'] > 750000]

# Keep maps with at least 50 rows, then (on what is left) keep users with at
# least 50 rows. The second filter is evaluated on the output of the first.
df = (
    df
    .loc[lambda d: d.groupby('mid')['mid'].transform('count') >= 50]
    .loc[lambda d: d.groupby('uid')['uid'].transform('count') >= 50]
)
print(len(df))

369327
275069


In [3]:
from sklearn.preprocessing import LabelEncoder

# Map the string user ids and the map ids to dense integer indices suitable
# for embedding lookups. Each id space gets its OWN encoder so that
# `uid_le.inverse_transform` / `mid_le.inverse_transform` can later recover the
# original ids (previously `uid_le` was refitted on the map ids, leaving
# `mid_le` unfitted and `uid_le` holding map classes).
uid_le = LabelEncoder()
df['uid_le'] = uid_le.fit_transform(df['uid'])
mid_le = LabelEncoder()
df['mid_le'] = mid_le.fit_transform(df['mid'])

In [4]:
import torch.nn as nn
import torch


class NeuMFNet(nn.Module):
    """Two-branch neural matrix-factorisation model for (user, map) pairs.

    * MF branch: the element-wise product of a user and a map embedding
      (both ``mf_emb_dim`` wide) is passed through two Linear/ReLU/Dropout
      layers that keep the width at ``mf_emb_dim``.
    * MLP branch: a second pair of embeddings (``mlp_emb_dim`` wide each) is
      concatenated and passed through an MLP tower ending in ``mlp_chn_out``
      channels.

    Both branch outputs are concatenated and mapped to a single
    Softplus-activated value per (user, map) pair.

    Args:
        uid_no: Number of rows in the user embedding tables.
        mid_no: Number of rows in the map embedding tables.
        mf_emb_dim: Embedding width of the MF branch.
        mlp_emb_dim: Embedding width (per entity) of the MLP branch.
        mlp_chn_out: Output width of the MLP branch.
    """

    def __init__(self, uid_no, mid_no, mf_emb_dim, mlp_emb_dim, mlp_chn_out):
        super().__init__()

        self.u_mf_emb = nn.Embedding(uid_no, mf_emb_dim)
        self.m_mf_emb = nn.Embedding(mid_no, mf_emb_dim)
        # The MF branch consumes the product of the MF embeddings, so its
        # layers must be sized by mf_emb_dim (not mlp_emb_dim); the head below
        # also expects mf_emb_dim features from this branch.
        self.mf_net = nn.Sequential(
            nn.Linear(mf_emb_dim, mf_emb_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(mf_emb_dim, mf_emb_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        self.u_mlp_emb = nn.Embedding(uid_no, mlp_emb_dim)
        self.m_mlp_emb = nn.Embedding(mid_no, mlp_emb_dim)
        self.mlp_net = nn.Sequential(
            nn.Linear(mlp_emb_dim * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, mlp_chn_out),
        )
        self.neu_mf_net = nn.Sequential(
            nn.Linear(mlp_chn_out + mf_emb_dim, 1),
            nn.Softplus(),
        )

    def forward(self, uid, mid):
        """Score (user, map) index pairs.

        Args:
            uid: Integer tensor of user indices, any shape ``S``.
            mid: Integer tensor of map indices, same shape as ``uid``.

        Returns:
            Tensor of shape ``S`` with one non-negative prediction per pair.
        """
        u_mf_emb = self.u_mf_emb(uid)
        m_mf_emb = self.m_mf_emb(mid)
        mf_out = self.mf_net(torch.mul(u_mf_emb, m_mf_emb))

        u_mlp_emb = self.u_mlp_emb(uid)
        m_mlp_emb = self.m_mlp_emb(mid)
        mlp_out = self.mlp_net(torch.concat([u_mlp_emb, m_mlp_emb], dim=-1))

        pred = self.neu_mf_net(torch.concat([mf_out, mlp_out], dim=-1))

        # Drop the trailing singleton channel whatever the index rank was
        # (identical to pred[:, :, 0] for the (batch, 1) inputs used so far).
        return pred[..., 0]


# Smoke test: push one random (user, map) index pair of shape (1, 1) through a
# freshly initialised model to confirm the forward pass runs. The printed value
# comes from random weights and random indices, so it carries no meaning.
NeuMFNet(128, 128, 16, 16, 16)(torch.randint(0, 100, [1, 1]), torch.randint(0, 100, [1, 1]))

tensor([[0.7075]], grad_fn=<SelectBackward0>)

In [5]:
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch import Tensor

# Seed for the train/val/test partition so that re-running the notebook
# reproduces the same split.
SPLIT_SEED = 42

# Build the tensors directly in their final dtype. Going through the float32
# `Tensor(...)` constructor first would corrupt integer ids above 2**24.
# Double brackets keep a trailing axis of size 1, i.e. each tensor is (N, 1).
x_uid = torch.tensor(df[['uid_le']].values, dtype=torch.long)
x_mid = torch.tensor(df[['mid_le']].values, dtype=torch.long)
y = torch.tensor(df[['acc']].values, dtype=torch.float32)
ds = TensorDataset(x_uid, x_mid, y)

# 70 % train / 20 % validation / remainder test.
train_size = int(len(ds) * 0.7)
val_size = int(len(ds) * 0.2)
test_size = len(ds) - train_size - val_size
train_set, val_set, test_set = random_split(
    ds,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_dl = DataLoader(train_set, batch_size=256, shuffle=True, num_workers=2)
val_dl = DataLoader(val_set, batch_size=256, num_workers=2)
test_dl = DataLoader(test_set, batch_size=256, num_workers=2)

In [6]:
import pytorch_lightning as pl


# SEE Desmos: https://www.desmos.com/calculator/f7tlysna7c
def adj_inv_sigmoid(x):
    """Map a curved value back onto the linearised scale.

    The input is first shifted/scaled to ``p = x / 2.5 + 0.5`` and then passed
    through the inverse of the logistic function, ``-log(1 / p - 1)``.

    Args:
        x: Tensor of curved values.

    Returns:
        Tensor of the same shape holding the linearised values.
    """
    p = (x / 2.5) + 0.5
    inverse_odds = 1 / p - 1
    return -torch.log(inverse_odds)


def adj_sigmoid(x):
    """Map a linearised value onto the curved scale, bounded by +/-1.25.

    Mathematically this is ``-(0.5 * e^-x - 0.5) / (0.4 * (e^-x + 1))``, which
    simplifies to ``1.25 * tanh(x / 2)``. The tanh form is used because the
    explicit ``exp(-x)`` overflows for large negative ``x`` and then evaluates
    to ``inf / inf = nan``.

    Args:
        x: Tensor of linearised values.

    Returns:
        Tensor of the same shape with values in ``[-1.25, 1.25]``.
    """
    return 1.25 * torch.tanh(x / 2)



In [7]:
# Round-trip check: adj_inv_sigmoid must undo adj_sigmoid. The difference is
# wrapped in abs() because a signed difference would also pass for any result
# that is (arbitrarily far) below 1.
assert abs(adj_inv_sigmoid(adj_sigmoid(torch.ones([1]))).item() - 1) < 1e-5

In [20]:
from typing import Any
from torch.optim.lr_scheduler import ReduceLROnPlateau


class LitNet(pl.LightningModule):
    """Lightning wrapper that trains ``NeuMFNet`` on linearised accuracy.

    The wrapped model predicts on the scale produced by ``adj_inv_sigmoid``;
    the loss is computed on that scale, while the logged MAE metrics and the
    predictions are converted back with ``adj_sigmoid``.

    Args:
        uid_no, mid_no, mf_emb_dim, mlp_emb_dim, mlp_chn_out: Forwarded
            unchanged to ``NeuMFNet``.
    """

    def __init__(self, uid_no, mid_no, mf_emb_dim, mlp_emb_dim, mlp_chn_out):
        super().__init__()
        self.model = NeuMFNet(uid_no, mid_no, mf_emb_dim, mlp_emb_dim, mlp_chn_out)

    def forward(self, uid, mid):
        """Return the wrapped model's raw (linearised-scale) prediction."""
        return self.model(uid, mid)

    def training_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> torch.Tensor:
        """Compute the RMSE loss on the linearised scale and log ``train_mae``."""
        x_uid, x_mid, y = batch
        y_hat = self(x_uid, x_mid)
        y_adj = adj_inv_sigmoid(y)

        # We use an inv. sigmoid to make the model learn from a more linear accuracy curve.
        # Here, we measure the loss of the pred - linearized acc
        loss = torch.sqrt(((y_hat - y_adj) ** 2).mean())

        # The metric is reported on the original (curved) accuracy scale, so
        # only the prediction needs converting back; the target is compared
        # as-is, exactly like val_mae does.
        self.log("train_mae", torch.abs(adj_sigmoid(y_hat) - y).mean())

        return loss

    def validation_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0):
        """Log ``val_mae``: MAE between the back-transformed prediction and ``y``."""
        x_uid, x_mid, y = batch

        y_hat = self(x_uid, x_mid)
        self.log("val_mae", torch.abs(adj_sigmoid(y_hat) - y).mean())

    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> torch.Tensor:
        """Return the prediction for one batch on the original accuracy scale."""
        x_uid, x_mid, y = batch
        return adj_sigmoid(self(x_uid, x_mid))

    def configure_optimizers(self):
        """Adam with weight decay; the LR is cut by 5x when ``val_mae`` plateaus."""
        optim = torch.optim.Adam(self.parameters(), lr=0.0005, weight_decay=0.001)

        return {
            "optimizer": optim,
            "lr_scheduler": {
                "scheduler": ReduceLROnPlateau(optim, mode='min', factor=0.2, patience=2, verbose=True),
                "monitor": "val_mae",
            },
        }

In [21]:
# Distinct user ids and map ids remaining in the filtered frame, and how many
# of each there are.
uids, mids = df['uid'].unique(), df['mid'].unique()
uid_no, mid_no = len(uids), len(mids)

In [22]:
# Instantiate the Lightning module; keyword arguments make the three width
# settings self-describing.
net = LitNet(
    uid_no=uid_no,
    mid_no=mid_no,
    mf_emb_dim=16,
    mlp_emb_dim=16,
    mlp_chn_out=8,
)

In [23]:
from pytorch_lightning.callbacks import EarlyStopping

# Stop once val_mae has failed to improve for 5 consecutive validation checks.
# NOTE: with max_epochs=1 the run ends before that patience can be exhausted.
early_stop_callback = EarlyStopping(monitor="val_mae", min_delta=0.00, patience=5, verbose=False, mode="min")
# 'auto' lets Lightning choose the accelerator available on the host instead of
# failing outright on machines without a GPU.
trainer = pl.Trainer(max_epochs=1,
                     accelerator='auto',
                     callbacks=[early_stop_callback])
trainer.fit(net, train_dataloaders=train_dl, val_dataloaders=val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type     | Params
-----------------------------------
0 | model | NeuMFNet | 313 K 
-----------------------------------
313 K     Trainable params
0         Non-trainable params
313 K     Total params
1.253     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [24]:
# Run predict_step over every test batch; the result is a list with one entry
# per batch.
y_pred = trainer.predict(model=net, dataloaders=test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 753it [00:00, ?it/s]

In [26]:
# Grab only the first test batch together with its prediction for inspection.
# `true` is the batch yielded by the loader and `pred` the matching entry of
# y_pred. next() replaces the loop-and-break idiom and raises StopIteration
# instead of silently leaving the names unbound when there is nothing to read.
true, pred = next(zip(test_dl, y_pred))