In [1]:
import sqlite3

# Open the SQLite database file that the score query below reads from.
con = sqlite3.connect(database="data.db")

## Data Preparation

Fetch from the DB and process our UID (User ID) and MID (Map ID)
- For each distinct `speed` & `map`, we set a separate `mid`
- For each distinct `create_at_year` (year played) & `player`, we set a separate `uid`.

**Note:**
- `speed`: `{-1: HT, 1: DT, 0: Otherwise}`
- `create_at`: Unix timestamp, in seconds

In [2]:
import pandas as pd

# Pull every score row; the SQL columns are renamed to the short names used
# throughout the notebook (beatmap_id -> mid, user_id -> uid).
df = pd.DataFrame(
    con.execute(
        "SELECT beatmap_id, user_id, score, accuracy, speed, create_at FROM Score"
    ).fetchall(),
    columns=['mid', 'uid', 'score', 'accuracy', 'speed', 'create_at'],
)

# Derive the composite keys:
#   uid -> "<user_id>/<year played>"
#   mid -> "<beatmap_id>/<speed>"
# `create_at_year` is assigned first so the `uid` expression can read it.
df = df.assign(
    create_at_year=lambda d: pd.to_datetime(d['create_at'], unit='s').dt.year,
    uid=lambda d: d['uid'].astype(str) + "/" + d['create_at_year'].astype(str),
    mid=lambda d: d['mid'].astype(str) + "/" + d['speed'].astype(str),
)

### Remove Outliers
We're not interested in:
- scores of 750K or below, as they aren't "good"
- players (per year, i.e. each `uid`) with fewer than 25 scores, as there will be too few associations
- maps (per speed, i.e. each `mid`) with fewer than 25 scores, for the same reason

In [5]:
# Row count before filtering.
print(len(df))

# Keep only scores strictly above 750K.
df = df.loc[df['score'] > 750000]

# Drop maps, then players, that have fewer than 25 remaining rows.
# The order matters: the player counts are computed after the map filter.
df = df.loc[df.groupby('mid')['mid'].transform('count') >= 25]
df = df.loc[df.groupby('uid')['uid'].transform('count') >= 25]

# Row count after filtering.
print(len(df))

3874616
2785274


### Label Encode

Embedding layers take integer indices, but our `uid` and `mid` are strings (e.g. `uid = 1928423/2019`, `mid = 284812/-1`),
so we use `LabelEncoder` to map each distinct value to an integer.

In [6]:
from sklearn.preprocessing import LabelEncoder

# One encoder per key so that each can later decode its own integer codes
# back to the original "<id>/<suffix>" strings via `inverse_transform`.
uid_le = LabelEncoder()
df['uid_le'] = uid_le.fit_transform(df['uid'])

# Fit the map encoder itself. Previously `uid_le` was re-fitted on the `mid`
# column, which left `mid_le` unfitted and made `uid_le` decode to map labels.
mid_le = LabelEncoder()
df['mid_le'] = mid_le.fit_transform(df['mid'])

## Create the Net

Refer to [**this review of the Neural Collaborative Filtering paper**](https://towardsdatascience.com/paper-review-neural-collaborative-filtering-explanation-implementation-ea3e031b7f96)

- `X_X_emb`: Embedding Layers
- `mf_net`: GMF Layer
- `mlp_net`: MLP Layer
- `neu_mf_net`: NeuMF Layer (the final concat layer)

In [7]:
import torch.nn as nn
import torch


class NeuMFNet(nn.Module):
    """NeuMF-style network with a matrix-factorisation branch and an MLP branch.

    Each branch has its own user and map embedding tables. The MF branch
    multiplies the two embeddings element-wise and passes the product through
    two small dense layers; the MLP branch concatenates its embeddings and
    passes them through a dense stack. Both branch outputs are concatenated
    and reduced to a single non-negative value (Softplus).

    Args:
        uid_no: Number of distinct user indices (rows of the user embeddings).
        mid_no: Number of distinct map indices (rows of the map embeddings).
        mf_emb_dim: Embedding width of the MF branch.
        mlp_emb_dim: Embedding width of the MLP branch.
        mlp_chn_out: Output width of the MLP branch.
    """

    def __init__(self, uid_no, mid_no, mf_emb_dim, mlp_emb_dim, mlp_chn_out):
        super(NeuMFNet, self).__init__()

        self.u_mf_emb = nn.Embedding(uid_no, mf_emb_dim)
        self.m_mf_emb = nn.Embedding(mid_no, mf_emb_dim)
        # These layers consume the product of the MF embeddings, so they must
        # be sized by `mf_emb_dim`. Sizing them by `mlp_emb_dim` only worked
        # when both dims happened to be equal.
        self.mf_net = nn.Sequential(
            nn.Linear(mf_emb_dim, mf_emb_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(mf_emb_dim, mf_emb_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        self.u_mlp_emb = nn.Embedding(uid_no, mlp_emb_dim)
        self.m_mlp_emb = nn.Embedding(mid_no, mlp_emb_dim)
        # Input width is doubled because user and map embeddings are concatenated.
        self.mlp_net = nn.Sequential(
            nn.Linear(mlp_emb_dim * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, mlp_chn_out),
        )
        # Final head over the concatenated [MF branch, MLP branch] features.
        self.neu_mf_net = nn.Sequential(
            nn.Linear(mlp_chn_out + mf_emb_dim, 1),
            nn.Softplus(),
        )

    def forward(self, uid, mid):
        """Predict one value per (uid, mid) index pair.

        Args:
            uid: Integer tensor of user indices.
            mid: Integer tensor of map indices, same shape as `uid`.

        Returns:
            Tensor with the same shape as `uid` / `mid`.
        """
        u_mf_emb = self.u_mf_emb(uid)
        m_mf_emb = self.m_mf_emb(mid)
        mf_out = self.mf_net(torch.mul(u_mf_emb, m_mf_emb))

        u_mlp_emb = self.u_mlp_emb(uid)
        m_mlp_emb = self.m_mlp_emb(mid)
        mlp_out = self.mlp_net(torch.cat([u_mlp_emb, m_mlp_emb], dim=-1))

        pred = self.neu_mf_net(torch.cat([mf_out, mlp_out], dim=-1))

        # Drop the trailing size-1 feature axis. `...` keeps this valid for
        # index tensors of any rank (it equals `[:, :, 0]` for 2-D inputs).
        return pred[..., 0]

# Smoke test: run a single random (uid, mid) pair through a small network.
NeuMFNet(uid_no=128, mid_no=128, mf_emb_dim=16, mlp_emb_dim=16, mlp_chn_out=16)(
    torch.randint(low=0, high=100, size=[1, 1]),
    torch.randint(low=0, high=100, size=[1, 1]),
)

tensor([[0.8117]], grad_fn=<SelectBackward0>)

## DataLoaders

In [8]:
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch import Tensor

# Build the index tensors directly as int64. Going through a float32 `Tensor`
# first would silently corrupt any index above 2**24.
x_uid = torch.tensor(df[['uid_le']].values, dtype=torch.long)
x_mid = torch.tensor(df[['mid_le']].values, dtype=torch.long)
y = Tensor(df[['accuracy']].values)
ds = TensorDataset(x_uid, x_mid, y)

# 70 / 20 / 10 train / validation / test split; the test split takes the
# remainder so the three sizes always sum to len(ds).
train_size = int(len(ds) * 0.7)
val_size = int(len(ds) * 0.2)
test_size = len(ds) - train_size - val_size
# Seed the split so re-running the cell keeps the same held-out rows.
train_set, val_set, test_set = random_split(
    ds,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(0),
)
train_dl = DataLoader(train_set, batch_size=256, shuffle=True, num_workers=4)
val_dl = DataLoader(val_set, batch_size=256, num_workers=4)
# Wrap the test *subset*; the original passed the integer `test_size` here.
test_dl = DataLoader(test_set, batch_size=256, num_workers=4)

## Create Net Wrapper

PyTorch-Lightning wrapper makes it easier to train & visualize.

In [9]:
import pytorch_lightning as pl


# SEE Desmos: https://www.desmos.com/calculator/f7tlysna7c
def adj_inv_sigmoid(x):
    """Inverse of `adj_sigmoid`: map a bounded value to an unbounded one.

    Computes ``-log(1 / (x / 2.5 + 0.5) - 1)``, which is the logit of
    ``x / 2.5 + 0.5``. The result is finite only for ``-1.25 < x < 1.25``.
    """
    shifted = (x / 2.5) + 0.5
    return -torch.log(1 / shifted - 1)


def adj_sigmoid(x):
    """Squash an unbounded value into the open interval (-1.25, 1.25).

    Computes ``-(0.5 * e^-x - 0.5) / (0.4 * (e^-x + 1))``, a sigmoid that is
    rescaled and shifted so that 0 maps to 0.
    """
    decay = torch.exp(-x)
    numerator = 0.5 * decay - 0.5
    denominator = 0.4 * (decay + 1)
    return -numerator / denominator



In [10]:
# Round-trip sanity check: applying the inverse after the forward transform
# should give back the input (1.0).
adj_inv_sigmoid(adj_sigmoid(torch.ones(1)))

tensor([1.0000])

In [11]:


from torch.optim.lr_scheduler import ReduceLROnPlateau


class LitNet(pl.LightningModule):
    """Lightning wrapper around `NeuMFNet`.

    The network is trained in the `adj_inv_sigmoid`-transformed target space
    with an RMSE loss. The logged metrics (`train_mae`, `val_mae`, `test_mae`)
    are mean absolute errors after mapping both prediction and target back
    through `adj_sigmoid`.

    Args:
        uid_no, mid_no, mf_emb_dim, mlp_emb_dim, mlp_chn_out: Forwarded
            unchanged to `NeuMFNet`.
    """

    def __init__(self, uid_no, mid_no, mf_emb_dim, mlp_emb_dim, mlp_chn_out):
        super().__init__()
        self.model = NeuMFNet(uid_no, mid_no, mf_emb_dim, mlp_emb_dim, mlp_chn_out)

    def forward(self, uid, mid):
        """Delegate to the wrapped `NeuMFNet`."""
        return self.model(uid, mid)

    def _predict_and_target(self, batch):
        """Return (prediction, transformed target) for one `(uid, mid, y)` batch."""
        x_uid, x_mid, y = batch
        y_hat = self(x_uid, x_mid)
        y_adj = adj_inv_sigmoid(y)
        return y_hat, y_adj

    @staticmethod
    def _mae(y_hat, y_adj):
        """Mean absolute error after mapping both tensors through `adj_sigmoid`."""
        return torch.abs(adj_sigmoid(y_hat) - adj_sigmoid(y_adj)).mean()

    def training_step(self, batch, batch_idx):
        """Return the RMSE loss in transformed space and log `train_mae`."""
        y_hat, y_adj = self._predict_and_target(batch)
        loss = torch.sqrt(((y_hat - y_adj) ** 2).mean())
        self.log("train_mae", self._mae(y_hat, y_adj))

        return loss

    def validation_step(self, batch, batch_idx):
        """Log `val_mae`, the metric monitored by the LR scheduler."""
        y_hat, y_adj = self._predict_and_target(batch)
        self.log("val_mae", self._mae(y_hat, y_adj))

    def test_step(self, batch, batch_idx):
        """Log `test_mae` so a held-out loader can be evaluated with `trainer.test`."""
        y_hat, y_adj = self._predict_and_target(batch)
        self.log("test_mae", self._mae(y_hat, y_adj))

    def configure_optimizers(self):
        """Adam with weight decay, plus a plateau scheduler keyed on `val_mae`."""
        optim = torch.optim.Adam(self.parameters(), lr=0.0005, weight_decay=0.001)

        return {
            "optimizer": optim,
            "lr_scheduler": {
                # Multiply the LR by 0.2 once `val_mae` stops improving (patience=2).
                "scheduler": ReduceLROnPlateau(optim, mode='min', factor=0.2, patience=2, verbose=True),
                "monitor": "val_mae",
            },
        }

In [12]:
# Distinct composite keys that survived filtering; their counts are passed
# to the network as the user and map cardinalities.
uids, mids = df['uid'].unique(), df['mid'].unique()
uid_no, mid_no = len(uids), len(mids)

In [13]:
# Instantiate the Lightning module with 256-wide embeddings in both branches
# and a 128-wide MLP branch output.
net = LitNet(
    uid_no=uid_no,
    mid_no=mid_no,
    mf_emb_dim=256,
    mlp_emb_dim=256,
    mlp_chn_out=128,
)

In [14]:
from pytorch_lightning.callbacks import EarlyStopping

# Early stopping watches the same validation metric as the LR scheduler.
early_stop_callback = EarlyStopping(
    monitor="val_mae",
    mode="min",
    min_delta=0.00,
    patience=5,
    verbose=False,
)

# Train on the GPU for at most 40 epochs.
trainer = pl.Trainer(
    accelerator='gpu',
    max_epochs=40,
    callbacks=[early_stop_callback],
)
trainer.fit(net, train_dataloaders=train_dl, val_dataloaders=val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type     | Params
-----------------------------------
0 | model | NeuMFNet | 27.3 M
-----------------------------------
27.3 M    Trainable params
0         Non-trainable params
27.3 M    Total params
109.048   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00005: reducing learning rate of group 0 to 1.0000e-04.


Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
