In [1]:
import sqlite3

# Open the SQLite database file that the Score query below reads from.
DB_PATH = "data.db"
con = sqlite3.connect(DB_PATH)

## Data Preparation

Fetch from the DB and process our UID (User ID) and MID (Map ID)
- For each distinct `speed` & `map`, we set a separate `mid`
- For each distinct `create_at_year` (year played) & `player`, we set a separate `uid`.

**Note:**
- `speed`: `{-1: HT, 1: DT, 0: Otherwise}`
- `create_at`: Unix timestamp, in seconds

In [2]:
import pandas as pd

# Pull every score row. The SELECT order must line up with SCORE_COLUMNS,
# which renames beatmap_id -> mid and user_id -> uid.
SCORE_QUERY = "SELECT beatmap_id, user_id, score, accuracy, speed, create_at FROM Score"
SCORE_COLUMNS = ['mid', 'uid', 'score', 'accuracy', 'speed', 'create_at']

score_rows = con.execute(SCORE_QUERY).fetchall()
df = pd.DataFrame(score_rows, columns=SCORE_COLUMNS)

# `create_at` is parsed as seconds since the Unix epoch; keep only the year.
played_at = pd.to_datetime(df['create_at'], unit='s')
df['create_at_year'] = played_at.dt.year

# Composite string keys: "<user>/<year>" for uid and "<map>/<speed>" for mid.
df['uid'] = df['uid'].astype(str).str.cat(df['create_at_year'].astype(str), sep="/")
df['mid'] = df['mid'].astype(str).str.cat(df['speed'].astype(str), sep="/")

### Remove Outliers
We're not interested in:
- scores of 750K or below, as they aren't "good"
- players (per year) with fewer than 100 scores, as there will be too few associations
- maps (per speed) with fewer than 100 scores, for the same reason

In [3]:
MIN_SCORE = 750000   # scores must be strictly above this value
MIN_PLAYS = 100      # minimum number of rows per mid / per uid

# 1) Drop low scores.
df = df[df['score'] > MIN_SCORE]

# 2) Drop maps with too few remaining rows.
rows_per_mid = df.groupby('mid')['mid'].transform('count')
df = df[rows_per_mid >= MIN_PLAYS]

# 3) Drop users with too few remaining rows. This runs after the map filter,
#    so a map may end up below MIN_PLAYS again once sparse users are removed.
rows_per_uid = df.groupby('uid')['uid'].transform('count')
df = df[rows_per_uid >= MIN_PLAYS]

### Label Encode

Embedding layers take integer indices, but our `uid` and `mid` are strings (e.g. `uid = 1928423/2019`, `mid = 284812/-1`),
so we use `LabelEncoder` to map each of them to consecutive integers.

In [4]:
from sklearn.preprocessing import LabelEncoder

# Map the composite string keys to dense integer ids for the embedding layers.
# Each column gets its own encoder so that `inverse_transform` on `uid_le` /
# `mid_le` recovers the matching original strings.
uid_le = LabelEncoder()
df['uid_le'] = uid_le.fit_transform(df['uid'])
mid_le = LabelEncoder()
# Fit `mid_le` (not `uid_le`) here: re-fitting `uid_le` on the map ids would
# overwrite its user classes and leave `mid_le` unfitted.
df['mid_le'] = mid_le.fit_transform(df['mid'])

## Create the Net

Refer to [**this paper**](https://towardsdatascience.com/paper-review-neural-collaborative-filtering-explanation-implementation-ea3e031b7f96)

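- `u_deep_emb` / `m_deep_emb`: Embedding layers for the user and map ids
- `wide_net`: Wide branch (one-hot user and map ids, concatenated)
- `deep_net`: Deep branch (MLP over the element-wise product of the embeddings)
- `comb_net`: Final layer that concatenates both branches and outputs the prediction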

In [5]:
import torch.nn as nn
import torch


class WideDeepNet(nn.Module):
    """Wide & Deep regressor over (user id, map id) pairs.

    The wide branch is the concatenated one-hot encoding of both ids. The deep
    branch feeds the element-wise product of two learned embeddings through a
    small MLP. Both branches are concatenated and reduced to one non-negative
    value per pair by a linear layer followed by Softplus.

    Args:
        uid_no: Number of distinct user ids (one-hot width / embedding rows).
        mid_no: Number of distinct map ids (one-hot width / embedding rows).
        deep_emb_dim: Width of the user and map embeddings.
        deep_chn_out: Output width of the deep MLP.
    """

    def __init__(self, uid_no, mid_no, deep_emb_dim, deep_chn_out):
        super().__init__()

        self.uid_no = uid_no
        self.mid_no = mid_no
        # An empty Sequential passes its input through unchanged, so the wide
        # branch currently forwards the raw one-hot vectors.
        self.wide_net = nn.Sequential()

        self.u_deep_emb = nn.Embedding(uid_no, deep_emb_dim)
        self.m_deep_emb = nn.Embedding(mid_no, deep_emb_dim)
        self.deep_net = nn.Sequential(
            nn.Linear(deep_emb_dim, 8),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(8, 8),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(8, 8),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(8, deep_chn_out),
            nn.ReLU(),
        )

        # Input width = both one-hot vectors + the deep branch output.
        self.comb_net = nn.Sequential(
            nn.Linear(uid_no + mid_no + deep_chn_out, 1),
            nn.Softplus()
        )

    def forward(self, uid, mid):
        """Predict one value per (uid, mid) pair.

        Args:
            uid: int64 tensor of user indices in ``[0, uid_no)``.
            mid: int64 tensor of map indices in ``[0, mid_no)``, same shape as ``uid``.

        Returns:
            Float tensor with the same shape as ``uid``; values are positive
            because of the final Softplus.
        """
        # one_hot returns int64; cast explicitly to the parameter dtype instead
        # of relying on implicit type promotion when concatenating with floats.
        float_dtype = self.u_deep_emb.weight.dtype
        u_wide_emb = nn.functional.one_hot(uid, self.uid_no).to(float_dtype)
        m_wide_emb = nn.functional.one_hot(mid, self.mid_no).to(float_dtype)
        wide_out = self.wide_net(torch.cat([u_wide_emb, m_wide_emb], dim=-1))

        u_deep_emb = self.u_deep_emb(uid)
        m_deep_emb = self.m_deep_emb(mid)
        deep_out = self.deep_net(torch.mul(u_deep_emb, m_deep_emb))

        pred = self.comb_net(torch.cat([wide_out, deep_out], dim=-1))

        # Drop the trailing size-1 channel; `...` keeps this valid for index
        # tensors of any rank (e.g. shape [B] as well as [B, 1]).
        return pred[..., 0]

In [6]:
# Smoke test: 5 random (uid, mid) pairs with ids drawn from [0, 100), which is
# inside the 128-entry tables of this throwaway model.
smoke_net = WideDeepNet(128, 128, 100, 100)
smoke_uid = torch.randint(0, 100, [5, 1])
smoke_mid = torch.randint(0, 100, [5, 1])
smoke_net(smoke_uid, smoke_mid)

tensor([[0.6677],
        [0.6640],
        [0.6468],
        [0.6353],
        [0.6670]], grad_fn=<SelectBackward0>)

## DataLoaders

In [7]:
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch import Tensor

SPLIT_SEED = 42   # fixed seed so the train/val/test split is reproducible
BATCH_SIZE = 16
NUM_WORKERS = 4

# Build the id tensors directly as int64. Going through a float32 `Tensor`
# first would silently round integer ids above 2**24.
x_uid = torch.tensor(df[['uid_le']].to_numpy(), dtype=torch.long)
x_mid = torch.tensor(df[['mid_le']].to_numpy(), dtype=torch.long)
y = torch.tensor(df[['accuracy']].to_numpy(), dtype=torch.float32)
ds = TensorDataset(x_uid, x_mid, y)

# 70 % train / 20 % validation / remainder test.
train_size = int(len(ds) * 0.7)
val_size = int(len(ds) * 0.2)
test_size = len(ds) - train_size - val_size
train_set, val_set, test_set = random_split(
    ds,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_dl = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
val_dl = DataLoader(val_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
# Wrap the test *subset*; passing `test_size` (an int) is not a valid dataset.
test_dl = DataLoader(test_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

## Create Net Wrapper

PyTorch-Lightning wrapper makes it easier to train & visualize.

In [8]:
import pytorch_lightning as pl


# SEE Desmos: https://www.desmos.com/calculator/f7tlysna7c
def adj_inv_sigmoid(x):
    """Stretch ``x`` with a shifted/scaled logit: ``logit(x / 2.5 + 0.5)``.

    The result is finite for ``x`` strictly inside ``(-1.25, 1.25)`` and is
    the inverse of ``adj_sigmoid``.
    """
    # Rescale so that x = 0 lands on probability 0.5.
    prob = (x / 2.5) + 0.5
    inverse_odds = 1 / prob - 1
    return -torch.log(inverse_odds)


def adj_sigmoid(x):
    """Squash ``x`` into ``(-1.25, 1.25)`` with a shifted/scaled sigmoid.

    Computes ``2.5 * (sigmoid(x) - 0.5)``, which is algebraically the same as
    ``-(0.5 * exp(-x) - 0.5) / (0.4 * (exp(-x) + 1))`` and is the inverse of
    ``adj_inv_sigmoid``. Using ``torch.sigmoid`` avoids evaluating ``exp(-x)``
    directly, which overflows to ``inf`` for large negative ``x`` and turns
    the quotient into NaN.
    """
    return 2.5 * (torch.sigmoid(x) - 0.5)



In [9]:
# Round-trip check: applying adj_sigmoid and then its inverse should give back 1.
probe = torch.ones([1])
adj_inv_sigmoid(adj_sigmoid(probe))

tensor([1.0000])

In [10]:


class LitNet(pl.LightningModule):
    """Lightning wrapper that trains ``WideDeepNet`` on (uid, mid, accuracy) batches.

    The network is fitted in ``adj_inv_sigmoid`` space with an RMSE loss, while
    the logged ``*_mae`` metrics are measured in the original target space by
    mapping predictions back through ``adj_sigmoid``.

    Args:
        uid_no: Number of distinct user ids.
        mid_no: Number of distinct map ids.
        deep_emb_dim: Width of the user and map embeddings.
        deep_chn_out: Output width of the deep MLP.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
    """

    def __init__(self, uid_no, mid_no, deep_emb_dim, deep_chn_out,
                 lr=0.0005, weight_decay=0.001):
        super().__init__()
        self.model = WideDeepNet(uid_no, mid_no, deep_emb_dim, deep_chn_out)
        self.lr = lr
        self.weight_decay = weight_decay

    def forward(self, uid, mid):
        """Delegate to the wrapped ``WideDeepNet``."""
        return self.model(uid, mid)

    def _predict_with_mae(self, batch):
        """Run the model on a batch and return ``(y_hat, y, mae)``.

        ``mae`` compares the prediction, mapped back with ``adj_sigmoid``,
        against the raw target ``y``. This equals the former
        ``adj_sigmoid(adj_inv_sigmoid(y))`` comparison without the round trip.
        """
        x_uid, x_mid, y = batch
        y_hat = self(x_uid, x_mid)
        mae = torch.abs(adj_sigmoid(y_hat) - y).mean()
        return y_hat, y, mae

    def training_step(self, batch, batch_idx):
        """Return the RMSE in transformed space and log ``train_mae``."""
        y_hat, y, mae = self._predict_with_mae(batch)
        y_adj = adj_inv_sigmoid(y)
        loss = torch.sqrt(((y_hat - y_adj) ** 2).mean())
        self.log("train_mae", mae)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_mae`` (the metric monitored for early stopping)."""
        _, _, mae = self._predict_with_mae(batch)
        self.log("val_mae", mae)

    def test_step(self, batch, batch_idx):
        """Log ``test_mae`` so the held-out loader can be used with ``trainer.test``."""
        _, _, mae = self._predict_with_mae(batch)
        self.log("test_mae", mae)

    def configure_optimizers(self):
        """Adam over all parameters with the configured ``lr`` / ``weight_decay``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)

## Training

In [11]:
from pytorch_lightning.callbacks import EarlyStopping

# Stop once `val_mae` has not improved for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(monitor="val_mae", min_delta=0.00, patience=3, verbose=False, mode="min")

# The number of distinct ids sizes the one-hot vectors and embedding tables.
uids = df['uid'].unique()
mids = df['mid'].unique()
uid_no = len(uids)
mid_no = len(mids)

net = LitNet(uid_no, mid_no, 16, 16)
trainer = pl.Trainer(max_epochs=40,
                     # 'auto' uses the GPU when one is available and falls back
                     # to CPU otherwise, instead of failing on CPU-only machines.
                     accelerator='auto',
                     callbacks=[early_stop_callback]
                     )
trainer.fit(net, train_dataloaders=train_dl, val_dataloaders=val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params
--------------------------------------
0 | model | WideDeepNet | 174 K 
--------------------------------------
174 K     Trainable params
0         Non-trainable params
174 K     Total params
0.699     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
