In [2]:
!nvidia-smi

import os
import warnings
from IPython.display import clear_output

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid for locating failing
# CUDA kernels and is expected to slow GPU work down - confirm it is still wanted for
# full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Silences every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

Sun Nov  5 18:10:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   28C    P0    57W / 400W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install pytorch_lightning torchfm
clear_output()

In [4]:
import itertools

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchfm.model.dfm import DeepFactorizationMachineModel
from tqdm.auto import tqdm
from transformers import AutoTokenizer
# Fix the global random seed for reproducibility. The literal matches CFG.seed (56),
# which is defined in a later cell and therefore cannot be referenced here.
pl.seed_everything(56)

Seed set to 56


56

In [5]:
class CFG:
    """Central experiment configuration, read through class attributes.

    Later cells attach ``embed_num``, ``embed_dim`` and ``embed_size`` to this class and
    overwrite ``hidden_size`` once the encoded data is available.
    """

    # ---- data loading / splitting (read by PLDataModule) ----
    train_path = '/notebooks/train.gz'
    val_split_size = 0.2
    batch_size = 2048
    num_workers = 12
    seed = 56

    # ---- model (read by PLModule) ----
    hidden_size = 512

    # ---- optimiser (read by PLModule.configure_optimizers) ----
    lr = 1e-3
    betas = (0.9, 0.999)
    weight_decay = 0.01

    # ---- settings not read by any cell visible in this notebook ----
    wandb = False
    num_labels = 2
    scheduler = 'cosine'
    max_epoches = 15
    min_lr = 5e-6
    eps = 1e-6
    gradient_accumulation_steps = 1

In [6]:
class PLDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(features, label)`` pairs from an encoded DataFrame.

    Args:
        df: DataFrame holding the feature columns and the label column.
        features: list of column names used as model inputs; values must be castable
            to ``int32``.
        label: name of the target column.

    ``__getitem__`` returns a 1-D ``int32`` NumPy array with one value per feature
    column, and the label value of the same row.
    """

    def __init__(self, df, features, label):
        super().__init__()
        self.cfg = CFG()
        self.data = df
        self.features_col = features
        self.label_col = label
        # Convert the needed columns to NumPy once. Going through `df.iloc[index]` on
        # every access builds a pandas Series per sample (twice), which dominates the
        # data-loading time; indexing a pre-built array is a plain memory lookup.
        # This costs one int32 copy of the feature columns.
        self._features = df[features].to_numpy(dtype=np.int32)
        self._labels = df[label].to_numpy()

    def __getitem__(self, index):
        # Positional access, same semantics as the DataFrame's `.iloc[index]`.
        return self._features[index], self._labels[index]

    def __len__(self):
        return len(self._labels)

In [7]:
class PLDataModule(pl.LightningDataModule):
    """Lightning data module that splits one DataFrame into train / validation sets.

    Args:
        df: encoded DataFrame containing the feature and label columns.
        features: list of feature column names passed on to ``PLDataset``.
        label: name of the target column.
        test_df: optional DataFrame used for ``trainer.predict``. It must contain the
            same feature columns and the label column, because ``PLDataset`` reads both.
            When omitted, ``predict_dataloader`` raises a descriptive error.
    """

    def __init__(self, df, features, label, test_df=None):
        super().__init__()
        self.cfg = CFG()
        self.df = df
        self.test_df = test_df
        self.features = features
        self.label = label
        self.train_dataset_path = self.cfg.train_path
        self.val_split_size = self.cfg.val_split_size
        self.batch_size = self.cfg.batch_size
        self.num_workers = self.cfg.num_workers
        # Always defined so predict_dataloader() can report a clear error instead of
        # failing with AttributeError.
        self.predict_dataset = None
        self.is_setup = False

    def prepare_data(self):
        pass

    def setup(self, stage: str):
        """Build the datasets once; later calls (for other stages) are no-ops."""
        if self.is_setup:
            return None
        self.train_df, self.val_df = train_test_split(
            self.df,
            test_size=self.val_split_size,
            random_state=self.cfg.seed,
        )
        self.train_dataset = PLDataset(self.train_df, self.features, self.label)
        self.val_dataset = PLDataset(self.val_df, self.features, self.label)
        if self.test_df is not None:
            self.predict_dataset = PLDataset(self.test_df, self.features, self.label)
        self.is_setup = True

    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers)

    def predict_dataloader(self):
        """Return the prediction loader.

        Raises:
            RuntimeError: if no prediction dataset has been built.
        """
        if self.predict_dataset is None:
            raise RuntimeError(
                "predict_dataset is not available: pass test_df=... when constructing "
                "PLDataModule and make sure setup() has run."
            )
        return DataLoader(self.predict_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle=False)

In [8]:
class PLModule(pl.LightningModule):
    """Embedding + MLP click model trained with binary cross-entropy on logits.

    One ``nn.Embedding`` is created per categorical field, sized from
    ``CFG.embed_num`` / ``CFG.embed_dim``. The embeddings of a row are concatenated and
    passed through a two-layer MLP producing a single logit. ``CFG.embed_num``,
    ``CFG.embed_dim`` and ``CFG.embed_size`` must be set before instantiation.
    """

    def __init__(self,):
        super().__init__()
        self.cfg = CFG()
        self.embedings = nn.ModuleList([nn.Embedding(num_emb, emb_dim)
                                        for num_emb, emb_dim in zip(self.cfg.embed_num, self.cfg.embed_dim)])
        self.mlp = nn.Sequential(nn.Linear(self.cfg.embed_size, self.cfg.hidden_size),
                                 nn.GELU(),
                                 nn.LayerNorm(self.cfg.hidden_size),
                                 nn.Linear(self.cfg.hidden_size, 1))
        self.criterion = nn.BCEWithLogitsLoss()
        # Accumulated across validation batches, cleared in on_validation_epoch_end.
        self.val_targets = []
        self.val_preds = []

    def forward(self, x):
        """Return one logit per row of ``x`` (expected shape ``(batch, n_fields)``)."""
        # x.T iterates over fields, pairing column i with embedding table i.
        features = torch.cat([emb(x_i.int()) for emb, x_i in zip(self.embedings, x.T)], dim=-1)
        # Squeeze only the trailing size-1 logit dimension. A bare .squeeze() would also
        # drop the batch dimension when the batch holds a single row, turning the
        # output into a 0-d tensor and breaking the list accumulation in validation.
        return self.mlp(features).squeeze(-1)

    def training_step(self, batch, _):
        x, targets = batch
        logits = self(x).to(torch.float64)
        loss = self.criterion(logits, targets.to(torch.float64))
        # Send the loss to the configured logger; nothing was logged otherwise.
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, _):
        x, targets = batch
        logits = self(x).cpu().detach().tolist()
        self.val_targets += targets.tolist()
        self.val_preds += logits

    def predict_step(self, batch, _):
        x, targets = batch
        logits = self(x).cpu().detach().tolist()
        return logits

    def calc_metric(self):
        """ROC AUC over everything accumulated since the last validation epoch end."""
        return roc_auc_score(self.val_targets, self.val_preds)

    def on_validation_epoch_end(self):
        auc = self.calc_metric()
        print(auc)
        self.log("val_auc", auc)
        self.val_targets, self.val_preds = [], []

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(),
                                 self.cfg.lr,
                                 weight_decay=self.cfg.weight_decay,
                                 betas=self.cfg.betas
                                 )

In [9]:
# Load the raw training table; pandas picks the compression from the ".gz" suffix.
# NOTE(review): CFG.train_path is '/notebooks/train.gz' while this reads the relative
# path "train.gz" - confirm both refer to the same file and consider reading from CFG.
df = pd.read_csv("train.gz")

In [10]:
# Every model input is handled as a categorical field; the order of this list fixes
# the column order of the feature matrix and therefore which embedding table each
# column is paired with.
cat_features = [
    'hour', 'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]
features = cat_features
label_col = 'click'

In [11]:
class MultyLabelEncoder():
    """Apply one ``LabelEncoder`` per column to a list of DataFrame columns.

    Both methods overwrite the listed columns of the DataFrame they receive and return
    that same DataFrame object. ``transform`` requires a prior ``fit_transform`` call,
    which is what creates ``self.encoders``.
    """

    def __init__(self, cat_cols):
        self.cat_cols = cat_cols

    def fit_transform(self, df):
        """Fit a fresh encoder on each column and replace the column with its codes."""
        self.encoders = []
        for column in tqdm(self.cat_cols):
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            self.encoders.append(encoder)
        return df

    def transform(self, df):
        """Replace each column with codes from the already fitted encoders."""
        fitted = zip(self.cat_cols, self.encoders)
        for column, encoder in tqdm(fitted, total=len(self.cat_cols)):
            df[column] = encoder.transform(df[column])
        return df

In [12]:
# Integer-encode every categorical column. fit_transform overwrites the columns of
# `df` in place, so the raw values are no longer available after this cell.
le = MultyLabelEncoder(cat_features)
df = le.fit_transform(df)

  0%|          | 0/22 [00:00<?, ?it/s]

In [13]:
# Number of distinct codes per categorical column, in cat_features order. Stored on the
# CFG class so that PLModule can size one embedding table per column.
CFG.embed_num = [df[col].nunique() for col in tqdm(cat_features)]

  0%|          | 0/22 [00:00<?, ?it/s]

In [14]:
# Every field gets a 64-dimensional embedding.
CFG.embed_dim = [64] * len(CFG.embed_num)
# Width of the concatenated embeddings, i.e. the MLP input size used by PLModule.
CFG.embed_size = sum(CFG.embed_dim)
# Overrides the default hidden_size (512) declared in the CFG class body.
CFG.hidden_size = CFG.embed_size * 2

In [15]:
# Data module over the encoded frame; the train/validation split happens in setup().
dm = PLDataModule(df,features,label_col)

In [None]:
class AFMLayer(nn.Module):
    """Attention-weighted pooling over pairwise products of field embeddings.

    Args:
        in_features: size of each field embedding (last dimension of every input).
        attention_factor: width of the hidden attention projection.
        l2_reg_w: stored on the instance; not used inside this class.
        dropout_rate: dropout probability applied to the pooled interaction vector.
        seed: stored on the instance; not used inside this class.
        device: device the layer's parameters are moved to.

    ``forward`` takes a sequence of at least two tensors and returns the pooled
    interactions projected to one value per sample. The attention weights of the last
    call are kept in ``self.normalized_att_score``.
    """

    def __init__(self, in_features, attention_factor=4, l2_reg_w=0, dropout_rate=0, seed=1024, device='cpu'):
        super(AFMLayer, self).__init__()
        self.attention_factor = attention_factor
        self.l2_reg_w = l2_reg_w
        self.dropout_rate = dropout_rate
        self.seed = seed
        embedding_size = in_features

        # torch.empty makes the "allocated, initialised just below" intent explicit
        # (the legacy torch.Tensor(...) constructor also returns uninitialised memory).
        self.attention_W = nn.Parameter(torch.empty(
            embedding_size, self.attention_factor))

        self.attention_b = nn.Parameter(torch.empty(self.attention_factor))

        self.projection_h = nn.Parameter(
            torch.empty(self.attention_factor, 1))

        self.projection_p = nn.Parameter(torch.empty(embedding_size, 1))

        for tensor in [self.attention_W, self.projection_h, self.projection_p]:
            nn.init.xavier_normal_(tensor, )

        for tensor in [self.attention_b]:
            nn.init.zeros_(tensor, )

        self.dropout = nn.Dropout(dropout_rate)

        self.to(device)

    def forward(self, inputs):
        """Compute the attention-pooled pairwise interaction score.

        Args:
            inputs: sequence of field embeddings. The pair axis is built by
                concatenating along dim=1, so each tensor is presumably shaped
                ``(batch, 1, in_features)`` - confirm against the caller.

        Returns:
            Tensor with the pooled interactions projected onto ``projection_p``.

        Raises:
            ValueError: if fewer than two embeddings are given (no pair can be formed).
        """
        embeds_vec_list = list(inputs)
        if len(embeds_vec_list) < 2:
            raise ValueError(
                "AFMLayer needs at least two field embeddings to form pairwise interactions"
            )

        # All unordered pairs (i < j) of fields, split into left and right operands.
        row, col = zip(*itertools.combinations(embeds_vec_list, 2))

        p = torch.cat(row, dim=1)
        q = torch.cat(col, dim=1)
        bi_interaction = p * q  # element-wise product of every field pair

        attention_temp = F.relu(torch.tensordot(
            bi_interaction, self.attention_W, dims=([-1], [0])) + self.attention_b)

        # Softmax over dim=1 normalises the scores across the pair axis.
        self.normalized_att_score = F.softmax(torch.tensordot(
            attention_temp, self.projection_h, dims=([-1], [0])), dim=1)
        attention_output = torch.sum(
            self.normalized_att_score * bi_interaction, dim=1)

        attention_output = self.dropout(attention_output)  # active in training mode only

        afm_out = torch.tensordot(
            attention_output, self.projection_p, dims=([-1], [0]))
        return afm_out

In [20]:
# Requires CFG.embed_num / embed_dim / embed_size to have been set by the earlier cells.
pl_model = PLModule()

In [21]:
# Display the module structure (embedding table sizes and MLP layers).
pl_model

PLModule(
  (embedings): ModuleList(
    (0): Embedding(240, 64)
    (1): Embedding(7, 64)
    (2): Embedding(7, 64)
    (3): Embedding(4737, 64)
    (4): Embedding(7745, 64)
    (5): Embedding(26, 64)
    (6): Embedding(8552, 64)
    (7): Embedding(559, 64)
    (8): Embedding(36, 64)
    (9): Embedding(2686408, 64)
    (10): Embedding(6729486, 64)
    (11): Embedding(8251, 64)
    (12): Embedding(5, 64)
    (13): Embedding(4, 64)
    (14): Embedding(2626, 64)
    (15): Embedding(8, 64)
    (16): Embedding(9, 64)
    (17): Embedding(435, 64)
    (18): Embedding(4, 64)
    (19): Embedding(68, 64)
    (20): Embedding(172, 64)
    (21): Embedding(60, 64)
  )
  (mlp): Sequential(
    (0): Linear(in_features=1408, out_features=2816, bias=True)
    (1): GELU(approximate=none)
    (2): LayerNorm((2816,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=2816, out_features=1, bias=True)
  )
  (criterion): BCEWithLogitsLoss()
)

In [24]:
# TensorBoard logger created with "tb_logs" as its first argument and run name "text_cls".
logger = pl.loggers.TensorBoardLogger("tb_logs", name="text_cls")
trainer = pl.Trainer(
    accelerator="gpu",
    logger=logger,
    # Read the epoch budget from the config (15) instead of repeating the literal here,
    # so CFG stays the single place to tune it.
    max_epochs=CFG.max_epoches,
    log_every_n_steps=1,
    val_check_interval=0.2
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the model; the train/validation loaders come from the data module. The numbers
# printed in the output come from PLModule.on_validation_epoch_end (validation ROC AUC).
trainer.fit(pl_model, datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params
------------------------------------------------
0 | embedings | ModuleList        | 604 M 
1 | mlp       | Sequential        | 4.0 M 
2 | criterion | BCEWithLogitsLoss | 0     
------------------------------------------------
608 M     Trainable params
0         Non-trainable params
608 M     Total params
2,434.963 Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

0.4561391262436454


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

0.7556783052751697


In [None]:
# Scratch note, kept as a comment: as a bare expression this undefined name raised a
# NameError and made "Restart & Run All" fail on the last cell.
# It looks like a CUDA kernel name copied from an error message (unconfirmed):
#   nll_loss_forward_reduce_cuda_kernel_2d_index