In [1]:
!nvidia-smi

import os
import warnings
from IPython.display import clear_output

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
warnings.filterwarnings('ignore')

Sun Nov 12 11:41:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   31C    P0    59W / 400W |   7237MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install pytorch_lightning torchfm
!pip install protobuf==3.20.*
clear_output()

In [28]:
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import e
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from transformers import get_cosine_schedule_with_warmup
import itertools
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score, log_loss
pl.seed_everything(56)

Seed set to 56


56

In [4]:
class CFG:
    wandb=False
    num_workers=12
    train_path='/notebooks/train.gz'
    hidden_size = 512
    val_split_size = 0.2
    num_labels = 2
    scheduler='cosine'
    max_epoches=15
    lr=1e-3
    min_lr=5e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=2048
    weight_decay=0.01
    gradient_accumulation_steps=1
    seed=56

In [5]:
class PLDataset(torch.utils.data.Dataset):
    def __init__(self, df,features,label):
        super().__init__()
        self.cfg = CFG()
        self.data = df
        self.features_col = features
        self.label_col = label
    def __getitem__(self, index):
        features = self.data.iloc[index][self.features_col].values.astype(np.int32)
        label = self.data.iloc[index][self.label_col]
        return features,label
    def __len__(self):
        return len(self.data)

In [6]:
class PLDataModule(pl.LightningDataModule):
    def __init__(self,df,features,label):
        super().__init__()
        self.cfg = CFG()
        self.df = df
        self.features = features
        self.label = label
        self.train_dataset_path = self.cfg.train_path
        self.val_split_size = self.cfg.val_split_size
        self.batch_size = self.cfg.batch_size
        self.num_workers = self.cfg.num_workers
        self.is_setup = False
    def prepare_data(self):
        pass
    
    def setup(self, stage: str):
        if self.is_setup:
            return None
        self.train_df, self.val_df = train_test_split(self.df, test_size=self.val_split_size,random_state=self.cfg.seed)
        #self.train_df = self.df[:32343173]
        #self.val_df = self.df[32343173:]
        self.train_dataset = PLDataset(self.train_df,self.features,self.label)
        self.val_dataset = PLDataset(self.val_df,self.features,self.label)
       # self.predict_dataset = PLDataset(self.test_df,self.features,self.label)
        self.is_setup = True
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers)

    def predict_dataloader(self):
        return DataLoader(self.predict_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle = False)

In [7]:
class PLModule(pl.LightningModule):
    def __init__(self,):
        super().__init__()
        self.cfg = CFG()
        self.embedings = nn.ModuleList([nn.Embedding(num_emb,emb_dim)
                                       for num_emb,emb_dim in zip(self.cfg.embed_num,self.cfg.embed_dim)])
        self.mlp = nn.Sequential(nn.Linear(self.cfg.embed_size,self.cfg.hidden_size),
                                 nn.GELU(),
                                 nn.LayerNorm(self.cfg.hidden_size),
                                 nn.Linear(self.cfg.hidden_size,self.cfg.embed_size),
                                 nn.GELU(),
                                 nn.LayerNorm(self.cfg.embed_size),
                                 nn.Linear(self.cfg.embed_size,1),)
        self.criterion = nn.BCEWithLogitsLoss()
        self.val_targets = []
        self.val_preds = []

    def forward(self, x):
        features = torch.cat([emb(x_i.int()) for emb,x_i in zip(self.embedings,x.T)],dim=-1)
        features = self.mlp(features)
        return features.squeeze(1)
    
    def training_step(self, batch, _):
        x, targets = batch
        logits = self(x).to(torch.float64)
        loss = self.criterion(logits, targets.to(torch.float64))
        return loss
        
    def validation_step(self, batch, _):
        x, targets = batch
        logits = self(x).cpu().detach().tolist()
        self.val_targets += targets.tolist()
        self.val_preds += logits
        
    def predict_step(self, batch, _):
        x, targets = batch
        logits = self(x).cpu().detach().tolist()
        return logits
        
    def calc_metric(self):
        metrics = {}
        metrics['auc'] = roc_auc_score(self.val_targets,self.val_preds)
        metrics['log_loss'] = log_loss(self.val_targets,self.val_preds)
        return metrics
        
    def on_validation_epoch_end(self):
        print(self.calc_metric())
        self.val_targets, self.val_preds = [],[]
            
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(),
                                 self.cfg.lr,
                                 weight_decay=self.cfg.weight_decay,
                                 betas = self.cfg.betas
                                )

In [8]:
df = pd.read_csv("train.gz")[:10_000_000]

In [9]:
cat_features = ['hour','C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
features = cat_features
label_col = 'click'

In [10]:
class MultyLabelEncoder():
    def __init__(self,cat_cols):
        self.cat_cols = cat_cols
    
    def fit_transform(self,df):
        self.encoders = []
        for col in tqdm(self.cat_cols):
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            self.encoders += [le]
        return df
    
    def transform(self,df):
        for col,le in tqdm(zip(self.cat_cols,self.encoders),total=len(self.cat_cols)):
            df[col] = le.transform(df[col])
        return df

In [11]:
le = MultyLabelEncoder(cat_features)
df = le.fit_transform(df)

  0%|          | 0/22 [00:00<?, ?it/s]

In [12]:
CFG.embed_num = [df[col].nunique() for col in tqdm(cat_features)]

  0%|          | 0/22 [00:00<?, ?it/s]

In [13]:
CFG.embed_num

[53,
 7,
 7,
 3496,
 4585,
 23,
 5469,
 390,
 33,
 786741,
 2129662,
 6863,
 4,
 4,
 1030,
 8,
 9,
 226,
 4,
 47,
 168,
 42]

In [16]:
CFG.embed_dim = [16,4,4,256,256,16,128,32,16,512,512,256,2,2,64,4,4,32,4,32,32,8]#[64] * len(CFG.embed_num)
CFG.embed_size = sum(CFG.embed_dim)
CFG.hidden_size = CFG.embed_size * 2

In [17]:
dm = PLDataModule(df,features,label_col)

In [18]:
pl_model = PLModule()

In [19]:
CFG.embed_size

2192

In [21]:
logger = pl.loggers.TensorBoardLogger("tb_logs", name="text_cls")
trainer = pl.Trainer(
    accelerator="gpu",
    logger=logger,
    max_epochs=15,
    log_every_n_steps=1,
    val_check_interval=0.5
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
trainer.fit(pl_model, datamodule=dm)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params
------------------------------------------------
0 | embedings | ModuleList        | 1.5 B 
1 | mlp       | Sequential        | 19.2 M
2 | criterion | BCEWithLogitsLoss | 0     
------------------------------------------------
1.5 B     Trainable params
0         Non-trainable params
1.5 B     Total params
6,068.241 Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

{'auc': 0.5140414750658512, 'log_loss': 3.866926615381203}


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

{'auc': 0.7619042285813155, 'log_loss': 5.324592765281096}


Validation: |          | 0/? [00:00<?, ?it/s]

In [24]:
preds = trainer.predict(pl_model,dm.val_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [26]:
y_p = np.concatenate(preds)

In [29]:
def sigmoid(x):
    return 1 / (1 + e**(-1 * x))

In [30]:
y_p = [sigmoid(x) for x in y_p]

In [33]:
dm.val_df['click']

6141725    0
8317400    0
6616162    0
7676727    0
5759427    0
          ..
3337546    0
8470632    0
4909823    1
4523224    0
9868897    1
Name: click, Length: 2000000, dtype: int64

In [34]:
roc_auc_score(dm.val_df['click'],y_p)

0.7759941672902971