In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import gc
import seaborn as sns
from typing import List, Optional
import torch
from optuna.integration import PyTorchLightningPruningCallback
import pytorch_lightning as pl
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
from torchmetrics import AUROC

from utilities import (
    RANDOM_STATE, TARGET_COL, N_FOLD, FOLD_STRAT_NAME,
    EPOCHS, BATCH_SIZE, LEARNING_RATE, WEIGHT_DECAY, 
    EARLY_STOPPING_STEPS, EARLY_STOP
)

from nn_utilities import (
    seed_everything, TabularDataset, InferenceDataset, run_training, inference_fn, Model_ff
)

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_PATH = '../input/tabular-playground-series-oct-2021'
PATH_NOTEBOOK = '../input/preprocess-gpu'

In [2]:
train = pd.read_pickle(os.path.join(PATH_NOTEBOOK, 'train_scaled.pkl'))

In [3]:
with open(os.path.join(PATH_NOTEBOOK, 'feature_dic.pkl'), 'rb') as file:
    feature_dic = pickle.load(file)

In [7]:
#CONSTANT
FEATURE = feature_dic['feature']
CAT_COL = feature_dic['categorical']
NUMERIC_COL = feature_dic['numerical']

FOLD_LIST = list(range(N_FOLD))

gc.collect()

70

In [8]:
class TabularDataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        item = [
            torch.tensor(self.features[idx, :], dtype=torch.float),
            torch.tensor(self.targets[idx], dtype=torch.float)
        ]
        return item

In [9]:
class DataModule(pl.LightningDataModule):
    def __init__(self, batch_size: int):
        super().__init__()
        self.batch_size = batch_size

    def train_dataloader(self) -> DataLoader:
        return DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=True, pin_memory=True,
            num_workers=os.cpu_count(), drop_last=True,
        )
    
    def val_dataloader(self) -> DataLoader:
        return DataLoader(
            valid_dataset, batch_size=self.batch_size, shuffle=False, pin_memory=True,
            num_workers=os.cpu_count(), drop_last=False,
        )

In [27]:
def objective(trial: optuna.trial.Trial) -> float:
    max_epochs = 100
    steps_per_epoch = int(len(train_dataset)/max_epochs)

    # We optimize the number of layers, hidden units in each layer and dropouts.
    n_layers = trial.suggest_int("n_layers", 1, 6)
    dropout_list = [
        trial.suggest_float("dropout_{}".format(i), 0, 0.5) for i in range(n_layers)
    ]
    output_dims = [
        trial.suggest_int("n_units_l{}".format(i), 4, 1200, log=True) for i in range(n_layers)
    ]

    lr = trial.suggest_float("lr", 1e-6, 1e-3)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-2)
    
    datamodule = DataModule(batch_size = BATCH_SIZE)
    model = LightningNet(output_dims, dropout_list, lr, weight_decay, num_features = len(FEATURE),
                         epoch = max_epochs, steps_per_epoch = steps_per_epoch)
    
    early_stop_callback = pl.callbacks.EarlyStopping(monitor="val_auc", patience = 5, verbose=False, mode="max")
    pruning_callback = PyTorchLightningPruningCallback(trial, monitor="val_auc")
    
    trainer = pl.Trainer(
        logger=True,
        checkpoint_callback=False,
        max_epochs=max_epochs,
        gpus=1 if torch.cuda.is_available() else None,
        progress_bar_refresh_rate=0,
        callbacks=[early_stop_callback, pruning_callback],
    )
    hyperparameters = dict(n_layers=n_layers, dropout_list=dropout_list, output_dims=output_dims, 
                           lr = lr, weight_decay = weight_decay)
    
    trainer.logger.log_hyperparams(hyperparameters)
    trainer.fit(model, datamodule=datamodule)
    
    return trainer.callback_metrics["val_auc"].item()


In [16]:
class Model(nn.Module):
    def __init__(self, output_dims: List[int], dropout_list: List[float], num_features):
        super().__init__()
        
        layers: List[nn.Module] = []

        input_dim: int = num_features
        for i, output_dim in enumerate(output_dims):
            layers.append(nn.BatchNorm1d(input_dim))
            layers.append(nn.Linear(input_dim, output_dim))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout_list[i]))
            input_dim = output_dim
        
        layers.append(nn.BatchNorm1d(input_dim))
        layers.append(nn.Linear(input_dim, 1))
    
        self.layers: nn.Module = nn.Sequential(*layers)
    def forward(self, data: torch.Tensor) -> torch.Tensor:
        logits = self.layers(data).squeeze(1)
        return logits
    
    
class LightningNet(pl.LightningModule):
    def __init__(self, output_dims: List[int], dropout_list: List[float], lr, weight_decay, num_features,
                epoch, steps_per_epoch):
        super().__init__()
        self.model = Model(output_dims = output_dims, dropout_list = dropout_list, num_features = num_features)
        self.lr = lr
        self.weight_decay = weight_decay
        self.steps_per_epoch = steps_per_epoch
        self.epoch = epoch
        self.auc_metric = AUROC(pos_label = 1)

    def step(self, batch):
        # return batch loss
        data, target  = batch
        pred = self(data)
        loss  = F.binary_cross_entropy_with_logits(pred, target)
        return loss, target, pred.sigmoid()

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        return self.model(data)

    def training_step(self, batch, batch_idx: int) -> torch.Tensor:
        
        loss, target, prob_pred = self.step(batch)
        auc = self.auc_metric(preds = prob_pred, target = target.long())
        tensorboard_logs = {'train_loss': loss, 'auc': auc}
        
        return {'loss': loss, 'auc': auc, 'log': tensorboard_logs}
    
    def validation_step(self, batch, batch_idx: int) -> None:
        loss, target, prob_pred = self.step(batch)
        return {'val_loss': loss, 'target': target.detach(), 'prob_pred': prob_pred.detach()}


    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        target = torch.cat([x['target'] for x in outputs])
        prob_pred = torch.cat([x['prob_pred'] for x in outputs])
        
        auc = self.auc_metric(preds = prob_pred, target = target.long())
        tensorboard_logs = {'val_loss': avg_loss, 'val_auc': auc}
        return {
            'avg_val_loss': avg_loss, 'val_auc': auc, 'log': tensorboard_logs
        }
    

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr = self.lr, weight_decay = self.weight_decay)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            max_lr=1e-2,
            steps_per_epoch=self.steps_per_epoch,
            epochs=self.epoch,
            optimizer=optimizer,
        )
        return [optimizer], [scheduler]


In [28]:
class Model(nn.Module):
    def __init__(self, output_dims: List[int], dropout_list: List[float], num_features):
        super().__init__()
        
        layers: List[nn.Module] = []

        input_dim: int = num_features
        for i, output_dim in enumerate(output_dims):
            layers.append(nn.BatchNorm1d(input_dim))
            layers.append(nn.Linear(input_dim, output_dim))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout_list[i]))
            input_dim = output_dim
        
        layers.append(nn.BatchNorm1d(input_dim))
        layers.append(nn.Linear(input_dim, 1))
    
        self.layers: nn.Module = nn.Sequential(*layers)
    def forward(self, data: torch.Tensor) -> torch.Tensor:
        logits = self.layers(data).squeeze(1)
        return logits
    
    
class LightningNet(pl.LightningModule):
    def __init__(self, output_dims: List[int], dropout_list: List[float], lr, weight_decay, num_features,
                epoch, steps_per_epoch):
        super().__init__()
        self.model = Model(output_dims = output_dims, dropout_list = dropout_list, num_features = num_features)
        self.lr = lr
        self.weight_decay = weight_decay
        self.steps_per_epoch = steps_per_epoch
        self.epoch = epoch
        self.auc_metric = AUROC(pos_label = 1)

    def step(self, batch):
        # return batch loss
        data, target  = batch
        pred = self(data)
        loss  = F.binary_cross_entropy_with_logits(pred, target)
        return loss, target, pred.sigmoid()

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        return self.model(data)

    def training_step(self, batch, batch_idx: int) -> torch.Tensor:
        loss, _, _ = self.step(batch)
        return loss

    def validation_step(self, batch, batch_idx: int) -> None:
        loss, target, pred_prob = self.step(batch)
        auc = self.auc_metric(pred_prob, target.long())
        self.log("val_auc", auc)
        self.log("hp_metric", auc, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr = self.lr, weight_decay = self.weight_decay)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            max_lr=1e-2,
            steps_per_epoch=self.steps_per_epoch,
            epochs=self.epoch,
            optimizer=optimizer,
        )
        return [optimizer], [scheduler]



In [29]:
#train test split for optuna-study
train_x, test_x, train_y, test_y = train_test_split(
    train[FEATURE], train[TARGET_COL], random_state = RANDOM_STATE, 
    stratify = train[TARGET_COL], test_size = .75
)


train_dataset = TabularDataset(train_x.values, train_y.values)
valid_dataset = TabularDataset(test_x.values, test_y.values)

gc.collect()

751

In [30]:
pruner = optuna.pruners.MedianPruner()

study = optuna.create_study(direction="maximize", pruner=pruner)
study.optimize(objective, timeout=30500, show_progress_bar = True)

[32m[I 2021-10-14 17:38:06,235][0m A new study created in memory with name: no-name-a15bb043-4871-4b27-b2c7-8ce2a3c973f6[0m
  self._init_valid()


0it [00:00, ?it/s]

  step
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


[32m[I 2021-10-14 17:42:53,538][0m Trial 0 finished with value: 0.8408616781234741 and parameters: {'n_layers': 2, 'dropout_0': 0.43849223366952667, 'dropout_1': 0.3001934981760253, 'n_units_l0': 9, 'n_units_l1': 28, 'lr': 0.0004553089117116581, 'weight_decay': 0.0028698355585040853}. Best is trial 0 with value: 0.8408616781234741.[0m


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


[33m[W 2021-10-14 17:43:04,152][0m Trial 1 failed because of the following error: KeyError('val_auc')[0m
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_40/691249158.py", line 38, in objective
    return trainer.callback_metrics["val_auc"].item()
KeyError: 'val_auc'


KeyError: 'val_auc'

In [None]:
best_score = study.best_trial.values
print(best_score)

In [None]:
final_params = study.best_trial.params
print(final_params)

In [None]:
with open("final_nn_param.pkl", "wb") as file_name:
    pickle.dump(final_params, file_name)
