In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import gc
import seaborn as sns
import optuna
from sklearn.model_selection import train_test_split
import torch
from typing import List, Optional
from torch import nn
from torch import optim
from sklearn.metrics import roc_auc_score

from utilities import (
    RANDOM_STATE, TARGET_COL, N_FOLD, FOLD_STRAT_NAME, REDUCED_FOLD_NAME,
    EPOCHS, BATCH_SIZE, LEARNING_RATE, WEIGHT_DECAY, 
    EARLY_STOPPING_STEPS, EARLY_STOP
)

from nn_utilities import (
    seed_everything, TabularDataset, InferenceDataset, run_training, inference_fn, Model_ff, train_fn, valid_fn
)

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_PATH = '../input/tabular-playground-series-oct-2021'
PATH_NOTEBOOK = '../input/preprocess-gpu'

In [2]:
train = pd.read_pickle(os.path.join(PATH_NOTEBOOK, 'train_scaled.pkl'))

In [3]:
with open(os.path.join(PATH_NOTEBOOK, 'feature_dic.pkl'), 'rb') as file:
    feature_dic = pickle.load(file)

In [4]:
#CONSTANT
FEATURE = feature_dic['feature']
CAT_COL = feature_dic['categorical']
NUMERIC_COL = feature_dic['numerical']

FOLD_LIST = list(range(N_FOLD))

gc.collect()

42

# TRAIN

In [5]:
num_feature = len(FEATURE)
hidden = int(len(FEATURE) * 2)

In [6]:
#train test split for optuna-study
train_x, valid_x, train_y, valid_y = train_test_split(
    train[FEATURE].values, train[TARGET_COL].values, random_state = RANDOM_STATE, 
    stratify = train[TARGET_COL].values, test_size = .75
)

gc.collect()

42

In [7]:
mask_train = (train[REDUCED_FOLD_NAME] == 0)
mask_test = (train[REDUCED_FOLD_NAME] == 1)


train_x, train_y = train.loc[mask_train, FEATURE].values, train.loc[mask_train, TARGET_COL].values
valid_x, valid_y = train.loc[mask_test, FEATURE].values, train.loc[mask_test, TARGET_COL].values

In [8]:
class Model_list(nn.Module):
    def __init__(self, output_dims: List[int], dropout_list: List[float], num_features):
        super().__init__()
        
        layers: List[nn.Module] = []

        input_dim: int = num_features
        for i, output_dim in enumerate(output_dims):
            layers.append(nn.BatchNorm1d(input_dim))
            layers.append(nn.Linear(input_dim, output_dim))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout_list[i]))
            input_dim = output_dim
        
        layers.append(nn.BatchNorm1d(input_dim))
        layers.append(nn.Linear(input_dim, 1))
    
        self.layers: nn.Module = nn.Sequential(*layers)
    def forward(self, data: torch.Tensor) -> torch.Tensor:
        logits = self.layers(data)
        return logits

In [9]:
def run_training_opt(train, valid, fold, model, batch_size, epochs, 
                 seed, early_stop_step, early_stop, device, learning_rate, weight_decay):

    assert isinstance(train, list) & isinstance(valid, list) & (len(train) == 2) & (len(valid) == 2)
    
    seed_everything(seed)
            
    x_train, y_train  = train
    x_valid, y_valid =  valid
    
    train_dataset = TabularDataset(x_train, y_train)
    valid_dataset = TabularDataset(x_valid, y_valid)
    
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
        
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=learning_rate*10, epochs=epochs, steps_per_epoch=len(trainloader))
    
    criterion = nn.BCEWithLogitsLoss()
    
    early_step = 0
    
    best_loss = np.inf
    best_auc = -np.inf
    
    for epoch in range(epochs):
        
        train_loss = train_fn(model, optimizer, scheduler, criterion, trainloader, device)
        valid_loss, valid_preds = valid_fn(model, criterion, validloader, device)

        valid_auc = roc_auc_score(y_valid, valid_preds)
                
        if valid_auc > best_auc:
            
            best_auc = valid_auc
            best_pred = valid_preds
                        
        elif(early_stop == True):
            
            early_step += 1
            if (early_step >= early_stop_step):
                break
        
    return best_auc


In [10]:
def objective(trial: optuna.trial.Trial) -> float:

    # We optimize the number of layers, hidden units in each layer and dropouts.
    n_layers = trial.suggest_int("n_layers", 1, 6)
    
    dropout_list = [
        trial.suggest_float("dropout_{}".format(i), 0, 0.5) for i in range(n_layers)
    ]
    
    output_dims = [
        trial.suggest_int("n_units_l{}".format(i), 4, 2000) for i in range(n_layers)
    ]

    model_ff = Model_list(output_dims, dropout_list, num_feature)
    
    auc = run_training_opt([train_x, train_y], [valid_x, valid_y], fold = -1, model = model_ff, batch_size = BATCH_SIZE, epochs = EPOCHS, 
                                     seed = RANDOM_STATE, early_stop_step = 3, early_stop = EARLY_STOP, 
                                     device = DEVICE, learning_rate = LEARNING_RATE, weight_decay = WEIGHT_DECAY)
    
    return auc

In [11]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=30000, show_progress_bar = True)

[32m[I 2021-10-17 06:36:01,416][0m A new study created in memory with name: no-name-c836d2fe-26ef-4482-89b7-d0ff077ae4e5[0m
  self._init_valid()


0it [00:00, ?it/s]

[32m[I 2021-10-17 06:36:23,934][0m Trial 0 finished with value: 0.836861864061864 and parameters: {'n_layers': 4, 'dropout_0': 0.37093467039947686, 'dropout_1': 0.221984112103435, 'dropout_2': 0.21975916609162593, 'dropout_3': 0.019915804528714498, 'n_units_l0': 1885, 'n_units_l1': 180, 'n_units_l2': 881, 'n_units_l3': 334}. Best is trial 0 with value: 0.836861864061864.[0m
[32m[I 2021-10-17 06:36:54,938][0m Trial 1 finished with value: 0.8380123444123444 and parameters: {'n_layers': 5, 'dropout_0': 0.43985609525445934, 'dropout_1': 0.4050704793242778, 'dropout_2': 0.25810207798617457, 'dropout_3': 0.44011540165622787, 'dropout_4': 0.24181001090869775, 'n_units_l0': 513, 'n_units_l1': 1026, 'n_units_l2': 1181, 'n_units_l3': 1646, 'n_units_l4': 1728}. Best is trial 1 with value: 0.8380123444123444.[0m
[32m[I 2021-10-17 06:37:12,340][0m Trial 2 finished with value: 0.8362698666698667 and parameters: {'n_layers': 2, 'dropout_0': 0.3916050844238443, 'dropout_1': 0.3393561036475946,

# Save result

In [12]:
best_score = study.best_trial.values
print(best_score)

[0.8411066339066339]


In [13]:
final_params = study.best_trial.params
print(final_params)

{'n_layers': 3, 'dropout_0': 0.12461594951685562, 'dropout_1': 0.24982079175712119, 'dropout_2': 0.1418559793954281, 'n_units_l0': 6, 'n_units_l1': 882, 'n_units_l2': 78}


In [14]:
with open("final_nn_param.pkl", "wb") as file_name:
    pickle.dump(final_params, file_name)