In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import gc
import seaborn as sns
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

from utilities import (
    RANDOM_STATE, TARGET_COL, N_FOLD, FOLD_STRAT_NAME,
    EPOCHS, BATCH_SIZE, 
    EARLY_STOPPING_STEPS, EARLY_STOP
)

from nn_utilities import (
    seed_everything, run_training_model_fix, inference_fn
)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Relative input locations used by the cells below.
INPUT_PATH = '../input/tabular-playground-series-oct-2021'  # sample_submission.csv is read from here
PATH_NOTEBOOK = '../input/preprocess-category-gpu'  # pickled train/test frames and feature_dic.pkl are read from here

In [2]:
# Load the training frame stored under PATH_NOTEBOOK.
# NOTE(review): read_pickle executes pickle data - only point this at trusted files.
train_pickle_path = os.path.join(PATH_NOTEBOOK, 'train_unscaled.pkl')
train = pd.read_pickle(train_pickle_path)

In [3]:
# Load the dictionary that holds the feature-name lists used below
# (keys read later: 'feature', 'categorical', 'numerical').
# NOTE(review): pickle.load can run arbitrary code - only load trusted files.
feature_dic_path = os.path.join(PATH_NOTEBOOK, 'feature_dic.pkl')
with open(feature_dic_path, 'rb') as file:
    feature_dic = pickle.load(file)

In [4]:
#CONSTANT
# Column-name collections taken from the pickled feature dictionary.
FEATURE, CAT_COL, NUMERIC_COL = (
    feature_dic['feature'],
    feature_dic['categorical'],
    feature_dic['numerical'],
)

# Fold identifiers 0 .. N_FOLD-1, iterated by the training and blending loops.
FOLD_LIST = [*range(N_FOLD)]

gc.collect()

21

In [5]:
class TabularDataset:
    """Map-style dataset yielding one training sample as a dict of tensors.

    `numeric_col` and `cat_col` are indexed as 2-D arrays (`arr[idx, :]`),
    `targets` as a 1-D array (`arr[idx]`).

    Note on dtypes: rows of `numeric_col` are converted to `torch.long` and
    rows of `cat_col` to `torch.float`. In this notebook the "numeric" block
    is what ModelBinned sends through `nn.Embedding` (integer indices) and the
    "categorical" block is what it sends through `nn.Linear`.
    """

    def __init__(self, numeric_col, cat_col, targets):
        self.numeric_col = numeric_col
        self.cat_col = cat_col
        self.targets = targets

    def __len__(self):
        # Number of samples == number of rows of the numeric block.
        n_rows = self.numeric_col.shape[0]
        return n_rows

    def __getitem__(self, idx):
        numeric_row = self.numeric_col[idx, :]
        cat_row = self.cat_col[idx, :]
        target = self.targets[idx]

        return {
            'x_numeric' : torch.tensor(numeric_row, dtype=torch.long),
            'x_cat' : torch.tensor(cat_row, dtype=torch.float),
            'y' : torch.tensor(target, dtype=torch.float),
        }
    
class InferenceDataset:
    """Map-style dataset for prediction: same inputs as TabularDataset, no target.

    Rows of `numeric_col` become `torch.long` tensors and rows of `cat_col`
    become `torch.float` tensors; both are indexed as 2-D arrays (`arr[idx, :]`).
    """

    def __init__(self, numeric_col, cat_col):
        self.numeric_col = numeric_col
        self.cat_col = cat_col

    def __len__(self):
        # Number of samples == number of rows of the numeric block.
        n_rows = self.numeric_col.shape[0]
        return n_rows

    def __getitem__(self, idx):
        numeric_row = self.numeric_col[idx, :]
        cat_row = self.cat_col[idx, :]

        return {
            'x_numeric' : torch.tensor(numeric_row, dtype=torch.long),
            'x_cat' : torch.tensor(cat_row, dtype=torch.float),
        }


In [6]:
def train_fn(model, optimizer, scheduler, criterion, dataloader, device):
    """Run one training epoch and return the mean per-batch loss.

    For every batch: gradients are cleared, the model is called as
    `model(x_numeric, x_cat)`, the loss is back-propagated, and both the
    optimizer and the scheduler are stepped (the scheduler once per batch).
    Targets get a trailing dimension via `unsqueeze(1)` before the loss.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        optimizer.zero_grad()

        x_numeric = batch['x_numeric'].to(device)
        x_cat = batch['x_cat'].to(device)
        y_true = batch['y'].to(device).unsqueeze(1)

        logits = model(x_numeric, x_cat)
        batch_loss = criterion(logits, y_true)
        batch_loss.backward()

        optimizer.step()
        scheduler.step()

        batch_losses.append(batch_loss.item())

    # Average over the number of batches (not the number of samples).
    return sum(batch_losses) / len(dataloader)


def valid_fn(model, criterion, dataloader, device):
    """Evaluate `model` on `dataloader` without tracking gradients.

    Parameters
    ----------
    model : called as `model(x_numeric, x_cat)`; switched to eval mode here.
    criterion : loss called as `criterion(outputs, targets)`.
    dataloader : iterable of dict batches with keys 'x_numeric', 'x_cat', 'y'.
    device : device the batch tensors are moved to.

    Returns
    -------
    (final_loss, valid_preds)
        `final_loss` is the mean of the per-batch losses; `valid_preds` is a
        numpy array with the sigmoid of the model outputs for all batches,
        concatenated in loader order.
    """
    model.eval()

    final_loss = 0
    valid_preds = []

    for data in dataloader:
        inputs_numeric = data['x_numeric'].to(device)
        input_cat = data['x_cat'].to(device)
        targets = data['y'].to(device).unsqueeze(1)

        # Validation never back-propagates, so skip building the autograd
        # graph (saves memory and matches what inference_fn does).
        with torch.no_grad():
            outputs = model(inputs_numeric, input_cat)
            loss = criterion(outputs, targets)

        final_loss += loss.item()

        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    # Average over the number of batches (not the number of samples).
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Return sigmoid probabilities of `model` for every batch in `dataloader`.

    NOTE: this definition shadows the `inference_fn` imported from
    `nn_utilities` in the first cell; cells below use this local version.

    Batches are dicts with keys 'x_numeric' and 'x_cat'. The model is put in
    eval mode, the forward pass runs under `torch.no_grad()`, and the
    per-batch results are concatenated into one numpy array in loader order.
    """
    model.eval()
    batch_probs = []

    for batch in dataloader:
        x_numeric = batch['x_numeric'].to(device)
        x_cat = batch['x_cat'].to(device)

        with torch.no_grad():
            logits = model(x_numeric, x_cat)

        probs = logits.sigmoid().detach().cpu().numpy()
        batch_probs.append(probs)

    return np.concatenate(batch_probs)


In [7]:
def run_training(train, valid, fold, model, batch_size, epochs, 
                 seed, early_stop_step, early_stop, device, learning_rate, weight_decay, verbose= True, save = True):
    """Train `model` on one fold and track the epoch with the best validation AUC.

    Parameters
    ----------
    train, valid : list
        Three-element lists `[numeric, categorical, target]`, unpacked in that
        order and handed to TabularDataset.
    fold : value used only to name the checkpoint file `FOLD_{fold}_.pth`.
    model : torch module called as `model(x_numeric, x_cat)`; moved to `device`.
    batch_size, epochs : DataLoader batch size and maximum number of epochs.
    seed : passed to `seed_everything` before any loader/optimizer is built.
    early_stop_step : number of consecutive epochs without a validation-AUC
        improvement after which training stops (only when `early_stop`).
    early_stop : enable/disable early stopping.
    device : device used for the model and the batches.
    learning_rate, weight_decay : Adam hyper-parameters; the OneCycleLR
        schedule peaks at `learning_rate * 10`.
    verbose : print one metrics line per epoch.
    save : write the model's state dict whenever the validation AUC improves.

    Returns
    -------
    (best_auc, best_pred)
        Best validation AUC and the validation predictions of that epoch.
    """
    # `and` short-circuits, so len() is never evaluated on a non-list.
    assert (
        isinstance(train, list) and isinstance(valid, list)
        and len(train) == 3 and len(valid) == 3
    ), "train and valid must each be a list: [numeric, categorical, target]"

    seed_everything(seed)

    x_train_numeric, x_train_cat, y_train = train
    x_valid_numeric, x_valid_cat, y_valid = valid

    train_dataset = TabularDataset(x_train_numeric, x_train_cat, y_train)
    valid_dataset = TabularDataset(x_valid_numeric, x_valid_cat, y_valid)

    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # The scheduler is stepped once per batch inside train_fn, hence
    # steps_per_epoch == number of training batches.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=learning_rate*10, epochs=epochs, steps_per_epoch=len(trainloader))

    criterion = nn.BCEWithLogitsLoss()

    epochs_without_improvement = 0

    best_auc = -np.inf
    best_pred = None  # always bound, even if no epoch ever improves

    for epoch in range(epochs):

        train_loss = train_fn(model, optimizer, scheduler, criterion, trainloader, device)
        valid_loss, valid_preds = valid_fn(model, criterion, validloader, device)

        valid_auc = roc_auc_score(y_valid, valid_preds)

        if verbose:
            print(f"EPOCH: {epoch},  train_loss: {train_loss:.4f}, valid_loss: {valid_loss:.4f}, valid_auc: {valid_auc:.5f}")

        if valid_auc > best_auc:

            best_auc = valid_auc
            best_pred = valid_preds

            # An improvement restarts the patience window; without this reset
            # the counter tallies every non-improving epoch of the whole run
            # and can stop training that is still making progress.
            epochs_without_improvement = 0

            if save:
                torch.save(model.state_dict(), f"FOLD_{fold}_.pth")

        elif early_stop:

            epochs_without_improvement += 1
            if epochs_without_improvement >= early_stop_step:
                break

    return best_auc, best_pred


# TRAIN

In [8]:
# Number of entries in FEATURE.
# NOTE(review): not referenced by any other cell visible in this notebook.
num_feature = len(FEATURE)

In [9]:
class ModelBinned(nn.Module):
    """Two-branch MLP: embedded integer features plus dense features -> one logit.

    Branch 1 (`x_num`): every integer entry is looked up in one shared
    embedding table, the per-feature embeddings are flattened and projected
    to `hidden_size`.
    Branch 2 (`x_cat`): float features go through a small MLP to `hidden_size`.
    Both branches are concatenated and passed through `layer_concat` and
    `classifier`, which ends in a single linear unit (no sigmoid applied here).

    Parameters
    ----------
    num_cat_features : width of `x_cat`.
    hidden_size : width of each branch output.
    num_numeric_features : width of `x_num`. The default 240 reproduces the
        previously hard-coded projection input of 1920 (= 240 * 8).
    num_embeddings : size of the embedding table; entries of `x_num` must lie
        in `[0, num_embeddings)`. Default 256 as before.
    embedding_dim : embedding size per entry. Default 8 as before.

    Sub-module names and layer order are unchanged, so state dicts saved with
    the default sizes keep loading.
    """

    def __init__(self, num_cat_features, hidden_size,
                 num_numeric_features=240, num_embeddings=256, embedding_dim=8):
        super().__init__()

        self.hidden_size = hidden_size

        self.embedding = nn.Sequential(
            nn.Embedding(num_embeddings = num_embeddings, embedding_dim = embedding_dim)
        )
        # Input width is the flattened (features x embedding_dim) block.
        self.output_embedding = nn.Sequential(
            nn.Linear(num_numeric_features * embedding_dim, self.hidden_size),
            nn.GELU(),
        )

        self.layer_embedding = nn.Sequential(
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.GELU()
        )

        self.layer_categorical = nn.Sequential(
            nn.Linear(num_cat_features, hidden_size),
            nn.GELU(),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.GELU()
        )

        self.layer_concat = nn.Sequential(
            nn.BatchNorm1d(hidden_size * 2),
            nn.Dropout(0.2),
            nn.Linear(hidden_size * 2, hidden_size * 2),
            nn.GELU(),
            nn.BatchNorm1d(hidden_size * 2),
            nn.Dropout(0.2),
            nn.Linear(hidden_size * 2, hidden_size * 2),
            nn.GELU()
        )

        self.classifier = nn.Sequential(
            nn.BatchNorm1d(hidden_size * 2),
            nn.Dropout(0.1),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.GELU(),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x_num, x_cat):
        """Return one raw logit per row, shape `(batch, 1)`.

        `x_num` is an integer tensor `(batch, num_numeric_features)`;
        `x_cat` is a float tensor `(batch, num_cat_features)`.
        """
        # (batch, features) -> (batch, features, embedding_dim) -> flattened.
        x_num = self.embedding(x_num)
        x_num = x_num.reshape(x_num.size(0), -1)

        x_num = self.output_embedding(x_num)

        numeric = self.layer_embedding(x_num)
        cat = self.layer_categorical(x_cat)

        concat = torch.cat((numeric, cat), dim = 1)
        concat = self.layer_concat(concat)

        output = self.classifier(concat)

        return output


In [10]:
# Cross-validated training: each fold id in FOLD_LIST is held out once.
score = 0
model_list = []

# One row per training sample: its fold id and a slot for the out-of-fold prediction.
prediction_df = pd.DataFrame({
    'fold': train[FOLD_STRAT_NAME],
    'prediction': np.zeros((train.shape[0])),
})

for i, fold_ in enumerate(FOLD_LIST):

    # Rows of the current fold validate, all remaining rows train.
    mask_train = (train[FOLD_STRAT_NAME] != fold_)
    mask_test = (train[FOLD_STRAT_NAME] == fold_)

    train_numeric = train.loc[mask_train, NUMERIC_COL].values
    train_cat = train.loc[mask_train, CAT_COL].values
    train_y = train.loc[mask_train, TARGET_COL].values

    valid_numeric = train.loc[mask_test, NUMERIC_COL].values
    valid_cat = train.loc[mask_test, CAT_COL].values
    valid_y = train.loc[mask_test, TARGET_COL].values

    # Fresh model per fold.
    model_ff = ModelBinned(num_cat_features=len(CAT_COL), hidden_size=128)

    score_fold, pred_valid = run_training(
        [train_numeric, train_cat, train_y],
        [valid_numeric, valid_cat, valid_y],
        fold=fold_,
        model=model_ff,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        seed=RANDOM_STATE,
        early_stop_step=EARLY_STOPPING_STEPS,
        early_stop=EARLY_STOP,
        device=DEVICE,
        learning_rate=1e-3,
        weight_decay=1e-5,
    )

    #oof prediction
    prediction_df.loc[mask_test, 'prediction'] = pred_valid

    # CV score is the plain mean of the per-fold best AUCs.
    score += score_fold/N_FOLD

    print('\nFold: {}; Auc: {:.5f}\n'.format(fold_, score_fold))
    print('-'*50)
    print('\n\n\n')
    gc.collect()

print('CV-Auc: {:.5f}\n'.format(score))

EPOCH: 0,  train_loss: 0.5306, valid_loss: 0.5125, valid_auc: 0.82494
EPOCH: 1,  train_loss: 0.5072, valid_loss: 0.5017, valid_auc: 0.83291
EPOCH: 2,  train_loss: 0.4948, valid_loss: 0.4916, valid_auc: 0.83960
EPOCH: 3,  train_loss: 0.4842, valid_loss: 0.4859, valid_auc: 0.84287
EPOCH: 4,  train_loss: 0.4771, valid_loss: 0.4844, valid_auc: 0.84418
EPOCH: 5,  train_loss: 0.4712, valid_loss: 0.4826, valid_auc: 0.84543
EPOCH: 6,  train_loss: 0.4665, valid_loss: 0.4805, valid_auc: 0.84680
EPOCH: 7,  train_loss: 0.4622, valid_loss: 0.4808, valid_auc: 0.84605
EPOCH: 8,  train_loss: 0.4596, valid_loss: 0.4797, valid_auc: 0.84808
EPOCH: 9,  train_loss: 0.4586, valid_loss: 0.4774, valid_auc: 0.84758
EPOCH: 10,  train_loss: 0.4586, valid_loss: 0.4763, valid_auc: 0.84849
EPOCH: 11,  train_loss: 0.4594, valid_loss: 0.4755, valid_auc: 0.84950
EPOCH: 12,  train_loss: 0.4608, valid_loss: 0.4748, valid_auc: 0.85026
EPOCH: 13,  train_loss: 0.4635, valid_loss: 0.4740, valid_auc: 0.85118
EPOCH: 14,  trai

# TEST Blending

In [11]:
# The training frame is no longer needed once the folds are trained;
# drop the name and run a garbage-collection pass before loading the test frame.
del train
gc.collect()

21

In [12]:
# Load the test frame stored under PATH_NOTEBOOK.
# NOTE(review): read_pickle executes pickle data - only point this at trusted files.
test_pickle_path = os.path.join(PATH_NOTEBOOK, 'test_unscaled.pkl')
test = pd.read_pickle(test_pickle_path)

# Same two input blocks the model was trained on, as numpy arrays.
test_numeric = test[NUMERIC_COL].values
test_cat = test[CAT_COL].values

In [13]:
# Blend: equal-weight average (1/N_FOLD each) of the per-fold test predictions.
pred_test = np.zeros((test.shape[0], 1))

# The test inputs are identical for every fold, so build the dataset and the
# loader once instead of on every iteration.
test_dataset = InferenceDataset(test_numeric, test_cat)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

for fold_ in FOLD_LIST:
    model = ModelBinned(num_cat_features = len(CAT_COL), hidden_size = 128)

    # map_location remaps the stored tensors onto DEVICE, so checkpoints
    # written on a GPU also load in a CPU-only session.
    model.load_state_dict(torch.load(f"FOLD_{fold_}_.pth", map_location=DEVICE))
    model.to(DEVICE)

    pred = inference_fn(model, testloader, DEVICE)
    pred_test += pred/N_FOLD

# SAVE RESULT

In [14]:
# Start from the sample submission and overwrite its 'target' column
# with the blended test predictions.
sample_submission_path = os.path.join(INPUT_PATH, 'sample_submission.csv')
submission = pd.read_csv(sample_submission_path)
submission['target'] = pred_test

submission.to_csv('submission.csv', index=False)

In [15]:
# Persist the out-of-fold predictions as CSV text.
# NOTE(review): the file name has no '.csv' extension - confirm downstream readers expect 'oof_prediction'.
prediction_df.to_csv(path_or_buf='oof_prediction', index=False)