# TabNet CODE

### Introduce TabNET
---
* 고스트 배치 정규화 (GBN)

* Sparsemax

* 


### code_source
---
* TabNet Torch code: https://ichi.pro/ko/pytorcheseo-tabnet-guhyeon-277727554318969
* Sparsemax code: https://github.com/gokceneraslan/SparseMax.torch
* paper: https://arxiv.org/pdf/1908.07442v4.pdf
* pytorch-TabNet1: https://pypi.org/project/pytorch-tabnet/
* pytorch-TabNet2: https://wsshin.tistory.com/5
* pytorch-TabNet-Regressor: https://www.kaggle.com/rapela/tps-02-21-tabnet-regressor


In [1]:
import sys
sys.version

'3.8.11 (default, Aug  3 2021, 05:10:14) \n[Clang 10.0.0 ]'

In [2]:
import warnings

warnings.filterwarnings( 'ignore' )

In [18]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from sklearn.preprocessing import QuantileTransformer


from sklearn import preprocessing
import torch.optim as optim

import numpy as np
import pandas as pd

import torch.nn.functional as F

import optuna

import plotly as pl

from sklearn.model_selection import StratifiedKFold

In [3]:
train_data = pd.read_csv("train.csv") 
test_data = pd.read_csv("test.csv")

In [4]:
x_data = train_data.loc[:, 'f0':'f99']
y_data = train_data.loc[:, 'loss']

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
x_train, x_test, y_train, y_test=train_test_split(x_data,
                                                  y_data,
                                                  test_size=0.2,   #전체 중 20%를 테스트용으로 분할
                                                                   #나머지 80%는 훈련용
                                                  shuffle=True,    #무작위로 섞어서 추출
                                                  random_state=20) #무작위 추출 시 일정한 기준으로

In [37]:
x_valid, x_test, y_valid, y_test=train_test_split(x_test,
                                                  y_test,
                                                  test_size=0.5,   #50%를 validation으로 분할
                                                                   #나머지 50%는 테스트용
                                                  shuffle=True,    #무작위로 섞어서 추출
                                                  random_state=20) #무작위 추출 시 일정한 기준으로

In [1]:
class GBN(nn.Module):
    
    def __init__(self,inp,vbs=128,momentum=0.01):
        super().__init__()
        self.bn = nn.BatchNorm1d(inp,momentum=momentum)
        self.vbs = vbs
        
    def forward(self,x):
        chunk = torch.chunk(x,x.size(0)//self.vbs,0)
        res = [self.bn(y) for y in chunk]
        return torch.cat(res,0)

# Attention Transformer
class AttentionTransformer(nn.Module):
    
    def __init__(self,d_a,inp_dim,relax,vbs=128):
        super().__init__()
        self.fc = nn.Linear(d_a,inp_dim)
        self.bn = GBN(out_dim,vbs=vbs)
        self.smax = Sparsemax()
        self.r = relax
        
    #a:feature from previous decision step
    def forward(self,a,priors): 
        a = self.bn(self.fc(a)) 
        mask = self.smax(a*priors) 
        priors =priors*(self.r-mask)  #updating the prior
        return mask
    
# 
class GLU(nn.Module):
    
    def __init__(self,inp_dim,out_dim,fc=None,vbs=128):
        super().__init__()
        if fc:
            self.fc = fc
        else:
            self.fc = nn.Linear(inp_dim,out_dim*2)
        self.bn = GBN(out_dim*2,vbs=vbs) 
        self.od = out_dim
        
    def forward(self,x):
        x = self.bn(self.fc(x))
        return x[:,:self.od]*torch.sigmoid(x[:,self.od:])

class FeatureTransformer(nn.Module):
    
    def __init__(self,inp_dim,out_dim,shared,n_ind,vbs=128):
        super().__init__()
        first = True
        self.shared = nn.ModuleList()
        if shared:
            self.shared.append(GLU(inp_dim,out_dim,shared[0],vbs=vbs))
            first= False    
            for fc in shared[1:]:
                self.shared.append(GLU(out_dim,out_dim,fc,vbs=vbs))
        else:
            self.shared = None
        self.independ = nn.ModuleList()
        if first:
            self.independ.append(GLU(inp,out_dim,vbs=vbs))
        for x in range(first, n_ind):
            self.independ.append(GLU(out_dim,out_dim,vbs=vbs))
        self.scale = torch.sqrt(torch.tensor([.5],device=device))
        
    def forward(self,x):
        if self.shared:
            x = self.shared[0](x)
            for glu in self.shared[1:]:
                x = torch.add(x, glu(x))
                x = x*self.scale
        for glu in self.independ:
            x = torch.add(x, glu(x))
            x = x*self.scale
        return x

class DecisionStep(nn.Module):
    
    def __init__(self, inp_dim, n_d, n_a, shared, n_ind, relax,vbs=128):
        super().__init__()
        self.fea_tran = FeatureTransformer(inp_dim,n_d+n_a,shared,n_ind,vbs)
        self.atten_tran =  AttentionTransformer(n_a,inp_dim,relax,vbs)
        
    def forward(self,x,a,priors):
        mask = self.atten_tran(a,priors)
        sparse_loss = ((-1)*mask*torch.log(mask+1e-10)).mean()
        x = self.fea_tran(x*mask)
        return x,sparse_loss

class TabNet(nn.Module):
    
    def __init__(self,inp_dim,final_out_dim,n_d=64,n_a=64,
                 n_shared=2,n_ind=2,n_steps=5,relax=1.2,vbs=128):
        super().__init__()
        if n_shared>0:
            self.shared = nn.ModuleList()
            self.shared.append(nn.Linear(inp_dim,2*(n_d+n_a)))
            for x in range(n_shared-1):
                self.shared.append(nn.Linear(n_d+n_a,2*(n_d+n_a)))
        else:
            self.shared=None
        self.first_step = FeatureTransformer(inp_dim,n_d+n_a,self.shared,n_ind) 
        self.steps = nn.ModuleList()
        for x in range(n_steps-1):
            self.steps.append(DecisionStep(inp_dim,n_d,n_a,self.shared,n_ind,relax,vbs))
        self.fc = nn.Linear(n_d,final_out_dim)
        self.bn = nn.BatchNorm1d(inp_dim)
        self.n_d = n_d
        
    def forward(self,x):
        x = self.bn(x)
        x_a = self.first_step(x)[:,self.n_d:]
        sparse_loss = torch.zeros(1).to(x.device)
        out = torch.zeros(x.size(0),self.n_d).to(x.device)
        priors = torch.ones(x.shape).to(x.device)
        for step in self.steps:
            x_te,l = step(x,x_a,priors)
            out += F.relu(x_te[:,:self.n_d])
            x_a = x_te[:,self.n_d:]
            sparse_loss += l
        return self.fc(out),sparse_loss

NameError: name 'nn' is not defined

In [66]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

In [67]:
train_data = pd.read_csv("train.csv") 
test_data = pd.read_csv("test.csv")

x_data = train_data.loc[:, 'f0':'f99']
y_data = train_data.loc[:, 'loss']

x_train, x_test, y_train, y_test=train_test_split(x_data,
                                                  y_data,
                                                  test_size=0.2,   #전체 중 20%를 테스트용으로 분할
                                                                   #나머지 80%는 훈련용
                                                  shuffle=True,    #무작위로 섞어서 추출
                                                  random_state=20) #무작위 추출 시 일정한 기준으로

y_train = y_train.values

In [68]:
from pytorch_tabnet.tab_model import TabNetRegressor


NUM_FOLDS = 5
SEED = 42

## TabNet Parameters
MAX_EPOCH = 1000
N_D = 8 
N_A = 8 
N_STEPS = 2
GAMMA = 1.1
LAMBDA_SPARSE = 0
OPT_LR = 1e-3
OPT_WEIGHT_DECAY = 1e-5
OPT_MOMENTUM = 0.9
MASK_TYPE = "entmax"
SCHEDULER_MIN_LR = 1e-6
SCHEDULER_FACTOR = 0.9
DEVICE_NAME = "cuda"

BATCH_SIZE = 512




In [69]:
import torch
from torch import nn
from pytorch_tabnet.tab_model import TabNetRegressor

from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd 

import os
import random
import sys
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
seed_everything(SEED)

In [70]:
for c in x_train.columns:
    if x_train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(x_train[c].values) + list(x_test[c].values))
        
        x_train[c] = lbl.transform(x_train[c].values)
        x_test[c] = lbl.transform(x_test[c].values)

In [71]:
columns = x_test.columns
print(columns)

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20',
       'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
       'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40',
       'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50',
       'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60',
       'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70',
       'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80',
       'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90',
       'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99'],
      dtype='object')


In [72]:
tabnet_params = dict(n_d=N_D, n_a=N_A, n_steps=N_STEPS, gamma=GAMMA,
                     lambda_sparse=LAMBDA_SPARSE, optimizer_fn=torch.optim.SGD,
                     optimizer_params=dict(lr=OPT_LR, weight_decay=OPT_WEIGHT_DECAY, momentum=OPT_MOMENTUM),
                     mask_type=MASK_TYPE,
                     scheduler_params=dict(mode="min",
                                           patience=20,
                                           min_lr=SCHEDULER_MIN_LR,
                                           factor=SCHEDULER_FACTOR,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=10,
                     device_name=DEVICE_NAME,
                     seed=SEED
                     )

In [73]:
print(tabnet_params)

{'n_d': 8, 'n_a': 8, 'n_steps': 2, 'gamma': 1.1, 'lambda_sparse': 0, 'optimizer_fn': <class 'torch.optim.sgd.SGD'>, 'optimizer_params': {'lr': 0.001, 'weight_decay': 1e-05, 'momentum': 0.9}, 'mask_type': 'entmax', 'scheduler_params': {'mode': 'min', 'patience': 20, 'min_lr': 1e-06, 'factor': 0.9}, 'scheduler_fn': <class 'torch.optim.lr_scheduler.ReduceLROnPlateau'>, 'verbose': 10, 'device_name': 'cuda', 'seed': 42}


In [74]:
train_oof = np.zeros((len(x_train)))
test_preds = 0

kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(x_train, y_train))):

    print(f'Fold {f}')
    train_df, val_df = x_train.iloc[train_ind][columns], x_train.iloc[val_ind][columns]

    train_target, val_target = y_train[train_ind], y_train[val_ind]

    print(train_df.shape, train_target.shape)
    print(val_df.shape, val_target.shape)

    train_target=train_target.reshape(-1,1)
    val_target=val_target.reshape(-1,1)

    train_df      = train_df.to_numpy()
    train_target      = train_target.reshape(-1, 1)

    val_df = val_df.to_numpy()
    val_target = val_target.reshape(-1, 1)

    model = TabNetRegressor(**tabnet_params)

    model.fit(X_train=train_df,
              y_train=train_target,
              eval_set=[(val_df, val_target)],
              eval_name = ["val"],
              eval_metric = ['mse'],#["logits_ll"],
              max_epochs=MAX_EPOCH, #20
              patience=20, batch_size=BATCH_SIZE,
              num_workers=5, drop_last=False)#,

    temp_oof = model.predict(val_df)
    train_oof[val_ind] = temp_oof.reshape(-1)
    temp_test = model.predict(x_test.to_numpy())

    test_preds += temp_test/NUM_FOLDS     

    print(mean_squared_error(temp_oof, val_target, squared=False))

0it [00:00, ?it/s]

Fold 0
(160000, 100) (160000,)
(40000, 100) (40000,)
Device used : cpu
epoch 0  | loss: 89.05727| val_mse: 83.2949 |  0:01:12s
epoch 10 | loss: 63.96283| val_mse: 64.29377|  0:12:48s
epoch 20 | loss: 63.26599| val_mse: 63.79278|  0:24:20s


OSError: [Errno 23] Too many open files in system

In [None]:
# XG-Boost Score: 61.34741620778698