In [1]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader

from src.abalone_dataset import AbaloneDataset
from src.nn_tabular_models.autoint_model import AutoIntModel
from src.nn_tabular_models.ft_transformer_model import FTTransformerModel
from src.nn_tabular_models.node_model import NODEModel
from src.nn_tabular_models.saint_model import SAINTModel
from src.nn_tabular_models.tab_transformer_model import TabTransformerModel
from src.nn_tabular_models.tabpfn_model import TABPFNModel
from src.styles import TXT_ACC, TXT_RESET
from src.tabular_nn_tuner import TabularNNTuner
from src.training_loop import training_loop

warnings.filterwarnings("ignore")

# Single seed reused for every RNG below and for the cross-validation splitter.
SEED=42

# Project identifier; exposed to the rest of the notebook through CFG.project.
PROJECT = 'PGs04e04'

# ---- REPRODUCIBILITY ------------------------------------------------
# Seed NumPy, Python's `random` and PyTorch so that reruns start from the same RNG state.
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1fc90afa2f0>

In [2]:
class CFG:
    """Notebook-wide configuration: input file locations, target column and fold count."""
    # CSV with the training rows (contains an `id` column that is dropped on load).
    path_train = 'data/train.csv'
    # CSV with the rows to predict (contains an `id` column that is dropped on load).
    path_test = 'data/test.csv'
    # CSV with the original abalone data that is appended to every training fold.
    path_original = 'data/abalone.csv'
    # Name of the column being predicted.
    target = 'Rings'
    # Project identifier taken from the module-level PROJECT constant.
    project = PROJECT
    # Number of cross-validation folds iterated over during training and inference.
    num_folds=5

In [3]:
# Read the training CSV and discard the row identifier, which is not a feature.
df_train = pd.read_csv(CFG.path_train).drop(columns='id')
df_train

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,F,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,F,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,I,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,M,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,I,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,M,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
90611,M,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
90612,I,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6
90613,I,0.345,0.270,0.075,0.2000,0.0980,0.0490,0.0700,6


In [4]:
def OH_transform(df_input, categories=('F', 'I', 'M')):
    """One-hot encode the ``Sex`` column into ``Sex_<level>`` indicator columns.

    The indicator columns are appended after the existing columns in sorted
    level order and the original ``Sex`` column is removed. The input frame is
    not modified.

    Args:
        df_input: DataFrame containing a ``Sex`` column.
        categories: Levels that always get an indicator column, even when they
            do not occur in ``df_input``. This keeps the output layout identical
            across frames (e.g. a small test split lacking one level), which is
            required when the result is fed to an already fitted scaler.
            Levels observed in the data but not listed here are encoded too.

    Returns:
        A new DataFrame with ``Sex`` replaced by integer 0/1 indicator columns.

    Raises:
        KeyError: If ``df_input`` has no ``Sex`` column.
    """
    df = df_input.copy()
    # Missing values are skipped when collecting levels: they cannot be sorted
    # together with strings and simply yield 0 in every indicator column.
    observed = set(df['Sex'].dropna().unique())
    for val in sorted(set(categories) | observed):
        df[f'Sex_{val}'] = (df['Sex'] == val).astype(int)
    return df.drop('Sex', axis=1)

def label_transform(df_input):
    """Return a copy of ``df_input`` with ``Sex`` replaced by integer codes.

    The codes are I -> 0, F -> 1, M -> 2; any other value becomes NaN.
    The input frame is left untouched.
    """
    sex_codes = dict(I=0, F=1, M=2)
    encoded_sex = df_input['Sex'].map(sex_codes)
    # `assign` builds a new frame and keeps `Sex` at its original position.
    return df_input.assign(Sex=encoded_sex)


def preproc_data(df_input, scaler=None, df_original=None):
    """One-hot encode ``Sex`` and standardise the feature columns.

    Works in two modes, selected by ``scaler``:

    * ``scaler is None`` (fit mode): ``df_input`` must contain the target
      column. A new ``StandardScaler`` is fitted on the features and the
      target is transformed with ``log1p``.
    * ``scaler`` given (transform mode): the features are scaled with the
      supplied, already fitted scaler. The target column is dropped if it is
      present, so labelled and unlabelled frames are both accepted.

    Args:
        df_input: Raw DataFrame with a ``Sex`` column and the feature columns.
        scaler: Fitted scaler to reuse, or ``None`` to fit a new one.
        df_original: Unused; kept so existing call sites stay valid.

    Returns:
        Fit mode: ``(data, target, scaler)`` with the scaled feature array,
        the ``log1p``-transformed target array and the fitted scaler.
        Transform mode: the scaled feature array only.
    """
    df = OH_transform(df_input)

    if scaler is None:
        scaler = StandardScaler()
        data = scaler.fit_transform(df.drop(CFG.target, axis=1))
        target = np.log1p(df[CFG.target]).values
        return data, target, scaler

    # The scaler was fitted without the target column, so make sure it is not
    # passed along when the caller hands over a labelled frame.
    features = df.drop(columns=CFG.target, errors='ignore')
    return scaler.transform(features)

In [5]:
# Load the training rows first so the split below does not depend on a frame
# left over from an earlier cell.
df_train = pd.read_csv(CFG.path_train).drop('id', axis=1)

# Stratify on the target; the fold count comes from CFG so that the later
# `range(CFG.num_folds)` loops always match the number of index pairs in cv_idx.
cv = StratifiedKFold(n_splits=CFG.num_folds, shuffle=True, random_state=SEED)
cv_idx = list(cv.split(df_train, df_train[CFG.target]))

# The original data names two weight columns differently; align them with
# the training file so both frames share one feature layout.
df_original = pd.read_csv(CFG.path_original)
df_original = df_original.rename(columns={'Shucked weight': 'Whole weight.1', 'Viscera weight': 'Whole weight.2'})

# NOTE(review): the scaler is fitted on all training rows, i.e. including the
# rows that later serve as validation folds.
data, target, scaler = preproc_data(df_train)
data_original = preproc_data(df_original.drop(CFG.target, axis=1), scaler)
target_original = np.log1p(df_original[CFG.target]).values

In [6]:
# tuner = TabularNNTuner(data, target, cv_idx)
# tuner.tune_parameters('NODE')

In [7]:
# ---- Settings shared by every fold (hoisted out of the loop) ----------------
# Fall back to the CPU when no GPU is available instead of failing on `.to(...)`.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 2**5

# Create the checkpoint directory up front so a missing folder cannot make
# `torch.save` fail after a fold has already been trained.
dir_save = 'nn_models'
os.makedirs(dir_save, exist_ok=True)

# (label, model class, constructor kwargs) for every model to train.
# `in_features` is 10 here - presumably 7 numeric columns plus the 3 one-hot
# Sex columns; verify against `data.shape[1]` if the preprocessing changes.
experiments = (
    # ('saint_8_8', SAINTModel, {'in_features': 10, 'hidden_dim': 8, 'num_attention_heads': 8, 'num_layers': 8, 'dropout': 0.2}),
    ('ft_8_4', FTTransformerModel, {'in_features': 10, 'hidden_dim': 8, 'num_attention_heads': 8, 'num_layers': 4, 'dropout': 0.2}),
    # ('tab_8_8', TabTransformerModel, {'in_features': 10, 'hidden_dim': 8, 'num_attention_heads': 8, 'num_layers': 8, 'dropout': 0.2}),
)

for fold in range(CFG.num_folds):
    train_idx, val_idx = cv_idx[fold]

    # The original data is appended to the training part only; validation
    # uses rows from the training file exclusively.
    train_data = np.concatenate([data[train_idx], data_original])
    train_target = np.concatenate([target[train_idx], target_original])
    val_data = data[val_idx]
    val_target = target[val_idx]

    dataset_train = AbaloneDataset(train_data, train_target)
    dataset_val = AbaloneDataset(val_data, val_target)

    loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

    for label, model_class, model_params in experiments:
        print(f'{TXT_ACC} {label}    fold {fold} {TXT_RESET}')
        scores, epochs, best_state = TabularNNTuner.score(
                                            model_class,
                                            model_params,
                                            loader_train,
                                            loader_val,
                                            learning_rate=1e-3,
                                            device=DEVICE,
                                            verbose=True)
        # One checkpoint per (experiment, fold); reloaded by the inference cell.
        torch.save(best_state, f'{dir_save}/{label}_fold_{fold}.pth')

In [8]:
df_test = pd.read_csv(CFG.path_test).drop('id', axis=1)

test_data = preproc_data(df_test, scaler)

dataset_test = AbaloneDataset(test_data)

batch_size=2**5
# NOTE(review): loader_test is prepared here but the loop below predicts on the
# validation folds (out-of-fold predictions) only - confirm whether test
# predictions are produced elsewhere.
loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

for label, model_class, model_params in experiments:

    # Out-of-fold predictions of the current experiment, one flat array per fold.
    # The list is reset for every experiment, so only the last one is kept.
    preds_folds = []
    for fold in range(CFG.num_folds):

        # Only the validation part of the fold is needed for inference.
        val_data = data[cv_idx[fold][1]]
        val_target = target[cv_idx[fold][1]]
        dataset_val = AbaloneDataset(val_data, val_target)
        loader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

        model = model_class(**model_params).to(DEVICE)
        # map_location lets checkpoints written on a GPU be restored on the
        # device that is actually in use.
        state = torch.load(f'{dir_save}/{label}_fold_{fold}.pth', map_location=DEVICE)
        model.load_state_dict(state)
        model.eval()

        batch_preds = []
        with torch.no_grad():
            for batch in loader_val:
                out = model(batch[0].to(DEVICE))
                # Flatten every batch so that both (B,) and (B, 1) model
                # outputs end up as one value per row.
                batch_preds.append(out.cpu().numpy().reshape(-1))
        preds = np.concatenate(batch_preds)

        preds_folds.append(preds)

In [9]:
# Write one CSV of out-of-fold predictions per fold, back on the original target scale.
for fold in range(CFG.num_folds):
    oof_path = f'OOF_ft_8_4_fold{fold}.csv'
    # The models predict log1p(target), so expm1 undoes the transform.
    oof_rings = np.expm1(preds_folds[fold])
    df_sub = pd.DataFrame()
    df_sub[CFG.target] = oof_rings
    df_sub.to_csv(oof_path, index=False)
    # display(df_sub)