## Libraries

In [1]:
import torch
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from pandas_profiling import ProfileReport
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

## Load data

In [16]:
df = pd.read_csv('./data/train.csv')

print(df.shape)

df.head()

(600000, 25)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [4]:
# profile = ProfileReport(train_df, minimal = True)

# profile.to_file('profile_report.html')

## Split data

In [5]:
col_list = [col for col in df.columns if col not in ['id', 'target']]

train_df, test_df = train_test_split(
    df,
    stratify = df['target'],
    test_size = 0.2,
    shuffle = True,
    random_state = 2020
)

train_df, val_df = train_test_split(
    train_df,
    stratify = train_df['target'],
    test_size = 0.15,
    shuffle = True,
    random_state = 2020
)

train_df = train_df.reset_index(drop = True)
val_df = val_df.reset_index(drop = True)
test_df = test_df.reset_index(drop = True)

x_train = train_df.loc[:, col_list]
y_train = train_df.loc[:, 'target']

x_val = val_df.loc[:, col_list]
y_val = val_df.loc[:, 'target']

x_test = test_df.loc[:, col_list]
y_test = test_df.loc[:, 'target']

print(f'Train: {train_df.shape}')
print(f'Val: {val_df.shape}')
print(f'Test: {test_df.shape}')

Train: (408000, 25)
Val: (72000, 25)
Test: (120000, 25)


## Numerical encoding

In [8]:
x_all = pd.concat([x_train, x_val, x_test], axis = 0).reset_index(drop = True)

# numerically encode all categorical variables to {0, 1, ..., N-1}
for col in col_list:
    encoder = LabelEncoder()
    encoder.fit(x_all[col].fillna('-1').astype(str).values)
    x_train[col] = encoder.transform(x_train[col].fillna('-1').astype(str).values)
    x_val[col] = encoder.transform(x_val[col].fillna('-1').astype(str).values)
    x_test[col] = encoder.transform(x_test[col].fillna('-1').astype(str).values)

x_train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,1,1,1,1,1,3,5,4,5,4,646,293,212,17,1502,2,5,1,13,14,98,0,10
1,1,1,1,1,1,3,1,3,5,4,743,788,91,7,1541,3,4,1,14,17,29,2,11
2,1,1,0,1,0,3,6,1,3,4,321,2,112,129,1723,3,3,6,5,18,27,7,4
3,1,1,2,1,2,3,2,4,3,4,1093,876,207,113,1225,1,5,3,5,21,140,2,12
4,1,2,1,1,1,1,6,1,5,4,414,1186,86,115,0,3,3,6,3,3,13,2,6


## Data loader

In [9]:
class CategoricalDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        x = self.inputs.loc[index, col_list].values
        y = self.targets.iloc[index]
        return x, y

In [10]:
dataset_train = CategoricalDataset(x_train, y_train)
dataset_val = CategoricalDataset(x_val, y_val)
dataset_test = CategoricalDataset(x_test, y_test)

train_gen = DataLoader(dataset_train, batch_size = 64, shuffle = True)
val_gen = DataLoader(dataset_val, batch_size = 64, shuffle = True)
test_gen = DataLoader(dataset_test, batch_size = 64, shuffle = False)

In [11]:
# x, y = next(iter(train_gen))

# x, y

## Specify model

In [13]:
class EntityEmbeddingModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([])
        for col in col_list:
            num_categories = int(x_all[col].nunique())
            embed_dim = int(min(np.ceil((num_categories) / 2), 50))
            self.embeddings.append(nn.Embedding(num_categories, embed_dim))
        self.dim_concat = np.sum([layer.embedding_dim for layer in self.embeddings])
        self.lin1 = nn.Linear(self.dim_concat, 300)
        self.lin2 = nn.Linear(300, 300)
        self.lin3 = nn.Linear(300, 2)
        self.softmax = nn.Softmax()
    
    def forward(self, x):
        e = [layer(x[:, i]).view(x[:, i].shape[0], layer.embedding_dim) for i, layer in zip(range(len(self.embeddings)), self.embeddings)]
        c = torch.cat(e, dim = 1)
        h = self.lin1(c)
        h = self.lin2(h)
        y = self.lin3(h)
        y = self.softmax(y)
        return y

device = torch.device('cuda:0')

model = EntityEmbeddingModel().to(device)
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3, weight_decay = 5e-3)

print(model)

EntityEmbeddingModel(
  (embeddings): ModuleList(
    (0): Embedding(2, 1)
    (1): Embedding(2, 1)
    (2): Embedding(2, 1)
    (3): Embedding(2, 1)
    (4): Embedding(2, 1)
    (5): Embedding(3, 2)
    (6): Embedding(6, 3)
    (7): Embedding(6, 3)
    (8): Embedding(6, 3)
    (9): Embedding(4, 2)
    (10): Embedding(1220, 50)
    (11): Embedding(1519, 50)
    (12): Embedding(222, 50)
    (13): Embedding(222, 50)
    (14): Embedding(2218, 50)
    (15): Embedding(3, 2)
    (16): Embedding(5, 3)
    (17): Embedding(6, 3)
    (18): Embedding(15, 8)
    (19): Embedding(26, 13)
    (20): Embedding(190, 50)
    (21): Embedding(7, 4)
    (22): Embedding(12, 6)
  )
  (lin1): Linear(in_features=357, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=300, bias=True)
  (lin3): Linear(in_features=300, out_features=2, bias=True)
  (softmax): Softmax(dim=None)
)


## Fit model

In [None]:
PLATEAU_PATIENCE = 1
EARLY_PATIENCE = 3
NUM_EPOCHS = 15
patience_counter = EARLY_PATIENCE
best_val_loss = 999

plateau_scheduler = ReduceLROnPlateau(
    optimizer,
    mode = 'min',
    patience = PLATEAU_PATIENCE,
    factor = 0.2,
    verbose = True
)

for i_epoch in range(NUM_EPOCHS):
    train_losses = []
    val_losses = []

    # optimization on training data
    model.train() 
    for x, y in tqdm(train_gen):
        x = x.to(device, dtype = torch.int64)
        y = y.to(device, dtype = torch.float32)  
        optimizer.zero_grad() 
        pred = model(x)
        loss = loss_fun(pred, y)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        
    # evaluation on validation data
    model.eval() 
    with torch.no_grad():
        for x, y in tqdm(val_gen):
            x = x.to(device, dtype = torch.int64)
            y = y.to(device, dtype = torch.float32)  
            pred = model(x)
            loss = loss_fun(pred, y)
            val_losses.append(loss.item())
    
    # display progress
    print(f'{i_epoch+1} | Train loss: {np.mean(train_losses):.4f} | Val loss: {np.mean(val_losses):.4f}')
    
    # check learning plateau criterion
    curr_val_loss = np.mean(val_losses)
    plateau_scheduler.step(curr_val_loss)
    
    # check early stopping criterion
    if curr_val_loss < best_val_loss:
        best_val_loss = curr_val_loss
        patience_counter = EARLY_PATIENCE # reset patience counter
        torch.save(model, './models/model.pth')
    else:
        patience_counter -= 1
        if patience_counter == 0:
            print('Early stopping')
            break