## Libraries

In [1]:
import torch
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from pandas_profiling import ProfileReport
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

## Load data

In [2]:
df = pd.read_csv('./data/train.csv')

print(df.shape)

df.head()

(600000, 25)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [3]:
# profile = ProfileReport(train_df, minimal = True)

# profile.to_file('profile_report.html')

## Split data

In [4]:
col_list = [col for col in df.columns if col not in ['id', 'target']]

train_df, test_df = train_test_split(
    df,
    stratify = df['target'],
    test_size = 0.2,
    shuffle = True,
    random_state = 2020
)

train_df, val_df = train_test_split(
    train_df,
    stratify = train_df['target'],
    test_size = 0.15,
    shuffle = True,
    random_state = 2020
)

train_df = train_df.reset_index(drop = True)
val_df = val_df.reset_index(drop = True)
test_df = test_df.reset_index(drop = True)

x_train = train_df.loc[:, col_list]
y_train = train_df.loc[:, 'target']

x_val = val_df.loc[:, col_list]
y_val = val_df.loc[:, 'target']

x_test = test_df.loc[:, col_list]
y_test = test_df.loc[:, 'target']

print(f'Train: {train_df.shape}')
print(f'Val: {val_df.shape}')
print(f'Test: {test_df.shape}')

Train: (408000, 25)
Val: (72000, 25)
Test: (120000, 25)


## Numerical encoding

In [5]:
x_all = pd.concat([x_train, x_val, x_test], axis = 0).reset_index(drop = True)

# numerically encode all categorical variables to {0, 1, ..., N_cats - 1}
for col in col_list:
    encoder = LabelEncoder()
    encoder.fit(x_all[col].fillna('-1').astype(str).values)
    x_train[col] = encoder.transform(x_train[col].fillna('-1').astype(str).values)
    x_val[col] = encoder.transform(x_val[col].fillna('-1').astype(str).values)
    x_test[col] = encoder.transform(x_test[col].fillna('-1').astype(str).values)

x_train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,1,1,1,1,1,3,5,4,5,4,646,293,212,17,1502,2,5,1,13,14,98,0,10
1,1,1,1,1,1,3,1,3,5,4,743,788,91,7,1541,3,4,1,14,17,29,2,11
2,1,1,0,1,0,3,6,1,3,4,321,2,112,129,1723,3,3,6,5,18,27,7,4
3,1,1,2,1,2,3,2,4,3,4,1093,876,207,113,1225,1,5,3,5,21,140,2,12
4,1,2,1,1,1,1,6,1,5,4,414,1186,86,115,0,3,3,6,3,3,13,2,6


## Data loader

In [6]:
class CategoricalDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        x = self.inputs.loc[index, col_list].values
        y = self.targets.iloc[index]
        return x, y

In [7]:
dataset_train = CategoricalDataset(x_train, y_train)
dataset_val = CategoricalDataset(x_val, y_val)
dataset_test = CategoricalDataset(x_test, y_test)

train_gen = DataLoader(dataset_train, batch_size = 64, shuffle = True)
val_gen = DataLoader(dataset_val, batch_size = 64, shuffle = True)
test_gen = DataLoader(dataset_test, batch_size = 64, shuffle = False)

## Specify model

In [9]:
class EntityEmbeddingModel(nn.Module):
    def __init__(self):
        super().__init__()
        # create a list of the 23 embedding layers
        self.embeddings = nn.ModuleList()
        for col in col_list:
            num_categories = int(x_all[col].fillna('-1').astype(str).nunique())
            embed_dim = int(min(np.ceil((num_categories) / 2), 50))
            self.embeddings.append(nn.Embedding(num_categories, embed_dim))
        # input dimension for first linear layer following concatenated embeddings
        self.dim_concat = np.sum([layer.embedding_dim for layer in self.embeddings])
        # indices from 0 to 22
        self.embedding_indices = range(len(self.embeddings))
        self.lin1 = nn.Linear(self.dim_concat, 150)
        self.relu1 = nn.ReLU()
        self.drop2 = nn.Dropout(0.2)
        self.lin2 = nn.Linear(150, 150)
        self.relu2 = nn.ReLU()
        self.drop3 = nn.Dropout(0.3)
        self.lin3 = nn.Linear(150, 1)
        self.sigmoid3 = nn.Sigmoid()
    
    def forward(self, x):
        # (n, 23) -> (23, n, E_i)
        e = [self.embeddings[i](x[:, i]) for i in list(self.embedding_indices)]
        # (23, n, E_i) -> (n, sum(E_i))
        c = torch.cat(e, dim = 1)
        # (n, sum(E_i)) -> (n, 300)
        h = self.relu1(self.lin1(c))
        # (n, 300) -> (n, 300)
        h = self.relu2(self.lin2(self.drop2(h)))
        # (n, 300) -> (n)
        y = self.sigmoid3(self.lin3(self.drop3(h))).view(-1)
        return y

device = torch.device('cuda:0')

model = EntityEmbeddingModel().to(device)
loss_fun = nn.BCELoss()
optimizer = AdamW(model.parameters(), lr = 5e-4, weight_decay = 5e-3)

print(model)

EntityEmbeddingModel(
  (embeddings): ModuleList(
    (0): Embedding(3, 2)
    (1): Embedding(3, 2)
    (2): Embedding(3, 2)
    (3): Embedding(3, 2)
    (4): Embedding(3, 2)
    (5): Embedding(4, 2)
    (6): Embedding(7, 4)
    (7): Embedding(7, 4)
    (8): Embedding(7, 4)
    (9): Embedding(5, 3)
    (10): Embedding(1221, 50)
    (11): Embedding(1520, 50)
    (12): Embedding(223, 50)
    (13): Embedding(223, 50)
    (14): Embedding(2219, 50)
    (15): Embedding(4, 2)
    (16): Embedding(6, 3)
    (17): Embedding(7, 4)
    (18): Embedding(16, 8)
    (19): Embedding(27, 14)
    (20): Embedding(191, 50)
    (21): Embedding(8, 4)
    (22): Embedding(13, 7)
  )
  (lin1): Linear(in_features=369, out_features=150, bias=True)
  (relu1): ReLU()
  (drop2): Dropout(p=0.2, inplace=False)
  (lin2): Linear(in_features=150, out_features=150, bias=True)
  (relu2): ReLU()
  (drop3): Dropout(p=0.3, inplace=False)
  (lin3): Linear(in_features=150, out_features=1, bias=True)
  (sigmoid3): Sigmoid()
)


## Fit model

In [10]:
PLATEAU_PATIENCE = 1
EARLY_PATIENCE = 3
NUM_EPOCHS = 15
patience_counter = EARLY_PATIENCE
best_val_loss = 999

plateau_scheduler = ReduceLROnPlateau(
    optimizer,
    mode = 'min',
    patience = PLATEAU_PATIENCE,
    factor = 0.2,
    verbose = True
)

for i_epoch in range(NUM_EPOCHS):
    train_losses = []
    val_losses = []

    # optimization on training data
    model.train() 
    for x, y in tqdm(train_gen):
        x = x.to(device, dtype = torch.int64)
        y = y.to(device, dtype = torch.float32)  
        optimizer.zero_grad() 
        pred = model(x)
        loss = loss_fun(pred, y)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        
    # evaluation on validation data
    model.eval() 
    with torch.no_grad():
        for x, y in tqdm(val_gen):
            x = x.to(device, dtype = torch.int64)
            y = y.to(device, dtype = torch.float32)  
            pred = model(x)
            loss = loss_fun(pred, y)
            val_losses.append(loss.item())
    
    # display progress
    print(f'{i_epoch+1} | Train loss: {np.mean(train_losses):.4f} | Val loss: {np.mean(val_losses):.4f}')
    
    # check learning plateau criterion
    curr_val_loss = np.mean(val_losses)
    plateau_scheduler.step(curr_val_loss)
    
    # check early stopping criterion
    if curr_val_loss < best_val_loss:
        best_val_loss = curr_val_loss
        patience_counter = EARLY_PATIENCE # reset patience counter
        torch.save(model, './models/model.pth')
    else:
        patience_counter -= 1
        if patience_counter == 0:
            print('Early stopping')
            break

100%|██████████| 6375/6375 [03:55<00:00, 27.10it/s]
100%|██████████| 1125/1125 [00:36<00:00, 31.12it/s]
  0%|          | 3/6375 [00:00<04:33, 23.33it/s]

1 | Train loss: 0.4239 | Val loss: 0.4068


100%|██████████| 6375/6375 [03:54<00:00, 27.17it/s]
100%|██████████| 1125/1125 [00:36<00:00, 31.13it/s]
  0%|          | 3/6375 [00:00<04:42, 22.55it/s]

2 | Train loss: 0.4016 | Val loss: 0.4035


100%|██████████| 6375/6375 [03:53<00:00, 27.27it/s]
100%|██████████| 1125/1125 [00:36<00:00, 31.08it/s]
  0%|          | 3/6375 [00:00<04:30, 23.56it/s]

3 | Train loss: 0.3922 | Val loss: 0.4058


100%|██████████| 6375/6375 [03:47<00:00, 28.00it/s]
100%|██████████| 1125/1125 [00:34<00:00, 32.85it/s]
  0%|          | 3/6375 [00:00<04:11, 25.36it/s]

4 | Train loss: 0.3830 | Val loss: 0.4091
Epoch     4: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 6375/6375 [03:58<00:00, 26.73it/s]
100%|██████████| 1125/1125 [00:38<00:00, 29.14it/s]

5 | Train loss: 0.3634 | Val loss: 0.4119
Early stopping



