In [1]:
%matplotlib inline
from pathlib import Path
import pandas as pd

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split

In [2]:
# Root of the data directory, relative to the notebook's working directory.
data_path = Path('../data/')
# Read the interim `fed_emp.csv` extract into a DataFrame.
df = pd.read_csv(data_path.joinpath('interim', 'fed_emp.csv'))

In [3]:
# Remove the rows for which the target (EDLVL) is unknown.
# The explicit copy makes `df` an independent frame, so the `.loc` assignments
# below write into `df` itself instead of into a slice of the raw frame
# (which would trigger pandas' SettingWithCopyWarning).
df                                   = df[~df.EDLVL.isnull()].copy()
# Handle the remaining NaN values column by column:
#   - GSEGRD / OCC: missing values become a new category, encoded as 0;
#   - SUPERVIS / TOA / SALARY / LOS: rows with a missing value are dropped.
df.loc[df.GSEGRD.isnull(), 'GSEGRD'] = 0
df.loc[df.OCC.isnull(), 'OCC']       = 0
df                                   = df[~df.SUPERVIS.isnull()]
df                                   = df[~df.TOA.isnull()]
df                                   = df[~df.SALARY.isnull()]
df                                   = df[~df.LOS.isnull()]
# Optional check of the remaining NaN count per column:
# df.isnull().sum(axis = 0)

In [4]:
# Target: EDLVL shifted down by one, so the values lie between 0 and 21
# instead of 1 and 22.
df_target = df['EDLVL'] - 1
# Features: every column except the target.
df_data = df.drop(columns = ['EDLVL'])
# Optional look at the class distribution:
# df_target.hist(bins = 22)

In [5]:
# Columns treated as continuous; every other feature column is categorical.
numerical_columns = ['SALARY', 'LOS']
df_cat            = df_data.drop(columns = numerical_columns)
df_num            = df_data[numerical_columns]
# Standardise each numerical column to zero mean / unit standard deviation.
# NOTE: the statistics are computed over all rows, i.e. before the
# train/validation split performed later in the notebook.
num_val_mean      = df_num.mean(axis = 0)
num_val_std       = df_num.std(axis = 0)
df_num            = df_num.sub(num_val_mean).div(num_val_std)

In [6]:
def _index_distinct_values(series):
    """Map each distinct value of `series` to its position in `series.unique()`."""
    return {val: idx for idx, val in enumerate(series.unique())}


# One {raw value -> integer code} dictionary per categorical column.
columns_encoders = {col: _index_distinct_values(df[col]) for col in df_cat.columns}
# Fixed column ordering reused when building tensors and embeddings.
column_order = list(columns_encoders)

In [7]:
# Replace every categorical value by its integer code.
# `Series.map` with a dict performs the lookup in a vectorised way instead of
# calling a Python lambda once per element. The explicit cast keeps the
# integer dtype and fails loudly if any value had no code (it would
# otherwise silently become NaN).
for col in df_cat.columns:
    df_cat[col] = df_cat[col].map(columns_encoders[col]).astype('int64')

In [8]:
class CategoricalEmbeddings(nn.Module):
    """Embed several categorical columns and concatenate the results.

    One `nn.Embedding` table is created per column listed in `col_order`.

    Args:
        col_order: iterable of column names; fixes the order in which the
            per-column embeddings are concatenated.
        col_encoders: mapping column name -> encoder whose length is the
            number of distinct categories of that column.
        col_to_emb_dim: mapping column name -> embedding dimension.
    """

    def __init__(self, col_order, col_encoders, col_to_emb_dim):
        super().__init__()
        self.col_order = col_order
        tables = {}
        for col in col_order:
            n_categories = len(col_encoders[col])
            tables[col] = nn.Embedding(n_categories, col_to_emb_dim[col])
        self.cat_embs = nn.ModuleDict(tables)

    def forward(self, cat_variables):
        """Look up each column's embedding and concatenate them along dim 1.

        Args:
            cat_variables: mapping column name -> tensor of category codes
                (presumably 1-D, one code per sample, so that dim 1 is the
                embedding feature axis).

        Returns:
            The per-column embeddings concatenated in `col_order` order.
        """
        looked_up = []
        for col in self.col_order:
            table = self.cat_embs[col]
            looked_up.append(table(cat_variables[col]))

        return torch.cat(looked_up, dim = 1)

In [9]:
class EdlvlClassifier(nn.Module):
    """Feed-forward classifier over embedded categorical and raw numerical features.

    The categorical columns are embedded by `CategoricalEmbeddings`,
    concatenated with the numerical features, and passed through one hidden
    linear layer followed by an output linear layer.

    Args:
        col_order: names of the categorical columns, in concatenation order.
        col_encoders: mapping column name -> encoder whose length is the
            number of categories of that column.
        col_to_emb_dim: mapping column name -> embedding dimension.
        lin_size: width of the hidden linear layer.
        dropout_rate: dropout probability, applied both to the concatenated
            input and to the hidden layer's output.
        num_numerical: number of numerical features concatenated to the
            embeddings (defaults to the original hard-coded 2).
        num_classes: number of output classes (defaults to the original
            hard-coded 22).
    """

    def __init__(self, col_order, col_encoders, col_to_emb_dim, lin_size = 256, dropout_rate = 0.2,
                 num_numerical = 2, num_classes = 22):
        super(EdlvlClassifier, self).__init__()
        self.cat_emb    = CategoricalEmbeddings(col_order, col_encoders, col_to_emb_dim)
        # Only the columns actually embedded (those in `col_order`) contribute
        # to the input width; extra keys in `col_to_emb_dim` must be ignored,
        # otherwise `linear1` would not match the concatenated tensor.
        sum_cat_emb_dim = sum(col_to_emb_dim[col] for col in col_order)
        self.linear1    = nn.Linear(sum_cat_emb_dim + num_numerical, lin_size)
        self.linear2    = nn.Linear(lin_size, num_classes)
        self.dropout    = nn.Dropout(dropout_rate)

    def forward(self, cat_variables, num_variables):
        """Compute the unnormalised class scores for a batch.

        Args:
            cat_variables: mapping column name -> tensor of category codes,
                forwarded to the embedding module.
            num_variables: tensor of numerical features, concatenated to the
                embeddings along dim 1.

        Returns:
            The output of the last linear layer (no softmax is applied).
        """
        cat_embeddings = self.cat_emb(cat_variables)
        cat_num_tensor = torch.cat([cat_embeddings, num_variables], dim = 1)
        # Dropout is applied to the input features ...
        cat_num_tensor = self.dropout(cat_num_tensor)
        # ... and again to the hidden pre-activations, before the ReLU.
        out_linear1    = F.relu(self.dropout(self.linear1(cat_num_tensor)))
        out_linear2    = self.linear2(out_linear1)

        return out_linear2

In [10]:
# Every categorical column gets an embedding of dimension 4; the other
# hyper-parameters keep their default values.
model = EdlvlClassifier(
    column_order,
    columns_encoders,
    dict.fromkeys(columns_encoders, 4),
)
model

EdlvlClassifier(
  (cat_emb): CategoricalEmbeddings(
    (cat_embs): ModuleDict(
      (AGELVL): Embedding(12, 4)
      (AGYSUB): Embedding(523, 4)
      (GSEGRD): Embedding(16, 4)
      (LOC): Embedding(219, 4)
      (LOSLVL): Embedding(10, 4)
      (OCC): Embedding(656, 4)
      (PATCO): Embedding(7, 4)
      (PPGRD): Embedding(933, 4)
      (SALLVL): Embedding(25, 4)
      (STEMOCC): Embedding(100, 4)
      (SUPERVIS): Embedding(6, 4)
      (TOA): Embedding(18, 4)
      (WORKSCH): Embedding(10, 4)
      (WORKSTAT): Embedding(2, 4)
    )
  )
  (linear1): Linear(in_features=58, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=22, bias=True)
  (dropout): Dropout(p=0.2)
)

In [11]:
# One tensor per categorical column, in the same order as `column_order`.
cat_tensors   = [torch.tensor(df_cat[col].values) for col in column_order]
# Numerical variables, as float32.
num_tensor    = torch.tensor(df_num.values, dtype = torch.float32)
# Target variable, as int64.
target_tensor = torch.tensor(df_target.values, dtype = torch.int64)
# Each sample is laid out as (*categorical codes, numerical features, target).
dataset       = TensorDataset(*cat_tensors, num_tensor, target_tensor)

In [12]:
# Seed the global RNG right before splitting: `random_split` draws its
# permutation from it, so without a fixed seed every re-run of this cell moves
# rows between the training and validation sets (and would leak training rows
# into validation for an already-trained model).
split_seed                   = 0
torch.manual_seed(split_seed)
dataset_size                 = len(dataset)
# Proportion of the samples held out for validation.
valid_prop                   = 0.2
valid_size                   = round(valid_prop * dataset_size)
lengths                      = [dataset_size - valid_size, valid_size]
train_dataset, valid_dataset = random_split(dataset, lengths)

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device       = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The model is moved to the device before the optimizer is built from its parameters.
model        = model.to(device)
epochs       = 20
batch_size   = 2048
optimizer    = optim.Adam(model.parameters())
criterion    = nn.CrossEntropyLoss()
# Training batches are reshuffled at every epoch; validation order is fixed.
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = False)

In [None]:
def _batch_to_inputs(batch, device, column_order):
    """Move one batch to `device` and split it into the model inputs and the target.

    `batch` is laid out as (*categorical tensors, numerical tensor, target).
    Returns `(cat_variables, num_var, y)` where `cat_variables` maps each
    name of `column_order` to its categorical tensor.
    """
    *cat_var_list, num_var, y = [t.to(device) for t in batch]
    return dict(zip(column_order, cat_var_list)), num_var, y


def _evaluate(model, loader, device, column_order):
    """Return `(correct, total)` prediction counts of `model` over `loader`.

    The model is switched to eval mode for the pass (dropout disabled, no
    gradients tracked) and put back in train mode before returning.
    """
    model.eval()
    n_correct = 0
    n_total   = 0
    with torch.no_grad():
        for batch in loader:
            cat_variables, num_var, y = _batch_to_inputs(batch, device, column_order)
            predictions = model(cat_variables, num_var).argmax(dim = 1)
            n_correct  += (predictions == y).sum().item()
            n_total    += y.shape[0]
    model.train()
    return n_correct, n_total


for epoch in range(epochs):
    # Running training accuracy of the current epoch. It is measured on the
    # training-mode forward pass (dropout active), so it is not directly
    # comparable with the validation accuracy.
    correct = 0
    total   = 0
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        cat_variables, num_var, y = _batch_to_inputs(batch, device, column_order)
        res            = model(cat_variables, num_var)
        loss           = criterion(res, y)
        correct       += (res.argmax(dim = 1) == y).sum().item()
        total         += y.shape[0]
        loss.backward()
        optimizer.step()
        # Every 300 training batches, evaluate on the whole validation set.
        # The evaluation runs inside a helper so that it cannot overwrite the
        # variables of the current training batch.
        if i % 300 == 0:
            valid_correct, valid_total = _evaluate(model, valid_loader, device, column_order)
            print(valid_correct, valid_total)
            print(f'[{epoch}:{i}] [T] {100. * correct / total:5.2f}%, [V] {100. * valid_correct / valid_total:5.2f}% {loss.item():5.2f}')

56826 362579
[0:0] [T]  5.96%, [V] 15.67%  3.06
166021 362579
[0:300] [T] 40.04%, [V] 45.79%  1.73
170166 362579
[0:600] [T] 42.26%, [V] 46.93%  1.67
171534 362579
[1:0] [T] 47.41%, [V] 47.31%  1.63
173776 362579
[1:300] [T] 46.09%, [V] 47.93%  1.63
174963 362579
[1:600] [T] 46.40%, [V] 48.26%  1.64
175415 362579
[2:0] [T] 48.34%, [V] 48.38%  1.65
176260 362579
[2:300] [T] 47.19%, [V] 48.61%  1.63
177352 362579
[2:600] [T] 47.35%, [V] 48.91%  1.59
177511 362579
[3:0] [T] 49.61%, [V] 48.96%  1.55
178011 362579
[3:300] [T] 47.84%, [V] 49.10%  1.59
178762 362579
[3:600] [T] 47.89%, [V] 49.30%  1.58
178623 362579
[4:0] [T] 48.78%, [V] 49.26%  1.58
179168 362579
[4:300] [T] 48.22%, [V] 49.41%  1.55
