In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification

# Enable debugging while on GPU
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

## Run only once

In [2]:
import pandas as pd
SSOC_Definitions = pd.read_excel('../Data/Raw/SSOC2020 Detailed Definitions.xlsx', skiprows = 4)

  warn("""Cannot parse header or footer so it will be ignored""")


In [3]:
SSOC_4D = SSOC_Definitions[SSOC_Definitions['SSOC 2020'].apply(len) == 4][['SSOC 2020', 'Tasks']]
SSOC_4D.columns = ['4D SSOC', 'Tasks']
SSOC_5D = SSOC_Definitions[(SSOC_Definitions['SSOC 2020'].apply(len) == 5) & ~SSOC_Definitions['SSOC 2020'].str.contains('X')].reset_index(drop = True)
SSOC_5D['4D SSOC'] = SSOC_5D['SSOC 2020'].str.slice(0, 4)
SSOC_5D.drop('Tasks', axis = 1, inplace = True)
SSOC_Final = SSOC_5D.merge(SSOC_4D, how = 'left', on = '4D SSOC')
SSOC_Final['Description'] = SSOC_Final['Detailed Definitions'] + " " + SSOC_Final['Tasks'].str.replace('\n', '').str.replace('-', '.')
df = SSOC_Final[['SSOC 2020', 'Description']]

In [53]:
df.drop('ENCODE_CAT', axis = 1).to_csv('../Data/Processed/Training/train.csv', index = False)

In [4]:
mcf_labelled = pd.read_csv('../Data/Processed/Artifacts/MCF_Subset_WithLabels.csv')

In [5]:
mcf_validation = mcf_labelled.sample(50).reset_index(drop = True)
mcf_validation = mcf_validation.merge(mcf_labelled[mcf_labelled['Job_ID'].isin(mcf_validation['Job_ID'])][['Job_ID', 'Description']],
                     how = 'left',
                     on = 'Job_ID')

In [54]:
testing = mcf_validation[['Predicted SSOC', 'Cleaned_Description']]
testing.columns = ['SSOC 2020', 'Description']
testing.to_csv('../Data/Processed/Training/test.csv', index = False)

## Main

In [2]:
train = pd.read_csv('../Data/Processed/Training/train.csv')
test = pd.read_csv('../Data/Processed/Training/test.csv')

In [3]:
def encode_category(ssocs):
    output = {}
    for i, ssoc in enumerate(ssocs):
        output[ssoc] = i
    return output

In [4]:
ssoc1d_idx = encode_category(train['SSOC 2020'].astype('str').str.slice(0, 1).unique())
idx_ssoc1d = {v: k for k, v in ssoc1d_idx.items()}

ssoc2d_idx = encode_category(train['SSOC 2020'].astype('str').str.slice(0, 2).unique())
idx_ssoc2d = {v: k for k, v in ssoc2d_idx.items()}

train['SSOC_1D'] = train['SSOC 2020'].astype('str').str.slice(0, 1).replace(ssoc1d_idx)
test['SSOC_1D'] = test['SSOC 2020'].astype('str').str.slice(0, 1).replace(ssoc1d_idx)

train['SSOC_2D'] = train['SSOC 2020'].astype('str').str.slice(0, 2).replace(ssoc2d_idx)
test['SSOC_2D'] = test['SSOC 2020'].astype('str').str.slice(0, 2).replace(ssoc2d_idx)

In [5]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [6]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        
        text = self.data.Description[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            pad_to_max_length = True,
            return_token_type_ids = True,
            truncation = True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets_1d': torch.tensor(self.data.SSOC_1D[index], dtype=torch.long),
            'targets_2d': torch.tensor(self.data.SSOC_2D[index], dtype=torch.long),
        } 
    
    def __len__(self):
        return self.len

In [7]:
# Creating the dataset and dataloader for the neural network
training_set = Triage(train, tokenizer, MAX_LEN)
testing_set = Triage(test, tokenizer, MAX_LEN)

In [8]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [9]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        
        # Stack 1: Predicting 1D SSOC (9)
        self.ssoc_1d_stack = torch.nn.Sequential(
            torch.nn.Linear(768, 768), 
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(768, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 9)
        )
        
        # Stack 2: Predicting 2D SSOC (40 + 2 nec)
        self.ssoc_2d_stack = torch.nn.Sequential(
            torch.nn.Linear(777, 777), 
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(777, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, 42)
        )        

    def forward(self, input_ids, attention_mask):
        
        # Obtain the sentence embeddings from the DistilBERT model
        embeddings = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = embeddings[0]
        X = hidden_state[:, 0]
        
        # 1D Prediction
        preds_1d = self.ssoc_1d_stack(X)
        
        # 2D Prediction
        X = torch.cat((X, preds_1d), dim = 1)
        preds_2d = self.ssoc_2d_stack(X)
        
        return preds_1d, preds_2d

In [10]:
model = DistillBERTClass()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
custom_loss_fn
# think of how to adjust the crossentropyloss function
# change the targets upfront before passing it in

'43'

In [12]:
def compare_ssoc(predicted, actual):
    base_penalty = 10
    penalty = 0
    for i in range(len(predicted)):
        if predicted[i] != actual[i]:
            penalty += base_penalty/(i+1)
    return penalty

def custom_loss_fn(top_probs_idx, targets, ssoc_level):
          
    if ssoc_level == '1d':
          mapping = idx_ssoc1d
    elif ssoc_level == '2d':
          mapping = idx_ssoc2d
          
    loss = 0
    
    for i in range(len(top_probs_idx)):
        predicted_ssoc = mapping[top_probs_idx[i].item()]
        actual_ssoc = mapping[targets[i].item()]
        loss += compare_ssoc(predicted_ssoc, actual_ssoc)
        
    return Variable(torch.tensor(float(loss)), requires_grad = True)

# need to use Torch variable

In [22]:
testing1 = Variable(torch.tensor([float(5), float(15)]), requires_grad = True)
print(testing1.grad)

None


In [13]:
Variable(torch.tensor(float(1)), requires_grad = True)

tensor(1., requires_grad=True)

In [14]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [15]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [19]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    
    # Set the NN to train mode
    model.train()
    
    # Iterate over each batch
    for batch, data in enumerate(training_loader):
        
        # Extract the data
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets_1d = data['targets_1d'].to(device, dtype = torch.long)
        targets_2d = data['targets_2d'].to(device, dtype = torch.long)
        
        # Run the forward prop
        preds_1d, preds_2d = model(ids, mask)
        
        # Find the indices of the top prediction
        top_probs_1d, top_probs_idx_1d = torch.max(preds_1d.data, dim = 1)
        top_probs_2d, top_probs_idx_2d = torch.max(preds_2d.data, dim = 1)
        
        # Calculate the loss
        
        loss1 = custom_loss_fn(top_probs_idx_1d, targets_1d, '1d')
        loss2 = custom_loss_fn(top_probs_idx_2d, targets_2d, '2d')
        loss = loss1 + loss2
        #print(f'Overall loss: {loss} = {loss1} + {loss2}')

        # Deprecated
        #loss = loss_function(preds_1d, targets_1d) + loss_function(preds_2d, targets_2d)
        
        # Add this batch's loss to the overall training loss
        tr_loss += loss.item()
        
        n_correct += calcuate_accu(top_probs_idx_2d, targets_2d)

        nb_tr_steps += 1
        nb_tr_examples += targets_2d.size(0)
        
        if batch % 50 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 50 steps: {loss_step}")
            print(f"Training Accuracy per 50 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        print(loss1.grad)
        print(loss2.grad)
        print(loss.grad)
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [17]:
device = 'cuda'
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [20]:
for epoch in range(1):
    train(epoch)

Training Loss per 50 steps: 50.0
Training Accuracy per 50 steps: 0.0
tensor(1.)
tensor(1.)
None


  print(loss.grad)


tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None
tensor(1.)
tensor(1.)
None


KeyboardInterrupt: 