## Developing Hierarchical Classification Approach

Importing the libraries we need

In [1]:
import os
# Move the working directory one level up so that the relative imports and
# data paths used further down resolve from there.
# NOTE: this is not idempotent - re-running the cell moves up one more level.
os.chdir('..')

In [2]:
import pandas as pd
import numpy as np
import copy

import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split

# Enable debugging while on GPU
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [3]:
from ssoc_autocoder import processing

Importing our datasets

Define helper functions that encode each SSOC code as an integer class index (the mappings are stored in a dictionary), as PyTorch requires for classification targets

In [4]:

def generate_encoding(reference_data, ssoc_colname = 'SSOC 2020'):

    '''
    Builds the SSOC <-> class index lookup tables needed to train a
    multi-class classifier in PyTorch, one pair of tables per SSOC
    digit level.

    Args:
        reference_data: Pandas dataframe containing all SSOCs
        ssoc_colname: Name of the SSOC column

    Returns:
        Dictionary keyed by 'SSOC_1D' ... 'SSOC_5D'. Each entry holds
        'ssoc_idx' (SSOC prefix -> index, used to prepare the dataset) and
        'idx_ssoc' (index -> SSOC prefix, used to interpret predictions).
    '''

    # Work on the string form of the codes so prefixes can be taken by slicing
    codes = reference_data[ssoc_colname].astype('str')

    def _level_encoding(n_digits):
        # Unique n-digit prefixes, sorted so the indices are deterministic
        prefixes = codes.str.slice(0, n_digits).unique()
        ordered = list(np.sort(prefixes))

        forward = {ssoc: idx for idx, ssoc in enumerate(ordered)}
        backward = {idx: ssoc for ssoc, idx in forward.items()}

        return {'ssoc_idx': forward, 'idx_ssoc': backward}

    # One entry per digit level, from 1D to 5D
    return {f'SSOC_{n_digits}D': _level_encoding(n_digits) for n_digits in range(1, 6)}

def encode_dataset(data,
                   encoding,
                   ssoc_colname = 'SSOC 2020'):

    '''
    Uses the generated encoding to encode the SSOCs at each
    digit level.

    Rows whose SSOC contains the letter 'X' are dropped. The input
    dataframe is never modified; a filtered copy is returned.

    Args:
        data: Pandas dataframe of the training data with the correct SSOC
        encoding: Encoding for each SSOC level, as produced by
            generate_encoding (keys of the form 'SSOC_<n>D')
        ssoc_colname: Name of the SSOC column

    Returns:
        Pandas dataframe with one extra column per entry of `encoding`,
        holding the class index of the SSOC prefix at that digit level.
        Prefixes that are missing from the encoding become NaN.
    '''

    # Cast to string up front: the column may have been parsed as integers
    # (or contain NaN), in which case the .str accessor would fail
    ssoc_codes = data[ssoc_colname].astype('str')

    # Drop the SSOCs containing 'X' and copy only the rows that are kept
    keep = ~ssoc_codes.str.contains('X', regex = False)
    encoded_data = data[keep].copy()
    kept_codes = ssoc_codes[keep]

    # For each digit level, slice the prefix and look up its class index.
    # .map is a plain dictionary lookup, so it stays fast even for the
    # 5D level where the mapping has many entries
    for ssoc_level, encodings in encoding.items():
        n_digits = int(ssoc_level[5])
        encoded_data[ssoc_level] = kept_codes.str.slice(0, n_digits).map(encodings['ssoc_idx'])

    return encoded_data

# Create a new Python class to handle the additional complexity
class SSOC_Dataset(Dataset):

    '''
    PyTorch dataset that tokenizes one job description at a time and
    returns it together with its encoded SSOC target at every digit level.

    Args:
        dataframe: Pandas dataframe holding the text column and the
            encoded 'SSOC_1D' ... 'SSOC_5D' columns
        encoding: Encoding for each SSOC level (stored for reference)
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask'
        max_len: Length every sequence is padded / truncated to
        text_colname: Name of the text column. When left as None, the
            notebook-level colnames['job_description'] is used, which
            keeps existing callers working.
    '''

    # Define the class attributes
    def __init__(self, dataframe, encoding, tokenizer, max_len, text_colname = None):
        self.len = len(dataframe)
        self.data = dataframe
        self.encoding = encoding
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_colname = text_colname

    # Define the iterable over the Dataset object
    def __getitem__(self, index):

        # Resolve the text column lazily so the global is only needed
        # when no explicit column name was supplied
        text_colname = self.text_colname
        if text_colname is None:
            text_colname = colnames['job_description']

        # The DataLoader hands out positions, so index by position rather
        # than by label (works even if the dataframe index was not reset)
        text = self.data[text_colname].iloc[index]

        # Pass in the data into the tokenizer
        # padding = 'max_length' replaces the deprecated pad_to_max_length flag
        inputs = self.tokenizer(
            text = text,
            text_pair = None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            return_token_type_ids = True,
            truncation = True
        )

        # Return all the outputs needed for training and evaluation
        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype = torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype = torch.long)
        }
        for level in range(1, 6):
            level_name = f'SSOC_{level}D'
            item[level_name] = torch.tensor(self.data[level_name].iloc[index], dtype = torch.long)

        return item

    # Define the length attribute
    def __len__(self):
        return self.len

In [5]:
def prepare_data(encoded_data,
                 colnames,
                 parameters,
                 encoding = None):

    '''
    Splits the encoded data 80/20 into training and testing sets and wraps
    each one in a DataLoader.

    Args:
        encoded_data: Pandas dataframe with the encoded SSOC columns
        colnames: Dictionary of column names (accepted for interface
            compatibility; not used inside this function)
        parameters: Dictionary providing 'pretrained_model',
            'sequence_max_length', 'training_batch_size', 'num_workers'
            and, optionally, 'validation_batch_size'
        encoding: Encoding for each SSOC level, forwarded to SSOC_Dataset

    Returns:
        Tuple of (training_loader, testing_loader)
    '''

    # Hold out 20% of the data for testing (the remaining 80% is for training)
    training_data, testing_data = train_test_split(encoded_data,
                                                   test_size = 0.2,
                                                   random_state = 2021)

    # Re-number the rows from zero without mutating the split results in place
    training_data = training_data.reset_index(drop = True)
    testing_data = testing_data.reset_index(drop = True)

    tokenizer = DistilBertTokenizer.from_pretrained(parameters['pretrained_model'])
    max_len = parameters['sequence_max_length']

    # Creating the dataset and dataloader for the neural network
    training_loader = DataLoader(SSOC_Dataset(training_data, encoding, tokenizer, max_len),
                                 batch_size = parameters['training_batch_size'],
                                 num_workers = parameters['num_workers'],
                                 shuffle = True)

    # The testing loader uses its own batch size when one is configured and
    # keeps the rows in order, since shuffling brings nothing to evaluation
    testing_loader = DataLoader(SSOC_Dataset(testing_data, encoding, tokenizer, max_len),
                                batch_size = parameters.get('validation_batch_size',
                                                            parameters['training_batch_size']),
                                num_workers = parameters['num_workers'],
                                shuffle = False)

    return training_loader, testing_loader

def prepare_model(encoding, parameters):

    '''
    Builds the hierarchical SSOC classifier together with its loss
    function and optimizer.

    The model puts one classification stack per SSOC digit level on top of
    DistilBERT. The stack for level 1 sees the [CLS] embedding only; every
    deeper stack additionally sees the raw logits of all shallower levels,
    concatenated onto its input.

    Args:
        encoding: Encoding for each SSOC level; the number of classes of a
            level is the size of its 'ssoc_idx' mapping
        parameters: Dictionary providing 'max_level' (1 to 5),
            'pretrained_model', 'device' and 'learning_rate'

    Returns:
        Tuple of (model, loss_function, optimizer)

    Raises:
        ValueError: If 'max_level' is not between 1 and 5
    '''

    max_level = parameters['max_level']
    if not 1 <= max_level <= 5:
        raise ValueError(f"parameters['max_level'] must be between 1 and 5, got {max_level}")

    # Only the levels that are actually predicted need a class count
    levels = list(range(1, max_level + 1))
    class_counts = [len(encoding[f'SSOC_{level}D']['ssoc_idx']) for level in levels]

    def _build_stack(n_inputs, n_classes):
        # Same three-layer head for every level, only the widths differ
        return torch.nn.Sequential(
            torch.nn.Linear(n_inputs, n_inputs),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(n_inputs, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, n_classes)
        )

    class HierarchicalSSOCClassifier(torch.nn.Module):

        def __init__(self):

            super(HierarchicalSSOCClassifier, self).__init__()

            self.l1 = DistilBertModel.from_pretrained(parameters['pretrained_model'])

            # Width of the [CLS] embedding (768 for distilbert-base-uncased)
            n_inputs = self.l1.config.dim

            # Stacks are registered as ssoc_1d_stack, ssoc_2d_stack, ...
            # Each deeper stack is wider by the number of classes of the
            # level above it, because those logits are appended to its input
            for level, n_classes in zip(levels, class_counts):
                setattr(self, f'ssoc_{level}d_stack', _build_stack(n_inputs, n_classes))
                n_inputs += n_classes

        def forward(self, input_ids, attention_mask):

            # Obtain the sentence embeddings from the DistilBERT model
            embeddings = self.l1(input_ids=input_ids, attention_mask=attention_mask)
            hidden_state = embeddings[0]
            X = hidden_state[:, 0]

            predictions = {}

            for level in levels:

                # Predict this level from the current features
                logits = getattr(self, f'ssoc_{level}d_stack')(X)
                predictions[f'SSOC_{level}D'] = logits

                # Feed this level's logits into the next (deeper) level
                if level < max_level:
                    X = torch.cat((X, logits), dim = 1)

            return predictions

    model = HierarchicalSSOCClassifier()
    model.to(parameters['device'])
    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params =  model.parameters(), lr = parameters['learning_rate'])

    return model, loss_function, optimizer
    

In [6]:
import time
from datetime import datetime

def calculate_accu(big_idx, targets):
    '''
    Counts how many predicted class indices match the targets.

    Args:
        big_idx: Tensor of predicted class indices
        targets: Tensor of true class indices

    Returns:
        Number of matching positions, as a Python number
    '''
    hits = big_idx == targets
    return hits.sum().item()

def train_model(model, loss_function, optimizer, epochs, loader = None, params = None):

    '''
    Trains the hierarchical classifier and prints progress as it goes.

    The loss of a batch is the weighted sum of the per-level losses, and
    the reported accuracy is measured on the deepest level predicted.

    Args:
        model: Model returning a dictionary of logits keyed by SSOC level
        loss_function: Loss applied to the logits and targets of a level
        optimizer: Optimizer updating the model's parameters
        epochs: Number of passes over the training data
        loader: Iterable of training batches. Defaults to the notebook-level
            training_loader when left as None
        params: Dictionary providing 'device' and 'loss_weights'. Defaults to
            the notebook-level parameters when left as None

    Returns:
        None

    Raises:
        ValueError: If the loader yields no batches during an epoch
    '''

    # Fall back to the notebook-level objects so existing calls keep working
    if loader is None:
        loader = training_loader
    if params is None:
        params = parameters

    device = params['device']
    loss_weights = params['loss_weights']

    start_time = time.time()
    now = datetime.now()
    current_time = now.strftime("%d %b %Y - %H:%M:%S")
    print("Training started on:", current_time)

    for epoch in range(epochs):
        tr_loss = 0
        n_correct = 0
        nb_tr_steps = 0
        nb_tr_examples = 0

        epoch_start_time = time.time()
        batch_start_time = time.time()

        # Set the NN to train mode
        model.train()

        # Iterate over each batch
        for batch, data in enumerate(loader):

            # Extract the data
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)

            # Run the forward prop
            predictions = model(ids, mask)

            # Accumulate the weighted loss over every SSOC level. Starting
            # from None (instead of keying on 'SSOC_1D') guarantees a fresh
            # loss for every batch whatever the first level is called
            loss = None
            for ssoc_level, preds in predictions.items():

                # Extract the correct target for the SSOC level
                targets = data[ssoc_level].to(device, dtype = torch.long)

                # Compute the weighted loss of this level
                level_loss = loss_function(preds, targets) * loss_weights[ssoc_level]
                loss = level_loss if loss is None else loss + level_loss

            # Use the deepest level predictions to calculate accuracy:
            # preds and targets still hold the last level of the loop above
            _, top_probs_idx = torch.max(preds.detach(), dim = 1)
            n_correct += calculate_accu(top_probs_idx, targets)

            # Add this batch's loss to the overall training loss
            tr_loss += loss.item()

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (batch+1) % 500 == 0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Training Loss per 500 steps: {loss_step}")
                print(f"Training Accuracy per 500 steps: {accu_step}")
                print(f"Batch of 500 took {(time.time() - batch_start_time)/60:.2f} mins")
                batch_start_time = time.time()

        # Without any batch the averages below are undefined
        if nb_tr_steps == 0:
            raise ValueError("The training loader did not yield any batches")

        print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
        epoch_loss = tr_loss/nb_tr_steps
        epoch_accu = (n_correct*100)/nb_tr_examples
        print(f"Training Loss Epoch: {epoch_loss}")
        print(f"Training Accuracy Epoch: {epoch_accu}")
        print(f"Epoch training time: {(time.time() - epoch_start_time)/60:.2f} mins")

    print(f"Total training time: {(time.time() - start_time)/60:.2f} mins")
    now = datetime.now()
    current_time = now.strftime("%d %b %Y - %H:%M:%S")
    print("Training ended on:", current_time)

    return

In [7]:
# Names of the dataframe columns used throughout the notebook
colnames = {
    'SSOC': 'SSOC 2020',
    'job_description': 'Cleaned_Description'
}

# Everything a reader might want to tune for a run
parameters = {
    'sequence_max_length': 512,
    'max_level': 2,
    'training_batch_size': 4,
    'validation_batch_size': 2,
    'epochs': 1,
    'learning_rate': 1e-05,
    'pretrained_model': 'distilbert-base-uncased',
    'num_workers': 0,
    # Weight of each level's loss in the total loss
    'loss_weights': {
        'SSOC_1D': 20,
        'SSOC_2D': 5,
        'SSOC_3D': 3,
        'SSOC_4D': 2,
        'SSOC_5D': 1
    },
    # Use the GPU when one is available, otherwise fall back to the CPU
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

In [8]:
# NOTE: pandas is already imported in the imports cell at the top
import pandas as pd
# Dataset that gets encoded and split for training further down
data = pd.read_csv('Data/Processed/Training/train_full.csv')
# Frame passed to generate_encoding as the reference list of SSOCs
# NOTE(review): it is read from train.csv - confirm this file covers every SSOC
SSOC_2020 = pd.read_csv('Data/Processed/Training/train.csv')

In [9]:
# Build the SSOC <-> index mappings from the reference frame
encoding = generate_encoding(SSOC_2020)
# Only the first 5000 rows are encoded here
encoded_data = encode_dataset(data[0:5000], encoding)
# Split into training / testing loaders, then build the model, loss and optimizer
training_loader, testing_loader = prepare_data(encoded_data, colnames, parameters)
model, loss_function, optimizer = prepare_model(encoding, parameters)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Scratch calculation - presumably a training time estimate in hours for one
# epoch (batches of 4, 45 seconds per 50 batches); TODO confirm
len(data)/4/50*45/3600*1

2.67775

In [None]:
# Train for the configured number of epochs (progress is printed along the way)
train_model(model, loss_function, optimizer, parameters['epochs'])

In [None]:
# NOTE(review): the line below is not valid Python and raises a SyntaxError.
# It looks like a deliberate barrier that halts "Run All" before the older
# cells that follow - confirm before removing.
stop here

In [None]:
# Displays the first 5000 rows, the same slice that is encoded for training
data[0:5000]

In [None]:
# Defining some key variables that will be used later on in the training
# These module-level constants duplicate values held in the `parameters`
# dictionary and are used by the older cells below
MAX_LEN = 512
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
# Tokenizer matching the pretrained DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
class Triage(Dataset):

    '''
    PyTorch dataset that tokenizes the 'Description' column one row at a
    time and returns it with the encoded 1D and 2D SSOC targets.

    Args:
        dataframe: Pandas dataframe with the 'Description', 'SSOC_1D' and
            'SSOC_2D' columns
        tokenizer: Tokenizer exposing encode_plus and returning
            'input_ids' and 'attention_mask'
        max_len: Length every sequence is padded / truncated to
    '''

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):

        # The DataLoader hands out positions, so index by position rather
        # than by label (works even if the dataframe index was not reset)
        text = self.data['Description'].iloc[index]

        # padding = 'max_length' replaces the deprecated pad_to_max_length flag
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            return_token_type_ids = True,
            truncation = True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets_1d': torch.tensor(self.data['SSOC_1D'].iloc[index], dtype=torch.long),
            'targets_2d': torch.tensor(self.data['SSOC_2D'].iloc[index], dtype=torch.long),
        }

    def __len__(self):
        return self.len

In [None]:
# Creating the dataset and dataloader for the neural network
# NOTE(review): `train` and `test` are not defined as dataframes anywhere in
# the cells above, so this cell fails on a fresh kernel; `train` is also the
# name of the training function defined further down.
training_set = Triage(train, tokenizer, MAX_LEN)
testing_set = Triage(test, tokenizer, MAX_LEN)

In [None]:
# DataLoader settings for the training set
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# DataLoader settings for the testing set
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# NOTE: these assignments overwrite the training_loader / testing_loader
# created by prepare_data earlier in the notebook
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):

    '''
    DistilBERT encoder followed by two classification heads: one for the
    1D SSOC (9 classes) and one for the 2D SSOC (42 classes). The 2D head
    receives the [CLS] embedding concatenated with the 1D logits.
    '''

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Stack 1: Predicting 1D SSOC (9)
        self.ssoc_1d_stack = self._classification_head(768, 9)

        # Stack 2: Predicting 2D SSOC (40 + 2 nec)
        # 777 inputs = 768 embedding dimensions + 9 logits from stack 1
        self.ssoc_2d_stack = self._classification_head(777, 42)

    @staticmethod
    def _classification_head(n_inputs, n_classes):
        # Three linear layers with ReLU and dropout in between
        layers = [
            torch.nn.Linear(n_inputs, n_inputs),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(n_inputs, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(128, n_classes),
        ]
        return torch.nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask):

        # Sentence embedding: hidden state of the first token of each sequence
        encoder_output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = encoder_output[0][:, 0]

        # 1D Prediction
        preds_1d = self.ssoc_1d_stack(cls_embedding)

        # 2D Prediction, conditioned on the 1D logits
        features_2d = torch.cat((cls_embedding, preds_1d), dim = 1)
        preds_2d = self.ssoc_2d_stack(features_2d)

        return preds_1d, preds_2d

In [None]:
# NOTE: this overwrites the `model` returned by prepare_model earlier on
model = DistillBERTClass()

In [None]:
# NOTE(review): custom_loss_fn is only defined in the next cell, so this bare
# reference raises a NameError when the notebook is run top to bottom.
custom_loss_fn
# TODO: think of how to adjust the CrossEntropyLoss function
# TODO: change the targets upfront before passing them in

In [None]:
def compare_ssoc(predicted, actual):
    '''
    Scores how far a predicted SSOC is from the actual one, digit by digit.

    A mismatch at position i (counted from zero) costs 10 / (i + 1), so
    errors in the leading digits are penalised the most.

    Args:
        predicted: Predicted SSOC, indexable by digit position
        actual: Actual SSOC, at least as long as `predicted`

    Returns:
        Total penalty over all positions of `predicted`
    '''
    base_penalty = 10
    return sum(
        base_penalty/(position+1)
        for position in range(len(predicted))
        if predicted[position] != actual[position]
    )

def custom_loss_fn(top_probs_idx, targets, ssoc_level):
    '''
    Sums the digit-wise SSOC penalty (see compare_ssoc) over a batch.

    NOTE: the result is a brand new tensor built from a Python float. It is
    not connected to the model's outputs, so calling backward() on it does
    not propagate any gradient to the model.

    Args:
        top_probs_idx: Tensor of predicted class indices
        targets: Tensor of true class indices
        ssoc_level: '1d' or '2d', selecting the index -> SSOC mapping
            (the notebook-level idx_ssoc1d or idx_ssoc2d)

    Returns:
        Scalar float tensor with requires_grad set, holding the total penalty

    Raises:
        ValueError: If ssoc_level is neither '1d' nor '2d'
    '''

    if ssoc_level == '1d':
        mapping = idx_ssoc1d
    elif ssoc_level == '2d':
        mapping = idx_ssoc2d
    else:
        # Previously this fell through and failed later on an unbound name
        raise ValueError(f"ssoc_level must be '1d' or '2d', got {ssoc_level!r}")

    loss = 0

    for i in range(len(top_probs_idx)):
        predicted_ssoc = mapping[top_probs_idx[i].item()]
        actual_ssoc = mapping[targets[i].item()]
        loss += compare_ssoc(predicted_ssoc, actual_ssoc)

    # torch.tensor replaces the deprecated Variable wrapper
    return torch.tensor(float(loss), requires_grad = True)

# need to use Torch variable

In [None]:
# Scratch check: no backward pass has been run on this freshly created
# tensor before its .grad attribute is printed
testing1 = Variable(torch.tensor([float(5), float(15)]), requires_grad = True)
print(testing1.grad)

In [None]:
# Scratch check: display a scalar tensor created with requires_grad set
Variable(torch.tensor(float(1)), requires_grad = True)

In [None]:
# Creating the loss function and optimizer
# NOTE: these overwrite the loss_function / optimizer returned by
# prepare_model earlier on; the optimizer is bound to the current `model`
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    '''
    Counts how many predicted class indices match the targets.

    Args:
        big_idx: Tensor of predicted class indices
        targets: Tensor of true class indices

    Returns:
        Number of matching positions, as a Python number
    '''
    hits = big_idx == targets
    return hits.sum().item()

In [None]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    '''
    Runs one pass over training_loader, updating the model after each batch.

    Relies on the notebook-level model, training_loader, device,
    loss_function and optimizer. The batch loss is five times the 1D loss
    plus the 2D loss; accuracy is measured on the 2D predictions.

    Args:
        epoch: Epoch number, only used in the printed summary

    Returns:
        None
    '''
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    
    # Set the NN to train mode
    model.train()
    
    # Iterate over each batch
    for batch, data in enumerate(training_loader):
        
        # Extract the data
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets_1d = data['targets_1d'].to(device, dtype = torch.long)
        targets_2d = data['targets_2d'].to(device, dtype = torch.long)
        
        # Run the forward prop
        preds_1d, preds_2d = model(ids, mask)
        
        # Find the indices of the top prediction
        top_probs_1d, top_probs_idx_1d = torch.max(preds_1d.data, dim = 1)
        top_probs_2d, top_probs_idx_2d = torch.max(preds_2d.data, dim = 1)
        
        # Calculate the loss
        
        loss1 = loss_function(preds_1d, targets_1d)
        loss2 = loss_function(preds_2d, targets_2d)
        # The 1D loss is weighted five times as much as the 2D loss
        loss = loss1*5 + loss2
        #print(f'Overall loss: {loss} = {loss1} + {loss2}')

        # Deprecated
        #loss = loss_function(preds_1d, targets_1d) + loss_function(preds_2d, targets_2d)
        
        # Add this batch's loss to the overall training loss
        tr_loss += loss.item()
        
        # Accuracy is tracked on the 2D predictions only
        n_correct += calcuate_accu(top_probs_idx_2d, targets_2d)

        nb_tr_steps += 1
        nb_tr_examples += targets_2d.size(0)
        
        # Report running averages every 50 batches (including the very first one)
        if batch % 50 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 50 steps: {loss_step}")
            print(f"Training Accuracy per 50 steps: {accu_step}")

        # Clear old gradients, backpropagate this batch's loss, update the weights
        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    # NOTE: the divisions below fail if training_loader yielded no batches
    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [None]:
# Train for a single epoch
for epoch in range(1):
    train(epoch)

In [None]:
# Train for four more epochs; the epoch number printed restarts from 0
for epoch in range(4):
    train(epoch)

In [None]:
# Scratch check of the modulo operator (evaluates to 0)
100 % 100