#### Installing the required libraries

In [None]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Download

### Importing the necessary libraries

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import os
import sys

import numpy as np
import random as rn
import pandas as pd
import re
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EvalPrediction
from datasets import load_dataset

from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

#### Important Parameters


In [None]:
MAX_LENGTH = 512 # maximum number of tokens per text; the tokenizer pads and truncates every example to this length
BATCH_SIZE = 8 # per-device batch size for training and evaluation; larger values need more memory
EPOCHS = 5 # number of training epochs; more epochs need more time to complete the training process
THRESHOLD = 0.5 # presumably the probability cut-off for turning sigmoid outputs into 0/1 labels -- TODO confirm it is actually passed to the metrics
METRIC_NAME = "f1" # metric the Trainer uses to select the best checkpoint

#### Setting up data path

In [None]:
# Directory holding the pre-split corpus (train.csv / val.csv / test.csv).
# Other locations this notebook has been run from:
#   '/content/drive/MyDrive/Colab Notebooks/cv_splits_csv'
#   '/projets/sig/mullah/nlp/cv/data/corpus_splits/'
# Set the SKILLS_DATA_PATH environment variable to switch location without editing this cell.
root_data_path = os.environ.get(
    'SKILLS_DATA_PATH',
    '/content/drive/MyDrive/ColabNotebooks/MSc-dissertation/skill-prediction/preprocess/data/cv_splits_csv',
)

# Directory that receives the fine-tuned model checkpoints
# (e.g. '/projets/sig/mullah/nlp/cv/models' on the cluster; create it beforehand if needed).
# Set the SKILLS_MODEL_DIR environment variable to override.
root_data_dir = os.environ.get(
    'SKILLS_MODEL_DIR',
    '/content/drive/MyDrive/ColabNotebooks/MSc-dissertation/skill-prediction/models',
)

#### Loading train, validation, and test datasets

In [None]:
# Map every split name to its CSV file inside root_data_path
# (note that the validation split is stored as "val.csv").
data_files = {
    split_name: os.path.join(root_data_path, csv_name)
    for split_name, csv_name in (
        ("train", "train.csv"),
        ("validation", "val.csv"),
        ("test", "test.csv"),
    )
}

# One DatasetDict with a 'train', 'validation' and 'test' entry.
dataset = load_dataset('csv', data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: ignored

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Id', 'Text', 'Tags'],
        num_rows: 17870
    })
    validation: Dataset({
        features: ['Id', 'Text', 'Tags'],
        num_rows: 5957
    })
    test: Dataset({
        features: ['Id', 'Text', 'Tags'],
        num_rows: 5956
    })
})

#### Initializing random seed values to stabilize the outcomes.

In [None]:
# One seed value shared by every random number generator used in the notebook.
SEED = 321

# Seed, in order: Python's `random`, NumPy's global generator, PyTorch, and the current CUDA device.
for seed_fn in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

In [None]:
dataset['train']['Tags'][:5]

["['Software_Developer']",
 "['Network_Administrator']",
 "['Project_manager']",
 "['Web_Developer', 'Software_Developer']",
 "['Web_Developer', 'Software_Developer', 'Front_End_Developer']"]

Two critical problems here: <br>
 **Multiple labels** for each sample<br>
 **Labels** are stored as the string representation of a list<br>
We have to solve both problems: convert each string back into a Python list, then binarize the multi-label targets of the samples.

#### Encoding labels of train, validation, and test set
**Converting** the string representation of each list into a real list using the *literal_eval* function. <br>
Using MultiLabelBinarizer to encode the multi-label targets as multi-hot vectors


In [None]:
# Each 'Tags' cell holds the string form of a list (e.g. "['Web_Developer', 'Software_Developer']");
# literal_eval turns it back into a real Python list, split by split.
train_labels, validation_labels, test_labels = (
    [literal_eval(raw_tags) for raw_tags in dataset[split_name]['Tags']]
    for split_name in ('train', 'validation', 'test')
)

In [None]:
train_labels[:5]

[['Software_Developer'],
 ['Network_Administrator'],
 ['Project_manager'],
 ['Web_Developer', 'Software_Developer'],
 ['Web_Developer', 'Software_Developer', 'Front_End_Developer']]

Fitting the _MultiLabelBinarizer_ on the train subset labels

In [None]:
# Learn the label vocabulary from the training split only; the fitted binarizer
# is reused later to encode the tags of every split.
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(train_labels)

MultiLabelBinarizer()

In [None]:
multilabel_binarizer.classes_

array(['Database_Administrator', 'Front_End_Developer', 'Java_Developer',
       'Network_Administrator', 'Project_manager', 'Python_Developer',
       'Security_Analyst', 'Software_Developer', 'Systems_Administrator',
       'Web_Developer'], dtype=object)

In [None]:
# Class names in the column order used by the binarizer.
labels = multilabel_binarizer.classes_

# Forward (index -> name) and reverse (name -> index) lookups handed to the model config.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

print ("Labels: ", labels)
print ("Id2Labels: ", id2label)
print ("Labels2Id: ", label2id)

Labels:  ['Database_Administrator' 'Front_End_Developer' 'Java_Developer'
 'Network_Administrator' 'Project_manager' 'Python_Developer'
 'Security_Analyst' 'Software_Developer' 'Systems_Administrator'
 'Web_Developer']
Id2Labels:  {0: 'Database_Administrator', 1: 'Front_End_Developer', 2: 'Java_Developer', 3: 'Network_Administrator', 4: 'Project_manager', 5: 'Python_Developer', 6: 'Security_Analyst', 7: 'Software_Developer', 8: 'Systems_Administrator', 9: 'Web_Developer'}
Labels2Id:  {'Database_Administrator': 0, 'Front_End_Developer': 1, 'Java_Developer': 2, 'Network_Administrator': 3, 'Project_manager': 4, 'Python_Developer': 5, 'Security_Analyst': 6, 'Software_Developer': 7, 'Systems_Administrator': 8, 'Web_Developer': 9}


#### Preprocess data (Encoding)
BERT doesn't expect text as direct input, but rather text encoding in terms of *input_ids*, *attention masks*, etc. We tokenise the text using the BERT's tokenizer (**AutoTokenizer** API from Huggingface)

In [None]:
# Make sure the NLTK resources used below are installed; a fresh runtime does not ship with them.
# Newer NLTK releases load 'punkt_tab' for word_tokenize, older ones load 'punkt'.
for _resource_path, _package in (
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package, quiet=True)

lemmatizer = WordNetLemmatizer()

# Built once instead of on every call: the stop-word list does not change between texts.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

def clean_resume_text( raw_text ):
    '''
        Normalise one resume before tokenisation.

        Steps: strip HTML markup, replace every non-alphanumeric character
        with a space, lower-case, split into words, drop English stop words,
        lemmatize each remaining word and join the result with single spaces.

        raw_text: the resume text (may contain HTML); None is treated as empty.
        returns:  the cleaned text as a single string ('' for None input).
    '''
    # Missing cells in the CSV arrive as None and would crash the HTML parser.
    if raw_text is None:
        return ''

    # An explicit parser keeps the result independent of which optional
    # parsers are installed and avoids the "no parser specified" warning.
    escaped_text = BeautifulSoup(raw_text, 'html.parser').get_text()
    alphanum_text = re.sub("[^a-zA-Z0-9]", " ", escaped_text)
    alphanum_lower_text = alphanum_text.lower()

    #Tokenize text into words
    words = word_tokenize(alphanum_lower_text)

    #Remove stop words, then reduce each remaining word to its lemma
    lemmatized_words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in ENGLISH_STOP_WORDS
    ]

    return ' '.join( lemmatized_words )

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def preprocess_data(examples):
    '''
        Convert a batch of raw rows into model inputs plus multi-hot labels.

        examples: batch mapping with a 'Text' column (raw resume texts) and a
                  'Tags' column (string representations of label lists).
        returns:  the tokenizer encoding of the cleaned texts, padded and
                  truncated to MAX_LENGTH, with an extra 'labels' entry
                  holding one float32 multi-hot vector per example.
    '''
    #clean, then encode the whole batch of texts at once
    cleaned_batch = list(map(clean_resume_text, examples['Text']))
    batch_encoding = tokenizer(
        cleaned_batch,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    #parse the tag strings and binarize them with the binarizer fitted on the train labels
    parsed_tags = [literal_eval(raw_tags) for raw_tags in examples['Tags']]
    multi_hot = np.array(multilabel_binarizer.transform(parsed_tags), dtype=np.float32)
    batch_encoding['labels'] = multi_hot.tolist()

    return batch_encoding

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /users/sig/mullah/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncase

In [None]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
encoded_dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [None]:
check_idx = 1

In [None]:
example = encoded_dataset['train'][check_idx]['input_ids']
len(example)

512

In [None]:
tokenizer.decode(example)

'[CLS] network administratoir network administratoir network administrator bergquist company river fall wi network experience lan wan window nt 2000 2003 2008r2 managed switch extreme network hp procurve san dell equallogic emc vnx na qnap ad dhcp dns win network monitoring solarwinds netflow avaya voip shoretel voip siemens voip router firewall vpn cisco asa watchguard sonicwall nsa tz series application experience m application primarily m office 97 2013 solidworks autocad epicor vantage corvu backup exec arcserver dp backup veeam blackberry server dreamweaver operating system experience m do window 3 1 95 98 nt w 2000 pro xp pro home vista 7 window server nt 2000 2003 2008 2012 exchange server 5 5 2000 2003 redhat centos suse vmware esx 3 5 vsphere 4 x 5 x programming experience basic c html work experience network administratoir bergquist company chanhassen mn march 2014 present responsibility maintain wan connection three site setup maintain 100 window 2003 2008 2012 server setup 

In [None]:
encoded_dataset['train'][check_idx]['labels']

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [None]:
labels

array(['Database_Administrator', 'Front_End_Developer', 'Java_Developer',
       'Network_Administrator', 'Project_manager', 'Python_Developer',
       'Security_Analyst', 'Software_Developer', 'Systems_Administrator',
       'Web_Developer'], dtype=object)

In [None]:
[id2label.get(idx) for idx, label in enumerate(encoded_dataset['train'][check_idx]['labels']) if label == 1.0]

['Network_Administrator']

In [None]:
encoded_dataset.set_format("torch")

### Multi-label Classification

In [None]:
# BERT encoder with a freshly initialised classification head, one output per label;
# problem_type="multi_label_classification" selects the multi-label loss.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /users/sig/mullah/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Database_Administrator",
    "1": "Front_End_Developer",
    "2": "Java_Developer",
    "3": "Network_Administrator",
    "4": "Project_manager",
    "5": "Python_Developer",
    "6": "Security_Analyst",
    "7": "Software_Developer",
    "8": "Systems_Administrator",
    "9": "Web_Developer"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Database_Admini

#### Setting up the Training Arguments

In [None]:
# Evaluate and checkpoint once per epoch; when training ends, reload the
# checkpoint with the best METRIC_NAME on the evaluation set.
args = TrainingArguments(
    output_dir=os.path.join(root_data_dir, "mcml_bert-finetuned-skills-prediction-clean-data"),
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_NAME,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


#### Computing Multi-label Metrics

In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    '''
        Compute micro-averaged F1, micro-averaged ROC AUC and subset accuracy
        for a multi-label classifier.

        predictions: raw logits, array-like of shape (num_examples, num_labels).
        labels:      ground-truth multi-hot matrix of the same shape.
        threshold:   probability at or above which a label counts as predicted.
        returns:     dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    '''
    #applying sigmoid on the prediction (batch_size, num_labels) to get one probability per label
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probs = torch.sigmoid(logits).detach().cpu().numpy()

    #threshold the probabilities to turn them into 0/1 predictions
    y_pred = (probs >= threshold).astype(int)

    #compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true = y_true, y_pred = y_pred, average='micro')
    # ROC AUC ranks examples by score, so it must be given the probabilities:
    # thresholded 0/1 values collapse the curve to a single operating point.
    roc_auc = roc_auc_score(y_true, probs, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)

    metrics_score = {
            'f1' : f1_micro_average,
            'roc_auc'  : roc_auc,
            'accuracy' : accuracy}
    return metrics_score

def compute_metrics(p: EvalPrediction):
    '''
        Metric callback for the Trainer.

        p: evaluation output holding the model predictions and the label ids.
        returns: the dict produced by multi_label_metrics.
    '''
    # predictions may arrive as a tuple of outputs; the first element is used as the logits
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # use the notebook-level THRESHOLD so the configured cut-off is the one actually applied
    result = multi_label_metrics(predictions = preds, labels = p.label_ids, threshold = THRESHOLD)
    return result

Let's verify a batch as well as a forward pass:

In [None]:
encoded_dataset['train'][0]['labels'].unsqueeze(0)

tensor([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])

In [None]:
encoded_dataset['train'][0]['input_ids'].unsqueeze(0)

tensor([[  101,  3169,  3208,  3169,  3208, 11577, 18712,  9362,  2147,  1057,
         11194,  2147,  3325,  3169,  3208, 27589,  4974,  9117, 11775, 12982,
         18712,  2233,  2760,  2556, 22834,  4087,  8619, 19429, 24997,  2102,
         14521, 18712,  2251,  2418,  2233,  2760,  2622, 10669,  5843,  7159,
         14521, 18712,  2254,  2418,  2238,  2418,  3698,  6872, 26680, 17655,
          2194,  6701,  2821,  2254,  2355,  2254,  2418,  2503,  5096,  2966,
          5949,  2326,  4937, 20897,  9695, 18712,  2257,  2325,  2254,  2355,
          4012, 23041,  5555,  4263, 18558, 28472, 16364,  1059,  2615,  2089,
          2325,  2257,  2325, 27166,  2278, 24532,  5498,  3367,  7829, 10669,
          3212,  2504,  2028,  3435, 24454, 16364,  1059,  2615,  2257,  2297,
          2089,  2325, 25718,  9722,  3353,  2490, 21929,  7520, 21405,  2986,
          2396,  3916,  2811,  2286,  2297,  2495,  2152,  2082,  9827,  8066,
          7513,  2436,  1019,  2095, 25718,  1019,  

In [None]:
#forward pass
outputs = model(input_ids=encoded_dataset['train'][0]['input_ids'].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.8332, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.2531,  0.5737, -0.1311,  0.2489, -0.0673,  0.8570,  0.2247, -0.8437,
         -0.2123,  0.2131]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Sigmoid over the squeezed logits: the printed shape shows one value per label.
squeezed_logits = outputs.logits.squeeze().cpu()
print (torch.nn.Sigmoid()(squeezed_logits).shape)

torch.Size([10])


#### Training the models
Let's start training the model

In [None]:
# Wire the model, the training arguments, the encoded train/validation splits
# and the metric callback into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
#launching the trainer
trainer.train()

***** Running training *****
  Num examples = 17870
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11170


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1113,0.104182,0.907939,0.934101,0.744334
2,0.0862,0.094103,0.912367,0.945863,0.749203
3,0.069,0.0863,0.922374,0.944197,0.781937


***** Running Evaluation *****
  Num examples = 5957
  Batch size = 8
Saving model checkpoint to /projets/sig/mullah/nlp/cv/models/mcml_bert-finetuned-skills-prediction-clean-data/checkpoint-2234
Configuration saved in /projets/sig/mullah/nlp/cv/models/mcml_bert-finetuned-skills-prediction-clean-data/checkpoint-2234/config.json
Model weights saved in /projets/sig/mullah/nlp/cv/models/mcml_bert-finetuned-skills-prediction-clean-data/checkpoint-2234/pytorch_model.bin
tokenizer config file saved in /projets/sig/mullah/nlp/cv/models/mcml_bert-finetuned-skills-prediction-clean-data/checkpoint-2234/tokenizer_config.json
Special tokens file saved in /projets/sig/mullah/nlp/cv/models/mcml_bert-finetuned-skills-prediction-clean-data/checkpoint-2234/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 5957
  Batch size = 8
Saving model checkpoint to /projets/sig/mullah/nlp/cv/models/mcml_bert-finetuned-skills-prediction-clean-data/checkpoint-4468
Configuration saved in /projet

#### Evaluate on the train set

In [None]:
# Run the trained model over the training split; the result is kept in
# `predictions_train` and is not displayed by this cell.
predictions_train = trainer.predict(
    test_dataset=encoded_dataset['train']
)

***** Running Prediction *****
  Num examples = 17870
  Batch size = 8


#### Evaluate the model on the validation set

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 5957
  Batch size = 8


{'eval_loss': 0.08639294654130936,
 'eval_f1': 0.9231722428748451,
 'eval_roc_auc': 0.9479516307770601,
 'eval_accuracy': 0.7842873929830452,
 'eval_runtime': 110.6472,
 'eval_samples_per_second': 53.838,
 'eval_steps_per_second': 6.733,
 'epoch': 5.0}

#### Predictions on test set

In [None]:
# Run the trained model over the held-out test split; the scores are read
# from `predictions.metrics` in the next cell.
predictions = trainer.predict(
    test_dataset=encoded_dataset['test']
)

***** Running Prediction *****
  Num examples = 5956
  Batch size = 8


In [None]:
predictions.metrics

{'test_loss': 0.08285614848136902,
 'test_f1': 0.9277367040305271,
 'test_roc_auc': 0.9509022786745007,
 'test_accuracy': 0.7960040295500336,
 'test_runtime': 111.2222,
 'test_samples_per_second': 53.55,
 'test_steps_per_second': 6.698}

In [None]:
#re-run the experiments for number of tokens 512 (instead of 128)