# Setup

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [34]:

# Base Model

base_model_id = 'bert-base-uncased'


seed = 2024

# Training
num_train_epochs=5
batch_size = 32
learning_rate = 5e-5

# Regularisation
hidden_dropout_prob=0.25
attention_probs_dropout_prob=0.25
weight_decay=0.001

use_gradient_checkpointing = False #True,  # Save some memory at the expense of training
# See https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one

hf_site_id = '2024-mcm-everitt-ryan'
dataset_id = f'{hf_site_id}/job-bias-synthetic-human-benchmark'

#dataset_id = f'{hf_site_id}/job-bias-synthetic-human-verified'


base_model_name = base_model_id.split('/')[-1]
model_id = f'{base_model_name}-job-bias-mixed'
hub_model_id = f'{hf_site_id}/{model_id}'

In [7]:
!pip install -q transformers datasets sentencepiece accelerate evaluate hf_transfer huggingface_hub scikit-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


# Dataset

In [8]:
from datasets import load_dataset

dataset = load_dataset(dataset_id)
column_names = dataset['train'].column_names


text_col = 'text'
label_cols = [col for col in column_names if col.startswith('label_')]

labels = [label.replace("label_", "") for label in label_cols]

id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Remove all columns apart from the two needed for multi-class classification
keep_columns = ['id', text_col] + label_cols
for split in ["train", "val", "test"]:
    dataset[split] = dataset[split].remove_columns(
        [col for col in dataset[split].column_names if col not in keep_columns])

dataset

Downloading readme:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.52M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2713 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/1051 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1053 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 2713
    })
    val: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 1051
    })
    test: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 1053
    })
})

In [9]:
import pandas as pd

# Merge train,val, test into one dataframe
df = pd.concat([
    dataset['train'].to_pandas(),
    dataset['val'].to_pandas(),
    dataset['test'].to_pandas()])

df.head(3)

Unnamed: 0,id,label_age,label_disability,label_masculine,label_feminine,label_racial,label_sexuality,label_general,text
0,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,False,False,False,True,False,False,"Company: Harrington, Richardson and Collins\n\..."
1,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,False,False,False,True,False,False,Part Time Checker at Allen Inc\n\nCompany Back...
2,Synthetic:gpt-4o-2024-05-13:20240627151822:3e7...,False,False,False,False,True,False,False,Torres-Spencer is a leading firm in the transp...


In [10]:
# Longest phrase
longest_text = df[text_col].apply(lambda x: (len(x), x)).max()[1]
longest_text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <title>Job Posting</title>\n</head>\n<body>\n\n<h1>Software Developer - CodeWave Solutions - Canada</h1>\n\n<p><strong>Company Background:</strong></p>\n<p>CodeWave Solutions is a leading software development firm based in Toronto, Canada. We specialize in delivering innovative software solutions to clients across various industries. Our commitment to excellence and passion for technology drives us to create cutting-edge applications that power businesses worldwide.</p>\n\n<p><strong>Job Type:</strong> Full-time</p>\n\n<p><strong>Job Description:</strong></p>\n<p>As a Software Developer at CodeWave Solutions, you will play a key role in designing, developing, and deploying high-quality software solutions. We are seeking knowledgeable candidates who are enthusiastic about technology and have a strong commitment to producing top-notch wor

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_prefix_space=True)
tokenizer

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
max_char = len(longest_text)
max_words = len(longest_text.split())
max_tokens = len(tokenizer.encode(longest_text))

print(f'Max characters: {max_char}')
print(f'Max words: {max_words}')
print(f'Max tokens: {max_tokens}')

Token indices sequence length is longer than the specified maximum sequence length for this model (984 > 512). Running this sequence through the model will result in indexing errors


Max characters: 3863
Max words: 441
Max tokens: 984


In [13]:
tokenizer_max_length = min(max_tokens, tokenizer.model_max_length)
tokenizer_max_length

512

In [14]:
import numpy as np


def preprocess_data(sample):
    # take a batch of texts
    text = sample[text_col]
    # encode them
    encoding = tokenizer(text, truncation=True, max_length=tokenizer_max_length, padding="max_length")
    #encoding = tokenizer(text, truncation=True, max_length=tokenizer_max_length, padding=True)
    # add labels
    labels_batch = {k: sample[k] for k in sample.keys() if k in label_cols}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(label_cols)))
    # fill numpy array
    for idx, label in enumerate(label_cols):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

In [16]:
#ds_train = ds_train.map(tokenize, batched=True, batch_size=len(ds_train))
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

Map:   0%|          | 0/2713 [00:00<?, ? examples/s]

# Model

In [18]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(base_model_id,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(label_cols),
                                                           id2label=id2label,
                                                           label2id=label2id,
                                                           hidden_dropout_prob=hidden_dropout_prob,
                                                           attention_probs_dropout_prob=attention_probs_dropout_prob)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.25, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.25, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=

In [19]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.25,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.25,
  "hidden_size": 768,
  "id2label": {
    "0": "age",
    "1": "disability",
    "2": "masculine",
    "3": "feminine",
    "4": "racial",
    "5": "sexuality",
    "6": "general"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "age": 0,
    "disability": 1,
    "feminine": 3,
    "general": 6,
    "masculine": 2,
    "racial": 4,
    "sexuality": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "transformers_version": "4.42.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size":

# Define Metrics

In [20]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, classification_report
from transformers import EvalPrediction
import torch


# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
# added extras
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    
    f1_micro = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    f1_samples = f1_score(y_true=y_true, y_pred=y_pred, average='samples')
    f1_weighted = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

    precision_micro = precision_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall_micro = recall_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc_micro = roc_auc_score(y_true=y_true, y_score=y_pred, average='micro')

    print(classification_report(y_true, y_pred, target_names=list(id2label.values())))
    
    # return as dictionary
    metrics = {
        'accuracy': accuracy,
        f'f1_micro': f1_micro,
        f'f1_macro': f1_macro,
        f'f1_samples': f1_samples,
        f'f1_weighted': f1_weighted,
        f'precision_micro': precision_micro,
        f'recall_micro': recall_micro,
        f'roc_auc_micro': roc_auc_micro}
    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

# Train

In [30]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding,TrainerCallback
from huggingface_hub import HfFolder

metric_name = 'loss' #"f1_micro"

args = TrainingArguments(
    model_id,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    #optim=optimiser,
    #lr_scheduler_type="cosine",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    fp16=False,
    gradient_checkpointing=use_gradient_checkpointing,
    overwrite_output_dir=True,
    #push_to_hub=True,
    #output_dir=repository_id,
    #logging_dir=f"{model_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    #warmup_steps=500,
    #warmup_ratio=0.1,
    #max_grad_norm=0.3,
    save_total_limit=3,
    #report_to="tensorboard",
    #push_to_hub=True,
    #hub_strategy="every_save",
    #hub_model_id=hub_model_id,
    #hub_token=HfFolder.get_token(),
)

#early_stop = transformers.EarlyStoppingCallback(10, 1.15)
class PrintClassificationCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        print("----------------------------------------------------------")


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["val"],
    # For padding a batch of examples to the maximum length seen in the batch
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[PrintClassificationCallback]
    #tokenizer=tokenizer,
    #   callbacks=[early_stop]
)

model.config.use_cache = False  # Silence the warnings.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Samples,F1 Weighted,Precision Micro,Recall Micro,Roc Auc Micro
1,0.0268,0.246391,0.683159,0.630332,0.616528,0.347578,0.615893,0.565957,0.71123,0.833102
2,0.0205,0.243734,0.685062,0.641918,0.646464,0.357739,0.645784,0.56694,0.73975,0.846553
3,0.0415,0.240111,0.668887,0.641623,0.644463,0.363495,0.644105,0.554545,0.761141,0.855335
4,0.0336,0.247201,0.665081,0.637703,0.64272,0.364811,0.642483,0.543287,0.771836,0.859138
5,0.0203,0.214356,0.710752,0.665094,0.668466,0.367253,0.668206,0.594937,0.754011,0.855816


              precision    recall  f1-score   support

         age       0.56      0.66      0.61        80
  disability       0.39      0.84      0.54        81
   masculine       0.62      0.68      0.65        81
    feminine       0.76      0.91      0.83        80
      racial       0.51      0.91      0.65        77
   sexuality       0.76      0.81      0.79        81
     general       0.48      0.17      0.25        81

   micro avg       0.57      0.71      0.63       561
   macro avg       0.58      0.71      0.62       561
weighted avg       0.58      0.71      0.62       561
 samples avg       0.35      0.36      0.35       561

----------------------------------------------------------
              precision    recall  f1-score   support

         age       0.60      0.66      0.63        80
  disability       0.41      0.79      0.54        81
   masculine       0.51      0.77      0.61        81
    feminine       0.87      0.86      0.87        80
      racial       

TrainOutput(global_step=425, training_loss=0.026663449915016398, metrics={'train_runtime': 174.4168, 'train_samples_per_second': 77.773, 'train_steps_per_second': 2.437, 'total_flos': 3569261693568000.0, 'train_loss': 0.026663449915016398, 'epoch': 5.0})

# Evaluate

In [31]:
test_results = trainer.evaluate(eval_dataset=encoded_dataset['test'])
test_results

              precision    recall  f1-score   support

         age       0.55      0.63      0.59        81
  disability       0.58      0.77      0.66        81
   masculine       0.56      0.53      0.54        81
    feminine       0.77      0.89      0.82        81
      racial       0.64      0.90      0.74        78
   sexuality       0.73      0.84      0.78        81
     general       0.38      0.55      0.45        82

   micro avg       0.59      0.73      0.65       565
   macro avg       0.60      0.73      0.66       565
weighted avg       0.60      0.73      0.65       565
 samples avg       0.36      0.38      0.36       565

----------------------------------------------------------


{'eval_loss': 0.21743419766426086,
 'eval_accuracy': 0.7160493827160493,
 'eval_f1_micro': 0.6534181240063593,
 'eval_f1_macro': 0.6552847922083297,
 'eval_f1_samples': 0.3599240265906933,
 'eval_f1_weighted': 0.6544428244654181,
 'eval_precision_micro': 0.5930735930735931,
 'eval_recall_micro': 0.727433628318584,
 'eval_roc_auc_micro': 0.8429997997602323,
 'eval_runtime': 3.55,
 'eval_samples_per_second': 296.619,
 'eval_steps_per_second': 9.296,
 'epoch': 5.0}

In [32]:
import pandas as pd
df = pd.DataFrame(list(test_results.items()), columns=['Metric', 'Value'])
print(df.to_string(index=False))

                 Metric      Value
              eval_loss   0.217434
          eval_accuracy   0.716049
          eval_f1_micro   0.653418
          eval_f1_macro   0.655285
        eval_f1_samples   0.359924
       eval_f1_weighted   0.654443
   eval_precision_micro   0.593074
      eval_recall_micro   0.727434
     eval_roc_auc_micro   0.843000
           eval_runtime   3.550000
eval_samples_per_second 296.619000
  eval_steps_per_second   9.296000
                  epoch   5.000000


# Upload model

In [36]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:

model.push_to_hub(hub_model_id)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/2024-mcm-everitt-ryan/bert-base-uncased-job-bias-mixed/commit/f415e3a2cb8b583ba9b96303e404890f46b6333d', commit_message='Upload BertForSequenceClassification', commit_description='', oid='f415e3a2cb8b583ba9b96303e404890f46b6333d', pr_url=None, pr_revision=None, pr_num=None)