# Setup

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Base Model

base_model_id = 'bert-base-uncased'


seed = 2024

# Training
num_train_epochs=10
batch_size = 32
learning_rate = 5e-5

# Regularisation
hidden_dropout_prob=0.02
attention_probs_dropout_prob=0.00
weight_decay=0.001


# Evaluation
label_threshold=0.5

use_gradient_checkpointing = False #True,  # Save some memory at the expense of training
# See https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one

hf_site_id = '2024-mcm-everitt-ryan'
dataset_id = f'{hf_site_id}/job-bias-synthetic-human-verified'

#dataset_id = f'{hf_site_id}/job-bias-synthetic-human-verified'


base_model_name = base_model_id.split('/')[-1]
model_id = f'{base_model_name}-job-bias-verified'
hub_model_id = f'{hf_site_id}/{model_id}'

In [2]:
!pip install -q transformers datasets sentencepiece accelerate evaluate hf_transfer huggingface_hub scikit-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


# Dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset(dataset_id)
column_names = dataset['train'].column_names


text_col = 'text'
label_cols = [col for col in column_names if col.startswith('label_')]

labels = [label.replace("label_", "") for label in label_cols]

id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Remove all columns apart from the two needed for multi-class classification
keep_columns = ['id', text_col] + label_cols
for split in ["train", "val", "test"]:
    dataset[split] = dataset[split].remove_columns(
        [col for col in dataset[split].column_names if col not in keep_columns])

dataset

Downloading readme:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/988k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1256 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/314 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1046 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 1256
    })
    val: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 314
    })
    test: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 1046
    })
})

In [4]:
import pandas as pd

# Merge train,val, test into one dataframe
df = pd.concat([
    dataset['train'].to_pandas(),
    dataset['val'].to_pandas(),
    dataset['test'].to_pandas()])

df.head(3)

Unnamed: 0,id,label_age,label_disability,label_masculine,label_feminine,label_racial,label_sexuality,label_general,text
0,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,False,False,False,False,False,False,Company: Lewis-Estrada\nJob Title: Cashier\n\n...
1,Kaggle::techmap::61377d32f2fd7421561f6664::see...,True,False,False,False,False,False,False,"Have you worked at 2Degrees, Slingshot, Skinny..."
2,Kaggle::techmap::61427019a973d70733cfaec4::bri...,True,False,False,True,False,False,True,About the role\n\nAre you looking to join our ...


In [5]:
# Longest phrase
longest_text = df[text_col].apply(lambda x: (len(x), x)).max()[1]
longest_text

'Consumer, Business and Digital Banking – We work with our retail banking, business banking, consumer lending, mortgage, and digital banking businesses to define far-reaching technology strategies to evolve our customer experiences, to make us easier to do business with, and to deliver solutions that provide real value. This includes all core consumer deposit, loan, and payment processing and servicing platforms, all core channel systems for retail branches, ATMs, and call centers, custom-built online and mobile banking platforms, and mtb.com and marketing ecosystem capabilities.\n\nOverview:\n- Manages the activities of several Technology Team Leaders or units and is responsible for each Team’s/unit’s development and systems support efforts.\n- Provides day-to-day direction for the units and applications in line with the goals of the department and the clients they support.\n- Responsible for managing client relations and expectations.\n- Manages the project queue for their area.\n- S

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_prefix_space=True)
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
max_char = len(longest_text)
max_words = len(longest_text.split())
max_tokens = len(tokenizer.encode(longest_text))

print(f'Max characters: {max_char}')
print(f'Max words: {max_words}')
print(f'Max tokens: {max_tokens}')

Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors


Max characters: 3170
Max words: 442
Max tokens: 548


In [8]:
tokenizer_max_length = min(max_tokens, tokenizer.model_max_length)
tokenizer_max_length

512

In [9]:
import numpy as np


def preprocess_data(sample):
    # take a batch of texts
    text = sample[text_col]
    # encode them
    encoding = tokenizer(text, truncation=True, max_length=tokenizer_max_length, padding="max_length")
    #encoding = tokenizer(text, truncation=True, max_length=tokenizer_max_length, padding=True)
    # add labels
    labels_batch = {k: sample[k] for k in sample.keys() if k in label_cols}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(label_cols)))
    # fill numpy array
    for idx, label in enumerate(label_cols):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

In [10]:
#ds_train = ds_train.map(tokenize, batched=True, batch_size=len(ds_train))
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

Map:   0%|          | 0/1256 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

Map:   0%|          | 0/1046 [00:00<?, ? examples/s]

# Model

In [14]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(base_model_id,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(label_cols),
                                                           id2label=id2label,
                                                           label2id=label2id,
                                                           hidden_dropout_prob=hidden_dropout_prob,
                                                          # attention_probs_dropout_prob=attention_probs_dropout_prob
                                                          )
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.02, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [15]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.02,
  "hidden_size": 768,
  "id2label": {
    "0": "age",
    "1": "disability",
    "2": "masculine",
    "3": "feminine",
    "4": "racial",
    "5": "sexuality",
    "6": "general"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "age": 0,
    "disability": 1,
    "feminine": 3,
    "general": 6,
    "masculine": 2,
    "racial": 4,
    "sexuality": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "transformers_version": "4.42.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 

# Define Metrics

In [16]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, classification_report
from transformers import EvalPrediction
import torch


# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
# added extras
def multi_label_metrics(predictions, labels):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= label_threshold)] = 1
    # finally, compute metrics
    y_true = labels

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    
    f1_micro = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    f1_samples = f1_score(y_true=y_true, y_pred=y_pred, average='samples')
    f1_weighted = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

    precision_micro = precision_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall_micro = recall_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc_micro = roc_auc_score(y_true=y_true, y_score=y_pred, average='micro')

    print(classification_report(y_true, y_pred, target_names=list(id2label.values())))
    
    # return as dictionary
    metrics = {
        'accuracy': accuracy,
        f'f1_micro': f1_micro,
        f'f1_macro': f1_macro,
        f'f1_samples': f1_samples,
        f'f1_weighted': f1_weighted,
        f'precision_micro': precision_micro,
        f'recall_micro': recall_micro,
        f'roc_auc_micro': roc_auc_micro}
    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

# Train

In [17]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding,TrainerCallback
from huggingface_hub import HfFolder

metric_name = 'loss' #"f1_micro"

args = TrainingArguments(
    model_id,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    #optim=optimiser,
    #lr_scheduler_type="cosine",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    fp16=False,
    gradient_checkpointing=use_gradient_checkpointing,
    overwrite_output_dir=True,
    #push_to_hub=True,
    #output_dir=repository_id,
    #logging_dir=f"{model_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    #warmup_steps=500,
    #warmup_ratio=0.1,
    #max_grad_norm=0.3,
    save_total_limit=3,
    #report_to="tensorboard",
    #push_to_hub=True,
    #hub_strategy="every_save",
    #hub_model_id=hub_model_id,
    #hub_token=HfFolder.get_token(),
)

#early_stop = transformers.EarlyStoppingCallback(10, 1.15)
class PrintClassificationCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        print("----------------------------------------------------------")


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["val"],
    # For padding a batch of examples to the maximum length seen in the batch
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[PrintClassificationCallback]
    #tokenizer=tokenizer,
    #   callbacks=[early_stop]
)

model.config.use_cache = False  # Silence the warnings.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Samples,F1 Weighted,Precision Micro,Recall Micro,Roc Auc Micro
1,0.2704,0.266314,0.515924,0.0,0.0,0.0,0.0,0.0,0.0,0.5
2,0.2299,0.24814,0.515924,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3,0.1927,0.193946,0.611465,0.331707,0.311432,0.107219,0.254987,0.894737,0.203593,0.600812
4,0.1472,0.165815,0.675159,0.548872,0.521751,0.222611,0.495535,0.737374,0.437126,0.712162
5,0.0948,0.153179,0.707006,0.61745,0.603855,0.280998,0.57517,0.70229,0.550898,0.765848
6,0.0714,0.142202,0.738854,0.647273,0.62997,0.275159,0.6025,0.824074,0.532934,0.76179
7,0.055,0.138892,0.751592,0.675958,0.67612,0.299575,0.656854,0.808333,0.580838,0.784757
8,0.0408,0.138,0.748408,0.66899,0.665751,0.297452,0.645501,0.8,0.57485,0.781517
9,0.0316,0.142623,0.742038,0.657143,0.650402,0.284713,0.628022,0.814159,0.550898,0.770279
10,0.0367,0.141701,0.742038,0.657244,0.652032,0.287898,0.629941,0.801724,0.556886,0.772781


              precision    recall  f1-score   support

         age       0.00      0.00      0.00        25
  disability       0.00      0.00      0.00        26
   masculine       0.00      0.00      0.00        27
    feminine       0.00      0.00      0.00        20
      racial       0.00      0.00      0.00        15
   sexuality       0.00      0.00      0.00        27
     general       0.00      0.00      0.00        27

   micro avg       0.00      0.00      0.00       167
   macro avg       0.00      0.00      0.00       167
weighted avg       0.00      0.00      0.00       167
 samples avg       0.00      0.00      0.00       167

----------------------------------------------------------
              precision    recall  f1-score   support

         age       0.00      0.00      0.00        25
  disability       0.00      0.00      0.00        26
   masculine       0.00      0.00      0.00        27
    feminine       0.00      0.00      0.00        20
      racial       

TrainOutput(global_step=400, training_loss=0.12981932155787945, metrics={'train_runtime': 161.3821, 'train_samples_per_second': 77.828, 'train_steps_per_second': 2.479, 'total_flos': 3304823212032000.0, 'train_loss': 0.12981932155787945, 'epoch': 10.0})

# Evaluate

In [18]:
test_results = trainer.evaluate(eval_dataset=encoded_dataset['test'])
test_results

              precision    recall  f1-score   support

         age       0.74      0.46      0.56        81
  disability       0.75      0.62      0.68        81
   masculine       0.88      0.38      0.53        79
    feminine       0.86      0.92      0.89        76
      racial       0.85      0.73      0.79        78
   sexuality       0.80      0.88      0.84        81
     general       1.00      0.30      0.47        82

   micro avg       0.82      0.61      0.70       558
   macro avg       0.84      0.61      0.68       558
weighted avg       0.84      0.61      0.68       558
 samples avg       0.32      0.32      0.32       558

----------------------------------------------------------


{'eval_loss': 0.13193343579769135,
 'eval_accuracy': 0.7839388145315488,
 'eval_f1_micro': 0.7003089598352215,
 'eval_f1_macro': 0.6788635860334568,
 'eval_f1_samples': 0.32023581899298914,
 'eval_f1_weighted': 0.6765300657386585,
 'eval_precision_micro': 0.8232445520581114,
 'eval_recall_micro': 0.6093189964157706,
 'eval_roc_auc_micro': 0.7992632829506411,
 'eval_runtime': 3.5617,
 'eval_samples_per_second': 293.684,
 'eval_steps_per_second': 9.265,
 'epoch': 10.0}

In [20]:
import pandas as pd
df = pd.DataFrame(list(test_results.items()), columns=['Metric', 'Value'])
print(df.to_string(index=False))

                 Metric      Value
              eval_loss   0.131933
          eval_accuracy   0.783939
          eval_f1_micro   0.700309
          eval_f1_macro   0.678864
        eval_f1_samples   0.320236
       eval_f1_weighted   0.676530
   eval_precision_micro   0.823245
      eval_recall_micro   0.609319
     eval_roc_auc_micro   0.799263
           eval_runtime   3.561700
eval_samples_per_second 293.684000
  eval_steps_per_second   9.265000
                  epoch  10.000000


# Push to Hugging Face

In [21]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

from huggingface_hub import ModelCard, EvalResult, ModelCardData
import platform
import sys
import os

model.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())
tokenizer.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())

###### Update Model Card ######

eval_results = []
for k, v in test_results.items():
    eval_results.append(EvalResult(
        task_type='multi_label_classification',
        dataset_type='mix_human-eval_synthetic',
        dataset_name=dataset_id,
        metric_type=k.replace("eval_", "", 1),
        metric_value=v))

direct_use = """
    ```python
    from transformers import pipeline

    pipe = pipeline("text-classification", model="${hub_model_id}", return_all_scores=True)

    results = pipe("Join our dynamic and fast-paced team as a Junior Marketing Specialist. We seek a tech-savvy and energetic individual who thrives in a vibrant environment. Ideal candidates are digital natives with a fresh perspective, ready to adapt quickly to new trends. You should have recent experience in social media strategies and a strong understanding of current digital marketing tools. We're looking for someone with a youthful mindset, eager to bring innovative ideas to our young and ambitious team. If you're a recent graduate or early in your career, this opportunity is perfect for you!")
    print(results)
    ```
    >> [[
    {'label': 'age', 'score': 0.9883460402488708}, 
    {'label': 'disability', 'score': 0.00787709467113018}, 
    {'label': 'feminine', 'score': 0.007224376779049635}, 
    {'label': 'general', 'score': 0.09967829287052155}, 
    {'label': 'masculine', 'score': 0.0035264550242573023}, 
    {'label': 'racial', 'score': 0.014618005603551865}, 
    {'label': 'sexuality', 'score': 0.005568435415625572}
    ]]
    """
direct_use = direct_use.replace('${hub_model_id}', hub_model_id, -1)

card_data = ModelCardData(
    model_id=model_id,
    model_name=model_id,
    model_description="The model is a multi-label classifier designed to detect various types of bias within job descriptions.",
    base_model=base_model_id,
    language='en',
    license='apache-2.0',
    developers="Tristan Everitt and Paul Ryan",
    model_card_authors='See developers',
    model_card_contact='See developers',
    repo="https://gitlab.computing.dcu.ie/everitt2/2024-mcm-everitt-ryan",
    eval_results=eval_results,
    compute_infrastructure=f'{platform.system()} {platform.release()} {platform.processor()}',
    # hardware_requirements=f"CPUs: {psutil.cpu_count()}, Memory: {psutil.virtual_memory().total} bytes",
    software=f'Python {platform.python_version()}',
    hardware_type=platform.machine(),
    hours_used='N/A',
    cloud_provider='N/A',
    cloud_region='N/A',
    co2_emitted='N/A',
    datasets=[dataset_id],
    direct_use=direct_use
)

card = ModelCard.from_template(card_data)

card.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]