Multi-label Classification

Adds a linear layer on top of the base model, which is used to produce a tensor of shape (batch_size, num_labels), indicating the unnormalized scores for a number of labels for every example in the batch.

Based on https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb

# Login to Hugging Face

In [None]:
#!pip install -q transformers datasets sentencepiece accelerate evaluate peft bitsandbytes protobuf scikit-learn tiktoken huggingface_hub

In [None]:
from huggingface_hub import login
import os

login(token=os.getenv("HF_TOKEN"))

#from huggingface_hub import notebook_login
#notebook_login()

# Setup

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Base Model

base_model_id = 'google-bert/bert-large-uncased'


seed = 2024

# Training
num_train_epochs=3
batch_size = 8

use_gradient_checkpointing = False,  # Save some memory at the expense of training
# See https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one
gradient_accumulation_steps=1


# 5e-5, 4e-5, 3e-5, and 2e-5
learning_rate=3e-5

# Regularisation
hidden_dropout_prob=0.1
attention_probs_dropout_prob=0.0
weight_decay=0.001

# Evaluation
label_threshold=0.5


hf_site_id = '2024-mcm-everitt-ryan'
dataset_id = f'{hf_site_id}/job-bias-synthetic-human-benchmark-v2'
#dataset_id = f'{hf_site_id}/job-bias-synthetic-human-verified'
base_model_name = base_model_id.split('/')[-1]

model_id = f'{base_model_name}-job-bias-seq-cls'
hub_model_id = f'{hf_site_id}/{model_id}'


# Dataset

In [17]:
from datasets import load_dataset

dataset = load_dataset(dataset_id)
column_names = dataset['train'].column_names


text_col = 'text'
label_cols = [col for col in column_names if col.startswith('label_')]

labels = [label.replace("label_", "") for label in label_cols]

id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Remove all columns apart from the two needed for multi-class classification
keep_columns = ['id', text_col] + label_cols
for split in ["train", "val", "test"]:
    dataset[split] = dataset[split].remove_columns(
        [col for col in dataset[split].column_names if col not in keep_columns])

for type in ['train','val','test']:
    dataset[type] = dataset[type].shuffle(seed=seed)#.select(range(10))

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 4820
    })
    val: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 1051
    })
    test: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 1053
    })
})

In [18]:
import pandas as pd

# Merge train,val, test into one dataframe
df = pd.concat([
    dataset['train'].to_pandas(),
    dataset['val'].to_pandas(),
    dataset['test'].to_pandas()])

df.head(3)

Unnamed: 0,id,label_age,label_disability,label_masculine,label_feminine,label_racial,label_sexuality,label_general,text
0,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,False,False,False,True,False,False,"Company: Harrington, Richardson and Collins\n\..."
1,Synthetic:meta-llama:Meta-Llama-3-70B-Instruct...,False,False,False,False,True,False,False,Part Time Checker at Allen Inc\n\nCompany Back...
2,Synthetic:gpt-4o-2024-05-13:20240627151822:3e7...,False,False,False,False,True,False,False,Torres-Spencer is a leading firm in the transp...


# Tokeniser

In [19]:
# Longest phrase
longest_text = df[text_col].apply(lambda x: (len(x), x)).max()[1]
longest_text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <title>Job Posting</title>\n</head>\n<body>\n\n<h1>Software Developer - CodeWave Solutions - Canada</h1>\n\n<p><strong>Company Background:</strong></p>\n<p>CodeWave Solutions is a leading software development firm based in Toronto, Canada. We specialize in delivering innovative software solutions to clients across various industries. Our commitment to excellence and passion for technology drives us to create cutting-edge applications that power businesses worldwide.</p>\n\n<p><strong>Job Type:</strong> Full-time</p>\n\n<p><strong>Job Description:</strong></p>\n<p>As a Software Developer at CodeWave Solutions, you will play a key role in designing, developing, and deploying high-quality software solutions. We are seeking knowledgeable candidates who are enthusiastic about technology and have a strong commitment to producing top-notch wor

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_prefix_space=True)
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [21]:
max_char = len(longest_text)
max_words = len(longest_text.split())
max_tokens = len(tokenizer.encode(longest_text))

print(f'Max characters: {max_char}')
print(f'Max words: {max_words}')
print(f'Max tokens: {max_tokens}')

Token indices sequence length is longer than the specified maximum sequence length for this model (984 > 512). Running this sequence through the model will result in indexing errors


Max characters: 3863
Max words: 441
Max tokens: 984


In [22]:
tokenizer_max_length = min(max_tokens, tokenizer.model_max_length)
tokenizer_max_length

512

In [23]:
import numpy as np


def preprocess_data(sample):
    # take a batch of texts
    text = sample[text_col]
    # encode them
    encoding = tokenizer(text, truncation=True, max_length=tokenizer_max_length, padding="max_length")
    #encoding = tokenizer(text, truncation=True, max_length=tokenizer_max_length, padding=True)
    # add labels
    labels_batch = {k: sample[k] for k in sample.keys() if k in label_cols}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(label_cols)))
    # fill numpy array
    for idx, label in enumerate(label_cols):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

In [24]:
#ds_train = ds_train.map(tokenize, batched=True, batch_size=len(ds_train))
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

Map:   0%|          | 0/1053 [00:00<?, ? examples/s]

# Model

Here we define a model that includes a pre-trained base (i.e. the weights) are loaded, with a random initialized classification head (linear layer) on top. One should fine-tune this head, together with the pre-trained base on a labeled dataset.

This is also printed by the warning.

We set the `problem_type` to be "multi_label_classification", as this will make sure the appropriate loss function is used (namely [`BCEWithLogitsLoss`](https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html)). We also make sure the output layer has `len(label_cols)` output neurons, and we set the id2label and label2id mappings.

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(base_model_id,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(label_cols),
                                                           id2label=id2label,
                                                           label2id=label2id,
                                                           hidden_dropout_prob=hidden_dropout_prob,
                                                          # attention_probs_dropout_prob=attention_probs_dropout_prob
                                                          )


model.config.use_cache = False  # Silence the warnings.

model

In [26]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 768,
  "id2label": {
    "0": "age",
    "1": "disability",
    "2": "masculine",
    "3": "feminine",
    "4": "racial",
    "5": "sexuality",
    "6": "general"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "age": 0,
    "disability": 1,
    "feminine": 3,
    "general": 6,
    "masculine": 2,
    "racial": 4,
    "sexuality": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "transformers_version": "4.42.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

# Define Metrics

In [27]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, classification_report
from transformers import EvalPrediction
import torch


# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
# added extras
def multi_label_metrics(predictions, labels):
    # apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= label_threshold)] = 1
    y_true = labels

    #print(labels)
    #print(predictions)

    print(classification_report(y_true, y_pred, target_names=list(id2label.values())))
    
    # return as dictionary
    metrics = {
        'accuracy': accuracy_score(y_true=y_true, y_pred=y_pred)
    }
    
    for average in ['micro','macro','samples','weighted']:
        metrics[f'f1_{average}'] = f1_score(y_true=y_true, y_pred=y_pred, average=average)
        metrics[f'precision_{average}'] = precision_score(y_true=y_true, y_pred=y_pred, average=average)
        metrics[f'recall_{average}'] = recall_score(y_true=y_true, y_pred=y_pred, average=average)
        metrics[f'roc_auc_{average}'] = roc_auc_score(y_true=y_true, y_score=y_pred, average=average)
    
    return metrics


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

# Train

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from huggingface_hub import HfFolder

metric_name = "loss"

args = TrainingArguments(
    model_id,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1, #to prevent running out of disk space
    learning_rate=learning_rate,
    #optim=optimiser,
    #lr_scheduler_type="cosine",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    fp16=False,
    gradient_checkpointing=use_gradient_checkpointing,
    gradient_accumulation_steps=gradient_accumulation_steps,
    overwrite_output_dir=True,
    group_by_length=True,
    #push_to_hub=True,
    #output_dir=repository_id,
    #logging_dir=f"{model_id}/logs",
    #logging_strategy="steps",
    #logging_steps=10,
    #warmup_steps=500,
    #warmup_ratio=0.1,
    #max_grad_norm=0.3,
    #save_total_limit=2,
    #report_to="tensorboard",
    #push_to_hub=True,
    #hub_strategy="every_save",
    #hub_model_id=hub_model_id,
    #hub_token=HfFolder.get_token(),
)

#early_stop = transformers.EarlyStoppingCallback(10, 1.15)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["val"],
    # For padding a batch of examples to the maximum length seen in the batch
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    #tokenizer=tokenizer,
    #   callbacks=[early_stop]
)


In [None]:
!nvidia-smi

In [None]:

trainer.train()

# Evaluate

In [32]:
test_results = trainer.evaluate(eval_dataset=encoded_dataset['test'])
test_results

              precision    recall  f1-score   support

         age       0.62      0.72      0.66        81
  disability       0.52      0.72      0.60        81
   masculine       0.75      0.44      0.56        81
    feminine       0.79      0.88      0.83        81
      racial       0.53      0.88      0.67        78
   sexuality       0.72      0.79      0.75        81
     general       0.21      0.51      0.30        82

   micro avg       0.53      0.70      0.60       565
   macro avg       0.59      0.71      0.62       565
weighted avg       0.59      0.70      0.62       565
 samples avg       0.33      0.36      0.33       565

----------------------------------------------------------


{'eval_loss': 0.20577850937843323,
 'eval_accuracy': 0.6182336182336182,
 'eval_f1_micro': 0.6016628873771731,
 'eval_f1_macro': 0.6248869172020873,
 'eval_f1_samples': 0.334156378600823,
 'eval_f1_weighted': 0.6240938766848936,
 'eval_precision_micro': 0.525065963060686,
 'eval_recall_micro': 0.7044247787610619,
 'eval_roc_auc_micro': 0.8257651369561995,
 'eval_runtime': 3.5732,
 'eval_samples_per_second': 294.697,
 'eval_steps_per_second': 9.236,
 'epoch': 5.0}

In [None]:

y_true = encoded_dataset['test']['labels']
predictions = trainer.predict(encoded_dataset['test'])
predictions = predictions.predictions

sigmoid = torch.nn.Sigmoid()
probs = sigmoid(torch.Tensor(predictions))
# use threshold to turn them into integer predictions
y_pred = np.zeros(probs.shape)
y_pred[np.where(probs >= label_threshold)] = 1

report = classification_report(y_true, y_pred, target_names=list(id2label.values()))
#print(report)

# Convert to Markdown
report_lines = report.split('\n')
markdown_classification_report = "\n".join([f"    {line}" for line in report_lines])
print(markdown_classification_report)

In [None]:
import pandas as pd
df = pd.DataFrame(list(test_results.items()), columns=['Metric', 'Value'])
print(df.to_string(index=False))
import pandas as pd
df = pd.DataFrame(list(test_results.items()), columns=['Metric', 'Value'])
print(df.to_string(index=False))

# Push to Hugging Face

In [None]:

from huggingface_hub import ModelCard, EvalResult, ModelCardData
import platform
import sys
import os

model.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())
tokenizer.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())

In [None]:
args.to_sanitized_dict()

In [None]:
from huggingface_hub import ModelCard, EvalResult, ModelCardData
import platform
import json
import sys
import os

training_regime = []
training_regime_args = args.to_sanitized_dict()
for k,v in training_regime_args.items():
    if (isinstance(v, (int, str, bool, float))
            and '_dir' not in k
            and 'logging' not in k
            and 'log_' not in k
            and 'hub_' not in k
            and '_hub' not in k
            and 'save_' not in k
            and 'run_name' not in k
            and 'debug' not in k
            and 'token' not in k):
        training_regime.append(f'{k}={json.dumps(v)}')

training_regime = sorted(training_regime)

training_regime = ', '.join(training_regime)


## Hardware
compute_infrastructure = []
mem_total = !cat /proc/meminfo | grep MemTotal
mem_total = list(set(mem_total))[0]
cpu_info = !cat /proc/cpuinfo | grep "model name"
cpu_count = len(list(cpu_info))
cpu_name = list(set(cpu_info))[0]
cpu_name = cpu_name.strip()
cpu_name = cpu_name.replace('model name\t:', '')
cpu_name = cpu_name.strip()

compute_infrastructure.append(f'- {platform.system()} {platform.release()} {platform.processor()}')
compute_infrastructure.append(f'- {mem_total}')
compute_infrastructure.append(f'- {cpu_count} X {cpu_name}')


gpu_name = !nvidia-smi --query-gpu=gpu_name --format=csv,noheader
gpus = set()
for idx, gpu in enumerate(gpu_name):
    compute_infrastructure.append(f"- GPU_{idx}: {gpu}")
    gpus.add(gpu)

gpus =list(gpus)
compute_infrastructure = '\n'.join(compute_infrastructure)
hardware_type = f'{len(gpus)} X {gpus[0]}'

## Software
software_list = !pip list
inc_software = []
inc_software.append(f'python {platform.python_version()}')

for software in software_list:
    if software and '[notice]' not in software and '---' not in software and 'Package' not in software:
        inc_software.append(' '.join(software.split()))
 

software = ", ".join(inc_software)   

hours_used = ""
eval_results = []
for k, v in test_results.items():
    metric_type = k.replace("eval_", "", 1)
    if metric_type == 'runtime':
        hours_used = f"{int(v)/60.0:.2f}"
    eval_results.append(EvalResult(
        task_type='multi_label_classification',
        dataset_type='mix_human-eval_synthetic',
        dataset_name=dataset_id,
        metric_type=metric_type,
        metric_value=v))

direct_use = """
    ```python
    from transformers import pipeline

    pipe = pipeline("text-classification", model="${hub_model_id}", return_all_scores=True)

    results = pipe("Join our dynamic and fast-paced team as a Junior Marketing Specialist. We seek a tech-savvy and energetic individual who thrives in a vibrant environment. Ideal candidates are digital natives with a fresh perspective, ready to adapt quickly to new trends. You should have recent experience in social media strategies and a strong understanding of current digital marketing tools. We're looking for someone with a youthful mindset, eager to bring innovative ideas to our young and ambitious team. If you're a recent graduate or early in your career, this opportunity is perfect for you!")
    print(results)
    ```
    >> [[
    {'label': 'age', 'score': 0.9883460402488708}, 
    {'label': 'disability', 'score': 0.00787709467113018}, 
    {'label': 'feminine', 'score': 0.007224376779049635}, 
    {'label': 'general', 'score': 0.09967829287052155}, 
    {'label': 'masculine', 'score': 0.0035264550242573023}, 
    {'label': 'racial', 'score': 0.014618005603551865}, 
    {'label': 'sexuality', 'score': 0.005568435415625572}
    ]]
    """

direct_use = direct_use.replace('${hub_model_id}', hub_model_id, -1)

card_data = ModelCardData(
    model_id=model_id,
    model_name=model_id,
    model_description="The model is a multi-label classifier designed to detect various types of bias within job descriptions.",
    base_model=base_model_id,
    language='en',
    license='apache-2.0',
    developers="Tristan Everitt and Paul Ryan",
    model_card_authors='See developers',
    model_card_contact='See developers',
    repo="https://gitlab.computing.dcu.ie/everitt2/2024-mcm-everitt-ryan",
    training_regime=training_regime,
    eval_results=eval_results,
    results=markdown_classification_report,
    compute_infrastructure=compute_infrastructure,
    # hardware_requirements='N/A',
    software=software,
    hardware_type=hardware_type,
    hours_used=hours_used,
    cloud_provider='N/A',
    cloud_region='N/A',
    co2_emitted='N/A',
    datasets=[dataset_id],
    direct_use=direct_use
)

card = ModelCard.from_template(card_data)

card.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())