Adapted from the following, but changed to handle multi-label
https://github.com/VanekPetr/flan-t5-text-classifier/blob/main/classifier/AutoModelForSeq2SeqLM/flan-t5-finetuning.py

# Setup

In [1]:
import warnings
warnings.filterwarnings('ignore')

base_model_id = 'google/flan-t5-small'

hf_site_id = '2024-mcm-everitt-ryan'
dataset_id = f'{hf_site_id}/job-bias-synthetic-human-benchmark'
#dataset_id = f'{hf_site_id}/job-bias-synthetic-human-verified'


base_model_name = base_model_id.split('/')[-1]
model_id = f'{base_model_name}-seq2seq-job-bias-mixed'
hub_model_id = f'{hf_site_id}/{model_id}'

In [2]:

seed=2024

# Training
num_train_epochs=10
batch_size=8
learning_rate=3e-4
#learning_rate = 5e-5

# Regularisation
dropout_rate = 0.1
weight_decay=0.0001

# Misc
results_output_dir = 'results'
logging_dir='logs'

In [18]:
!pip install -q transformers datasets sentencepiece accelerate evaluate hf_transfer huggingface_hub scikit-learn protobuf nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Dataset

In [14]:
from datasets import load_dataset

dataset = load_dataset(dataset_id)
column_names = dataset['train'].column_names


text_col = 'text'
label_cols = [col for col in column_names if col.startswith('label_')]

labels = [label.replace("label_", "") for label in label_cols]

id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Remove all columns apart from the two needed for multi-class classification
keep_columns = ['id', text_col] + label_cols
for split in ["train", "val", "test"]:
    dataset[split] = dataset[split].remove_columns(
        [col for col in dataset[split].column_names if col not in keep_columns])

for type in ['train','val','test']:
    dataset[type] = dataset[type].shuffle(seed=seed).select(range(10))

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 10
    })
    val: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 10
    })
    test: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_masculine', 'label_feminine', 'label_racial', 'label_sexuality', 'label_general', 'text'],
        num_rows: 10
    })
})

# Tokeniser

In [15]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer

T5TokenizerFast(name_or_path='google/flan-t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>

In [16]:
from datasets import concatenate_datasets
from transformers import Seq2SeqTrainingArguments


tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["text"], truncation=True),
    batched=True,
    remove_columns=dataset['train'].column_names,
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# Prepare target sequences for T5
def create_target_sequence(example):
    labels = [key.replace('label_','') for key, value in example.items() if key.startswith('label_') and value]
    labels = ','.join(labels)
    labels = labels.strip()    
    return labels

# Add target sequence to the dataset
dataset = dataset.map(lambda x: {'labels': create_target_sequence(x)}, remove_columns=[col for col in dataset['train'].column_names if col.startswith('label_')])

# Tokenise targets
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["labels"], truncation=True),
    batched=True,
    remove_columns=dataset['train'].column_names,
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Max source length: 512
Max target length: 9


In [17]:
#tokenized_targets["input_ids"]

# Model

In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoConfig

config = AutoConfig.from_pretrained(base_model_id, dropout_rate=dropout_rate)

model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model_id,
    config=config
)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [19]:
model.config

T5Config {
  "_name_or_path": "google/flan-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "ma

# Preprocessing/Evaluation functions

In [20]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, \
    classification_report, confusion_matrix
import nltk
from transformers import  DataCollatorForSeq2Seq, Seq2SeqTrainer
import numpy as np
from nltk import sent_tokenize
from typing import List, Tuple
from datasets import Dataset
import pandas as pd


def preprocess_function(sample: Dataset, padding: str = "max_length") -> dict:
    """Preprocess the dataset."""
    inputs = [item for item in sample["text"]]
    labels = [item for item in sample["labels"]]

    model_inputs = tokenizer(
        inputs, max_length=max_source_length, padding=padding, truncation=True
    )

    labels = tokenizer(
        text_target=labels, max_length=max_target_length, padding=padding, truncation=True
    )

    if padding == "max_length":
        labels["input_ids"] = [
            [(la if la != tokenizer.pad_token_id else -100) for la in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def postprocess_text(labels: List[str], preds: List[str]) -> Tuple[List[str], List[str]]:
    """Helper function to postprocess text"""
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]
    return labels, preds


def compute_metrics(eval_predictions):
    
    y_hat, y = eval_predictions
    
    # Replace -100 in the labels .
    y = np.where(y != -100, y, tokenizer.pad_token_id)
    
    if isinstance(y_hat, tuple):
        y_hat = y_hat[0]
        
    print(y)
    print('--------------------')
    print(y_hat)
    
    y_str = tokenizer.batch_decode(y, skip_special_tokens=True)
    y_hat_str = tokenizer.batch_decode(y_hat, skip_special_tokens=True)

    y_str, y_hat_str = postprocess_text( y_str, y_hat_str)
    
    print('--------------------')
    print(y_str)
    print('--------------------')
    print(y_hat_str)
    print('--------------------')


    # Flatten the list of labels
    true_flat = [label.strip() for sublist in [t.split(',') for t in y_str] for label in sublist]
    pred_flat = [label.strip() for sublist in [p.split(',') for p in y_hat_str] for label in sublist]
    
    
    #print(true_flat)
    #print('--------------------')
    #print(pred_flat)

    # Convert to binary format for multi-label metrics
    #unique_labels = list(set(true_flat + pred_flat))  # This will include out-of-scope (not a label) predictions
    unique_labels = list(set(true_flat))
    
    # Remove the blank label (no bias)
    unique_labels = [label for label in unique_labels if label != '' and label is not None]
    
    y_true = [[1 if label in t else 0 for label in unique_labels] for t in y_str]
    y_pred = [[1 if label in p else 0 for label in unique_labels] for p in y_hat_str]


    y_true_str = [[label if label in t else 0 for label in unique_labels] for t in y_str]
    y_pred_str = [[label if label in p else 0 for label in unique_labels] for p in y_hat_str]
    #unique_labels = ['no_bias' if not label else label for label in unique_labels]

    print('\n------------------ Confusion Matrix ------------------')
    conf_matrix = confusion_matrix(np.asarray(y_true).argmax(axis=1), np.asarray(y_pred).argmax(axis=1))
    df_cm = pd.DataFrame(conf_matrix, index=unique_labels, columns=unique_labels)
    print(df_cm)
    print('\n--------- Classification Report ------------------')
    print(classification_report(y_true, y_pred, target_names=unique_labels))#, target_names=list(id2label.values())))

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)

    f1_micro = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    f1_samples = f1_score(y_true=y_true, y_pred=y_pred, average='samples')
    f1_weighted = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

    precision_micro = precision_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall_micro = recall_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc_micro = roc_auc_score(y_true=y_true, y_score=y_pred, average='micro')
    
    
    #for i in range(1): #range(len(y_true_str)):
    #    yt  = [num for num in y_true_str[i] if num != 0]
    #    yth  = [num for num in y_pred_str[i] if num != 0]
    #    print(f't: {yt}')
    #    print(f'p: {yth}')
    #    print(f't: {y_true[i]}')
    #    print(f'p: {y_pred[i]}')
    #    print(f"accuracy: {accuracy_score(y_true=y_true[i], y_pred=y_pred[i])}")
    #    print(f"precision: {precision_score(y_true=y_true[i], y_pred=y_pred[i], average='micro')}")
    #    print(f"recall: {recall_score(y_true=y_true[i], y_pred=y_pred[i], average='micro')}")
    #    print('----------------------')
    
    metrics = {
        'accuracy': accuracy,
        f'f1_micro': f1_micro,
        f'f1_macro': f1_macro,
        f'f1_samples': f1_samples,
        f'f1_weighted': f1_weighted,
        f'precision_micro': precision_micro,
        f'recall_micro': recall_micro,
        f'roc_auc_micro': roc_auc_micro}
    return metrics


# Train

In [21]:
from transformers import TrainerCallback

training_args = Seq2SeqTrainingArguments(
    output_dir=results_output_dir,
    #logging_dir=logging_dir,  # logging & evaluation strategies
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
   # weight_decay=weight_decay
    #report_to="tensorboard",
    #push_to_hub=True,
    #hub_strategy="every_save",
    #hub_model_id=REPOSITORY_ID,
    #hub_token=HfFolder.get_token(),
)

tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=["text", "labels"]
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

nltk.download("punkt")

label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
)

#early_stop = transformers.EarlyStoppingCallback(10, 1.15)
class PrintClassificationCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        print("----------------------------------------------------------")




Keys of tokenized dataset: ['id', 'labels', 'input_ids', 'attention_mask']


[nltk_data] Downloading package punkt to /home/teveritt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    callbacks=[PrintClassificationCallback]
)

model.config.use_cache = False  # Silence the warnings.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Samples,F1 Weighted,Precision Micro,Recall Micro,Roc Auc Micro
1,5.7664,4.776971,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.5
2,3.9246,3.709516,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3,2.8373,3.10704,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.5
4,2.5221,2.819735,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.5


[[    1     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [ 6949   485     1     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [  879     1     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [21546     1     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [21546     1     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [    1     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [    1     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [    3    52     9  4703     1     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]


KeyboardInterrupt: 

In [None]:
test_results = trainer.evaluate(eval_dataset=tokenized_dataset['test'])
test_results

In [14]:
import pandas as pd
df = pd.DataFrame(list(test_results.items()), columns=['Metric', 'Value'])
print(df.to_string(index=False))

                 Metric     Value
              eval_loss  0.810158
          eval_accuracy  0.590693
          eval_f1_micro  0.517121
          eval_f1_macro  0.546836
        eval_f1_samples  0.293357
       eval_f1_weighted  0.545981
   eval_precision_micro  0.427252
      eval_recall_micro  0.654867
     eval_roc_auc_micro  0.790995
           eval_runtime 18.095300
eval_samples_per_second 58.192000
  eval_steps_per_second  7.295000
                  epoch 10.000000


In [15]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def classify_text(text, model, tokenizer, label_columns, device):
    input_text = f"classify: {text}"
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs)
    predicted_labels = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_labels = [label.strip() for label in predicted_labels.split(',')]
    label_dict = {label: False for label in label_columns}
    for label in predicted_labels:
        if label in label_dict:
            label_dict[label] = True
    return label_dict

In [16]:
text = "Looking for a native English speaker"

classify_text(text, model, tokenizer, labels, device)

{'age': False,
 'disability': False,
 'masculine': False,
 'feminine': False,
 'racial': True,
 'sexuality': False,
 'general': False}

# Pushing to Hugging Face

In [17]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:

from huggingface_hub import ModelCard, EvalResult, ModelCardData, HfFolder
import platform
import sys
import os

model.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())
tokenizer.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())

###### Update Model Card ######

eval_results = []
for k, v in test_results.items():
    eval_results.append(EvalResult(
        task_type='multi_label_classification',
        dataset_type='mix_human-eval_synthetic',
        dataset_name=dataset_id,
        metric_type=k.replace("eval_", "", 1),
        metric_value=v))

direct_use = """
    ```python
    from transformers import pipeline

    pipe = pipeline("text-classification", model="${hub_model_id}", return_all_scores=True)

    results = pipe("Join our dynamic and fast-paced team as a Junior Marketing Specialist. We seek a tech-savvy and energetic individual who thrives in a vibrant environment. Ideal candidates are digital natives with a fresh perspective, ready to adapt quickly to new trends. You should have recent experience in social media strategies and a strong understanding of current digital marketing tools. We're looking for someone with a youthful mindset, eager to bring innovative ideas to our young and ambitious team. If you're a recent graduate or early in your career, this opportunity is perfect for you!")
    print(results)
    ```
    >> [[
    {'label': 'age', 'score': 0.9883460402488708}, 
    {'label': 'disability', 'score': 0.00787709467113018}, 
    {'label': 'feminine', 'score': 0.007224376779049635}, 
    {'label': 'general', 'score': 0.09967829287052155}, 
    {'label': 'masculine', 'score': 0.0035264550242573023}, 
    {'label': 'racial', 'score': 0.014618005603551865}, 
    {'label': 'sexuality', 'score': 0.005568435415625572}
    ]]
    """
direct_use = direct_use.replace('${hub_model_id}', hub_model_id, -1)

card_data = ModelCardData(
    model_id=model_id,
    model_name=model_id,
    model_description="The model is a multi-label classifier designed to detect various types of bias within job descriptions.",
    base_model=base_model_id,
    language='en',
    license='apache-2.0',
    developers="Tristan Everitt and Paul Ryan",
    model_card_authors='See developers',
    model_card_contact='See developers',
    repo="https://gitlab.computing.dcu.ie/everitt2/2024-mcm-everitt-ryan",
    eval_results=eval_results,
    compute_infrastructure=f'{platform.system()} {platform.release()} {platform.processor()}',
    # hardware_requirements=f"CPUs: {psutil.cpu_count()}, Memory: {psutil.virtual_memory().total} bytes",
    software=f'Python {platform.python_version()}',
    hardware_type=platform.machine(),
    hours_used='N/A',
    cloud_provider='N/A',
    cloud_region='N/A',
    co2_emitted='N/A',
    datasets=[dataset_id],
    direct_use=direct_use
)

card = ModelCard.from_template(card_data)

card.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/2024-mcm-everitt-ryan/flan-t5-small-seq2seq-job-bias-mixed/commit/6c498663facc8e84326823f0f534fde2db2718de', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='6c498663facc8e84326823f0f534fde2db2718de', pr_url=None, pr_revision=None, pr_num=None)