Adapted from the following, but changed to handle multi-label
https://github.com/VanekPetr/flan-t5-text-classifier/blob/main/classifier/AutoModelForSeq2SeqLM/flan-t5-finetuning.py

# Login to Hugging Face

In [2]:
#!pip install -q transformers datasets sentencepiece accelerate evaluate peft bitsandbytes protobuf hf_transfer scikit-learn nltk tiktoken huggingface_hub

In [3]:
from huggingface_hub import login
import os

login(token=os.getenv("HF_TOKEN"))

#from huggingface_hub import notebook_login
#notebook_login()
# Setup

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import warnings

warnings.filterwarnings('ignore')

base_model_id = 'google/flan-t5-xl'

seed = 2024

use_lora = True
use_quantization = use_lora and True
use_fp16 = not use_quantization and True

# Training
num_train_epochs = 3
batch_size = 8

use_gradient_checkpointing = False,  # Save some memory at the expense of training
# See https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one
gradient_accumulation_steps = 1

#learning_rate = 5e-5
#learning_rate=3e-4
learning_rate = 1e-3

# Regularisation
dropout_rate = 0.1
weight_decay = 0.001

# Evaluation
label_threshold = 0.5

# Misc
results_output_dir = 'results'
logging_dir = 'logs'

hf_site_id = '2024-mcm-everitt-ryan'
dataset_id = f'{hf_site_id}/job-bias-synthetic-human-benchmark-v2'
#dataset_id = f'{hf_site_id}/job-bias-synthetic-human-verified'
base_model_name = base_model_id.split('/')[-1]
model_id = f'{base_model_name}-job-bias-qlora-seq2seq-cls'
hub_model_id = f'{hf_site_id}/{model_id}'


# Dataset

In [5]:
from datasets import load_dataset

dataset = load_dataset(dataset_id)
column_names = dataset['train'].column_names

text_col = 'text'
label_cols = [col for col in column_names if col.startswith('label_')]

labels = [label.replace("label_", "") for label in label_cols]

id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Remove all columns apart from the two needed for multi-class classification
keep_columns = ['id', text_col] + label_cols
for split in ["train", "val", "test"]:
    dataset[split] = dataset[split].remove_columns(
        [col for col in dataset[split].column_names if col not in keep_columns])

for type in ['train', 'val', 'test']:
    dataset[type] = dataset[type].shuffle(seed=seed)#.select(range(1000))

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_feminine', 'label_general', 'label_masculine', 'label_neutral', 'label_racial', 'label_sexuality', 'text'],
        num_rows: 4609
    })
    val: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_feminine', 'label_general', 'label_masculine', 'label_neutral', 'label_racial', 'label_sexuality', 'text'],
        num_rows: 593
    })
    test: Dataset({
        features: ['id', 'label_age', 'label_disability', 'label_feminine', 'label_general', 'label_masculine', 'label_neutral', 'label_racial', 'label_sexuality', 'text'],
        num_rows: 584
    })
})

# Tokeniser

In [6]:
from transformers import AutoTokenizer
from huggingface_hub import HfFolder

tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=HfFolder.get_token())
#tokenizer

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [7]:
from datasets import concatenate_datasets
from transformers import Seq2SeqTrainingArguments

tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["text"], truncation=True),
    batched=True,
    remove_columns=dataset['train'].column_names,
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")


# Prepare target sequences for T5
def create_target_sequence(example):
    labels = [key.replace('label_', '') for key, value in example.items() if key.startswith('label_') and value]
    labels = ','.join(labels)
    labels = labels.strip()
    return labels


# Add target sequence to the dataset
dataset = dataset.map(lambda x: {'labels': create_target_sequence(x)},
                      remove_columns=[col for col in dataset['train'].column_names if col.startswith('label_')])

# Tokenise targets
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["labels"], truncation=True),
    batched=True,
    remove_columns=dataset['train'].column_names,
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/5193 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/5193 [00:00<?, ? examples/s]

Max target length: 13


In [8]:
#tokenized_targets["input_ids"]

# Model

In [9]:
from transformers import BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Activate 4-bit precision base model loading
    bnb_4bit_quant_type="nf4",  # Quantization type (fp4 or nf4)
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute dtype for 4-bit base models
    bnb_4bit_use_double_quant=True,  # Activate nested quantization for 4-bit base models (double quantization)
)

In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoConfig

config = AutoConfig.from_pretrained(base_model_id, dropout_rate=dropout_rate)

def get_base_model():
    model = AutoModelForSeq2SeqLM.from_pretrained(
        base_model_id,
        quantization_config=bnb_config,
        low_cpu_mem_usage=True,
        config=config
    )
    
    return model

model = get_base_model()

#model

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:

from peft import get_peft_model, LoraConfig, TaskType
import torch


peft_trainable_parameters = ""
if use_lora:
    print('Using LoRA')
    lora_config = LoraConfig(
        r=32,  #2, #16,
        lora_alpha=32,  #, 16, #8
        target_modules='all-linear',
        #target_modules=['q', 'v'],
        lora_dropout=0.1,
        bias='none',
        task_type=TaskType.SEQ_2_SEQ_LM
    )

    model = get_peft_model(model, lora_config)
    print(model.print_trainable_parameters())
    
    peft_trainable_parameters = model.print_trainable_parameters()
else:
    # Freeze the pre-trained model's parameters
    for param in model.base_model.parameters():
        param.requires_grad = False

model.config.pad_token_id = model.config.eos_token_id
model.config.use_cache = False  # Silence the warnings.
#model.config.pretraining_tp = 1


model

Using LoRA
trainable params: 9,437,184 || all params: 2,859,194,368 || trainable%: 0.33006444422319176
None


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 2048)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 2048)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2048, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=2048, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()

In [12]:
model.config

T5Config {
  "_name_or_path": "google/flan-t5-xl",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 5120,
  "d_kv": 64,
  "d_model": 2048,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 32,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_

In [1]:
modules = set()
for name, module in model.named_modules():
    l = name.split('.')[-1].strip()
    if l and not any(i.isdigit() for i in l):
        modules.add(l)

modules

NameError: name 'model' is not defined

# Preprocessing/Evaluation functions

In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, \
    classification_report
import nltk
from transformers import  DataCollatorForSeq2Seq, Seq2SeqTrainer
import numpy as np
from nltk import sent_tokenize
from typing import List, Tuple
from datasets import Dataset


def preprocess_function(sample: Dataset, padding: str = "max_length") -> dict:
    """Preprocess the dataset."""
    inputs = [item for item in sample["text"]]
    labels = [item for item in sample["labels"]]

    model_inputs = tokenizer(
        inputs, max_length=max_source_length, padding=padding, truncation=True
    )

    labels = tokenizer(
        text_target=labels, max_length=max_target_length, padding=padding, truncation=True
    )

    if padding == "max_length":
        labels["input_ids"] = [
            [(la if la != tokenizer.pad_token_id else -100) for la in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def postprocess_text(labels: List[str], preds: List[str]) -> Tuple[List[str], List[str]]:
    """Helper function to postprocess text"""
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]
    return labels, preds


def compute_metrics(eval_predictions):
    
    y_hat, y = eval_predictions
    
    # Replace -100 in the labels .
    y = np.where(y != -100, y, tokenizer.pad_token_id)
    
    if isinstance(y_hat, tuple):
        y_hat = y_hat[0]
        
    y_str = tokenizer.batch_decode(y, skip_special_tokens=True)
    y_hat_str = tokenizer.batch_decode(y_hat, skip_special_tokens=True)

    #print(f'y_str:decoded:::{y_str}')
    #print(f'pred_flat:decoded:::{y_hat_str}')
    
    y_str, y_hat_str = postprocess_text( y_str, y_hat_str)
    
    #print(f'y_str:post:::{y_str}')
    #print(f'y_hat_str:post:::{y_hat_str}')
    
    # Flatten the list of labels
    true_flat = [label.strip() for sublist in [t.split(',') for t in y_str] for label in sublist]
    #pred_flat = [label.strip() for sublist in [p.split(',') for p in y_hat_str] for label in sublist]
    
    #print(f'true_flat:::{true_flat}')
    #print(f'pred_flat:::{pred_flat}')
    
    # Convert to binary format for multi-label metrics
    unique_labels = list(set(true_flat))
    #print(f'unique_labels:::{unique_labels}')
    
    # Remove the blank label (no bias)
    unique_labels = list([label for label in unique_labels if label != '' and label is not None])
    unique_labels = sorted(list(set(unique_labels)))
    target_names=sorted(list(set(id2label.values())))
    use_auc_roc = len(unique_labels) == len(target_names)

    #print(f'unique_labels:::{unique_labels}')
    #print(f'target_names:::{target_names}')
    
    y_true = [[1 if label in t else 0 for label in target_names] for t in y_str]
    y_pred = [[1 if label in p else 0 for label in target_names] for p in y_hat_str]

    #print(f'y_true:::{y_true}')
    #print(f'y_pred:::{y_pred}')
    print(classification_report(y_true, y_pred, target_names=target_names))
    
    # return as dictionary
    metrics = {
        'accuracy': accuracy_score(y_true=y_true, y_pred=y_pred)
    }
    
    for average in ['micro','macro','samples','weighted']:
        metrics[f'f1_{average}'] = f1_score(y_true=y_true, y_pred=y_pred, average=average)
        metrics[f'precision_{average}'] = precision_score(y_true=y_true, y_pred=y_pred, average=average)
        metrics[f'recall_{average}'] = recall_score(y_true=y_true, y_pred=y_pred, average=average)
        if use_auc_roc:
            metrics[f'roc_auc_{average}'] = roc_auc_score(y_true=y_true, y_score=y_pred, average=average)
    
    return metrics


2024-07-16 13:14:34.112223: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-16 13:14:34.178007: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-16 13:14:34.178064: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-16 13:14:34.185671: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-16 13:14:34.207024: I tensorflow/core/platform/cpu_feature_guar

# Train

In [13]:
from transformers import TrainerCallback

args = Seq2SeqTrainingArguments(
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=learning_rate,
    output_dir=results_output_dir,
    #logging_dir=logging_dir,  # logging & evaluation strategies
    #auto_find_batch_size=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    #report_to="tensorboard",
    #push_to_hub=True,
    #hub_strategy="every_save",
    #hub_model_id=REPOSITORY_ID,
    #hub_token=HfFolder.get_token(),
)

encoded_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=["text", "labels"]
)
print(f"Keys of tokenized dataset: {list(encoded_dataset['train'].features)}")

nltk.download("punkt")

label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
)


#early_stop = transformers.EarlyStoppingCallback(10, 1.15)
class PrintClassificationCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        print("----------------------------------------------------------")




Map:   0%|          | 0/593 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['id', 'labels', 'input_ids', 'attention_mask']


[nltk_data] Downloading package punkt to /home/teveritt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["val"],
    compute_metrics=compute_metrics,
    callbacks=[PrintClassificationCallback]
)

model.config.use_cache = False  # Silence the warnings.

!nvidia-smi

Tue Jul 16 13:14:36 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A2000 12GB          On  | 00000000:1C:00.0 Off |                  Off |
| 30%   59C    P2              26W /  70W |   3359MiB / 12282MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
trainer.train()

[2024-07-16 13:14:36,701] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,Precision Micro,Recall Micro,Roc Auc Micro,F1 Macro,Precision Macro,Recall Macro,Roc Auc Macro,F1 Samples,Precision Samples,Recall Samples,Roc Auc Samples,F1 Weighted,Precision Weighted,Recall Weighted,Roc Auc Weighted
1,2.4728,1.238281,0.123103,0.305998,0.251509,0.390625,0.604669,0.319589,0.326518,0.390625,0.604669,0.292074,0.248454,0.401349,0.609959,0.319589,0.326518,0.390625,0.604669
2,0.8399,1.181641,0.111298,0.310174,0.257202,0.390625,0.60735,0.325875,0.320083,0.390625,0.60735,0.287071,0.240585,0.402192,0.612963,0.325875,0.320083,0.390625,0.60735
3,0.7506,1.195312,0.124789,0.306202,0.261013,0.370312,0.603407,0.320832,0.322988,0.370312,0.603407,0.280109,0.239741,0.38027,0.608185,0.320832,0.322988,0.370312,0.603407


              precision    recall  f1-score   support

         age       0.51      0.53      0.52        80
  disability       0.26      0.51      0.35        80
    feminine       0.18      0.38      0.25        80
     general       0.15      0.54      0.23        80
   masculine       0.09      0.03      0.04        80
     neutral       0.16      0.38      0.22        80
      racial       0.87      0.60      0.71        80
   sexuality       0.39      0.17      0.24        80

   micro avg       0.25      0.39      0.31       640
   macro avg       0.33      0.39      0.32       640
weighted avg       0.33      0.39      0.32       640
 samples avg       0.25      0.40      0.29       640

----------------------------------------------------------
              precision    recall  f1-score   support

         age       0.51      0.54      0.52        80
  disability       0.33      0.47      0.39        80
    feminine       0.15      0.30      0.20        80
     general       

TrainOutput(global_step=1731, training_loss=1.354446807808438, metrics={'train_runtime': 1018.5947, 'train_samples_per_second': 13.575, 'train_steps_per_second': 1.699, 'total_flos': 9543300967563264.0, 'train_loss': 1.354446807808438, 'epoch': 3.0})

# Evaluate

In [16]:
test_results = trainer.evaluate(eval_dataset=encoded_dataset['test'])
test_results

              precision    recall  f1-score   support

         age       0.48      0.39      0.43        80
  disability       0.37      0.47      0.42        80
    feminine       0.09      0.16      0.12        80
     general       0.16      0.64      0.25        80
   masculine       0.34      0.14      0.20        80
     neutral       0.16      0.30      0.21        80
      racial       0.84      0.82      0.83        80
   sexuality       0.14      0.09      0.11        80

   micro avg       0.25      0.38      0.30       640
   macro avg       0.32      0.38      0.32       640
weighted avg       0.32      0.38      0.32       640
 samples avg       0.24      0.39      0.28       640

----------------------------------------------------------


{'eval_loss': 1.1513671875,
 'eval_accuracy': 0.11301369863013698,
 'eval_f1_micro': 0.3027638190954774,
 'eval_precision_micro': 0.25315126050420167,
 'eval_recall_micro': 0.3765625,
 'eval_roc_auc_micro': 0.6001116071428572,
 'eval_f1_macro': 0.3191085071327211,
 'eval_precision_macro': 0.3218026766059445,
 'eval_recall_macro': 0.3765625,
 'eval_roc_auc_macro': 0.6001116071428572,
 'eval_f1_samples': 0.28042237442922374,
 'eval_precision_samples': 0.23715753424657535,
 'eval_recall_samples': 0.39140981735159813,
 'eval_roc_auc_samples': 0.6073630136986302,
 'eval_f1_weighted': 0.3191085071327211,
 'eval_precision_weighted': 0.3218026766059445,
 'eval_recall_weighted': 0.3765625,
 'eval_roc_auc_weighted': 0.6001116071428572,
 'eval_runtime': 44.7483,
 'eval_samples_per_second': 13.051,
 'eval_steps_per_second': 1.631,
 'epoch': 3.0}

In [17]:
y_true = dataset["test"].map(
    lambda x: tokenizer(x["labels"], truncation=True, max_length=13, padding='max_length'),
    batched=True,
    remove_columns=dataset['train'].column_names,
)
y_true = y_true['input_ids']
y_true = np.where(y_true != -100, y_true, tokenizer.pad_token_id)

predictions = trainer.predict(encoded_dataset['test'])
y_pred = predictions.predictions


Map:   0%|          | 0/584 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

         age       0.48      0.39      0.43        80
  disability       0.37      0.47      0.42        80
    feminine       0.09      0.16      0.12        80
     general       0.16      0.64      0.25        80
   masculine       0.34      0.14      0.20        80
     neutral       0.16      0.30      0.21        80
      racial       0.84      0.82      0.83        80
   sexuality       0.14      0.09      0.11        80

   micro avg       0.25      0.38      0.30       640
   macro avg       0.32      0.38      0.32       640
weighted avg       0.32      0.38      0.32       640
 samples avg       0.24      0.39      0.28       640



In [1]:

y_str = tokenizer.batch_decode(y_true, skip_special_tokens=True)
y_hat_str = tokenizer.batch_decode(y_pred, skip_special_tokens=True)

y_str, y_hat_str = postprocess_text(y_str, y_hat_str)

# Flatten the list of labels
true_flat = [label.strip() for sublist in [t.split(',') for t in y_str] for label in sublist]
pred_flat = [label.strip() for sublist in [p.split(',') for p in y_hat_str] for label in sublist]

target_names=sorted(list(set(id2label.values())))

y_true = [[1 if label in t else 0 for label in target_names] for t in y_str]
y_pred = [[1 if label in p else 0 for label in target_names] for p in y_hat_str]

report = classification_report(y_true, y_pred, target_names=target_names)

#print(report)

# Convert to Markdown
report_lines = report.split('\n')
markdown_classification_report = "\n".join([f"    {line}" for line in report_lines])
print(markdown_classification_report)

NameError: name 'tokenizer' is not defined

In [19]:
import pandas as pd

df = pd.DataFrame(list(test_results.items()), columns=['Metric', 'Value'])
print(df.to_string(index=False))
import pandas as pd

df = pd.DataFrame(list(test_results.items()), columns=['Metric', 'Value'])
print(df.to_string(index=False))

                 Metric     Value
              eval_loss  1.151367
          eval_accuracy  0.113014
          eval_f1_micro  0.302764
   eval_precision_micro  0.253151
      eval_recall_micro  0.376563
     eval_roc_auc_micro  0.600112
          eval_f1_macro  0.319109
   eval_precision_macro  0.321803
      eval_recall_macro  0.376563
     eval_roc_auc_macro  0.600112
        eval_f1_samples  0.280422
 eval_precision_samples  0.237158
    eval_recall_samples  0.391410
   eval_roc_auc_samples  0.607363
       eval_f1_weighted  0.319109
eval_precision_weighted  0.321803
   eval_recall_weighted  0.376563
  eval_roc_auc_weighted  0.600112
           eval_runtime 44.748300
eval_samples_per_second 13.051000
  eval_steps_per_second  1.631000
                  epoch  3.000000
                 Metric     Value
              eval_loss  1.151367
          eval_accuracy  0.113014
          eval_f1_micro  0.302764
   eval_precision_micro  0.253151
      eval_recall_micro  0.376563
     eval_roc_

In [20]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def classify_text(text, model, tokenizer, label_columns, device):
    input_text = f"classify: {text}"
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs)
    predicted_labels = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_labels = [label.strip() for label in predicted_labels.split(',')]
    label_dict = {label: False for label in label_columns}
    for label in predicted_labels:
        if label in label_dict:
            label_dict[label] = True
    return label_dict

In [21]:
text = "Looking for a native English speaker"

classify_text(text, model, tokenizer, labels, device)

{'age': False,
 'disability': False,
 'feminine': False,
 'general': False,
 'masculine': False,
 'neutral': False,
 'racial': False,
 'sexuality': False}

# Push to Hugging Face

In [None]:
model.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())
tokenizer.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())

# Merge Adapter and Base Model

In [None]:
# Get peft config.
from peft import PeftConfig
config = PeftConfig.from_pretrained(hub_model_id)

# Get base model
model = get_base_model()

# Load the Lora model.
from peft import PeftModel
model = PeftModel.from_pretrained(model, hub_model_id, torch_dtype=torch.bfloat16, is_trainable=False)

# Merge model and Lora adapter.
merged_model = model.merge_and_unload()

# Push to HF Hub.
merged_model.push_to_hub(hub_model_id)
tokenizer.push_to_hub(hub_model_id)

# Update Model Card 

In [22]:
from huggingface_hub import ModelCard, EvalResult, ModelCardData
import platform
import json
import sys
import os

training_regime = []
if use_lora:
    training_regime.append(f'PEFT: {peft_trainable_parameters}')
training_regime_args = args.to_sanitized_dict()
for k,v in training_regime_args.items():
    if (isinstance(v, (int, str, bool, float))
            and '_dir' not in k
            and 'logging' not in k
            and 'log_' not in k
            and 'hub_' not in k
            and '_hub' not in k
            and 'save_' not in k
            and 'run_name' not in k
            and 'debug' not in k
            and 'token' not in k):
        training_regime.append(f'{k}={json.dumps(v)}')

training_regime = sorted(training_regime)

training_regime = ', '.join(training_regime)


## Hardware
compute_infrastructure = []
mem_total = !cat /proc/meminfo | grep MemTotal
mem_total = list(set(mem_total))[0]
cpu_info = !cat /proc/cpuinfo | grep "model name"
cpu_count = len(list(cpu_info))
cpu_name = list(set(cpu_info))[0]
cpu_name = cpu_name.strip()
cpu_name = cpu_name.replace('model name\t:', '')
cpu_name = cpu_name.strip()

compute_infrastructure.append(f'- {platform.system()} {platform.release()} {platform.processor()}')
compute_infrastructure.append(f'- {mem_total}')
compute_infrastructure.append(f'- {cpu_count} X {cpu_name}')


gpu_name = !nvidia-smi --query-gpu=gpu_name --format=csv,noheader
gpus = set()
for idx, gpu in enumerate(gpu_name):
    compute_infrastructure.append(f"- GPU_{idx}: {gpu}")
    gpus.add(gpu)

gpus =list(gpus)
compute_infrastructure = '\n'.join(compute_infrastructure)
hardware_type = f'{len(gpus)} X {gpus[0]}'

## Software
software_list = !pip list
inc_software = []
inc_software.append(f'python {platform.python_version()}')

for software in software_list:
    if software and '[notice]' not in software and '---' not in software and 'Package' not in software:
        inc_software.append(' '.join(software.split()))
 

software = ", ".join(inc_software)   

hours_used = ""
eval_results = []
for k, v in test_results.items():
    metric_type = k.replace("eval_", "", 1)
    if metric_type == 'runtime':
        hours_used = f"{int(v)/60.0:.2f}"
    eval_results.append(EvalResult(
        task_type='multi_label_classification',
        dataset_type='mix_human-eval_synthetic',
        dataset_name=dataset_id,
        metric_type=metric_type,
        metric_value=v))

direct_use = """
    ```python
    from transformers import pipeline

    pipe = pipeline("text-classification", model="${hub_model_id}", return_all_scores=True)

    results = pipe("Join our dynamic and fast-paced team as a Junior Marketing Specialist. We seek a tech-savvy and energetic individual who thrives in a vibrant environment. Ideal candidates are digital natives with a fresh perspective, ready to adapt quickly to new trends. You should have recent experience in social media strategies and a strong understanding of current digital marketing tools. We're looking for someone with a youthful mindset, eager to bring innovative ideas to our young and ambitious team. If you're a recent graduate or early in your career, this opportunity is perfect for you!")
    print(results)
    ```
    >> [[
    {'label': 'age', 'score': 0.9883460402488708}, 
    {'label': 'disability', 'score': 0.00787709467113018}, 
    {'label': 'feminine', 'score': 0.007224376779049635}, 
    {'label': 'general', 'score': 0.09967829287052155}, 
    {'label': 'masculine', 'score': 0.0035264550242573023}, 
    {'label': 'racial', 'score': 0.014618005603551865}, 
    {'label': 'sexuality', 'score': 0.005568435415625572}
    ]]
    """

direct_use = direct_use.replace('${hub_model_id}', hub_model_id, -1)

card_data = ModelCardData(
    model_id=model_id,
    model_name=model_id,
    model_description="The model is a multi-label classifier designed to detect various types of bias within job descriptions.",
    base_model=base_model_id,
    language='en',
    license='apache-2.0',
    developers="Tristan Everitt and Paul Ryan",
    model_card_authors='See developers',
    model_card_contact='See developers',
    repo="https://gitlab.computing.dcu.ie/everitt2/2024-mcm-everitt-ryan",
    training_regime=training_regime,
    eval_results=eval_results,
    results=markdown_classification_report,
    compute_infrastructure=compute_infrastructure,
    # hardware_requirements='N/A',
    software=software,
    hardware_type=hardware_type,
    hours_used=hours_used,
    cloud_provider='N/A',
    cloud_region='N/A',
    co2_emitted='N/A',
    datasets=[dataset_id],
    direct_use=direct_use
)

card = ModelCard.from_template(card_data)

card.push_to_hub(repo_id=hub_model_id, token=HfFolder.get_token())

adapter_model.safetensors:   0%|          | 0.00/7.10M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/2024-mcm-everitt-ryan/flan-t5-base-job-bias-4bit-qlora-seq2seq-cls/commit/2d80218fb5c98a0fbcb364d97506db03cd98173c', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='2d80218fb5c98a0fbcb364d97506db03cd98173c', pr_url=None, pr_revision=None, pr_num=None)