# Supervised Fine-Tuning

Download the model

In [1]:
# !huggingface-cli login
# !huggingface-cli download meta-llama/Meta-Llama-3-8B --local-dir ../LLMs/llama3-8b --local-dir-use-symlinks False

Import dependencies

In [2]:
from huggingface_hub import login as huggingface_hub_login
from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, AutoConfig, AutoModelForSequenceClassification
import torch
from peft import LoraConfig, get_peft_model, TaskType
from dotenv import load_dotenv
import os
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from pprint import pprint
from collections import Counter
import random

  from .autonotebook import tqdm as notebook_tqdm


Log into Hugging Face

In [3]:
load_dotenv(dotenv_path="../secrets/.env")

hugging_face_token = os.getenv("HUGGING_FACE_TOKEN")

huggingface_hub_login(hugging_face_token)

Quantization config

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

Load model and tokenizer

In [5]:
MODEL_PATH = "../LLMs/llama3-8b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)

config = AutoConfig.from_pretrained(MODEL_PATH, num_labels=2)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    config=config,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
    ignore_mismatched_sizes=True
)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.38s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../LLMs/llama3-8b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Split the dataset into training and avaluation

In [6]:
dataset = load_dataset("json", data_files="../datasets/dataset.jsonl", split="train")

split_ratio = 0.8
split_index = int(len(dataset) * split_ratio)

train_dataset = dataset.select(range(0, split_index))
eval_dataset = dataset.select(range(split_index, len(dataset)))

# Shuffle ONLY the training set
train_dataset = train_dataset.shuffle(seed=42)

train_dataset.to_json("../datasets/train.jsonl", orient="records", lines=True)
eval_dataset.to_json("../datasets/eval.jsonl", orient="records", lines=True)

print(f"✅ Done! {len(train_dataset)} training / {len(eval_dataset)} evaluation samples.")


Creating json from Arrow format: 100%|██████████| 8/8 [00:01<00:00,  5.40ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  3.36ba/s]

✅ Done! 7444 training / 1862 evaluation samples.





Format datasets for classification

In [7]:
def format_for_classification(example):
    return {
        "text": example['prompt'],
        "label": int(example["response"])
    }

train_formatted_dataset = train_dataset.map(format_for_classification)
eval_formatted_dataset = eval_dataset.map(format_for_classification)

Upsample the minority class to address class imbalance

In [8]:
majority_class = train_formatted_dataset.filter(lambda example: example['label'] == 0)
minority_class = train_formatted_dataset.filter(lambda example: example['label'] == 1)

n_majority = len(majority_class)
n_minority = len(minority_class)

minority_upsampled = minority_class.shuffle(seed=42).select(
    [random.randint(0, n_minority - 1) for _ in range(n_majority)]
)

balanced_dataset = concatenate_datasets([majority_class, minority_upsampled])

train_balanced_formatted_dataset = balanced_dataset.shuffle(seed=42)

print(Counter(balanced_dataset['label']))

Counter({0: 7345, 1: 7345})


Tokenize datasets

In [9]:
def tokenize_class(example):
    encoding = tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)
    encoding["labels"] = example["label"]
    return encoding

train_tokenized_dataset = train_balanced_formatted_dataset.map(tokenize_class, batched=True)
eval_tokenized_dataset = eval_formatted_dataset.map(tokenize_class, batched=True)

Map: 100%|██████████| 14690/14690 [03:03<00:00, 80.13 examples/s]


A sneak peek into the resulting datasets

In [10]:
# pprint(train_balanced_formatted_dataset[0])
# pprint(train_balanced_formatted_dataset.column_names)

# label_counts = Counter(train_formatted_dataset['label'])
# print(label_counts)
# 99/7345

Apply LoRA with PEFT

In [11]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 13,639,680 || all params: 7,518,572,544 || trainable%: 0.1814


Define evaluation metrics

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
    preds = np.argmax(probs, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average="binary"), # Of all samples predicted as class 1, how many were actually 1?
        "recall": recall_score(labels, preds, average="binary"), # Of all actual class 1 samples, how many did the model correctly predict?
        "f1": f1_score(labels, preds, average="binary"),
        "auc": roc_auc_score(labels, probs[:, 1]),
    }

Set Up TrainingArguments and Trainer

In [13]:
training_args = TrainingArguments(
    output_dir="./training/output/llama-classifier",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./training/logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Train the model

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

Save the fine-tuned model

In [None]:
trainer.save_model("../LLMs/finetuned/classification/model/llama3-8B")
tokenizer.save_pretrained("../LLMs/finetuned/classification/tokenizer/llama3-8B")

Test the fine-tuned model

In [None]:
predictions = trainer.predict(eval_tokenized_dataset)
logits = predictions.predictions
probs = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)

print("Probabilities for class 1:", probs[:, 1])
