### Setting up the environment

In [None]:
# Setting up the environment
%%capture
!pip install unsloth
!pip install evaluate
!pip install rouge_score
# Install the latest nightly Unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install sacrebleu

In [None]:
# Get your Hugging Face token to access the model
# The token is looked up under the key 'HF_TOKEN' through google.colab's `userdata`
# helper instead of being hard-coded in the notebook; it is later passed as
# `token=` when the model is loaded.
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')


In [None]:
# Imports
import os
import torch
import numpy as np
import random
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm import tqdm
import evaluate
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template, standardize_sharegpt, train_on_responses_only
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq, TextStreamer
from trl import SFTTrainer
from collections import Counter
import re


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Set random seeds for reproducibility
# PyTorch, NumPy and Python's `random` are all seeded with the same value (3407),
# which is also the seed used for the LoRA initialisation and the trainer below.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(3407)


In [None]:
# Configuration parameters
# All three values are passed to FastLanguageModel.from_pretrained below;
# max_seq_length is additionally reused by the trainer and by the tokenizer calls
# in the inference and evaluation cells.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

### Loading Model

In [None]:
# Load the pre-trained model and tokenizer
# `from_pretrained` returns a (model, tokenizer) pair. The sequence length, dtype and
# 4-bit flag come from the configuration cell; HF_TOKEN authenticates the download.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",  # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=HF_TOKEN
)


==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Initialize the LoRA model
# Only the MLP projections (gate/up/down) are listed in `target_modules`, so no
# attention projection (q/k/v/o) receives an adapter in this notebook — hence the
# name `mlp_model`.
mlp_model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,  # Same value as the global seeds set above
    use_rslora=False,  # Support for rank stabilized LoRA
    loftq_config=None,  # Support for LoftQ
)


Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2024.11.7 patched 16 layers with 0 QKV layers, 0 O layers and 16 MLP layers.


### Setting up Train Dataset

In [None]:
# Set up the chat template
# Rebinds `tokenizer` to the object returned by get_chat_template; every later
# `apply_chat_template` call in this notebook uses this template.
# NOTE(review): the checkpoint is Llama-3.2 while the template name is "llama-3.1" —
# presumably both share the same chat format; confirm.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

In [None]:
# Data formatting function
def formatting_prompts_func(examples, add_generation_prompt=False):
    """Split each conversation into a chat-templated prompt and its reference reply.

    Relies on the notebook-level `tokenizer` (with its chat template already set).

    Args:
        examples: Batch mapping with a 'conversations' key holding a list of
            conversations; each conversation is a list of message dicts that
            carry 'role' and 'content' keys.
        add_generation_prompt: Forwarded to `tokenizer.apply_chat_template`.
            Defaults to False, which reproduces the previous behaviour. Pass True
            (e.g. `dataset.map(..., fn_kwargs={"add_generation_prompt": True})`)
            when the prompts are meant to be fed to `generate`.

    Returns:
        A dict of two lists, each with one entry per conversation:
            "input_text": the templated conversation WITHOUT a trailing assistant
                message ('' for an empty conversation).
            "labels": the content of that trailing assistant message, or '' when
                the conversation is empty or does not end with an assistant turn.

    NOTE(review): because "input_text" never contains the final assistant reply,
    using it as the text field for supervised fine-tuning would leave that reply
    out of the training text — confirm this is intended.
    """
    inputs = []
    labels = []
    for convo in examples['conversations']:
        # 'convo' is a list of messages; an empty one yields empty placeholders so
        # that the output lists stay aligned with the input batch.
        if not convo:
            inputs.append('')
            labels.append('')
            continue

        if convo[-1]['role'] == 'assistant':
            assistant_response = convo[-1]['content']
            input_convo = convo[:-1]  # Exclude the assistant's response
        else:
            assistant_response = ''
            input_convo = convo

        # Render the prompt as plain text (no tokenization at this stage)
        input_text = tokenizer.apply_chat_template(
            input_convo,
            tokenize=False,
            add_generation_prompt=add_generation_prompt
        )
        inputs.append(input_text)
        labels.append(assistant_response)
    return {"input_text": inputs, "labels": labels}


In [None]:
# Load and preprocess the training dataset
# Three steps, each rebinding `train_dataset`: load from disk, standardize the
# conversation format, then add the "input_text" / "labels" columns produced by
# formatting_prompts_func (which reads examples['conversations']).
# NOTE(review): presumably standardize_sharegpt yields messages with
# 'role'/'content' keys, as formatting_prompts_func expects — confirm.
train_dataset = load_from_disk('/content/train_data')  # Path where the dataset was saved
train_dataset = standardize_sharegpt(train_dataset)
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

In [None]:
# Data collator
# Built from the chat-templated tokenizer and the LoRA-wrapped model; handed to
# SFTTrainer below as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=mlp_model)

### Setting the Trainer

In [None]:
# Enable gradient checkpointing and manage GPU memory before initializing the trainer
# NOTE(review): the model was loaded in an earlier cell, so CUDA has presumably been
# initialised already; setting PYTORCH_CUDA_ALLOC_CONF this late may have no effect
# on the allocator — confirm, and consider setting it before the first CUDA use.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# NOTE(review): get_peft_model was already called with
# use_gradient_checkpointing="unsloth"; confirm this extra call is needed and does
# not replace that setting.
mlp_model.gradient_checkpointing_enable()
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
# Set up the trainer
# Effective batch size is 2 (per device) x 4 (gradient accumulation) = 8, and the run
# is capped at 60 optimizer steps rather than a number of epochs.
# NOTE(review): the text field is "input_text", which formatting_prompts_func builds
# WITHOUT the final assistant reply — confirm the training text contains the
# responses the model is supposed to learn.
trainer = SFTTrainer(
    model=mlp_model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="input_text",
    max_seq_length=max_seq_length,
    data_collator=data_collator,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        eval_strategy='no',  # No evaluation during training; see the evaluation section
        save_steps=10,
        save_total_limit=2,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,  # Same value as the global seeds set above
        output_dir="outputs",
        report_to="none",  # Use this for WandB, etc.
    )
)


max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Modify the trainer to focus on the assistant's responses
# The two marker strings are the user and assistant turn headers as they appear in
# the rendered chat text; `trainer` is rebound to the wrapped trainer.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
# Show current GPU memory stats
# Both figures are converted from bytes to GiB (1024**3 bytes) and rounded to three
# decimals; `start_gpu_memory` is the peak reserved memory before training starts.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = Tesla T4. Max memory = 14.748 GB.
1.137 GB of memory reserved.


In [None]:
# Start training
# Runs the fine-tuning loop configured above; the value returned by `train()` is
# kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,368 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 7,864,320


Step,Training Loss
1,3.2928
2,3.9598
3,3.8944
4,3.4327
5,3.3339
6,2.8763
7,2.6932
8,2.6078
9,2.2643
10,1.723


### Inference

In [None]:
# Enable faster inference
FastLanguageModel.for_inference(mlp_model)

# Device configuration
# `device` is also reused by the evaluation loop further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example message for generation
messages = [
    {"role": "user",
     "content": "Who is the target audience for the WiTT awards scholarship?"},
]

# Prepare inputs
# Rendered as text first (tokenize=False) and tokenized separately below.
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # Must add for generation
)

# Tokenize the input_text
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=max_seq_length,
).to(device)

# Generate response
# The streamer prints tokens as they are produced and skips the prompt, so the
# return value of `generate` is discarded.
# NOTE(review): `do_sample` is not passed, so `temperature` presumably has no
# effect here — confirm.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = mlp_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.0,
)


The target audience includes women pursuing careers or education in technology-related fields.<|eot_id|>


## Model Evaluation

In [None]:
# Load and preprocess the test dataset
# Same three-step pipeline as the training set: load, standardize, then add the
# "input_text" (prompt) and "labels" (reference reply) columns.
test_dataset = load_from_disk('/content/test_data')  # Path where the dataset was saved
test_dataset = standardize_sharegpt(test_dataset)
test_dataset = test_dataset.map(formatting_prompts_func, batched=True)


Standardizing format:   0%|          | 0/342 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

In [None]:
# Create a DataLoader for the test dataset
# shuffle=False keeps the examples in dataset order; the evaluation loop reads the
# "input_text" and "labels" fields of each batch of 4.
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)


In [None]:
# Prepare the model for evaluation
mlp_model.eval()
# Accumulators that the evaluation loop extends batch by batch; they are later
# passed together to compute_metrics.
all_predictions = []
all_labels = []

In [None]:
# Function to clean text (to extract assistant responses only)
def clean_text(text):
    """Extract the assistant's reply from decoded chat text.

    Everything after the FIRST "assistant" header (the word followed by optional
    whitespace and a blank line) is returned, stripped. When no such header exists,
    the text is split on "user" headers and the last piece — i.e. whatever follows
    the last "user" header, or the whole text if there is none — is returned,
    stripped.
    """
    found = re.search(r"assistant\s*\n\n(.*)", text, re.DOTALL)
    if found is None:
        # No assistant header: fall back to the tail after the last user header
        return re.split(r"user\s*\n\n", text)[-1].strip()
    return found.group(1).strip()

In [None]:
# Iterate through the test dataset and collect (prediction, reference) pairs.
# The accumulators are (re)initialised here so that re-running this cell does not
# append a second copy of every prediction and label.
all_predictions = []
all_labels = []

# Decoder-only models need LEFT padding for batched generation; with right padding
# the pad tokens would sit between a short prompt and its continuation. The
# original setting is restored afterwards so other cells are unaffected.
original_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        # Tokenize the inputs
        inputs = tokenizer(
            batch["input_text"],
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_seq_length
        ).to(device)

        with torch.no_grad():
            # Generate predictions
            outputs = mlp_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=128,
                temperature=1.0,
                use_cache=True
            )

        # `generate` returns prompt + continuation; keep only the newly generated
        # tokens so that the prompt text can never leak into the prediction.
        prompt_length = inputs["input_ids"].shape[1]
        generated_tokens = outputs[:, prompt_length:]

        # Decode predictions; clean_text drops a leading "assistant" header if the
        # model produced one.
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        cleaned_preds = [clean_text(pred) for pred in decoded_preds]

        # Use the actual labels
        cleaned_labels = batch["labels"]

        # Collect predictions and labels for metric computation
        all_predictions.extend(cleaned_preds)
        all_labels.extend(cleaned_labels)
finally:
    tokenizer.padding_side = original_padding_side

Evaluating: 100%|██████████| 86/86 [01:41<00:00,  1.18s/it]


In [None]:
# Initialize metrics from the `evaluate` library
# These four module-level objects are read by compute_metrics below.
metric_bleu = evaluate.load("bleu")
metric_rouge = evaluate.load("rouge")
metric_meteor = evaluate.load("meteor")
metric_sacrebleu = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
# Exact match scoring function
def exact_match_score(predictions, references):
    """Percentage of predictions equal to their reference after whitespace stripping.

    Args:
        predictions: Sequence of predicted strings.
        references: Sequence of reference strings, paired positionally with
            `predictions`.

    Returns:
        A value in [0, 100]. Returns 0.0 when there are no pairs to compare
        (instead of NaN plus a RuntimeWarning from averaging an empty list).
    """
    matches = [int(pred.strip() == ref.strip()) for pred, ref in zip(predictions, references)]
    if not matches:
        return 0.0
    return np.mean(matches) * 100

# F1 score function
def f1_score(prediction, reference):
    """Token-level F1 between one prediction and one reference, as a percentage.

    Both strings are stripped and split on whitespace; overlap is counted with
    multiplicity (bag of tokens).

    Args:
        prediction: Predicted string.
        reference: Reference string.

    Returns:
        A value in [0, 100]. When either side has no tokens, the score is 100.0
        if both are empty and 0.0 otherwise, so that two empty strings — which
        exact_match_score counts as a match — are not scored as a total miss.
    """
    prediction_tokens = prediction.strip().split()
    reference_tokens = reference.strip().split()
    # Precision/recall are undefined without tokens; agree with exact match here.
    if not prediction_tokens or not reference_tokens:
        return 100.0 if prediction_tokens == reference_tokens else 0.0
    common = Counter(prediction_tokens) & Counter(reference_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(prediction_tokens)
    recall = num_same / len(reference_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1 * 100

# Function to compute metrics
def compute_metrics(eval_pred):
    """Compute exact match, token F1, BLEU, ROUGE, METEOR and SacreBLEU.

    Relies on the notebook-level `metric_bleu`, `metric_rouge`, `metric_meteor`
    and `metric_sacrebleu` objects and on `exact_match_score` / `f1_score`.

    Args:
        eval_pred: A `(predictions, references)` pair of equally long sequences.
            Every element is converted with `str()`, so non-string items anywhere
            in the sequences are handled (not only when the first item is a
            non-string).

    Returns:
        dict with the keys "exact_match", "f1", "bleu", "rouge1", "rougeL",
        "meteor" and "sacrebleu", all on a 0-100 scale.

    Raises:
        ValueError: If the sequences are empty or differ in length.
    """
    predictions, references = eval_pred

    # Ensure predictions and references are lists of strings
    predictions = [str(p) for p in predictions]
    references = [str(r) for r in references]

    if not predictions or not references:
        raise ValueError("compute_metrics needs at least one prediction/reference pair")
    if len(predictions) != len(references):
        # zip() in the per-example scores would silently drop the surplus items
        raise ValueError(
            f"Mismatch in the number of predictions ({len(predictions)}) "
            f"and references ({len(references)})"
        )

    # Compute Exact Match
    exact_match = exact_match_score(predictions, references)

    # Compute F1 Score (averaged over examples)
    f1_scores = [f1_score(pred, ref) for pred, ref in zip(predictions, references)]
    avg_f1 = np.mean(f1_scores)

    # Compute BLEU
    bleu = metric_bleu.compute(predictions=predictions, references=references)

    # Compute ROUGE
    rouge = metric_rouge.compute(predictions=predictions, references=references)

    # Compute METEOR
    meteor = metric_meteor.compute(predictions=predictions, references=references)

    # Compute SacreBLEU
    sacrebleu = metric_sacrebleu.compute(predictions=predictions, references=references)

    return {
        "exact_match": exact_match,
        "f1": avg_f1,
        "bleu": bleu["bleu"] * 100,  # Convert to percentage
        "rouge1": rouge["rouge1"] * 100,
        "rougeL": rouge["rougeL"] * 100,
        "meteor": meteor["meteor"] * 100,
        "sacrebleu": sacrebleu["score"],  # Already in percentage
    }

In [None]:
# Compute and print evaluation metrics
# compute_metrics takes a single (predictions, references) tuple and returns a
# dict of scores, each printed here with four decimals.
eval_results = compute_metrics((all_predictions, all_labels))
print("\nEvaluation Results for MLP Finetuned Model:")
for metric_name, score in eval_results.items():
    print(f"{metric_name}: {score:.4f}")


Evaluation Results for MLP Finetuned Model:
exact_match: 100.0000
f1: 100.0000
bleu: 100.0000
rouge1: 100.0000
rougeL: 100.0000
meteor: 99.9877
sacrebleu: 100.0000
