### Installation

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [3]:
!pip install python-dotenv huggingface_hub datasets wandb rouge-score nltk scikit-learn

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try 'pacman -S
[31m   [0m python-xyz', where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Arch-packaged Python package,
[31m   [0m create a virtual environment using 'python -m venv path/to/venv'.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip.
[31m   [0m 
[31m   [0m If you wish to install a non-Arch packaged Python application,
[31m   [0m it may be easiest to use 'pipx install xyz', which will manage a
[31m   [0m virtual environment for you. Make sure you have python-pipx
[31m   [0m installed via pacman.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS distribution provider. You can override this, at the risk of breaking your Python installation or OS, by p

### Unsloth

In [1]:
# Load the 4-bit base model and its tokenizer through Unsloth.
# NOTE(review): unsloth is imported before torch here, presumably so its
# patches are applied first — keep this order unless confirmed otherwise.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in this cell reads `fourbit_models`; the model
# that is actually loaded is the `model_name` passed to from_pretrained below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

] # More models at https://huggingface.co/unsloth

# `model` and `tokenizer` are notebook-wide names reused by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.5: Fast Llama patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4060 Laptop GPU. Num GPUs = 1. Max memory: 7.623 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!


In [2]:
# Wrap the base model with LoRA adapters. The settings are collected first so
# the adapter configuration can be read (and tweaked) in one place.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_settings = dict(
    r = 16,                                 # LoRA rank; any value > 0 (suggested 8, 16, 32, 64, 128)
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                       # any value is supported, 0 is the optimized path
    bias = "none",                          # any value is supported, "none" is the optimized path
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                     # rank stabilized LoRA is available but not used
    loftq_config = None,                    # LoftQ is available but not used
)

# Rebinds `model` to the adapter-wrapped model used by all later cells.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.7.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


#### Download Dataset

In [3]:
!pip install gdown
import gdown
file_id = '1zPCJZR69yBbBYp6oFrDrpbuJvvMpVB72'
url = f'https://drive.google.com/uc?id={file_id}'
output='cleaned_headline_dataset.csv'
gdown.download(url, output, quiet=False)

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try 'pacman -S
[31m   [0m python-xyz', where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Arch-packaged Python package,
[31m   [0m create a virtual environment using 'python -m venv path/to/venv'.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip.
[31m   [0m 
[31m   [0m If you wish to install a non-Arch packaged Python application,
[31m   [0m it may be easiest to use 'pipx install xyz', which will manage a
[31m   [0m virtual environment for you. Make sure you have python-pipx
[31m   [0m installed via pacman.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS distribution provider. You can override this, at the risk of breaking your Python installation or OS, by p

Downloading...
From: https://drive.google.com/uc?id=1zPCJZR69yBbBYp6oFrDrpbuJvvMpVB72
To: /home/siam/Personal/news2headline/src/cleaned_headline_dataset.csv
100%|██████████| 100M/100M [00:03<00:00, 26.1MB/s] 


'cleaned_headline_dataset.csv'

<a name="Data"></a>
### Data Prep
We now use the `Llama-3.1` format for conversation style finetunes. Instead of a ShareGPT-style dataset such as [Maxime Labonne's FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k), this notebook uses the downloaded `cleaned_headline_dataset.csv`: each row's `content` becomes the user turn and its `headline` becomes the assistant turn, in HuggingFace's normal multiturn format `("role", "content")` rather than ShareGPT's `("from", "value")`. Llama-3 renders multi turn conversations like below:

```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hey there! How are you?<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm great thanks!<|eot_id|>
```

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3` and more.

In [4]:
from unsloth import get_chat_template

# ─── 1. TOKENIZER SETUP ─────────────────────────────────────────────────────────

# Rebind the shared `tokenizer` to a copy configured with the "llama-3.1" chat
# template; later cells call tokenizer.apply_chat_template and rely on this.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

def formatting_prompts_func(examples):
    """Render a batch of (content, headline) pairs as chat-formatted text.

    Args:
        examples: Batch mapping with parallel "content" and "headline"
            sequences (the shape produced by a batched dataset map).

    Returns:
        A dict with a single "text" key holding one rendered conversation per
        input pair: the content as the user turn, the headline as the
        assistant turn, formatted by the notebook-level `tokenizer`.
    """
    rendered = [
        tokenizer.apply_chat_template(
            [
                {"role": "user", "content": article},
                {"role": "assistant", "content": title},
            ],
            tokenize=False,               # keep plain text; tokenization happens later
            add_generation_prompt=False,  # the assistant answer is already included
        )
        for article, title in zip(examples["content"], examples["headline"])
    ]
    return {"text": rendered}

In [5]:
from datasets import load_dataset, DatasetDict

# ─── 2. LOAD CSV & SPLIT ────────────────────────────────────────────────────────

# Read the local CSV as one un-split dataset.
raw = load_dataset("csv", data_files={"full": "cleaned_headline_dataset.csv"}, split="full")

# Two-stage split with a fixed seed:
#   stage 1 holds out 10% of everything as the test set;
#   stage 2 takes 11% of the remaining 90% (about 10% of the total) as validation.
split1 = raw.train_test_split(test_size=0.10, seed=42)
split2 = split1["train"].train_test_split(test_size=0.11, seed=42)

datasets = DatasetDict(
    train=split2["train"],
    validation=split2["test"],
    test=split1["test"],
)


Generating full split: 0 examples [00:00, ? examples/s]

In [6]:
# ─── 3. APPLY FORMATTING TO DATASETS ───────────────────────────────────────────

# Apply the formatting function to all datasets
# (batched=True hands formatting_prompts_func whole column batches; the result
# replaces `datasets`, so every split gains the "text" field the function returns.)
datasets = datasets.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/41681 [00:00<?, ? examples/s]

Map:   0%|          | 0/5152 [00:00<?, ? examples/s]

Map:   0%|          | 0/5204 [00:00<?, ? examples/s]

In [9]:
# ─── 4. WANDB AUTHENTICATION & SETUP (CORRECTED) ───────────────────────────────

import os
import re
import wandb
from dotenv import load_dotenv
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq
from glob import glob

# Training hyperparameters, defined once so the values logged to wandb and the
# values handed to SFTConfig below cannot drift apart.
PER_DEVICE_TRAIN_BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
NUM_TRAIN_EPOCHS = 1

# Load environment variables
load_dotenv()

# Set up wandb environment variables to avoid warnings
# NOTE(review): the project/run/notes/tags below say "TinyLlama 1.1B", but the
# model recorded in `config` is unsloth/Llama-3.2-1B-Instruct-bnb-4bit. The
# labels are left untouched to keep existing runs grouped — confirm and rename.
os.environ["WANDB_NOTEBOOK_NAME"] = "TinyLlama_Headline_Generation.ipynb"
os.environ["WANDB_PROJECT"] = "news2headline-tinyllama"

# Get wandb token and set as environment variable (PREFERRED METHOD)
wandb_token = os.getenv("WANDB_KEY")
if wandb_token:
    os.environ["WANDB_API_KEY"] = wandb_token
    print("✅ Wandb API key set via environment variable")
else:
    print("⚠️  Warning: WANDB_KEY not found in .env file")
    print("   Add this line to your .env file: WANDB_KEY=your_api_key_here")
    print("   Get your API key from: https://wandb.ai/authorize")

# Initialize wandb (this will automatically use WANDB_API_KEY if set)
wandb.init(
    project="news2headline-tinyllama",
    name="tinyllama-1.1b-headline-generation",
    notes="Fine-tuning TinyLlama 1.1B for news headline generation using Unsloth",
    tags=["tinyllama", "headline-generation", "unsloth", "lora"],
    config={
        "model_name": "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
        "max_seq_length": max_seq_length,
        "lora_r": 16,
        "lora_alpha": 16,
        "learning_rate": LEARNING_RATE,
        "batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
        "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
        "num_train_epochs": NUM_TRAIN_EPOCHS,
        "task": "headline_generation",
        "dataset_size": len(datasets["train"]),
        "validation_size": len(datasets["validation"]),
        "test_size": len(datasets["test"])
    }
)

print(f"🚀 Wandb run initialized: {wandb.run.name}")
print(f"📊 Dashboard: {wandb.run.url}")

# Check for existing checkpoints
output_dir = "outputs"
# Keep only directories named exactly "checkpoint-<number>"; anything else
# matching the glob (e.g. "checkpoint-500-backup") would break the numeric sort.
checkpoint_dirs = [
    path
    for path in glob(os.path.join(output_dir, "checkpoint-*"))
    if os.path.isdir(path) and re.fullmatch(r"checkpoint-\d+", os.path.basename(path))
]
resume_from_checkpoint = None

if checkpoint_dirs:
    # Sort by checkpoint number and get the latest one
    checkpoint_dirs.sort(key=lambda path: int(path.rsplit("-", 1)[-1]))
    resume_from_checkpoint = checkpoint_dirs[-1]
    print(f"Found checkpoint: {resume_from_checkpoint}")
    print("Resuming training from checkpoint...")
else:
    print("No checkpoint found. Starting training from beginning...")

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = datasets["train"],
    eval_dataset = datasets["validation"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = PER_DEVICE_TRAIN_BATCH_SIZE,
        gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
        warmup_steps = 5,
        num_train_epochs = NUM_TRAIN_EPOCHS, # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = LEARNING_RATE,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = output_dir,
        report_to = "wandb", # Enable wandb logging
        save_strategy = "steps",
        save_steps = 500,  # Save checkpoint every 500 steps
        eval_strategy = "steps",
        eval_steps = 500,  # Evaluate every 500 steps (kept equal to save_steps)
        logging_strategy = "steps",
        load_best_model_at_end = True,
        metric_for_best_model = "eval_loss",
        greater_is_better = False, # lower eval_loss is better
    ),
)

✅ Wandb API key set via environment variable


🚀 Wandb run initialized: tinyllama-1.1b-headline-generation
📊 Dashboard: https://wandb.ai/aonyendopaul-american-international-university-bangladesh/news2headline-tinyllama/runs/lk5p5ypv
No checkpoint found. Starting training from beginning...


In [10]:
# ─── 5. APPLY TRAIN ON RESPONSES ONLY ──────────────────────────────────────────

# Rewrap the trainer so that only the assistant response contributes labels:
# the two markers below are the Llama-3 header strings that open the user turn
# and the assistant turn in the rendered chat text.
# NOTE(review): this rebinds `trainer` from itself, so re-running the cell
# wraps an already-wrapped trainer — confirm that is harmless before re-running.
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=20):   0%|          | 0/41681 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/5152 [00:00<?, ? examples/s]

In [11]:
# Sanity check: decode one tokenized training example back into text.
# NOTE(review): the recorded output starts with two <|begin_of_text|> tokens —
# confirm whether the duplicated BOS is expected or whether the template and
# the tokenizer are each adding one.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nVeet, in celebration of its 20th anniversary in Bangladesh, is launching a special television series titled “Agiye Jao Attobisshashey” with Deepto TV, to honor the strength, resilience, and courage of the women in Bangladesh. The show will premiere on Deepto TV on November 1, 2024, and will air every Friday and Saturday at 9:20 pm. The program will be re-telecasted on Saturday and Sunday at 4.50pm. The popular Bangladeshi actress and model, Zakia Bari Mamo, will host the show. Veet’s anniversary marks two decades of helping women feel beautiful and confident in Bangladesh. The title “Agiye Jao Attobisshashey”, which means "Move Forward with Confidence," reflects Veet’s commitment to inspiring women and giving them a platform to share their powerful life stories. The series will feature t

In [12]:
# Sanity check for the label masking: labels equal to -100 are swapped for the
# token id of a single space before decoding, so masked positions show up as
# blanks and only the tokens that carry a training label stay readable.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                                                                                                                                                                                                                                                                                             Veet celebrates 20 Years of journey with "Agiye Jao Attobisshashey” campaign<|eot_id|>'

In [None]:
# ─── 6. START TRAINING WITH CHECKPOINT RESUMING ───────────────────────────────

# Pass True when an earlier cell found a checkpoint directory, otherwise None
# (a fresh run). With True the trainer is left to locate the checkpoint itself.
trainer_stats = trainer.train(resume_from_checkpoint=bool(resume_from_checkpoint) or None)

# Record the headline numbers of the finished run in wandb, then echo them.
final_train_loss = trainer_stats.training_loss
completed_steps = trainer_stats.global_step
wandb.log({"final_train_loss": final_train_loss, "total_steps": completed_steps})

print(f"Training completed! Final loss: {final_train_loss:.4f}")
print(f"Total training steps: {completed_steps}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 41,681 | Num Epochs = 1 | Total steps = 5,211
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss


### Model Evaluation on Test Set

In [None]:
# ─── 7. COMPREHENSIVE MODEL EVALUATION ─────────────────────────────────────────

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from tqdm import tqdm
import re

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("Starting comprehensive evaluation on test set...")
print(f"Test set size: {len(datasets['test'])}")

# Evaluate on validation set first (faster)
print("\n=== Validation Set Evaluation ===")
# Evaluate on the trainer's own eval dataset. That copy of the validation split
# was handed to the trainer and then processed together with it (tokenized,
# labels restricted to the assistant response), so the loss is comparable to
# the eval_loss logged during training. The raw `datasets["validation"]` only
# carries the text columns and is not in the form the trainer evaluates on.
val_metrics = trainer.evaluate()
val_loss = val_metrics['eval_loss']
val_perplexity = float(np.exp(val_loss))  # perplexity = exp(mean loss)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Perplexity: {val_perplexity:.4f}")

# Log validation metrics to wandb
wandb.log({
    "val_loss": val_loss,
    "val_perplexity": val_perplexity
})

In [None]:
# ─── 8. QUALITATIVE EVALUATION WITH GENERATION METRICS ────────────────────────

# Prepare model for inference
# NOTE(review): presumably this switches Unsloth's patched model into its
# generation mode — confirm in the Unsloth docs whether it has to be undone
# before any further training on the same `model` object.
FastLanguageModel.for_inference(model)

def generate_headline(content, max_length=50, temperature=0.7, do_sample=True):
    """Generate a headline for one article with the notebook-level model.

    Args:
        content: Article text, sent as the single user turn of the chat prompt.
        max_length: Maximum number of new tokens to generate.
        temperature: Sampling temperature; only used when `do_sample` is True.
        do_sample: Sample when True (the original behaviour). Pass False for
            greedy decoding, which makes evaluation runs repeatable.

    Returns:
        The generated text (prompt excluded, special tokens removed), stripped
        of surrounding whitespace.
    """
    messages = [
        {"role": "user", "content": content}
    ]

    # Place the prompt on whatever device the model lives on instead of
    # assuming "cuda".
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # A single un-padded prompt: every position is a real token. The mask is
    # passed explicitly because pad and eos share an id below, so it cannot be
    # inferred reliably from the ids themselves.
    attention_mask = torch.ones_like(input_ids)

    generation_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_length,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if do_sample:
        # Temperature is meaningless for greedy decoding, so only send it when sampling.
        generation_kwargs["temperature"] = temperature

    with torch.no_grad():
        outputs = model.generate(**generation_kwargs)

    # Decode only the generated part
    prompt_length = input_ids.shape[-1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated_text.strip()

# Generation is slow, so score only the leading slice of the test split
# (at most 100 rows).
subset_size = min(100, len(datasets["test"]))
test_subset = datasets["test"].select(range(subset_size))
print(f"\n=== Generating headlines for {len(test_subset)} test samples ===")

# Parallel lists: generated_headlines[k] is the model output for the article
# whose human-written headline is reference_headlines[k].
generated_headlines, reference_headlines = [], []

for idx, sample in enumerate(tqdm(test_subset)):
    try:
        prediction = generate_headline(sample["content"])
        generated_headlines.append(prediction)
        reference_headlines.append(sample["headline"])

        # Print the first three pairs for a quick visual check.
        if idx < 3:
            print(f"\n--- Example {idx+1} ---")
            print(f"Content: {sample['content'][:200]}...")
            print(f"Reference: {sample['headline']}")
            print(f"Generated: {prediction}")

    except Exception as err:
        # Best effort: report the failure and record an empty generation so
        # the two lists stay aligned.
        print(f"Error generating headline for example {idx}: {err}")
        generated_headlines.append("")
        reference_headlines.append(sample["headline"])

non_empty_count = sum(1 for headline in generated_headlines if headline)
print(f"\nSuccessfully generated {non_empty_count} headlines")

In [None]:
# ─── 9. CALCULATE COMPREHENSIVE METRICS ────────────────────────────────────────

def calculate_rouge_scores(generated, references):
    """Average ROUGE-1 / ROUGE-2 / ROUGE-L F-measures over non-empty generations.

    Args:
        generated: Generated headlines; empty strings are skipped.
        references: Reference headlines, aligned with `generated`.

    Returns:
        Dict with keys 'rouge1', 'rouge2' and 'rougeL', each the mean
        F-measure over the scored pairs, or 0 when no pair was scored.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    collected = {name: [] for name in metric_names}

    for hypothesis, target in zip(generated, references):
        if not hypothesis:
            continue  # empty generations are left out of the averages
        result = scorer.score(target, hypothesis)
        for name in metric_names:
            collected[name].append(result[name].fmeasure)

    return {
        name: (np.mean(values) if values else 0)
        for name, values in collected.items()
    }

def calculate_bleu_scores(generated, references):
    """Average smoothed sentence-level BLEU over non-empty generations.

    Both sides are tokenized by whitespace. A pair whose BLEU computation
    raises is scored 0.0 and reported through a warning instead of being
    dropped silently.

    Args:
        generated: Generated headlines; empty strings are skipped.
        references: Reference headlines, aligned with `generated`.

    Returns:
        Mean BLEU over the scored pairs, or 0 when no pair was scored.
    """
    import warnings

    smoothing = SmoothingFunction().method1
    bleu_scores = []

    for gen, ref in zip(generated, references):
        if not gen:  # Only calculate if generation is not empty
            continue
        ref_tokens = [ref.split()]
        gen_tokens = gen.split()
        try:
            bleu = sentence_bleu(ref_tokens, gen_tokens, smoothing_function=smoothing)
        except Exception as exc:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still stop the cell; keep the best-effort 0.0 score.
            warnings.warn(f"BLEU computation failed ({exc!r}); scoring this pair as 0.0")
            bleu = 0.0
        bleu_scores.append(bleu)

    return np.mean(bleu_scores) if bleu_scores else 0

def calculate_length_metrics(generated, references):
    """Compare word counts of generated and reference headlines.

    Lengths are whitespace-token counts. Empty generations are excluded from
    the generated average; every reference is included in the reference
    average.

    Args:
        generated: Generated headlines (empty strings are ignored).
        references: Reference headlines.

    Returns:
        Dict with 'avg_generated_length', 'avg_reference_length' and
        'length_ratio' (generated average divided by reference average).
        Any value that cannot be computed (no data, or a zero reference
        average) is reported as 0 instead of NaN/inf.
    """
    gen_lengths = [len(h.split()) for h in generated if h]
    ref_lengths = [len(h.split()) for h in references]

    avg_generated = float(np.mean(gen_lengths)) if gen_lengths else 0
    avg_reference = float(np.mean(ref_lengths)) if ref_lengths else 0
    # Guard the division: an empty or all-blank reference list has mean 0.
    length_ratio = avg_generated / avg_reference if gen_lengths and avg_reference else 0

    return {
        'avg_generated_length': avg_generated,
        'avg_reference_length': avg_reference,
        'length_ratio': length_ratio
    }

# Calculate all metrics
print("\n=== Calculating Evaluation Metrics ===")

# ROUGE scores
rouge_scores = calculate_rouge_scores(generated_headlines, reference_headlines)
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

# BLEU score
bleu_score = calculate_bleu_scores(generated_headlines, reference_headlines)
print(f"BLEU: {bleu_score:.4f}")

# Length metrics
length_metrics = calculate_length_metrics(generated_headlines, reference_headlines)
print(f"Avg Generated Length: {length_metrics['avg_generated_length']:.2f} words")
print(f"Avg Reference Length: {length_metrics['avg_reference_length']:.2f} words")
print(f"Length Ratio: {length_metrics['length_ratio']:.2f}")

# Success rate: share of samples with a non-empty generation. Guarded so an
# empty result list reports 0 instead of raising ZeroDivisionError.
successful_generations = sum(1 for h in generated_headlines if h)
success_rate = successful_generations / len(generated_headlines) if generated_headlines else 0.0
print(f"Generation Success Rate: {success_rate:.2%}")

# Compile final metrics
final_metrics = {
    "test_rouge1": rouge_scores['rouge1'],
    "test_rouge2": rouge_scores['rouge2'],
    "test_rougeL": rouge_scores['rougeL'],
    "test_bleu": bleu_score,
    "test_success_rate": success_rate,
    "test_avg_gen_length": length_metrics['avg_generated_length'],
    "test_avg_ref_length": length_metrics['avg_reference_length'],
    "test_length_ratio": length_metrics['length_ratio']
}

# Log to wandb
wandb.log(final_metrics)

print("\n=== Final Evaluation Summary ===")
print(f"Model Performance on News Headline Generation:")
print(f"• ROUGE-1 (Unigram Overlap): {rouge_scores['rouge1']:.4f}")
print(f"• ROUGE-2 (Bigram Overlap): {rouge_scores['rouge2']:.4f}")
print(f"• ROUGE-L (Longest Common Subsequence): {rouge_scores['rougeL']:.4f}")
print(f"• BLEU Score: {bleu_score:.4f}")
print(f"• Generation Success Rate: {success_rate:.2%}")

# Finish wandb run (closes the run opened by wandb.init; later wandb.log calls
# would need a new run)
wandb.finish()

print("\nEvaluation completed! Check your wandb dashboard for detailed metrics.")