### Installation

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [3]:
!pip install python-dotenv huggingface_hub datasets wandb rouge-score nltk scikit-learn

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try 'pacman -S
[31m   [0m python-xyz', where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Arch-packaged Python package,
[31m   [0m create a virtual environment using 'python -m venv path/to/venv'.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip.
[31m   [0m 
[31m   [0m If you wish to install a non-Arch packaged Python application,
[31m   [0m it may be easiest to use 'pipx install xyz', which will manage a
[31m   [0m virtual environment for you. Make sure you have python-pipx
[31m   [0m installed via pacman.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS distribution provider. You can override this, at the risk of breaking your Python installation or OS, by p

### Unsloth

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.5: Fast Llama patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4060 Laptop GPU. Num GPUs = 1. Max memory: 7.623 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.7.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


#### Download Dataset

In [3]:
!pip install gdown
import gdown
file_id = '1zPCJZR69yBbBYp6oFrDrpbuJvvMpVB72'
url = f'https://drive.google.com/uc?id={file_id}'
output='cleaned_headline_dataset.csv'
gdown.download(url, output, quiet=False)

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try 'pacman -S
[31m   [0m python-xyz', where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Arch-packaged Python package,
[31m   [0m create a virtual environment using 'python -m venv path/to/venv'.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip.
[31m   [0m 
[31m   [0m If you wish to install a non-Arch packaged Python application,
[31m   [0m it may be easiest to use 'pipx install xyz', which will manage a
[31m   [0m virtual environment for you. Make sure you have python-pipx
[31m   [0m installed via pacman.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS distribution provider. You can override this, at the risk of breaking your Python installation or OS, by p

Downloading...
From: https://drive.google.com/uc?id=1zPCJZR69yBbBYp6oFrDrpbuJvvMpVB72
To: /home/siam/Personal/news2headline/src/cleaned_headline_dataset.csv
100%|██████████| 100M/100M [00:03<00:00, 26.1MB/s] 


'cleaned_headline_dataset.csv'

<a name="Data"></a>
### Data Prep
We now use the `Llama-3.1` format for conversation style finetunes. We use [Maxime Labonne's FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k) dataset in ShareGPT style. But we convert it to HuggingFace's normal multiturn format `("role", "content")` instead of `("from", "value")`/ Llama-3 renders multi turn conversations like below:

```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hey there! How are you?<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm great thanks!<|eot_id|>
```

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3` and more.

In [4]:
from unsloth import get_chat_template

# ─── 1. TOKENIZER SETUP ─────────────────────────────────────────────────────────

# initialize Unsloth's chat tokenizer for llama‑3.1
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

def formatting_prompts_func(examples):
    """Apply your chat template to create conversation format."""
    conversations = []
    for content, headline in zip(examples["content"], examples["headline"]):
        # Create a conversation with user question and assistant answer
        conversation = [
            {"role": "system", "content": "You are an expert news headline writer. Create concise, engaging headlines that capture the main story. Keep headlines between 5-12 words."},
            {"role": "user", "content": f"Write a headline for this news article:\n\n{content}"},
            {"role": "assistant", "content": headline}
        ]
        conversations.append(conversation)

    # Apply chat template to each conversation
    texts = []
    for conversation in conversations:
        text = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text)

    return {"text": texts}

In [5]:
from datasets import load_dataset, DatasetDict

# ─── 2. LOAD CSV & SPLIT ────────────────────────────────────────────────────────

# load the CSV from local disk
raw = load_dataset(
    "csv",
    data_files={"full": "cleaned_headline_dataset.csv"},
    split="full"
)

# first split off test (10% of full), then split the remaining 90% into train (80%) & validation (10%)
split1 = raw.train_test_split(test_size=0.10, seed=42)
split2 = split1["train"].train_test_split(test_size=0.11, seed=42)
# note: 0.11 of 90% ≈ 10% of total

datasets = DatasetDict({
    "train": split2["train"],
    "validation": split2["test"],
    "test": split1["test"],
})


Generating full split: 0 examples [00:00, ? examples/s]

In [6]:
# ─── 3. APPLY FORMATTING TO DATASETS ───────────────────────────────────────────

# Apply the formatting function to all datasets
datasets = datasets.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/41681 [00:00<?, ? examples/s]

Map:   0%|          | 0/5152 [00:00<?, ? examples/s]

Map:   0%|          | 0/5204 [00:00<?, ? examples/s]

In [9]:
# ─── 4. WANDB AUTHENTICATION & SETUP (CORRECTED) ───────────────────────────────

import os
import wandb
from dotenv import load_dotenv
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq
from glob import glob

# Load environment variables
load_dotenv()

# Set up wandb environment variables to avoid warnings
os.environ["WANDB_NOTEBOOK_NAME"] = "TinyLlama_Headline_Generation.ipynb"
os.environ["WANDB_PROJECT"] = "news2headline-tinyllama"

# Get wandb token and set as environment variable (PREFERRED METHOD)
wandb_token = os.getenv("WANDB_KEY")
if wandb_token:
    os.environ["WANDB_API_KEY"] = wandb_token
    print("✅ Wandb API key set via environment variable")
else:
    print("⚠️  Warning: WANDB_KEY not found in .env file")
    print("   Add this line to your .env file: WANDB_KEY=your_api_key_here")
    print("   Get your API key from: https://wandb.ai/authorize")

# Initialize wandb (this will automatically use WANDB_API_KEY if set)
wandb.init(
    project="news2headline-tinyllama",
    name="tinyllama-1.1b-headline-generation",
    notes="Fine-tuning TinyLlama 1.1B for news headline generation using Unsloth",
    tags=["tinyllama", "headline-generation", "unsloth", "lora"],
    config={
        "model_name": "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
        "max_seq_length": max_seq_length,
        "r": 32,                # Higher rank for more capacity (was 16)
        "lora_alpha": 64,       # 2x rank for better learning (was 16)
        "lora_dropout": 0.05,   # Small dropout for regularization (was 0)
        "bias": "lora_only",    # Train bias terms for better adaptation
        "target_modules": [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
            "embed_tokens", "lm_head"  # Include embedding layers
        ],
        "use_rslora": True,     # Rank-stabilized LoRA for better training
        "lora_alpha": 16,
        "learning_rate": 2e-4,
        "batch_size": 2,
        "gradient_accumulation_steps": 4,
        "num_train_epochs": 1,
        "task": "headline_generation",
        "dataset_size": len(datasets["train"]),
        "validation_size": len(datasets["validation"]),
        "test_size": len(datasets["test"])
    }
)

print(f"🚀 Wandb run initialized: {wandb.run.name}")
print(f"📊 Dashboard: {wandb.run.url}")

# Check for existing checkpoints
output_dir = "outputs"
checkpoint_dirs = glob(os.path.join(output_dir, "checkpoint-*"))
resume_from_checkpoint = None

if checkpoint_dirs:
    # Sort by checkpoint number and get the latest one
    checkpoint_dirs.sort(key=lambda x: int(x.split("-")[-1]))
    resume_from_checkpoint = checkpoint_dirs[-1]
    print(f"Found checkpoint: {resume_from_checkpoint}")
    print("Resuming training from checkpoint...")
else:
    print("No checkpoint found. Starting training from beginning...")

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = datasets["train"],
    eval_dataset = datasets["validation"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        warmup_ratio = 0.1,
        lr_scheduler_type = "cosine",
        num_train_epochs = 2, # Set this for 1 full training run.
        max_steps = -1,
        learning_rate = 1e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.05,   # Higher weight decay to prevent overfitting
        max_grad_norm = 0.5,   # Gradient clipping for stability
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = output_dir,
        report_to = "wandb", # Enable wandb logging
        remove_unused_columns = False,  # Add this line
        save_strategy = "steps",
        save_steps = 250,  # Save checkpoint every 500 steps
        eval_strategy = "steps",
        eval_steps = 250,  # Evaluate every 500 steps
        logging_steps = 50,
        load_best_model_at_end = True,
        logging_strategy = "steps",
        metric_for_best_model = "eval_loss",
        greater_is_better = False,
        
    ),
)

✅ Wandb API key set via environment variable


🚀 Wandb run initialized: tinyllama-1.1b-headline-generation
📊 Dashboard: https://wandb.ai/aonyendopaul-american-international-university-bangladesh/news2headline-tinyllama/runs/lk5p5ypv
No checkpoint found. Starting training from beginning...


In [10]:
# ─── 5. APPLY TRAIN ON RESPONSES ONLY ──────────────────────────────────────────

from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=20):   0%|          | 0/41681 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/5152 [00:00<?, ? examples/s]

In [11]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nVeet, in celebration of its 20th anniversary in Bangladesh, is launching a special television series titled “Agiye Jao Attobisshashey” with Deepto TV, to honor the strength, resilience, and courage of the women in Bangladesh. The show will premiere on Deepto TV on November 1, 2024, and will air every Friday and Saturday at 9:20 pm. The program will be re-telecasted on Saturday and Sunday at 4.50pm. The popular Bangladeshi actress and model, Zakia Bari Mamo, will host the show. Veet’s anniversary marks two decades of helping women feel beautiful and confident in Bangladesh. The title “Agiye Jao Attobisshashey”, which means "Move Forward with Confidence," reflects Veet’s commitment to inspiring women and giving them a platform to share their powerful life stories. The series will feature t

In [12]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                                                                                                                                                                                                                                                                                             Veet celebrates 20 Years of journey with "Agiye Jao Attobisshashey” campaign<|eot_id|>'

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [13]:
# ─── 6. START TRAINING WITH CHECKPOINT RESUMING ───────────────────────────────

# Start training - will automatically resume from checkpoint if available
# According to Unsloth docs, use resume_from_checkpoint=True for automatic detection
trainer_stats = trainer.train(resume_from_checkpoint=True if resume_from_checkpoint else None)

# Log final metrics to wandb
wandb.log({
    "final_train_loss": trainer_stats.training_loss,
    "total_steps": trainer_stats.global_step,
})

print(f"Training completed! Final loss: {trainer_stats.training_loss:.4f}")
print(f"Total training steps: {trainer_stats.global_step}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 41,681 | Num Epochs = 1 | Total steps = 5,211
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
500,1.2515,1.487769
1000,1.088,1.439021
1500,1.4508,1.41474
2000,1.2305,1.388201
2500,1.2821,1.369782
3000,1.2871,1.350047
3500,1.1733,1.331723
4000,0.9298,1.321058
4500,1.5117,1.310141
5000,1.2339,1.30294


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Training completed! Final loss: 1.3965
Total training steps: 5211


In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

### Model Evaluation on Test Set

In [16]:
# ─── 7. COMPREHENSIVE MODEL EVALUATION ─────────────────────────────────────────

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from tqdm import tqdm
import re

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("Starting comprehensive evaluation on test set...")
print(f"Test set size: {len(datasets['test'])}")

# Evaluate on validation set first (faster)
print("\n=== Validation Set Evaluation ===")
# val_metrics = trainer.evaluate(eval_dataset=datasets["validation"])
val_metrics = trainer.evaluate()
print(f"Validation Loss: {val_metrics['eval_loss']:.4f}")
print(f"Validation Perplexity: {np.exp(val_metrics['eval_loss']):.4f}")

# Log validation metrics to wandb
wandb.log({
    "val_loss": val_metrics['eval_loss'],
    "val_perplexity": np.exp(val_metrics['eval_loss'])
})

Starting comprehensive evaluation on test set...
Test set size: 5204

=== Validation Set Evaluation ===


Validation Loss: 1.3029
Validation Perplexity: 3.6801


In [20]:
# ─── 8. QUALITATIVE EVALUATION WITH GENERATION METRICS ────────────────────────

# Prepare model for inference
FastLanguageModel.for_inference(model)

def generate_headline(content, max_length=50):
    """Generate headline for given content"""
    messages = [
        {"role": "user", "content": content}
    ]
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode only the generated part
    generated_text = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    return generated_text.strip()

# Test on a subset of test data for detailed metrics (first 100 samples for speed)
test_subset = datasets["test"].select(range(min(100, len(datasets["test"]))))
print(f"\n=== Generating headlines for {len(test_subset)} test samples ===")

generated_headlines = []
reference_headlines = []

for i, example in enumerate(tqdm(test_subset)):
    try:
        generated = generate_headline(example["content"])
        generated_headlines.append(generated)
        reference_headlines.append(example["headline"])
        
        # Show first few examples
        if i < 3:
            print(f"\n--- Example {i+1} ---")
            print(f"Content: {example['content'][:200]}...")
            print(f"Reference: {example['headline']}")
            print(f"Generated: {generated}")
            
    except Exception as e:
        print(f"Error generating headline for example {i}: {e}")
        generated_headlines.append("")
        reference_headlines.append(example["headline"])

print(f"\nSuccessfully generated {len([h for h in generated_headlines if h])} headlines")


=== Generating headlines for 100 test samples ===


  0%|          | 0/100 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  2%|▏         | 2/100 [00:00<00:26,  3.64it/s]


--- Example 1 ---
Content: Awami League General Secretary Obaidul Quader today strongly condemned the statements made by BNP Secretary General Mirza Fakhrul Islam Alamgir, calling them politically motivated, misleading, and pro...
Reference: Conspiracy is BNP's only strategy to seize power: Quader
Generated: Quader slams Fakhrul’s statements as ‘politically motivated’

--- Example 2 ---
Content: The Election Commission (EC) has received the go ahead to access the Rohingya database maintained by the United Nations High Commissioner for Refugees (UNHCR). The database contains records of over on...
Reference: EC gets go ahead to access Rohingya database
Generated: EC gets go-ahead to access Rohingya database


  3%|▎         | 3/100 [00:00<00:23,  4.18it/s]


--- Example 3 ---
Content: The Office of the Chief Prosecutor of the International Crimes Tribunal (ICT) has issued a strong condemnation of a Prothom Alo report titled “Question on Tajul Islam’s involvement in ATM Azharul’s ca...
Reference: Prothom Alo report part of conspiracy against ICT trial proceedings
Generated: Chief Prosecutor condemns Prothom Alo report on ICT


100%|██████████| 100/100 [00:20<00:00,  4.78it/s]


Successfully generated 100 headlines





In [21]:
# ─── 9. CALCULATE COMPREHENSIVE METRICS ────────────────────────────────────────

def calculate_rouge_scores(generated, references):
    """Calculate ROUGE scores"""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    for gen, ref in zip(generated, references):
        if gen:  # Only calculate if generation is not empty
            scores = scorer.score(ref, gen)
            rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
            rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
            rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)
    
    return {
        'rouge1': np.mean(rouge_scores['rouge1']) if rouge_scores['rouge1'] else 0,
        'rouge2': np.mean(rouge_scores['rouge2']) if rouge_scores['rouge2'] else 0,
        'rougeL': np.mean(rouge_scores['rougeL']) if rouge_scores['rougeL'] else 0
    }

def calculate_bleu_scores(generated, references):
    """Calculate BLEU scores"""
    smoothing = SmoothingFunction().method1
    bleu_scores = []
    
    for gen, ref in zip(generated, references):
        if gen:  # Only calculate if generation is not empty
            ref_tokens = [ref.split()]
            gen_tokens = gen.split()
            try:
                bleu = sentence_bleu(ref_tokens, gen_tokens, smoothing_function=smoothing)
                bleu_scores.append(bleu)
            except:
                bleu_scores.append(0.0)
    
    return np.mean(bleu_scores) if bleu_scores else 0

def calculate_length_metrics(generated, references):
    """Calculate length-based metrics"""
    gen_lengths = [len(h.split()) for h in generated if h]
    ref_lengths = [len(h.split()) for h in references]
    
    return {
        'avg_generated_length': np.mean(gen_lengths) if gen_lengths else 0,
        'avg_reference_length': np.mean(ref_lengths),
        'length_ratio': np.mean(gen_lengths) / np.mean(ref_lengths) if gen_lengths and ref_lengths else 0
    }

# Calculate all metrics
print("\n=== Calculating Evaluation Metrics ===")

# ROUGE scores
rouge_scores = calculate_rouge_scores(generated_headlines, reference_headlines)
print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

# BLEU score
bleu_score = calculate_bleu_scores(generated_headlines, reference_headlines)
print(f"BLEU: {bleu_score:.4f}")

# Length metrics
length_metrics = calculate_length_metrics(generated_headlines, reference_headlines)
print(f"Avg Generated Length: {length_metrics['avg_generated_length']:.2f} words")
print(f"Avg Reference Length: {length_metrics['avg_reference_length']:.2f} words")
print(f"Length Ratio: {length_metrics['length_ratio']:.2f}")

# Success rate
success_rate = len([h for h in generated_headlines if h]) / len(generated_headlines)
print(f"Generation Success Rate: {success_rate:.2%}")

# Compile final metrics
final_metrics = {
    "test_rouge1": rouge_scores['rouge1'],
    "test_rouge2": rouge_scores['rouge2'],
    "test_rougeL": rouge_scores['rougeL'],
    "test_bleu": bleu_score,
    "test_success_rate": success_rate,
    "test_avg_gen_length": length_metrics['avg_generated_length'],
    "test_avg_ref_length": length_metrics['avg_reference_length'],
    "test_length_ratio": length_metrics['length_ratio']
}

# Log to wandb
wandb.log(final_metrics)

print("\n=== Final Evaluation Summary ===")
print(f"Model Performance on News Headline Generation:")
print(f"• ROUGE-1 (Unigram Overlap): {rouge_scores['rouge1']:.4f}")
print(f"• ROUGE-2 (Bigram Overlap): {rouge_scores['rouge2']:.4f}")
print(f"• ROUGE-L (Longest Common Subsequence): {rouge_scores['rougeL']:.4f}")
print(f"• BLEU Score: {bleu_score:.4f}")
print(f"• Generation Success Rate: {success_rate:.2%}")

# Finish wandb run
wandb.finish()

print("\nEvaluation completed! Check your wandb dashboard for detailed metrics.")


=== Calculating Evaluation Metrics ===
ROUGE-1: 0.4594
ROUGE-2: 0.2237
ROUGE-L: 0.4122
BLEU: 0.1033
Avg Generated Length: 8.16 words
Avg Reference Length: 8.57 words
Length Ratio: 0.95
Generation Success Rate: 100.00%

=== Final Evaluation Summary ===
Model Performance on News Headline Generation:
• ROUGE-1 (Unigram Overlap): 0.4594
• ROUGE-2 (Bigram Overlap): 0.2237
• ROUGE-L (Longest Common Subsequence): 0.4122
• BLEU Score: 0.1033
• Generation Success Rate: 100.00%


0,1
eval/loss,█▆▅▄▄▃▂▂▁▁▁
eval/runtime,█▆▂▆▂▄▃▁▁▁▁
eval/samples_per_second,▁▃▇▃▇▅▆█▇██
eval/steps_per_second,▁▃▇▃▇▅▆█▇██
final_train_loss,▁
test_avg_gen_length,▁
test_avg_ref_length,▁
test_bleu,▁
test_length_ratio,▁
test_rouge1,▁

0,1
eval/loss,1.30294
eval/runtime,659.9435
eval/samples_per_second,7.807
eval/steps_per_second,1.952
final_train_loss,1.39646
test_avg_gen_length,8.16
test_avg_ref_length,8.57
test_bleu,0.10327
test_length_ratio,0.95216
test_rouge1,0.45936



Evaluation completed! Check your wandb dashboard for detailed metrics.


# ─── 9. UNSLOTH OPTIMIZED INFERENCE & COMPREHENSIVE EVALUATION ────────────────────────────────────────

In [23]:
# ─── UNSLOTH OPTIMIZED INFERENCE & COMPREHENSIVE EVALUATION ────────────────────

import numpy as np
import random
from tqdm import tqdm
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import json
from datetime import datetime

# ─── 1. OPTIMIZE MODEL FOR INFERENCE ───────────────────────────────────────────
print("🚀 Optimizing model for inference...")
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
print("✅ Model optimized for inference (2x speed boost)")

# ─── 2. PREPARE TEST DATASET ───────────────────────────────────────────────────
test_dataset = datasets["test"]
sample_size = min(1000, len(test_dataset))  # Use up to 1000 samples for comprehensive evaluation

# Randomly sample for unbiased evaluation
random.seed(42)
test_indices = random.sample(range(len(test_dataset)), sample_size)
test_samples = [test_dataset[i] for i in test_indices]

print(f"📊 Randomly sampled {sample_size} examples from {len(test_dataset)} total test samples")

# ─── 3. INFERENCE FUNCTION WITH UNSLOTH OPTIMIZATION ──────────────────────────
def generate_headline_unsloth(content, max_new_tokens=20):
    """Generate headline using Unsloth optimized inference"""
    # Create conversation format
    messages = [
        {"role": "user", "content": content}
    ]
    
    # Apply chat template
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    
    # Generate with optimized settings
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True  # Unsloth optimization
        )
    
    # Decode only the generated part
    generated_tokens = outputs[0][len(inputs[0]):]
    headline = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    
    return headline

# ─── 4. COMPREHENSIVE EVALUATION METRICS ──────────────────────────────────────
def calculate_comprehensive_metrics(generated_headlines, reference_headlines):
    """Calculate comprehensive evaluation metrics"""
    
    # Filter out empty generations
    valid_pairs = [(g, r) for g, r in zip(generated_headlines, reference_headlines) if g and g.strip()]
    
    if not valid_pairs:
        return {"error": "No valid generations found"}
    
    valid_generated = [pair[0] for pair in valid_pairs]
    valid_references = [pair[1] for pair in valid_pairs]
    
    # ROUGE Scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    for gen, ref in valid_pairs:
        scores = scorer.score(ref, gen)
        rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)
    
    # BLEU Scores
    smoothing = SmoothingFunction().method1
    bleu_scores = []
    
    for gen, ref in valid_pairs:
        ref_tokens = [ref.split()]
        gen_tokens = gen.split()
        try:
            bleu = sentence_bleu(ref_tokens, gen_tokens, smoothing_function=smoothing)
            bleu_scores.append(bleu)
        except:
            bleu_scores.append(0.0)
    
    # Length Metrics
    gen_lengths = [len(g.split()) for g in valid_generated]
    ref_lengths = [len(r.split()) for r in valid_references]
    
    # Exact Match
    exact_matches = sum(1 for g, r in valid_pairs if g.lower().strip() == r.lower().strip())
    
    return {
        'rouge1': np.mean(rouge_scores['rouge1']),
        'rouge2': np.mean(rouge_scores['rouge2']),
        'rougeL': np.mean(rouge_scores['rougeL']),
        'bleu': np.mean(bleu_scores),
        'avg_generated_length': np.mean(gen_lengths),
        'avg_reference_length': np.mean(ref_lengths),
        'length_ratio': np.mean(gen_lengths) / np.mean(ref_lengths),
        'success_rate': len(valid_pairs) / len(generated_headlines),
        'exact_match_rate': exact_matches / len(valid_pairs),
        'total_samples': len(generated_headlines),
        'valid_generations': len(valid_pairs)
    }

# ─── 5. RUN COMPREHENSIVE EVALUATION ───────────────────────────────────────────
print("\n🔄 Running comprehensive evaluation with Unsloth optimized inference...")

generated_headlines = []
reference_headlines = []
failed_generations = 0

# Generate headlines with progress tracking
for i, sample in enumerate(tqdm(test_samples, desc="Generating headlines")):
    try:
        # Generate headline
        generated = generate_headline_unsloth(sample["content"])
        generated_headlines.append(generated)
        reference_headlines.append(sample["headline"])
        
        # Progress update every 100 samples
        if (i + 1) % 100 == 0:
            success_rate = (len(generated_headlines) - failed_generations) / len(generated_headlines) * 100
            print(f"   Progress: {i+1}/{sample_size} | Success rate: {success_rate:.1f}%")
            
    except Exception as e:
        print(f"   Failed to generate headline for sample {i}: {str(e)}")
        generated_headlines.append("")
        reference_headlines.append(sample["headline"])
        failed_generations += 1

print(f"\n✅ Generation completed! Failed: {failed_generations}/{sample_size}")

# ─── 6. CALCULATE METRICS ──────────────────────────────────────────────────────
print("📊 Calculating comprehensive metrics...")
metrics = calculate_comprehensive_metrics(generated_headlines, reference_headlines)

# ─── 7. DISPLAY RESULTS ────────────────────────────────────────────────────────
print("\n" + "="*60)
print("🎯 UNSLOTH OPTIMIZED INFERENCE EVALUATION REPORT")
print("="*60)

print(f"\n📊 SAMPLE STATISTICS:")
print(f"   Total samples: {metrics['total_samples']}")
print(f"   Valid generations: {metrics['valid_generations']}")
print(f"   Success rate: {metrics['success_rate']:.2%}")

print(f"\n📝 CONTENT OVERLAP METRICS:")
print(f"   ROUGE-1 (unigram): {metrics['rouge1']:.4f}")
print(f"   ROUGE-2 (bigram):  {metrics['rouge2']:.4f}")
print(f"   ROUGE-L (LCS):     {metrics['rougeL']:.4f}")
print(f"   BLEU score:        {metrics['bleu']:.4f}")

print(f"\n📏 LENGTH METRICS:")
print(f"   Avg generated length: {metrics['avg_generated_length']:.1f} words")
print(f"   Avg reference length: {metrics['avg_reference_length']:.1f} words")
print(f"   Length ratio:         {metrics['length_ratio']:.2f}")

print(f"\n🎯 QUALITY ASSESSMENT:")
print(f"   Exact match rate:     {metrics['exact_match_rate']:.2%}")
if metrics['rouge1'] > 0.4:
    print(f"   ✅ Good content overlap (ROUGE-1 > 0.4)")
else:
    print(f"   ⚠️  Moderate content overlap (ROUGE-1 = {metrics['rouge1']:.4f})")

if 0.8 <= metrics['length_ratio'] <= 1.2:
    print(f"   ✅ Good length matching")
else:
    print(f"   ⚠️  Length mismatch (ratio = {metrics['length_ratio']:.2f})")

if metrics['success_rate'] > 0.95:
    print(f"   ✅ Excellent generation reliability")
else:
    print(f"   ⚠️  Generation reliability needs improvement")

# ─── 8. SHOW SAMPLE RESULTS ────────────────────────────────────────────────────
print(f"\n📰 SAMPLE GENERATED HEADLINES:")
print()
for i in range(min(5, len(generated_headlines))):
    if generated_headlines[i]:  # Only show successful generations
        print(f"   Example {i+1}:")
        print(f"   Reference: {reference_headlines[i]}")
        print(f"   Generated: {generated_headlines[i]}")
        print()

print("="*60)

# ─── 9. SAVE RESULTS ───────────────────────────────────────────────────────────
results = {
    "timestamp": datetime.now().isoformat(),
    "model_name": "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "evaluation_type": "unsloth_optimized_inference",
    "sample_size": sample_size,
    "metrics": metrics,
    "sample_results": [
        {
            "reference": ref,
            "generated": gen,
            "content_preview": test_samples[i]["content"][:100] + "..."
        }
        for i, (ref, gen) in enumerate(zip(reference_headlines[:10], generated_headlines[:10]))
        if gen  # Only include successful generations
    ]
}

# Save to file
with open("unsloth_evaluation_results.json", "w") as f:
    json.dump(results, f, indent=2)

print(f"\n💾 Detailed results saved to: unsloth_evaluation_results.json")
print(f"✅ Unsloth optimized evaluation completed!")

🚀 Optimizing model for inference...
✅ Model optimized for inference (2x speed boost)
📊 Randomly sampled 1000 examples from 5204 total test samples

🔄 Running comprehensive evaluation with Unsloth optimized inference...


Generating headlines:  10%|█         | 100/1000 [00:20<03:27,  4.34it/s]

   Progress: 100/1000 | Success rate: 100.0%


Generating headlines:  20%|██        | 200/1000 [00:41<02:43,  4.88it/s]

   Progress: 200/1000 | Success rate: 100.0%


Generating headlines:  30%|███       | 301/1000 [01:02<02:12,  5.27it/s]

   Progress: 300/1000 | Success rate: 100.0%


Generating headlines:  40%|████      | 401/1000 [01:22<02:04,  4.80it/s]

   Progress: 400/1000 | Success rate: 100.0%


Generating headlines:  50%|█████     | 500/1000 [01:43<01:42,  4.88it/s]

   Progress: 500/1000 | Success rate: 100.0%


Generating headlines:  60%|██████    | 601/1000 [02:03<01:17,  5.17it/s]

   Progress: 600/1000 | Success rate: 100.0%


Generating headlines:  70%|███████   | 700/1000 [02:23<00:54,  5.48it/s]

   Progress: 700/1000 | Success rate: 100.0%


Generating headlines:  80%|████████  | 801/1000 [02:44<00:38,  5.12it/s]

   Progress: 800/1000 | Success rate: 100.0%


Generating headlines:  90%|█████████ | 900/1000 [03:06<00:24,  4.13it/s]

   Progress: 900/1000 | Success rate: 100.0%


Generating headlines: 100%|██████████| 1000/1000 [03:29<00:00,  4.78it/s]

   Progress: 1000/1000 | Success rate: 100.0%

✅ Generation completed! Failed: 0/1000
📊 Calculating comprehensive metrics...

🎯 UNSLOTH OPTIMIZED INFERENCE EVALUATION REPORT

📊 SAMPLE STATISTICS:
   Total samples: 1000
   Valid generations: 1000
   Success rate: 100.00%

📝 CONTENT OVERLAP METRICS:
   ROUGE-1 (unigram): 0.4216
   ROUGE-2 (bigram):  0.1845
   ROUGE-L (LCS):     0.3737
   BLEU score:        0.0860

📏 LENGTH METRICS:
   Avg generated length: 8.0 words
   Avg reference length: 8.4 words
   Length ratio:         0.95

🎯 QUALITY ASSESSMENT:
   Exact match rate:     1.10%
   ✅ Good content overlap (ROUGE-1 > 0.4)
   ✅ Good length matching
   ✅ Excellent generation reliability

📰 SAMPLE GENERATED HEADLINES:

   Example 1:
   Reference: Stock market trading sees upward trend
   Generated: Stock market sees uptick in morning trade

   Example 2:
   Reference: Rain expected in 3 divisions
   Generated: Thunderstorms may occur in several regions; otherwise, partly cloudy skies expe




In [None]:
# ─── 10. WANDB LOGGING (FIXED) ─────────────────────────────────────────────────

# Initialize wandb if not already active
if not wandb.run:
    wandb.init(
        project="news2headline-tinyllama",
        name="unsloth-optimized-evaluation",
        notes="Comprehensive evaluation using Unsloth optimized inference",
        tags=["evaluation", "unsloth", "inference", "comprehensive"]
    )

# Log comprehensive metrics
wandb.log({
    "unsloth_inference/rouge1": metrics["rouge1"],
    "unsloth_inference/rouge2": metrics["rouge2"],
    "unsloth_inference/rougeL": metrics["rougeL"],
    "unsloth_inference/bleu": metrics["bleu"],
    "unsloth_inference/success_rate": metrics["success_rate"],
    "unsloth_inference/exact_match_rate": metrics["exact_match_rate"],
    "unsloth_inference/avg_gen_length": metrics["avg_generated_length"],
    "unsloth_inference/avg_ref_length": metrics["avg_reference_length"],
    "unsloth_inference/length_ratio": metrics["length_ratio"],
    "unsloth_inference/sample_size": metrics["total_samples"],
    "unsloth_inference/valid_generations": metrics["valid_generations"]
})

print(f"📊 Metrics logged to wandb: {wandb.run.url}")

# Create a summary table for wandb
import pandas as pd

summary_data = {
    "Metric": ["ROUGE-1", "ROUGE-2", "ROUGE-L", "BLEU", "Success Rate", "Exact Match"],
    "Score": [
        f"{metrics['rouge1']:.4f}",
        f"{metrics['rouge2']:.4f}",
        f"{metrics['rougeL']:.4f}",
        f"{metrics['bleu']:.4f}",
        f"{metrics['success_rate']:.2%}",
        f"{metrics['exact_match_rate']:.2%}"
    ]
}

summary_df = pd.DataFrame(summary_data)
wandb.log({"evaluation_summary": wandb.Table(dataframe=summary_df)})

# Log sample results
sample_data = []
for i in range(min(10, len(generated_headlines))):
    if generated_headlines[i]:
        sample_data.append([
            reference_headlines[i],
            generated_headlines[i],
            test_samples[i]["content"][:150] + "..."
        ])

if sample_data:
    sample_df = pd.DataFrame(sample_data, columns=["Reference", "Generated", "Content Preview"])
    wandb.log({"sample_results": wandb.Table(dataframe=sample_df)})

print("✅ Comprehensive evaluation data logged to wandb")
print(f"🔗 View results: {wandb.run.url}")

# Finish wandb run
wandb.finish()
print("📊 Wandb run completed successfully")