# Smart Summarizer: LoRA Fine-Tuning with Unsloth (Colab/T4 Ready)

This notebook implements Part A of the assignment using the [Unsloth](https://github.com/unslothai/unsloth) library for efficient LLM fine-tuning on a Colab T4 GPU.

**Steps:**
1. Data loading & preprocessing
2. LoRA fine-tuning with Unsloth
3. Inference (summarization)
4. Evaluation (ROUGE, BLEU, BERTScore)
5. LLM-as-a-Judge qualitative evaluation (Together.ai API)

In [None]:
# Install required libraries
!pip install unsloth datasets peft transformers accelerate bitsandbytes evaluate bert-score rouge-score nltk --quiet

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

## 1. Data Loading & Preprocessing

In [None]:
from datasets import load_dataset
import random

# Download (or reuse the cached copy of) the arXiv summarization corpus
dataset = load_dataset('ccdv/arxiv-summarization')

# Seed Python's global RNG, then draw a reproducible 5,000-example sample
# from the train split (the shuffle itself is seeded independently).
random.seed(42)
train_split = dataset['train']
subset = train_split.shuffle(seed=42).select(range(5000))

# Model input is the full article text; the target is the paper's abstract.
# Columns are read directly instead of walking the rows one by one.
inputs = list(subset['article'])
targets = list(subset['abstract'])

### Tokenization & Data Splitting

In [None]:
import os
from getpass import getpass

from unsloth import FastLanguageModel
from sklearn.model_selection import train_test_split

import torch


max_seq_length = 2048  # Context window (in tokens) used for training and inference
dtype = None  # None lets Unsloth auto-detect: float16 on T4/V100, bfloat16 on Ampere+
load_in_4bit = True  # 4-bit quantization so an 8B model fits in T4 memory

# Credentials must never be committed to the notebook. The token is taken from
# the HF_TOKEN environment variable, falling back to an interactive prompt.
# NOTE: any token that was previously hard-coded here should be treated as
# compromised and revoked in the Hugging Face account settings.
token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')

# Choose a base model (Llama-3-8B or Mistral-7B)
BASE_MODEL = 'meta-llama/Meta-Llama-3-8B'  # or 'unsloth/mistral-7b-bnb-4bit'

# Load the base model together with its tokenizer.
# FastLanguageModel.from_pretrained returns a (model, tokenizer) pair.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=token,
)

# Tokenize the article text of a single example
def tokenize_function(example, max_length=None):
    """Tokenize ``example['input']`` into a fixed-length encoding.

    Args:
        example: Mapping with an ``'input'`` key holding the article text.
        max_length: Length to truncate/pad to. Defaults to the notebook-wide
            ``max_seq_length`` so the two settings cannot drift apart.

    Returns:
        The tokenizer's encoding for the article, truncated and padded to
        ``max_length`` tokens.
    """
    if max_length is None:
        # Resolved at call time so later edits to max_seq_length are honoured.
        max_length = max_seq_length
    return tokenizer(
        example['input'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

# Pair every article with its reference abstract
data = [
    {'input': article, 'target': abstract}
    for article, abstract in zip(inputs, targets)
]

# 80/10/10 split: hold out 20% first, then halve the hold-out set
# into validation and test portions (both splits are seeded).
train_data, temp_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

## 2. LoRA Fine-Tuning with Unsloth

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported

from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer

# LoRA config (keyword names follow FastLanguageModel.get_peft_model)
lora_config = {
    'r': 8,
    'lora_alpha': 16,
    'lora_dropout': 0.1,
    'bias': 'none',
    'target_modules': ['q_proj', 'v_proj'],  # attention projections for Llama/Mistral
    'use_gradient_checkpointing': 'unsloth',  # Unsloth's memory-saving checkpointing
    'random_state': 42,
}

# Attach LoRA adapters to the already-loaded base model. Only the adapter
# weights are trained; the quantized base weights stay frozen.
model = FastLanguageModel.get_peft_model(model, **lora_config)

# A causal LM is trained on a single text sequence, so every example is
# rendered as "<prompt with article><summary><eos>". The article is clipped so
# that prompt and summary fit inside max_seq_length together.
SUMMARY_PROMPT = (
    'Summarize the following research article.\n\n'
    '### Article:\n{article}\n\n'
    '### Summary:\n'
)
MAX_SUMMARY_TOKENS = 512
PROMPT_OVERHEAD_TOKENS = 64  # room for the template text and special tokens
MAX_ARTICLE_TOKENS = max_seq_length - MAX_SUMMARY_TOKENS - PROMPT_OVERHEAD_TOKENS


def _clip_to_tokens(text, max_tokens):
    """Return `text` cut down to at most `max_tokens` tokenizer tokens."""
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=max_tokens,
    )
    return tokenizer.decode(token_ids, skip_special_tokens=True)


# Prepare datasets for supervised fine-tuning
def format_example(example):
    """Render one {'input', 'target'} pair as a single training text.

    Args:
        example: Mapping with the article under 'input' and the reference
            abstract under 'target'.

    Returns:
        A dict with one key, 'text', holding prompt + summary + EOS token.
        The EOS token teaches the model where a summary ends.
    """
    article = _clip_to_tokens(example['input'], MAX_ARTICLE_TOKENS)
    summary = _clip_to_tokens(example['target'], MAX_SUMMARY_TOKENS)
    return {'text': SUMMARY_PROMPT.format(article=article) + summary + tokenizer.eos_token}


train_dataset = Dataset.from_list([format_example(example) for example in train_data])
val_dataset = Dataset.from_list([format_example(example) for example in val_data])

# Training arguments
training_args = TrainingArguments(
    output_dir='outputs',
    num_train_epochs=4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy='steps',  # named `evaluation_strategy` before transformers 4.41
    eval_steps=100,
    save_steps=500,
    logging_steps=50,
    learning_rate=2e-4,
    # Mixed precision: bf16 where the GPU supports it, fp16 otherwise (e.g. T4)
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    seed=42,
    report_to='none',  # no experiment-tracker login prompt inside the notebook
)

# Trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field='text',
    max_seq_length=max_seq_length,
    packing=False,
    args=training_args,
)

# Train
trainer.train()

# Save the LoRA adapter and the tokenizer next to each other
model.save_pretrained('lora_summarizer_final')
tokenizer.save_pretrained('lora_summarizer_final')

## 3. Inference: Generate Summaries

In [None]:
# Select 10 random test samples
import numpy as np
from unsloth import FastLanguageModel

np.random.seed(42)
sample_indices = np.random.choice(len(test_data), 10, replace=False)
test_samples = [test_data[i] for i in sample_indices]

# Prompt shown to the model at inference time. It has to be identical to the
# format the model was fine-tuned on, otherwise the adapter sees unfamiliar input.
SUMMARY_PROMPT = (
    'Summarize the following research article.\n\n'
    '### Article:\n{article}\n\n'
    '### Summary:\n'
)
# Token budget for the article: context window minus room for a summary (512)
# and for the template text / special tokens (64).
ARTICLE_TOKEN_BUDGET = max_seq_length - 512 - 64


def generate_summaries(lm, samples, max_new_tokens=256):
    """Generate one summary per sample with the given language model.

    Args:
        lm: Causal language model exposing `generate` and `device`.
        samples: Iterable of dicts holding the article under 'input'.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        List of generated summaries (prompt removed), in sample order.
    """
    FastLanguageModel.for_inference(lm)  # switch Unsloth to its inference path
    summaries = []
    for sample in samples:
        article_ids = tokenizer.encode(
            sample['input'],
            add_special_tokens=False,
            truncation=True,
            max_length=ARTICLE_TOKEN_BUDGET,
        )
        article = tokenizer.decode(article_ids, skip_special_tokens=True)
        prompt = SUMMARY_PROMPT.format(article=article)
        # Inputs must live on the same device as the model weights.
        encoded = tokenizer(prompt, return_tensors='pt').to(lm.device)
        # Greedy decoding keeps the evaluation reproducible between runs.
        output = lm.generate(**encoded, max_new_tokens=max_new_tokens, do_sample=False)
        # generate() returns prompt + continuation; keep only the continuation
        # so the metrics score the summary and not the echoed article.
        prompt_length = encoded['input_ids'].shape[1]
        generated_ids = output[0][prompt_length:]
        summaries.append(tokenizer.decode(generated_ids, skip_special_tokens=True).strip())
    return summaries


# Generate summaries with fine-tuned model
ft_summaries = generate_summaries(model, test_samples)

# Generate summaries with base model.
# from_pretrained returns (model, tokenizer); the tokenizer is already loaded.
base_model, _ = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=token,
)
base_summaries = generate_summaries(base_model, test_samples)

# Ground-truth summaries
gt_summaries = [sample['target'] for sample in test_samples]

## 4. Evaluation: ROUGE, BLEU, BERTScore

In [None]:
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score

# Load the ROUGE metric once through the `evaluate` library; compute_metrics reuses it
rouge = evaluate.load('rouge')

def compute_metrics(preds, refs):
    """Score each prediction against its reference with four metrics.

    Args:
        preds: List of generated summaries.
        refs: List of reference summaries, aligned with `preds` by index.

    Returns:
        Dict with keys 'rouge1', 'rougeL', 'bleu' and 'bertscore', each
        mapping to a list with one score per prediction.
    """
    # BERTScore is computed for the whole batch in one call; only F1 is kept.
    _, _, f1_scores = bert_score(preds, refs, lang='en', rescale_with_baseline=True)
    smoothing = SmoothingFunction().method1

    rouge1_scores, rougeL_scores, bleu_scores, bert_f1_scores = [], [], [], []
    for idx, prediction in enumerate(preds):
        reference = refs[idx]
        rouge_scores = rouge.compute(predictions=[prediction], references=[reference])
        # Whitespace-tokenized sentence BLEU against the single reference.
        bleu_score = sentence_bleu(
            [reference.split()],
            prediction.split(),
            smoothing_function=smoothing,
        )
        rouge1_scores.append(rouge_scores['rouge1'])
        rougeL_scores.append(rouge_scores['rougeL'])
        bleu_scores.append(bleu_score)
        bert_f1_scores.append(f1_scores[idx].item())

    return {
        'rouge1': rouge1_scores,
        'rougeL': rougeL_scores,
        'bleu': bleu_scores,
        'bertscore': bert_f1_scores,
    }

# Score the fine-tuned and the base model against the same reference summaries
ft_metrics = compute_metrics(ft_summaries, gt_summaries)
base_metrics = compute_metrics(base_summaries, gt_summaries)

### Visualize Evaluation Results

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Display names on the x-axis and the matching keys in the metric dicts
labels = ['ROUGE-1', 'ROUGE-L', 'BLEU', 'BERTScore']
metric_keys = ['rouge1', 'rougeL', 'bleu', 'bertscore']

# Mean score per metric for each model
ft_means = [np.mean(ft_metrics[key]) for key in metric_keys]
base_means = [np.mean(base_metrics[key]) for key in metric_keys]

# Grouped bars: the two models sit side by side around each tick position
width = 0.35
x = np.arange(len(labels))

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, ft_means, width, label='Fine-tuned')
rects2 = ax.bar(x + width/2, base_means, width, label='Base')

ax.set_title('Evaluation Metrics')
ax.set_ylabel('Score')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()

## 5. LLM-as-a-Judge (Qualitative Evaluation via Together.ai API)

In [None]:
# LLM-as-a-Judge: configuration for the Together.ai chat-completions API
import os
from getpass import getpass

import requests

# The API key is read from the TOGETHER_API_KEY environment variable, falling
# back to an interactive prompt. Secrets must never be stored in the notebook.
# NOTE: any key that was previously hard-coded here should be treated as
# compromised and rotated in the Together.ai dashboard.
TOGETHER_API_KEY = os.environ.get('TOGETHER_API_KEY') or getpass('Together.ai API key: ')
# Together model identifiers include the organisation namespace.
JUDGE_MODEL = 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'

def llm_judge(input_text, summary):
    """Ask the judge model to rate a summary for fluency, factuality and coverage.

    Args:
        input_text: The source text that was summarized.
        summary: The generated summary to evaluate.

    Returns:
        The judge model's reply text (scores and justifications).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    prompt = f'''Given the following input and the summary produced, evaluate the summary on \n1. Fluency \n2. Factuality \n3. Coverage \nUse a score from 1 (poor) to 5 (excellent) for each. Provide a short justification for each score.\nInput: {input_text}\nGenerated Summary: {summary}'''
    response = requests.post(
        'https://api.together.xyz/v1/chat/completions',
        headers={
            'Authorization': f'Bearer {TOGETHER_API_KEY}',
            'Content-Type': 'application/json',
        },
        json={
            'model': JUDGE_MODEL,
            'messages': [{'role': 'user', 'content': prompt}],
            'max_tokens': 256,
        },
        timeout=60,  # do not hang the notebook on a stalled connection
    )
    # Surface authentication / quota / model-name problems as a clear HTTP
    # error instead of a confusing KeyError while parsing the body.
    response.raise_for_status()
    # The endpoint is OpenAI-compatible: the reply text is the message content
    # of the first choice.
    return response.json()['choices'][0]['message']['content']

# Judge every fine-tuned summary against its source article. Iterating over
# the paired lists avoids a hard-coded sample count going out of range.
for sample, summary in zip(test_samples, ft_summaries):
    print(llm_judge(sample['input'], summary))