# Smart Summarizer: LoRA Fine-tuning

This notebook walks through the process of fine-tuning a large language model using LoRA (Low-Rank Adaptation) for academic paper summarization.

## Overview

1. Load and preprocess the arXiv dataset
2. Configure and apply LoRA to the base model
3. Fine-tune the model
4. Save the fine-tuned model
5. Generate summaries with both base and fine-tuned models

In [1]:
# Install required libraries if not already installed
!pip install -q transformers peft datasets accelerate bitsandbytes trl evaluate rouge-score nltk bert-score

In [2]:
# Import necessary libraries
import os
import sys
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import evaluate
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Add parent directory to path for importing custom modules
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)

## 1. Dataset Loading and Preprocessing

We'll load the arXiv dataset and preprocess it for our summarization task.

In [3]:
# Import our preprocessing module
from smart_summarizer.data.data_preprocessing import load_arxiv_dataset


# Load dataset
print("Loading arXiv dataset...")
dataset = load_arxiv_dataset()

# Display dataset information
print(f"Dataset size: {len(dataset)}")
print("Dataset features:", dataset.features)

# Display a sample
print("\nSample entry:")
sample = dataset[0]
# Note: The original dataset loaded by load_dataset('arxiv_dataset') might not have a 'title' column.
# If you encounter a KeyError for 'title', you might need to adjust the sample display.
# print(f"Title: {sample['title']}") # Uncomment or adjust if 'title' exists
print(f"Abstract (first 200 chars): {sample['abstract'][:200]}...")
print(f"Article (first 200 chars): {sample['article'][:200]}...")

Loading arXiv dataset...
Loading arXiv summarization dataset...


Selected 5000 samples from the dataset.
Dataset size: 5000
Dataset features: {'article': Value(dtype='string', id=None), 'abstract': Value(dtype='string', id=None)}

Sample entry:
Abstract (first 200 chars): we study the effect of @xmath0-symmetric complex potentials on the transport properties of non - hermitian systems , which consist of an infinite linear chain and two side - coupled defect points with...
Article (first 200 chars): two decades ago bender and boettcher have found that a broad family of non - hermitian hamiltonians can exhibit entirely real spectra as long as these hamiltonians have parity - time ( @xmath0 ) symme...


In [None]:
from smart_summarizer.data.data_preprocessing import split_dataset
# Split dataset
print("Splitting dataset into train, validation, and test sets...")
split_dataset = split_dataset(dataset)

print(f"Training set size: {len(split_dataset['train'])}")
print(f"Validation set size: {len(split_dataset['validation'])}")
print(f"Test set size: {len(split_dataset['test'])}")

Splitting dataset into train, validation, and test sets...
Splitting dataset into train, validation, and test sets...
Train set: 3999 samples
Validation set: 501 samples
Test set: 500 samples
Training set size: 3999
Validation set size: 501
Test set size: 500


## 2. Base Model and Tokenizer Setup

Load the base model and tokenizer that we'll fine-tune.

In [5]:
# Base model name
BASE_MODEL = "meta-llama/Llama-3-8B"

# Load tokenizer
print(f"Loading tokenizer for {BASE_MODEL}...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Set padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading tokenizer for meta-llama/Llama-3-8B...


OSError: meta-llama/Llama-3-8B is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
# Prepare the dataset for training
def prepare_inputs(examples):
    # Format the input: article text followed by abstract
    inputs = [f"Summarize this academic paper: {article}\n\nSummary:" for article in examples["article"]]
    targets = [f"{abstract}" for abstract in examples["abstract"]]
    
    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, truncation=True, max_length=1024)
    labels = tokenizer(targets, truncation=True, max_length=256)
    
    # Set up the labels
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Process datasets
print("Processing datasets with tokenizer...")
tokenized_datasets = {
    split: dataset.map(prepare_inputs, batched=True, remove_columns=["article", "abstract", "title"])
    for split, dataset in split_dataset.items()
}

print("Dataset processing complete!")

## 3. LoRA Configuration and Model Setup

Configure and apply LoRA to the base model.

In [None]:
# Load base model (quantized for memory efficiency)
print(f"Loading base model {BASE_MODEL}...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,  # Load in 8-bit precision to save memory
    device_map="auto",  # Automatically distribute across available devices
    torch_dtype=torch.float16,  # Use half-precision
)

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Rank of update matrices
    lora_alpha=16,  # Alpha parameter for LoRA scaling
    lora_dropout=0.1,  # Dropout probability for LoRA layers
    bias="none",  # Don't train bias terms
    task_type=TaskType.CAUSAL_LM,  # Task type (causal language modeling)
    target_modules=["q_proj", "v_proj"]  # Apply to query and value projection matrices
)

# Apply LoRA to the model
print("Applying LoRA to the model...")
model = get_peft_model(base_model, lora_config)

# Print trainable parameters info
print("\nTrainable parameters:")
model.print_trainable_parameters()

## 4. Model Training

Set up training arguments and train the model.

In [None]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="../smart_summarizer/models/lora_summarizer/checkpoints",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # To simulate larger batch sizes
    learning_rate=2e-4,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=100,
    load_best_model_at_end=True,
    fp16=True,  # Use mixed-precision training
    warmup_steps=100,
    report_to="none",  # Disable reporting to avoid extra dependencies
)

# Create a data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

# Start training
print("Starting training...")
trainer.train()

## 5. Save the Fine-tuned Model

Save the LoRA weights for later use.

In [None]:
# Save the fine-tuned model
output_dir = "../smart_summarizer/models/lora_summarizer/final_model"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)

# Save the tokenizer too
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

## 6. Generate Summaries with Base and Fine-tuned Models

Compare summaries generated by both models.

In [None]:
# Function to generate summaries
def generate_summary(model, tokenizer, text, max_new_tokens=256):
    # Prepare input
    prompt = f"Summarize this academic paper: {text}\n\nSummary:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            num_beams=4,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        
    # Decode and extract summary
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    summary = full_text.split("Summary:")[-1].strip()
    return summary

# Load test samples
test_samples = split_dataset["test"].select(range(10))  # Select first 10 test samples

# Load the fine-tuned model
fine_tuned_model = PeftModel.from_pretrained(base_model, output_dir)

# Generate and compare summaries for test samples
results = []

for i, sample in enumerate(tqdm(test_samples)):
    # Get the article and ground truth summary
    article = sample["article"]
    ground_truth = sample["abstract"]
    
    # Generate summaries with both models
    base_summary = generate_summary(base_model, tokenizer, article)
    fine_tuned_summary = generate_summary(fine_tuned_model, tokenizer, article)
    
    results.append({
        "index": i,
        "title": sample["title"],
        "ground_truth": ground_truth,
        "base_summary": base_summary,
        "fine_tuned_summary": fine_tuned_summary
    })

# Convert to DataFrame for easier viewing
results_df = pd.DataFrame(results)
results_df

## 7. Evaluate Generated Summaries

Calculate automatic evaluation metrics for the generated summaries.

In [None]:
# Import evaluation module
from smart_summarizer.evaluation.evaluation import SummaryEvaluator

# Initialize evaluator
evaluator = SummaryEvaluator(
    base_model_name=BASE_MODEL,
    lora_model_dir=output_dir,
    together_api_key=None  # We'll do automatic metrics only in this notebook
)

# Extract lists of summaries for evaluation
ground_truths = results_df["ground_truth"].tolist()
base_summaries = results_df["base_summary"].tolist()
fine_tuned_summaries = results_df["fine_tuned_summary"].tolist()

# Calculate metrics
print("Calculating ROUGE scores...")
base_rouge = evaluator.compute_rouge(base_summaries, ground_truths)
fine_tuned_rouge = evaluator.compute_rouge(fine_tuned_summaries, ground_truths)

print("Calculating BLEU scores...")
base_bleu = evaluator.compute_bleu(base_summaries, ground_truths)
fine_tuned_bleu = evaluator.compute_bleu(fine_tuned_summaries, ground_truths)

# Skip BERTScore if resources are limited
try:
    print("Calculating BERTScores...")
    base_bertscore = evaluator.compute_bertscore(base_summaries, ground_truths)
    fine_tuned_bertscore = evaluator.compute_bertscore(fine_tuned_summaries, ground_truths)
except:
    print("Skipping BERTScore calculation due to resource constraints.")
    base_bertscore = {"precision": 0, "recall": 0, "f1": 0}
    fine_tuned_bertscore = {"precision": 0, "recall": 0, "f1": 0}

In [None]:
# Display metrics comparison
metrics_comparison = {
    "Model": ["Base Model", "Fine-tuned Model"],
    "ROUGE-1": [base_rouge["rouge1"], fine_tuned_rouge["rouge1"]],
    "ROUGE-2": [base_rouge["rouge2"], fine_tuned_rouge["rouge2"]],
    "ROUGE-L": [base_rouge["rougeL"], fine_tuned_rouge["rougeL"]],
    "BLEU": [base_bleu, fine_tuned_bleu],
    "BERTScore F1": [base_bertscore["f1"], fine_tuned_bertscore["f1"]]
}

metrics_df = pd.DataFrame(metrics_comparison)
metrics_df

In [None]:
# Plot metrics for comparison
metrics_to_plot = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "BLEU", "BERTScore F1"]

fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(metrics_to_plot))
width = 0.35

base_scores = [metrics_df.loc[0, metric] for metric in metrics_to_plot]
fine_tuned_scores = [metrics_df.loc[1, metric] for metric in metrics_to_plot]

# Plot bars
base_bars = ax.bar(x - width/2, base_scores, width, label='Base Model')
fine_tuned_bars = ax.bar(x + width/2, fine_tuned_scores, width, label='Fine-tuned Model')

# Add labels and title
ax.set_ylabel('Score')
ax.set_title('Automatic Evaluation Metrics Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics_to_plot)
ax.legend()

# Add score labels on top of the bars
def add_labels(bars):
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

add_labels(base_bars)
add_labels(fine_tuned_bars)

plt.tight_layout()
plt.savefig("../smart_summarizer/evaluation/automatic_metrics.png")
plt.show()

## 8. Save Results for Further Analysis

Save the summaries and evaluation results for future reference.

In [None]:
# Save summaries
output_path = "../smart_summarizer/evaluation/summary_comparison.csv"
results_df.to_csv(output_path, index=False)
print(f"Summaries saved to {output_path}")

# Save metrics
metrics_path = "../smart_summarizer/evaluation/metrics_comparison.csv"
metrics_df.to_csv(metrics_path, index=False)
print(f"Metrics saved to {metrics_path}")