# 📘 Simulated Fine-Tuning BART for Text Summarization
# This notebook demonstrates how to fine-tune the facebook/bart-large-cnn model on a custom summarization dataset using the Hugging Face Transformers and
# Datasets libraries.


# Imports

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    AutoModelForSeq2SeqLM
)
import evaluate
import torch
from tqdm import tqdm


# Load the Dataset

In [None]:
# Read the custom summarization data and align its schema with the rest of the
# notebook: the raw "text" column becomes "document" (the "summary" column
# already has the expected name, so it is left alone).
df = pd.read_csv("enhanced_ai_llm_dataset.csv").rename(columns={"text": "document"})

# Wrap the frame as a Hugging Face Dataset and hold out 30% of the rows for
# testing; the fixed seed keeps the split reproducible.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.3, seed=42)
print(dataset)

# Peek at the first training pair (document clipped to 300 characters).
first_example = dataset["train"][0]
print("\nExample Document:\n", first_example["document"][:300])
print("\nExample Summary:\n", first_example["summary"])
df.columns

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'text_length', 'summary_length', 'text_word_count', 'summary_word_count', 'text_summary_similarity'],
        num_rows: 14
    })
    test: Dataset({
        features: ['document', 'summary', 'text_length', 'summary_length', 'text_word_count', 'summary_word_count', 'text_summary_similarity'],
        num_rows: 6
    })
})

Example Document:
 While GPT-4 (proprietary) leads in performance, open-source models (LLaMA-2, Mistral) offer transparency and customization. Businesses may prefer open models for data privacy, though they require more technical expertise. Key differences: Proprietary models: Better performance but less control over 

Example Summary:
 Open-source LLMs provide transparency and customization but require more technical expertise, while proprietary models offer better performance with less control.


Index(['document', 'summary', 'text_length', 'summary_length',
       'text_word_count', 'summary_word_count', 'text_summary_similarity'],
      dtype='object')

# Tokenization

In [None]:
# Load tokenizer
# Fetches the tokenizer published with the facebook/bart-large-cnn checkpoint.
# This same `tokenizer` object is used further down for preprocessing,
# for encoding inputs to generate(), and for decoding generated ids.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``;
            its "document" and "summary" entries are lists of strings.

    Returns:
        The tokenizer output for the documents, with an added ``labels``
        entry holding the token ids of the summaries.

    Documents are truncated to 512 tokens and summaries to 128, but nothing
    is padded here. Padding is left to ``DataCollatorForSeq2Seq`` (used by
    the trainer in this notebook), which pads each batch to its longest
    member and fills label padding with -100 so that padded positions are
    ignored by the loss. Padding the labels to a fixed length with real
    pad-token ids, as this function previously did, makes the model train on
    (and get credit for) predicting padding.
    """
    model_inputs = tokenizer(
        examples["document"],
        max_length=512,
        truncation=True,
    )
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply to dataset
# `dataset` is a DatasetDict, so map() runs over both the train and test splits.
# batched=True hands preprocess_function a batch of rows per call rather than
# one row at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/14 [00:00<?, ? examples/s]



Map:   0%|          | 0/6 [00:00<?, ? examples/s]

# Generate and Evaluate Summaries on the Test Set Before Fine-Tuning

In [None]:
# Baseline: the stock checkpoint, placed on the GPU when one is available.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# ROUGE is the metric used for both the baseline and the fine-tuned model.
rouge = evaluate.load("rouge")

# Held-out documents and their reference summaries.
test_texts = dataset["test"]["document"]
references = dataset["test"]["summary"]

# Summarize the test documents one at a time with 4-beam search,
# capping each summary at 128 tokens.
generated = []
for document in tqdm(test_texts, desc="Generating Summaries"):
    encoded = tokenizer(
        document,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(model.device)
    beam_output = model.generate(**encoded, max_length=128, num_beams=4)
    generated.append(tokenizer.decode(beam_output[0], skip_special_tokens=True))

# Score the generated summaries against the references.
results = rouge.compute(predictions=generated, references=references)
print("\n🔎 ROUGE Scores (Baseline Model):")
for metric_name, score in results.items():
    print(f"{metric_name}: {score:.4f}")

Generating Summaries: 100%|██████████| 6/6 [00:21<00:00,  3.62s/it]



🔎 ROUGE Scores (Baseline Model):
rouge1: 0.3348
rouge2: 0.1093
rougeL: 0.2456
rougeLsum: 0.2502


# Configure Trainer and Training Arguments for Seq2Seq Fine-Tuning

In [None]:
# Data collator: assembles tokenized examples into batches for the trainer.
# Passing `model=model` lets it derive decoder_input_ids from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_steps` only takes effect when an evaluation strategy is set; the
    # default strategy is "no", so without this line the eval_dataset passed
    # to the trainer would never be used.
    # NOTE(review): older transformers releases spell this argument
    # `evaluation_strategy` - confirm against the installed version.
    eval_strategy="steps",
    eval_steps=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    num_train_epochs=10,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=5,
    report_to="none"  # disable wandb/logging
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` supersedes the deprecated `tokenizer=` keyword
    # (the deprecated spelling emits a FutureWarning on recent releases).
    processing_class=tokenizer,
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(


In [None]:
# Start training
# Runs the fine-tuning loop with the trainer configured in the previous cell.
# The returned TrainOutput (global step, mean training loss, runtime metrics)
# is displayed as the cell result.
trainer.train()

Step,Training Loss
5,8.0443
10,3.2969
15,1.2701
20,0.6754
25,0.3953
30,0.2381
35,0.1776
40,0.1328
45,0.1009
50,0.0941




TrainOutput(global_step=70, training_loss=1.0477379428488867, metrics={'train_runtime': 199.9767, 'train_samples_per_second': 0.7, 'train_steps_per_second': 0.35, 'total_flos': 151697322147840.0, 'train_loss': 1.0477379428488867, 'epoch': 10.0})

# Generate and Evaluate Summaries Using the Fine-Tuned Model

In [None]:
# Generate summaries using the fine-tuned model.
# The training loop puts the model in training mode and nothing in this
# notebook switches it back, so dropout would still be active during
# generation. Switch to eval mode explicitly so the fine-tuned scores are
# measured under the same conditions as the baseline.
model.eval()

generated_ft = []

for text in tqdm(dataset["test"]["document"], desc="Generating Fine-Tuned Summaries"):
    # Same encoding and decoding settings as the baseline run so the two
    # sets of ROUGE scores are directly comparable.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    summary_ids = model.generate(**inputs, max_length=128, num_beams=4)
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_ft.append(output)

# Evaluate fine-tuned model
results_ft = rouge.compute(predictions=generated_ft, references=dataset["test"]["summary"])
print("\n📈 ROUGE Scores (Fine-Tuned Model):")
for k, v in results_ft.items():
    print(f"{k}: {v:.4f}")

Generating Fine-Tuned Summaries: 100%|██████████| 6/6 [00:10<00:00,  1.67s/it]


📈 ROUGE Scores (Fine-Tuned Model):
rouge1: 0.3009
rouge2: 0.0747
rougeL: 0.2083
rougeLsum: 0.2096





# Demo Using the Gradio UI

In [None]:

import gradio as gr
from transformers import BartTokenizer, BartForConditionalGeneration

# Load tokenizer and model from Hugging Face Hub
MODEL_REPO = "AymB2/fine_tuned_bart_model"

# NOTE: this rebinds the notebook-level `tokenizer` and `model` names, so from
# here on they refer to the Hub checkpoint rather than the objects created and
# trained in the earlier cells.
try:
    tokenizer = BartTokenizer.from_pretrained(MODEL_REPO)
    model = BartForConditionalGeneration.from_pretrained(MODEL_REPO)
    print(tokenizer)
except Exception as e:
    # Chain explicitly so the traceback reports the loading failure as the
    # direct cause of the RuntimeError.
    raise RuntimeError(f"Failed to load model from Hugging Face: {e}") from e
def summarize(text):
    """Summarize ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string. For ``None``, empty, or whitespace-only
        input a prompt asking for text is returned instead; if tokenization
        or generation raises, the error message is returned as a string
        rather than propagated.

    The token limits come from the module-level ``max_input_length`` and
    ``max_summary_length``, which are looked up when the function is called.
    """
    # Guard clause: nothing to summarize.
    if not (text and text.strip()):
        return "Please enter some text."

    try:
        # Encode the text, truncated/padded to the configured input length,
        # and move the tensors to wherever the model lives.
        encoded = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_input_length,
        )
        encoded = encoded.to(model.device)

        # 4-beam search, stopping early once the beams are finished.
        generated_ids = model.generate(
            encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=max_summary_length,
            num_beams=4,
            early_stopping=True,
        )

        return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    except Exception as exc:
        # Report the failure in the UI instead of raising.
        return f"Error during summarization: {exc!s}"


# Token limits read by summarize(): the input is truncated/padded to
# max_input_length tokens and the generated summary is capped at
# max_summary_length tokens.
max_input_length = 133
max_summary_length = 31

# A ten-line text box for the input; the summary is shown as plain text.
input_box = gr.Textbox(lines=10, placeholder="Enter text to summarize...")

iface = gr.Interface(
    fn=summarize,
    inputs=input_box,
    outputs="text",
    title="Fine-Tuned BART Summarizer",
    description="Enter any text and get a summary generated by the fine-tuned model.",
)

iface.launch()
