# **Block 1: Environment Setup and Installation**

In [None]:
!pip install datasets transformers sentence-transformers evaluate nltk rouge-score bert_score huggingface_hub fastapi uvicorn optuna sumy accelerate



In [None]:
!pip install protobuf==3.20.*




 **Citations/References:**


*   Hugging Face Transformers Installation: https://huggingface.co/docs/transformers/installation
*   NLTK Installation Guide: https://www.nltk.org/install.html



# **Block 2: Imports, Configuration, and Global Settings**

In [None]:
import inspect
import logging
import os
import random
import re

import evaluate
import nltk
import optuna
import torch
import uvicorn
from datasets import load_dataset, Dataset, concatenate_datasets
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from sentence_transformers import SentenceTransformer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments,
    DataCollatorForSeq2Seq, EarlyStoppingCallback, pipeline, BartForConditionalGeneration
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NLTK data needed later in the notebook. 'punkt_tab' is requested next to the
# original 'punkt'/'wordnet' because the recorded run shows it being fetched
# lazily further down; asking for it here keeps every download in the setup
# cell. NOTE(review): presumably required by the sentence tokenizer on newer
# NLTK releases -- confirm against the installed version.
for _nltk_resource in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(_nltk_resource, quiet=True)


# Reproducibility: seed Python's RNG and torch (CPU, plus every CUDA device
# when one is available).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Disable progress bars from Hugging Face Hub to keep output clean
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()

**Citations/References**:


*   NLTK Documentation: https://www.nltk.org/
*   Hugging Face Datasets & Transformers Docs: https://huggingface.co/docs/



# **Block 3: Enhanced Data Preparation with Dataset Prefixes**

In [None]:
def load_summarization_datasets(sample_size=500):
    """Load the four summarization corpora and keep a small slice of each train split.

    Args:
        sample_size: Number of leading training examples to keep per corpus
            (default 500, the value that used to be hard-coded). Clamped to
            the size of the train split.

    Returns:
        dict mapping the short corpus key ("cnn", "xsum", "multi_news",
        "gigaword") to the selected subset of that corpus' "train" split.

    Raises:
        ValueError: if ``sample_size`` is smaller than 1.
    """
    sample_size = int(sample_size)
    if sample_size < 1:
        raise ValueError("sample_size must be a positive integer")

    # key -> (positional args for load_dataset, keyword args for load_dataset)
    sources = {
        "cnn": (("cnn_dailymail", "3.0.0"), {}),
        "xsum": (("xsum",), {"trust_remote_code": True}),
        "multi_news": (("multi_news",), {"trust_remote_code": True}),
        "gigaword": (("gigaword",), {"trust_remote_code": True}),
    }

    datasets_dict = {}
    for key, (load_args, load_kwargs) in sources.items():
        train_split = load_dataset(*load_args, **load_kwargs)["train"]
        # Never ask for more rows than the split holds.
        n_rows = min(sample_size, len(train_split))
        datasets_dict[key] = train_split.select(range(n_rows))
    return datasets_dict

def preprocess_example(example, dataset_name):
    """Turn one raw record into a ``{"text", "summary"}`` pair.

    The source text is read from ``"article"`` (falling back to
    ``"document"``), the target from ``"highlights"`` (falling back to
    ``"summary"``); a missing or empty field becomes ``""``. Runs of
    whitespace are collapsed to single spaces and the text is prefixed with
    the tag belonging to ``dataset_name``.

    Raises:
        KeyError: if ``dataset_name`` has no tag defined.
    """
    tag_by_dataset = {
        "cnn": "[CNN] ",
        "xsum": "[XSum] ",
        "multi_news": "[MultiNews] ",
        "gigaword": "[Gigaword] "
    }

    def _squash(value):
        # Collapse any whitespace run (spaces, tabs, newlines) to one space.
        return " ".join(value.split())

    raw_text = example.get("article") or example.get("document") or ""
    raw_summary = example.get("highlights") or example.get("summary") or ""

    return {
        "text": tag_by_dataset[dataset_name] + _squash(raw_text),
        "summary": _squash(raw_summary),
    }

def prepare_datasets(dataset_dict):
    """Split every corpus 80/10/10 and preprocess each split.

    Each dataset is first split 80/20, then the 20% part is halved into the
    validation and test portions (both splits use ``SEED``). Every portion is
    mapped through ``preprocess_example`` and the raw source columns are
    dropped, so all corpora end up with the same ``text`` / ``summary``
    columns and can be concatenated without null-padded leftovers.

    Args:
        dataset_dict: mapping of corpus key -> dataset, as returned by
            ``load_summarization_datasets``.

    Returns:
        dict: corpus key -> ``{"train": ..., "val": ..., "test": ...}``.
    """
    processed = {}
    for name, ds in dataset_dict.items():
        full_ds = ds.train_test_split(test_size=0.2, seed=SEED)
        test_valid = full_ds["test"].train_test_split(test_size=0.5, seed=SEED)
        raw_splits = {
            "train": full_ds["train"],
            "val": test_valid["test"],
            "test": test_valid["train"],
        }
        processed[name] = {
            split_name: split.map(
                preprocess_example,
                # Pass the corpus key explicitly instead of closing over the
                # loop variable in a lambda.
                fn_kwargs={"dataset_name": name},
                # Columns returned by the function (e.g. "summary") are kept
                # even when listed here; only the raw inputs are dropped.
                remove_columns=split.column_names,
            )
            for split_name, split in raw_splits.items()
        }
    return processed

**Citation/References:**


*   Hugging Face Datasets Documentation: https://huggingface.co/docs/datasets/
*   CNN/DailyMail Dataset Paper: https://arxiv.org/abs/1506.03340




# **Block 4: Model & Tokenizer Initialization with BART-large**

In [None]:
# Checkpoint identifier; the tokenizer and the model weights below are both
# loaded from it.
MODEL_NAME = "facebook/bart-large-cnn"
# Module-level instances: later cells (HybridSummarizer construction,
# train_model, the FastAPI cell) refer to these `tokenizer` / `model` globals.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


**Citation/Reference:**

*   BART Model Documentation: https://huggingface.co/docs/transformers/model_doc/bart
*   BART Paper (Lewis et al., 2019): https://arxiv.org/abs/1910.13461



# **Block 5: Complete Hyperparameter Optimization**

In [None]:
class HyperparameterOptimizer:
    """Optuna objective wrapper that scores summarizer settings by mean ROUGE-L.

    A fixed, shuffled sample of the validation set is summarized with each
    candidate parameter set and compared to the reference summaries.
    """

    def __init__(self, summarizer, val_dataset, sample_size=5):
        """
        Args:
            summarizer: object exposing ``summarize(text, **params)``.
            val_dataset: dataset of rows with ``"text"`` and ``"summary"``;
                must support ``len``, ``shuffle(seed=...)`` and ``select``.
            sample_size: number of validation rows scored per trial. It is
                clamped to the dataset size, so a small validation set no
                longer makes ``select`` fail.
        """
        self.summarizer = summarizer
        # Use a very small sample for quick optimization
        n_samples = max(0, min(int(sample_size), len(val_dataset)))
        self.val_sample = val_dataset.shuffle(seed=42).select(range(n_samples))
        self.evaluator = EnhancedEvaluator()

    # Keys forwarded from a trial's parameter dict to ``summarize``.
    _SUMMARIZE_KEYS = (
        "max_length",
        "min_length",
        "num_key_sentences",
        "temperature",
        "length_penalty",
    )

    def evaluate_on_validation_set(self, params):
        """Return the mean ROUGE-L of ``params`` over the validation sample.

        Examples whose summarization or scoring raises are logged and left out
        of the average; when nothing could be scored the result is ``0.0``.
        """
        # Forward every tuned value that is present, including ``min_length``
        # (previously sampled by ``objective`` but never passed on).
        overrides = {key: params[key] for key in self._SUMMARIZE_KEYS if key in params}
        scores = []
        for example in self.val_sample:
            try:
                generated = self.summarizer.summarize(example["text"], **overrides)
                scores.append(self.evaluator.evaluate(generated, example["summary"])["rougeL"])
            except Exception as e:
                # Best effort: one failing example must not abort the trial.
                logger.error(f"Error during evaluation: {e}")
                continue
        return sum(scores) / len(scores) if scores else 0.0

    def objective(self, trial):
        """Sample one parameter set from ``trial`` and return its validation score."""
        params = {
            "max_length": trial.suggest_int("max_length", 60, 120),
            "min_length": trial.suggest_int("min_length", 30, 70),
            "num_key_sentences": trial.suggest_int("num_key_sentences", 2, 5),
            "temperature": trial.suggest_float("temperature", 0.8, 1.2),
            "length_penalty": trial.suggest_float("length_penalty", 0.8, 2.0)
        }
        return self.evaluate_on_validation_set(params)

def optimize_hyperparameters(summarizer, val_dataset, n_trials=6, sample_size=50, seed=42):
    """Run a short Optuna study and return the best summarizer parameters.

    Args:
        summarizer: summarizer handed to ``HyperparameterOptimizer``.
        val_dataset: validation dataset the trials are scored on.
        n_trials: number of Optuna trials (default 6, the previous fixed value).
        sample_size: validation rows scored per trial (default 50, the number
            of rows that was previously always used).
        seed: seed for the TPE sampler so repeated runs propose the same
            parameter sequence.

    Returns:
        dict: ``study.best_params`` of the maximized study.
    """
    optimizer = HyperparameterOptimizer(summarizer, val_dataset, sample_size=sample_size)
    # Seed the sampler explicitly; an unseeded sampler makes the search
    # differ between runs even though the rest of the notebook is seeded.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(optimizer.objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params

**Citation/Reference:**


*   Optuna Documentation: https://optuna.org/




# **Block 6: Enhanced Hybrid Summarization with BART**

In [None]:
class HybridSummarizer:
    """Extract-then-abstract summarizer: TextRank key sentences fed to a seq2seq model.

    ``summarize`` first selects key sentences with sumy's TextRank, wraps them
    in a fixed prompt and lets the model generate the final summary with beam
    search.
    """

    # Fallback generation settings for any key missing from ``best_params``
    # and from the per-call overrides.
    DEFAULT_PARAMS = {
        "max_length": 92,
        "min_length": 40,  # default for generation outside optuna
        "num_key_sentences": 3,
        "temperature": 1.0,
        "length_penalty": 1.2
    }

    def __init__(self, model, tokenizer):
        """
        Args:
            model: generation model exposing ``generate`` (and ``to``).
            tokenizer: matching tokenizer; called on the prompt and used to
                decode the generated ids.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.text_rank = TextRankSummarizer()
        # Instance-level settings; may be replaced after tuning.
        self.best_params = dict(self.DEFAULT_PARAMS)

    def extract_key_sentences(self, text, num_sentences=3):
        """Return the ``num_sentences`` TextRank-selected sentences joined by spaces."""
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        ranked = self.text_rank(parser.document, num_sentences)
        return " ".join(str(sentence) for sentence in ranked)

    def summarize(self, text, **kwargs):
        """Summarize ``text``.

        Args:
            text: article to summarize. Blank input returns ``""`` without
                calling the model.
            **kwargs: per-call overrides for ``max_length``, ``min_length``,
                ``num_key_sentences``, ``temperature`` and ``length_penalty``.
                They take precedence over ``self.best_params``; previously
                they were accepted but ignored, so every tuning trial ran
                with identical settings.

        Returns:
            str: the decoded summary.
        """
        if not text or not text.strip():
            return ""

        # Precedence: class defaults < instance settings < per-call overrides.
        params = {**self.DEFAULT_PARAMS, **self.best_params, **kwargs}
        max_length = int(params["max_length"])
        # A minimum above the maximum is an unsatisfiable constraint; cap it.
        min_length = max(0, min(int(params["min_length"]), max_length))

        key_points = self.extract_key_sentences(text, int(params["num_key_sentences"]))
        if not key_points.strip():
            # Nothing extracted: fall back to the whitespace-normalized input.
            key_points = " ".join(text.split())
        prompt = f"Generate a news summary from these key points: {key_points}"

        inputs = self.tokenizer(
            prompt,
            max_length=1024,
            truncation=True,
            return_tensors="pt"
        )
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
            self.model.to("cuda")

        generation_kwargs = {
            "num_beams": 4,
            "max_length": max_length,
            "min_length": min_length,
            "length_penalty": float(params["length_penalty"]),
            "do_sample": False,  # disables sampling for determinism
            "early_stopping": True,
        }
        # ``temperature`` is accepted for interface compatibility but not
        # forwarded: it only affects sampling-based decoding, and passing it
        # with do_sample=False merely triggers a configuration warning.
        if "attention_mask" in inputs:
            generation_kwargs["attention_mask"] = inputs["attention_mask"]

        summary_ids = self.model.generate(inputs["input_ids"], **generation_kwargs)

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

**Citation/Reference:**


*   Sumy Library on PyPI: https://pypi.org/project/sumy/




# **Block 7: Fine-Tuning Setup (Trainer)**

In [None]:
def train_model(train_dataset, val_dataset):
    """Fine-tune the module-level ``model`` and return the fitted ``Trainer``.

    Args:
        train_dataset: training data with ``"text"`` / ``"summary"`` string
            columns, or already tokenized (``input_ids`` and ``labels``).
        val_dataset: validation data in the same format.

    Returns:
        Trainer: the trainer after ``train()``; with
        ``load_best_model_at_end`` it holds the checkpoint with the lowest
        validation loss.

    Notes:
        Checkpoint selection and early stopping use ``eval_loss``. No
        ``compute_metrics`` is passed to the trainer, so a ROUGE-based
        ``metric_for_best_model`` would not be present in the evaluation
        metrics.
    """

    def _tokenize(batch):
        # Encode sources as model inputs and references as labels.
        features = tokenizer(batch["text"], max_length=1024, truncation=True)
        targets = tokenizer(text_target=batch["summary"], max_length=128, truncation=True)
        features["labels"] = targets["input_ids"]
        return features

    def _ensure_tokenized(ds):
        # Raw text columns cannot be collated; tokenize unless already done.
        if "input_ids" in ds.column_names and "labels" in ds.column_names:
            return ds
        return ds.map(_tokenize, batched=True, remove_columns=ds.column_names)

    train_dataset = _ensure_tokenized(train_dataset)
    val_dataset = _ensure_tokenized(val_dataset)

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever name the installed version accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        **{eval_strategy_key: "epoch"},
    )

    # Same idea for the Trainer's tokenizer argument (`processing_class` in
    # newer releases, `tokenizer` in older ones).
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    return trainer

**Citation/Reference:**


*   Hugging Face Trainer Documentation: https://huggingface.co/docs/transformers/main_classes/trainer




# **Block 8: Enhanced Evaluation Module**

In [None]:
class EnhancedEvaluator:
    """Scores one generated summary against one reference with ROUGE, BERTScore and METEOR."""

    def __init__(self):
        # Metrics are loaded by name, in this order: rouge, bertscore, meteor.
        self.rouge, self.bertscore, self.meteor = (
            evaluate.load(metric_id) for metric_id in ("rouge", "bertscore", "meteor")
        )

    def evaluate(self, generated, reference):
        """Return a dict with ``rouge1``, ``rouge2``, ``rougeL``, ``bertscore_f1`` and ``meteor``.

        Each metric is computed on the single (generated, reference) pair.
        """

        def _pair():
            # Fresh single-element lists for every metric call.
            return {"predictions": [generated], "references": [reference]}

        rouge_result = self.rouge.compute(use_stemmer=True, **_pair())
        bert_result = self.bertscore.compute(lang="en", **_pair())
        meteor_result = self.meteor.compute(**_pair())

        report = {name: rouge_result[name] for name in ("rouge1", "rouge2", "rougeL")}
        # BERTScore returns per-example lists; take the only entry.
        report["bertscore_f1"] = bert_result["f1"][0]
        report["meteor"] = meteor_result["meteor"]
        return report

**Citation/Reference:**


*   Evaluate Library Documentation: https://huggingface.co/docs/evaluate/
*   ROUGE: https://www.aclweb.org/anthology/W04-1013/
*   METEOR: https://aclanthology.org/W05-0909/
*   BERTScore: https://arxiv.org/abs/1904.09675




# **Block 9: FastAPI Deployment**

In [None]:
app = FastAPI(title="Optimized News Summarization API")
# One shared summarizer for all requests, built on the module-level model/tokenizer.
summarizer_api = HybridSummarizer(model, tokenizer)

@app.post("/summarize")
async def summarize_endpoint(request: Request):
    """Summarize the ``"text"`` field of a JSON request body.

    Returns ``{"summary": ...}`` on success. A body that is not valid JSON,
    is not a JSON object, or has a missing/blank/non-string ``"text"`` gets
    an HTTP 400 response with an ``{"error": ...}`` payload.
    """
    try:
        data = await request.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses.
        return JSONResponse(status_code=400, content={"error": "Request body must be valid JSON"})

    text = data.get("text", "") if isinstance(data, dict) else ""
    if not isinstance(text, str) or not text.strip():
        # Returning `{...}, 400` would be serialized as a JSON array with
        # status 200; an explicit response object carries the status code.
        return JSONResponse(status_code=400, content={"error": "No text provided"})

    summary = summarizer_api.summarize(text)
    return {"summary": summary}

**Citation/Reference:**


*   FastAPI Documentation: https://fastapi.tiangolo.com/
*   Uvicorn Documentation: https://www.uvicorn.org/



# **Block 10: Main Execution Flow**

In [None]:
if __name__ == "__main__":
    # 1. Load and prepare a small subset of data
    datasets_dict = load_summarization_datasets()
    processed = prepare_datasets(datasets_dict)

    # 2. Concatenate the small subsets
    train_dataset = concatenate_datasets([d["train"] for d in processed.values()])
    val_dataset = concatenate_datasets([d["val"] for d in processed.values()])

    # 3. Hyperparameter optimization with fewer trials and a small sample
    summarizer = HybridSummarizer(model, tokenizer)
    logger.info("Starting hyperparameter optimization (quick run)...")
    best_params = optimize_hyperparameters(summarizer, val_dataset)
    # Overlay the tuned values on the existing settings rather than replacing
    # the dict, so any key the study did not return keeps its default.
    summarizer.best_params = {**summarizer.best_params, **best_params}
    logger.info(f"Best Hyperparameters: {best_params}")

    # 4. Interactive demo in Colab (enter a short news article)
    input_text = input("Enter news article to summarize:\n")
    if not input_text.strip():
        # Nothing to summarize; skip generation and evaluation.
        print("No article entered; skipping the demo.")
    else:
        summary = summarizer.summarize(input_text)
        print(f"\nOptimized Summary:\n{summary}")

        ref = input("Enter reference summary for evaluation (or press Enter to skip):\n")
        if ref.strip():
            evaluator = EnhancedEvaluator()
            scores = evaluator.evaluate(summary, ref)
            print(f"\nEvaluation Scores:\n{scores}")

    # 5. (Optional) Fine-tuning call – uncomment if desired.
    # trainer = train_model(train_dataset, val_dataset)

    # 6. (Optional) To run the FastAPI server in Colab, use ngrok or similar.

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/3803957 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/189651 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1951 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[I 2025-03-26 17:09:04,965] A new study created in memory with name: no-name-3757ed87-0c99-433c-ad27-38fca18fbac0


  0%|          | 0/6 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[I 2025-03-26 17:10:21,057] Trial 0 finished with value: 0.17296341275517108 and parameters: {'max_length': 82, 'min_length': 66, 'num_key_sentences': 5, 'temperature': 1.1344346758873165, 'length_penalty': 1.061317550004658}. Best is trial 0 with value: 0.17296341275517108.
[I 2025-03-26 17:11:14,393] Trial 1 finished with value: 0.17296341275517108 and parameters: {'max_length': 81, 'min_length': 62, 'num_key_sentences': 2, 'temperature': 0.9057198445572194, 'length_penalty': 1.1870590600933864}. Best is trial 0 with value: 0.17296341275517108.
[I 2025-03-26 17:12:08,419] Trial 2 finished with value: 0.17296341275517108 and parameters: {'max_length': 120, 'min_length': 31, 'num_key_sentences': 4, 'temperature': 1.111876565077993, 'length_penalty': 1.834805845313566}. Best is trial 0 with value: 0.17296341275517108.
[I 2025-03-26 17:13:02,400] Trial 3 finished with value: 0.17296341275517108 and parameters: {'max_length': 80, 'min_length': 54, 'num_key_sentences': 4, 'temperature': 0.




Optimized Summary:
The central bank has decided to lower interest rates for the first time in over a decade. The decision comes amid rising inflation concerns and a noticeable slowdown in consumer spending. Financial experts predict that this move may boost investments and stimulate the housing market. Meanwhile, government officials expressed cautious optimism, noting that further structural reforms will be essential to sustain a long-term economic recovery.
Enter reference summary for evaluation (or press Enter to skip):
"Reuters reports that the central bank lowered interest rates for the first time in over a decade to stimulate economic growth amid rising inflation and slowing consumer spending. Financial experts expect the move to boost investments and stimulate the housing market, while government officials remain cautiously optimistic and call for further reforms to sustain long-term recovery."


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluation Scores:
{'rouge1': np.float64(0.6991869918699187), 'rouge2': np.float64(0.4297520661157025), 'rougeL': np.float64(0.6341463414634148), 'bertscore_f1': 0.9466803669929504, 'meteor': np.float64(0.6489839319470699)}
