In [7]:
# Step - 1
!pip install transformers datasets evaluate --quiet

# Step 2
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, get_scheduler
from datasets import Dataset, load_metric
import pandas as pd

import pandas as pd
from datasets import Dataset

# Load with low_memory warning suppressed
df = pd.read_csv("processed_batch_0.csv", low_memory=False).dropna()

# Show column names to inspect
print(df.columns.tolist())


if "processed" in df.columns:
    df = df.rename(columns={"processed": "highlights"})

# Double-check required columns exist
assert "article" in df.columns and "highlights" in df.columns, "Missing required columns."

# Create HuggingFace Dataset
dataset = Dataset.from_pandas(df[["article", "highlights"]])
train_valid = dataset.train_test_split(test_size=0.3)

train_valid = dataset.train_test_split(test_size=0.3)

# Step 4: Tokenizer and Base Model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Step 5: Pointer Generator Layer
class PointerGenerator(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.linear = nn.Linear(hidden_dim * 2, 1)

    def forward(self, decoder_hidden, context_vector):
        combined = torch.cat((decoder_hidden, context_vector), dim=-1)
        p_gen = torch.sigmoid(self.linear(combined))
        return p_gen

# Step 6: Combine BART + PGN
class BartWithPointerGenerator(nn.Module):
    def __init__(self, bart_model):
        super().__init__()
        self.bart = bart_model
        self.pointer = PointerGenerator(hidden_dim=1024)

    def forward(self, input_ids, attention_mask, decoder_input_ids, labels=None):
        encoder_outputs = self.bart.model.encoder(input_ids, attention_mask=attention_mask)
        decoder_outputs = self.bart.model.decoder(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            encoder_attention_mask=attention_mask
        )
        logits = self.bart.lm_head(decoder_outputs.last_hidden_state)

        p_gen = self.pointer(
            decoder_outputs.last_hidden_state[:, -1, :],
            encoder_outputs.last_hidden_state[:, 0, :]
        )

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

        return {"loss": loss, "logits": logits, "p_gen": p_gen}

# Step 7: Preprocessing Function
def preprocess(example):
    model_inputs = tokenizer(example["article"], truncation=True, padding="max_length", max_length=768)
    labels = tokenizer(example["highlights"], truncation=True, padding="max_length", max_length=256)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_valid["train"].map(preprocess, batched=True)
valid_dataset = train_valid["test"].map(preprocess, batched=True)

# Step 8: DataLoader Setup
def collate_fn(batch):
    return {
        'input_ids': torch.tensor([f["input_ids"] for f in batch]),
        'attention_mask': torch.tensor([f["attention_mask"] for f in batch]),
        'decoder_input_ids': torch.tensor([f["labels"][:-1] for f in batch]),
        'labels': torch.tensor([f["labels"][1:] for f in batch])
    }

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# Step 9: Initialize Model and Training Tools
model = BartWithPointerGenerator(bart_model).to("cuda" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)
scaler = torch.cuda.amp.GradScaler()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 10: Training Loop
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.cuda.amp.autocast():
            outputs = model(**batch)
            loss = outputs["loss"]

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

# Step 11: ROUGE Evaluation
# from evaluate import load as load_metric
# metric = load_metric("rouge")

# model.eval()
# predictions, references = [], []

# with torch.no_grad():
#     for batch in valid_loader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)

#         generated_ids = model.bart.generate(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             max_length=256,
#             num_beams=4,
#             early_stopping=True
#         )

#         decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
#         decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
#         predictions.extend(decoded_preds)
#         references.extend(decoded_labels)

# results = metric.compute(predictions=predictions, references=references, use_stemmer=True)
# for k, v in results.items():
#     print(f"{k}: {v.mid.fmeasure:.4f}")


['article', 'abstract', 'processed']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Map:   0%|          | 0/702 [00:00<?, ? examples/s]

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1: Loss = 1.7472
Epoch 2: Loss = 1.3230
Epoch 3: Loss = 1.0399


In [8]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=6ed37504b04c110f01d8d010b40f8ea9a76d29f78154794c6ba20eceeb93c59f
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [19]:
from evaluate import load as load_metric
metric = load_metric("rouge")

model.eval()
predictions, references = [], []

with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        generated_ids = model.bart.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

        decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
        predictions.extend(decoded_preds)
        references.extend(decoded_labels)

results = metric.compute(predictions=predictions, references=references, use_stemmer=True)


In [12]:
print(results)

{'rouge1': np.float64(0.5285681344226997), 'rouge2': np.float64(0.40000699784359617), 'rougeL': np.float64(0.42627786571099957), 'rougeLsum': np.float64(0.42639267112772233)}


In [None]:
# Inference Example
def summarize(text, model, tokenizer, max_length=1024):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)

    with torch.no_grad():
        summary_ids = model.bart.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            min_length=50,
            num_beams=6,
            length_penalty=2.0,
            early_stopping=True
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test Inference
sample_text = dataset[0]["article"]
print("Generated Summary:\n", summarize(sample_text, model, tokenizer))
