In [None]:
from google.colab import drive
drive.mount ('/content/drive')

Mounted at /content/drive


In [6]:
# ---------------------------
# PART C: Transformer-Based Seq2Seq Models for Title Generation
# ---------------------------

# ---------------------------
# Install Required Libraries
# ---------------------------
!pip install datasets evaluate rouge_score --quiet
!pip install --upgrade transformers --quiet

# ---------------------------
# Import Libraries
# ---------------------------
import os
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# ---------------------------
# Check device
# ---------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------
# Load Dataset
# ---------------------------
train_df = pd.read_csv('/content/train_processed (1).csv')
val_df   = pd.read_csv('/content/val_processed (1).csv')
test_df  = pd.read_csv('/content/test_processed (1).csv')

train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)
test_dataset  = Dataset.from_pandas(test_df)

# ---------------------------
# ROUGE Evaluation Setup
# ---------------------------
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds  = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }

# ---------------------------
# Section C1: Fine-tuning T5-Small
# ---------------------------
model_checkpoint    = "t5-small"
tokenizer           = AutoTokenizer.from_pretrained(model_checkpoint)
model               = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

max_input_length    = 512
max_target_length   = 64

def preprocess_function(examples):
    inputs  = examples["text"]
    targets = examples["title"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val   = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)
tokenized_test  = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
    generation_max_length=max_target_length,
    logging_steps=50,
    fp16=torch.cuda.is_available()
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Fine‑tune
trainer.train()

# ---------------------------
# Evaluation on Test Set – Greedy Decoding
# ---------------------------
print("▶ Generating T5 predictions (greedy decoding)...")
greedy_preds = trainer.predict(tokenized_test)
rouge_greedy = compute_metrics((greedy_preds.predictions, greedy_preds.label_ids))
print("ROUGE scores with greedy decoding (T5):", rouge_greedy)

# Decode and print first 5 greedy predictions
decoded_preds_greedy = tokenizer.batch_decode(greedy_preds.predictions, skip_special_tokens=True)
print("\n--- Sample Greedy Predictions (T5-small) ---")
for i, pred in enumerate(decoded_preds_greedy[:5], 1):
    print(f"{i}. {pred}")

# Save greedy predictions
with open("t5_greedy_predictions.txt", "w") as f:
    for pred in decoded_preds_greedy:
        f.write(pred + "\n")

# ---------------------------
# Evaluation on Test Set – Beam Search
# ---------------------------
print("\n▶ Generating T5 predictions (beam search)...")
# Prepare padded batch
batch = tokenizer(
    test_df["text"].tolist(),
    max_length=max_input_length,
    truncation=True,
    padding="longest",
    return_tensors="pt"
).to(device)

beam_outputs = model.generate(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    max_length=max_target_length,
    num_beams=5,
    early_stopping=True
)

decoded_preds_beam = tokenizer.batch_decode(beam_outputs, skip_special_tokens=True)
decoded_refs       = test_df["title"].tolist()
beam_rouge         = rouge.compute(predictions=decoded_preds_beam, references=decoded_refs)
print("ROUGE scores with beam search (T5):", beam_rouge)

# Print first 5 beam predictions
print("\n--- Sample Beam Search Predictions (T5-small) ---")
for i, pred in enumerate(decoded_preds_beam[:5], 1):
    print(f"{i}. {pred}")

# Save beam predictions
with open("t5_beam_predictions.txt", "w") as f:
    for pred in decoded_preds_beam:
        f.write(pred + "\n")

# ---------------------------
# Section C2: Prompt Engineering with Flan-T5
# ---------------------------
flan_models = {
    "flan-t5-base":  "google/flan-t5-base",
    "flan-t5-large": "google/flan-t5-large"
}

prompt_variations = [
    "Generate a concise title for the following Wikipedia article: ",
    "Based on the content below, provide a brief and relevant title: "
]

print("\n▶ Generating predictions using Flan‑T5 models...")
for name, checkpoint in flan_models.items():
    flan_tok = AutoTokenizer.from_pretrained(checkpoint)
    flan_mod = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

    preds, refs = [], []
    print(f"\n--- {name} ---")
    for prompt in prompt_variations:
        print(f"\nPrompt: {prompt}")
        for idx, (text, ref) in enumerate(zip(test_df["text"][:5], test_df["title"][:5]), 1):
            inputs = flan_tok(
                prompt + text,
                return_tensors="pt",
                max_length=max_input_length,
                truncation=True
            ).to(device)
            outputs = flan_mod.generate(**inputs, max_length=max_target_length, num_beams=5, early_stopping=True)
            pred = flan_tok.decode(outputs[0], skip_special_tokens=True)
            print(f"{idx}. Predicted: {pred}")
            print(f"   Reference: {ref}")
            preds.append(pred)
            refs.append(ref)

    # Compute full ROUGE over all 100 samples
    for text, ref in zip(test_df["text"][5:100], test_df["title"][5:100]):
        inputs = flan_tok(
            prompt + text,
            return_tensors="pt",
            max_length=max_input_length,
            truncation=True
        ).to(device)
        outputs = flan_mod.generate(**inputs, max_length=max_target_length, num_beams=5, early_stopping=True)
        preds.append(flan_tok.decode(outputs[0], skip_special_tokens=True))
        refs.append(ref)

    scores = rouge.compute(predictions=preds, references=refs)
    print(f"\nROUGE scores for {name}: {scores}")

    # Save predictions
    with open(f"{name}_predictions.txt", "w") as f:
        for p in preds:
            f.write(p + "\n")

print("\n✅ All Section C tasks completed successfully.")


[0m

Map:   0%|          | 0/13379 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss
50,4.6108
100,3.0064
150,2.5721
200,2.2169
250,2.2396
300,2.1297
350,1.9276
400,1.9517
450,1.9133
500,1.8537


▶ Generating T5 predictions (greedy decoding)...


ROUGE scores with greedy decoding (T5): {'rouge1': np.float64(0.7380367965367967), 'rouge2': np.float64(0.47051587301587294), 'rougeL': np.float64(0.7341991341991341)}

--- Sample Greedy Predictions (T5-small) ---
1. Weyburn, Saskatchewan
2. Catholic High School
3. Minnesota Golden Gopher
4. Charles, Louisiana
5. Theobald

▶ Generating T5 predictions (beam search)...
ROUGE scores with beam search (T5): {'rouge1': np.float64(0.7485699855699856), 'rouge2': np.float64(0.47050793650793643), 'rougeL': np.float64(0.7436035353535351), 'rougeLsum': np.float64(0.7434884559884558)}

--- Sample Beam Search Predictions (T5-small) ---
1. Weyburn, Saskatchewan
2. Catholic High School
3. Minnesota Golden Gopher
4. Charles, Louisiana
5. Theobald

▶ Generating predictions using Flan‑T5 models...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


--- flan-t5-base ---

Prompt: Generate a concise title for the following Wikipedia article: 
1. Predicted: weyburn
   Reference: Weyburn
2. Predicted: catholic high school singapore offers chinesemedium high certificate
   Reference: Catholic High School, Singapore
3. Predicted: minnesota golden gopher
   Reference: Minnesota Golden Gophers
4. Predicted: nnamdi asomugha
   Reference: List of people from Louisiana
5. Predicted: theobald
   Reference: Theobald

Prompt: Based on the content below, provide a brief and relevant title: 
1. Predicted: Weyburn is the eleventhlargest city in saskatchewan
   Reference: Weyburn
2. Predicted: catholic high school singapore offered chinesemedium senior high certificate englishmedium cambridge
   Reference: Catholic High School, Singapore
3. Predicted: minnesota golden gopher wrestling team ncaa national championship big ten team title
   Reference: Minnesota Golden Gophers
4. Predicted: nnamdi aso: nnamdi aso
   Reference: List of people from Loui

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


--- flan-t5-large ---

Prompt: Generate a concise title for the following Wikipedia article: 
1. Predicted: weyburn
   Reference: Weyburn
2. Predicted: catholic high school
   Reference: Catholic High School, Singapore
3. Predicted: Minnesota Golden Gophers
   Reference: Minnesota Golden Gophers
4. Predicted: List of notable people born in louisiana
   Reference: List of people from Louisiana
5. Predicted: Theobald
   Reference: Theobald

Prompt: Based on the content below, provide a brief and relevant title: 
1. Predicted: weyburn -lrb- saskatchewan -rrb-
   Reference: Weyburn
2. Predicted: history of catholic schools in singapore
   Reference: Catholic High School, Singapore
3. Predicted: minnesota golden gopher
   Reference: Minnesota Golden Gophers
4. Predicted: following notable people either born raised lived significant period time
   Reference: List of people from Louisiana
5. Predicted: theobald
   Reference: Theobald

ROUGE scores for flan-t5-large: {'rouge1': np.float64(0.5