In [None]:
!pip install rouge_score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
!pip install transformers datasets torch rouge_score pandas numpy -q

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

# NOTE: this cell only disables Weights & Biases logging; the complete training code is in the main cell below.

# Text Summarization using FLAN-T5-Base

**Complete Solution**


In [None]:
"""
Task 3: Text Summarization using FLAN-T5-Base
Complete Solution - Ready to Run
"""

# Disable wandb
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import Dataset
from rouge_score import rouge_scorer

# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
print("Loading datasets...")

# Single place to edit when the CSVs live elsewhere.
# NOTE(review): the path looks like a mounted Google Drive in Colab — confirm
# the drive is mounted before this cell runs.
DATA_DIR = '/content/drive/MyDrive/nlp-p2/cnn_dailymail'
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError(
        f"Data directory not found: {DATA_DIR!r}. Mount the drive or update DATA_DIR."
    )

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
val_df = pd.read_csv(os.path.join(DATA_DIR, 'validation.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
print("Columns:", train_df.columns.tolist())

# Column holding the source document and column holding the reference summary.
text_col = 'article'
summary_col = 'highlights'

# Fail fast with a readable message if a split lacks a column used downstream.
for split_name, split_df in (("train", train_df), ("validation", val_df), ("test", test_df)):
    missing_cols = {text_col, summary_col} - set(split_df.columns)
    if missing_cols:
        raise KeyError(f"{split_name}.csv is missing expected column(s): {sorted(missing_cols)}")

# ============================================================================
# STEP 2: PREPROCESSING
# ============================================================================
print("\nPreprocessing...")

def clean_data(df, text_col, summary_col):
    """Keep only usable (text, summary) pairs.

    Restricts ``df`` to the two named columns, drops rows with a missing value
    in either one, and keeps rows whose text is longer than 100 characters and
    whose summary is longer than 10 characters.

    Returns:
        A new DataFrame with just the two columns and a fresh 0..n-1 index.
    """
    pairs = df[[text_col, summary_col]].dropna()
    text_long_enough = pairs[text_col].str.len() > 100
    summary_long_enough = pairs[summary_col].str.len() > 10
    kept = pairs[text_long_enough & summary_long_enough]
    return kept.reset_index(drop=True)

# Run every split through the same cleaning step.
train_df, val_df, test_df = (
    clean_data(split, text_col, summary_col)
    for split in (train_df, val_df, test_df)
)

# Train and validate on a leading slice only, so a run finishes quickly.
train_df = train_df.iloc[:10000]
val_df = val_df.iloc[:1000]

print(f"After preprocessing - Train: {len(train_df)}, Val: {len(val_df)}")

# ============================================================================
# STEP 3: LOAD MODEL
# ============================================================================
print("\nLoading FLAN-T5-Base...")

# Tokenizer first, then the seq2seq weights, both from the same checkpoint name.
model_name = "google/flan-t5-base"
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

print("Model loaded successfully!")

# ============================================================================
# STEP 4: TOKENIZATION
# ============================================================================
print("\nTokenizing datasets...")

def preprocess_function(examples):
    """Tokenize one batch of documents and reference summaries.

    Reads the columns named by the module-level ``text_col`` and
    ``summary_col``. Every document is prefixed with "summarize: " and
    tokenized with truncation at ``max_length=512``; the summaries are
    tokenized as targets with truncation at ``max_length=128`` and their
    ``input_ids`` are stored under the "labels" key of the returned encoding.
    """
    prompts = []
    for doc in examples[text_col]:
        prompts.append("summarize: " + doc)

    encoded = tokenizer(prompts, truncation=True, max_length=512)
    targets = tokenizer(text_target=examples[summary_col], truncation=True, max_length=128)
    encoded["labels"] = targets["input_ids"]
    return encoded

# Wrap the two relevant columns of each split in a `datasets.Dataset`.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[[text_col, summary_col]])
    for frame in (train_df, val_df, test_df)
)

# Tokenize each split in batches.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

print("Tokenization complete!")

# ============================================================================
# STEP 5: DATA COLLATOR
# ============================================================================
# The collator receives the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

# ============================================================================
# STEP 6: METRICS
# ============================================================================
# One shared scorer, stemming enabled, covering ROUGE-1, ROUGE-2 and ROUGE-L.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

def compute_metrics(eval_preds):
    """Compute mean ROUGE-1/2/L F-measures, scaled to 0-100, for one evaluation pass.

    Uses the module-level ``tokenizer`` to decode ids and the module-level
    ``scorer`` to score each (reference, prediction) pair.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            Predictions may arrive wrapped in a tuple, and either array may
            be padded with ``-100``.

    Returns:
        dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` holding
        plain floats; every value is 0.0 when there is nothing to score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Keep only the first element, taken to be the token ids.
        preds = preds[0]

    # -100 is not a real token id, so decoding it fails; swap it for the pad
    # token in BOTH arrays (the original only did this for the labels).
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    metric_names = ('rouge1', 'rouge2', 'rougeL')
    collected = {name: [] for name in metric_names}
    for pred, label in zip(decoded_preds, decoded_labels):
        scores = scorer.score(label, pred)
        for name in metric_names:
            collected[name].append(scores[name].fmeasure)

    # Averaging an empty list would give NaN (plus a warning); report 0.0 instead.
    return {
        name: float(np.mean(values)) * 100 if values else 0.0
        for name, values in collected.items()
    }

# ============================================================================
# STEP 7: TRAINING SETUP
# ============================================================================
# The recorded run with fp16=True logged a training loss of 0.0, no validation
# loss, and identical ROUGE after every epoch, i.e. the weights never changed.
# That is the signature of fp16 overflow, a known problem with T5-family
# checkpoints. Use bf16 where the GPU supports it and full precision otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    # Let evaluation generate summaries as long as the 128-token label limit
    # instead of the model's much shorter default generation length.
    generation_max_length=128,
    fp16=False,
    bf16=use_bf16,
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Pick the best checkpoint by ROUGE-L (higher is better) rather than by loss.
    metric_for_best_model="rougeL",
    greater_is_better=True,
    report_to="none",
)

# `processing_class` replaces the deprecated `tokenizer` argument; passing
# `tokenizer=` is what triggered the warning shown in this cell's output.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ============================================================================
# STEP 8: TRAIN
# ============================================================================
rule = "=" * 60
print("\n" + rule)
print("STARTING TRAINING")
print(rule + "\n")

# Run the fine-tuning loop.
trainer.train()

print("\nTraining complete!")

# ============================================================================
# STEP 9: EVALUATE
# ============================================================================
rule = "=" * 60
print("\n" + rule)
print("EVALUATION")
print(rule + "\n")

# Score the trainer's evaluation dataset.
val_results = trainer.evaluate()

print("\nValidation Results:")
for label, key in (
    ("ROUGE-1", 'eval_rouge1'),
    ("ROUGE-2", 'eval_rouge2'),
    ("ROUGE-L", 'eval_rougeL'),
):
    print(f"{label}: {val_results[key]:.2f}")

# ============================================================================
# STEP 10: SAVE MODEL
# ============================================================================
# Model and tokenizer are written to the same directory.
save_dir = "./flan-t5-summarizer"

print("\nSaving model...")
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(save_dir)
print("Model saved!")

# ============================================================================
# STEP 11: TEST PREDICTIONS
# ============================================================================
rule = "=" * 60
print("\n" + rule)
print("SAMPLE PREDICTIONS")
print(rule + "\n")

# Prefer the GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

def generate_summary(text):
    """Generate a summary of ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Document to summarize. It is prefixed with "summarize: " and
            tokenized with truncation at ``max_length=512``.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens removed.
    """
    inputs = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)

    outputs = model.generate(
        **inputs,  # forward the attention mask too, not only input_ids
        max_length=128,
        num_beams=4,
        # The recorded sample output repeated a whole sentence; forbid repeated trigrams.
        no_repeat_ngram_size=3,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the first three test documents next to their reference and generated summaries.
for example_no in range(1, 4):
    row = test_df.iloc[example_no - 1]
    article, actual = row[text_col], row[summary_col]
    predicted = generate_summary(article)

    print(f"\nExample {example_no}:")
    print(f"Article: {article[:200]}...")
    print(f"\nActual: {actual}")
    print(f"Predicted: {predicted}")
    print("-" * 60)

print("\n✓ TASK 3 COMPLETE!")

Loading datasets...
Train: 287113, Val: 13368, Test: 11490
Columns: ['id', 'article', 'highlights']

Preprocessing...
After preprocessing - Train: 10000, Val: 1000

Loading FLAN-T5-Base...
Model loaded successfully!

Tokenizing datasets...


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Tokenization complete!


  trainer = Seq2SeqTrainer(



STARTING TRAINING



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,0.0,,23.250694,10.376971,18.88568
2,0.0,,23.250694,10.376971,18.88568
3,0.0,,23.250694,10.376971,18.88568



Training complete!

EVALUATION




Validation Results:
ROUGE-1: 23.25
ROUGE-2: 10.38
ROUGE-L: 18.89

Saving model...
Model saved!

SAMPLE PREDICTIONS


Example 1:
Article: Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting p...

Actual: Experts question if  packed out planes are putting passengers at risk .
U.S consumer advisory group says minimum space must be stipulated .
Safety tests conducted on planes with more leg room than airlines offer .
Predicted: Experts say the shrinking space on planes is not only uncomfortable - it's putting our health and safety in danger. They say the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger.
------------------------------------------------------------

Example 2:
Article: A drunk teenage boy had to be rescued by security after jumping into a lions' enclosure at a zoo in western I