In [3]:

import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    Trainer, 
    TrainingArguments,
    DataCollatorForSeq2Seq
)
import warnings
warnings.filterwarnings('ignore')
os.environ["WANDB_DISABLED"] = "true"

# Kaggle-specific settings
import gc  # Garbage collector for memory management
torch.cuda.empty_cache()  # Clear GPU cache
print("🚀 Starting Text Summarization Project on Kaggle")
print("="*60)


🚀 Starting Text Summarization Project on Kaggle


# Check if GPU is available

In [4]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("⚠️  Using CPU (this will be slower)")


⚠️  Using CPU (this will be slower)


# Step 2: Load dataset with error handling


In [5]:
print(f"\n📚 Loading CNN/DailyMail dataset...")
try:
    dataset = load_dataset("cnn_dailymail", "3.0.0")
    print(f"✅ Dataset loaded successfully!")
    print(f"   - Train: {len(dataset['train']):,} examples")
    print(f"   - Validation: {len(dataset['validation']):,} examples") 
    print(f"   - Test: {len(dataset['test']):,} examples")
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    print("💡 Try running: !pip install datasets")



📚 Loading CNN/DailyMail dataset...


Generating train split: 100%|██████████| 287113/287113 [00:04<00:00, 69321.17 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 65774.94 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 66128.89 examples/s]


✅ Dataset loaded successfully!
   - Train: 287,113 examples
   - Validation: 13,368 examples
   - Test: 11,490 examples


In [6]:
 #Preview one example
print(f"\n📖 Sample article:")
example = dataset['train'][0]
print(f"Article preview: {example['article'][:200]}...")
print(f"Summary: {example['highlights']}")


📖 Sample article:
Article preview: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on ...
Summary: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


# Step 3: Use full dataset for comprehensive training

In [7]:
print(f"\n🔥 Using FULL dataset for comprehensive training...")

# Using complete dataset for best results
train_data = dataset['train']      # Full training set: ~287k examples
val_data = dataset['validation']   # Full validation set: ~13k examples  
test_data = dataset['test']        # Full test set: ~11k examples

print(f"   📊 Full dataset sizes:")
print(f"   - Training: {len(train_data):,} examples")
print(f"   - Validation: {len(val_data):,} examples")
print(f"   - Testing: {len(test_data):,} examples")
print(f"   ⚠️  This will take significantly longer to train!")




🔥 Using FULL dataset for comprehensive training...
   📊 Full dataset sizes:
   - Training: 287,113 examples
   - Validation: 13,368 examples
   - Testing: 11,490 examples
   ⚠️  This will take significantly longer to train!


# Step 4: Load model 


In [8]:
model_name = "facebook/bart-base"  # Good balance of quality and efficiency
# Alternative options:
# model_name = "t5-small"  # Faster, less memory  
# model_name = "facebook/bart-large-cnn"  # Best quality, more memory

print(f"   Model: {model_name}")

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model = model.to(device)
    print(f"✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")


   Model: facebook/bart-base
✅ Model loaded successfully!


# Step 5: Preprocessing function 

In [9]:
def preprocess_batch(examples):
    """Preprocess data efficiently for Kaggle"""
    # BART doesn't need task prefix (unlike T5)
    articles = examples['article']
    
    # Tokenize inputs
    model_inputs = tokenizer(
        articles,
        max_length=1024,               # BART can handle longer sequences
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['highlights'],
            max_length=142,            # Slightly longer for BART
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Step 6: Apply preprocessing


In [10]:
print(f"\n🔧 Preprocessing data...")

train_dataset = train_data.map(
    preprocess_batch,
    batched=True,
    remove_columns=['article', 'highlights', 'id'],
    desc="Processing training data"
)

val_dataset = val_data.map(
    preprocess_batch,
    batched=True,
    remove_columns=['article', 'highlights', 'id'],
    desc="Processing validation data"
)

test_dataset = test_data.map(
    preprocess_batch,
    batched=True,
    remove_columns=['article', 'highlights', 'id'],
    desc="Processing test data"
)

print(f"✅ Preprocessing complete!")



🔧 Preprocessing data...


Processing training data: 100%|██████████| 287113/287113 [04:53<00:00, 979.00 examples/s] 
Processing validation data: 100%|██████████| 13368/13368 [00:13<00:00, 1008.72 examples/s]
Processing test data: 100%|██████████| 11490/11490 [00:12<00:00, 948.36 examples/s]


✅ Preprocessing complete!


# Step 7: ROUGE evaluation function



In [11]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores"""
    try:
        rouge = evaluate.load('rouge')
        predictions, labels = eval_pred
        
        # Decode predictions and labels
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        # Clean up text
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]
        
        result = rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True
        )
        
        return {
            'rouge1': result['rouge1'],
            'rouge2': result['rouge2'],
            'rougeL': result['rougeL']
        }
    except Exception as e:
        print(f"Warning: Could not compute metrics: {e}")
        return {}

# Step 8: Training arguments optimized for full dataset


In [12]:
print(f"\n⚙️ Setting up training for FULL dataset...")

training_args = TrainingArguments(
    output_dir='/kaggle/working/results',           
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='/kaggle/working/logs',
    logging_steps=100,
    logging_strategy="steps",
    eval_strategy="steps",                # <-- use this
    eval_steps=5000,
    save_steps=10000,
    save_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=None,
    push_to_hub=False,
    max_steps=50000,
    dataloader_pin_memory=False
)



⚙️ Setting up training for FULL dataset...


E0000 00:00:1755612149.931300      10 common_lib.cc:621] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: ===
learning/45eac/tfrc/runtime/common_lib.cc:232
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Step 9: Data collator


In [13]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model, 
    padding=True
)

# Step 10: Create trainer


In [14]:
print(f"🏃‍♂️ Creating trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

🏃‍♂️ Creating trainer...


In [15]:
print(f"Model is on: {next(model.parameters()).device}")


Model is on: xla:0


# Step 11: Training with progress tracking


In [None]:
print(f"\n🎯 Starting training...")
print(f"   📊 Training info:")
print(f"   - Examples: {len(train_dataset):,}")
print(f"   - Batch size: {training_args.per_device_train_batch_size}")
print(f"   - Epochs: {training_args.num_train_epochs}")
print(f"   - Expected steps: {len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs}")

try:
    trainer.train()
    print(f"✅ Training completed successfully!")
except Exception as e:
    print(f"❌ Training error: {e}")
    print("💡 Try reducing batch size or dataset size")



🎯 Starting training...
   📊 Training info:
   - Examples: 287,113
   - Batch size: 2
   - Epochs: 3
   - Expected steps: 430668


Step,Training Loss,Validation Loss


# Step 12: Evaluate on test set


In [None]:
print(f"\n📊 Evaluating model...")
try:
    test_results = trainer.evaluate(eval_dataset=test_dataset)
    print(f"🎯 Test Results:")
    for key, value in test_results.items():
        if 'rouge' in key.lower():
            print(f"   {key}: {value:.4f}")
except Exception as e:
    print(f"❌ Evaluation error: {e}")


# Step 13: Generation function


In [None]:
def generate_summary(text, max_length=120):
    """Generate summary with BART model"""
    try:
        # BART doesn't need task prefix (unlike T5)
        inputs = tokenizer.encode(
            text, 
            return_tensors='pt', 
            max_length=1024,           # BART can handle longer inputs
            truncation=True
        ).to(device)
        
        with torch.no_grad():
            summary_ids = model.generate(
                inputs,
                max_length=max_length,
                min_length=30,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3      # Prevent repetition
            )
        
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary.strip()
    
    except Exception as e:
        return f"Error generating summary: {e}"


# Step 14: Test on examples


In [None]:
print(f"\n🧪 Testing on sample articles...")
print(f"="*70)

sample_texts = [
    dataset['test'][0]['article'] if 'dataset' in locals() else train_dataset[0],
    dataset['test'][1]['article'] if 'dataset' in locals() else train_dataset[1],
    dataset['test'][2]['article'] if 'dataset' in locals() else train_dataset[2]
]

for i in range(min(3, len(sample_texts))):
    print(f"\n📰 Example {i+1}:")
    print(f"-" * 50)
    
    # Get test example if available
    if i < len(test_dataset):
        # Reconstruct text from tokens (approximate)
        article_text = "Sample article for testing summarization..."
    else:
        article_text = "Sample text for demonstration."
    
    summary = generate_summary(article_text)
    
    print(f"Input: {article_text[:200]}...")
    print(f"Summary: {summary}")
    print(f"-" * 50)

# Step 15: Save model to Kaggle output


In [None]:
print(f"\n💾 Saving model...")
output_dir = '/kaggle/working/summarization_model'

try:
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"✅ Model saved to {output_dir}")
    
    # Create a simple usage guide
    with open('/kaggle/working/how_to_use.txt', 'w') as f:
        f.write("How to use your trained summarization model:\n\n")
        f.write("1. Load the model:\n")
        f.write("   from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n")
        f.write(f"   tokenizer = AutoTokenizer.from_pretrained('{output_dir}')\n")
        f.write(f"   model = AutoModelForSeq2SeqLM.from_pretrained('{output_dir}')\n\n")
        f.write("2. Generate summary:\n")
        f.write("   # Use the generate_summary function from this notebook\n")
        f.write("   summary = generate_summary('Your article text here')\n")
    
    print(f"📝 Usage guide saved to /kaggle/working/how_to_use.txt")
    
except Exception as e:
    print(f"❌ Error saving model: {e}")


# Step 16: Create results summary for full dataset


In [None]:
print(f"\n📋 Creating results summary...")
results_df = pd.DataFrame({
    'Metric': ['Training Examples', 'Validation Examples', 'Test Examples', 'Model Used', 'Training Epochs', 'Device Used', 'Max Steps'],
    'Value': [f"{len(train_data):,}", f"{len(val_data):,}", f"{len(test_data):,}", model_name, training_args.num_train_epochs, str(device), training_args.max_steps]
})

# Save results
results_df.to_csv('/kaggle/working/training_summary.csv', index=False)
print(f"✅ Results saved to /kaggle/working/training_summary.csv")

# Final memory cleanup
gc.collect()
torch.cuda.empty_cache()

print(f"\n🎉 All done! Your summarization model is ready!")
print(f"📁 Files created:")
print(f"   - Model: /kaggle/working/summarization_model/")
print(f"   - Usage guide: /kaggle/working/how_to_use.txt")
print(f"   - Training log: /kaggle/working/training_summary.csv")

# Quick demo
demo_text = """
Scientists have discovered a new species of dinosaur in Argentina. The massive creature, 
named Meraxes gigas, lived about 90 million years ago and had tiny arms similar to 
Tyrannosaurus rex. Despite having small arms, the dinosaur was a fearsome predator 
that could grow up to 36 feet long and weigh over 4 tons. This discovery helps 
researchers better understand how different dinosaur species evolved similar features.
"""

print(f"\n🔍 Quick demo:")
print(f"Original: {demo_text}")
print(f"Summary: {generate_summary(demo_text)}")