# T5-small Full Fine-Tuning on BioLaySumm Dataset (Memory Optimized)

**Author:** Nathan Chung  
**Course:** COMP3710 Pattern Analysis  
**Task:** Expert-to-Layperson Radiology Report Translation  
**Model:** T5-small Full Fine-Tuning (60M parameters)  
**Optimized for:** Google Colab T4 GPU (15GB memory limit)


## 1. Installation and Setup


In [None]:
# Install required packages
%pip install -q transformers datasets accelerate evaluate rouge-score peft

# Mount Google Drive (optional, for backup)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("✅ Google Drive mounted successfully")
except:
    print("⚠️ Google Drive not available - continuing without backup")


In [None]:
# Import libraries
import os
import json
import shutil
import torch
import evaluate
import numpy as np
from pathlib import Path
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    GenerationConfig
)
from datasets import load_dataset
from peft import PeftModel

# Environment variables for memory optimization.
# NOTE(review): presumably these must be set before CUDA / the tokenizers are
# first used to have any effect - confirm.
os.environ.update({
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    'TOKENIZERS_PARALLELISM': 'false',
})

print("✅ Libraries imported successfully")
print(f"🔧 PyTorch version: {torch.__version__}")

# Report the accelerator (device 0) when one is visible
cuda_ready = torch.cuda.is_available()
print(f"🎯 CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"💾 GPU: {gpu_name}")
    gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"💾 GPU Memory: {gpu_bytes / 1e9:.1f} GB")


## 2. Configuration


In [None]:
# Configuration optimized for Colab T4 (15GB memory).
# The settings are grouped by concern and then merged into the single flat
# `config` dict that the rest of the notebook reads.

# Which model to fine-tune and for which task
model_settings = {
    'model_name': 't5-small',
    'task': 'expert_to_layman',
}

# Dataset source and truncation limits (kept small for memory)
dataset_settings = {
    'dataset_name': 'BioLaySumm/BioLaySumm2025-LaymanRRG-opensource-track',
    'max_source_length': 256,
    'max_target_length': 128,
    'max_samples': 10000,  # cap on dataset size for faster training
}

# Optimisation hyper-parameters (memory optimized)
training_settings = {
    'batch_size': 1,                    # minimal per-device batch
    'gradient_accumulation_steps': 16,  # keeps the effective batch at 16
    'learning_rate': 5e-4,
    'num_epochs': 2,
    'warmup_steps': 100,
    'weight_decay': 0.01,
    'max_grad_norm': 1.0,
    'eval_steps': 500,       # evaluate every 500 steps
    'save_steps': 1000,      # checkpoint every 1000 steps
    'logging_steps': 100,    # log every 100 steps
    'seed': 42,
}

# Where checkpoints and final artefacts are written
output_settings = {
    'output_dir': '/content/t5-small-full-finetuning',
    'run_name': 't5-small-biolaysumm-colab',
}

config = {
    **model_settings,
    **dataset_settings,
    **training_settings,
    **output_settings,
}

effective_batch_size = config['batch_size'] * config['gradient_accumulation_steps']

print("✅ Configuration loaded")
print(f"📊 Effective batch size: {effective_batch_size}")
print(f"📏 Max source length: {config['max_source_length']}")
print(f"📏 Max target length: {config['max_target_length']}")


## 3. Dataset Loading and Preprocessing


In [None]:
# Fetch the training split of the configured dataset
print("📥 Loading BioLaySumm dataset...")
dataset = load_dataset(config['dataset_name'], split='train')

# Optionally cap the dataset at its first `max_samples` rows for faster training
sample_cap = config['max_samples']
if sample_cap and len(dataset) > sample_cap:
    dataset = dataset.select(range(sample_cap))
    print(f"📊 Limited dataset to {len(dataset)} samples")

# Hold out 10% of the rows for validation (seeded split)
split_dataset = dataset.train_test_split(test_size=0.1, seed=config['seed'])
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

print(f"✅ Dataset loaded: {len(train_dataset)} train, {len(val_dataset)} validation samples")
print(f"📋 Sample columns: {train_dataset.column_names}")

# Preview the first training row, shortening long strings to 100 characters
sample = train_dataset[0]
print("\n📝 Sample data:")
for field, content in sample.items():
    is_long_text = isinstance(content, str) and len(content) > 100
    shown = f"{content[:100]}..." if is_long_text else content
    print(f"{field}: {shown}")


In [None]:
# Smart prompt application function
def apply_prompts(examples, *, task=None):
    """
    Build prompted model inputs and targets from raw dataset rows.

    The expert and layperson text columns are auto-detected by trying a list
    of known column names in order; the first name present wins.

    Args:
        examples: Mapping of column name to value. Either a single row
            (values are strings) or a batch (values are lists of strings),
            so the function can be used with ``Dataset.map`` with or
            without ``batched=True``.
        task: Task name. ``'expert_to_layman'`` prepends the translation
            instruction to the expert text; any other value passes the
            expert text through unchanged. When omitted, the notebook-level
            ``config['task']`` is used.

    Returns:
        dict with keys ``'input_text'`` and ``'target_text'``. The values are
        strings for a single row and lists for a batch.

    Raises:
        ValueError: If no known expert or layperson column is present.
    """
    expert_cols = ('expert_report', 'radiology_report', 'expert_summary')
    layman_cols = ('layman_report', 'layman_summary', 'layperson_summary')

    # First candidate name that exists in the row/batch, else None
    expert_col = next((col for col in expert_cols if col in examples), None)
    layman_col = next((col for col in layman_cols if col in examples), None)

    if expert_col is None or layman_col is None:
        raise ValueError(f"Could not find expert/layman columns. Available: {list(examples.keys())}")

    # Resolve the task lazily so callers that pass `task` explicitly do not
    # depend on the global configuration.
    if task is None:
        task = config['task']

    source = examples[expert_col]
    target_text = examples[layman_col]

    if task != 'expert_to_layman':
        return {'input_text': source, 'target_text': target_text}

    prefix = "Translate this medical report to layman terms: "
    if isinstance(source, (list, tuple)):
        # Batched call: prompt each report individually instead of
        # formatting the whole list into one string.
        input_text = [f"{prefix}{report}" for report in source]
    else:
        input_text = f"{prefix}{source}"

    return {'input_text': input_text, 'target_text': target_text}

# Replace the raw columns of both splits with the prompted input/target text
print("🔄 Applying prompts to datasets...")
train_dataset, val_dataset = [
    split.map(apply_prompts, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset)
]

print("✅ Prompts applied successfully")
# Preview the first 100 characters of the first prompted training example
first_prompted = train_dataset[0]
print(f"📝 Sample input: {first_prompted['input_text'][:100]}...")
print(f"📝 Sample target: {first_prompted['target_text'][:100]}...")


## 4. Model and Tokenizer Setup


In [None]:
# Load the pretrained tokenizer and seq2seq model named in the config
model_id = config['model_name']
print(f"🤖 Loading {model_id} model and tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): the weights are loaded directly in bfloat16 - confirm this is
# intended for full fine-tuning on the target GPU.
load_options = {
    'torch_dtype': torch.bfloat16,  # bfloat16 for memory efficiency
    'device_map': 'auto',           # automatic device mapping
}
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, **load_options)

# Count all parameters and the subset that will receive gradients
param_sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)

print("✅ Model loaded successfully")
print(f"📊 Total parameters: {total_params:,} ({total_params/1e6:.1f}M)")
print(f"🎯 Trainable parameters: {trainable_params:,} ({trainable_params/1e6:.1f}M)")
print(f"💾 Model dtype: {model.dtype}")

# Release cached GPU memory held by the allocator
torch.cuda.empty_cache()


In [None]:
# Tokenization function
def preprocess_function(examples):
    """
    Convert prompted text into token ids for the model.

    'input_text' is truncated to config['max_source_length'] tokens and
    'target_text' to config['max_target_length'] tokens. No padding is
    applied here. The target token ids are attached to the encoded inputs
    under the 'labels' key, and that encoding is returned.
    """
    def encode(texts, limit):
        # Shared tokenizer call: truncate to `limit`, never pad
        return tokenizer(texts, max_length=limit, truncation=True, padding=False)

    model_inputs = encode(examples['input_text'], config['max_source_length'])
    target_encoding = encode(examples['target_text'], config['max_target_length'])

    model_inputs['labels'] = target_encoding['input_ids']
    return model_inputs

# Tokenize both splits, dropping the text columns afterwards
print("🔄 Tokenizing datasets...")

# Options shared by both map calls; a single process is used for memory
shared_map_options = {
    'batched': True,
    'num_proc': 1,
}

tokenized_train = train_dataset.map(
    preprocess_function,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing training dataset",
    **shared_map_options,
)
tokenized_val = val_dataset.map(
    preprocess_function,
    remove_columns=val_dataset.column_names,
    desc="Tokenizing validation dataset",
    **shared_map_options,
)

print("✅ Datasets tokenized successfully")
for split_label, split_data in (("Train", tokenized_train), ("Validation", tokenized_val)):
    print(f"📊 {split_label} samples: {len(split_data)}")

# Release cached GPU memory held by the allocator
torch.cuda.empty_cache()


## 5. Training Setup


In [None]:
# ROUGE scorer used by the evaluation metric function
rouge = evaluate.load('rouge')

# Collator that pads each batch and returns PyTorch tensors
collator_options = {
    'tokenizer': tokenizer,
    'model': model,
    'padding': True,
    'return_tensors': "pt",
}
data_collator = DataCollatorForSeq2Seq(**collator_options)

def compute_metrics(eval_preds):
    """
    Compute ROUGE metrics for evaluation.

    Args:
        eval_preds: The trainer's evaluation output. Predictions and labels
            are read by position (index 0 and 1), so any extra elements
            (for example model inputs) are ignored rather than breaking a
            two-value unpack.

    Returns:
        dict with the 'rouge1', 'rouge2', 'rougeL' and 'rougeLsum' scores.
    """
    predictions, labels = eval_preds[0], eval_preds[1]

    # Some models return a tuple of outputs; the first entry is the one to decode.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # A 3-D array holds per-token vocabulary scores rather than token ids;
    # fall back to the highest-scoring token at each position.
    # NOTE(review): this fallback is not true generation output - enable
    # `predict_with_generate` for meaningful ROUGE scores.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is the ignore index used to pad labels (and gathered predictions);
    # it is not a valid token id, so swap it for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Keep only the four headline scores
    return {name: result[name] for name in ('rouge1', 'rouge2', 'rougeL', 'rougeLsum')}

print("✅ Training components prepared")


In [None]:
# Create output directory
output_dir = Path(config['output_dir'])
output_dir.mkdir(parents=True, exist_ok=True)

# Training arguments with aggressive memory optimizations
training_args = Seq2SeqTrainingArguments(
    output_dir=str(output_dir),
    num_train_epochs=config['num_epochs'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    gradient_accumulation_steps=config['gradient_accumulation_steps'],
    learning_rate=config['learning_rate'],
    weight_decay=config['weight_decay'],
    max_grad_norm=config['max_grad_norm'],
    warmup_steps=config['warmup_steps'],
    eval_strategy='steps',
    eval_steps=config['eval_steps'],
    save_strategy='steps',
    save_steps=config['save_steps'],
    load_best_model_at_end=False,  # Disable to save memory
    logging_steps=config['logging_steps'],
    report_to=[],  # No external logging
    seed=config['seed'],
    # NOTE(review): bf16 is kept as configured - confirm the target GPU
    # supports it natively.
    bf16=True,                    # Use bfloat16
    fp16=False,                   # Use bf16 instead
    tf32=False,                   # Disable TF32
    remove_unused_columns=False,
    save_total_limit=3,           # Keep fewer checkpoints
    # Aggressive memory optimizations
    gradient_checkpointing=True,     # Enable gradient checkpointing
    dataloader_num_workers=0,        # Disable multiprocessing
    dataloader_pin_memory=False,     # Disable pin memory
    dataloader_drop_last=True,       # Drop last incomplete batch
    prediction_loss_only=False,
    eval_accumulation_steps=1,       # Process eval in smaller chunks
    # Evaluate with real generation so the metric function receives token ids
    # rather than vocabulary-sized logits (which are also very large to keep
    # around for a whole evaluation set).
    predict_with_generate=True,
    generation_max_length=config['max_target_length'],
    # `include_inputs_for_metrics` is intentionally not set: the metric
    # function only needs predictions and labels.
    # The progress-bar switch is `disable_tqdm`; there is no
    # `dataloader_disable_tqdm` argument.
    disable_tqdm=True,               # Disable progress bars
)

# Assemble the trainer from the components prepared in the earlier cells
trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': tokenized_train,
    'eval_dataset': tokenized_val,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
    'processing_class': tokenizer,
}
trainer = Seq2SeqTrainer(**trainer_options)

# Effective batch size = per-device batch size x accumulation steps
effective_batch = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)

print("✅ Training setup complete!")
print(f"📊 Effective batch size: {effective_batch}")
print(f"💾 Output directory: {training_args.output_dir}")

# Release cached GPU memory held by the allocator
torch.cuda.empty_cache()


## 6. Training


In [None]:
# Check for existing checkpoints
def _find_latest_checkpoint(directory):
    """
    Return the `checkpoint-<step>` sub-directory with the highest step.

    Only directories whose name is exactly `checkpoint-` followed by digits
    are considered, so stray files or oddly named folders such as
    `checkpoint-backup` are skipped instead of raising ValueError.

    Args:
        directory: `pathlib.Path` to search (non-recursive).

    Returns:
        The matching `Path` with the largest step number, or None when
        there is no usable checkpoint.
    """
    import re  # local import keeps this cell self-contained

    step_pattern = re.compile(r"checkpoint-(\d+)")
    found = []
    for candidate in directory.glob("checkpoint-*"):
        match = step_pattern.fullmatch(candidate.name)
        if match and candidate.is_dir():
            found.append((int(match.group(1)), candidate))

    if not found:
        return None
    return max(found, key=lambda item: item[0])[1]


latest_checkpoint = _find_latest_checkpoint(output_dir)

if latest_checkpoint is not None:
    print(f"🔄 Found existing checkpoint: {latest_checkpoint.name}")
    resume_from_checkpoint = str(latest_checkpoint)
else:
    print("🚀 Starting fresh training...")
    resume_from_checkpoint = None

print(f"🤖 Model: {config['model_name']}")
print("📊 Strategy: Full fine-tuning (100% parameters trainable)")
print(f"📊 Total parameters: {total_params:,} ({total_params/1e6:.1f}M)")
print(f"📊 Trainable parameters: {trainable_params:,} ({trainable_params/1e6:.1f}M)")


In [None]:
# Run the training loop, resuming from a checkpoint when one was selected
print("🏋️ Starting training...")

try:
    train_results = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    print("\n✅ Training completed successfully!")
    final_loss = train_results.training_loss
    print(f"📊 Final training loss: {final_loss:.4f}")
except Exception as train_error:
    # Report the failure with a hint, then let the error propagate
    failure_lines = (
        f"\n❌ Training failed: {train_error}",
        "💡 Try reducing batch_size or max_source_length in config",
    )
    for line in failure_lines:
        print(line)
    raise
finally:
    # Always release cached GPU memory, whether training succeeded or not
    torch.cuda.empty_cache()


## 7. Evaluation and Sample Predictions


In [None]:
# Run final evaluation
print("🔍 Running final evaluation...")
eval_results = trainer.evaluate()

print("\n📊 Final Evaluation Results:")
print("=" * 50)
for metric, value in eval_results.items():
    # ROUGE scores are shown with four decimals; everything else as-is
    if 'rouge' in metric:
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: {value}")
print("=" * 50)


def _decode_token_ids(token_ids):
    """
    Decode trainer output (predictions or labels) into a list of strings.

    Handles the shapes the trainer can hand back: a tuple of outputs (the
    first entry is used), a 3-D array of per-token vocabulary scores (reduced
    with argmax), and id arrays padded with -100, which is not a valid token
    id and is replaced by the pad token before decoding.
    """
    if isinstance(token_ids, tuple):
        token_ids = token_ids[0]
    token_ids = np.asarray(token_ids)
    if token_ids.ndim == 3:
        token_ids = token_ids.argmax(axis=-1)
    token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
    return tokenizer.batch_decode(token_ids, skip_special_tokens=True)


# Generate sample predictions
print("\n🎯 Sample Predictions:")
# Guard against validation sets with fewer than three rows
sample_count = min(3, len(tokenized_val))
test_samples = tokenized_val.select(range(sample_count))
predictions = trainer.predict(test_samples)

decoded_preds = _decode_token_ids(predictions.predictions)
decoded_labels = _decode_token_ids(predictions.label_ids)

for sample_number, (prediction, reference) in enumerate(zip(decoded_preds, decoded_labels), start=1):
    print(f"\nSample {sample_number}:")
    print(f"Prediction: {prediction}")
    print(f"Reference:  {reference}")
    print("-" * 80)


## 8. Save Results


In [None]:
# Save the trained model
print("💾 Saving trained model...")
model_save_path = output_dir / "final_model"
model_save_path.mkdir(parents=True, exist_ok=True)

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Save results: the configuration, headline training statistics and the
# final evaluation metrics, written next to the model weights.
results = {
    'config': config,
    'training_results': {
        'training_loss': train_results.training_loss,
        'training_time': train_results.metrics['train_runtime'],
        'samples_per_second': train_results.metrics['train_samples_per_second']
    },
    'evaluation_results': eval_results
}

# Explicit encoding so the file does not depend on the platform default
with open(model_save_path / "results.json", 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"✅ Model and results saved to: {model_save_path}")

# Backup to Google Drive (if available).
# NOTE(review): `google.colab.drive` does not appear to provide an
# `is_mounted()` helper, so calling it would always land in the failure
# branch. Checking for the mounted folder works without importing
# `google.colab` at all.
drive_root = Path("/content/drive/MyDrive")
if drive_root.is_dir():
    drive_backup_path = drive_root / "Colab Notebooks" / "t5-small-full-finetuning"
    try:
        print("📤 Backing up to Google Drive...")
        shutil.copytree(output_dir, drive_backup_path, dirs_exist_ok=True)
        print("✅ Backup to Google Drive completed!")
    except OSError as e:
        # Best effort: a failed backup must not hide the successful run.
        # shutil.Error is a subclass of OSError, so copy errors land here too.
        print(f"⚠️ Google Drive backup failed: {e}")
else:
    print("⚠️ Google Drive not mounted - skipping backup")

print("\n🎉 All done! Training completed successfully.")
