In [None]:
import os
# Remove the currently installed numpy / transformers / datasets packages.
!pip uninstall -y numpy transformers datasets
# Reinstall them from scratch, bypassing pip's download cache.
!pip install numpy --force-reinstall --no-cache-dir
!pip install transformers datasets --force-reinstall --no-cache-dir


# Send signal 9 to this process so the runtime restarts; nothing after this
# line in the cell will run.
os.kill(os.getpid(), 9)  # Restart the Colab runtime (REQUIRED)

In [None]:
# Clone the BabyLM baseline repository and make it the working directory.
!git clone https://github.com/babylm/baseline-pretraining.git
%cd baseline-pretraining
# Download the data bundle as a single zip archive.
!wget -O babylm_data.zip "https://files.osf.io/v1/resources/ad7qg/providers/osfstorage/661517db943bee3731dfec25/?zip="
# Unpack the bundle, then the nested train_10M / dev / test archives.
!unzip babylm_data.zip -d babylm_data
!unzip babylm_data/train_10M.zip -d babylm_data/train_10M
!unzip babylm_data/dev.zip -d babylm_data/dev
!unzip babylm_data/test.zip -d babylm_data/test
# Concatenate the per-source files of each split into one text file per split;
# the training script reads babylm_train.txt and babylm_dev.txt.
!cat babylm_data/train_10M/train_10M/*.train > babylm_data/babylm_train.txt
!cat babylm_data/dev/dev/*.dev > babylm_data/babylm_dev.txt
!cat babylm_data/test/test/*.test > babylm_data/babylm_test.txt

# training the t5-small base model from scratch

In [None]:
# -*- coding: utf-8 -*-
"""
Memory-optimized training script for T5 model FROM SCRATCH (Base Model - No Delta Embedding).
This version includes:
- Training T5-small from scratch (random initialization)
- NO delta embedding logic - standard T5 architecture
- Aggressive memory optimization for Colab
- Fixed out-of-memory issues during evaluation
- Gradient accumulation for effective larger batch sizes
- Same dataset and preprocessing as delta embedding models for fair comparison
"""
import os
import random
import logging
from pathlib import Path
from typing import Dict, Optional
import gc

import numpy as np
import torch
import torch.nn as nn
import nltk
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.modeling_outputs import Seq2SeqLMOutput
from transformers import T5Config
from inspect import signature
# ---
# Step 1: Setup, Configuration, and Seeding
# ---

# Logging is configured before anything else: every statement below reports
# through `logger`, so it has to exist first.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Download NLTK data with error handling
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    try:
        # nltk.download signals failure through its return value, so turn a
        # False result into an error instead of logging a false success.
        if not nltk.download("punkt", quiet=False):
            raise RuntimeError("nltk.download returned False")
        logger.info("✅ NLTK punkt tokenizer downloaded successfully")
    except Exception as e:
        # Best effort: the tokenizer data is only needed for ROUGE scoring.
        logger.error(f"❌ Failed to download NLTK punkt: {e}")
        logger.info("Continuing without sentence tokenization...")
else:
    logger.info("✅ NLTK punkt tokenizer already downloaded")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# --- Memory-Optimized Hyperparameters ---
MODEL_NAME = "t5-small"  # Hub id used for the tokenizer and the model config
MAX_INPUT_LENGTH = 256  # Max tokens of a corrupted input sequence
MAX_TARGET_LENGTH = 128  # Max tokens of a target sequence (also generation_max_length)
NOISE_DENSITY = 0.15  # Fraction of tokens selected for masking
MEAN_SPAN_LENGTH = 3  # Poisson mean of the masked span length

# Per-device batch sizes; with gradient accumulation the effective training
# batch is TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 128 per device.
TRAIN_BATCH_SIZE = 32  # Same as delta models
EVAL_BATCH_SIZE = 8   # Same as delta models
GRADIENT_ACCUMULATION_STEPS = 4  # Same as delta models

LEARNING_RATE = 5e-4  # Higher learning rate for from-scratch training
NUM_EPOCHS = 5  # Number of passes over the training set

# All artifacts live under one project directory on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — confirm.
BASE_PROJECT_DIR = Path("/content/drive/MyDrive/llm-project")
PROCESSED_DATASET_PATH = BASE_PROJECT_DIR / "processed_dataset"  # Cached tokenized dataset
OUTPUT_DIR = str(BASE_PROJECT_DIR / "t5-small-base-babylm-from-scratch")  # Checkpoints and final model
LOGGING_DIR = str(BASE_PROJECT_DIR / "t5_logs_base_scratch")  # Passed to the trainer as logging_dir
BABYLM_ROOT_DIR = Path("/content/baseline-pretraining/babylm_data")  # Raw BabyLM text files

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator.
    """
    seeders = [random.seed, np.random.seed, torch.manual_seed]
    # The CUDA generators are only seeded when a GPU is actually present.
    if torch.cuda.is_available():
        seeders.append(torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)

# ✅ MEMORY FIX: Enhanced GPU memory management
def clear_gpu_memory():
    """Free unreferenced Python objects and release cached CUDA memory.

    Safe to call on machines without a GPU: the garbage collector still runs
    and the CUDA calls are skipped.
    """
    # Collect first: objects kept alive only by reference cycles have to be
    # destroyed before the memory they hold can be released from the cache.
    gc.collect()
    if torch.cuda.is_available():
        # Let pending GPU work finish before releasing the cache.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

# Make the run reproducible and start from a clean GPU cache.
set_seed(42)
clear_gpu_memory()

# Make sure the punkt sentence tokenizer data is present (quiet download).
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

if torch.cuda.is_available():
    logger.info(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    # mem_get_info returns (free, total) in bytes for the device.
    # memory_reserved() would report what the caching allocator holds, which
    # is not the memory still available.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(0)
    logger.info(f"Available Memory: {free_bytes / 1e9:.1f} GB")

# ---
# Step 2: Load Tokenizer and Initialize Standard T5 Model FROM SCRATCH
# ---
logger.info("Loading tokenizer...")
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

logger.info("Initializing standard T5 model FROM SCRATCH...")
# Only the architecture hyperparameters are taken from the hub; no weights.
config = T5Config.from_pretrained(MODEL_NAME)

# ✅ FROM SCRATCH: Initialize model with random weights (no pretrained loading)
# The model is built without a `torch.cuda.device(...)` context: that context
# manager only accepts CUDA devices and raises on the CPU fallback. The model
# is moved to `device` explicitly further down.
logger.info("🚀 Creating STANDARD T5 model with RANDOM INITIALIZATION (from scratch)")
model = T5ForConditionalGeneration(config)

# ✅ CRITICAL: No pretrained weight loading - model starts with random weights
logger.info("✅ Standard T5 model initialized from scratch with random weights")

# ✅ FIX: Ensure model vocabulary matches tokenizer
original_vocab_size = model.config.vocab_size
tokenizer_vocab_size = len(tokenizer)

if original_vocab_size != tokenizer_vocab_size:
    logger.info(f"Resizing model embeddings from {original_vocab_size} to {tokenizer_vocab_size}")
    model.resize_token_embeddings(tokenizer_vocab_size)

model.config.vocab_size = tokenizer_vocab_size
model.to(device)

# ✅ FIX: Gradient checkpointing with proper error handling
# Best effort: training proceeds without checkpointing if it cannot be enabled.
try:
    model.gradient_checkpointing_enable()
    logger.info("✅ Gradient checkpointing enabled")
except Exception as e:
    logger.warning(f"Could not enable gradient checkpointing: {e}")
    logger.info("Continuing without gradient checkpointing to avoid memory conflicts")

# Log model parameters for verification
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info("📊 Model Statistics:")
logger.info(f"  Total parameters: {total_params:,}")
logger.info(f"  Trainable parameters: {trainable_params:,}")
# 4 bytes per parameter in fp32.
logger.info(f"  Model size: ~{total_params * 4 / 1e6:.1f} MB (fp32)")

# ---
# Step 3: Prepare BabyLM Dataset (Same as Delta Models for Fair Comparison)
# ---
def corrupt_text_for_t5(examples: Dict) -> Dict:
    """Turn a batch of raw texts into T5 span-corruption training pairs.

    For every text, random spans are masked: each masked run is replaced in
    the input by one sentinel token (starting at ``<extra_id_0>`` and counting
    down in id), and the target lists each sentinel followed by the tokens it
    replaced. Texts that are too short to receive at least one span are
    dropped, so the output may contain fewer rows than the input.

    Args:
        examples: Batch with a ``"text"`` column (list of strings).

    Returns:
        Tokenized model inputs padded to ``MAX_INPUT_LENGTH`` plus a
        ``"labels"`` column padded to ``MAX_TARGET_LENGTH``, where padding
        positions are set to -100.
    """
    texts = examples["text"]
    inputs, targets = [], []
    sentinel_start_id = tokenizer.convert_tokens_to_ids("<extra_id_0>")

    for text in texts:
        tokens = tokenizer(
            text,
            add_special_tokens=False,
            return_tensors="np",
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
        )["input_ids"][0]

        if len(tokens) < 2:
            continue

        num_tokens_to_mask = int(len(tokens) * NOISE_DENSITY)
        num_spans = int(num_tokens_to_mask / MEAN_SPAN_LENGTH)
        if num_spans == 0:
            # Too short to mask anything: the pair would teach nothing.
            continue

        span_starts = np.random.choice(np.arange(len(tokens)), size=num_spans, replace=False)
        span_lengths = np.random.poisson(lam=MEAN_SPAN_LENGTH, size=num_spans)
        span_lengths = np.maximum(1, span_lengths)

        # Spans may overlap or touch; the boolean mask merges them naturally.
        mask = np.zeros_like(tokens, dtype=bool)
        for start, length in zip(span_starts, span_lengths):
            mask[start : start + length] = True

        input_ids_list, label_ids, sentinel_id = [], [], sentinel_start_id
        i = 0
        while i < len(tokens):
            if not mask[i]:
                input_ids_list.append(tokens[i])
                i += 1
            else:
                # One sentinel per contiguous masked run, in both sequences.
                input_ids_list.append(sentinel_id)
                label_ids.append(sentinel_id)
                sentinel_id -= 1
                while i < len(tokens) and mask[i]:
                    label_ids.append(tokens[i])
                    i += 1

        inputs.append(tokenizer.decode(input_ids_list))
        targets.append(tokenizer.decode(label_ids))

    if not inputs:
        # Every text in this batch was skipped. The tokenizer cannot pad an
        # empty batch, so return empty columns and contribute no rows.
        empty_batch = {name: [] for name in tokenizer.model_input_names}
        empty_batch["labels"] = []
        return empty_batch

    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, padding="max_length", truncation=True)
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, padding="max_length", truncation=True)

    # Labels are padded here, so the data collator never gets to apply its
    # label_pad_token_id. Mask the padding explicitly, otherwise the loss is
    # also computed on pad positions.
    # NOTE(review): a dataset cached before this change still contains
    # pad-id labels; regenerate it so all compared models see the same data.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Load or process dataset (same logic as delta models)
if PROCESSED_DATASET_PATH.exists():
    logger.info(f"✅ Loading processed dataset from disk: {PROCESSED_DATASET_PATH}")
    tokenized_dataset = DatasetDict.load_from_disk(str(PROCESSED_DATASET_PATH))
else:
    logger.info("Processed dataset not found. Starting preprocessing...")
    try:
        # load_dataset("text", ...) always names its single split "train".
        raw_dataset = DatasetDict({
             "train": load_dataset("text", data_files=str(BABYLM_ROOT_DIR / "babylm_train.txt"))["train"],
             "validation": load_dataset("text", data_files=str(BABYLM_ROOT_DIR / "babylm_dev.txt"))["train"],
        })
    except Exception as e:
        logger.error(f"Failed to load dataset: {e}. Ensure babylm_data is in the correct path.")
        # Re-raise instead of exit(): the run must stop with the original
        # traceback and a failing status, not end silently.
        raise

    # Use half the cores, but never fewer than one worker; os.cpu_count() may
    # return None or 1, which would otherwise give an invalid num_proc.
    num_workers = max(1, (os.cpu_count() or 1) // 2)
    tokenized_dataset = raw_dataset.map(
        corrupt_text_for_t5, batched=True, remove_columns=["text"],
        num_proc=num_workers, desc="Running T5 Corruptor"
    )
    logger.info(f"💾 Saving processed dataset to disk at: {PROCESSED_DATASET_PATH}")
    tokenized_dataset.save_to_disk(str(PROCESSED_DATASET_PATH))

logger.info("Preparing train and eval splits...")
train_dataset = tokenized_dataset["train"].shuffle(seed=42)

# ✅ MEMORY FIX: Use much smaller eval dataset during training
eval_dataset_full = tokenized_dataset["validation"].shuffle(seed=42)
eval_dataset = eval_dataset_full.select(range(min(50, len(eval_dataset_full))))  # Only 50 examples for training evals
logger.info(f"Using {len(eval_dataset)} examples for training evaluation (memory optimization)")

# ---
# Step 4: Simple Metrics for Training (Loss Only)
# ---
def simple_compute_metrics(eval_pred):
    """Placeholder metric hook for training-time evaluation.

    The argument is accepted but ignored; a new empty dict is returned on
    every call, so no metrics are added by this hook.
    """
    no_extra_metrics = dict()
    return no_extra_metrics

# ---
# Step 5: Robust ROUGE Computation for Final Evaluation
# ---
def compute_rouge_metrics(predictions, labels, tokenizer):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        predictions: Array of predicted token ids, one row per example.
        labels: Array of reference token ids; -100 marks ignored positions.
        tokenizer: Tokenizer used to decode both arrays.

    Returns:
        The ROUGE scores scaled to 0-100. If anything fails, the error is
        logged and zeros are returned for rouge1 / rouge2 / rougeL so that an
        evaluation problem never aborts the surrounding run.
    """
    try:
        vocab_size = tokenizer.vocab_size
        # Clamp every id into [0, vocab_size - 1] so decoding never sees an
        # id outside the vocabulary (negative fill values included). After
        # this no further out-of-range check is needed.
        predictions = np.clip(predictions, 0, vocab_size - 1)

        try:
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Error decoding predictions: {e}")
            # Fall back to decoding row by row; a bad row becomes "".
            decoded_preds = []
            for pred in predictions:
                try:
                    decoded_preds.append(tokenizer.decode(pred, skip_special_tokens=True))
                except Exception:
                    decoded_preds.append("")

        # -100 is not a real token id; map it to padding so it is dropped
        # together with the other special tokens while decoding.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        try:
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Error decoding labels: {e}")
            decoded_labels = [""] * len(labels)

        # One sentence per line; blank texts are replaced by the literal
        # "empty" so the scorer always receives a non-empty string.
        decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) if pred.strip() else "empty" for pred in decoded_preds]
        decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) if label.strip() else "empty" for label in decoded_labels]

        rouge_metric = evaluate.load("rouge")
        result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        return {key: value * 100 for key, value in result.items()}

    except Exception as e:
        logger.error(f"Error computing ROUGE metrics: {e}")
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

# ---
# Step 6: Ultra Memory-Optimized Training Setup
# ---
# Pads each batch dynamically; padding it adds to labels uses -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=-100,
    pad_to_multiple_of=8 if device.type == "cuda" else None
)

# ✅ MEMORY OPTIMIZATION: Ultra-aggressive memory saving settings (same as delta models)
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    # Effective batch size per device =
    # TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS.
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,

    # ✅ MEMORY FIX: Disable generation during training
    predict_with_generate=False,
    generation_max_length=MAX_TARGET_LENGTH,
    generation_num_beams=1,

    # Evaluation only needs the loss (the metric hook returns nothing), so do
    # not gather logits for every evaluation example.
    prediction_loss_only=True,

    # ✅ MEMORY FIX: Optimize logging and evaluation frequency
    logging_dir=LOGGING_DIR,
    report_to="tensorboard",
    fp16=device.type == "cuda",  # Mixed precision only on GPU
    dataloader_pin_memory=False,

    logging_strategy="steps",
    logging_steps=500,  # Less frequent logging

    # ✅ CRITICAL MEMORY FIX: Very infrequent evaluation
    eval_strategy="steps",
    eval_steps=1000,  # Evaluate very infrequently

    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,  # Keep only 1 checkpoint

    load_best_model_at_end=False,  # ✅ MEMORY FIX: Disable to save memory
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # ✅ MEMORY FIX: Additional memory optimizations
    # (the deprecated include_inputs_for_metrics flag was dropped; it was set
    # to its default value)
    remove_unused_columns=True,
    group_by_length=False,
    length_column_name=None,
    eval_accumulation_steps=1,

    # ✅ FROM SCRATCH: Learning rate scheduler for better convergence
    warmup_steps=200,
    lr_scheduler_type="linear",

    max_grad_norm=1.0,  # Gradient clipping
    adam_epsilon=1e-6,  # Adam's epsilon term (numerical stability, not memory)
)

# ✅ MEMORY FIX: Use processing_class instead of deprecated tokenizer parameter
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,  # Use processing_class instead of tokenizer
    data_collator=data_collator,
    compute_metrics=simple_compute_metrics,
)

# ---
# Step 7: Ultra Memory-Optimized Training FROM SCRATCH
# ---
# ✅ MEMORY FIX: Clear memory before training
clear_gpu_memory()

logger.info("🚀 Starting FROM SCRATCH training with STANDARD T5 model (NO DELTA EMBEDDING)...")
logger.info(f"📊 Training Configuration:")
logger.info(f"  Model type: STANDARD T5 (No Delta Embedding)")
logger.info(f"  Effective batch size: {TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
logger.info(f"  Learning rate: {LEARNING_RATE}")
logger.info(f"  Number of epochs: {NUM_EPOCHS}")
logger.info(f"  Model initialized: FROM SCRATCH (random weights)")
logger.info(f"  Memory optimizations: Ultra-aggressive")
logger.info(f"  Architecture: Standard T5-small (baseline for comparison)")

# Check for checkpoints. get_last_checkpoint lists the directory, which may
# not exist yet on a first run, so guard the lookup.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
if last_checkpoint:
    logger.info(f"✅ Checkpoint found at {last_checkpoint}. Resuming training.")
else:
    logger.info("ℹ️ No checkpoint found. Starting training from scratch.")

try:
    # resume_from_checkpoint=None starts a fresh run, so one call (and one
    # out-of-memory handler) covers both the resume and the fresh path.
    trainer.train(resume_from_checkpoint=last_checkpoint)
except torch.cuda.OutOfMemoryError as e:
    logger.error(f"CUDA out of memory during training: {e}")
    logger.info("Try reducing TRAIN_BATCH_SIZE to 1, or reducing MAX_INPUT_LENGTH to 128")
    raise

# ✅ MEMORY FIX: Clear memory after training
clear_gpu_memory()

# ---
# Step 8: Minimal Final Evaluation
# ---
logger.info("Training complete. Running minimal final evaluation...")

# Evaluate on at most the first 20 examples of the shuffled validation split
# to keep memory use small.
final_eval_dataset = eval_dataset_full.select(
    range(min(len(eval_dataset_full), 20))
)
logger.info(f"Using {len(final_eval_dataset)} examples for final evaluation")

# Evaluation problems are logged and skipped; they never stop the script from
# going on to save the model.
try:
    logger.info("Running simple evaluation...")
    eval_results = trainer.evaluate(eval_dataset=final_eval_dataset)
    logger.info(f"Final Evaluation Results: {eval_results}")
except torch.cuda.OutOfMemoryError as oom_error:
    logger.error(f"CUDA out of memory during final evaluation: {oom_error}")
    logger.info("Skipping final evaluation due to memory constraints.")
except Exception as eval_error:
    logger.error(f"Error during final evaluation: {eval_error}")
# ---
# Step 9: Save Final Model
# ---
logger.info(f"💾 Saving final model to: {OUTPUT_DIR}")
try:
    import json

    trainer.save_model()
    tokenizer.save_pretrained(OUTPUT_DIR)

    # The last log_history entry is not necessarily the training summary
    # (an evaluation run after training logs its own entry), so search
    # backwards for the most recent entry that carries train_loss.
    final_training_loss = next(
        (
            entry["train_loss"]
            for entry in reversed(trainer.state.log_history)
            if "train_loss" in entry
        ),
        "N/A",
    )

    # Save training info
    training_info = {
        "model_type": "T5ForConditionalGeneration",
        "architecture": "Standard T5-small (baseline)",
        "delta_embedding": False,
        "trained_from_scratch": True,
        "total_parameters": total_params,
        "trainable_parameters": trainable_params,
        "final_training_loss": final_training_loss,
    }

    with open(os.path.join(OUTPUT_DIR, 'training_info.json'), 'w', encoding="utf-8") as f:
        json.dump(training_info, f, indent=2)

    logger.info("✅ Model and training info saved successfully!")
    logger.info(f"📄 Training info: {training_info}")

except Exception as e:
    # Best effort: a failed save is reported but does not abort the script.
    logger.error(f"Error saving model: {e}")

# ✅ MEMORY FIX: Final cleanup
clear_gpu_memory()

# ---
# Step 10: Memory-Efficient Generation Test
# ---
def test_generation(model, tokenizer, test_text: str, max_length: int = 32):  # Very short generation
    """Greedy-decode a short continuation for ``test_text`` as a smoke test.

    The model is switched to eval mode and left in it.

    Args:
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer matching the model.
        test_text: Prompt to encode (truncated to 128 tokens).
        max_length: Maximum length of the generated sequence.

    Returns:
        The decoded output, or a string starting with ``"Generation failed:"``
        if generation raised; errors are logged rather than propagated.
    """
    try:
        model.eval()
        # Send the inputs to the device the model actually lives on instead
        # of relying on a module-level variable.
        target_device = next(model.parameters()).device
        inputs = tokenizer(test_text, return_tensors="pt", max_length=128, truncation=True)
        inputs = {k: v.to(target_device) for k, v in inputs.items()}

        with torch.no_grad():
            # Greedy decoding. early_stopping is a beam-search option and is
            # not passed together with num_beams=1.
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=1,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text
    except torch.cuda.OutOfMemoryError as e:
        logger.error(f"Out of memory during generation test: {e}")
        return "Generation failed: Out of memory"
    except Exception as e:
        logger.error(f"Error during generation test: {e}")
        return f"Generation failed: {e}"

# Test with a simple example: a prompt containing one sentinel token for the
# model to fill in. The result is only logged, never checked.
test_input = "The quick brown fox <extra_id_0> over the lazy dog."
logger.info(f"Testing generation with input: '{test_input}'")
result = test_generation(model, tokenizer, test_input)
logger.info(f"Generated output: '{result}'")

# Closing summary. The three messages below are static text; they are logged
# unconditionally and do not reflect whether the steps above succeeded.
logger.info("""
🎉 FROM SCRATCH STANDARD T5 BASELINE TRAINING COMPLETED!

Key Features:
✅ Model initialized with RANDOM WEIGHTS (no pretrained loading)
✅ STANDARD T5 architecture (no modifications)
✅ Ultra-aggressive memory optimizations for Colab
✅ Same dataset and preprocessing as delta embedding models
✅ Fair comparison baseline for delta embedding experiments

Standard T5 Architecture:
- No delta embedding modifications
- Standard encoder-decoder structure
- Standard attention mechanisms
- Direct hidden states to language modeling head
- Pure baseline for comparison with delta embedding variants

Your standard T5-small baseline model has been trained from scratch!
This model will serve as the comparison baseline for delta embedding experiments.
""")

# Guidance on comparing this baseline with the delta embedding models.
logger.info("""
📊 COMPARISON ANALYSIS TIPS:

To fairly compare delta embedding models with this baseline:

1. **Training Loss Comparison**: Compare final training losses across models
2. **Evaluation Metrics**: Compare eval_loss on same validation set
3. **Generation Quality**: Test with same prompts and compare outputs
4. **Parameter Count**: All models should have similar parameter counts
5. **Training Conditions**: All use same:
   - Dataset and preprocessing
   - Batch sizes and learning rates
   - Number of epochs
   - Memory optimizations
   - Random seed (42)

Expected Results:
- This baseline should establish the "standard" performance
- Delta embedding models should be compared against this baseline
- Look for improvements in generation quality and training efficiency
- Delta embedding may show benefits in specific tasks or longer sequences
""")

# Troubleshooting hints for out-of-memory failures.
logger.info("""
💡 If you still encounter memory issues, try these EXTREME steps:
1. Reduce TRAIN_BATCH_SIZE to 1
2. Increase GRADIENT_ACCUMULATION_STEPS to 16 or 32
3. Reduce MAX_INPUT_LENGTH to 128 and MAX_TARGET_LENGTH to 64
4. Set eval_steps to 5000 or disable evaluation entirely (eval_strategy="no")
5. Use only CPU if GPU memory is insufficient
6. This baseline model should use LESS memory than delta embedding variants
""")