# TinyLM Training on Google Colab

## This notebook trains a TinyLM model on ARC-AGI data using Google Colab's GPU resources.

### Setup Instructions:
1. Upload this notebook to Google Colab
2. Enable GPU runtime: Runtime → Change runtime type → GPU (T4/V100)
3. Run all cells in order

In [None]:
# Report the installed PyTorch build and whether a CUDA device is visible
import torch

print(f"PyTorch version: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if not has_cuda:
    print("WARNING: No GPU detected. Please enable GPU runtime in Colab!")
else:
    # Device 0 is the only device queried; memory is shown in decimal gigabytes
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_bytes / 1e9:.1f} GB")

In [None]:
# Clone the repository from GitHub into the current working directory.
# NOTE(review): re-running this cell from inside the clone will presumably clone
# again into a nested ARC-AGI-2/ARC-AGI-2 directory — confirm before re-running.
!git clone https://github.com/CalebTalley2024/ARC-AGI-2.git
# Change into the clone: later cells use Path.cwd() as the project root and
# run `pip install -e .` from here.
%cd ARC-AGI-2
# Checkout specific branch for consistency
!git checkout vedant

In [None]:
# Install exact package versions for reproducibility.
# Every package below is pinned with `==` and installed in a single quiet pip call.
# NOTE(review): torch is already imported by the first cell; if this pin installs a
# different torch build, a runtime restart is presumably needed — confirm.
!pip install --quiet \
    numpy==1.24.4 \
    matplotlib==3.7.5 \
    pandas==2.0.3 \
    scipy==1.10.1 \
    scikit-learn==1.3.2 \
    torch==2.2.2 \
    torchvision==0.17.2 \
    torchaudio==2.2.2 \
    transformers==4.46.3 \
    huggingface-hub==0.36.0 \
    tokenizers==0.20.3 \
    safetensors==0.5.3 \
    seaborn==0.13.2 \
    plotly==6.4.0 \
    tqdm==4.67.1 \
    pyyaml==6.0.3 \
    requests==2.32.4 \
    packaging==25.0 \
    jsonschema==4.23.0 \
    fastjsonschema==2.21.2 \
    jinja2==3.1.6 \
    markupsafe==2.1.5 \
    urllib3==2.2.3 \
    certifi==2025.10.5 \
    charset-normalizer==3.4.4 \
    idna==3.11 \
    python-dateutil==2.9.0.post0 \
    pytz==2025.2 \
    tzdata==2025.2 \
    six==1.17.0 \
    setuptools==75.3.2

# Install the package in development mode (editable install of the current
# directory, i.e. the repository the clone cell changed into).
!pip install -e .

In [None]:
# Standard-library helpers used by the rest of the notebook
import os
import sys
from pathlib import Path

# The current working directory is treated as the project root and appended
# to the import search path.
project_root = Path.cwd()
sys.path.append(str(project_root))

# Training entry point, model config class, and the centralized constants
from arc.models.train import train
from arc.models.tiny_lm import TinyLMConfig
from arc.utils.constants import (
    MODEL_CONFIGS,
    TRAINING_CONFIGS,
    get_matched_configs,
    estimate_model_parameters,
)

# Confirm the imports worked and show which presets are available
print("Successfully imported training modules and centralized constants")
print(f"Project root: {project_root}")
print(f"Available model sizes: {list(MODEL_CONFIGS)}")
print(f"Available training profiles: {list(TRAINING_CONFIGS)}")

In [None]:
# Check data directory structure.
# Prints the contents of <project_root>/data two levels deep; a sub-folder named
# "arc" is expanded one level further.
data_dir = project_root / "data"
print("Data directory contents:")
# is_dir() (rather than exists()) so a stray file named "data" cannot reach iterdir()
if data_dir.is_dir():
    # iterdir() yields entries in arbitrary order; sort each level so the listing
    # is stable between runs and matches the sorted checkpoint listing later on.
    for item in sorted(data_dir.iterdir()):
        print(f"  {item.name}")
        if not item.is_dir():
            continue
        for subitem in sorted(item.iterdir()):
            print(f"    {subitem.name}")
            if subitem.name == "arc" and subitem.is_dir():
                for arcitem in sorted(subitem.iterdir()):
                    print(f"      {arcitem.name}")
else:
    print("  WARNING: Data directory not found!")

# Check if training data exists
training_path = data_dir / "raw" / "arc" / "training"
eval_path = data_dir / "raw" / "arc" / "evaluation"

print(f"\nTraining data exists: {training_path.exists()}")
print(f"Evaluation data exists: {eval_path.exists()}")

# Count the *.json files in each split that is present (sorted for a stable order)
if training_path.exists():
    training_files = sorted(training_path.glob("*.json"))
    print(f"Number of training files: {len(training_files)}")

if eval_path.exists():
    eval_files = sorted(eval_path.glob("*.json"))
    print(f"Number of evaluation files: {len(eval_files)}")

## Training Configuration

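The next cell selects a model size and training profile automatically from the detected GPU memory (falling back to the smallest preset when no GPU is found). Adjust the memory thresholds or the chosen presets in `select_optimal_configs` to suit your needs and available GPU memory.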

In [None]:
# GPU-aware dynamic configuration selection
def select_optimal_configs(gpu_memory_gb):
    """Pick a model size and training profile for the given amount of GPU memory.

    Memory tiers (upper bounds are exclusive):
        < 4 GB    -> 'tiny' model,   'debug' profile
        < 8 GB    -> 'tiny' model,   'small_gpu' profile
        < 16 GB   -> 'small' model,  'medium_gpu' profile
        otherwise -> 'medium' model, 'large_gpu' profile

    The chosen names are resolved through ``get_matched_configs`` and a
    summary of the resulting configuration is printed.

    Args:
        gpu_memory_gb: Total GPU memory in gigabytes.

    Returns:
        The ``(model_config, training_config)`` pair returned by
        ``get_matched_configs``.
    """
    print(f"GPU Memory: {gpu_memory_gb:.1f} GB")

    # (exclusive upper bound in GB, model size, training profile, message)
    tiers = (
        (4, 'tiny', 'debug',
         "Low GPU memory detected - using minimal config for testing"),
        (8, 'tiny', 'small_gpu',
         "Small GPU detected - using tiny model with memory optimization"),
        (16, 'small', 'medium_gpu',
         "Medium GPU detected - using small model"),
    )

    # Anything that fits no tier above gets the largest preset
    model_size = 'medium'
    training_profile = 'large_gpu'
    tier_message = "Large GPU detected - using medium model for best performance"
    for upper_bound_gb, size_name, profile_name, message in tiers:
        if gpu_memory_gb < upper_bound_gb:
            model_size = size_name
            training_profile = profile_name
            tier_message = message
            break
    print(tier_message)

    # Resolve the preset names into concrete configuration objects
    model_config, training_config = get_matched_configs(model_size, training_profile)

    # Derived figures for the summary below
    param_count = estimate_model_parameters(model_config)
    per_step_batch = training_config['batch_size']
    accumulation_steps = training_config['grad_accumulation_steps']
    effective_batch_size = per_step_batch * accumulation_steps

    print("\nSelected Configuration:")
    print(f"  Model: {model_size} ({param_count/1e6:.1f}M parameters)")
    print(f"  Training profile: {training_profile}")
    print(f"  Batch size: {training_config['batch_size']} (effective: {effective_batch_size})")
    print(f"  Max sequence length: {training_config['max_sequence_length']}")
    print(f"  Gradient accumulation: {training_config['grad_accumulation_steps']} steps")

    return model_config, training_config

# Choose configs: sized to the GPU when one is present, otherwise the
# 'tiny' model with the 'debug' profile.
if not torch.cuda.is_available():
    print("No GPU detected! Using CPU-friendly minimal config")
    model_config, training_config = get_matched_configs('tiny', 'debug')
else:
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    model_config, training_config = select_optimal_configs(gpu_memory)

# Session-wide settings read by the later cells: data/checkpoint locations
# plus the selected model and training configurations.
TRAINING_CONFIG = dict(
    data_path=str(data_dir / "raw" / "arc" / "training"),
    model_dir="./models/tinylm_checkpoints",
    model_config=model_config,
    training_config=training_config,
)

# Echo the settings that will drive the training run
print("\nFinal Training Configuration:")
print(f"  Data path: {TRAINING_CONFIG['data_path']}")
print(f"  Model dir: {TRAINING_CONFIG['model_dir']}")
print(f"  Steps: {training_config['steps']:,}")
print(f"  Learning rate: {training_config['learning_rate']}")
print(f"  Weight decay: {training_config['weight_decay']}")
print(f"  Use AMP: {training_config['use_amp']}")

In [None]:
# Make sure the checkpoint directory exists (parents are created as needed)
output_dir = Path(TRAINING_CONFIG["model_dir"])
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Created output directory: {output_dir}")

# Dry run: build a TinyLMConfig from the selected settings and print rough
# memory figures. Any failure is reported with a traceback instead of
# stopping the notebook.
print("\nTesting model creation with selected configuration...")
try:
    from arc.utils.constants import VOCAB_SIZE
    print(f"Vocabulary size: {VOCAB_SIZE}")

    # Instantiate the config object from the selected model settings
    model_cfg = TRAINING_CONFIG["model_config"]
    test_config = TinyLMConfig(**model_cfg)
    print("Model config created successfully")
    print(f"  Architecture: {test_config.d_model}D, {test_config.n_layers} layers, {test_config.n_heads} heads")
    print(f"  Parameters: ~{estimate_model_parameters(model_cfg)/1e6:.1f}M")
    print(f"  Max sequence length: {test_config.max_len}")

    # Rough estimates only: 4 bytes per value (float32), reported in MB (1e6 bytes)
    param_size_mb = estimate_model_parameters(model_cfg) * 4 / 1e6
    training_cfg = TRAINING_CONFIG["training_config"]
    batch_values = (
        training_cfg['batch_size']
        * training_cfg['max_sequence_length']
        * model_cfg['d_model']
    )
    batch_memory_mb = (batch_values * 4) / 1e6

    print("\nMemory Estimates:")
    print(f"  Model size: ~{param_size_mb:.0f} MB")
    print(f"  Batch memory: ~{batch_memory_mb:.0f} MB")
    print(f"  Total training memory: ~{param_size_mb + batch_memory_mb*3:.0f} MB (estimated)")

except Exception as err:
    print(f"Error creating model: {err}")
    print("This might indicate missing implementations in serialize module")
    import traceback
    traceback.print_exc()

## Start Training

**Note:** The current implementation has placeholder functions for data loading. This will train on empty data but demonstrates the training loop. You'll need to implement proper data loading for actual training.

In [None]:
# Start training with centralized configurations.
# Prints a summary of the selected settings, runs train(), and reports the
# elapsed time. Failures are printed with a traceback rather than raised.
import time

print("Starting TinyLM training with centralized configuration...")
print(f"Model will be saved to: {TRAINING_CONFIG['model_dir']}")
print("="*60)

# Display final configuration summary
model_cfg = TRAINING_CONFIG["model_config"]
training_cfg = TRAINING_CONFIG["training_config"]

print("CONFIGURATION SUMMARY:")
print(f"  Model: {model_cfg['d_model']}D, {model_cfg['n_layers']}L, {model_cfg['n_heads']}H")
print(f"  Parameters: ~{estimate_model_parameters(model_cfg)/1e6:.1f}M")
print(f"  Training steps: {training_cfg['steps']:,}")
print(f"  Effective batch size: {training_cfg['batch_size'] * training_cfg['grad_accumulation_steps']}")
print(f"  Learning rate: {training_cfg['learning_rate']}")
print(f"  Sequence length: {training_cfg['max_sequence_length']}")
print("="*60)

start_time = time.time()

try:
    # train() is given the profile *name*, but only the config object was kept,
    # so recover the name by comparing against the TRAINING_CONFIGS entries.
    # The first equal entry wins, as with the previous list.index() lookup.
    profile_name = next(
        (
            name
            for name, candidate in TRAINING_CONFIGS.items()
            if candidate == training_cfg
        ),
        None,
    )
    if profile_name is None:
        # Fail with an actionable message instead of "<dict repr> is not in list"
        raise ValueError(
            "Selected training config does not match any entry in "
            "TRAINING_CONFIGS; cannot determine which training_profile "
            "to pass to train()"
        )

    # Call the training function with centralized configs
    train(
        model_dir=TRAINING_CONFIG["model_dir"],
        data_path=TRAINING_CONFIG["data_path"],
        # Pass individual model config parameters for compatibility
        steps=training_cfg["steps"],
        bs=training_cfg["batch_size"],
        lr=training_cfg["learning_rate"],
        d_model=model_cfg["d_model"],
        # NOTE(review): n_layers / n_heads are not forwarded, so train() presumably
        # uses its own defaults for them while the summary above prints the values
        # from model_cfg — confirm.
        # n_layers=model_cfg["n_layers"],
        # n_heads=model_cfg["n_heads"],
        # Use training profile for gradient accumulation
        training_profile=profile_name,
    )

    end_time = time.time()
    training_time = end_time - start_time

    print("="*60)
    print(f"Training completed successfully!")
    print(f"  Training time: {training_time/60:.1f} minutes")
    print(f"Models saved to: {TRAINING_CONFIG['model_dir']}")

except Exception as e:
    # Notebook-level boundary: report the failure and let later cells run
    print(f"Training failed with error: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Inspect what the training run wrote to the checkpoint directory
import os

model_dir = Path(TRAINING_CONFIG["model_dir"])
if not model_dir.exists():
    print("No training output found!")
else:
    # List regular files (sorted by path) with their size in MiB-scaled units
    print("Training output files:")
    for entry in sorted(model_dir.iterdir()):
        if not entry.is_file():
            continue
        size_mb = entry.stat().st_size / (1024 * 1024)
        print(f"  {entry.name}: {size_mb:.1f} MB")

    # If a best checkpoint was saved, show its recorded loss and config
    best_model = model_dir / "best.pt"
    if best_model.exists():
        print(f"\nBest model saved: {best_model}")
        try:
            import torch
            # Load onto the CPU so inspection does not need a GPU
            checkpoint = torch.load(best_model, map_location='cpu')
            if 'loss' in checkpoint:
                print(f"   Best loss: {checkpoint['loss']:.4f}")
            if 'cfg' in checkpoint:
                cfg = checkpoint['cfg']
                print(f"   Model config: {cfg}")
        except Exception as err:
            print(f"   Could not load model info: {err}")

## Model Testing (Optional)

Test the trained model with a simple forward pass to ensure it's working correctly.

In [None]:
# Reload the best checkpoint and push one random batch through it as a smoke test
model_path = model_dir / "best.pt"

if not model_path.exists():
    print("No trained model found to test")
else:
    print("Testing the trained model...")

    try:
        # Read the checkpoint onto the CPU first
        checkpoint = torch.load(model_path, map_location='cpu')

        from arc.models.tiny_lm import TinyLM, TinyLMConfig
        from arc.utils.constants import VOCAB_SIZE

        # Rebuild the model from the config stored in the checkpoint, then
        # restore its weights
        cfg = TinyLMConfig(**checkpoint['cfg'])
        model = TinyLM(cfg)
        model.load_state_dict(checkpoint['model'])

        # Run on the GPU when one is available, in evaluation mode
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = model.to(device)
        model.eval()

        print(f"Model loaded successfully on {device}")

        # Count all parameters and the trainable subset in a single pass
        total_params = 0
        trainable_params = 0
        for param in model.parameters():
            param_count = param.numel()
            total_params += param_count
            if param.requires_grad:
                trainable_params += param_count

        print("Model Statistics:")
        print(f"  Total parameters: {total_params:,}")
        print(f"  Trainable parameters: {trainable_params:,}")
        print(f"  Model size: ~{total_params * 4 / 1e6:.1f} MB")
        print(f"  Architecture: {cfg.d_model}D x {cfg.n_layers}L x {cfg.n_heads}H")

        # Random token ids; the sequence is capped at 64 tokens (or cfg.max_len
        # if that is smaller) to keep the test cheap
        batch_size = 4
        seq_len = min(64, cfg.max_len)
        dummy_input = torch.randint(0, VOCAB_SIZE, (batch_size, seq_len)).to(device)

        # No gradients are needed for a forward-only check
        with torch.no_grad():
            output = model(dummy_input)

        print("\nForward Pass Test:")
        print(f"  Input shape: {dummy_input.shape}")
        print(f"  Output shape: {output.shape}")
        print(f"  Output range: [{output.min():.3f}, {output.max():.3f}]")
        print(f"  Output mean: {output.mean():.3f}")
        print("  Model is working correctly!")

    except Exception as err:
        print(f"Model testing failed: {err}")
        import traceback
        traceback.print_exc()

## Download Trained Models

Download the trained models to your local machine or save to Google Drive.

In [None]:
# Option 1: Bundle the checkpoint files into one zip and download it (Colab only)
from google.colab import files
import zipfile

if not model_dir.exists():
    print("No models to download")
else:
    zip_path = "tinylm_models.zip"

    # Only regular files directly inside model_dir are archived, stored under
    # their bare file names (no directory prefix)
    with zipfile.ZipFile(zip_path, 'w') as archive:
        for entry in model_dir.iterdir():
            if not entry.is_file():
                continue
            archive.write(entry, arcname=entry.name)

    print(f"Created zip file: {zip_path}")
    print("Downloading...")
    files.download(zip_path)

In [None]:
# Option 2: Save to Google Drive (uncomment to use)
# from google.colab import drive
# drive.mount('/content/drive')
# 
# # Copy models to Google Drive
# import shutil
# drive_path = "/content/drive/MyDrive/TinyLM_Models"
# if model_dir.exists():
#     shutil.copytree(model_dir, drive_path, dirs_exist_ok=True)
#     print(f"Models saved to Google Drive: {drive_path}")

# Closing summary: echo the configuration this session was set up with
separator = "=" * 60

print("Training notebook complete!")
print("\n" + separator)
print("TRAINING SUMMARY:")

# Configuration objects selected earlier in the notebook
final_model_cfg = TRAINING_CONFIG["model_config"]
final_training_cfg = TRAINING_CONFIG["training_config"]

print(f"Model: {final_model_cfg['d_model']}D x {final_model_cfg['n_layers']}L x {final_model_cfg['n_heads']}H")
print(f"Parameters: ~{estimate_model_parameters(final_model_cfg)/1e6:.1f}M")
print(f"Training steps: {final_training_cfg['steps']:,}")
effective_batch = final_training_cfg['batch_size'] * final_training_cfg['grad_accumulation_steps']
print(f"Batch size: {final_training_cfg['batch_size']} (effective: {effective_batch})")
print(f"Sequence length: {final_training_cfg['max_sequence_length']}")
print(f"Learning rate: {final_training_cfg['learning_rate']}")
print(f"Models saved to: {TRAINING_CONFIG['model_dir']}")

# Training-profile settings related to GPU efficiency
print("\nGPU Optimizations Used:")
print(f"  - Gradient accumulation: {final_training_cfg['grad_accumulation_steps']} steps")
if final_training_cfg['use_amp']:
    amp_label = 'Yes'
else:
    amp_label = 'No'
print(f"  - Mixed precision: {amp_label}")
print(f"  - Gradient clipping: {final_training_cfg['grad_clip_norm']}")

# Fixed checklist printed at the end of every run
print("\nNext Steps:")
next_step_lines = (
    "   - Centralized configuration system implemented",
    "   - GPU-aware automatic config selection",
    "   - Implement proper data loading (arc.io.load_task)",
    "   - Implement tokenization (arc.serialize.pack_example)",
    "   - Run evaluation on the trained model",
    "   - Experiment with different hyperparameters",
    "   - Try larger models if GPU memory allows",
)
for line in next_step_lines:
    print(line)

print(separator)