# TinyLM Training on Google Colab

This notebook trains a TinyLM model on ARC-AGI data using Google Colab's GPU resources.

## Setup Instructions:
1. Upload this notebook to Google Colab
2. Enable GPU runtime: Runtime → Change runtime type → GPU (T4/V100)
3. Run all cells in order

In [None]:
# Report the PyTorch build and whether a CUDA GPU is visible to this runtime
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if not torch.cuda.is_available():
    # No CUDA device: point the user at the Colab runtime setting
    print("WARNING: No GPU detected. Please enable GPU runtime in Colab!")
else:
    # Only device 0 is inspected; total_memory is in bytes, shown as decimal GB
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Clone the repository from GitHub
# NOTE(review): presumably meant to be run once per runtime; re-running after
# the %cd below would likely clone into a nested directory - confirm.
!git clone https://github.com/CalebTalley2024/ARC-AGI-2.git
# Change into the clone: a later cell uses the current working directory as
# the project root.
%cd ARC-AGI-2
# Checkout specific branch for consistency
!git checkout vedant

In [None]:
# Install the package and dependencies
# Editable install of the project found in the current working directory
!pip install -e .
# Extra packages installed explicitly; presumably not covered by the project's
# own dependency list - TODO confirm
!pip install tqdm transformers

In [None]:
# Import necessary libraries
import sys
import os
from pathlib import Path

# Use the notebook's current working directory as the project root and make it
# importable.
project_root = Path.cwd()
# Guard against duplicates so that re-running this cell does not keep
# appending the same entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import the training entry point and the model configuration class
from arc.models.train import train
from arc.models.tiny_lm import TinyLMConfig

print("Successfully imported training modules")
print(f"Project root: {project_root}")

In [None]:
# List what is under <project_root>/data, descending one extra level into any
# second-level directory named "arc"
data_dir = project_root / "data"
print("Data directory contents:")
if not data_dir.exists():
    print("  WARNING: Data directory not found!")
else:
    for entry in data_dir.iterdir():
        print(f"  {entry.name}")
        if not entry.is_dir():
            continue
        for child in entry.iterdir():
            print(f"    {child.name}")
            # Only a directory called "arc" is expanded one level further
            if child.is_dir() and child.name == "arc":
                for grandchild in child.iterdir():
                    print(f"      {grandchild.name}")

# Locations where the training and evaluation task files are looked up
arc_root = data_dir / "raw" / "arc"
training_path = arc_root / "training"
eval_path = arc_root / "evaluation"

print(f"\nTraining data exists: {training_path.exists()}")
print(f"Evaluation data exists: {eval_path.exists()}")

# Count the JSON files in each split that is present
if training_path.exists():
    training_files = list(training_path.glob("*.json"))
    print(f"Number of training files: {len(training_files)}")

if eval_path.exists():
    eval_files = list(eval_path.glob("*.json"))
    print(f"Number of evaluation files: {len(eval_files)}")

## Training Configuration

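Configure the training parameters. Adjust these based on your needs and available GPU memory. Note that the cell below also adjusts `batch_size` automatically when it detects a GPU with less than 8 GB or at least 16 GB of memory.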

In [None]:
# Training configuration
TRAINING_CONFIG = {
    # Data paths
    "data_path": str(data_dir / "raw" / "arc" / "training"),
    "model_dir": "./models/tinylm_checkpoints",

    # Model parameters
    "d_model": 448,        # Model dimension - adjust for GPU memory

    # Training parameters
    "steps": 10_000,       # Number of training steps (reduce for testing)
    "batch_size": 16,      # Batch size - adjust for GPU memory
    "learning_rate": 3e-4, # Learning rate

    # Logging
    # NOTE(review): the training cell in this notebook does not forward this
    # value to train() - confirm whether it is meant to be used.
    "save_every": 1000,    # Save checkpoint every N steps
}

# Adapt the batch size to the detected GPU *before* printing, so that the
# configuration shown below is exactly what the training cell will use.
# The adjustment only ever moves in the advertised direction: a small GPU can
# lower the batch size (never raise a value the user already reduced), and a
# large GPU can raise it (never lower a value the user already increased).
if torch.cuda.is_available():
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    if gpu_memory < 8:
        TRAINING_CONFIG["batch_size"] = min(TRAINING_CONFIG["batch_size"], 8)
        print(
            f"WARNING: GPU has {gpu_memory:.1f}GB memory. "
            f"batch_size limited to {TRAINING_CONFIG['batch_size']} "
            "(reduce it further if you run out of memory)\n"
        )
    elif gpu_memory >= 16:
        TRAINING_CONFIG["batch_size"] = max(TRAINING_CONFIG["batch_size"], 32)
        print(
            f"SUCCESS: GPU has {gpu_memory:.1f}GB memory. "
            f"batch_size raised to {TRAINING_CONFIG['batch_size']}\n"
        )

print("Training Configuration:")
for key, value in TRAINING_CONFIG.items():
    print(f"  {key}: {value}")

In [None]:
# Make sure the checkpoint directory exists (parents included; a directory
# that is already there is not an error)
output_dir = Path(TRAINING_CONFIG["model_dir"])
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Created output directory: {output_dir}")

# Smoke test: build a model configuration with the configured width
print("\nTesting model creation...")
try:
    from arc.serialize import VOCAB_SIZE

    print(f"Vocabulary size: {VOCAB_SIZE}")

    # Only the configuration object is built here, not the model itself
    test_config = TinyLMConfig(
        vocab_size=VOCAB_SIZE,
        d_model=TRAINING_CONFIG["d_model"],
    )
    print(f"Model config created: {test_config.d_model}D model")

except Exception as exc:
    # Best-effort check: report the problem and let the notebook continue
    print(f"ERROR: Error creating model: {exc}")
    print("This might be due to missing implementations in serialize module")

## Start Training

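**Note:** The data-loading functions in the current implementation are placeholders (see `arc.io.load_task()` and `arc.serialize.pack_example()` in the next steps at the end of this notebook). This run therefore trains on empty data and only demonstrates that the training loop executes. You'll need to implement proper data loading for actual training.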

In [None]:
# Run the training loop and report how long it took
import time
import traceback

print("Starting TinyLM training...")
print(f"Model will be saved to: {TRAINING_CONFIG['model_dir']}")
print("=" * 50)

# Keyword arguments forwarded to train(); note the shortened names it expects
# for batch size (bs) and learning rate (lr)
train_kwargs = {
    "model_dir": TRAINING_CONFIG["model_dir"],
    "data_path": TRAINING_CONFIG["data_path"],
    "steps": TRAINING_CONFIG["steps"],
    "bs": TRAINING_CONFIG["batch_size"],
    "lr": TRAINING_CONFIG["learning_rate"],
    "d_model": TRAINING_CONFIG["d_model"],
}

start_time = time.time()

try:
    train(**train_kwargs)

    # Wall-clock duration of the train() call, in seconds
    end_time = time.time()
    training_time = end_time - start_time

    print("=" * 50)
    print("Training completed successfully!")
    print(f"Training time: {training_time/60:.1f} minutes")
    print(f"Models saved to: {TRAINING_CONFIG['model_dir']}")

except Exception as exc:
    # Print the error and full traceback instead of aborting the notebook
    print(f"Training failed with error: {exc}")
    traceback.print_exc()

In [None]:
# List the files produced by training and summarise the best checkpoint
import os

model_dir = Path(TRAINING_CONFIG["model_dir"])
if not model_dir.exists():
    print("No training output found!")
else:
    print("Training output files:")
    for entry in sorted(model_dir.iterdir()):
        if not entry.is_file():
            continue
        # File size in MiB (bytes / 1024^2), labelled "MB" in the output
        size_mb = entry.stat().st_size / (1024 * 1024)
        print(f"  {entry.name}: {size_mb:.1f} MB")

    # Show details of best.pt when it is present
    best_model = model_dir / "best.pt"
    if best_model.exists():
        print(f"\nBest model saved: {best_model}")
        try:
            import torch

            # Load on CPU so that inspecting the file does not need a GPU
            checkpoint = torch.load(best_model, map_location='cpu')
            # Both keys are optional; print whichever ones the checkpoint has
            if 'loss' in checkpoint:
                print(f"   Best loss: {checkpoint['loss']:.4f}")
            if 'cfg' in checkpoint:
                cfg = checkpoint['cfg']
                print(f"   Model config: {cfg}")
        except Exception as exc:
            # Inspection is best-effort; a bad checkpoint is only reported
            print(f"   Could not load model info: {exc}")

## Model Testing (Optional)

Run a single forward pass on random token IDs as a smoke test that the saved checkpoint loads and the model executes. This does not measure model quality.

In [None]:
# Smoke-test the best checkpoint: rebuild the model and run one forward pass
model_path = model_dir / "best.pt"

if not model_path.exists():
    print("No trained model found to test")
else:
    print("Testing the trained model...")

    try:
        # Load on CPU first; the model is moved to the GPU further down
        checkpoint = torch.load(model_path, map_location='cpu')

        from arc.models.tiny_lm import TinyLM, TinyLMConfig
        from arc.serialize import VOCAB_SIZE

        # Rebuild the architecture from the stored config, then restore weights.
        # Requires checkpoint['cfg'] to be a mapping of TinyLMConfig keyword args.
        cfg = TinyLMConfig(**checkpoint['cfg'])
        model = TinyLM(cfg)
        model.load_state_dict(checkpoint['model'])

        # Prefer the GPU when one is available
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = model.to(device)
        model.eval()

        n_params = sum(p.numel() for p in model.parameters())
        print(f"Model loaded successfully on {device}")
        print(f"Model parameters: {n_params:,}")

        # Random token IDs in [0, VOCAB_SIZE) stand in for a real batch
        batch_size, seq_len = 4, 16
        dummy_input = torch.randint(0, VOCAB_SIZE, (batch_size, seq_len)).to(device)

        # The code below treats the model output as a single tensor
        with torch.no_grad():
            output = model(dummy_input)
            print("Forward pass successful!")
            print(f"  Input shape: {dummy_input.shape}")
            print(f"  Output shape: {output.shape}")
            print(f"  Output range: [{output.min():.3f}, {output.max():.3f}]")

    except Exception as exc:
        # Report the failure with a traceback but keep the notebook running
        print(f"Model testing failed: {exc}")
        import traceback
        traceback.print_exc()

## Download Trained Models

Download the trained models to your local machine or save to Google Drive.

In [None]:
# Option 1: bundle the checkpoint files into a zip and download it (Colab only)
from google.colab import files
import zipfile

if not model_dir.exists():
    print("No models to download")
else:
    zip_path = "tinylm_models.zip"

    # Add every regular file directly inside model_dir, stored under its bare
    # file name; subdirectories are skipped
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for entry in model_dir.iterdir():
            if not entry.is_file():
                continue
            zipf.write(entry, entry.name)

    print(f"Created zip file: {zip_path}")
    print("Downloading...")
    files.download(zip_path)

In [None]:
# Option 2: Save to Google Drive (uncomment to use)
# from google.colab import drive
# drive.mount('/content/drive')
# 
# # Copy models to Google Drive
# import shutil
# drive_path = "/content/drive/MyDrive/TinyLM_Models"
# if model_dir.exists():
#     shutil.copytree(model_dir, drive_path, dirs_exist_ok=True)
#     print(f"Models saved to Google Drive: {drive_path}")

# Closing report: the settings that were used, followed by suggested follow-ups.
# Every entry is printed on its own line.
closing_report = (
    "Training notebook complete!",
    "\nSummary:",
    f"   • Model architecture: TinyLM with {TRAINING_CONFIG['d_model']} dimensions",
    f"   • Training steps: {TRAINING_CONFIG['steps']:,}",
    f"   • Batch size: {TRAINING_CONFIG['batch_size']}",
    f"   • Learning rate: {TRAINING_CONFIG['learning_rate']}",
    f"   • Models saved to: {TRAINING_CONFIG['model_dir']}",
    "\nNext steps:",
    "   • Implement proper data loading in arc.io.load_task()",
    "   • Implement tokenization in arc.serialize.pack_example()",
    "   • Run evaluation on the trained model",
    "   • Experiment with different hyperparameters",
)
print(*closing_report, sep="\n")