In [1]:
import os

In [4]:

# Reverse engineering: What vocab size fits in 10 GB for 100k steps?

max_storage_gb = 10
steps = 1_000 + 1  # Include step 0
hidden_dim = 64
bytes_per_value = 2  # bfloat16

# Convert GB to bytes
max_storage_bytes = max_storage_gb * (1024**3)

print(f"Target storage: {max_storage_gb} GB = {max_storage_bytes:,} bytes")
print(f"Steps to save: {steps:,}")
print(f"Hidden dim: {hidden_dim}")
print(f"Precision: bfloat16 ({bytes_per_value} bytes)")
print()

# Calculate max vocab size
# Total bytes = steps * vocab_size * hidden_dim * bytes_per_value
# Solving for vocab_size:
# vocab_size = total_bytes / (steps * hidden_dim * bytes_per_value)

max_vocab_size = max_storage_bytes / (steps * hidden_dim * bytes_per_value)

print(f"Maximum vocab size: {max_vocab_size:,.0f} tokens")
print()

# What does this look like per step?
bytes_per_step = max_vocab_size * hidden_dim * bytes_per_value
mb_per_step = bytes_per_step / (1024**2)

print(f"Storage per timestep: {mb_per_step:.2f} MB")
print()

# How does this compare to common tokenizers?
print("Common tokenizer sizes for reference:")
print(f"  GPT-2: 50,257 tokens")
print(f"  BERT: 30,522 tokens") 
print(f"  GPT-2 (our max): {int(max_vocab_size):,} tokens")
print()

# If we used GPT-2, how much would we actually need?
gpt2_vocab = 50_257
gpt2_storage_gb = (gpt2_vocab * hidden_dim * bytes_per_value * steps) / (1024**3)

print(f"If we used GPT-2 (50,257 tokens):")
print(f"  Storage needed: {gpt2_storage_gb:.2f} GB")
print()

# What if we sample every Nth step instead?
for sample_rate in [1, 10, 100]:
    sampled_steps = (steps - 1) // sample_rate + 1
    sampled_gb = (gpt2_vocab * hidden_dim * bytes_per_value * sampled_steps) / (1024**3)
    print(f"  Every {sample_rate:3d} step(s): {sampled_steps:6,} samples = {sampled_gb:6.2f} GB")


Target storage: 10 GB = 10,737,418,240 bytes
Steps to save: 1,001
Hidden dim: 64
Precision: bfloat16 (2 bytes)

Maximum vocab size: 83,802 tokens

Storage per timestep: 10.23 MB

Common tokenizer sizes for reference:
  GPT-2: 50,257 tokens
  BERT: 30,522 tokens
  GPT-2 (our max): 83,802 tokens

If we used GPT-2 (50,257 tokens):
  Storage needed: 6.00 GB

  Every   1 step(s):  1,001 samples =   6.00 GB
  Every  10 step(s):    101 samples =   0.61 GB
  Every 100 step(s):     11 samples =   0.07 GB


In [5]:

# Wordybird memory requirements

import torch

# Model architecture (same as Lil Gatsby except vocab)
vocab_size = 50_257  # GPT-2
hidden_dim = 64
n_layers = 4
n_heads = 4
seq_length = 128

print("=" * 70)
print("WORDYBIRD MEMORY REQUIREMENTS")
print("=" * 70)
print()

# E and W matrices (the big ones)
e_w_params = 2 * vocab_size * hidden_dim  # E and W
e_w_bytes_fp32 = e_w_params * 4  # float32
e_w_bytes_bf16 = e_w_params * 2  # bfloat16
e_w_mb_fp32 = e_w_bytes_fp32 / (1024**2)
e_w_mb_bf16 = e_w_bytes_bf16 / (1024**2)

print(f"Embedding matrices (E + W):")
print(f"  Parameters: {e_w_params:,}")
print(f"  FP32: {e_w_mb_fp32:.1f} MB")
print(f"  BF16: {e_w_mb_bf16:.1f} MB")
print()

# Rest of the model (transformer layers, etc.)
# Rough estimate: attention (QKV + output) + FFN
params_per_layer = (
    4 * hidden_dim * hidden_dim +  # QKV + output projection
    2 * hidden_dim * (4 * hidden_dim)  # FFN (typically 4x expansion)
)
other_params = n_layers * params_per_layer
other_mb_fp32 = (other_params * 4) / (1024**2)

print(f"Transformer layers ({n_layers} layers):")
print(f"  Parameters: {other_params:,}")
print(f"  FP32: {other_mb_fp32:.1f} MB")
print()

total_params = e_w_params + other_params
total_mb_fp32 = e_w_mb_fp32 + other_mb_fp32

print(f"Total model:")
print(f"  Parameters: {total_params:,}")
print(f"  FP32: {total_mb_fp32:.1f} MB")
print()

print("=" * 70)
print("TRAINING MEMORY (per batch)")
print("=" * 70)
print()

# Activations scale with batch_size
def estimate_memory(batch_size):
    # Model parameters (fixed)
    model_mb = e_w_mb_fp32 + other_mb_fp32
    
    # Activations (rough estimate)
    # Each layer stores: input, attention weights, FFN intermediate
    activation_elements = batch_size * seq_length * hidden_dim * n_layers * 3
    activation_mb = (activation_elements * 4) / (1024**2)  # fp32
    
    # Gradients (same size as parameters during training)
    gradient_mb = model_mb
    
    # Optimizer state (AdamW stores 2 moments per parameter)
    optimizer_mb = model_mb * 2
    
    # Total
    total_mb = model_mb + activation_mb + gradient_mb + optimizer_mb
    
    return {
        'model': model_mb,
        'activations': activation_mb,
        'gradients': gradient_mb,
        'optimizer': optimizer_mb,
        'total': total_mb
    }

for batch_size in [1, 8, 16, 32, 64]:
    mem = estimate_memory(batch_size)
    print(f"Batch size {batch_size:2d}:")
    print(f"  Model:       {mem['model']:7.1f} MB")
    print(f"  Activations: {mem['activations']:7.1f} MB")
    print(f"  Gradients:   {mem['gradients']:7.1f} MB")
    print(f"  Optimizer:   {mem['optimizer']:7.1f} MB")
    print(f"  Total:       {mem['total']:7.1f} MB ({mem['total']/1024:.2f} GB)")
    print()

print("=" * 70)
print(f"Your M4 Pro has 48 GB RAM")
print(f"Safe budget: ~24 GB for training (leaving room for OS/other processes)")
print("=" * 70)


WORDYBIRD MEMORY REQUIREMENTS

Embedding matrices (E + W):
  Parameters: 6,432,896
  FP32: 24.5 MB
  BF16: 12.3 MB

Transformer layers (4 layers):
  Parameters: 196,608
  FP32: 0.8 MB

Total model:
  Parameters: 6,629,504
  FP32: 25.3 MB

TRAINING MEMORY (per batch)

Batch size  1:
  Model:          25.3 MB
  Activations:     0.4 MB
  Gradients:      25.3 MB
  Optimizer:      50.6 MB
  Total:         101.5 MB (0.10 GB)

Batch size  8:
  Model:          25.3 MB
  Activations:     3.0 MB
  Gradients:      25.3 MB
  Optimizer:      50.6 MB
  Total:         104.2 MB (0.10 GB)

Batch size 16:
  Model:          25.3 MB
  Activations:     6.0 MB
  Gradients:      25.3 MB
  Optimizer:      50.6 MB
  Total:         107.2 MB (0.10 GB)

Batch size 32:
  Model:          25.3 MB
  Activations:    12.0 MB
  Gradients:      25.3 MB
  Optimizer:      50.6 MB
  Total:         113.2 MB (0.11 GB)

Batch size 64:
  Model:          25.3 MB
  Activations:    24.0 MB
  Gradients:      25.3 MB
  Optimizer:   