# 12.2c: Generate Synthetic Snowball Dataset

**Goal:** Generate 1,000 synthetic snowballs at σ = 1.5×10⁻⁹ and save to disk for reusable analysis.

## Rationale

Generate once, analyze many ways:
- Topology analysis (12.3c)
- Population statistics (12.3b)
- Future questions we haven't thought of yet

This notebook streams data directly to disk using HDF5, keeping peak RAM usage at ~2.6 GB instead of 10+ GB.

## Parameters

In [None]:
# Experiment parameters
SIGMA = 1.5e-9         # Initialization noise scale
N_TRIALS = 10000       # Number of independent snowballs to generate
N_TOKENS = 2100        # Qwen's dead token count
HIDDEN_DIM = 2560      # Qwen's embedding dimension
BATCH_SIZE = 256       # Trials per batch (memory constraint)

# Output
OUTPUT_FILE = "../data/tensors/synthetic_snowballs_n10000_sigma1.5e-9.h5"

RANDOM_SEED = 42

## Imports

In [9]:
import torch
import numpy as np
import h5py
from safetensors.torch import load_file
from tqdm.auto import tqdm
from pathlib import Path
import time
import gc

torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Setup device
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("✓ MPS (Metal Performance Shaders) available")
else:
    device = torch.device('cpu')
    print("⚠ MPS not available, using CPU only")

print(f"  Device: {device}")

✓ MPS (Metal Performance Shaders) available
  Device: mps


## Load Qwen Centroid

In [10]:
print("\nLoading Qwen black hole centroid...\n")

centroid_data = load_file("../data/tensors/black_hole_centroid_qwen3_4b.safetensors")
qwen_centroid = centroid_data['centroid'].to(torch.float32).to(device)

print(f"✓ Centroid loaded to {device}")
print(f"  Shape: {qwen_centroid.shape}")
print(f"  Norm: {qwen_centroid.norm().item():.6f}")


Loading Qwen black hole centroid...

✓ Centroid loaded to mps
  Shape: torch.Size([2560])
  Norm: 0.166061


## Create HDF5 File and Stream Snowballs to Disk

In [11]:
print(f"\nGenerating {N_TRIALS:,} synthetic snowballs (streaming to disk)...")
print(f"  σ = {SIGMA:.2e}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Shape per trial: [{N_TOKENS}, {HIDDEN_DIM}]")
print(f"  Total shape: [{N_TRIALS}, {N_TOKENS}, {HIDDEN_DIM}]")

# Calculate memory
total_elements = N_TRIALS * N_TOKENS * HIDDEN_DIM
total_mb = total_elements * 2 / (1024 * 1024)  # float16 = 2 bytes
print(f"  Total dataset size: {total_mb:.1f} MB ({total_mb/1024:.2f} GB)")
print(f"  Peak RAM usage: ~{BATCH_SIZE * N_TOKENS * HIDDEN_DIM * 4 / (1024**2):.1f} MB (one batch in float32)\n")

output_path = Path(OUTPUT_FILE)
output_path.parent.mkdir(parents=True, exist_ok=True)

n_batches = (N_TRIALS + BATCH_SIZE - 1) // BATCH_SIZE

start_time = time.time()

# Create HDF5 file with chunked dataset (allows incremental writes)
with h5py.File(str(output_path), 'w') as f:
    # Create dataset with chunking (one batch per chunk for efficient I/O)
    dataset = f.create_dataset(
        'embeddings',
        shape=(N_TRIALS, N_TOKENS, HIDDEN_DIM),
        dtype='float16',
        chunks=(BATCH_SIZE, N_TOKENS, HIDDEN_DIM),
        compression='gzip',
        compression_opts=1  # Light compression for speed
    )
    
    # Save centroid as separate dataset
    f.create_dataset('centroid', data=qwen_centroid.cpu().numpy().astype('float32'))
    
    # Save metadata as attributes
    f.attrs['sigma'] = SIGMA
    f.attrs['n_trials'] = N_TRIALS
    f.attrs['n_tokens'] = N_TOKENS
    f.attrs['hidden_dim'] = HIDDEN_DIM
    f.attrs['random_seed'] = RANDOM_SEED
    f.attrs['description'] = 'Synthetic snowballs: centroid + Gaussian(0, sigma) quantized to bfloat16'
    
    # Generate and write batches
    for batch_idx in tqdm(range(n_batches), desc="Generating & writing batches"):
        # Determine batch size (last batch might be smaller)
        start_idx = batch_idx * BATCH_SIZE
        end_idx = min(start_idx + BATCH_SIZE, N_TRIALS)
        current_batch_size = end_idx - start_idx
        
        # Generate batch on MPS
        noise = torch.randn(current_batch_size, N_TOKENS, HIDDEN_DIM, dtype=torch.float32, device=device) * SIGMA
        embeddings_batch = qwen_centroid.unsqueeze(0).unsqueeze(0) + noise
        
        # Quantize to bfloat16 then back to float32 (simulates bfloat16 storage)
        embeddings_batch = embeddings_batch.to(torch.bfloat16).to(torch.float32)
        
        # Write directly to disk (no RAM accumulation!)
        dataset[start_idx:end_idx] = embeddings_batch.cpu().numpy().astype('float16')
        
        # Cleanup
        del noise, embeddings_batch
        gc.collect()

gen_time = time.time() - start_time

print(f"\n✓ Generated and saved {N_TRIALS:,} snowballs in {gen_time:.1f}s")
print(f"  File: {output_path}")
print(f"  Size: {output_path.stat().st_size / (1024**2):.1f} MB")


Generating 10,000 synthetic snowballs (streaming to disk)...
  σ = 1.50e-09
  Batch size: 256
  Shape per trial: [2100, 2560]
  Total shape: [10000, 2100, 2560]
  Total dataset size: 102539.1 MB (100.14 GB)
  Peak RAM usage: ~5250.0 MB (one batch in float32)



Generating & writing batches:   0%|          | 0/40 [00:00<?, ?it/s]


✓ Generated and saved 10,000 snowballs in 217.2s
  File: ../data/tensors/synthetic_snowballs_n1000_sigma1.5e-9.h5
  Size: 3970.2 MB


## Verification

In [12]:
print(f"\nVerifying saved file...")

with h5py.File(str(output_path), 'r') as f:
    print(f"\n✓ File loads successfully")
    print(f"  Datasets: {list(f.keys())}")
    print(f"  Embeddings shape: {f['embeddings'].shape}")
    print(f"  Embeddings dtype: {f['embeddings'].dtype}")
    print(f"  Centroid shape: {f['centroid'].shape}")
    print(f"\nMetadata:")
    for key, value in f.attrs.items():
        print(f"  {key}: {value}")


Verifying saved file...

✓ File loads successfully
  Datasets: ['centroid', 'embeddings']
  Embeddings shape: (10000, 2100, 2560)
  Embeddings dtype: float16
  Centroid shape: (2560,)

Metadata:
  description: Synthetic snowballs: centroid + Gaussian(0, sigma) quantized to bfloat16
  hidden_dim: 2560
  n_tokens: 2100
  n_trials: 10000
  random_seed: 42
  sigma: 1.5e-09


## Sanity Check

In [13]:
print(f"\nSanity check on first trial...")

with h5py.File(str(output_path), 'r') as f:
    # Load just first trial (efficient, doesn't load whole dataset!)
    trial_0 = torch.from_numpy(f['embeddings'][0]).to(torch.float32)

# Find unique vectors
unique_vectors, inverse_indices, counts = torch.unique(
    trial_0,
    dim=0,
    return_inverse=True,
    return_counts=True
)

# Black holes
black_hole_mask = counts >= 2
n_black_holes = black_hole_mask.sum().item()
black_hole_population = counts[black_hole_mask].sum().item()
n_unique = len(unique_vectors)
n_singletons = n_unique - n_black_holes

print(f"\nTrial 0 statistics:")
print(f"  Total tokens: {len(trial_0)}")
print(f"  Unique vectors: {n_unique}")
print(f"  Black holes (C): {n_black_holes}")
print(f"  Black hole population (P): {black_hole_population}")
print(f"  Singletons: {n_singletons}")

if 10 <= n_black_holes <= 17 and 2090 <= black_hole_population <= 2100:
    print(f"\n✓ Results look reasonable (matches expected ranges)")
else:
    print(f"\n⚠ Results outside expected ranges - check generation")


Sanity check on first trial...

Trial 0 statistics:
  Total tokens: 2100
  Unique vectors: 10
  Black holes (C): 10
  Black hole population (P): 2100
  Singletons: 0

✓ Results look reasonable (matches expected ranges)


## Summary

In [14]:
file_size_mb = output_path.stat().st_size / (1024 * 1024)

print(f"\n{'='*60}")
print(f"DATASET GENERATION COMPLETE")
print(f"{'='*60}")
print(f"Trials: {N_TRIALS:,}")
print(f"Tokens per trial: {N_TOKENS:,}")
print(f"Dimensions: {HIDDEN_DIM:,}")
print(f"σ = {SIGMA:.2e}")
print(f"\nFile: {output_path}")
print(f"Size: {file_size_mb:.1f} MB ({file_size_mb/1024:.2f} GB)")
print(f"\nGeneration time: {gen_time:.1f}s")
print(f"\nUsage in other notebooks:")
print(f"  import h5py")
print(f"  import torch")
print(f"  with h5py.File('{OUTPUT_FILE}', 'r') as f:")
print(f"      trial_42 = torch.from_numpy(f['embeddings'][42])")
print(f"      subset = torch.from_numpy(f['embeddings'][:100])")
print(f"      centroid = torch.from_numpy(f['centroid'][:])")
print(f"      sigma = f.attrs['sigma']")
print(f"{'='*60}")


DATASET GENERATION COMPLETE
Trials: 10,000
Tokens per trial: 2,100
Dimensions: 2,560
σ = 1.50e-09

File: ../data/tensors/synthetic_snowballs_n1000_sigma1.5e-9.h5
Size: 3970.2 MB (3.88 GB)

Generation time: 217.2s

Usage in other notebooks:
  import h5py
  import torch
  with h5py.File('../data/tensors/synthetic_snowballs_n1000_sigma1.5e-9.h5', 'r') as f:
      trial_42 = torch.from_numpy(f['embeddings'][42])
      subset = torch.from_numpy(f['embeddings'][:100])
      centroid = torch.from_numpy(f['centroid'][:])
      sigma = f.attrs['sigma']
