# Frozen Smoke Census

**Purpose:** Complete population counts and classification of all vectors in the frozen smoke.

---

## Classification Scheme

| Class | Definition | Character |
|-------|------------|----------|
| **Black Hole** | Vector with count > 1 | Multiple tokens collapsed to same point |
| **Singleton** | Vector with count = 1 | One token, unique position |

We further subdivide by spatial location (core vs. Oort Cloud, split by L2 distance from the most populous black hole) and connectivity.

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from safetensors.torch import load_file
from pathlib import Path
from transformers import AutoTokenizer

# --- Configuration ---
# Directory holding the saved tensors for this model, relative to the notebook.
DATA_DIR = Path("../../../tensors/Qwen3-4B-Instruct-2507")

# --- Tensors ---
# Unembedding matrix. The stored 'W' tensor is reinterpreted as bfloat16 with
# .view(), which relabels the dtype without converting any values.
unembed_tensors = load_file(DATA_DIR / "W_unembed.safetensors")
W = unembed_tensors['W'].view(torch.bfloat16)

# Token masks; 'neighborhood_mask' selects the rows of W studied in this notebook.
masks = load_file(DATA_DIR / "masks.safetensors")
neighborhood_mask = masks['neighborhood_mask']

# --- Tokenizer (only used to decode token IDs for display) ---
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")

# Sanity report: number of rows in W and number of selected tokens.
vocab_size = len(W)
n_neighborhood = neighborhood_mask.sum().item()
print(f"Total vocabulary: {vocab_size:,}")
print(f"Neighborhood tokens: {n_neighborhood:,}")

Total vocabulary: 151,936
Neighborhood tokens: 2,212


In [2]:
# Row indices of W selected by the mask, and the corresponding vectors.
neighborhood_indices = neighborhood_mask.nonzero(as_tuple=True)[0]
W_neighborhood = W[neighborhood_indices]

# Deduplicate rows on their exact bit patterns rather than on float values:
# the bfloat16 storage is viewed as uint16 and widened to int32 before the call
# (presumably because torch.unique does not handle uint16 -- TODO confirm).
neighborhood_bits = W_neighborhood.view(torch.uint16).to(torch.int32)
unique_bits, inverse_indices, counts = torch.unique(
    neighborhood_bits, dim=0, return_inverse=True, return_counts=True
)
# unique_bits[k]      -> bit pattern of the k-th distinct vector
# inverse_indices[j]  -> which distinct vector neighborhood token j maps to
# counts[k]           -> how many neighborhood tokens share distinct vector k

n_unique = unique_bits.shape[0]
n_tokens = neighborhood_mask.sum().item()

print(
    f"Unique vectors: {n_unique}",
    f"Tokens: {n_tokens:,}",
    f"Compression ratio: {n_tokens / n_unique:.1f}x",
    sep="\n",
)

Unique vectors: 125
Tokens: 2,212
Compression ratio: 17.7x


## Black Hole Census

In [3]:
# Identify black holes: unique vectors shared by more than one token.
black_hole_mask = counts > 1
black_hole_indices = torch.where(black_hole_mask)[0]
n_black_holes = len(black_hole_indices)

print(f"Black holes: {n_black_holes}")
print(f"Total tokens in black holes: {counts[black_hole_mask].sum().item():,}")
print()

# Number of member tokens shown per black hole in the table.
N_SAMPLE_TOKENS = 3

# Explicit column list so the frame still has a 'Tokens' column to sort on
# when there are no black holes at all (an empty list of records would
# otherwise produce a column-less frame and sort_values would raise KeyError).
BH_COLUMNS = ['BH', 'Vector Idx', 'Tokens', 'Sample Token IDs', 'Sample Decoded']

# Create black hole table.
# NOTE: the BH<i> label is assigned in unique-vector order, *before* the table
# is sorted by population, so BH0 is not necessarily the largest black hole.
bh_data = []
for i, bh_idx in enumerate(black_hole_indices):
    bh_count = counts[bh_idx].item()

    # Token IDs of every neighborhood token mapped to this unique vector.
    # One boolean-mask index replaces a per-token Python loop; the order of
    # neighborhood_indices is preserved.
    token_indices = neighborhood_indices[inverse_indices == bh_idx].tolist()

    # Sample tokens for display (first few members only).
    sample_tokens = token_indices[:N_SAMPLE_TOKENS]
    sample_decoded = [repr(tokenizer.decode([t])) for t in sample_tokens]

    bh_data.append({
        'BH': f'BH{i}',
        'Vector Idx': bh_idx.item(),
        'Tokens': bh_count,
        'Sample Token IDs': str(sample_tokens),
        'Sample Decoded': ', '.join(sample_decoded)
    })

# Largest populations first.
bh_df = pd.DataFrame(bh_data, columns=BH_COLUMNS)
bh_df = bh_df.sort_values('Tokens', ascending=False).reset_index(drop=True)
print(bh_df.to_string(index=False))

Black holes: 13
Total tokens in black holes: 2,100

  BH  Vector Idx  Tokens         Sample Token IDs               Sample Decoded
 BH4          40     814  [80091, 119346, 119348]             '２０', '珊�', '珊�'
 BH6          61     704          [125, 177, 178]                '�', '�', '�'
BH10          75     306    [124, 123876, 123948]               '�', '�', 'ติ'
 BH3          39     228 [124350, 124658, 125147]          'เรีย', 'ย์', 'ติด'
 BH0          25      11 [123939, 131955, 131957]          'ร์', 'ฟุต', 'ฝ่าย'
 BH9          71      10 [119349, 125087, 126630]          '珊󠄁', 'ช่วย', 'จัง'
 BH1          26       6 [126268, 132713, 138041]        'สี', 'ประกัน', 'ซี่'
BH11          79       5 [132383, 132398, 139050]        'เย็น', 'นี', 'เพิ่ง'
 BH5          43       4 [135619, 138490, 140815] 'เปอร์', 'ร่วมกัน', 'ที่น่า'
BH12          80       4 [136831, 138068, 138072]      'สติ', 'ể', ' จากนั้น'
 BH2          34       3 [126775, 140303, 147056]            'รัฐ', 'ซ้ำ', '切'


## Singleton Census

In [4]:
# Identify singletons: unique vectors that belong to exactly one token.
singleton_mask = counts == 1
singleton_indices = torch.where(singleton_mask)[0]
n_singletons = len(singleton_indices)

print(f"Singletons: {n_singletons}")
print()

# Tally each singleton's token into exactly one category.
# Precedence is Thai > CJK > special > other: a token whose decoded text
# contains at least one Thai character counts as Thai even if it also
# contains CJK characters, and so on down the chain.
thai_count = 0
cjk_count = 0
special_count = 0
other_count = 0

for s_idx in singleton_indices:
    # A singleton has exactly one member, so the first match is the token.
    token_idx = neighborhood_indices[(inverse_indices == s_idx).nonzero()[0]].item()
    decoded = tokenizer.decode([token_idx])

    # Thai block: U+0E00-U+0E7F.
    is_thai = any('\u0e00' <= c <= '\u0e7f' for c in decoded)
    # CJK Unified Ideographs (U+4E00-U+9FFF) plus Extension A (U+3400-U+4DBF).
    is_cjk = any('\u4e00' <= c <= '\u9fff' or '\u3400' <= c <= '\u4dbf' for c in decoded)
    # Control-style tokens written as <|...|>.
    is_special = decoded.startswith('<|') and decoded.endswith('|>')

    if is_thai:
        thai_count += 1
    elif is_cjk:
        cjk_count += 1
    elif is_special:
        special_count += 1
    else:
        other_count += 1

# Guard the percentage denominator: with no singletons every count is 0, so
# dividing by 1 reports 0.0% instead of raising ZeroDivisionError.
pct_denominator = max(n_singletons, 1)

print(f"Singleton demographics:")
print(f"  Thai: {thai_count} ({thai_count/pct_denominator*100:.1f}%)")
print(f"  CJK: {cjk_count} ({cjk_count/pct_denominator*100:.1f}%)")
print(f"  Special: {special_count} ({special_count/pct_denominator*100:.1f}%)")
print(f"  Other: {other_count} ({other_count/pct_denominator*100:.1f}%)")

Singletons: 112

Singleton demographics:
  Thai: 85 (75.9%)
  CJK: 1 (0.9%)
  Special: 2 (1.8%)
  Other: 24 (21.4%)


## Spatial Distribution: Core vs Oort Cloud

In [5]:
# Turn the deduplicated bit patterns back into bfloat16 values, then widen to
# float32 so the distance arithmetic is not done in bfloat16.
unique_bf16 = unique_bits.to(torch.uint16).view(torch.bfloat16)
W_unique = unique_bf16.float()

# The "center" is the most populous unique vector (the biggest black hole);
# r[k] is the L2 distance of unique vector k from it.
center_idx = counts.argmax().item()
offsets = W_unique - W_unique[center_idx]
r = offsets.norm(dim=1).numpy()

# Radius separating core from Oort Cloud, chosen as the distance
# where density (tokens per vector) drops to 1.
CORE_BOUNDARY = 0.00005

core_mask = r < CORE_BOUNDARY
oort_mask = r >= CORE_BOUNDARY

# Per-region totals: number of unique vectors, and number of tokens on them.
counts_np = counts.numpy()
core_vectors = core_mask.sum()
oort_vectors = oort_mask.sum()
core_tokens = counts_np[core_mask].sum()
oort_tokens = counts_np[oort_mask].sum()

report_lines = [
    f"CORE (L2 < {CORE_BOUNDARY}):",
    f"  Vectors: {core_vectors}",
    f"  Tokens: {core_tokens:,}",
    f"  Density: {core_tokens/core_vectors:.1f}",
    "",
    f"OORT CLOUD (L2 >= {CORE_BOUNDARY}):",
    f"  Vectors: {oort_vectors}",
    f"  Tokens: {oort_tokens}",
    f"  Density: {oort_tokens/oort_vectors:.1f}",
]
print("\n".join(report_lines))

CORE (L2 < 5e-05):
  Vectors: 75
  Tokens: 2,159
  Density: 28.8

OORT CLOUD (L2 >= 5e-05):
  Vectors: 50
  Tokens: 53
  Density: 1.1


In [6]:
# 2x2 contingency table of unique vectors: population class (black hole /
# singleton) against spatial region (core / Oort Cloud).
is_black_hole = black_hole_mask.numpy()
is_singleton = singleton_mask.numpy()

bh_core = np.logical_and(is_black_hole, core_mask).sum()
bh_oort = np.logical_and(is_black_hole, oort_mask).sum()
single_core = np.logical_and(is_singleton, core_mask).sum()
single_oort = np.logical_and(is_singleton, oort_mask).sum()

# (row label, core, oort, row total); the last row holds the column totals.
crosstab_rows = [
    ('Black Holes', bh_core, bh_oort, bh_core + bh_oort),
    ('Singletons', single_core, single_oort, single_core + single_oort),
    ('Total', core_vectors, oort_vectors, n_unique),
]

print("\nCross-tabulation (vector counts):")
print(f"{'':>15} {'Core':>10} {'Oort':>10} {'Total':>10}")
for row_label, n_core, n_oort, n_row in crosstab_rows:
    print(f"{row_label:>15} {n_core:>10} {n_oort:>10} {n_row:>10}")


Cross-tabulation (vector counts):
                      Core       Oort      Total
    Black Holes         12          1         13
     Singletons         63         49        112
          Total         75         50        125


## The Outlier

The most distant vector in our selection, measured by L2 distance from the center (the most populous black hole)—sitting at the L∞ = 5 exponent boundary.

In [7]:
# Find The Outlier: the unique vector with the largest L2 distance r from the center.
# int() turns numpy's integer scalar into a plain Python int before it is used
# to index and compare against torch tensors.
outlier_idx = int(np.argmax(r))
outlier_dist = r[outlier_idx]
outlier_count = counts[outlier_idx].item()

# Find its token. Only the first token mapped to this vector is reported;
# if outlier_count > 1 the remaining members are not shown.
outlier_token = neighborhood_indices[(inverse_indices == outlier_idx).nonzero()[0]].item()
outlier_decoded = tokenizer.decode([outlier_token])

print(f"THE OUTLIER")
print(f"  Vector index: {outlier_idx}")
print(f"  L2 distance from center: {outlier_dist:.6f}")
print(f"  Token count: {outlier_count}")
print(f"  Token ID: {outlier_token}")
print(f"  Decoded: {repr(outlier_decoded)}")

# Check for Thai/CJK, using the same ranges as the singleton census:
# Thai U+0E00-U+0E7F; CJK Unified Ideographs U+4E00-U+9FFF plus
# Extension A U+3400-U+4DBF (previously missing here, so the two cells could
# disagree on the same token).
is_thai = any('\u0e00' <= c <= '\u0e7f' for c in outlier_decoded)
is_cjk = any('\u4e00' <= c <= '\u9fff' or '\u3400' <= c <= '\u4dbf' for c in outlier_decoded)
print(f"  Thai: {is_thai}, CJK: {is_cjk}")

THE OUTLIER
  Vector index: 0
  L2 distance from center: 0.005495
  Token count: 1
  Token ID: 27487
  Decoded: '��取'
  Thai: False, CJK: True


## Summary Table

In [8]:
# Assemble the whole report as a list of lines and emit it with one print.
# Every figure comes from variables computed in the earlier cells.
banner = "=" * 60
bh_populations = counts[black_hole_mask]

summary_lines = [
    banner,
    "FROZEN SMOKE CENSUS SUMMARY",
    banner,
    "",
    f"Total tokens in selection: {n_tokens:,}",
    f"Unique vectors: {n_unique}",
    "",
    f"BLACK HOLES: {n_black_holes}",
    f"  Tokens: {bh_populations.sum().item():,}",
    # Populations listed largest first.
    f"  Populations: {sorted(bh_populations.tolist(), reverse=True)}",
    "",
    f"SINGLETONS: {n_singletons}",
    "",
    f"CORE (L2 < {CORE_BOUNDARY}): {core_vectors} vectors, {core_tokens:,} tokens",
    f"OORT CLOUD (L2 >= {CORE_BOUNDARY}): {oort_vectors} vectors, {oort_tokens} tokens",
    "",
    f"THE OUTLIER: Token {outlier_token}, {repr(outlier_decoded)}, r = {outlier_dist:.6f}",
]
print("\n".join(summary_lines))

FROZEN SMOKE CENSUS SUMMARY

Total tokens in selection: 2,212
Unique vectors: 125

BLACK HOLES: 13
  Tokens: 2,100
  Populations: [814, 704, 306, 228, 11, 10, 6, 5, 4, 4, 3, 3, 2]

SINGLETONS: 112

CORE (L2 < 5e-05): 75 vectors, 2,159 tokens
OORT CLOUD (L2 >= 5e-05): 50 vectors, 53 tokens

THE OUTLIER: Token 27487, '��取', r = 0.005495
