In [1]:
!pixi run CUDA_VISIBLE_DEVICES=1 pixi run python ./experiments/hybrid_growth_experiment.py --load-model ./data/promising_models/20250707_020152/model_cifar10_3layers_seed9_acc0.47_patch0.345_sparse0.050_BEST_ACCURACY_GLOBAL.pt --evolution-steps 3 --sort-frequency 2

[2K[32m⠁[0m activating environment                                                                 [32m⠁[0m                                                                               Files already downloaded and verified
Files already downloaded and verified
🔄 Neuron sorting: Enabled (every 2 steps)
🔬 Loading pretrained model from: ./data/promising_models/20250707_020152/model_cifar10_3layers_seed9_acc0.47_patch0.345_sparse0.050_BEST_ACCURACY_GLOBAL.pt
  checkpoint = torch.load(checkpoint_path, map_location=device)
   🎯 Using sparsity from checkpoint: 0.05
   🏗️  Architecture: [3072, 2048, 10]
   📊 Sparsity: 0.05
   🔍 Available state dict keys:
      0.linear.bias: torch.Size([2048])
      0.linear.weight: torch.Size([2048, 3072])
      0.mask: torch.Size([2048, 3072])
      2.linear.bias: torch.Size([10])
      2.linear.weight: torch.Size([10, 2048])
      2.mask: torch.Size([10, 2048])
   ✅ Loading pretrained weights for layer 0 (keys '0.linear.weight', '0.linear.bias')
     

In [1]:
import torch
import numpy as np

import os
def setup_correct_device(physical_gpu_id):
    """
    Correctly selects a physical GPU and makes it the ONLY one visible to PyTorch.
    This function preserves the mapping you expect.
    """
    # 1. Set the environment variable using the physical ID.
    #    This MUST be done before any torch.cuda calls.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(physical_gpu_id)

    # 2. Now that PyTorch only sees one GPU, it will always be 'cuda:0'.
    if torch.cuda.is_available():
        # Get the name to confirm we got the right one.
        gpu_name = torch.cuda.get_device_name(0) 
        print(f"✅ Success! Physical GPU {physical_gpu_id} ('{gpu_name}') is now active as PyTorch device 'cuda:0'.")
        return torch.device("cuda:0")
    else:
        print("⚠️ CUDA not available. Falling back to CPU.")
        return torch.device("cpu")

device = setup_correct_device(1)

def inspect_checkpoint(checkpoint_path):
    """Comprehensive checkpoint inspector"""
    print(f"\n🔍 Inspecting: {checkpoint_path}")
    print("=" * 60)
    
    # Load checkpoint
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    
    # 1. Basic info
    print("\n📋 Checkpoint Keys:")
    for key in checkpoint.keys():
        if isinstance(checkpoint[key], torch.Tensor):
            print(f"  {key}: {checkpoint[key].shape}")
        else:
            print(f"  {key}: {checkpoint[key]}")
    
    # 2. Model state dict analysis
    if 'model_state_dict' in checkpoint:
        print("\n🏗️  Model Architecture:")
        state_dict = checkpoint['model_state_dict']
        
        # Analyze layers
        layers = {}
        for key, tensor in state_dict.items():
            layer_name = key.split('.')[0]
            if layer_name not in layers:
                layers[layer_name] = {}
            layers[layer_name][key] = tensor
        
        for layer_name in sorted(layers.keys()):
            print(f"\n  Layer {layer_name}:")
            for key, tensor in layers[layer_name].items():
                print(f"    {key}: {tensor.shape}")
                
                # Statistics for weights
                if 'weight' in key:
                    non_zero = (tensor != 0).sum().item()
                    total = tensor.numel()
                    sparsity = 1 - (non_zero / total)
                    print(f"      Non-zero: {non_zero}/{total} (sparsity: {sparsity:.2%})")
                    print(f"      Stats: mean={tensor.mean():.4f}, std={tensor.std():.4f}")
                    print(f"      Range: [{tensor.min():.4f}, {tensor.max():.4f}]")
    
    # 3. Training info
    print("\n📊 Training Info:")
    for key in ['accuracy', 'epoch', 'loss', 'sparsity']:
        if key in checkpoint:
            print(f"  {key}: {checkpoint[key]}")
    
    # 4. Architecture if available
    if 'architecture' in checkpoint:
        print(f"\n🏛️  Architecture: {checkpoint['architecture']}")
    
    # 5. Any optimizer state
    if 'optimizer_state_dict' in checkpoint:
        print("\n📈 Optimizer state: Present")
    
    print("=" * 60)

# Quick usage function
def quick_inspect(path):
    """Just the essentials"""
    ckpt = torch.load(path, map_location='cpu')
    print(f"\n{path.split('/')[-1]}:")
    print(f"  Keys: {list(ckpt.keys())}")
    if 'accuracy' in ckpt:
        print(f"  Accuracy: {ckpt['accuracy']}")
    if 'architecture' in ckpt:
        print(f"  Architecture: {ckpt['architecture']}")
    if 'model_state_dict' in ckpt:
        total_params = sum(p.numel() for p in ckpt['model_state_dict'].values() if 'weight' in p)
        print(f"  Total parameters: {total_params:,}")

✅ Success! Physical GPU 1 ('NVIDIA GeForce RTX 2060 SUPER') is now active as PyTorch device 'cuda:0'.


In [2]:
checkpoint_path = "data/promising_models/20250707_020152/model_cifar10_3layers_seed9_acc0.47_patch0.345_sparse0.050_BEST_ACCURACY_GLOBAL.pt"
inspect_checkpoint(checkpoint_path)


🔍 Inspecting: data/promising_models/20250707_020152/model_cifar10_3layers_seed9_acc0.47_patch0.345_sparse0.050_BEST_ACCURACY_GLOBAL.pt

📋 Checkpoint Keys:
  model_state_dict: OrderedDict([('0.mask', tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])), ('0.linear.weight', tensor([[0., -0., -0.,  ..., 0., -0., 0.],
        [0., -0., -0.,  ..., 0., 0., -0.],
        [-0., -0., -0.,  ..., -0., -0., -0.],
        ...,
        [-0., -0., 0.,  ..., -0., -0., -0.],
        [-0., -0., 0.,  ..., -0., -0., 0.],
        [0., 0., 0.,  ..., -0., -0., -0.]])), ('0.linear.bias', tensor([0.0137, 0.0165, 0.0170,  ..., 0.0012, 0.0162, 0.0020])), ('2.mask', tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 

  checkpoint = torch.load(checkpoint_path, map_location='cpu')


In [3]:
# Test EXACTLY as the hunter would
import torch
import torch.nn as nn
device = setup_correct_device(1)
# Load checkpoint
ckpt = torch.load(checkpoint_path)
print(f"Checkpoint claims accuracy: {ckpt.get('accuracy', 'NOT FOUND')}")

# Recreate EXACT model from hunter
model = nn.Sequential()
arch = ckpt['architecture']
for i in range(len(arch)-1):
    model.add_module(str(i*2), nn.Linear(arch[i], arch[i+1]))
    if i < len(arch)-2:
        model.add_module(str(i*2+1), nn.ReLU())

# Load weights EXACTLY as saved
model.load_state_dict(ckpt['model_state_dict'])

# Test it
model.eval()
# ... test accuracy

✅ Success! Physical GPU 1 ('NVIDIA GeForce RTX 2060 SUPER') is now active as PyTorch device 'cuda:0'.
Checkpoint claims accuracy: 0.4735


  ckpt = torch.load(checkpoint_path)


RuntimeError: Error(s) in loading state_dict for Sequential:
	Missing key(s) in state_dict: "0.weight", "0.bias", "2.weight", "2.bias". 
	Unexpected key(s) in state_dict: "0.mask", "0.linear.weight", "0.linear.bias", "2.mask", "2.linear.weight", "2.linear.bias". 

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import os

device = setup_correct_device(1)
# Set CUDA devices BEFORE importing torch
os.environ["CUDA_VISIBLE_DEVICES"] = "0,2" # This is fine, but for simplicity let's let PyTorch manage it

class SparseLayer(nn.Module):
    """Matches what the hunter used"""
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        # Initialize mask properly
        self.register_buffer('mask', torch.ones(out_features, in_features))
    
    def forward(self, x):
        # The mask will be on the same device as the weight because both are part of the model
        return F.linear(x, self.linear.weight * self.mask, self.linear.bias)

# CIFAR-10 test loader
def get_cifar10_test_loader(batch_size=64):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    
    test_dataset = datasets.CIFAR10(
        root='./data', 
        train=False,
        download=True, 
        transform=transform
    )
    
    return DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# --- Main Script ---

# Load checkpoint
checkpoint_path = "./data/promising_models/20250707_020152/model_cifar10_3layers_seed9_acc0.47_patch0.345_sparse0.050_BEST_ACCURACY_GLOBAL.pt"
# device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load to CPU first to prevent device mismatches during loading
# Using weights_only=True is safer for untrusted files
ckpt = torch.load(checkpoint_path, map_location='cpu', weights_only=False)

# Build model
arch = ckpt['architecture']
model = nn.Sequential()

for i in range(len(arch)-1):
    layer = SparseLayer(arch[i], arch[i+1])
    
    # Get keys for the state dict
    weight_key = f'{i*2}.linear.weight'
    bias_key = f'{i*2}.linear.bias'
    mask_key = f'{i*2}.mask'
    
    # Load the weights and mask data
    if weight_key in ckpt['model_state_dict']:
        layer.linear.weight.data = ckpt['model_state_dict'][weight_key]
        layer.linear.bias.data = ckpt['model_state_dict'][bias_key]
        
        # --- THIS IS THE FIX ---
        # Load data INTO the buffer, don't replace the buffer object itself.
        layer.mask.data = ckpt['model_state_dict'][mask_key]
    
    model.add_module(str(i*2), layer)
    if i < len(arch)-2:
        model.add_module(str(i*2+1), nn.ReLU())

# NOW move the entire model (including registered buffers) to the correct device
model = model.to(device)
model.eval()

# Get test loader
test_loader = get_cifar10_test_loader()

# Test accuracy
correct = 0
total = 0

print("Testing model on CIFAR-10...")
with torch.no_grad():
    for batch_idx, (data, target) in enumerate(test_loader):
        # Move data for the current batch to the device
        data = data.view(data.size(0), -1).to(device)
        target = target.to(device)
        
        output = model(data)
        pred = output.argmax(dim=1)
        correct += (pred == target).sum().item()
        total += len(target)
        
        # Optional: Break early for a quick check
        # if total >= 1000:
        #     break

actual_accuracy = correct / total
print(f"\n✅ Model loaded! Architecture: {ckpt['architecture']}")
print(f"📊 Claimed accuracy: {ckpt['accuracy']:.2%}")
print(f"🔍 Actual tested accuracy: {actual_accuracy:.2%}")
print(f"❓ Match? {abs(actual_accuracy - ckpt['accuracy']) < 0.05}")

if actual_accuracy < 0.1:
    print("\n🚨 Accuracy suspiciously low! Debugging...")
    # Add device check to confirm the fix
    for name, p in model.named_parameters():
        print(f"{name} device: {p.device}")
    for name, b in model.named_buffers():
        print(f"{name} device: {b.device}")

✅ Success! Physical GPU 1 ('NVIDIA GeForce RTX 2060 SUPER') is now active as PyTorch device 'cuda:0'.
Using device: cuda:0
Files already downloaded and verified
Testing model on CIFAR-10...

✅ Model loaded! Architecture: [3072, 2048, 10]
📊 Claimed accuracy: 47.35%
🔍 Actual tested accuracy: 9.76%
❓ Match? False

🚨 Accuracy suspiciously low! Debugging...
0.linear.weight device: cuda:0
0.linear.bias device: cuda:0
2.linear.weight device: cuda:0
2.linear.bias device: cuda:0
0.mask device: cuda:0
2.mask device: cuda:0


In [5]:
# Debug the loaded weights
print("\n🔍 Debugging weight statistics:")
for name, param in model.named_parameters():
    if 'weight' in name:
        non_zero = (param != 0).sum().item()
        total = param.numel()
        print(f"{name}:")
        print(f"  Shape: {param.shape}")
        print(f"  Non-zero: {non_zero}/{total} ({non_zero/total*100:.2f}%)")
        print(f"  Mean: {param.mean():.6f}, Std: {param.std():.6f}")
        print(f"  Max: {param.max():.6f}, Min: {param.min():.6f}")

# Also check the masks
print("\n🔍 Debugging mask statistics:")
for name, module in model.named_modules():
    if hasattr(module, 'mask'):
        mask = module.mask
        active = mask.sum().item()
        total = mask.numel()
        print(f"{name}.mask: {active}/{total} active ({active/total*100:.2f}%)")

# Test with the actual weights as they are
print("\n🔍 Testing raw output:")
with torch.no_grad():
    test_input = torch.randn(1, 3072).to(device)
    layer0_out = model[0](test_input)
    print(f"After layer 0: mean={layer0_out.mean():.6f}, std={layer0_out.std():.6f}")
    layer1_out = F.relu(layer0_out)
    print(f"After ReLU: mean={layer1_out.mean():.6f}, std={layer1_out.std():.6f}")
    final_out = model[2](layer1_out)
    print(f"Final output: {final_out}")


🔍 Debugging weight statistics:
0.linear.weight:
  Shape: torch.Size([2048, 3072])
  Non-zero: 315349/6291456 (5.01%)
  Mean: -0.000002, Std: 0.002333
  Max: 0.018042, Min: -0.018042
2.linear.weight:
  Shape: torch.Size([10, 2048])
  Non-zero: 1082/20480 (5.28%)
  Mean: -0.000019, Std: 0.002931
  Max: 0.021964, Min: -0.022068

🔍 Debugging mask statistics:
0.mask: 315349.0/6291456 active (5.01%)
2.mask: 1082.0/20480 active (5.28%)

🔍 Testing raw output:
After layer 0: mean=0.001343, std=0.129675
After ReLU: mean=0.052195, std=0.075405
Final output: tensor([[ 0.0064, -0.0011,  0.0100,  0.0042, -0.0057, -0.0154, -0.0239, -0.0168,
         -0.0139, -0.0348]], device='cuda:0')


In [6]:
# Print what's actually in the checkpoint
print("🔍 Checkpoint architecture analysis:")
print(f"Architecture from metadata: {ckpt['architecture']}")
print("\nActual layers in state dict:")
for key in sorted(ckpt['model_state_dict'].keys()):
    if 'weight' in key:
        layer_idx = key.split('.')[0]
        shape = ckpt['model_state_dict'][key].shape
        print(f"  Layer {layer_idx}: {shape}")

# The architecture [3072, 2048, 10] only has 2 layers!
# But the filename says 3 layers...

# Let's try assuming there's a missing middle layer
# Maybe the architecture should be [3072, ?, ?, 10]?

# Check if this is actually a 2-layer network mislabeled as 3
if len(ckpt['architecture']) == 3:
    print("\n⚠️ Architecture has 3 values = 2 layers, but filename says '3layers'")
    print("This might be a naming inconsistency.")

🔍 Checkpoint architecture analysis:
Architecture from metadata: [3072, 2048, 10]

Actual layers in state dict:
  Layer 0: torch.Size([2048, 3072])
  Layer 2: torch.Size([10, 2048])

⚠️ Architecture has 3 values = 2 layers, but filename says '3layers'
This might be a naming inconsistency.


In [7]:
# Just load it as a 2-layer network and test
model_2layer = nn.Sequential(
    nn.Linear(3072, 2048),
    nn.ReLU(),
    nn.Linear(2048, 10)
)

# Load the weights (without masks for now)
model_2layer[0].weight.data = ckpt['model_state_dict']['0.linear.weight']
model_2layer[0].bias.data = ckpt['model_state_dict']['0.linear.bias']
model_2layer[2].weight.data = ckpt['model_state_dict']['2.linear.weight']
model_2layer[2].bias.data = ckpt['model_state_dict']['2.linear.bias']

model_2layer = model_2layer.to(device)
model_2layer.eval()

# Quick test
correct = 0
with torch.no_grad():
    for i, (data, target) in enumerate(test_loader):
        if i >= 10:  # Just 10 batches
            break
        data = data.view(data.size(0), -1).to(device)
        output = model_2layer(data)
        pred = output.argmax(dim=1)
        correct += (pred == target.to(device)).sum().item()

print(f"\n🎯 Plain 2-layer model accuracy: {correct/640:.2%}")


🎯 Plain 2-layer model accuracy: 8.12%


In [14]:
from src.structure_net import create_standard_network, get_network_stats, lsuv_init_network
import torch

print('🧪 Testing modular structure_net...')

# Test network creation
network = create_standard_network([784, 128, 10], 0.02, device='cpu')
print(f'✅ Network created: {len(network)} layers')

# Test network stats
stats = get_network_stats(network)
print(f'✅ Network stats: {stats["architecture"]}')

# Test LSUV
sample_batch = torch.randn(32, 784)
lsuv_init_network(network, sample_batch, verbose=False)
print('✅ LSUV initialization successful')

print('🎯 All modular components working!')

🧪 Testing modular structure_net...
✅ Network created: 3 layers
✅ Network stats: [784, 128, 10]
✅ LSUV initialization successful
🎯 All modular components working!
