# Learned Activations v2: Fair Comparisons & Hybrid Approaches

## Issues from Notebook 10
1. **SatCLIP comparison was broken** - MLP head overfitted on frozen embeddings
2. **Unfair param count** - SatCLIP had 33K params, Direct had 231K
3. **SIREN initialization wrong** - ω₀=30 too high for [-1,1] coordinates

## This notebook fixes these issues:
1. **Fair SatCLIP baseline** - Use sklearn (Ridge) like notebook 07
2. **Hybrid approach** - SH features + learned activations
3. **Fixed SIREN** - Lower ω₀ values
4. **Ablation study** - Number of Fourier frequencies (layer and expert ablations are listed as goals but are not run in this notebook)

## Key Questions
1. Can learned activations match pretrained SatCLIP?
2. Does SH + learned activations beat SH + SIREN?
3. What's the optimal number of frequencies?

In [None]:
# Setup
# On Colab (detected via the COLAB_GPU environment variable) this cell resets
# the working directory and fetches the SatCLIP fork plus its dependencies.
# Outside Colab it only performs the two imports.
import os
import sys

if 'COLAB_GPU' in os.environ:
    # The `!` lines are IPython shell escapes, not Python statements.
    # Remove leftovers from earlier runs (including previously extracted GPW
    # data) so the clone below starts from a clean directory.
    !rm -rf sample_data .config satclip gpw_data 2>/dev/null
    # Source of the `load.get_satclip` helper imported in a later cell.
    !git clone https://github.com/1hamzaiqbal/satclip.git
    !pip install lightning torchgeo huggingface_hub rasterio --quiet

In [None]:
# Mount Google Drive (Colab only) and extract GPW data
import os
import zipfile

# google.colab exists only inside a Colab runtime. The other cells in this
# notebook support running locally, so a missing module must not abort here.
try:
    from google.colab import drive
except ImportError:
    drive = None

GPW_DIR = './gpw_data'
os.makedirs(GPW_DIR, exist_ok=True)

SOURCE_ZIP_PATH = '/content/drive/MyDrive/grad/learned_activations/dataverse_files.zip'

if drive is not None:
    drive.mount('/content/drive')

    print("Extracting GPW data...")
    with zipfile.ZipFile(SOURCE_ZIP_PATH, 'r') as z:
        z.extractall(GPW_DIR)
else:
    # Local run: the archive on Drive is unreachable, so the data (or the
    # nested zip handled below) has to be present in GPW_DIR already.
    print(f"google.colab not available; expecting GPW data in {GPW_DIR}")

# Extract 15-min resolution
# The outer archive ships one nested zip per resolution; only this one is used.
zip_path = os.path.join(GPW_DIR, 'gpw-v4-population-density-rev11_2020_15_min_tif.zip')
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(GPW_DIR)
    print("Extracted 15-min resolution")

print("Done!")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

# Make the SatCLIP sources importable: on Colab they live in the freshly cloned
# repo, otherwise in a `satclip` directory next to the current working directory.
_running_on_colab = 'COLAB_GPU' in os.environ
_satclip_src = (
    './satclip/satclip'
    if _running_on_colab
    else os.path.join(os.path.dirname(os.getcwd()), 'satclip')
)
sys.path.append(_satclip_src)
GPW_DIR = './gpw_data'  # identical in both environments

# These imports must come after the sys.path tweak above (`load` is SatCLIP's module).
from huggingface_hub import hf_hub_download
from load import get_satclip

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load SatCLIP models
print("Loading SatCLIP models...")
_ckpt_l10 = hf_hub_download("microsoft/SatCLIP-ViT16-L10", "satclip-vit16-l10.ckpt")
satclip_l10 = get_satclip(_ckpt_l10, device=device)
_ckpt_l40 = hf_hub_download("microsoft/SatCLIP-ViT16-L40", "satclip-vit16-l40.ckpt")
satclip_l40 = get_satclip(_ckpt_l40, device=device)
# Inference mode for both pretrained encoders.
for _pretrained in (satclip_l10, satclip_l40):
    _pretrained.eval()
print("SatCLIP models loaded!")

---
## 1. Data Loading (same as notebook 10)

In [None]:
# Pillow rejects very large images by default (decompression-bomb guard);
# setting the limit to None switches that check off.
# NOTE(review): presumably required for the higher-resolution GPW rasters — confirm.
Image.MAX_IMAGE_PIXELS = None

def load_gpw_raster(resolution='15_min', year=2020):
    """Load a GPW population-density GeoTIFF together with its coordinate axes.

    The file is looked up in ``GPW_DIR`` as
    ``gpw_v4_population_density_rev11_{year}_{resolution}.tif``.

    Args:
        resolution: Resolution tag used in the file name (e.g. ``'15_min'``).
        year: Year used in the file name.

    Returns:
        ``(data, (lons, lats))`` where ``data`` is the 2-D raster array and
        ``lons`` / ``lats`` hold the cell-centre coordinate of every column /
        row. The axes are computed on the assumption that the raster covers
        the full globe (360° wide, 180° tall) with row 0 at the north edge.
        Returns ``(None, None)`` (after printing a message) when the file
        does not exist.
    """
    tif_file = f"{GPW_DIR}/gpw_v4_population_density_rev11_{year}_{resolution}.tif"
    if not os.path.exists(tif_file):
        print(f"File not found: {tif_file}")
        return None, None

    # Close the file handle deterministically: Pillow opens lazily and may keep
    # the underlying file open after decoding. np.array() copies the pixels, so
    # the array stays valid after the image is closed.
    with Image.open(tif_file) as img:
        data = np.array(img)
    height, width = data.shape

    # Cell centres: offset by half a cell from the raster edges.
    lon_step = 360 / width
    lat_step = 180 / height
    lons = np.linspace(-180 + lon_step/2, 180 - lon_step/2, width)
    lats = np.linspace(90 - lat_step/2, -90 + lat_step/2, height)

    return data, (lons, lats)

def sample_from_raster(data, coords, n_samples=10000, seed=42, bounds=None):
    """Draw random valid cells from a raster.

    Args:
        data: 2-D raster array; cells with values <= -1e30 are ignored
            (presumably the nodata fill value — TODO confirm).
        coords: ``(lons, lats)`` arrays giving the coordinate of every column
            and every row of ``data``.
        n_samples: Number of cells to draw without replacement. If fewer valid
            cells exist, all of them are returned in row-major order.
        seed: Seed applied to NumPy's global RNG before sampling.
        bounds: Optional ``(lon_min, lat_min, lon_max, lat_max)`` box
            (inclusive) restricting the candidate cells.

    Returns:
        ``(points, values)``: an ``(n, 2)`` array of ``[lon, lat]`` pairs and
        the matching raster values.
    """
    np.random.seed(seed)
    lon_axis, lat_axis = coords

    keep = data > -1e30

    if bounds is not None:
        west, south, east, north = bounds
        # Per-axis tests broadcast to the raster's (rows, cols) shape.
        lon_ok = (lon_axis >= west) & (lon_axis <= east)
        lat_ok = (lat_axis >= south) & (lat_axis <= north)
        keep = keep & (lon_ok[None, :] & lat_ok[:, None])

    rows, cols = np.nonzero(keep)
    n_valid = rows.size

    if n_valid < n_samples:
        chosen = np.arange(n_valid)
    else:
        chosen = np.random.choice(n_valid, n_samples, replace=False)

    rows, cols = rows[chosen], cols[chosen]
    points = np.column_stack((lon_axis[cols], lat_axis[rows]))
    return points, data[rows, cols]

# Load data
print("Loading population data...")
pop_data, pop_coords = load_gpw_raster('15_min')
if pop_data is None:
    # load_gpw_raster signals a missing file with (None, None); stop here with
    # an explicit error instead of an AttributeError on `.shape` below.
    raise FileNotFoundError(
        f"GPW 15-min raster not found in {GPW_DIR}; run the extraction cell first."
    )
print(f"Shape: {pop_data.shape}")

# Bounding boxes as (lon_min, lat_min, lon_max, lat_max), the order expected by
# sample_from_raster; None means no spatial filter.
REGIONS = {
    'Global': None,
    'USA': (-125, 24, -66, 50),
    'Europe': (-10, 35, 40, 70),
    'China': (73, 18, 135, 54),
}

---
## 2. Fixed Model Architectures

In [None]:
class LearnedActivation(nn.Module):
    """Element-wise activation expressed as a trainable Fourier series.

    For every input element ``x`` the output is
    ``scale * sum_k(a_k * sin(w_k * x) + b_k * cos(w_k * x)) + bias``.

    Args:
        n_frequencies: Number of frequencies ``w_k`` in the series.
        freq_init: ``'linear'`` spaces the frequencies evenly in
            ``[0.1, max_freq]``, ``'log'`` spaces them logarithmically over the
            same range; any other value draws them uniformly from
            ``[0, max_freq)``.
        learnable_freq: Train the frequencies too (otherwise they are a buffer).
        max_freq: Largest frequency.
    """
    def __init__(self, n_frequencies=25, freq_init='linear', learnable_freq=False, max_freq=10.0):
        super().__init__()
        self.n_frequencies = n_frequencies

        if freq_init == 'log':
            grid = torch.logspace(-1, np.log10(max_freq), n_frequencies)
        elif freq_init == 'linear':
            grid = torch.linspace(0.1, max_freq, n_frequencies)
        else:
            grid = torch.rand(n_frequencies) * max_freq

        if not learnable_freq:
            # Fixed frequencies still follow the module across devices.
            self.register_buffer('frequencies', grid)
        else:
            self.frequencies = nn.Parameter(grid)

        # Small random Fourier coefficients; identity-like affine wrapper.
        self.sin_coeffs = nn.Parameter(0.1 * torch.randn(n_frequencies))
        self.cos_coeffs = nn.Parameter(0.1 * torch.randn(n_frequencies))
        self.bias = nn.Parameter(torch.zeros(1))
        self.scale = nn.Parameter(torch.ones(1))

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of ``x``."""
        # New trailing axis enumerates the frequencies.
        phase = x[..., None] * self.frequencies
        series = self.sin_coeffs * phase.sin() + self.cos_coeffs * phase.cos()
        return series.sum(dim=-1) * self.scale + self.bias


class SineActivation(nn.Module):
    """SIREN-style activation: ``sin(omega_0 * x)`` with a fixed ``omega_0``."""
    def __init__(self, omega_0=30.0):
        super().__init__()
        # Plain attribute (not a parameter): the frequency scale is not trained.
        self.omega_0 = omega_0

    def forward(self, x):
        """Return the element-wise sine of the scaled input."""
        scaled = self.omega_0 * x
        return scaled.sin()


class LocationEncoder(nn.Module):
    """MLP that maps raw (lon, lat) coordinates to an embedding.

    Args:
        input_dim: Width of the input (the forward pass rescales columns 0 and
            1 as longitude and latitude).
        hidden_dim: Width of every hidden layer.
        output_dim: Width of the returned embedding.
        n_layers: Number of hidden layers (each followed by an activation).
        activation: ``'relu'``, ``'siren'`` or ``'learned'``.
        n_frequencies: Frequencies per ``LearnedActivation`` (``'learned'`` only).
        omega_0: Frequency scale of ``SineActivation`` (``'siren'`` only).

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """
    def __init__(self, input_dim=2, hidden_dim=256, output_dim=256, n_layers=3,
                 activation='relu', n_frequencies=25, omega_0=30.0):
        super().__init__()
        # Fail at construction time: an unknown name would otherwise leave
        # `self.activations` undefined and only surface inside forward().
        if activation not in ('relu', 'siren', 'learned'):
            raise ValueError(
                f"Unknown activation {activation!r}; expected 'relu', 'siren' or 'learned'"
            )
        self.activation_type = activation
        self.n_layers = n_layers

        dims = [input_dim] + [hidden_dim] * n_layers + [output_dim]
        self.linears = nn.ModuleList(
            [nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:])]
        )

        if activation == 'relu':
            self.activations = nn.ModuleList([nn.ReLU() for _ in range(n_layers)])
        elif activation == 'siren':
            self.activations = nn.ModuleList([SineActivation(omega_0=omega_0) for _ in range(n_layers)])
        else:  # 'learned'
            self.activations = nn.ModuleList([LearnedActivation(n_frequencies=n_frequencies) for _ in range(n_layers)])

        self._init_weights()

    def _init_weights(self):
        """Initialise all linear layers (zero biases).

        SIREN: uniform weights with bound ``sqrt(6 / fan_in)``, additionally
        divided by ``omega_0`` for every layer after the first. Other
        activations use Kaiming-normal weights.
        """
        for i, linear in enumerate(self.linears):
            if self.activation_type == 'siren':
                omega_0 = self.activations[0].omega_0 if i > 0 else 1.0
                bound = np.sqrt(6/linear.in_features) / omega_0
                nn.init.uniform_(linear.weight, -bound, bound)
            else:
                nn.init.kaiming_normal_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, coords):
        """Encode a ``(batch, input_dim)`` tensor of coordinates in degrees.

        Column 0 is divided by 180 and column 1 by 90 on a copy, so the
        caller's tensor is left untouched.
        """
        x = coords.clone()
        x[:, 0] = x[:, 0] / 180.0
        x[:, 1] = x[:, 1] / 90.0

        # Every linear layer except the last is followed by its activation.
        for linear, act in zip(self.linears[:-1], self.activations):
            x = act(linear(x))
        x = self.linears[-1](x)
        return x


class HybridEncoder(nn.Module):
    """Spherical harmonics input + learned activations.

    This tests: can learned activations improve on SIREN
    when using the same SH positional encoding?

    Args:
        sh_model: Module exposing ``spherical_harmonics(coords)``; it is called
            with float64 coordinates and its output is cast to float32.
        hidden_dim: Width of every hidden layer of the MLP.
        output_dim: Width of the returned embedding.
        n_layers: Number of hidden layers.
        activation: ``'learned'``, ``'relu'`` or ``'siren'``.
        n_frequencies: Frequencies per ``LearnedActivation`` (``'learned'`` only).
        freeze_sh: If true, every parameter of ``sh_model`` gets
            ``requires_grad = False`` (this mutates the shared ``sh_model``
            object) and SH features are computed without gradient tracking.
            If false, gradients flow into ``sh_model``.
        omega_0: Frequency scale of ``SineActivation`` (``'siren'`` only).

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """
    def __init__(self, sh_model, hidden_dim=256, output_dim=256, n_layers=3,
                 activation='learned', n_frequencies=25, freeze_sh=True, omega_0=30.0):
        super().__init__()
        # Validate before touching sh_model so a typo has no side effects.
        if activation not in ('learned', 'relu', 'siren'):
            raise ValueError(
                f"Unknown activation {activation!r}; expected 'learned', 'relu' or 'siren'"
            )
        self.sh_model = sh_model
        self.activation_type = activation
        self.freeze_sh = freeze_sh

        if freeze_sh:
            for param in self.sh_model.parameters():
                param.requires_grad = False

        # Get SH output dim (L=10: 100, L=40: 1600)
        # Probe with one dummy coordinate on the device sh_model lives on;
        # a parameterless sh_model is assumed to run on the CPU.
        first_param = next(sh_model.parameters(), None)
        probe_device = first_param.device if first_param is not None else torch.device('cpu')
        with torch.no_grad():
            test_coord = torch.tensor([[0.0, 0.0]]).double().to(probe_device)
            sh_out = sh_model.spherical_harmonics(test_coord)
            sh_dim = sh_out.shape[-1]

        self.sh_dim = sh_dim
        print(f"  SH dim: {sh_dim}")

        # Build MLP with learned activations
        dims = [sh_dim] + [hidden_dim] * n_layers + [output_dim]
        self.linears = nn.ModuleList(
            [nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:])]
        )

        if activation == 'learned':
            self.activations = nn.ModuleList([LearnedActivation(n_frequencies=n_frequencies) for _ in range(n_layers)])
        elif activation == 'relu':
            self.activations = nn.ModuleList([nn.ReLU() for _ in range(n_layers)])
        else:  # 'siren'
            self.activations = nn.ModuleList([SineActivation(omega_0=omega_0) for _ in range(n_layers)])

        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal weights and zero biases for every linear layer."""
        for linear in self.linears:
            nn.init.kaiming_normal_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, coords):
        """Encode a batch of coordinates via SH features followed by the MLP."""
        # Get SH features
        if self.freeze_sh:
            with torch.no_grad():
                sh_features = self.sh_model.spherical_harmonics(coords.double()).float()
        else:
            # Honour freeze_sh=False: keep the SH computation in the graph. A
            # plain call (rather than forcing grad on) respects an enclosing
            # torch.no_grad() during evaluation.
            sh_features = self.sh_model.spherical_harmonics(coords.double()).float()

        x = sh_features
        for linear, act in zip(self.linears[:-1], self.activations):
            x = act(linear(x))
        x = self.linears[-1](x)
        return x


# Test
# Build one direct-coordinate encoder per activation and report its size.
print("Testing architectures:")
for act in ('relu', 'siren', 'learned'):
    # Fixed: lower omega for direct coords (only the SIREN variant reads omega_0)
    enc = LocationEncoder(activation=act, omega_0=1.0)
    n_params = sum(weight.numel() for weight in enc.parameters())
    print(f"  Direct + {act}: {n_params:,} params")

---
## 3. Fair Evaluation Functions

In [None]:
def get_satclip_embeddings(model, coords, device, batch_size=512):
    """Run ``model`` over ``coords`` in mini-batches and stack the outputs.

    Args:
        model: Callable module; it is switched to eval mode and fed float64
            tensors of at most ``batch_size`` rows.
        coords: Array-like of coordinates, one row per location.
        device: Device each batch is moved to before the forward pass.
        batch_size: Maximum number of rows per forward pass.

    Returns:
        NumPy array with one embedding row per input row.
    """
    model.eval()
    points = torch.tensor(coords, dtype=torch.float64)

    batch_starts = range(0, len(coords), batch_size)
    with torch.no_grad():
        chunks = [
            model(points[start:start + batch_size].to(device)).cpu().numpy()
            for start in batch_starts
        ]

    return np.vstack(chunks)


def evaluate_sklearn(X_train, y_train, X_test, y_test, alpha=1.0):
    """Fit a Ridge regressor on the training features and score it on the test set.

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets.
        alpha: Ridge regularisation strength.

    Returns:
        R² of the predictions on the test set.
    """
    predictions = Ridge(alpha=alpha).fit(X_train, y_train).predict(X_test)
    return r2_score(y_test, predictions)


def evaluate_neural(encoder, coords_train, y_train, coords_test, y_test, 
                   epochs=100, lr=1e-3, batch_size=256, device='cuda'):
    """Train ``encoder`` plus a small MLP head end-to-end and evaluate it.

    Targets are ``log1p``-transformed inside this function, so callers pass
    raw (untransformed) values. Training minimises MSE with Adam.

    Args:
        encoder: Module mapping a ``(batch, 2)`` float32 coordinate tensor to
            an embedding; its output width is detected automatically.
        coords_train, y_train: Training coordinates and raw targets.
        coords_test, y_test: Evaluation coordinates and raw targets.
        epochs: Number of passes over the training set.
        lr: Adam learning rate.
        batch_size: Training mini-batch size.
        device: Device to train on. A CUDA device is replaced by the CPU when
            CUDA is not available.

    Returns:
        The highest R² (on log1p targets) observed on the evaluation set after
        any epoch; ``-inf`` if ``epochs`` is 0.

    NOTE(review): the best epoch is selected on the same split that is
    reported, so this score is optimistic compared with a single
    fit-then-score evaluation such as ``evaluate_sklearn``.
    """
    # The default asks for CUDA; degrade to CPU instead of crashing without a GPU.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        device = torch.device('cpu')

    # Prepare data
    train_coords = torch.tensor(coords_train, dtype=torch.float32)
    train_y = torch.tensor(np.log1p(y_train), dtype=torch.float32)
    test_coords = torch.tensor(coords_test, dtype=torch.float32)
    test_y = torch.tensor(np.log1p(y_test), dtype=torch.float32)

    # Detect the embedding width with one dummy forward pass so the head also
    # fits encoders whose output_dim is not 256. Train/eval flags of every
    # submodule are restored exactly afterwards.
    encoder = encoder.to(device)
    saved_modes = [(module, module.training) for module in encoder.modules()]
    encoder.eval()
    with torch.no_grad():
        embed_dim = encoder(train_coords[:1].to(device)).shape[-1]
    for module, was_training in saved_modes:
        module.training = was_training

    # Create prediction head
    class Predictor(nn.Module):
        def __init__(self, encoder, in_dim):
            super().__init__()
            self.encoder = encoder
            self.head = nn.Sequential(
                nn.Linear(in_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 1)
            )

        def forward(self, x):
            # Drop the trailing singleton so predictions match the 1-D targets.
            return self.head(self.encoder(x)).squeeze(-1)

    model = Predictor(encoder, embed_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    train_dataset = torch.utils.data.TensorDataset(train_coords, train_y)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    best_r2 = -float('inf')

    for epoch in range(epochs):
        model.train()
        for coords_batch, y_batch in train_loader:
            coords_batch, y_batch = coords_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = criterion(model(coords_batch), y_batch)
            loss.backward()
            optimizer.step()

        # Evaluate (whole evaluation set in a single forward pass)
        model.eval()
        with torch.no_grad():
            preds = model(test_coords.to(device)).cpu().numpy()
        r2 = r2_score(test_y.numpy(), preds)
        best_r2 = max(best_r2, r2)

        if (epoch + 1) % 25 == 0:
            print(f"    Epoch {epoch+1}/{epochs}: R²={r2:.4f}")

    return best_r2

---
## 4. Experiment 1: Fair SatCLIP vs Learned Activations

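The frozen SatCLIP embeddings are scored with sklearn Ridge regression (as in notebook 07), while the direct-coordinate encoders are trained end-to-end with a small MLP head. All models are scored by R² on log1p population density. Note that the neural runs report the best per-epoch R² on the test split, so their scores are somewhat optimistic relative to the single Ridge fit.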

In [None]:
print("=" * 80)
print("EXPERIMENT 1: Fair Comparison (sklearn on embeddings)")
print("=" * 80)

N_SAMPLES = 15000
results_exp1 = []

# Frozen SatCLIP baselines: (progress message, pretrained model, results label).
satclip_baselines = [
    ("\n  SatCLIP L=10 + Ridge...", satclip_l10, 'SatCLIP L=10 (Ridge)'),
    ("  SatCLIP L=40 + Ridge...", satclip_l40, 'SatCLIP L=40 (Ridge)'),
]

# Encoders trained end-to-end on raw coordinates:
# (progress message, LocationEncoder kwargs, results label).
direct_variants = [
    ("  Direct + ReLU (end-to-end)...",
     {'activation': 'relu'}, 'Direct + ReLU'),
    ("  Direct + Learned (end-to-end)...",
     {'activation': 'learned', 'n_frequencies': 25}, 'Direct + Learned'),
    # SIREN with omega fixed to 1.0 for the [-1,1] coordinate range
    ("  Direct + SIREN (ω=1.0, fixed)...",
     {'activation': 'siren', 'omega_0': 1.0}, 'Direct + SIREN (ω=1)'),
]

for region_name, bounds in REGIONS.items():
    rule = '─' * 60
    print(f"\n{rule}")
    print(f"Region: {region_name}")
    print(rule)

    # Sample data; the Ridge baselines regress on log1p(population density).
    coords, values = sample_from_raster(pop_data, pop_coords, n_samples=N_SAMPLES, bounds=bounds)
    y = np.log1p(values)

    # 50/50 train/test split with a fixed seed
    np.random.seed(42)
    idx = np.random.permutation(len(coords))
    split = len(coords) // 2
    train_idx, test_idx = idx[:split], idx[split:]

    coords_train, coords_test = coords[train_idx], coords[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print(f"Train: {len(coords_train)}, Test: {len(coords_test)}")

    for message, satclip_model, label in satclip_baselines:
        print(message)
        emb_train = get_satclip_embeddings(satclip_model, coords_train, device)
        emb_test = get_satclip_embeddings(satclip_model, coords_test, device)
        r2 = evaluate_sklearn(emb_train, y_train, emb_test, y_test)
        print(f"    R²: {r2:.4f}")
        results_exp1.append({'region': region_name, 'model': label, 'r2': r2})

    for message, encoder_kwargs, label in direct_variants:
        print(message)
        encoder = LocationEncoder(**encoder_kwargs)
        # evaluate_neural applies log1p itself, hence the raw values here.
        r2 = evaluate_neural(encoder, coords_train, values[train_idx],
                             coords_test, values[test_idx],
                             epochs=100, device=device)
        print(f"    Best R²: {r2:.4f}")
        results_exp1.append({'region': region_name, 'model': label, 'r2': r2})

# Summary
df1 = pd.DataFrame(results_exp1)
print("\n" + "=" * 70)
print("EXPERIMENT 1 RESULTS")
print("=" * 70)
summary_exp1 = df1.pivot(index='model', columns='region', values='r2')
print(summary_exp1.round(3).to_string())

---
## 5. Experiment 2: Hybrid Approach (SH + Learned Activations)

Can we improve SatCLIP by replacing SIREN with learned activations?

In [None]:
print("=" * 80)
print("EXPERIMENT 2: Hybrid (SH input + Learned Activations)")
print("=" * 80)

results_exp2 = []

# (label of the SH front-end, pretrained model providing it, activations to try)
hybrid_setups = [
    ('L=10', satclip_l10, ('relu', 'siren', 'learned')),
    ('L=40', satclip_l40, ('relu', 'learned')),
]

for region_name, bounds in REGIONS.items():
    rule = '─' * 60
    print(f"\n{rule}")
    print(f"Region: {region_name}")
    print(rule)

    # Sample data and split 50/50 with a fixed seed
    coords, values = sample_from_raster(pop_data, pop_coords, n_samples=N_SAMPLES, bounds=bounds)

    np.random.seed(42)
    idx = np.random.permutation(len(coords))
    split = len(coords) // 2
    train_idx, test_idx = idx[:split], idx[split:]

    coords_train, coords_test = coords[train_idx], coords[test_idx]

    for sh_label, sh_source, act_types in hybrid_setups:
        for act_type in act_types:
            model_name = f'SH({sh_label}) + {act_type}'
            print(f"\n  {model_name}...")
            encoder = HybridEncoder(sh_source, activation=act_type, n_frequencies=25)
            # Raw values: evaluate_neural applies log1p internally.
            r2 = evaluate_neural(encoder, coords_train, values[train_idx],
                                 coords_test, values[test_idx],
                                 epochs=100, device=device)
            print(f"    Best R²: {r2:.4f}")
            results_exp2.append({'region': region_name, 'model': model_name, 'r2': r2})

# Summary
df2 = pd.DataFrame(results_exp2)
print("\n" + "=" * 70)
print("EXPERIMENT 2 RESULTS: Hybrid Approaches")
print("=" * 70)
summary_exp2 = df2.pivot(index='model', columns='region', values='r2')
print(summary_exp2.round(3).to_string())

---
## 6. Experiment 3: Frequency Ablation

How many Fourier frequencies do we need?

In [None]:
print("=" * 80)
print("EXPERIMENT 3: Frequency Ablation")
print("=" * 80)

# Use Global region for ablation
coords, values = sample_from_raster(pop_data, pop_coords, n_samples=15000, bounds=None)

# 50/50 train/test split with a fixed seed
np.random.seed(42)
idx = np.random.permutation(len(coords))
split = len(coords) // 2
train_idx, test_idx = idx[:split], idx[split:]
coords_train, coords_test = coords[train_idx], coords[test_idx]

FREQ_VALUES = [5, 10, 25, 50, 100]
results_exp3 = []

for n_freq in FREQ_VALUES:
    print(f"\n  Testing n_frequencies={n_freq}...")
    encoder = LocationEncoder(activation='learned', n_frequencies=n_freq)
    n_params = sum(weight.numel() for weight in encoder.parameters())

    r2 = evaluate_neural(encoder, coords_train, values[train_idx],
                         coords_test, values[test_idx],
                         epochs=100, device=device)
    print(f"    R²: {r2:.4f}, Params: {n_params:,}")
    results_exp3.append({'n_frequencies': n_freq, 'r2': r2, 'params': n_params})

df3 = pd.DataFrame(results_exp3)
print("\n" + "=" * 70)
print("FREQUENCY ABLATION RESULTS")
print("=" * 70)
print(df3.to_string(index=False))

# Plot R² against the number of frequencies, labelling every point
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(df3['n_frequencies'], df3['r2'], 'o-', markersize=10, linewidth=2)
ax.set_xlabel('Number of Frequencies')
ax.set_ylabel('R² Score')
ax.set_title('Frequency Ablation: More frequencies = better?')
ax.grid(True, alpha=0.3)
for n_freq, score in zip(df3['n_frequencies'], df3['r2']):
    ax.annotate(f"{score:.3f}", (n_freq, score),
                textcoords="offset points", xytext=(0, 10), ha='center')
plt.tight_layout()
plt.savefig('frequency_ablation.png', dpi=150)
plt.show()

---
## 7. Combined Results & Analysis

In [None]:
print("=" * 80)
print("COMBINED RESULTS")
print("=" * 80)

# Combine experiment 1 and 2 into one model-by-region table
all_results = pd.concat([df1, df2])
pivot = all_results.pivot(index='model', columns='region', values='r2')

# Reorder columns, keeping only the regions that were actually evaluated
col_order = ['Global', 'USA', 'Europe', 'China']
present = [name for name in col_order if name in pivot.columns]
pivot = pivot[present]

print("\nR² Scores by Model and Region:")
print(pivot.round(3).to_string())

# Save the long-format results
all_results.to_csv('learned_activations_v2_results.csv', index=False)
print("\nResults saved to learned_activations_v2_results.csv")

In [None]:
# Visualization
# One grouped bar chart per experiment: regions on the x-axis, one bar per model.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    # Experiment 1: Direct approaches
    (df1, 'Exp 1: Fair Comparison\n(SatCLIP+Ridge vs Direct+Neural)'),
    # Experiment 2: Hybrid approaches
    (df2, 'Exp 2: Hybrid Approaches\n(SH features + different activations)'),
]

for ax, (frame, title) in zip(axes, panels):
    by_model = frame.pivot(index='model', columns='region', values='r2')
    by_model.T.plot(kind='bar', ax=ax)
    ax.set_ylabel('R² Score')
    ax.set_xlabel('Region')
    ax.set_title(title)
    # Legend outside the axes so it does not cover the bars.
    ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
    ax.set_ylim(0, 1)
    ax.grid(True, alpha=0.3, axis='y')
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.savefig('learned_activations_v2_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Key findings analysis
print("=" * 80)
print("KEY FINDINGS")
print("=" * 80)

_REPORT_REGIONS = ['Global', 'USA', 'Europe', 'China']


def _exp1_r2(model_name, region):
    """Return the Experiment 1 R² values recorded for one model in one region."""
    selected = df1[(df1['model'] == model_name) & (df1['region'] == region)]
    return selected['r2'].values


print("\n1. SatCLIP BASELINE (with fair sklearn comparison):")
for region in _REPORT_REGIONS:
    l10 = _exp1_r2('SatCLIP L=10 (Ridge)', region)
    l40 = _exp1_r2('SatCLIP L=40 (Ridge)', region)
    # Skip regions for which either baseline is missing.
    if len(l10) > 0 and len(l40) > 0:
        print(f"  {region}: L=10={l10[0]:.3f}, L=40={l40[0]:.3f}, diff={l40[0]-l10[0]:+.3f}")

print("\n2. DIRECT LEARNED vs SatCLIP:")
for region in _REPORT_REGIONS:
    l10 = _exp1_r2('SatCLIP L=10 (Ridge)', region)
    learned = _exp1_r2('Direct + Learned', region)
    if len(l10) > 0 and len(learned) > 0:
        print(f"  {region}: Learned={learned[0]:.3f} vs L=10={l10[0]:.3f} ({learned[0]-l10[0]:+.3f})")

print("\n3. HYBRID APPROACHES (best per region):")
for region in _REPORT_REGIONS:
    region_data = df2[df2['region'] == region]
    if len(region_data) == 0:
        continue
    # Row with the highest R² among the hybrid models of this region.
    best = region_data.loc[region_data['r2'].idxmax()]
    print(f"  {region}: {best['model']} = {best['r2']:.3f}")

---
## 8. Conclusions & Next Steps

### Key Findings:
1. **Fair comparison changes everything** - SatCLIP with Ridge regression performs much better than the broken MLP head approach
2. **Learned activations vs SatCLIP** - How do they compare with fair evaluation?
3. **Hybrid potential** - Does SH + learned activations beat SH + SIREN?

### Next Steps Based on Results:
- If learned activations match SatCLIP: Scale up to contrastive pretraining
- If hybrid is best: Use SH encoding with learned activations
- If neither works: The pretrained nature of SatCLIP is the key advantage

In [None]:
# Save all results
import json

# Experiment 3 is optional: fall back to an empty list when its cell was not run.
experiment_3_records = df3.to_dict('records') if 'df3' in globals() else []

summary = {
    'experiment_1': df1.to_dict('records'),
    'experiment_2': df2.to_dict('records'),
    'experiment_3': experiment_3_records,
}

with open('learned_activations_v2_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("All results saved!")
print("\nFiles created:")
for created_file in (
    'learned_activations_v2_results.csv',
    'learned_activations_v2_summary.json',
    'learned_activations_v2_comparison.png',
    'frequency_ablation.png',
):
    print(f"  - {created_file}")