In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import glob

# Define the configurations from your ablation study
ablation_configs = [
    {
        'name': 'Full Model',
        'folder_pattern': 's1_ep35_lr5_la1_seed42_ldim2_s2_lr6_la4_lc1_lcon8'
    },
    {
        'name': 'No Adversarial',
        'folder_pattern': 's1_ep35_lr5_la0_seed42_ldim2_s2_lr6_la0_lc1_lcon8'
    },
    {
        'name': 'No Covariance',
        'folder_pattern': 's1_ep35_lr5_la1_seed42_ldim2_s2_lr6_la4_lc0_lcon8'
    },
    {
        'name': 'No Contrastive',
        'folder_pattern': 's1_ep35_lr5_la1_seed42_ldim2_s2_lr6_la4_lc1_lcon0'
    },
    {
        'name': 'Reconstruction Only',
        'folder_pattern': 's1_ep35_lr5_la0_seed42_ldim2_s2_lr6_la0_lc0_lcon0'
    }
]

print("🔍 Checking for ablation study results...")
results_dir = Path("../../results")
print(f"Results directory: {results_dir.absolute()}")
print(f"Directory exists: {results_dir.exists()}")

# List all available result folders
if results_dir.exists():
    all_folders = [f.name for f in results_dir.iterdir() if f.is_dir()]
    print(f"\nFound {len(all_folders)} result folders:")
    for folder in sorted(all_folders):
        print(f"  - {folder}")
else:
    print("❌ Results directory not found!")


🔍 Checking for ablation study results...
Results directory: /Users/inescunha/Documents/GitHub/AI4CellFate/notebooks/manuscript/../../results
Directory exists: True

Found 13 result folders:
  - cross_validation
  - interpretations
  - loss_plots
  - models
  - optimisation
  - s1_ep35_lr5_la0_seed42_ldim2_s2_lr6_la0_lc1_lcon8
  - s1_ep35_lr5_la0_seed43_ldim2_s2_lr6_la0_lc1_lcon8
  - s1_ep35_lr5_la0_seed44_ldim2_s2_lr6_la0_lc1_lcon8
  - s1_ep35_lr5_la1_seed42_ldim2_s2_lr6_la4_lc0_lcon8
  - s1_ep35_lr5_la1_seed42_ldim2_s2_lr6_la4_lc1_lcon8
  - s1_ep35_lr5_la1_seed43_ldim2_s2_lr6_la4_lc1_lcon8
  - s1_ep35_lr5_la1_seed44_ldim2_s2_lr6_la4_lc1_lcon8
  - second_gen_s1_ep40_lr5_la1_seed42_ldim2_s2_lr6_la4_lc1_lcon14


In [None]:
def load_metrics_from_folder(folder_path):
    """
    Load all metrics from a specific ablation study folder.
    
    Returns:
        dict: Dictionary containing all extracted metrics
    """
    metrics = {
        'avg_kl_divergence': None,
        'feature_correlation': None,
        'accuracy': None,
        'precision': None,
        'recall_class_1': None,
        'folder_found': False,
        'files_found': []
    }
    
    if not folder_path.exists():
        print(f"❌ Folder not found: {folder_path}")
        return metrics
    
    metrics['folder_found'] = True
    print(f"📂 Processing folder: {folder_path.name}")
    
    # Find all .npy files in the folder
    npy_files = list(folder_path.glob("*.npy"))
    metrics['files_found'] = [f.name for f in npy_files]
    
    if npy_files:
        print(f"   Found {len(npy_files)} .npy files:")
        for f in npy_files:
            print(f"     - {f.name}")
    else:
        print("   ❌ No .npy files found")
        return metrics
    
    # Load KL divergences
    kl_files = [f for f in npy_files if 'kl_divergences_epoch' in f.name]
    if kl_files:
        # Use the first (should be only one) KL divergence file
        kl_file = kl_files[0]
        kl_divergences = np.load(kl_file)
        metrics['avg_kl_divergence'] = np.mean(kl_divergences)
        print(f"   ✅ KL Divergences: {kl_divergences} (avg: {metrics['avg_kl_divergence']:.4f})")
    else:
        print("   ❌ KL divergence file not found")
    
    # Load correlation matrix
    corr_files = [f for f in npy_files if 'correlation_matrix_epoch' in f.name]
    if corr_files:
        corr_file = corr_files[0]
        corr_matrix = np.load(corr_file)
        # Get off-diagonal correlation (for 2D latent space, this is [0,1] or [1,0])
        if corr_matrix.shape == (2, 2):
            metrics['feature_correlation'] = abs(corr_matrix[0, 1])  # Absolute correlation
        else:
            # For higher dimensions, take mean absolute off-diagonal correlation
            off_diag_mask = ~np.eye(corr_matrix.shape[0], dtype=bool)
            metrics['feature_correlation'] = np.mean(np.abs(corr_matrix[off_diag_mask]))
        print(f"   ✅ Feature correlation: {metrics['feature_correlation']:.4f}")
    else:
        print("   ❌ Correlation matrix file not found")
    
    # Load confusion matrix and calculate accuracy
    cm_files = [f for f in npy_files if 'confusion_matrix_values_epoch' in f.name]
    if cm_files:
        cm_file = cm_files[0]
        conf_matrix_normalized = np.load(cm_file)
        
        # Calculate metrics from confusion matrix
        metrics['accuracy'] = np.mean(np.diag(conf_matrix_normalized))  # Mean diagonal (accuracy)
        
        # Precision (class 0)
        if conf_matrix_normalized[0,0] + conf_matrix_normalized[1,0] > 0:
            metrics['precision'] = conf_matrix_normalized[0,0] / (conf_matrix_normalized[0,0] + conf_matrix_normalized[1,0])
        
        # Recall class 1
        if conf_matrix_normalized[1,0] + conf_matrix_normalized[1,1] > 0:
            metrics['recall_class_1'] = conf_matrix_normalized[1,1] / (conf_matrix_normalized[1,0] + conf_matrix_normalized[1,1])
        
        print(f"   ✅ Accuracy: {metrics['accuracy']:.4f}")
        print(f"   ✅ Precision: {metrics['precision']:.4f}")
        print(f"   ✅ Recall Class 1: {metrics['recall_class_1']:.4f}")
    else:
        print("   ❌ Confusion matrix file not found")
    
    print()
    return metrics

# Test the function with one folder first
print("🧪 Testing metric loading function...")
test_config = ablation_configs[0]  # Full Model
test_folder = results_dir / test_config['folder_pattern']
test_metrics = load_metrics_from_folder(test_folder)


🧪 Testing metric loading function...
📂 Processing folder: s1_ep35_lr5_la1_seed42_ldim2_s2_lr6_la4_lc1_lcon8
   Found 9 .npy files:
     - kl_divergences_epoch_12.npy
     - confusion_matrix_values_epoch_12.npy
     - latent_covariance_matrix_epoch_41.npy
     - latent_correlation_matrix_epoch_41.npy
     - latent_correlation_matrix_epoch_12.npy
     - confusion_matrix_values_epoch_52.npy
     - latent_covariance_matrix_epoch_12.npy
     - kl_divergences_epoch_41.npy
     - confusion_matrix_values_epoch_41.npy
   ✅ KL Divergences: [0.28173357 0.43319249] (avg: 0.3575)
   ✅ Feature correlation: 0.1163
   ✅ Accuracy: 0.6899
   ✅ Precision: 0.7080
   ✅ Recall Class 1: 0.7333



In [3]:
# Load metrics for all configurations
print("📊 Loading metrics for all ablation configurations (Seed 42)...")
print("="*70)

all_metrics = {}

for config in ablation_configs:
    print(f"🔄 Processing: {config['name']}")
    folder_path = results_dir / config['folder_pattern']
    metrics = load_metrics_from_folder(folder_path)
    all_metrics[config['name']] = metrics
    
    # Print summary for this config
    if metrics['folder_found']:
        print(f"   📈 Summary:")
        print(f"     - Avg KL Divergence: {metrics['avg_kl_divergence']:.4f}" if metrics['avg_kl_divergence'] is not None else "     - Avg KL Divergence: N/A")
        print(f"     - Feature Correlation: {metrics['feature_correlation']:.4f}" if metrics['feature_correlation'] is not None else "     - Feature Correlation: N/A")
        print(f"     - Accuracy: {metrics['accuracy']:.4f}" if metrics['accuracy'] is not None else "     - Accuracy: N/A")
    else:
        print(f"   ❌ No data available")
    print("-" * 50)

print("\n✅ Metric loading completed!")


📊 Loading metrics for all ablation configurations (Seed 42)...
🔄 Processing: Full Model
📂 Processing folder: s1_ep35_lr5_la1_seed42_ldim2_s2_lr6_la4_lc1_lcon8
   Found 9 .npy files:
     - kl_divergences_epoch_12.npy
     - confusion_matrix_values_epoch_12.npy
     - latent_covariance_matrix_epoch_41.npy
     - latent_correlation_matrix_epoch_41.npy
     - latent_correlation_matrix_epoch_12.npy
     - confusion_matrix_values_epoch_52.npy
     - latent_covariance_matrix_epoch_12.npy
     - kl_divergences_epoch_41.npy
     - confusion_matrix_values_epoch_41.npy
   ✅ KL Divergences: [0.28173357 0.43319249] (avg: 0.3575)
   ✅ Feature correlation: 0.1163
   ✅ Accuracy: 0.6899
   ✅ Precision: 0.7080
   ✅ Recall Class 1: 0.7333

   📈 Summary:
     - Avg KL Divergence: 0.3575
     - Feature Correlation: 0.1163
     - Accuracy: 0.6899
--------------------------------------------------
🔄 Processing: No Adversarial
📂 Processing folder: s1_ep35_lr5_la0_seed42_ldim2_s2_lr6_la0_lc1_lcon8
   Found 4 

In [4]:
# Create the ablation study table
print("📋 Creating Ablation Study Table...")
print("="*50)

# Prepare data for the table
table_data = {
    'Metric': [
        'Average KL Divergence',
        'Feature Correlation (|r|)',
        'Classification Accuracy',
        'Precision (Class 0)',
        'Recall (Class 1)'
    ]
}

# Add columns for each configuration
for config_name in [config['name'] for config in ablation_configs]:
    metrics = all_metrics[config_name]
    
    column_values = [
        f"{metrics['avg_kl_divergence']:.4f}" if metrics['avg_kl_divergence'] is not None else "N/A",
        f"{metrics['feature_correlation']:.4f}" if metrics['feature_correlation'] is not None else "N/A",
        f"{metrics['accuracy']:.4f}" if metrics['accuracy'] is not None else "N/A",
        f"{metrics['precision']:.4f}" if metrics['precision'] is not None else "N/A",
        f"{metrics['recall_class_1']:.4f}" if metrics['recall_class_1'] is not None else "N/A"
    ]
    
    table_data[config_name] = column_values

# Create DataFrame
df_ablation = pd.DataFrame(table_data)

# Display the table
print("🎯 ABLATION STUDY RESULTS TABLE (Seed 42)")
print("="*80)
print(df_ablation.to_string(index=False))
print("="*80)

# Also save as CSV for manuscript
csv_path = "../../results/ablation_study_table_seed42.csv"
df_ablation.to_csv(csv_path, index=False)
print(f"\n💾 Table saved as CSV: {csv_path}")

# Display with better formatting for manuscript
print("\n📄 FORMATTED TABLE FOR MANUSCRIPT:")
print("="*80)
# Set pandas options for better display
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Create a styled version
df_styled = df_ablation.copy()
print(df_styled.to_string(index=False, float_format='%.4f'))


📋 Creating Ablation Study Table...
🎯 ABLATION STUDY RESULTS TABLE (Seed 42)
                   Metric Full Model No Adversarial No Covariance No Contrastive Reconstruction Only
    Average KL Divergence     0.3575         0.1463        0.5420            N/A                 N/A
Feature Correlation (|r|)     0.1163         0.7401        0.1141            N/A                 N/A
  Classification Accuracy     0.6899         0.7312        0.7234            N/A                 N/A
      Precision (Class 0)     0.7080         0.7431        0.7006            N/A                 N/A
         Recall (Class 1)     0.7333         0.7556        0.6667            N/A                 N/A

💾 Table saved as CSV: ../../results/ablation_study_table_seed42.csv

📄 FORMATTED TABLE FOR MANUSCRIPT:
                   Metric Full Model No Adversarial No Covariance No Contrastive Reconstruction Only
    Average KL Divergence     0.3575         0.1463        0.5420            N/A                 N/A
Feature Corr

In [5]:
# Create a more publication-ready table with better column names
print("\n📊 PUBLICATION-READY TABLE:")
print("="*100)

# Create a cleaner version with shorter column names
df_publication = df_ablation.copy()
df_publication.columns = [
    'Metric',
    'Full Model',
    'No Adversarial',
    'No Covariance', 
    'No Contrastive',
    'Reconstruction Only'
]

# Display the publication table
print(df_publication.to_string(index=False))

# Save the publication version
pub_csv_path = "../../results/ablation_study_publication_table.csv"
df_publication.to_csv(pub_csv_path, index=False)
print(f"\n💾 Publication table saved as: {pub_csv_path}")

# Create summary statistics
print("\n📈 SUMMARY INSIGHTS:")
print("="*50)

# Find best performing configuration for each metric
for i, metric in enumerate(df_publication['Metric']):
    row_values = []
    row_configs = []
    
    for col in df_publication.columns[1:]:  # Skip 'Metric' column
        value_str = df_publication.iloc[i][col]
        if value_str != "N/A":
            try:
                value = float(value_str)
                row_values.append(value)
                row_configs.append(col)
            except:
                continue
    
    if row_values:
        if metric == "Average KL Divergence":
            # Lower is better for KL divergence
            best_idx = np.argmin(row_values)
            best_value = row_values[best_idx]
            best_config = row_configs[best_idx]
            print(f"🏆 {metric}: {best_config} ({best_value:.4f}) - Lower is better")
        else:
            # Higher is better for other metrics
            best_idx = np.argmax(row_values)
            best_value = row_values[best_idx]
            best_config = row_configs[best_idx]
            print(f"🏆 {metric}: {best_config} ({best_value:.4f}) - Higher is better")

print("\n✅ Ablation study analysis completed!")



📊 PUBLICATION-READY TABLE:
                   Metric Full Model No Adversarial No Covariance No Contrastive Reconstruction Only
    Average KL Divergence     0.3575         0.1463        0.5420            N/A                 N/A
Feature Correlation (|r|)     0.1163         0.7401        0.1141            N/A                 N/A
  Classification Accuracy     0.6899         0.7312        0.7234            N/A                 N/A
      Precision (Class 0)     0.7080         0.7431        0.7006            N/A                 N/A
         Recall (Class 1)     0.7333         0.7556        0.6667            N/A                 N/A

💾 Publication table saved as: ../../results/ablation_study_publication_table.csv

📈 SUMMARY INSIGHTS:
🏆 Average KL Divergence: No Adversarial (0.1463) - Lower is better
🏆 Feature Correlation (|r|): No Adversarial (0.7401) - Higher is better
🏆 Classification Accuracy: No Adversarial (0.7312) - Higher is better
🏆 Precision (Class 0): No Adversarial (0.7431) - Highe