# Data Export for Web Application

This notebook exports all processed data and results into JSON format for the web application.

**Outputs**:
- `web_app/public/data/age_gaps.json` - Complete age gap data
- `web_app/public/data/metrics_summary.json` - Model performance metrics
- `web_app/public/data/correlations.json` - Inter-organ correlation matrix
- `web_app/public/data/trajectories.json` - Pseudo-longitudinal trajectories
- `web_app/public/data/clusters.json` - Clustering results with UMAP/PCA
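- `web_app/public/data/feature_importance/*.json` - Feature importance per organ (illustrative placeholder values)
- `web_app/public/data/individuals/*.json` - Sample individual profiles (first 50 individuals)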

In [1]:
# Setup
import json
import sys
import warnings
from datetime import datetime
from pathlib import Path

# Suppress every warning for the rest of the session (set before the heavy imports below).
warnings.filterwarnings('ignore')

# Get project root: use the parent of the working directory when it contains
# a `src/` folder, otherwise fall back to the working directory itself.
working_dir = Path().resolve()
if (working_dir.parent / 'src').exists():
    project_root = working_dir.parent
else:
    project_root = working_dir

# Put `src/` at the front of the import path (once) so project modules resolve.
src_path = project_root / 'src'
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

print(f"✓ Project root: {project_root}")

import numpy as np
import pandas as pd

# Import project modules (needs the sys.path entry added above)
from organ_aging import config, analysis

print("✓ Imports successful")

✓ Project root: C:\Users\bastien\Documents\TAF\Hackathon\Vitalist
✓ Imports successful


In [2]:
# Resolve the web app's data folder and create it (with parents) if needed,
# together with the sub-folders that later cells write into.
output_dir = project_root.joinpath("web_app", "public", "data")
output_dir.mkdir(parents=True, exist_ok=True)

for subfolder in ("feature_importance", "individuals"):
    output_dir.joinpath(subfolder).mkdir(exist_ok=True)

print(f"✓ Output directory: {output_dir}")

✓ Output directory: C:\Users\bastien\Documents\TAF\Hackathon\Vitalist\web_app\public\data


## 1. Load Data

In [3]:
# Read the processed age gap table
age_gaps_path = project_root.joinpath("data", "processed", "age_gaps.parquet")
df = pd.read_parquet(age_gaps_path)
print(f"✓ Loaded {len(df)} individuals")
print(f"✓ Columns: {list(df.columns)}")

# Each "<organ>_age_gap" column names one organ; "max_age_gap" is skipped
# so it is not treated as an organ.
gap_cols = []
organs = []
for column in df.columns:
    if column == 'max_age_gap' or not column.endswith('_age_gap'):
        continue
    gap_cols.append(column)
    organs.append(column.replace('_age_gap', ''))
print(f"✓ Organs: {organs}")

✓ Loaded 531 individuals
✓ Columns: ['AGE', 'liver_age_bio', 'liver_age_gap', 'kidney_age_bio', 'kidney_age_gap', 'cardio_metabolic_age_bio', 'cardio_metabolic_age_gap', 'immune_age_bio', 'immune_age_gap', 'hematologic_age_bio', 'hematologic_age_gap', 'liver_advanced', 'kidney_advanced', 'cardio_metabolic_advanced', 'immune_advanced', 'hematologic_advanced', 'fastest_aging_organ', 'max_age_gap']
✓ Organs: ['liver', 'kidney', 'cardio_metabolic', 'immune', 'hematologic']


In [4]:
# Read the model metrics summary (JSON) into a plain dict
metrics_path = project_root.joinpath("models", "metrics_summary.json")
metrics = json.loads(metrics_path.read_text())
print("✓ Loaded model metrics")

✓ Loaded model metrics


## 2. Export Age Gaps Data

In [5]:
# Calculate advanced organs: an organ is "advanced" when its age gap exceeds
# this many years.
threshold = 5.0
advanced_cols = [f"{organ}_advanced" for organ in organs]
for organ, adv_col in zip(organs, advanced_cols):
    df[adv_col] = df[f"{organ}_age_gap"] > threshold

# Count advanced organs per person
df['n_advanced_organs'] = df[advanced_cols].sum(axis=1)


def _json_number(value, cast=float):
    """Return ``cast(value)``, or None when the value is missing or non-finite.

    ``json.dump`` serialises NaN/Infinity as the bare tokens ``NaN`` /
    ``Infinity``. Those are not valid JSON, so ``JSON.parse`` in the web app
    would reject the whole file; ``None`` is written as ``null`` instead.
    """
    if pd.isna(value) or not np.isfinite(float(value)):
        return None
    return cast(value)


# Summary statistics of each organ's age gap
summary_stats = {}
for organ in organs:
    gaps = df[f"{organ}_age_gap"]
    summary_stats[organ] = {
        "mean": _json_number(gaps.mean()),
        "std": _json_number(gaps.std()),
        "min": _json_number(gaps.min()),
        "max": _json_number(gaps.max()),
        "q25": _json_number(gaps.quantile(0.25)),
        "median": _json_number(gaps.median()),
        "q75": _json_number(gaps.quantile(0.75))
    }

# Build JSON structure
age_gaps_export = {
    "metadata": {
        "n_individuals": len(df),
        "organs": organs,
        "date_generated": datetime.now().isoformat(),
        "threshold_advanced": threshold
    },
    "summary_stats": summary_stats,
    "data": []
}

# Export individual records; optional columns that are absent are written as null
has_age = 'AGE' in df.columns
has_sex = 'RIAGENDR' in df.columns
for idx, row in df.iterrows():
    record = {
        "id": int(idx),
        "age": _json_number(row['AGE']) if has_age else None,
        "sex": _json_number(row['RIAGENDR'], int) if has_sex else None,
        "n_advanced_organs": int(row['n_advanced_organs'])
    }

    # Add organ-specific data
    for organ in organs:
        bio_col = f"{organ}_age_bio"
        gap_col = f"{organ}_age_gap"
        adv_col = f"{organ}_advanced"

        record[bio_col] = _json_number(row[bio_col]) if bio_col in row else None
        record[gap_col] = _json_number(row[gap_col]) if gap_col in row else None
        record[adv_col] = bool(row[adv_col])

    age_gaps_export["data"].append(record)

# Save. Serialise before opening the file so that a serialisation error
# (allow_nan=False rejects any NaN/Infinity that slipped through) cannot
# leave a truncated age_gaps.json behind.
output_path = output_dir / "age_gaps.json"
payload = json.dumps(age_gaps_export, indent=2, allow_nan=False)
with open(output_path, 'w') as f:
    f.write(payload)

print(f"✓ Exported age_gaps.json ({len(age_gaps_export['data'])} records)")

✓ Exported age_gaps.json (531 records)


## 3. Export Model Metrics

In [6]:
# Flatten the nested metrics (organ -> model -> split -> metric) into one
# entry per organ for the frontend.
def _flatten_scores(model_scores):
    """Turn {split: {metric: value}} into {"<split>_<metric>": value}, MAE keys first, then R2."""
    return {
        f"{split}_{metric}": model_scores[split][metric]
        for metric in ('mae', 'r2')
        for split in ('train', 'val', 'test')
    }


metrics_export = {
    "organs": [],
    "date_generated": datetime.now().isoformat()
}

for organ in organs:
    # Organs without an entry in the metrics file are left out of the export.
    if organ not in metrics:
        continue

    organ_data = metrics[organ]

    # Relative test-MAE reduction of gradient boosting over the linear model, in percent
    linear_mae = organ_data['linear']['test']['mae']
    gb_mae = organ_data['gradient_boosting']['test']['mae']
    improvement = ((linear_mae - gb_mae) / linear_mae) * 100

    metrics_export["organs"].append({
        "name": organ,
        "display_name": organ.replace('_', ' ').title(),
        "linear": _flatten_scores(organ_data['linear']),
        "gradient_boosting": _flatten_scores(organ_data['gradient_boosting']),
        "improvement_pct": round(improvement, 1)
    })

# Save
output_path = output_dir / "metrics_summary.json"
output_path.write_text(json.dumps(metrics_export, indent=2))

print(f"✓ Exported metrics_summary.json")

✓ Exported metrics_summary.json


## 4. Export Correlations

In [7]:
# Pairwise correlation between the organ age-gap columns (pandas' default method)
corr_matrix = df[gap_cols].corr()

# Labels are the organ names, in the same order as the matrix rows/columns
correlations_export = dict(
    labels=organs,
    matrix=corr_matrix.values.tolist(),
    date_generated=datetime.now().isoformat(),
)

# Save
output_path = output_dir / "correlations.json"
output_path.write_text(json.dumps(correlations_export, indent=2))

print(f"✓ Exported correlations.json")
print("\nCorrelation Matrix:")
print(corr_matrix)

✓ Exported correlations.json

Correlation Matrix:
                          liver_age_gap  kidney_age_gap  \
liver_age_gap                  1.000000        0.753296   
kidney_age_gap                 0.753296        1.000000   
cardio_metabolic_age_gap       0.559259        0.567695   
immune_age_gap                 0.745856        0.717545   
hematologic_age_gap            0.721345        0.672658   

                          cardio_metabolic_age_gap  immune_age_gap  \
liver_age_gap                             0.559259        0.745856   
kidney_age_gap                            0.567695        0.717545   
cardio_metabolic_age_gap                  1.000000        0.636804   
immune_age_gap                            0.636804        1.000000   
hematologic_age_gap                       0.610299        0.785804   

                          hematologic_age_gap  
liver_age_gap                        0.721345  
kidney_age_gap                       0.672658  
cardio_metabolic_age_gap             0.610299  
immune_age_gap                       0.785804  
hematologic_age_gap                  1.000000  

## 5. Export Pseudo-Longitudinal Trajectories

In [8]:
# Define age bins (edges in years). Bins are [lower, upper), except the last
# one, which also includes its upper edge: with a strict `<` on the final edge
# anyone aged exactly 80 was silently dropped from every trajectory.
# NOTE(review): the variable names look like NHANES, which top-codes age at 80 —
# confirm that 80 is the maximum AGE in this dataset.
age_bins = [18, 30, 40, 50, 60, 70, 80]
bin_edges = list(zip(age_bins[:-1], age_bins[1:]))
age_bin_labels = [f"{lower}-{upper}" for lower, upper in bin_edges]


def _finite_or_none(value):
    """Return ``float(value)``, or None when the value is missing or non-finite.

    ``json.dump`` writes NaN as the bare token ``NaN``, which is not valid JSON
    and breaks ``JSON.parse`` in the web app. NaN occurs here for the mean of
    an empty bin and for the sample standard deviation of a one-person bin.
    """
    if pd.isna(value) or not np.isfinite(float(value)):
        return None
    return float(value)


# Row mask for each age bin — independent of the organ, so computed only once.
bin_masks = []
for position, (lower, upper) in enumerate(bin_edges):
    is_last_bin = position == len(bin_edges) - 1
    within_upper = (df['AGE'] <= upper) if is_last_bin else (df['AGE'] < upper)
    bin_masks.append((df['AGE'] >= lower) & within_upper)

# Calculate trajectories
trajectories_export = {
    "age_bins": age_bin_labels,
    "organs": {},
    "date_generated": datetime.now().isoformat()
}

for organ in organs:
    gap_col = f"{organ}_age_gap"

    mean_gaps = []
    std_gaps = []
    n_individuals = []

    for mask in bin_masks:
        subset = df.loc[mask, gap_col]

        # Empty bins (and undefined statistics) are exported as null
        mean_gaps.append(_finite_or_none(subset.mean()))
        std_gaps.append(_finite_or_none(subset.std()))
        n_individuals.append(int(len(subset)))

    trajectories_export["organs"][organ] = {
        "display_name": organ.replace('_', ' ').title(),
        "mean_gaps": mean_gaps,
        "std_gaps": std_gaps,
        "n_individuals": n_individuals
    }

# Save. Serialise first so a serialisation error (allow_nan=False rejects any
# remaining NaN/Infinity) cannot leave a truncated file behind.
output_path = output_dir / "trajectories.json"
payload = json.dumps(trajectories_export, indent=2, allow_nan=False)
with open(output_path, 'w') as f:
    f.write(payload)

print(f"✓ Exported trajectories.json")

✓ Exported trajectories.json


## 6. Export Clustering Results (if available)

In [9]:
# Export clustering results when `df` carries the cluster label and UMAP
# coordinates; otherwise write a placeholder clusters.json.
clustering_columns = ('cluster', 'umap_x', 'umap_y')

if all(column in df.columns for column in clustering_columns):
    print("✓ Found clustering results")

    # Get unique clusters
    unique_clusters = sorted(df['cluster'].unique())

    # Human-readable names and descriptions, keyed by cluster id
    cluster_names = {
        0: "Healthy Agers",
        1: "Cardio-Metabolic Risk",
        2: "Immune-Hematologic Aging",
        3: "Uniform Accelerated Aging"
    }

    cluster_descriptions = {
        0: "All organs aging slower than expected",
        1: "Cardiovascular and kidney accelerated",
        2: "Immune and blood systems advanced",
        3: "All systems aging faster"
    }

    clusters_export = {
        "method": "KMeans on UMAP embedding",
        "n_clusters": len(unique_clusters),
        "clusters": [],
        "embedding": [],
        "date_generated": datetime.now().isoformat()
    }

    has_age = 'AGE' in df.columns
    total_rows = len(df)

    # Cluster summaries: size, share of the cohort, mean age/gaps, embedding centroid
    for cluster_id in unique_clusters:
        members = df[df['cluster'] == cluster_id]
        member_count = len(members)

        mean_gaps = {
            organ: float(members[f"{organ}_age_gap"].mean())
            for organ in organs
        }

        clusters_export["clusters"].append({
            "id": int(cluster_id),
            "name": cluster_names.get(cluster_id, f"Cluster {cluster_id}"),
            "description": cluster_descriptions.get(cluster_id, ""),
            "size": int(member_count),
            "percentage": round(100 * member_count / total_rows, 1),
            "characteristics": {
                "mean_age": float(members['AGE'].mean()) if has_age else None,
                "mean_gaps": mean_gaps
            },
            "embedding_center": [
                float(members[axis].mean()) for axis in ('umap_x', 'umap_y')
            ]
        })

    # Embedding points: one entry per individual
    clusters_export["embedding"].extend(
        {
            "id": int(idx),
            "x": float(row['umap_x']),
            "y": float(row['umap_y']),
            "cluster": int(row['cluster']),
            "age": float(row['AGE']) if has_age else None
        }
        for idx, row in df.iterrows()
    )

    done_message = "✓ Exported clusters.json"

else:
    print("⚠ No clustering results found. Run notebook 05 first.")
    print("Creating placeholder clusters.json...")

    # Create placeholder
    clusters_export = {
        "method": "Not yet computed",
        "n_clusters": 0,
        "clusters": [],
        "embedding": [],
        "note": "Run notebook 05 to generate clustering results"
    }

    done_message = "✓ Created placeholder clusters.json"

# Save (same target file for the real export and the placeholder)
output_path = output_dir / "clusters.json"
with open(output_path, 'w') as f:
    json.dump(clusters_export, f, indent=2)

print(done_message)

⚠ No clustering results found. Run notebook 05 first.
Creating placeholder clusters.json...
✓ Created placeholder clusters.json


## 7. Export Feature Importance (Placeholder)

In [10]:
# Feature importance mapping (to be filled with actual importance values)
# This is a placeholder - actual values should come from trained models

higher_is_older = "Higher values → older predicted age"
lower_is_older = "Lower values → older predicted age"

# One tuple per feature: (name, display_name, importance, description, direction)
feature_fields = ("name", "display_name", "importance", "description", "direction")
feature_table = {
    "liver": [
        ("LBXSASSI", "AST (Aspartate Aminotransferase)", 0.25,
         "Marker of liver cell damage", higher_is_older),
        ("LBXSAL", "Albumin", 0.18,
         "Reflects liver synthetic function", lower_is_older),
        ("LBXSGTSI", "GGT (Gamma-Glutamyl Transferase)", 0.15,
         "Elevated in liver disease", higher_is_older),
    ],
    "kidney": [
        ("LBXSCR", "Creatinine", 0.30,
         "Primary marker of kidney function", higher_is_older),
        ("LBXSBU", "BUN (Blood Urea Nitrogen)", 0.22,
         "Waste product filtered by kidneys", higher_is_older),
    ],
    "cardio_metabolic": [
        ("BPXSY1", "Systolic Blood Pressure", 0.28,
         "Increases with vascular aging", higher_is_older),
        ("LBXGH", "HbA1c (Glycohemoglobin)", 0.20,
         "Long-term glucose control marker", higher_is_older),
    ],
    "immune": [
        ("LBXWBCSI", "White Blood Cell Count", 0.25,
         "Overall immune cell count", "Complex relationship with age"),
        ("LBXLYPCT", "Lymphocyte Percentage", 0.20,
         "Adaptive immunity marker", lower_is_older),
    ],
    "hematologic": [
        ("LBXHGB", "Hemoglobin", 0.22,
         "Oxygen-carrying protein", lower_is_older),
        ("LBXRDW", "Red Cell Distribution Width", 0.18,
         "Variability in RBC size", higher_is_older),
    ],
}

# Expand the compact table into {organ: {"features": [feature dicts]}}
feature_info = {
    organ_name: {"features": [dict(zip(feature_fields, entry)) for entry in entries]}
    for organ_name, entries in feature_table.items()
}

# Export each organ's feature importance to its own JSON file
importance_dir = output_dir / "feature_importance"
for organ, data in feature_info.items():
    feature_export = {
        "organ": organ,
        "display_name": organ.replace('_', ' ').title(),
        "model": "HistGradientBoosting",
        "features": data["features"],
        "note": "Feature importance values are illustrative. Run SHAP analysis for actual values.",
        "date_generated": datetime.now().isoformat()
    }

    output_path = importance_dir / f"{organ}.json"
    output_path.write_text(json.dumps(feature_export, indent=2))

    print(f"✓ Exported feature_importance/{organ}.json")

✓ Exported feature_importance/liver.json
✓ Exported feature_importance/kidney.json
✓ Exported feature_importance/cardio_metabolic.json
✓ Exported feature_importance/immune.json
✓ Exported feature_importance/hematologic.json


## 8. Export Individual Profiles (Sample)

In [11]:
# Export first 50 individual profiles as examples
# In production, you might export all or implement on-demand loading

sample_df = df.head(50)
sample_ids = sample_df.index  # also read by the summary cell

# "healthy" mirrors "advanced": a gap more than `threshold` years below zero.
# Derived from the threshold set in the age-gap export cell instead of
# repeating the literal -5, so the two cut-offs cannot drift apart.
healthy_cutoff = -threshold


def _json_float(value):
    """Return ``float(value)``, or None when the value is missing or non-finite.

    ``json.dump`` writes NaN as the bare token ``NaN``, which is not valid JSON
    and breaks ``JSON.parse`` in the web app; None is written as ``null``.
    """
    if pd.isna(value) or not np.isfinite(float(value)):
        return None
    return float(value)


has_age = 'AGE' in sample_df.columns
has_sex = 'RIAGENDR' in sample_df.columns

# Iterate the sampled rows directly: `df.loc[idx]` would return a DataFrame
# (not a row) if the index ever contained duplicate labels.
for idx, row in sample_df.iterrows():
    sex_value = row['RIAGENDR'] if has_sex else None

    individual_profile = {
        "id": int(idx),
        "age": _json_float(row['AGE']) if has_age else None,
        "sex": None if pd.isna(sex_value) else int(sex_value),
        "n_advanced_organs": int(row['n_advanced_organs']),
        "organs": []
    }

    for organ in organs:
        bio_col = f"{organ}_age_bio"
        gap_col = f"{organ}_age_gap"
        adv_col = f"{organ}_advanced"

        gap_value = _json_float(row[gap_col])
        is_advanced = bool(row[adv_col])

        # A missing gap is neither advanced nor healthy, so it stays "normal"
        if is_advanced:
            status = "advanced"
        elif gap_value is not None and gap_value < healthy_cutoff:
            status = "healthy"
        else:
            status = "normal"

        individual_profile["organs"].append({
            "name": organ,
            "display_name": organ.replace('_', ' ').title(),
            "biological_age": _json_float(row[bio_col]) if bio_col in row else None,
            "age_gap": gap_value,
            "status": status,
            "advanced": is_advanced
        })

    # Serialise first so a serialisation error cannot leave a truncated file
    output_path = output_dir / "individuals" / f"{idx}.json"
    payload = json.dumps(individual_profile, indent=2, allow_nan=False)
    with open(output_path, 'w') as f:
        f.write(payload)

print(f"✓ Exported {len(sample_ids)} individual profiles")

✓ Exported 50 individual profiles


## Summary

In [12]:
# Final report: where the files were written, what was generated, and what to do next.
banner = "=" * 60

report_lines = [
    "\n" + banner,
    "Export Complete!",
    banner,
    f"\nOutput directory: {output_dir}",
    "\nGenerated files:",
    "  ✓ age_gaps.json",
    "  ✓ metrics_summary.json",
    "  ✓ correlations.json",
    "  ✓ trajectories.json",
    "  ✓ clusters.json",
    "  ✓ feature_importance/ (5 files)",
    f"  ✓ individuals/ ({len(sample_ids)} sample files)",
    "\nNext steps:",
    "  1. Copy these files to your web app's public/data directory",
    "  2. Use PROMPT_GEMINI_WEB_APP.md to generate the web application",
    "  3. Test the web app locally",
    "  4. Deploy to Vercel/Netlify/GitHub Pages",
    "\n" + banner,
]

# One print of the newline-joined lines emits the same text as printing each line in turn.
print("\n".join(report_lines))


Export Complete!

Output directory: C:\Users\bastien\Documents\TAF\Hackathon\Vitalist\web_app\public\data

Generated files:
  ✓ age_gaps.json
  ✓ metrics_summary.json
  ✓ correlations.json
  ✓ trajectories.json
  ✓ clusters.json
  ✓ feature_importance/ (5 files)
  ✓ individuals/ (50 sample files)

Next steps:
  1. Copy these files to your web app's public/data directory
  2. Use PROMPT_GEMINI_WEB_APP.md to generate the web application
  3. Test the web app locally
  4. Deploy to Vercel/Netlify/GitHub Pages

