In [None]:
# Colab setup
# Installs the notebook's third-party dependencies with the %pip magic
# (-q keeps pip's output quiet).
# NOTE(review): no versions are pinned, so a rerun may pick up different
# releases — consider pinning for reproducibility.
%pip install -q xgboost lightgbm shap pyarrow fastparquet pillow scikit-learn matplotlib seaborn joblib


# Multimodal Data Fusion for Alzheimer's Disease Prediction

This notebook integrates multiple data modalities (NPZ, MRI, Genomic) using different fusion strategies to improve prediction performance.

## Features:
- Load NPZ preprocessed data
- Extract MRI image features from parquet files
- Process genomic variant data (ADVP TSV)
- Implement early, late, and intermediate fusion strategies
- Evaluate fusion performance
- Save fused datasets

## Fusion Strategies:
- **Early Fusion**: Concatenate features before training
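- **Late Fusion**: Train separate models per modality and combine their predictions (note: the fusion cell below currently builds its `late_fusion` entry by concatenating features, which yields the same matrix as early fusion)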
- **Intermediate Fusion**: Use dimensionality reduction (PCA) before fusion

## Outputs:
- Fused datasets and evaluation results
- Fusion strategy comparison
- Best fusion approach selection


In [None]:
# Setup
# Order matters in this cell: the thread-limit environment variables are set
# before numpy/pandas are imported further down.
import os
import sys
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

# Add src to path
# NOTE(review): the import below uses the package-qualified name `src.data_fusion`,
# which presumably resolves via the working directory rather than via './src' —
# confirm which import style is intended.
sys.path.append('./src')

# Set thread limits for stability
# Each variable caps the thread pool of one numeric backend (OpenMP, MKL,
# OpenBLAS, numexpr) at a single thread.
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['NUMEXPR_MAX_THREADS'] = '1'

# Third-party imports, deliberately placed after the environment variables above.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import our fusion module
from src.data_fusion import DataFusion

# Create results directory
# exist_ok=True makes this safe to re-run when the directory already exists.
os.makedirs('results', exist_ok=True)

print("✅ Setup complete - Ready for multimodal fusion")


In [None]:
# Initialize Data Fusion
# Creates the shared DataFusion helper and loads the NPZ arrays; if loading
# fails for any reason, a seeded random stand-in with the same keys is built
# so the remaining cells can still run.
print("🔄 Initializing multimodal data fusion...")

# Helper instance reused by the later cells.
fusion = DataFusion(random_state=42)

print("📊 Loading NPZ data...")
try:
    npz_data = fusion.load_npz_data('data/processed/preprocessed_alz_data.npz')
    loaded_train_shape = npz_data['X_train'].shape
    loaded_test_shape = npz_data['X_test'].shape
    print(f"✅ NPZ data loaded: {loaded_train_shape}, {loaded_test_shape}")
except Exception as load_error:
    print(f"⚠️ NPZ loading failed: {load_error}")
    print("🔄 Creating sample NPZ data...")

    # Draw order (X_train, X_test, y_train, y_test) is kept fixed so the
    # seeded sample data is reproducible.
    np.random.seed(42)
    sample_X_train = np.random.randn(800, 50)
    sample_X_test = np.random.randn(200, 50)
    sample_y_train = np.random.choice([0, 1, 2], 800)
    sample_y_test = np.random.choice([0, 1, 2], 200)
    npz_data = {
        'X_train': sample_X_train,
        'X_test': sample_X_test,
        'y_train': sample_y_train,
        'y_test': sample_y_test,
    }
    print(f"✅ Sample NPZ data created: {npz_data['X_train'].shape}")

train_shape = npz_data['X_train'].shape
test_shape = npz_data['X_test'].shape
class_counts = np.bincount(npz_data['y_train'])
print(f"📊 NPZ data shape: Train {train_shape}, Test {test_shape}")
print(f"📊 Target distribution: {class_counts}")


In [None]:
# Load MRI Data (Optional)
# Reads the MRI parquet files and extracts features through the DataFusion
# helper. On any failure, seeded random features are generated instead so the
# later cells can still run. Defines mri_features_train, mri_features_test and
# mri_available for the following cells.
print("🖼️ Loading MRI data...")

try:
    # Try to load MRI parquet files
    train_df = pd.read_parquet('data/raw/train.parquet')
    test_df = pd.read_parquet('data/raw/test.parquet')

    print(f"✅ MRI data loaded: Train {train_df.shape}, Test {test_df.shape}")

    # Extract MRI features
    # NOTE(review): max_samples=300 presumably caps how many rows are processed;
    # if so, the returned row counts may differ from the NPZ arrays these
    # features are later stacked with — confirm against extract_mri_features.
    print("🔄 Extracting MRI features...")
    mri_features_train, mri_features_test = fusion.extract_mri_features(
        train_df, test_df, max_samples=300
    )

    print(f"✅ MRI features extracted: Train {mri_features_train.shape}, Test {mri_features_test.shape}")

    mri_available = True

except Exception as e:
    print(f"⚠️ MRI loading failed: {e}")
    print("🔄 Creating sample MRI features...")

    # Size the stand-in features from the NPZ arrays so they can be stacked
    # next to them even when the real NPZ file (not the 800/200 sample) loaded.
    n_train_rows = npz_data['X_train'].shape[0]
    n_test_rows = npz_data['X_test'].shape[0]

    # Create sample MRI features
    # NOTE(review): the NPZ and genomic fallbacks reseed with the same value,
    # so all sample modalities draw from the same pseudo-random stream.
    np.random.seed(42)
    mri_features_train = np.random.randn(n_train_rows, 42)  # 42 MRI features
    mri_features_test = np.random.randn(n_test_rows, 42)

    print(f"✅ Sample MRI features created: Train {mri_features_train.shape}, Test {mri_features_test.shape}")
    # Sample features count as "available" so every fusion strategy still runs.
    mri_available = True

if mri_available:
    print(f"📊 MRI features shape: Train {mri_features_train.shape}, Test {mri_features_test.shape}")
else:
    print("⚠️ MRI features not available")


In [None]:
# Load Genomic Data (Optional)
# Reads the ADVP TSV, keeps the selected columns that exist, and carves out
# train/test feature blocks whose row counts match the NPZ arrays. On any
# failure, seeded random features are generated instead. Defines
# genomic_features_train, genomic_features_test (when available) and
# genomic_available for the following cells.
print("🧬 Loading genomic data...")

# Row counts the genomic blocks must match; computed up front so that both the
# real-data path and the fallback path use the same sizes.
n_train = npz_data['X_train'].shape[0]
n_test = npz_data['X_test'].shape[0]

try:
    # Try to load ADVP TSV file
    genomic_df = pd.read_csv('data/raw/advp.hg38.tsv', sep='\t')

    print(f"✅ Genomic data loaded: {genomic_df.shape}")

    # Process genomic data (simplified)
    print("🔄 Processing genomic features...")

    # Select key genomic features
    genomic_features = ['P-value', 'OR_nonref', 'Sample size']
    available_features = [col for col in genomic_features if col in genomic_df.columns]

    if available_features:
        # Missing values are replaced with 0 before the rows are used as features.
        genomic_processed = genomic_df[available_features].fillna(0)
        genomic_values = genomic_processed.values

        if len(genomic_values) == 0:
            # Caught by the handler below, which switches to sample features.
            raise ValueError("genomic table has no rows")

        # Build one pool of n_train + n_test rows (repeating the table only if
        # it is too short) and split it, so the test block continues where the
        # train block ends instead of restarting from the first row.
        n_needed = n_train + n_test
        n_repeats = -(-n_needed // len(genomic_values))  # ceiling division
        genomic_pool = np.tile(genomic_values, (n_repeats, 1))[:n_needed]
        genomic_features_train = genomic_pool[:n_train]
        genomic_features_test = genomic_pool[n_train:]

        print(f"✅ Genomic features processed: Train {genomic_features_train.shape}, Test {genomic_features_test.shape}")
        genomic_available = True
    else:
        print("⚠️ No suitable genomic features found")
        genomic_available = False

except Exception as e:
    print(f"⚠️ Genomic loading failed: {e}")
    print("🔄 Creating sample genomic features...")

    # Create sample genomic features, sized from the NPZ arrays so they can be
    # stacked next to them whatever the NPZ row counts are.
    np.random.seed(42)
    genomic_features_train = np.random.randn(n_train, 3)  # 3 genomic features
    genomic_features_test = np.random.randn(n_test, 3)

    print(f"✅ Sample genomic features created: Train {genomic_features_train.shape}, Test {genomic_features_test.shape}")
    # Sample features count as "available" so every fusion strategy still runs.
    genomic_available = True

if genomic_available:
    print(f"📊 Genomic features shape: Train {genomic_features_train.shape}, Test {genomic_features_test.shape}")
else:
    print("⚠️ Genomic features not available")


In [None]:
# Perform Fusion Strategies
# Builds one (training features, training labels) pair per strategy in
# `fusion_results`; the evaluation cell passes that dict to
# fusion.evaluate_fusion_strategies. Also defines npz_test / y_test and the
# fitted pca_* objects for later cells.
print("🔄 Performing multimodal fusion...")

fusion_results = {}

# Prepare data for fusion
npz_train = npz_data['X_train']
npz_test = npz_data['X_test']
y_train = npz_data['y_train']
y_test = npz_data['y_test']

# Optional modalities are stacked column-wise next to the NPZ features and
# paired with the NPZ labels, so their row counts must match. Fail here with a
# readable message instead of inside np.hstack or the evaluation step.
n_train_samples = npz_train.shape[0]
optional_train_blocks = []
if mri_available:
    optional_train_blocks.append(('MRI', mri_features_train))
if genomic_available:
    optional_train_blocks.append(('Genomic', genomic_features_train))
for modality_name, modality_train in optional_train_blocks:
    if modality_train.shape[0] != n_train_samples:
        raise ValueError(
            f"{modality_name} training features have {modality_train.shape[0]} rows "
            f"but the NPZ training data has {n_train_samples}; modalities must be row-aligned"
        )

# 1. NPZ Only (baseline)
print("\n📊 NPZ Only (baseline)...")
fusion_results['npz_only'] = (npz_train, y_train)

# 2. MRI Only
if mri_available:
    print("🖼️ MRI Only...")
    fusion_results['mri_only'] = (mri_features_train, y_train)

# 3. Genomic Only
if genomic_available:
    print("🧬 Genomic Only...")
    fusion_results['genomic_only'] = (genomic_features_train, y_train)

# 4. Early Fusion: concatenate whichever modalities are available.
print("🔗 Early Fusion...")
if mri_available and genomic_available:
    early_fusion_train = np.hstack([npz_train, mri_features_train, genomic_features_train])
    fusion_results['early_fusion'] = (early_fusion_train, y_train)
    print(f"  Early fusion shape: {early_fusion_train.shape}")
elif mri_available:
    early_fusion_train = np.hstack([npz_train, mri_features_train])
    fusion_results['early_fusion_npz_mri'] = (early_fusion_train, y_train)
    print(f"  Early fusion (NPZ+MRI) shape: {early_fusion_train.shape}")
elif genomic_available:
    early_fusion_train = np.hstack([npz_train, genomic_features_train])
    fusion_results['early_fusion_npz_genomic'] = (early_fusion_train, y_train)
    print(f"  Early fusion (NPZ+Genomic) shape: {early_fusion_train.shape}")

# 5. Late Fusion
# NOTE(review): this is the same concatenation as early fusion. Combining the
# predictions of separately trained per-modality models cannot be expressed as
# a single (X, y) pair, so this entry is effectively a placeholder.
print("🔗 Late Fusion...")
if mri_available and genomic_available:
    late_fusion_train = np.hstack([npz_train, mri_features_train, genomic_features_train])
    fusion_results['late_fusion'] = (late_fusion_train, y_train)
    print(f"  Late fusion shape: {late_fusion_train.shape}")

# 6. Intermediate Fusion (PCA)
print("🔗 Intermediate Fusion (PCA)...")
if mri_available and genomic_available:
    from sklearn.decomposition import PCA

    # Reduce each modality separately. The component count is capped by both
    # the feature count and the sample count (PCA cannot produce more
    # components than either), and random_state fixes any randomized solver.
    pca_npz = PCA(n_components=min(20, npz_train.shape[1], n_train_samples), random_state=42)
    pca_mri = PCA(n_components=min(15, mri_features_train.shape[1], n_train_samples), random_state=42)
    pca_genomic = PCA(n_components=min(3, genomic_features_train.shape[1], n_train_samples), random_state=42)

    # The PCA objects are fitted on training data only and kept as notebook
    # variables so test data can later be projected with the same components.
    npz_pca = pca_npz.fit_transform(npz_train)
    mri_pca = pca_mri.fit_transform(mri_features_train)
    genomic_pca = pca_genomic.fit_transform(genomic_features_train)

    intermediate_fusion_train = np.hstack([npz_pca, mri_pca, genomic_pca])
    fusion_results['intermediate_fusion'] = (intermediate_fusion_train, y_train)
    print(f"  Intermediate fusion shape: {intermediate_fusion_train.shape}")

print(f"\n✅ Fusion strategies completed: {len(fusion_results)} strategies")


In [None]:
# Evaluate Fusion Strategies
# Scores every entry of `fusion_results` through the DataFusion helper, lays
# the scores out as a table sorted by test score (best first), and records the
# top row as best_strategy / best_score / best_gap for the later cells.
print("📊 Evaluating fusion strategies...")

evaluation_results = fusion.evaluate_fusion_strategies(fusion_results)

print("\n📊 Fusion Strategy Evaluation Results:")
print("-" * 60)

# One table row per strategy; the key is turned into a display name
# ("early_fusion" -> "Early Fusion").
summary_rows = [
    {
        'Strategy': strategy_key.replace('_', ' ').title(),
        'Train_Score': strategy_scores['train_score'],
        'Test_Score': strategy_scores['test_score'],
        'Gap': strategy_scores['gap'],
        'N_Features': strategy_scores['n_features'],
    }
    for strategy_key, strategy_scores in evaluation_results.items()
]

eval_df = pd.DataFrame(summary_rows).sort_values('Test_Score', ascending=False)

print(eval_df.to_string(index=False))

# After the descending sort, the first row is the best-scoring strategy.
top_row = eval_df.iloc[0]
best_strategy = top_row['Strategy']
best_score = top_row['Test_Score']
best_gap = top_row['Gap']

print(f"\n🏆 Best fusion strategy: {best_strategy}")
print(f"📊 Test Score: {best_score:.4f}")
print(f"📊 Train-Test Gap: {best_gap:.4f}")


In [None]:
# Visualize Fusion Results
# Draws a 2x3 grid of panels summarising `eval_df`: test scores, train vs
# test, train-test gap, feature count vs score, per-strategy coloured bars and
# summary statistics.
print("📈 Visualizing fusion results...")

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
(ax_test, ax_train_test, ax_gap), (ax_features, ax_strategy, ax_summary) = axes
bar_positions = range(len(eval_df))

# Panel 1: test score per strategy
ax_test.bar(bar_positions, eval_df['Test_Score'], alpha=0.7, color='skyblue')
ax_test.set_xlabel('Fusion Strategy')
ax_test.set_ylabel('Test Score')
ax_test.set_title('Test Score Comparison')
ax_test.set_xticks(bar_positions)
ax_test.set_xticklabels(eval_df['Strategy'], rotation=45, ha='right')
ax_test.grid(True, alpha=0.3)

# Panel 2: train vs test score, with the diagonal as the zero-gap reference
ax_train_test.scatter(eval_df['Train_Score'], eval_df['Test_Score'], s=100, alpha=0.7)
ax_train_test.plot([0, 1], [0, 1], 'r--', alpha=0.5, label='Perfect Generalization')
ax_train_test.set_xlabel('Train Score')
ax_train_test.set_ylabel('Test Score')
ax_train_test.set_title('Train vs Test Scores')
ax_train_test.legend()
ax_train_test.grid(True, alpha=0.3)

# Panel 3: train-test gap per strategy
ax_gap.bar(bar_positions, eval_df['Gap'], alpha=0.7, color='red')
ax_gap.set_xlabel('Fusion Strategy')
ax_gap.set_ylabel('Train-Test Gap')
ax_gap.set_title('Overfitting Analysis')
ax_gap.set_xticks(bar_positions)
ax_gap.set_xticklabels(eval_df['Strategy'], rotation=45, ha='right')
ax_gap.grid(True, alpha=0.3)

# Panel 4: number of features vs test score
ax_features.scatter(eval_df['N_Features'], eval_df['Test_Score'], s=100, alpha=0.7)
ax_features.set_xlabel('Number of Features')
ax_features.set_ylabel('Test Score')
ax_features.set_title('Features vs Performance')
ax_features.grid(True, alpha=0.3)

# Panel 5: one individually coloured bar per strategy
strategies = eval_df['Strategy'].tolist()
scores = eval_df['Test_Score'].tolist()
colors = plt.cm.viridis(np.linspace(0, 1, len(strategies)))

for position, (strategy_label, strategy_score) in enumerate(zip(strategies, scores)):
    ax_strategy.bar(position, strategy_score, color=colors[position], alpha=0.7, label=strategy_label)

# Long strategy names are truncated to 10 characters to keep tick labels readable.
short_labels = [name[:10] + '...' if len(name) > 10 else name for name in strategies]
ax_strategy.set_xlabel('Strategy')
ax_strategy.set_ylabel('Test Score')
ax_strategy.set_title('Strategy Performance')
ax_strategy.set_xticks(range(len(strategies)))
ax_strategy.set_xticklabels(short_labels, rotation=45)
ax_strategy.grid(True, alpha=0.3)

# Panel 6: summary statistics of the test scores
summary_stats = {
    'Best Score': best_score,
    'Worst Score': eval_df['Test_Score'].min(),
    'Avg Score': eval_df['Test_Score'].mean(),
    'Score Std': eval_df['Test_Score'].std()
}

bars = ax_summary.bar(list(summary_stats.keys()), list(summary_stats.values()), alpha=0.7, color='lightgreen')
ax_summary.set_ylabel('Score')
ax_summary.set_title('Summary Statistics')
plt.setp(ax_summary.get_xticklabels(), rotation=45)
ax_summary.grid(True, alpha=0.3)

# Print each statistic just above its bar.
for stat_bar, stat_value in zip(bars, summary_stats.values()):
    label_x = stat_bar.get_x() + stat_bar.get_width()/2
    label_y = stat_bar.get_height() + 0.01
    ax_summary.text(label_x, label_y, f'{stat_value:.3f}', ha='center', va='bottom')

fig.tight_layout()
plt.show()

print("✅ Fusion visualization complete")


In [None]:
# Save Fusion Results
# Writes three timestamped artifacts to results/: the evaluation table (CSV),
# a JSON summary of every strategy, and the train/test arrays of the
# best-scoring strategy (NPZ).
# json is imported here because the notebook's setup cell does not import it
# and this cell is its only user.
import json

print("💾 Saving fusion results...")

# One timestamp shared by every file written by this cell.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save evaluation CSV
csv_path = f'results/fusion_evaluation_{timestamp}.csv'
eval_df.to_csv(csv_path, index=False)
print(f"✅ Fusion evaluation saved to: {csv_path}")

# Save detailed fusion results. Values are cast to built-in types so numpy
# scalars cannot break JSON serialization.
fusion_summary = {
    'best_strategy': best_strategy,
    'best_score': float(best_score),
    'best_gap': float(best_gap),
    'total_strategies': len(eval_df),
    'available_modalities': {
        'npz': True,
        'mri': bool(mri_available),
        'genomic': bool(genomic_available)
    },
    'strategy_results': {
        strategy_name: {
            'train_score': float(results['train_score']),
            'test_score': float(results['test_score']),
            'gap': float(results['gap']),
            'n_features': int(results['n_features'])
        }
        for strategy_name, results in evaluation_results.items()
    }
}

json_path = f'results/fusion_summary_{timestamp}.json'
with open(json_path, 'w') as f:
    json.dump(fusion_summary, f, indent=2)
print(f"✅ Fusion summary saved to: {json_path}")

# Save best fused dataset
# best_strategy holds the display name ("Early Fusion"); map it back to the
# fusion_results key through an explicit lookup, falling back to the string
# transformation for names that are not in the lookup.
display_name_to_key = {key.replace('_', ' ').title(): key for key in fusion_results}
best_strategy_key = display_name_to_key.get(
    best_strategy, best_strategy.lower().replace(' ', '_')
)

if best_strategy_key in fusion_results:
    best_X, best_y = fusion_results[best_strategy_key]

    # Test blocks in the same column order in which the training matrices of
    # the concatenation-based strategies were stacked.
    stacked_test_blocks = [npz_test]
    if mri_available:
        stacked_test_blocks.append(mri_features_test)
    if genomic_available:
        stacked_test_blocks.append(genomic_features_test)

    # Build the test matrix the same way the training matrix was built.
    if best_strategy_key == 'npz_only':
        best_X_test = npz_test
    elif best_strategy_key == 'mri_only':
        best_X_test = mri_features_test
    elif best_strategy_key == 'genomic_only':
        best_X_test = genomic_features_test
    elif best_strategy_key == 'intermediate_fusion':
        # Project the test data with the PCA objects fitted on the training data.
        best_X_test = np.hstack([
            pca_npz.transform(npz_test),
            pca_mri.transform(mri_features_test),
            pca_genomic.transform(genomic_features_test),
        ])
    elif best_strategy_key == 'late_fusion' or 'early_fusion' in best_strategy_key:
        best_X_test = np.hstack(stacked_test_blocks)
    else:
        best_X_test = None  # unknown strategy: no way to build a matching test set

    # Only save a dataset whose train and test matrices share the same feature layout.
    if best_X_test is None or np.shape(best_X_test)[1:] != np.shape(best_X)[1:]:
        print(f"⚠️ Could not build a matching test set for '{best_strategy_key}'; best fused dataset not saved")
    else:
        fused_data_path = f'results/best_fused_dataset_{timestamp}.npz'
        np.savez(fused_data_path,
                 X_train=best_X,
                 X_test=best_X_test,
                 y_train=best_y,
                 y_test=y_test)
        print(f"✅ Best fused dataset saved to: {fused_data_path}")
else:
    print(f"⚠️ Strategy '{best_strategy_key}' not found in fusion results; best fused dataset not saved")

print(f"\n🎉 Multimodal fusion complete!")
print(f"🏆 Best strategy: {best_strategy} (Test Score: {best_score:.4f})")
print(f"📊 All fusion results saved to results/ directory")
