# Hybrid Exoplanet Detection Pipeline - Demo

This notebook demonstrates the complete hybrid exoplanet detection pipeline that combines:
- **Tabular features**: Stellar and planetary parameters from Kepler catalog
- **Light curve residuals**: Phase-folded 1D CNN analysis of transit signatures  
- **Pixel differences**: 2D CNN analysis of Target Pixel File (TPF) images
- **Late fusion**: XGBoost stacker combining all three model outputs

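The pipeline uses PyTorch for the neural networks and processes real Kepler data through the Lightkurve library. When the real data is not available locally, this notebook falls back to synthetic stand-in data so that every cell still runs.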

In [None]:
# Import required libraries
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import warnings
warnings.filterwarnings('ignore')

# Add src directory to path
sys.path.append('../src')

# Import our custom modules
from data_loader import load_and_prepare_data, load_kepler_features, load_koi_labels
from features import download_light_curve, process_single_koi, load_residual_window
from pixel_diff import process_single_koi_tpf, load_pixel_difference, visualize_pixel_difference
from models import create_models, TabularNet, ResidualCNN1D, PixelCNN2D
from evaluate import plot_roc_curves, create_metrics_summary_table

# Environment report: confirm the imports above succeeded, then show the
# installed PyTorch version and whether a CUDA device is visible.
print(
    "✅ All imports successful!",
    f"PyTorch version: {torch.__version__}",
    f"Device available: {'CUDA' if torch.cuda.is_available() else 'CPU'}",
    sep="\n",
)

## 1. Data Loading and Exploration

Let's start by loading and exploring the tabular features from the Kepler catalog.

In [None]:
# Tabular inputs: prefer the on-disk catalog tables; if anything in that path
# fails, build a small seeded synthetic dataset so the later cells still run.
# `real_data_available` records which of the two paths was taken.
try:
    # Real feature and label tables from the raw-data directory
    df_features = load_kepler_features("../data/raw")
    df_labels = load_koi_labels("../data/raw")

    print("✅ Loaded real data:")
    print(f"   Features: {df_features.shape}")
    print(f"   Labels: {df_labels.shape}")

    # Peek at the first rows of the feature table
    print("\nSample features:")
    print(df_features.head())

    # Class balance of the label table
    print("\nLabel distribution:")
    print(df_labels['disposition'].value_counts())

    real_data_available = True

except Exception as e:
    # Deliberately broad: any failure above (missing files, missing columns, ...)
    # switches the demo over to synthetic data instead of stopping the notebook.
    print(f"⚠️  Real data not available: {e}")
    print("📊 Creating synthetic demo data instead...")

    n_samples = 200
    n_features = 50
    np.random.seed(42)

    # The draw order (IDs, then features, then labels) fixes the values
    # produced under the seed above, so it must not be rearranged.
    kepids = np.random.randint(10000000, 20000000, n_samples)
    features = np.random.randn(n_samples, n_features)
    labels = np.random.binomial(1, 0.15, n_samples)  # 15% positive rate

    # Feature table: one column per synthetic feature, with kepid appended last
    feature_names = [f'feature_{i}' for i in range(n_features)]
    df_features = pd.DataFrame(features, columns=feature_names).assign(kepid=kepids)

    # Label table: binary draw mapped onto the two disposition strings
    disposition_by_flag = {0: 'FALSE POSITIVE', 1: 'CONFIRMED'}
    df_labels = pd.DataFrame({
        'kepid': kepids,
        'disposition': [disposition_by_flag[int(flag)] for flag in labels],
    })

    print("✅ Created synthetic data:")
    print(f"   Features: {df_features.shape}")
    print(f"   Labels: {df_labels.shape}")

    real_data_available = False

## 2. Light Curve Processing Demo

We'll demonstrate downloading and processing a Kepler light curve to create residual windows.

In [None]:
# Light-curve demo cell.
#
# With real data available: download the light curve for one target, plot it,
# build its residual window and plot the two rows of that window.
# Without real data: synthesize a noisy residual/trend pair with a box-shaped
# dip around phase 0, plot it, and save it as a .npy file.

# Example KOI for demonstration
# Using known exoplanet host: Kepler-10 (KIC 11904151)
demo_kepid = 11904151

# Single definitions for values that were previously repeated as literals
residual_dir = "../data/processed/residual_windows"
n_phase_points = 512

print(f"🔭 Processing light curve for KIC {demo_kepid}")

if real_data_available:
    try:
        # Download and process light curve
        lc = download_light_curve(demo_kepid)

        if lc is not None:
            print(f"✅ Downloaded light curve: {len(lc.flux)} data points")

            # Two stacked panels: raw light curve on top, residual window below
            fig, axes = plt.subplots(2, 1, figsize=(12, 8))

            # Raw light curve
            lc.plot(ax=axes[0])
            axes[0].set_title(f'KIC {demo_kepid} - Raw Light Curve')

            # Process to create residual window; report each failure mode
            # explicitly instead of silently leaving the lower panel blank.
            residual = None
            success = process_single_koi(demo_kepid, residual_dir)

            if success:
                print("✅ Created residual window")
                residual = load_residual_window(demo_kepid, residual_dir)
                if residual is None:
                    print("❌ Residual window was created but could not be loaded")
            else:
                print("❌ Failed to create residual window")

            if residual is not None:
                # Row 0 is plotted as the residuals and row 1 as the local
                # trend; the phase axis spans [-0.5, 0.5] over the row length.
                phase = np.linspace(-0.5, 0.5, residual.shape[1])
                axes[1].plot(phase, residual[0], 'b-', label='Residuals', alpha=0.8)
                axes[1].plot(phase, residual[1], 'r-', label='Local Trend', alpha=0.8)
                axes[1].set_xlabel('Phase')
                axes[1].set_ylabel('Normalized Flux')
                axes[1].set_title('Phase-Folded Residual Window')
                axes[1].legend()
                axes[1].grid(True, alpha=0.3)
            else:
                # Nothing to draw: hide the unused panel rather than show an
                # empty, unlabeled axis.
                axes[1].set_visible(False)

            plt.tight_layout()
            plt.show()

        else:
            print("❌ Failed to download light curve")

    except Exception as e:
        # Best-effort demo: report the problem and let the notebook continue.
        print(f"❌ Error processing light curve: {e}")

else:
    print("📊 Creating synthetic residual window for demo...")

    # Create synthetic residual window
    phase = np.linspace(-0.5, 0.5, n_phase_points)

    # Synthetic transit signal
    transit_phase = 0.05  # Transit half-width in phase
    transit_depth = 0.001  # Transit depth

    residuals = np.random.normal(0, 0.0005, n_phase_points)  # Noise
    trend = np.random.normal(0, 0.0002, n_phase_points)      # Local trend

    # Add synthetic transit: subtract the depth wherever |phase| < half-width
    in_transit = np.abs(phase) < transit_phase
    residuals[in_transit] -= transit_depth

    # Plot synthetic data
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(phase, residuals, 'b-', label='Residuals (with transit)', alpha=0.8)
    ax.plot(phase, trend, 'r-', label='Local Trend', alpha=0.8)
    ax.set_xlabel('Phase')
    ax.set_ylabel('Normalized Flux')
    ax.set_title('Synthetic Phase-Folded Residual Window')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

    # Save synthetic residual as a (2, n_phase_points) float32 array:
    # row 0 = residuals, row 1 = trend.
    # NOTE(review): this writes under the demo target's ID in the processed-data
    # directory and would overwrite an existing file of the same name there;
    # confirm that clobbering a previously processed real window is acceptable.
    synthetic_residual = np.stack([residuals, trend], axis=0).astype(np.float32)
    os.makedirs(residual_dir, exist_ok=True)
    np.save(f"{residual_dir}/residual_{demo_kepid}.npy", synthetic_residual)

    print("✅ Created synthetic residual window")

## 3. Model Testing Demo

Let's create and test the three neural network models.

In [None]:
# Model smoke test: build the three networks, report their parameter counts,
# and push one random batch through each to confirm the forward pass runs.

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"🔧 Using device: {device}")

# Create models
n_tabular_features = df_features.shape[1] - 1  # Exclude kepid
tabular_net, residual_net, pixel_net = create_models(
    tabular_input_dim=n_tabular_features,
    residual_length=512,
    pixel_size=(16, 16),
    device=device
)

print("✅ Created neural network models:")
print(f"   TabularNet: {sum(p.numel() for p in tabular_net.parameters()):,} parameters")
print(f"   ResidualCNN1D: {sum(p.numel() for p in residual_net.parameters()):,} parameters")
print(f"   PixelCNN2D: {sum(p.numel() for p in pixel_net.parameters()):,} parameters")

# Test forward pass with dummy data
batch_size = 4

# Create dummy inputs (shapes match the arguments passed to create_models above)
tabular_input = torch.randn(batch_size, n_tabular_features).to(device)
residual_input = torch.randn(batch_size, 2, 512).to(device)
pixel_input = torch.randn(batch_size, 1, 16, 16).to(device)

print(f"\n🧪 Testing forward pass with batch size {batch_size}:")

# Switch to evaluation mode before inference. no_grad() only disables gradient
# tracking; without eval(), any Dropout layers would stay active and any
# BatchNorm layers would update their running statistics from this random batch.
# Call .train() on the models again before training them.
for net in (tabular_net, residual_net, pixel_net):
    net.eval()

# Test each model
with torch.no_grad():
    tab_out = tabular_net(tabular_input)
    res_out = residual_net(residual_input)
    pix_out = pixel_net(pixel_input)

print(f"   Tabular output: {tab_out.shape} (range: {tab_out.min():.3f} - {tab_out.max():.3f})")
print(f"   Residual output: {res_out.shape} (range: {res_out.min():.3f} - {res_out.max():.3f})")
print(f"   Pixel output: {pix_out.shape} (range: {pix_out.min():.3f} - {pix_out.max():.3f})")

# Only report success when every output is finite (no NaN/Inf values).
outputs_finite = all(
    bool(torch.isfinite(out).all()) for out in (tab_out, res_out, pix_out)
)
if outputs_finite:
    print("✅ All models working correctly!")
else:
    print("❌ At least one model produced non-finite outputs")

## 4. Summary

This notebook demonstrates the key components of the hybrid exoplanet detection pipeline:

### ✅ What we've shown:
- **Data loading**: Tabular features from Kepler catalog
- **Light curve processing**: Phase-folded residual windows using Box Least Squares (BLS)
- **Model architecture**: Three PyTorch neural networks (MLP, 1D CNN, 2D CNN)
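- **Pipeline integration**: How the data-loading, light-curve, and model components fit together in an end-to-end workflow (the pixel-difference step and the XGBoost fusion are not executed in this notebook)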

### 🚀 Next steps for full pipeline:
1. **Data preprocessing**: Run `python src/features.py` to process all KOIs
2. **Full training**: Run `python src/train.py --epochs 100` for complete training
3. **Evaluation**: Generate comprehensive reports with `src/evaluate.py`
4. **Fusion stacking**: Train XGBoost on combined model outputs

The complete pipeline combines the strengths of all three data modalities for robust exoplanet detection! 🪐