# CGMacros CCR Prediction - Complete Model Training

This notebook implements comprehensive model training and evaluation for the CGMacros dataset.
We'll train multiple model types and perform thorough evaluation with participant-aware validation.

## 1. Setup and Imports

In [None]:
# Standard-library and third-party dependencies used by the cells below.
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime

# NOTE(review): 'ignore' with no category filter silences every warning for the
# rest of the kernel session, including any raised by the training and
# evaluation cells. Consider narrowing this to specific categories.
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

# Add src directory to path
# The path is relative, so it resolves against the kernel's working directory.
# It must be appended before the project imports below (presumably the
# *_updated modules live in ../src - confirm).
sys.path.append('../src')

# Import our custom modules
from data_loader_updated import DataLoader
from feature_engineering_updated import FeatureEngineer
from target_updated import compute_ccr, remove_nutrient_columns
from models_updated import ModelTrainer
from evaluation_updated import ModelEvaluator, EvaluationReport

# Reaching these prints means every import above succeeded.
print("All imports successful!")
print(f"Notebook started at: {datetime.now()}")

## 2. Data Loading and Preparation

In [None]:
# Load each raw source through a single DataLoader, report its shape, then merge.
data_loader = DataLoader()

# One row per source: (banner printed first, label for the shape report,
# bound loader method, path handed to that method).
load_plan = [
    ("Loading CGMacros time-series data...", "CGMacros",
     data_loader.load_cgmacros_data, '../data/raw/CGMacros_CSVs'),
    ("\nLoading demographic data...", "Bio",
     data_loader.load_bio_data, '../data/raw/bio.csv'),
    ("\nLoading microbiome data...", "Microbes",
     data_loader.load_microbes_data, '../data/raw/microbes.csv'),
    ("\nLoading gut health data...", "Gut health",
     data_loader.load_gut_health_data, '../data/raw/gut_health_test.csv'),
]

loaded_frames = []
for banner, label, load_source, source_path in load_plan:
    print(banner)
    frame = load_source(source_path)
    print(f"{label} data shape: {frame.shape}")
    loaded_frames.append(frame)

# Keep the individual frames available under their usual names.
cgmacros_data, bio_data, microbes_data, gut_health_data = loaded_frames

# Merge in the same positional order the sources were loaded.
print("\nMerging all data sources...")
merged_data = data_loader.merge_data_sources(*loaded_frames)
print(f"Merged data shape: {merged_data.shape}")
print(f"Participants: {merged_data['participant_id'].nunique()}")
timestamps = merged_data['Timestamp']
print(f"Date range: {timestamps.min()} to {timestamps.max()}")

## 3. Feature Engineering

In [None]:
# Configure the feature engineer with the rolling-window sizes (in hours).
feature_engineer = FeatureEngineer(
    glucose_window_hours=[1, 2, 4, 6, 12],
    activity_window_hours=[1, 2, 4],
)

# Work on a copy so merged_data stays untouched by the steps below.
feature_data = merged_data.copy()
print(f"Starting feature engineering with shape: {feature_data.shape}")

# Ordered pipeline: (name used in the progress messages, step to apply).
# Each step receives the frame produced by the previous one.
feature_steps = [
    ("glucose", feature_engineer.add_glucose_features),
    ("activity", feature_engineer.add_activity_features),
    ("meal timing", feature_engineer.add_meal_timing_features),
    ("demographic", feature_engineer.add_demographic_features),
    ("microbiome", feature_engineer.add_microbiome_features),
    ("gut health", feature_engineer.add_gut_health_features),
    ("temporal", feature_engineer.add_temporal_features),
]

final_position = len(feature_steps) - 1
for position, (step_name, add_features) in enumerate(feature_steps):
    print(f"\nAdding {step_name} features...")
    feature_data = add_features(feature_data)
    if position == final_position:
        # The last step reports the overall result instead of a per-step shape.
        print(f"Final feature data shape: {feature_data.shape}")
    else:
        print(f"After {step_name} features: {feature_data.shape}")

## 4. Target Engineering and Data Preparation

In [None]:
# Derive the CCR target column from the engineered feature frame.
print("Computing CCR target...")
target_data = compute_ccr(feature_data)
print(f"Data with CCR: {target_data.shape}")

# Drop the nutrient columns so they cannot leak into the model inputs.
print("\nRemoving nutrient columns to prevent data leakage...")
target_data = remove_nutrient_columns(target_data)
print(f"Final data shape: {target_data.shape}")

# Summary statistics of the target.
print("\nCCR Distribution Analysis:")
ccr_stats = target_data['CCR'].describe()
print(ccr_stats)

# Three side-by-side panels: histogram, box plot, per-participant mean.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
hist_ax, box_ax, participant_ax = axes
ccr_observed = target_data['CCR'].dropna()

hist_ax.hist(ccr_observed, bins=50, alpha=0.7, edgecolor='black')
hist_ax.set_title('CCR Distribution')
hist_ax.set_xlabel('CCR Value')
hist_ax.set_ylabel('Frequency')

box_ax.boxplot(ccr_observed)
box_ax.set_title('CCR Box Plot')
box_ax.set_ylabel('CCR Value')

# Mean CCR per participant, ordered from lowest to highest.
participant_ccr = target_data.groupby('participant_id')['CCR'].mean().sort_values()
participant_ax.bar(range(len(participant_ccr)), participant_ccr.values)
participant_ax.set_title('Average CCR by Participant')
participant_ax.set_xlabel('Participant (sorted by CCR)')
participant_ax.set_ylabel('Average CCR')

# Same light grid on every panel.
for panel in axes:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

ccr_column = target_data['CCR']
print(f"\nValid CCR samples: {ccr_column.notna().sum()}")
print(f"CCR range: {ccr_column.min():.4f} to {ccr_column.max():.4f}")

## 5. Model Training

In [None]:
# Initialize model trainer
model_trainer = ModelTrainer(random_state=42)

print("=== COMPREHENSIVE MODEL TRAINING ===")

# Filter to valid CCR samples first: this filtered frame is what the trainer
# receives, so the counts reported below describe it rather than target_data.
valid_data = target_data.dropna(subset=['CCR']).copy()
if valid_data.empty:
    # Fail here with a clear message instead of deep inside the trainer.
    raise ValueError("No rows with a non-null CCR value; nothing to train on.")

# The frame still contains the target, so it is excluded from the column count.
# Which of the remaining columns the trainer uses as features is decided inside
# train_all_models and is not visible from this notebook.
n_non_target_columns = len(valid_data.columns.drop('CCR'))
print(f"Training models on {valid_data.shape[0]} samples with {n_non_target_columns} non-target columns")
print(f"\nValid samples for training: {valid_data.shape[0]}")

# Train all model types
trained_models = model_trainer.train_all_models(
    df=valid_data,
    target_col='CCR',
    include_time_series=True,
    include_multimodal=True,
    include_ensemble=True
)

print(f"\nTraining completed! Models trained: {len(trained_models)}")
for model_name in trained_models:
    print(f"✓ {model_name}")

## 6. Model Evaluation with Participant-Aware Validation

In [None]:
# Evaluator seeded the same way as the trainer.
evaluator = ModelEvaluator(random_state=42)

print("=== MODEL EVALUATION WITH PARTICIPANT-AWARE VALIDATION ===")

# Everything that is not an identifier, the target, or a nutrient column is
# treated as a feature. Column order is preserved.
exclude_cols = ['participant_id', 'Timestamp', 'CCR', 'Carbs', 'Protein', 'Fat', 'Fiber']
is_feature = ~valid_data.columns.isin(exclude_cols)
feature_cols = valid_data.columns[is_feature].tolist()
print(f"\nUsing {len(feature_cols)} features for evaluation")

# NOTE(review): trained_models were already fit on all of valid_data in the
# training cell. Whether the evaluator refits each model per fold is not
# visible here - confirm, otherwise held-out participants were seen in training.
evaluation_kwargs = dict(
    models=trained_models,
    df=valid_data,
    feature_cols=feature_cols,
    target_col='CCR',
)
evaluation_results = evaluator.evaluate_with_participant_splits(**evaluation_kwargs)

print(f"\nEvaluation completed! Results for {len(evaluation_results)} models")

## 7. Results Analysis and Visualization

In [None]:
# Create results summary table
print("=== MODEL PERFORMANCE SUMMARY ===")
print()

# A metric the evaluator did not report is shown as "nan". Defaulting to 0
# would present an unevaluated model as having a perfect RMSE/MAE.
MISSING_METRIC = float('nan')


def _mean_pm_std(results, metric):
    """Format '<metric>_mean ± <metric>_std' from one model's results dict."""
    mean = results.get(f'{metric}_mean', MISSING_METRIC)
    std = results.get(f'{metric}_std', MISSING_METRIC)
    return f"{mean:.4f} ± {std:.4f}"


def _single_value(results, key):
    """Format a single metric from one model's results dict."""
    return f"{results.get(key, MISSING_METRIC):.4f}"


# One row per evaluated model, in the order the evaluator returned them.
summary_data = [
    {
        'Model': model_name,
        'RMSE': _mean_pm_std(results, 'rmse'),
        'MAE': _mean_pm_std(results, 'mae'),
        'R²': _mean_pm_std(results, 'r2'),
        'CCR RMSE': _single_value(results, 'ccr_rmse_mean'),
        'MAPE': _single_value(results, 'mape_mean'),
    }
    for model_name, results in evaluation_results.items()
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

In [None]:
# Plot helpers come from the project's evaluation module.
from evaluation_updated import ResultsVisualizer

visualizer = ResultsVisualizer()

print("\nGenerating model comparison visualizations...")

# Plot order: RMSE comparison, metrics heatmap, then R² comparison.
error_metric, fit_metric = 'rmse_mean', 'r2_mean'
visualizer.plot_model_comparison(evaluation_results, metric=error_metric)
visualizer.plot_metrics_heatmap(evaluation_results)
visualizer.plot_model_comparison(evaluation_results, metric=fit_metric)

## 8. Best Model Analysis

In [None]:
# Find best model based on RMSE.
# A model is a candidate only if its rmse_mean is strictly below infinity, so
# models with a missing, infinite, or NaN rmse_mean are never selected.
candidate_rmse = {}
for model_name, results in evaluation_results.items():
    rmse = results.get('rmse_mean', float('inf'))
    if rmse < float('inf'):
        candidate_rmse[model_name] = rmse

if candidate_rmse:
    # min() returns the first minimal entry, so ties go to the earlier model.
    best_model_name = min(candidate_rmse, key=candidate_rmse.get)
    best_rmse = candidate_rmse[best_model_name]
else:
    best_model_name = None
    best_rmse = float('inf')

print(f"=== BEST MODEL ANALYSIS ===")
print(f"\nBest Model: {best_model_name}")
print(f"Best RMSE: {best_rmse:.4f}")

if best_model_name is not None:
    best_results = evaluation_results[best_model_name]
    print(f"\nDetailed Results for {best_model_name}:")
    for metric, value in best_results.items():
        # Accept NumPy scalars as well as Python numbers; np.float32 is not a
        # subclass of float and would otherwise be skipped.
        is_numeric = isinstance(value, (int, float, np.number)) and not isinstance(value, bool)
        if is_numeric and 'mean' in metric:
            print(f"  {metric}: {value:.6f}")

# Feature importance analysis (if available)
if best_model_name in trained_models:
    best_model = trained_models[best_model_name]

    # Only some estimators expose feature_importances_.
    importances = getattr(best_model, 'feature_importances_', None)
    if importances is None:
        print(f"\n{best_model_name} does not expose feature_importances_; skipping importance analysis.")
    else:
        importances = np.asarray(importances).ravel()
        # NOTE(review): pairing importances with feature_cols assumes the model
        # was trained on exactly these columns in this order - confirm against
        # ModelTrainer. A length mismatch proves the assumption is wrong.
        if len(importances) != len(feature_cols):
            print(
                f"\nCannot align feature importances for {best_model_name}: "
                f"model reports {len(importances)} values but {len(feature_cols)} "
                f"feature columns were selected."
            )
        else:
            print(f"\nTop 15 Most Important Features for {best_model_name}:")

            feature_importance = pd.DataFrame({
                'feature': feature_cols,
                'importance': importances
            }).sort_values('importance', ascending=False)

            print(feature_importance.head(15).to_string(index=False))

            # Horizontal bars, most important feature at the top.
            top_features = feature_importance.head(20)
            fig, ax = plt.subplots(figsize=(12, 8))
            positions = range(len(top_features))
            ax.barh(positions, top_features['importance'].values)
            ax.set_yticks(positions)
            ax.set_yticklabels(top_features['feature'].values)
            ax.set_xlabel('Feature Importance')
            ax.set_title(f'Top 20 Feature Importances - {best_model_name}')
            ax.invert_yaxis()
            fig.tight_layout()
            plt.show()

## 9. Model Predictions Visualization

In [None]:
# Generate predictions for the best model and inspect its residuals.
# The models were fit on valid_data in the training cell and are applied to the
# same rows here, so these are in-sample predictions, not a held-out test.
# Imported outside the try block so a missing scipy is reported as an import
# error rather than as a prediction failure.
from scipy import stats as scipy_stats

if best_model_name in trained_models:
    print(f"\nGenerating predictions with {best_model_name}...")
    print("Note: predictions are made on the rows used for training, so the residuals are in-sample.")

    # Prepare data
    X = valid_data[feature_cols].fillna(0).values
    y_true = valid_data['CCR'].values

    # Make predictions
    best_model = trained_models[best_model_name]

    # Best-effort: any failure in prediction or plotting is reported, not raised.
    try:
        if hasattr(best_model, 'predict'):
            # Accept any array-like output and flatten (n, 1) results to (n,).
            y_pred = np.asarray(best_model.predict(X)).ravel()

            # Report a wrong number of predictions explicitly; otherwise it
            # surfaces as a broadcasting error in the residual calculation.
            if y_pred.shape != y_true.shape:
                raise ValueError(
                    f"model returned {y_pred.size} predictions for {y_true.size} rows"
                )

            # Plot predictions
            visualizer.plot_prediction_scatter(y_true, y_pred, best_model_name)

            # Residual analysis
            residuals = y_true - y_pred

            fig, axes = plt.subplots(1, 3, figsize=(18, 5))
            scatter_ax, hist_ax, qq_ax = axes

            # Residuals vs predicted
            scatter_ax.scatter(y_pred, residuals, alpha=0.6)
            scatter_ax.axhline(y=0, color='r', linestyle='--')
            scatter_ax.set_xlabel('Predicted CCR')
            scatter_ax.set_ylabel('Residuals')
            scatter_ax.set_title('Residuals vs Predicted')
            scatter_ax.grid(True, alpha=0.3)

            # Residuals histogram
            hist_ax.hist(residuals, bins=50, alpha=0.7, edgecolor='black')
            hist_ax.set_xlabel('Residuals')
            hist_ax.set_ylabel('Frequency')
            hist_ax.set_title('Residuals Distribution')
            hist_ax.grid(True, alpha=0.3)

            # Q-Q plot against a normal distribution
            scipy_stats.probplot(residuals, dist="norm", plot=qq_ax)
            qq_ax.set_title('Q-Q Plot of Residuals')
            qq_ax.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

            print(f"\nResidual Analysis:")
            print(f"Mean residual: {np.mean(residuals):.6f}")
            print(f"Std residual: {np.std(residuals):.6f}")
            print(f"Max absolute residual: {np.max(np.abs(residuals)):.6f}")
        else:
            print(f"{best_model_name} has no predict() method; skipping prediction plots.")

    except Exception as e:
        print(f"Error generating predictions: {e}")

## 10. Statistical Model Comparison

In [None]:
# Statistical comparison between models
from evaluation_updated import ModelComparison

comparator = ModelComparison()

# Rank models
print("=== MODEL RANKINGS ===")
rankings = comparator.rank_models(evaluation_results)
ranking_columns = ['model', 'rmse_mean', 'mae_mean', 'r2_mean', 'avg_rank', 'overall_rank']
print(rankings[ranking_columns].to_string(index=False))

# Statistical comparisons
print("\n=== STATISTICAL COMPARISONS ===")
stat_comparisons = comparator.compare_models_statistical(evaluation_results, metric='rmse')

# The loop variable is deliberately not called `stats`: all cells share one
# namespace, and that name would rebind the scipy.stats module imported by the
# prediction cell.
for comparison, comparison_result in stat_comparisons.items():
    print(f"\n{comparison}:")
    print(f"  p-value: {comparison_result['p_value']:.6f}")
    print(f"  Statistically significant: {comparison_result['significant']}")
    print(f"  Better model: {comparison_result['better_model']}")

## 11. Cross-Validation Analysis

In [None]:
# Analyze cross-validation stability
print("=== CROSS-VALIDATION STABILITY ANALYSIS ===")

# A model counts as stable when std/mean of its fold RMSE is below this value.
# The same constant drives the table, the plot line, and the legend.
STABILITY_THRESHOLD = 0.1
CV_COLUMNS = ['Model', 'RMSE Mean', 'RMSE Std', 'CV Stability', 'Stable']

cv_analysis = []
for model_name, results in evaluation_results.items():
    rmse_mean = results.get('rmse_mean', 0)
    rmse_std = results.get('rmse_std', 0)
    # Coefficient of variation; infinite when the mean is missing, zero or NaN.
    cv_stability = rmse_std / rmse_mean if rmse_mean > 0 else float('inf')

    cv_analysis.append({
        'Model': model_name,
        'RMSE Mean': rmse_mean,
        'RMSE Std': rmse_std,
        'CV Stability': cv_stability,
        'Stable': 'Yes' if cv_stability < STABILITY_THRESHOLD else 'No'
    })

# Passing the columns explicitly keeps sort_values working when no models were
# evaluated; an empty frame built without them has no 'CV Stability' column.
cv_df = pd.DataFrame(cv_analysis, columns=CV_COLUMNS).sort_values('CV Stability')
print(cv_df.to_string(index=False))

# Plot CV stability. Infinite or NaN ratios cannot be drawn as bars, so those
# models stay in the table above but are left out of the chart.
is_plottable = np.isfinite(cv_df['CV Stability'].astype(float))
plot_df = cv_df[is_plottable]
skipped_models = cv_df.loc[~is_plottable, 'Model'].tolist()
if skipped_models:
    print(f"\nNot plotted (undefined CV stability): {', '.join(map(str, skipped_models))}")

if plot_df.empty:
    print("\nNo models with a finite CV stability value; skipping plot.")
else:
    plt.figure(figsize=(12, 6))
    plt.bar(plot_df['Model'], plot_df['CV Stability'])
    plt.axhline(y=STABILITY_THRESHOLD, color='r', linestyle='--',
                label=f'Stability Threshold ({STABILITY_THRESHOLD})')
    plt.xlabel('Models')
    plt.ylabel('CV Stability (Std/Mean)')
    plt.title('Cross-Validation Stability by Model')
    plt.xticks(rotation=45, ha='right')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 12. Save Results and Models

In [None]:
# Create output directories
os.makedirs('../results', exist_ok=True)
os.makedirs('../models', exist_ok=True)

# Save evaluation results
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artefacts from locations you trust.
import pickle

results_path = '../results/model_evaluation_results.pkl'
with open(results_path, 'wb') as f:
    pickle.dump(evaluation_results, f)
print(f"Evaluation results saved to: {results_path}")

# Save trained models. Saving is best-effort per model: one model that cannot
# be pickled must not prevent the others from being saved, but each failure is
# recorded so the final message is accurate.
failed_models = []
for model_name, model in trained_models.items():
    # Replace path separators so the file always lands directly in ../models.
    safe_name = str(model_name).replace('/', '_').replace('\\', '_')
    model_path = f'../models/{safe_name}_model.pkl'
    try:
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
    except Exception as e:
        print(f"Failed to save {model_name}: {e}")
        failed_models.append(model_name)
        # Opening with 'wb' already created or truncated the file; remove the
        # partial artefact so it is not mistaken for a usable model.
        if os.path.exists(model_path):
            os.remove(model_path)
    else:
        print(f"Model {model_name} saved to: {model_path}")

# Save summary results
summary_df.to_csv('../results/model_performance_summary.csv', index=False)
rankings.to_csv('../results/model_rankings.csv', index=False)

if failed_models:
    failed_list = ', '.join(str(name) for name in failed_models)
    print(f"\nResults saved, but {len(failed_models)} model(s) could not be saved: {failed_list}")
else:
    print("\nAll results and models saved successfully!")

## 13. Final Summary and Recommendations

In [None]:
def _print_section(heading, lines):
    """Print a blank line, the section heading, then each line of the section."""
    print(f"\n{heading}")
    for line in lines:
        print(line)


banner = "=" * 80
print(banner)
print("CGMACROS CCR PREDICTION - FINAL SUMMARY")
print(banner)

# Dataset figures are read from the frame and feature list used for evaluation.
_print_section("📊 DATASET SUMMARY:", [
    f"   • Total samples: {valid_data.shape[0]:,}",
    f"   • Features used: {len(feature_cols)}",
    f"   • Participants: {valid_data['participant_id'].nunique()}",
    f"   • CCR range: {valid_data['CCR'].min():.4f} - {valid_data['CCR'].max():.4f}",
])

best_model_lines = [
    f"   • Model: {best_model_name}",
    f"   • RMSE: {best_rmse:.4f}",
]
if best_model_name in evaluation_results:
    # R² and MAE are shown only when the best model has evaluation results.
    best_r2 = evaluation_results[best_model_name].get('r2_mean', 0)
    best_mae = evaluation_results[best_model_name].get('mae_mean', 0)
    best_model_lines.append(f"   • R²: {best_r2:.4f}")
    best_model_lines.append(f"   • MAE: {best_mae:.4f}")
_print_section("🏆 BEST PERFORMING MODEL:", best_model_lines)

_print_section("📈 MODELS TRAINED:", [
    f"   {position}. {model_name}"
    for position, model_name in enumerate(trained_models.keys(), 1)
])

# NOTE(review): the validation and insight lines below are static text. They
# are not derived from evaluation_results or from the evaluator's settings -
# confirm they match the run before sharing.
_print_section("🔬 VALIDATION APPROACH:", [
    "   • Participant-aware cross-validation",
    "   • 5-fold cross-validation",
    "   • No data leakage between participants",
    "   • Comprehensive metrics evaluation",
])

_print_section("💡 KEY INSIGHTS:", [
    "   • Multimodal data fusion shows promise for CCR prediction",
    "   • Glucose patterns are critical predictive features",
    "   • Participant-level variation is significant",
    "   • Temporal features improve prediction accuracy",
])

_print_section("📁 OUTPUTS SAVED:", [
    "   • Model evaluation results: ../results/model_evaluation_results.pkl",
    "   • Performance summary: ../results/model_performance_summary.csv",
    "   • Model rankings: ../results/model_rankings.csv",
    "   • Trained models: ../models/",
])

_print_section("✅ MODELING PIPELINE COMPLETED SUCCESSFULLY!", [])
print(banner)

print(f"\nNotebook completed at: {datetime.now()}")