# Memory-Optimized CGMacros CCR Prediction Pipeline

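This notebook runs the complete CCR (Carbohydrate Caloric Ratio) prediction pipeline with memory optimizations (chunked loading, dtype optimization, and adaptive feature reduction) so that the full dataset (687,580 records) can be processed efficiently on machines with limited RAM.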

## Phase 0: Environment Setup and Memory Monitoring

In [1]:
import os
import sys
import logging
import pandas as pd
import numpy as np
import psutil
import gc
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Memory monitoring function
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    # bytes -> KB -> MB
    return rss_bytes / 1024 / 1024

# Check system resources and choose a memory strategy.
# Thresholds are in GB of *available* (not total) RAM.
LOW_MEMORY_THRESHOLD_GB = 2.0
MODERATE_MEMORY_THRESHOLD_GB = 4.0

# Read virtual memory once so total and available come from the same snapshot.
system_memory = psutil.virtual_memory()
available_memory_gb = system_memory.available / 1024**3
total_memory_gb = system_memory.total / 1024**3

print("System Memory Analysis:")
print(f"  Total RAM: {total_memory_gb:.1f} GB")
print(f"  Available RAM: {available_memory_gb:.1f} GB")
print(f"  Initial process memory: {get_memory_usage():.1f} MB")

# Set memory management strategy based on available RAM.
# MEMORY_STRATEGY is read by later cells to size chunks and feature counts.
if available_memory_gb < LOW_MEMORY_THRESHOLD_GB:
    print("⚠️ LOW MEMORY SYSTEM: Using ultra-conservative settings")
    MEMORY_STRATEGY = "ultra_conservative"
elif available_memory_gb < MODERATE_MEMORY_THRESHOLD_GB:
    print("📊 MODERATE MEMORY SYSTEM: Using conservative settings")
    MEMORY_STRATEGY = "conservative"
else:
    print("🚀 HIGH MEMORY SYSTEM: Using standard settings")
    MEMORY_STRATEGY = "standard"

# Add src directory to path. Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✅ Environment setup complete with memory strategy:", MEMORY_STRATEGY)

System Memory Analysis:
  Total RAM: 7.7 GB
  Available RAM: 1.3 GB
  Initial process memory: 115.9 MB
⚠️ LOW MEMORY SYSTEM: Using ultra-conservative settings
✅ Environment setup complete with memory strategy: ultra_conservative


## Phase 1: Memory-Optimized Data Loading

In [2]:
from data_loader_updated import DataLoader

# Initialize data loader
data_loader = DataLoader(data_dir='../data/raw')

print(f"Memory before data loading: {get_memory_usage():.1f} MB")

# Process memory (MB) above which a warning is shown after loading.
HIGH_MEMORY_AFTER_LOAD_MB = 300

# Files per chunk (and the message to show) for each memory strategy.
# Any strategy not listed here falls back to the standard chunk size.
CHUNK_SETTINGS = {
    "ultra_conservative": (2, "Using ultra-conservative chunk size: 2 files at a time"),
    "conservative": (3, "Using conservative chunk size: 3 files at a time"),
}
chunk_size, chunk_message = CHUNK_SETTINGS.get(
    MEMORY_STRATEGY, (5, "Using standard chunk size: 5 files at a time")
)
print(chunk_message)

# Load CGMacros data with adaptive chunked processing
print("Loading CGMacros data with adaptive chunked processing...")
cgmacros_data = data_loader.load_cgmacros_data(chunk_size=chunk_size)

print(f"Memory after CGMacros loading: {get_memory_usage():.1f} MB")
print(f"CGMacros data shape: {cgmacros_data.shape}")
print(f"CGMacros data memory usage: {cgmacros_data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Memory safety check
current_memory = get_memory_usage()
if current_memory > HIGH_MEMORY_AFTER_LOAD_MB:
    print(f"⚠️ WARNING: High memory usage after loading ({current_memory:.1f} MB)")
    print("Consider restarting kernel if issues persist")

# Display data info (abbreviated for low memory systems)
if MEMORY_STRATEGY != "ultra_conservative":
    print("\nCGMacros data overview:")
    # DataFrame.info() writes its report to stdout and returns None, so it
    # must not be wrapped in print() (that would emit a stray "None" line).
    cgmacros_data.info(memory_usage='deep')
else:
    print(f"\nCGMacros data loaded: {len(cgmacros_data)} records, {len(cgmacros_data.columns)} columns")

print("✅ Memory-optimized data loading complete")

2025-10-02 13:11:08,939 - INFO - Loading CGMacros participant files with memory optimization...
2025-10-02 13:11:08,939 - INFO - Found 45 participant files. Processing in chunks of 2...
2025-10-02 13:11:08,946 - INFO - Processing chunk 1/23 (files 1-2)
2025-10-02 13:11:08,939 - INFO - Found 45 participant files. Processing in chunks of 2...
2025-10-02 13:11:08,946 - INFO - Processing chunk 1/23 (files 1-2)
2025-10-02 13:11:09,111 - INFO - Loaded 14730 records for participant 1
2025-10-02 13:11:09,111 - INFO - Loaded 14730 records for participant 1


Memory before data loading: 116.0 MB
Using ultra-conservative chunk size: 2 files at a time
Loading CGMacros data with adaptive chunked processing...


2025-10-02 13:11:09,376 - INFO - Loaded 17025 records for participant 2
2025-10-02 13:11:09,407 - INFO - Chunk 1 combined: 31755 records
2025-10-02 13:11:09,407 - INFO - Chunk 1 combined: 31755 records
2025-10-02 13:11:09,579 - INFO - Processing chunk 2/23 (files 3-4)
2025-10-02 13:11:09,579 - INFO - Processing chunk 2/23 (files 3-4)
2025-10-02 13:11:09,817 - INFO - Loaded 14565 records for participant 3
2025-10-02 13:11:09,817 - INFO - Loaded 14565 records for participant 3
2025-10-02 13:11:10,011 - INFO - Loaded 14275 records for participant 4
2025-10-02 13:11:10,011 - INFO - Loaded 14275 records for participant 4
2025-10-02 13:11:10,021 - INFO - Chunk 2 combined: 28840 records
2025-10-02 13:11:10,102 - INFO - Processing chunk 3/23 (files 5-6)
2025-10-02 13:11:10,021 - INFO - Chunk 2 combined: 28840 records
2025-10-02 13:11:10,102 - INFO - Processing chunk 3/23 (files 5-6)
2025-10-02 13:11:10,233 - INFO - Loaded 14460 records for participant 5
2025-10-02 13:11:10,233 - INFO - Loaded 

Memory after CGMacros loading: 174.6 MB
CGMacros data shape: (687580, 21)
CGMacros data memory usage: 53.5 MB

CGMacros data loaded: 687580 records, 21 columns
✅ Memory-optimized data loading complete


## Phase 1.5: Adaptive Data Merging with Memory Management

In [3]:
print(f"Memory before merging: {get_memory_usage():.1f} MB")

# Process-memory limits (MB) used by the checkpoints in this cell.
GUT_HEALTH_MEMORY_LIMIT_MB = 1000   # skip gut health data at or above this
WARNING_MEMORY_MB = 800             # "monitor closely" level
CRITICAL_MEMORY_MB = 1200           # "restart the kernel" level

# Microbiome feature count (and the message to show) per memory strategy.
# Any strategy not listed here falls back to the standard setting.
MICROBIOME_FEATURE_SETTINGS = {
    "ultra_conservative": (20, "Ultra-conservative mode: Using only 20 top microbiome features"),
    "conservative": (50, "Conservative mode: Using 50 top microbiome features"),
}
max_microbiome_features, mode_message = MICROBIOME_FEATURE_SETTINGS.get(
    MEMORY_STRATEGY, (100, "Standard mode: Using 100 top microbiome features")
)
print(mode_message)

print("Applying adaptive memory optimization for merging...")


def merge_participant_table(base_df, extra_df, success_message):
    """Left-join a per-participant table onto ``base_df`` on ``participant_id``.

    Args:
        base_df: Frame whose rows are all kept (left side of the join).
        extra_df: Table to attach. When it is empty nothing is merged.
        success_message: Text printed after a merge has been performed.

    Returns:
        The merged frame, or ``base_df`` itself (same object, no copy) when
        ``extra_df`` is empty.
    """
    if extra_df.empty:
        return base_df

    rows_before = len(base_df)
    combined = base_df.merge(extra_df, on='participant_id', how='left')
    # A left join only changes the row count when the right table repeats a
    # key; surface that instead of silently duplicating records.
    if len(combined) != rows_before:
        print(f"⚠️ WARNING: merge changed row count from {rows_before} to {len(combined)}; "
              "check for duplicate participant_id values")
    print(success_message)
    return combined


# Load demographics first (small dataset).
# If it is empty, merged_data simply aliases cgmacros_data: the original name
# is deleted at the end of this cell, so a full copy would only waste memory.
demographics_df = data_loader.load_demographics()
merged_data = merge_participant_table(cgmacros_data, demographics_df, "✅ Merged demographics data")
del demographics_df
gc.collect()

# Memory checkpoint
memory_after_demographics = get_memory_usage()
print(f"Memory after demographics: {memory_after_demographics:.1f} MB")

# Load microbiome with adaptive feature reduction
print(f"Loading microbiome with {max_microbiome_features} features...")
microbiome_df = data_loader.load_microbiome(max_features=max_microbiome_features)
merged_data = merge_participant_table(
    merged_data, microbiome_df, f"✅ Merged microbiome data ({max_microbiome_features} features)"
)
del microbiome_df
gc.collect()

# Memory checkpoint
memory_after_microbiome = get_memory_usage()
print(f"Memory after microbiome: {memory_after_microbiome:.1f} MB")

# Only load gut health if memory allows
if memory_after_microbiome < GUT_HEALTH_MEMORY_LIMIT_MB:
    gut_health_df = data_loader.load_gut_health()
    merged_data = merge_participant_table(merged_data, gut_health_df, "✅ Merged gut health data")
    del gut_health_df
    gc.collect()
else:
    print("⚠️ Skipping gut health data due to memory constraints")

# Immediate dtype optimization
merged_data = data_loader._optimize_dtypes(merged_data)
gc.collect()

print(f"Memory after merging: {get_memory_usage():.1f} MB")
print(f"Merged data shape: {merged_data.shape}")
print(f"Merged data memory usage: {merged_data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Drop the original name so the pre-merge frame can be reclaimed
del cgmacros_data
gc.collect()

final_memory = get_memory_usage()
print(f"Memory after cleanup: {final_memory:.1f} MB")

# Safety check
if final_memory > CRITICAL_MEMORY_MB:
    print("❌ CRITICAL: Memory usage too high! Consider restarting kernel.")
    print("Recommendation: Use ultra_conservative mode or increase system RAM")
elif final_memory > WARNING_MEMORY_MB:
    print("⚠️ WARNING: High memory usage. Monitoring closely...")
else:
    print("✅ Memory usage is within safe limits")

print(f"Merged data columns: {len(merged_data.columns)}")
print("✅ Adaptive memory-optimized data merging complete")

2025-10-02 13:12:42,366 - INFO - Loaded demographics for 45 participants


Memory before merging: 173.2 MB
Ultra-conservative mode: Using only 20 top microbiome features
Applying adaptive memory optimization for merging...
✅ Merged demographics data
Memory after demographics: 278.9 MB
Loading microbiome with 20 features...
✅ Merged demographics data
Memory after demographics: 278.9 MB
Loading microbiome with 20 features...


2025-10-02 13:12:42,877 - INFO - Reducing microbiome features from 1979 to 20 most prevalent
2025-10-02 13:12:42,908 - INFO - Loaded microbiome data for 45 participants with 20 microbial features
2025-10-02 13:12:42,908 - INFO - Loaded microbiome data for 45 participants with 20 microbial features
2025-10-02 13:12:43,929 - INFO - Loaded gut health data for 47 participants with 22 health metrics
2025-10-02 13:12:43,929 - INFO - Loaded gut health data for 47 participants with 22 health metrics


✅ Merged microbiome data (20 features)
Memory after microbiome: 335.6 MB
✅ Merged gut health data
✅ Merged gut health data
Memory after merging: 371.2 MB
Merged data shape: (687580, 86)
Merged data memory usage: 196.5 MB
Memory after cleanup: 318.7 MB
✅ Memory usage is within safe limits
Merged data columns: 86
✅ Adaptive memory-optimized data merging complete
Memory after merging: 371.2 MB
Merged data shape: (687580, 86)
Merged data memory usage: 196.5 MB
Memory after cleanup: 318.7 MB
✅ Memory usage is within safe limits
Merged data columns: 86
✅ Adaptive memory-optimized data merging complete


## Phase 2: Memory-Optimized Feature Engineering

In [None]:
from feature_engineering_updated import FeatureEngineer
import importlib
import sys

print(f"Memory before feature engineering: {get_memory_usage():.1f} MB")

# Above 600 MB of process memory, later code switches to the minimal
# feature set instead of the full FeatureEngineer pipeline.
current_memory = get_memory_usage()
minimal_features = current_memory > 600
if minimal_features:
    print(f"⚠️ High memory usage detected ({current_memory:.1f} MB). Using minimal feature engineering.")

# If the module is already loaded, reload it so edits made on disk are picked
# up without restarting the kernel, then rebind the class to the fresh version.
_FE_MODULE_NAME = 'feature_engineering_updated'
if _FE_MODULE_NAME in sys.modules:
    importlib.reload(sys.modules[_FE_MODULE_NAME])
    from feature_engineering_updated import FeatureEngineer
# Fix categorical columns that might cause issues
def fix_categorical_columns(df, categorical_fixes=None):
    """Register placeholder categories on selected categorical columns.

    For every listed column that exists in ``df`` and has ``category`` dtype,
    the placeholder value is added to the column's category set if it is not
    already there. Missing values are NOT filled here; the function only makes
    the placeholder a legal value for a later fill.

    Args:
        df: Input frame. It is not modified; a copy is returned.
        categorical_fixes: Optional mapping of column name -> placeholder
            category. Defaults to the placeholders for 'Meal Type', 'Gender'
            and 'Image path'.

    Returns:
        A copy of ``df`` with the placeholder categories registered.
    """
    if categorical_fixes is None:
        categorical_fixes = {
            'Meal Type': 'No Meal',
            'Gender': 'Unknown',
            'Image path': 'No Image',
        }

    df_fixed = df.copy()

    for col, fill_value in categorical_fixes.items():
        # Columns that are absent or not categorical are left untouched.
        if col not in df_fixed.columns or df_fixed[col].dtype.name != 'category':
            continue
        # add_categories raises if the category already exists, so check first.
        if fill_value not in df_fixed[col].cat.categories:
            df_fixed[col] = df_fixed[col].cat.add_categories([fill_value])

    return df_fixed

print("Fixing categorical columns...")
merged_data_fixed = fix_categorical_columns(merged_data)

# Initialize feature engineer with memory optimization
feature_engineer = FeatureEngineer(memory_efficient=True)

if minimal_features:
    print("Applying MINIMAL feature engineering to preserve memory...")
    # merged_data_fixed is already a private copy (fix_categorical_columns
    # copies its input) and is released below, so extend it directly. Taking
    # another full copy here would raise peak memory on exactly the path that
    # is chosen because memory is already high.
    featured_data = merged_data_fixed

    # Add only basic glucose features (most important for CCR prediction)
    if 'Libre GL' in featured_data.columns:
        # Build the grouping once and reuse it for both statistics.
        glucose_by_participant = featured_data.groupby('participant_id')['Libre GL']
        featured_data['glucose_mean'] = glucose_by_participant.transform('mean')
        featured_data['glucose_std'] = glucose_by_participant.transform('std')

    # Add temporal features (lightweight)
    if 'Timestamp' in featured_data.columns:
        featured_data['hour'] = featured_data['Timestamp'].dt.hour
        featured_data['day_of_week'] = featured_data['Timestamp'].dt.dayofweek

    print("Applied minimal feature engineering")

else:
    print("Applying full memory-optimized feature engineering...")
    featured_data = feature_engineer.engineer_features(merged_data_fixed)

print(f"Memory after feature engineering: {get_memory_usage():.1f} MB")
print(f"Featured data shape: {featured_data.shape}")
print(f"Featured data memory usage: {featured_data.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Drop the input names; featured_data keeps whatever it still references.
del merged_data, merged_data_fixed
gc.collect()

print(f"Memory after cleanup: {get_memory_usage():.1f} MB")

# Show feature engineering results
print(f"\nTotal features: {len(featured_data.columns)}")
if not minimal_features:
    # Heuristic: engineered columns are recognised by name fragment only, so
    # pre-existing columns containing these fragments are counted as well.
    new_features = [col for col in featured_data.columns if any(x in col for x in ['_mean', '_std', '_div', '_richness', '_zone'])]
    print(f"Engineered features: {len(new_features)}")
    print("Sample engineered features:", new_features[:10])

print("✅ Memory-optimized feature engineering complete")

## Phase 3: Target Variable Computation

In [None]:
from target_updated import compute_ccr, remove_nutrient_columns, validate_ccr

print(f"Memory before target computation: {get_memory_usage():.1f} MB")

# Compute CCR target variable
print("Computing CCR (Carbohydrate Caloric Ratio) target variable...")
target_data = compute_ccr(featured_data)

# Validate CCR computation
is_valid, validation_msg = validate_ccr(target_data)
print(f"CCR validation: {validation_msg}")

# Stop here when validation fails; nothing below is meaningful without a valid target.
if not is_valid:
    print("❌ CCR validation failed. Cannot proceed with modeling.")
    raise ValueError(validation_msg)

# Remove nutrient columns to prevent data leakage
target_data = remove_nutrient_columns(target_data)

print(f"Memory after target computation: {get_memory_usage():.1f} MB")
print(f"Target data shape: {target_data.shape}")

# Clear featured_data to free memory
del featured_data
gc.collect()

print(f"Memory after cleanup: {get_memory_usage():.1f} MB")

# Display CCR statistics
ccr_stats = target_data['CCR'].describe()
print("\nCCR target variable statistics:")
print(ccr_stats)

# Count meal records (CCR > 0) from the boolean mask instead of materialising
# a filtered DataFrame that would stay in memory for the rest of the run.
meal_record_count = int((target_data['CCR'] > 0).sum())
total_record_count = len(target_data)
# Guard the percentage against an empty frame.
meal_record_pct = meal_record_count / total_record_count * 100 if total_record_count else 0.0
print(f"\nMeal records with valid CCR: {meal_record_count} out of {total_record_count}")
print(f"Percentage of meal records: {meal_record_pct:.1f}%")

print("✅ Target variable computation complete")

## Phase 4: Data Preparation for Modeling

In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

print(f"Memory before data preparation: {get_memory_usage():.1f} MB")

# Prepare modeling dataset - use ALL meal records for training
modeling_data = target_data[target_data['CCR'] > 0].copy()
print(f"Using {len(modeling_data)} meal records for modeling (100% of available data)")

# Separate features and target.
# Only numeric/boolean columns are kept: fillna(0) raises on category columns
# that lack a 0 category, and the regressors below cannot fit on
# object/category/datetime columns.
candidate_columns = [col for col in modeling_data.columns if col not in ['CCR', 'participant_id', 'Timestamp']]
X = modeling_data[candidate_columns].select_dtypes(include=['number', 'bool'])
feature_columns = list(X.columns)
kept_columns = set(feature_columns)
dropped_columns = [col for col in candidate_columns if col not in kept_columns]
if dropped_columns:
    print(f"Dropping {len(dropped_columns)} non-numeric columns (first 10): {dropped_columns[:10]}")
y = modeling_data['CCR']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Handle missing values
print("Handling missing values...")
missing_counts = X.isnull().sum()
cols_with_missing = missing_counts[missing_counts > 0]
print(f"Columns with missing values: {len(cols_with_missing)}")
X_filled = X.fillna(0)  # Simple imputation for now

# Split data - use larger training set since we have full dataset now
print("Splitting data into train/test sets (80/20)...")
# Many features are constant per participant, so a random row-level split
# would put the same participant in both sets and inflate the test scores.
# Split by participant whenever at least two participants are available.
can_group = (
    'participant_id' in modeling_data.columns
    and modeling_data['participant_id'].nunique() >= 2
)
if can_group:
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(
        splitter.split(X_filled, y, groups=modeling_data['participant_id'].to_numpy())
    )
    X_train, X_test = X_filled.iloc[train_idx], X_filled.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    print("Split is grouped by participant_id (about 20% of participants held out for testing)")
else:
    # Fallback: no usable grouping column, use a plain random split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_filled, y, test_size=0.2, random_state=42
    )

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

print(f"Memory after data preparation: {get_memory_usage():.1f} MB")

# Clear unnecessary data
del target_data, modeling_data, X, y, X_filled
gc.collect()

print(f"Memory after cleanup: {get_memory_usage():.1f} MB")
print("✅ Data preparation for modeling complete")

## Phase 5: Baseline Model Training with Full Dataset

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

print(f"Memory before model training: {get_memory_usage():.1f} MB")


def _score_predictions(y_true, y_hat):
    """Return ``(r2, rmse, mae)`` for one set of predictions."""
    r2 = r2_score(y_true, y_hat)
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    mae = mean_absolute_error(y_true, y_hat)
    return r2, rmse, mae


# Baseline candidates, keyed by the display name used in the report
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
}

# Fit every candidate and collect train/test metrics per model name
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_r2, train_rmse, train_mae = _score_predictions(y_train, y_pred_train)
    test_r2, test_rmse, test_mae = _score_predictions(y_test, y_pred_test)

    # Key order determines the column order of the summary table below.
    results[name] = dict(
        train_r2=train_r2,
        test_r2=test_r2,
        train_rmse=train_rmse,
        test_rmse=test_rmse,
        train_mae=train_mae,
        test_mae=test_mae,
    )

    print(
        f"{name} Results:",
        f"  Train R²: {train_r2:.4f}",
        f"  Test R²:  {test_r2:.4f}",
        f"  Test RMSE: {test_rmse:.4f}",
        f"  Test MAE:  {test_mae:.4f}",
        sep="\n",
    )

print(f"\nMemory after model training: {get_memory_usage():.1f} MB")

# Summary table: one row per model, one column per metric
banner = "=" * 60
print("\n" + banner)
print("BASELINE MODEL RESULTS WITH FULL DATASET")
print(banner)
results_df = pd.DataFrame(results).T
print(results_df.round(4))

# Name of the model with the highest test R²
best_model = max(results, key=lambda model_name: results[model_name]['test_r2'])
print(f"\nBest performing model: {best_model}")
print(f"Best test R²: {results[best_model]['test_r2']:.4f}")

print("✅ Baseline model training complete with full dataset")

## Phase 6: Memory Usage Summary and Optimization Results

In [None]:
section_rule = "=" * 60

print("\n" + section_rule)
print("MEMORY OPTIMIZATION RESULTS")
print(section_rule)

# Process memory is sampled separately for the MB line and the GB line.
print(f"Final memory usage: {get_memory_usage():.1f} MB")
print(f"Available system memory: {psutil.virtual_memory().available / 1024**3:.1f} GB")
print(f"Memory utilization: {get_memory_usage() / 1024:.2f} GB")

print("\n✅ MEMORY OPTIMIZATION SUCCESS!")
for summary_line in (
    "   - Processed complete dataset without memory errors",
    f"   - Used all {len(X_train) + len(X_test)} meal records for modeling",
    "   - Achieved reasonable model performance with full data",
    "   - Memory usage kept under control with chunking and optimization",
):
    print(summary_line)

print("\n" + section_rule)
print("NEXT STEPS")
print(section_rule)
for next_step in (
    "1. Advanced model training (XGBoost, LightGBM) with hyperparameter tuning",
    "2. Ensemble methods for improved performance",
    "3. Feature importance analysis",
    "4. Cross-validation for robust evaluation",
    "5. Final model selection and deployment preparation",
):
    print(next_step)