# CGMacros Exploratory Data Analysis (EDA)

This notebook provides exploratory data analysis for the CGMacros dataset used in the IEEE BHI 2025 Track 2 Challenge.

## Objective
Explore the multimodal CGMacros dataset to understand:
- Data quality and completeness
- Distribution of the target variable, the Carbohydrate Caloric Ratio (CCR)
- Feature relationships and correlations
- Patterns in glucose, activity, demographic, and microbiome data

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add src to path for imports
sys.path.append('../src')

from data_loader import load_cgmacros_data, get_data_summary
from target import compute_ccr, get_ccr_distribution_info

# Plot appearance: start from matplotlib's stock style, then apply seaborn's
# "husl" palette (order matters: the style call comes first)
plt.style.use('default')
sns.set_palette("husl")

# DataFrame rendering: no limit on columns, up to 100 rows
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(_option_name, _option_value)

print("Libraries imported successfully!")

## 1. Data Loading and Initial Inspection

In [None]:
# Load the dataset from the raw-data directory via the project loader
data_dir = "../data/raw"
df = load_cgmacros_data(data_dir)

# Report the size of the loaded table
n_columns = len(df.columns)
print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {n_columns}")

# Kept as the cell's last expression so the notebook renders the preview
df.head()

In [None]:
# Print every entry of the loader's summary on its own line
summary = get_data_summary(df)
print("Data Summary:")
for field in summary:
    print(f"{field}: {summary[field]}")

In [None]:
# How many columns there are of each dtype
print("Data Types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

# The ten columns with the most missing entries (columns without gaps omitted)
print("\nMissing Values:")
null_counts = df.isnull().sum()
missing_data = null_counts[null_counts > 0]
missing_data = missing_data.sort_values(ascending=False)
print(missing_data.head(10))

## 2. Target Variable Analysis (CCR)

In [None]:
# Derive the CCR target with the project helper
df_with_ccr = compute_ccr(df)

print(f"Data shape after CCR computation: {df_with_ccr.shape}")

if 'ccr' not in df_with_ccr.columns:
    print("CCR column not found - check nutrient data availability")
else:
    ccr_info = get_ccr_distribution_info(df_with_ccr)
    print("\nCCR Distribution Info:")
    for stat_name, stat_value in ccr_info.items():
        # The 'percentiles' entry is deliberately left out of this listing
        if stat_name == 'percentiles':
            continue
        print(f"{stat_name}: {stat_value}")

In [None]:
# Four-panel overview of CCR: histogram, box plot, normal Q-Q plot, summary text
if 'ccr' not in df_with_ccr.columns:
    print("CCR data not available for visualization")
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Carbohydrate Caloric Ratio (CCR) Analysis', fontsize=16)
    (hist_ax, box_ax), (qq_ax, text_ax) = axes

    # Missing CCR values are excluded from every panel
    ccr_values = df_with_ccr['ccr'].dropna()

    # Top-left: histogram
    hist_ax.hist(ccr_values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    hist_ax.set_title('CCR Distribution')
    hist_ax.set_xlabel('CCR Value')
    hist_ax.set_ylabel('Frequency')

    # Top-right: box plot
    box_ax.boxplot(ccr_values)
    box_ax.set_title('CCR Box Plot')
    box_ax.set_ylabel('CCR Value')

    # Bottom-left: Q-Q plot against a normal distribution
    from scipy import stats
    stats.probplot(ccr_values, dist="norm", plot=qq_ax)
    qq_ax.set_title('CCR Q-Q Plot')

    # Same light grid on the three data panels
    for data_ax in (hist_ax, box_ax, qq_ax):
        data_ax.grid(True, alpha=0.3)

    # Bottom-right: summary statistics rendered as a text box
    stats_text = "\n".join([
        "CCR Statistics:",
        f"Count: {len(ccr_values)}",
        f"Mean: {ccr_values.mean():.4f}",
        f"Std: {ccr_values.std():.4f}",
        f"Min: {ccr_values.min():.4f}",
        f"Max: {ccr_values.max():.4f}",
        f"Median: {ccr_values.median():.4f}",
        f"Skewness: {ccr_values.skew():.4f}",
    ])
    text_box_style = dict(boxstyle="round", facecolor='wheat', alpha=0.5)
    text_ax.text(0.1, 0.9, stats_text, transform=text_ax.transAxes,
                 fontsize=10, verticalalignment='top', bbox=text_box_style)
    text_ax.set_title('CCR Summary Statistics')
    text_ax.axis('off')

    plt.tight_layout()
    plt.show()

## 3. Feature Analysis by Category

In [None]:
# Categorize columns by data type
def categorize_columns(df):
    """Group the columns of ``df`` into coarse feature categories by name.

    A column belongs to a category when one of the words in its name starts
    with one of that category's keywords (case-insensitive). Names are split
    into words at non-alphanumeric characters and at lower-to-upper camelCase
    boundaries, so ``avg_hr``, ``meanHR`` and ``Heart Rate`` all count as
    activity columns. Matching at word starts, rather than anywhere in the
    name, avoids false hits such as ``'age'`` inside ``average_glucose`` or
    ``Image path`` and ``'hr'`` inside ``threshold``.

    Args:
        df: Any object exposing an iterable ``columns`` attribute (for
            example a pandas DataFrame). Labels need not be strings; they
            are converted with ``str`` before matching.

    Returns:
        dict mapping each category name (``glucose``, ``activity``,
        ``demographic``, ``microbiome``, ``gut_health``, ``temporal``,
        ``participant``) to the list of matching column labels in their
        original order. A column may appear in several categories or in none.
    """
    import re  # local import keeps this notebook cell self-contained

    # Tuples so they can be passed directly to str.startswith
    keywords = {
        'glucose': ('glucose', 'cgm'),
        'activity': ('step', 'heart', 'hr', 'activity', 'calorie'),
        'demographic': ('age', 'bmi', 'gender', 'weight', 'height'),
        'microbiome': ('microbe', 'bacteria', 'otu'),
        'gut_health': ('gut',),
        'temporal': ('time', 'date', 'timestamp'),
        'participant': ('participant',),
    }

    def _words(label):
        # Break camelCase first (needs the original casing), then lowercase
        # and split on anything that is not a letter or digit.
        spaced = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', str(label))
        return [word for word in re.split(r'[^a-z0-9]+', spaced.lower()) if word]

    # Tokenize each column once instead of once per category
    tokenized = [(col, _words(col)) for col in df.columns]

    categories = {
        category: [
            col for col, words in tokenized
            if any(word.startswith(terms) for word in words)
        ]
        for category, terms in keywords.items()
    }
    return categories

column_categories = categorize_columns(df_with_ccr)

# Report the size of each category, with up to three example columns
print("Column Categories:")
for group_name, group_cols in column_categories.items():
    print(f"{group_name}: {len(group_cols)} columns")
    if len(group_cols) > 0:
        print(f"  Examples: {group_cols[:3]}")
    print()

In [None]:
# Collect the numeric columns and report how many there are
numeric_cols = df_with_ccr.select_dtypes(include=[np.number]).columns
n_numeric = len(numeric_cols)
print(f"Total numeric columns: {n_numeric}")

# Descriptive statistics, only when there is at least one numeric column
if n_numeric:
    numeric_summary = df_with_ccr[numeric_cols].describe()
    print("\nNumeric Features Summary:")
    print(numeric_summary)

## 4. Correlation Analysis

In [None]:
# Rank numeric features by absolute correlation with CCR and plot the top 15
can_correlate = 'ccr' in df_with_ccr.columns and len(numeric_cols) > 1
if not can_correlate:
    print("CCR or numeric features not available for correlation analysis")
else:
    numeric_corr = df_with_ccr[numeric_cols].corr()
    abs_corr_with_ccr = numeric_corr['ccr'].abs()

    # Strongest first; CCR's own entry and undefined correlations are dropped
    ccr_correlations = (
        abs_corr_with_ccr
        .sort_values(ascending=False)
        .drop('ccr', errors='ignore')
        .dropna()
    )

    print("Top 20 Features Correlated with CCR:")
    print(ccr_correlations.head(20))

    if len(ccr_correlations) >= 1:
        plt.figure(figsize=(12, 8))
        top_corr = ccr_correlations.head(15)
        bar_positions = range(len(top_corr))
        plt.barh(bar_positions, top_corr.values)
        plt.yticks(bar_positions, top_corr.index)
        plt.xlabel('Absolute Correlation with CCR')
        plt.title('Top 15 Features Most Correlated with CCR')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

In [None]:
# Heatmap of pairwise correlations among the features most correlated with CCR
if 'ccr' in df_with_ccr.columns and len(numeric_cols) > 1:
    # Up to 20 top-ranked features, with CCR itself appended last
    top_features = ccr_correlations.index[:20].tolist()
    top_features.append('ccr')

    if len(top_features) >= 2:
        plt.figure(figsize=(14, 12))
        correlation_matrix = df_with_ccr[top_features].corr()

        # Hide the diagonal and upper triangle so each pair is drawn once
        mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

        heatmap_options = dict(
            mask=mask, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, cbar_kws={"shrink": .5},
        )
        sns.heatmap(correlation_matrix, **heatmap_options)
        plt.title('Feature Correlation Matrix (Top CCR-Correlated Features)', fontsize=14)
        plt.tight_layout()
        plt.show()

## 5. Data Quality Assessment

In [None]:
# Bar chart of the 20 columns with the highest share of missing values
missing_data = df_with_ccr.isnull().sum()
missing_data = missing_data[missing_data > 0].sort_values(ascending=False)

if len(missing_data) == 0:
    print("No missing data found!")
else:
    plt.figure(figsize=(12, 8))
    missing_pct = missing_data / len(df_with_ccr) * 100

    # Already sorted descending, so head() gives the worst 20
    top_missing = missing_pct.head(20)
    bar_positions = range(len(top_missing))

    plt.barh(bar_positions, top_missing.values, color='coral')
    plt.yticks(bar_positions, top_missing.index)
    plt.xlabel('Percentage of Missing Values')
    plt.title('Missing Data by Feature (Top 20)')
    plt.grid(True, alpha=0.3, axis='x')

    # Write each percentage just to the right of its bar
    for row, pct in enumerate(top_missing.values):
        plt.text(pct + 0.5, row, f'{pct:.1f}%', va='center')

    plt.tight_layout()
    plt.show()

    n_mostly_missing = (missing_pct > 50).sum()
    print(f"Total columns with missing data: {len(missing_data)}")
    print(f"Columns with >50% missing data: {n_mostly_missing}")

## 6. Participant-Level Analysis

In [None]:
# Per-participant record counts and CCR behaviour
if 'participant_id' not in df_with_ccr.columns:
    print("No participant ID column found")
else:
    participant_counts = df_with_ccr['participant_id'].value_counts()
    n_participants = df_with_ccr['participant_id'].nunique()
    n_records = len(df_with_ccr)

    print(f"Total participants: {n_participants}")
    print(f"Total records: {n_records}")
    print(f"Average records per participant: {n_records / n_participants:.2f}")

    # Histogram of how many records each participant contributes
    plt.figure(figsize=(12, 6))
    plt.hist(participant_counts.values, bins=20, alpha=0.7,
             color='lightgreen', edgecolor='black')
    plt.xlabel('Number of Records per Participant')
    plt.ylabel('Number of Participants')
    plt.title('Distribution of Records per Participant')
    plt.grid(True, alpha=0.3)
    plt.show()

    # Per-participant CCR mean and spread, when the target column exists
    if 'ccr' in df_with_ccr.columns:
        ccr_by_participant = df_with_ccr.groupby('participant_id')['ccr']
        participant_ccr = ccr_by_participant.agg(['mean', 'std', 'count'])
        # Rows with any missing statistic are removed before plotting
        participant_ccr = participant_ccr.reset_index().dropna()

        if len(participant_ccr) > 0:
            fig, axes = plt.subplots(1, 2, figsize=(15, 6))
            mean_ax, spread_ax = axes

            # Left: mean CCR for each participant
            mean_ax.scatter(participant_ccr['participant_id'],
                            participant_ccr['mean'], alpha=0.7)
            mean_ax.set_xlabel('Participant ID')
            mean_ax.set_ylabel('Mean CCR')
            mean_ax.set_title('Mean CCR by Participant')
            mean_ax.grid(True, alpha=0.3)

            # Right: within-participant standard deviation against the mean
            spread_ax.scatter(participant_ccr['mean'],
                              participant_ccr['std'], alpha=0.7)
            spread_ax.set_xlabel('Mean CCR')
            spread_ax.set_ylabel('CCR Standard Deviation')
            spread_ax.set_title('CCR Variability vs Mean CCR')
            spread_ax.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

## 7. Summary and Next Steps

### Key Findings from EDA:

1. **Data Quality**: [To be filled based on actual data]
2. **Target Variable**: [CCR distribution characteristics]
3. **Feature Relationships**: [Key correlations with CCR]
4. **Data Completeness**: [Missing data patterns]

### Recommended Next Steps:

1. **Feature Engineering**: Create domain-specific features from glucose, activity, and microbiome data
2. **Data Preprocessing**: Handle missing values and outliers
3. **Model Development**: Start with baseline models and progress to advanced methods
4. **Validation Strategy**: Implement participant-aware cross-validation

### Outputs Produced (displayed inline in this notebook):
- Data summary statistics
- Feature correlation analysis
- Missing data assessment
- Participant-level insights