In [22]:
import pandas as pd
import numpy as np

np.random.seed(42)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# -----------------------------
# INFANT DATA (0‚Äì1 yr)
# -----------------------------
def generate_infant_data(n=10000):
    is_existing = np.random.choice([1,0], size=n, p=[0.6,0.4])  # int, 1=existing, 0=new, default=0, helps handle missing trends
    age_days = np.random.randint(30, 366, size=n)                # int, 0‚Äì365, age in days, default=0
    sex = np.random.choice([0,1], size=n)                        # category, 0=Male, 1=Female, default=Male
    feeding_type = np.random.choice([0,1,2], size=n, p=[0.4,0.2,0.4])    # category, 0=Breastmilk,1=Formula,2=Mixed, default=Breastmilk
    feeding_frequency = np.random.randint(4,9,size=n)                     # int, 1‚Äì10 feeds/day, default=6
    vaccination_status = np.random.choice([0,1,2], size=n, p=[0.7,0.2,0.1]) # category, 0=Up-to-date,1=Partial,2=Delayed
    sleep_hours = np.clip(np.random.normal(14,1.5,size=n),8,20)          # float, 8‚Äì20 hours/day, default=14
    illness_fever = np.random.poisson(0.2, size=n)                        # int, 0‚Äì5 in past 3 months
    illness_cold = np.random.poisson(0.3, size=n)                         # int, 0‚Äì5 in past 3 months
    illness_diarrhea = np.random.poisson(0.05, size=n)                    # int, 0‚Äì5 in past 3 months
    illness_effect = -0.1 * (illness_diarrhea + illness_fever / 2 + illness_cold / 3)  # Diarrhea impacts most
    feeding_effect = -0.15 * (feeding_type / 2) + -0.05 * np.clip(5 - feeding_frequency, 0, 5)/5  # Poor type/freq lowers z
    weight_zscore = np.clip(np.random.normal(0 + illness_effect + feeding_effect, 1, size=n), -3, 3)
    height_zscore = np.clip(np.random.normal(0 + illness_effect / 2 + feeding_effect / 2, 1, size=n), -3, 3)  # Milder effect on height
    median_weight = np.where(sex==0, 3.3 + (age_days/30)*0.7, 3.2 + (age_days/30)*0.65)  # Boys slightly heavier
    sd_weight = 0.5 + (age_days/365)*0.5
    weight_kg = np.clip(median_weight + weight_zscore * sd_weight, 2.5, 3.5 + (age_days/30)*0.8)  # Max grows with age
    median_height = np.where(sex==0, 49.9 + (age_days/30)*2.5, 49.1 + (age_days/30)*2.4)
    sd_height = 2.0 + (age_days/365)*1.0
    height_cm = np.clip(median_height + height_zscore * sd_height, 45, 50 + (age_days/30)*2.7)  # Max grows with age
    # MUAC (after ~6 months, otherwise low)
    muac = np.clip(
        10.5 + 0.25 * weight_kg + 0.02 * (height_cm - 60) + 0.3 * sex + np.random.normal(0, 0.3, size=n),
        9, 17
    )

    p_smile = sigmoid((age_days - 45) / 10)  # Peaks ~6-8 weeks
    milestone_smile = np.random.binomial(1, p_smile, size=n)
    p_roll = sigmoid((age_days - 120) / 20)  # Peaks ~4-6 months
    milestone_roll = np.random.binomial(1, p_roll, size=n)
    p_sit = sigmoid((age_days - 210) / 30)  # Peaks ~7-9 months
    milestone_sit = np.random.binomial(1, p_sit, size=n)
    avg_weight_gain = np.where(is_existing==1, np.random.normal(0.45,0.1,size=n),0)  # float, 0.2‚Äì0.6 kg/mo, collective (trend), default=0
    weight_velocity = np.where(is_existing==1, np.random.normal(0.1,0.05,size=n),0)  # float, slope of weight over time, collective, default=0
    illness_freq_trend = np.where(is_existing==1, np.random.normal(0.3,0.1,size=n),0) # float, 0‚Äì3, illness trend, collective, default=0
    growth_percentile = np.clip(50 + weight_zscore*15 + np.random.normal(0,5,size=n),0,100) # float, 0‚Äì100, regression target
    # nutrition_flag = np.where((weight_zscore < -2) | ((feeding_type == 2) & (feeding_frequency < 4)), 1, 0) # category, 0=Adequate, 1=NeedsAttention
    nutrition_flag = np.where((weight_zscore < -1.5) | ((feeding_type == 2) & (feeding_frequency < 5)) | (sleep_hours < 11),1, 0)

    # --- Realistic illness logic ---
    base_fever = 0.25 + 0.1 * (2 - vaccination_status)          # base risk for fever
    base_cold = 0.2 + 0.05 * (2 - vaccination_status)           # base risk for cold
    base_diarrhea = 0.15 + 0.05 * (2 - vaccination_status)      # base risk for diarrhea
    feeding_effect = np.where(feeding_type == 1, 0.1, 0.0)      # effect of formula feeding
    freq_effect = np.clip((5 - feeding_frequency) * 0.02, 0, 0.1) # effect of feeding frequency
    sleep_effect = np.clip((12 - sleep_hours) * 0.02, 0, 0.15)    # effect of insufficient sleep
    growth_effect = np.clip((0 - weight_zscore) * 0.05, 0, 0.15)  # effect of low z-score
    trend_effect = illness_freq_trend * 0.05                       # effect of previous illness trend
    prob_fever = np.clip(base_fever + feeding_effect + freq_effect + sleep_effect + growth_effect + trend_effect + np.random.normal(0,0.02,n),0,1)  # float, 0‚Äì1
    prob_cold = np.clip(base_cold + feeding_effect/2 + sleep_effect + growth_effect/2 + trend_effect + np.random.normal(0,0.02,n),0,1)               # float, 0‚Äì1
    prob_diarrhea = np.clip(base_diarrhea + feeding_effect + freq_effect + growth_effect + trend_effect + np.random.normal(0,0.02,n),0,1)            # float, 0‚Äì1
    milestone_sit_delay_prob = np.clip(0.5 - 0.2*weight_zscore - 0.1*milestone_sit + np.random.normal(0,0.1,size=n),0,1)  # float, 0‚Äì1, probability of delay
    df = pd.DataFrame({
        'is_existing': is_existing,
        'age_days': age_days,
        'sex': sex,
        'weight_kg': weight_kg,
        'height_cm': height_cm,
        'muac_cm': muac,
        'weight_zscore': weight_zscore,
        'height_zscore': height_zscore,
        'feeding_type': feeding_type,
        'feeding_frequency': feeding_frequency,
        'vaccination_status': vaccination_status,
        'sleep_hours': sleep_hours,
        'illness_fever': illness_fever,
        'illness_cold': illness_cold,
        'illness_diarrhea': illness_diarrhea,
        'milestone_smile': milestone_smile,
        'milestone_roll': milestone_roll,
        'milestone_sit': milestone_sit,
        'avg_weight_gain': avg_weight_gain,
        'weight_velocity': weight_velocity,
        'illness_freq_trend': illness_freq_trend,
        'growth_percentile': growth_percentile,
        'nutrition_flag': nutrition_flag,
        'prob_fever': prob_fever,
        'prob_cold': prob_cold,
        'prob_diarrhea': prob_diarrhea,
        'milestone_sit_delay_prob': milestone_sit_delay_prob
    })
    df.loc[df["milestone_sit"] == 1, "milestone_sit_delay_prob"] = 0.0  # if milestone achieved, no delay
    return df



def generate_toddler_data(n=10000):
    is_existing = np.random.choice([1,0], size=n, p=[0.6,0.4])
    age_months = np.random.randint(12,37, size=n)
    sex = np.random.choice([0,1], size=n)
    feeding_type = np.random.choice([0,1,2], size=n, p=[0.5,0.3,0.2])
    feeding_frequency = np.random.randint(2,7,size=n)
    vaccination_status = np.random.choice([0,1,2], size=n, p=[0.8,0.15,0.05])  # 0=Up-to-date,1=Partial,2=Delayed

    sleep_hours = np.clip(np.random.normal(12,1.5,size=n),8,16)
    illness_fever = np.random.poisson(0.2,size=n)
    illness_cold = np.random.poisson(0.3,size=n)
    illness_diarrhea = np.random.poisson(0.05,size=n)

    illness_effect = -0.1 * (illness_diarrhea + illness_fever / 2 + illness_cold / 3)
    feeding_effect = -0.15 * (feeding_type / 2) + -0.05 * np.clip(5 - feeding_frequency, 0, 5)/5
    weight_zscore = np.clip(np.random.normal(0 + illness_effect + feeding_effect, 1, size=n), -3, 3)
    height_zscore = np.clip(np.random.normal(0 + illness_effect / 2 + feeding_effect / 2, 1, size=n), -3, 3)

    median_weight = np.where(sex==0, 9.6 + (age_months-12)*0.5, 9.2 + (age_months-12)*0.45)
    sd_weight = 1.0 + (age_months/36)*0.5
    weight_kg = np.clip(median_weight + weight_zscore * sd_weight, 7, 15)

    median_height = np.where(sex==0, 76 + (age_months-12)*2.5, 74 + (age_months-12)*2.4)
    sd_height = 2.0 + (age_months/36)*1.0
    height_cm = np.clip(median_height + height_zscore * sd_height, 70, 100)

    muac = np.clip(
        11.5 + 0.35 * weight_kg + 0.015 * (height_cm - 75) + 0.2 * sex + np.random.normal(0, 0.3, size=n),
        10, 18
    )
    bmi = np.clip(weight_kg / (height_cm / 100)**2, 13, 19)

    p_language = sigmoid((age_months - 18) / 3)
    milestones_language = np.random.binomial(1, p_language, size=n)
    p_walking = sigmoid((age_months - 12) / 2)
    milestones_walking = np.random.binomial(1, p_walking, size=n)

    avg_weight_gain = np.where(is_existing==1, np.random.normal(0.25,0.05,size=n),0)
    weight_velocity = np.where(is_existing==1, np.random.normal(0.1,0.05,size=n),0)
    illness_freq_trend = np.where(is_existing==1, np.random.normal(0.3,0.1,size=n),0)
    growth_percentile = np.clip(50 + weight_zscore*15 + np.random.normal(0,5,size=n),0,100)

    nutrition_flag = np.where(
        (weight_zscore < -1.5) | (bmi < 14) | ((feeding_type == 2) & (feeding_frequency < 4)) | (sleep_hours < 10),
        1, 0
    )

    # --- Realistic illness logic ---
    base_fever = 0.2 + 0.08 * (feeding_type != 0) + 0.1 * (2 - vaccination_status)
    sleep_effect = np.clip((11 - sleep_hours) * 0.02, 0, 0.1)
    freq_effect = np.clip((4 - feeding_frequency) * 0.03, 0, 0.1)
    growth_effect = np.clip((0 - weight_zscore) * 0.05, 0, 0.15)
    trend_effect = illness_freq_trend * 0.05

    prob_fever = np.clip(base_fever + sleep_effect + freq_effect + growth_effect + trend_effect + np.random.normal(0,0.02,n),0,1)
    prob_cold = np.clip(0.15 + 0.2*trend_effect + sleep_effect/2 + np.random.normal(0,0.02,n),0,1)
    prob_diarrhea = np.clip(0.1 + freq_effect + growth_effect + trend_effect + np.random.normal(0,0.02,n),0,1)

    milestones_language_delay_prob = np.clip(0.5 - 0.2*weight_zscore - 0.1*milestones_language + np.random.normal(0,0.1,size=n),0,1)
    milestones_walking_delay_prob = np.clip(0.5 - 0.2*weight_zscore - 0.1*milestones_walking + np.random.normal(0,0.1,size=n),0,1)

    df = pd.DataFrame({
        'is_existing': is_existing,
        'age_months': age_months,
        'sex': sex,
        'weight_kg': weight_kg,
        'height_cm': height_cm,
        'muac_cm': muac,
        'bmi': bmi,
        'weight_zscore': weight_zscore,
        'height_zscore': height_zscore,
        'feeding_type': feeding_type,
        'feeding_frequency': feeding_frequency,
        'vaccination_status': vaccination_status,
        'sleep_hours': sleep_hours,
        'illness_fever': illness_fever,
        'illness_cold': illness_cold,
        'illness_diarrhea': illness_diarrhea,
        'milestones_language': milestones_language,
        'milestones_walking': milestones_walking,
        'avg_weight_gain': avg_weight_gain,
        'weight_velocity': weight_velocity,
        'illness_freq_trend': illness_freq_trend,
        'growth_percentile': growth_percentile,
        'nutrition_flag': nutrition_flag,
        'prob_fever': prob_fever,
        'prob_cold': prob_cold,
        'prob_diarrhea': prob_diarrhea,
        'milestones_language_delay_prob': milestones_language_delay_prob,
        'milestones_walking_delay_prob': milestones_walking_delay_prob
    })
    df.loc[df["milestones_language"] == 1, "milestones_language_delay_prob"] = 0.0
    df.loc[df["milestones_walking"] == 1, "milestones_walking_delay_prob"] = 0.0
    return df






def generate_preschool_data(n=10000):
    is_existing = np.random.choice([1,0], size=n, p=[0.6,0.4])
    age_months = np.random.randint(36,73,size=n)
    sex = np.random.choice([0,1],size=n)
    feeding_type = np.random.choice([0,1], size=n,p=[0.6,0.4])
    feeding_frequency = np.random.randint(3,7,size=n)
    vaccination_status = np.random.choice([0,1,2], size=n, p=[0.85,0.1,0.05])  # 0=Up-to-date,1=Partial,2=Delayed

    sleep_hours = np.clip(np.random.normal(11,1.0,size=n),8,14)
    illness_fever = np.random.poisson(0.2,size=n)
    illness_cold = np.random.poisson(0.25,size=n)
    illness_diarrhea = np.random.poisson(0.05,size=n)

    illness_effect = -0.1 * (illness_diarrhea + illness_fever / 2 + illness_cold / 3)
    feeding_effect = -0.15 * (feeding_type / 1) + -0.05 * np.clip(5 - feeding_frequency, 0, 5)/5
    weight_zscore = np.clip(np.random.normal(0 + illness_effect + feeding_effect, 1, size=n), -3, 3)
    height_zscore = np.clip(np.random.normal(0 + illness_effect / 2 + feeding_effect / 2, 1, size=n), -3, 3)

    median_weight = np.where(sex==0, 14.3 + (age_months-36)*0.5, 13.9 + (age_months-36)*0.45)
    sd_weight = 1.5 + (age_months/72)*0.5
    weight_kg = np.clip(median_weight + weight_zscore * sd_weight, 10, 20)

    median_height = np.where(sex==0, 95.1 + (age_months-36)*2, 94.1 + (age_months-36)*1.9)
    sd_height = 2.5 + (age_months/72)*1.0
    height_cm = np.clip(median_height + height_zscore * sd_height, 85, 120)
    bmi = np.clip(weight_kg / (height_cm / 100)**2, 13, 19)

    muac = np.clip(
        12 + 0.4 * weight_kg + 0.01 * (height_cm - 90) + 0.2 * sex + np.random.normal(0, 0.3, size=n),
        11, 21
    )

    p_speech_clarity = sigmoid((age_months - 36) / 6)
    milestone_speech_clarity = np.random.binomial(1, p_speech_clarity, size=n)
    p_social_play = sigmoid((age_months - 48) / 6)
    milestone_social_play = np.random.binomial(1, p_social_play, size=n)

    avg_weight_gain = np.where(is_existing==1,np.random.normal(0.15,0.05,size=n),0)
    weight_velocity = np.where(is_existing==1,np.random.normal(0.1,0.05,size=n),0)
    illness_freq_trend = np.where(is_existing==1,np.random.normal(0.2,0.1,size=n),0)
    growth_percentile = np.clip(50 + weight_zscore*15 + np.random.normal(0,5,size=n),0,100)
    nutrition_flag = np.where(
        (weight_zscore < -1.3) | (bmi < 14) | ((feeding_type == 1) & (feeding_frequency < 3)) | (sleep_hours < 9.5),
        1, 0
    )

    # --- Realistic illness logic ---
    sleep_effect = np.clip((10 - sleep_hours) * 0.015, 0, 0.08)
    freq_effect = np.clip((3 - feeding_frequency) * 0.04, 0, 0.1)
    growth_effect = np.clip((0 - weight_zscore) * 0.05, 0, 0.1)
    trend_effect = illness_freq_trend * 0.05
    base_fever = 0.25 + sleep_effect + growth_effect + trend_effect + 0.1 * (2 - vaccination_status)
    prob_fever = np.clip(base_fever + np.random.normal(0,0.02,n),0,1)
    prob_cold = np.clip(0.2 + sleep_effect/2 + trend_effect + 0.1 * (2 - vaccination_status) + np.random.normal(0,0.02,n),0,1)
    prob_diarrhea = np.clip(0.15 + freq_effect + growth_effect + trend_effect + 0.1 * (2 - vaccination_status) + np.random.normal(0,0.02,n),0,1)

    milestone_speech_delay_prob = np.clip(0.5 - 0.2*weight_zscore - 0.1*milestone_speech_clarity + np.random.normal(0,0.1,size=n),0,1)
    milestone_social_play_delay_prob = np.clip(0.5 - 0.2*weight_zscore - 0.1*milestone_social_play + np.random.normal(0,0.1,size=n),0,1)

    df = pd.DataFrame({
        'is_existing':is_existing,
        'age_months':age_months,
        'sex':sex,
        'weight_kg':weight_kg,
        'height_cm':height_cm,
        'muac_cm': muac,
        'bmi':bmi,
        'weight_zscore':weight_zscore,
        'height_zscore':height_zscore,
        'feeding_type':feeding_type,
        'feeding_frequency':feeding_frequency,
        'vaccination_status': vaccination_status,
        'sleep_hours':sleep_hours,
        'illness_fever':illness_fever,
        'illness_cold':illness_cold,
        'illness_diarrhea':illness_diarrhea,
        'milestone_speech_clarity':milestone_speech_clarity,
        'milestone_social_play':milestone_social_play,
        'avg_weight_gain':avg_weight_gain,
        'weight_velocity':weight_velocity,
        'illness_freq_trend':illness_freq_trend,
        'growth_percentile':growth_percentile,
        'nutrition_flag':nutrition_flag,
        'prob_fever':prob_fever,
        'prob_cold':prob_cold,
        'prob_diarrhea':prob_diarrhea,
        'milestone_speech_delay_prob':milestone_speech_delay_prob,
        'milestone_social_play_delay_prob':milestone_social_play_delay_prob
    })
    df.loc[df["milestone_speech_clarity"] == 1, "milestone_speech_delay_prob"] = 0.0
    df.loc[df["milestone_social_play"] == 1, "milestone_social_play_delay_prob"] = 0.0
    return df





def generate_schoolage_data(n=10000):
    is_existing = np.random.choice([1,0],size=n,p=[0.6,0.4])
    age_years = np.random.randint(6,11,size=n)
    sex = np.random.choice([0,1],size=n)
    feeding_type = np.random.choice([0,1],size=n,p=[0.7,0.3])
    feeding_frequency = np.random.randint(3,7,size=n)
    vaccination_status = np.random.choice([0,1,2], size=n, p=[0.9,0.08,0.02])  # 0=Up-to-date,1=Partial,2=Delayed

    sleep_hours = np.clip(np.random.normal(9,1.0,size=n),7,12)
    illness_fever = np.random.poisson(0.1,size=n)
    illness_cold = np.random.poisson(0.2,size=n)
    illness_diarrhea = np.random.poisson(0.03,size=n)

    illness_effect = -0.1 * (illness_diarrhea + illness_fever / 2 + illness_cold / 3)
    feeding_effect = -0.15 * (feeding_type / 1) + -0.05 * np.clip(5 - feeding_frequency, 0, 5)/5
    weight_zscore = np.clip(np.random.normal(0 + illness_effect + feeding_effect, 1, size=n), -3, 3)
    height_zscore = np.clip(np.random.normal(0 + illness_effect / 2 + feeding_effect / 2, 1, size=n), -3, 3)

    median_weight = np.where(sex==0, 20.5 + (age_years-6)*3.0, 19.9 + (age_years-6)*2.8)
    sd_weight = 2.0 + (age_years/10)*1.0
    weight_kg = np.clip(median_weight + weight_zscore * sd_weight, 15, 40)

    median_height = np.where(sex==0, 115.0 + (age_years-6)*4.0, 114.0 + (age_years-6)*3.8)
    sd_height = 3.0 + (age_years/10)*1.5
    height_cm = np.clip(median_height + height_zscore * sd_height, 100, 150)

    bmi = np.clip(weight_kg / (height_cm / 100)**2, 14, 22)
    muac = np.clip(
        13 + 0.45 * weight_kg + 0.008 * (height_cm - 100) + 0.2 * sex + np.random.normal(0, 0.3, size=n),
        12, 25
    )

    p_learning_skill = sigmoid((age_years - 6) / 1)
    milestone_learning_skill = np.random.binomial(1, p_learning_skill, size=n)
    p_social_skill = sigmoid((age_years - 7) / 1)
    milestone_social_skill = np.random.binomial(1, p_social_skill, size=n)

    avg_weight_gain = np.where(is_existing==1,np.random.normal(0.1,0.05,size=n),0)
    weight_velocity = np.where(is_existing==1,np.random.normal(0.05,0.02,size=n),0)
    illness_freq_trend = np.where(is_existing==1,np.random.normal(0.1,0.05,size=n),0)
    growth_percentile = np.clip(50 + weight_zscore*15 + np.random.normal(0,5,size=n),0,100)
    nutrition_flag = np.where(
        (weight_zscore < -1.3) | (bmi < 15) | ((feeding_type == 1) & (feeding_frequency < 4)) | (sleep_hours < 8),
        1, 0
    )

    # --- Realistic illness logic ---
    sleep_effect = np.clip((9 - sleep_hours) * 0.015, 0, 0.08)
    freq_effect = np.clip((4 - feeding_frequency) * 0.04, 0, 0.1)
    growth_effect = np.clip((0 - weight_zscore) * 0.05, 0, 0.1)
    trend_effect = illness_freq_trend * 0.05
    base_fever = 0.15 + sleep_effect + growth_effect + trend_effect + 0.1 * (2 - vaccination_status)
    prob_fever = np.clip(base_fever + np.random.normal(0,0.02,n),0,1)
    prob_cold = np.clip(0.15 + sleep_effect/2 + trend_effect + 0.1 * (2 - vaccination_status) + np.random.normal(0,0.02,n),0,1)
    prob_diarrhea = np.clip(0.1 + freq_effect + growth_effect + trend_effect + 0.1 * (2 - vaccination_status) + np.random.normal(0,0.02,n),0,1)

    milestone_learning_delay_prob = np.clip(0.5 - 0.2*weight_zscore - 0.1*milestone_learning_skill + np.random.normal(0,0.1,size=n),0,1)
    milestone_social_skill_delay_prob = np.clip(0.5 - 0.2*weight_zscore - 0.1*milestone_social_skill + np.random.normal(0,0.1,size=n),0,1)

    df = pd.DataFrame({
        'is_existing':is_existing,
        'age_years':age_years,
        'sex':sex,
        'weight_kg':weight_kg,
        'height_cm':height_cm,
        'muac_cm': muac,
        'bmi':bmi,
        'weight_zscore':weight_zscore,
        'height_zscore':height_zscore,
        'feeding_type':feeding_type,
        'feeding_frequency':feeding_frequency,
        'vaccination_status': vaccination_status,
        'sleep_hours':sleep_hours,
        'illness_fever':illness_fever,
        'illness_cold':illness_cold,
        'illness_diarrhea':illness_diarrhea,
        'milestone_learning_skill':milestone_learning_skill,
        'milestone_social_skill':milestone_social_skill,
        'avg_weight_gain':avg_weight_gain,
        'weight_velocity':weight_velocity,
        'illness_freq_trend':illness_freq_trend,
        'growth_percentile':growth_percentile,
        'nutrition_flag':nutrition_flag,
        'prob_fever':prob_fever,
        'prob_cold':prob_cold,
        'prob_diarrhea':prob_diarrhea,
        'milestone_learning_delay_prob':milestone_learning_delay_prob,
        'milestone_social_skill_delay_prob':milestone_social_skill_delay_prob
    })
    df.loc[df["milestone_learning_skill"] == 1, "milestone_learning_delay_prob"] = 0.0
    df.loc[df["milestone_social_skill"] == 1, "milestone_social_skill_delay_prob"] = 0.0
    return df





# -----------------------------
# SAVE CSVs
# -----------------------------
if __name__=="__main__":
    infant_df = generate_infant_data(10000)
    infant_df.to_csv("infant_synthetic_data.csv", index=False)
    toddler_df = generate_toddler_data(10000)
    toddler_df.to_csv("toddler_synthetic_data.csv", index=False)
    preschool_df = generate_preschool_data(10000)
    preschool_df.to_csv("preschool_synthetic_data.csv", index=False)
    schoolage_df = generate_schoolage_data(10000)
    schoolage_df.to_csv("schoolage_synthetic_data.csv", index=False)
    print("‚úÖ Synthetic CSVs generated for all four categories with realistic milestone-delay and illness-probability relationships!")

‚úÖ Synthetic CSVs generated for all four categories with realistic milestone-delay and illness-probability relationships!


In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier
import shap
import matplotlib.pyplot as plt
import joblib

# -----------------------------
# Load Infant Data
# -----------------------------
infant_df = pd.read_csv("infant_synthetic_data.csv")

# -----------------------------
# Define targets
# -----------------------------
targets = [
    'growth_percentile',        # continuous
    'nutrition_flag',           # binary
    'prob_fever',               # continuous
    'prob_cold',                # continuous
    'prob_diarrhea',            # continuous
    'milestone_sit_delay_prob'  # continuous
]

# -----------------------------
# Split train/test
# -----------------------------
X = infant_df.drop(columns=targets)
y = infant_df[targets]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# Train models per target
# -----------------------------
models = {}
# shap_values = {} # Commented out SHAP initialization
metrics = {}

for target in targets:
    print(f"\nTraining model for: {target}")

    if target == 'nutrition_flag':
        # model = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)
        model = XGBClassifier(
              n_estimators=300,
              learning_rate=0.05,
              max_depth=6,
              scale_pos_weight = (len(y_train[target][y_train[target] == 0]) / len(y_train[target][y_train[target] == 1])),
              random_state=42
          )

    else:
        model = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)

    model.fit(X_train, y_train[target])
    models[target] = model

    # Predictions
    y_pred = model.predict(X_test)

    # Metrics
    if target == 'nutrition_flag':
        acc = accuracy_score(y_test[target], y_pred)
        metrics[target] = {'accuracy': acc}
        print(f"Accuracy for {target}: {acc:.4f}")
    else:
        r2 = r2_score(y_test[target], y_pred)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        metrics[target] = {'r2': r2, 'rmse': rmse}
        print(f"R2 for {target}: {r2:.4f}, RMSE: {rmse:.4f}")

    # SHAP values - Commented out due to errors
    # explainer = shap.Explainer(model.predict, X_train.sample(100, random_state=42))
    # shap_val = explainer(X_test)
    # shap_values[target] = shap_val

    # Plot SHAP summary - Commented out due to errors
    # plt.figure(figsize=(8,5))
    # shap.summary_plot(shap_val, X_test, show=True)

    # Save model
    joblib.dump(model, f"infant_{target}_xgb_model.pkl")

# print("\nAll Infant models trained, evaluated, and SHAP plots generated.") # Modified print statement
print("\nAll Infant models trained and evaluated.")


Training model for: growth_percentile
R2 for growth_percentile: 0.8958, RMSE: 5.1137

Training model for: nutrition_flag
Accuracy for nutrition_flag: 0.9950

Training model for: prob_fever
R2 for prob_fever: 0.9396, RMSE: 0.0210

Training model for: prob_cold
R2 for prob_cold: 0.8095, RMSE: 0.0204

Training model for: prob_diarrhea
R2 for prob_diarrhea: 0.8916, RMSE: 0.0210

Training model for: milestone_sit_delay_prob
R2 for milestone_sit_delay_prob: 0.9400, RMSE: 0.0744

All Infant models trained and evaluated.


In [24]:
# for toddler traininig scrips
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier
import joblib

# ====================================
# Load Toddler Data
# ====================================
toddler_df = pd.read_csv("toddler_synthetic_data.csv")

# ====================================
# Define Targets
# ====================================
targets = [
    'growth_percentile',           # continuous
    'nutrition_flag',              # binary
    'prob_fever',                  # continuous
    'prob_cold',                   # continuous
    'prob_diarrhea',               # continuous
    'milestones_language_delay_prob',
    'milestones_walking_delay_prob'   # continuous
]

# ====================================
# Split Train/Test
# ====================================
X = toddler_df.drop(columns=targets)
y = toddler_df[targets]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ====================================
# Train Models per Target
# ====================================
models = {}
metrics = {}

for target in targets:
    print(f"\nTraining model for: {target}")

    # Classification model (binary)
    if target == 'nutrition_flag':
        model = XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            scale_pos_weight=(len(y_train[y_train[target]==0]) /
                              len(y_train[y_train[target]==1])),
            random_state=42,
            eval_metric='logloss'
        )
    # Regression model (continuous)
    else:
        model = XGBRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            random_state=42
        )

    # Train model
    model.fit(X_train, y_train[target])
    models[target] = model

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluation metrics
    if target == 'nutrition_flag':
        acc = accuracy_score(y_test[target], y_pred.round())
        metrics[target] = {'accuracy': acc}
        print(f"Accuracy for {target}: {acc:.4f}")
    else:
        r2 = r2_score(y_test[target], y_pred)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        metrics[target] = {'r2': r2, 'rmse': rmse}
        print(f"R2 for {target}: {r2:.4f}, RMSE: {rmse:.4f}")

    # Save model
    joblib.dump(model, f"toddler_{target}_xgb_model.pkl")

print("\n‚úÖ All Toddler models trained and evaluated successfully.")
print("\nPerformance Summary:")
for target, vals in metrics.items():
    print(f"{target}: {vals}")



Training model for: growth_percentile
R2 for growth_percentile: 0.8884, RMSE: 5.2421

Training model for: nutrition_flag
Accuracy for nutrition_flag: 0.9970

Training model for: prob_fever
R2 for prob_fever: 0.9331, RMSE: 0.0206

Training model for: prob_cold
R2 for prob_cold: 0.0347, RMSE: 0.0203

Training model for: prob_diarrhea
R2 for prob_diarrhea: 0.7928, RMSE: 0.0203

Training model for: milestones_language_delay_prob
R2 for milestones_language_delay_prob: 0.9550, RMSE: 0.0537

Training model for: milestones_walking_delay_prob
R2 for milestones_walking_delay_prob: 0.9624, RMSE: 0.0274

‚úÖ All Toddler models trained and evaluated successfully.

Performance Summary:
growth_percentile: {'r2': 0.888409058855438, 'rmse': np.float64(5.242107165973478)}
nutrition_flag: {'accuracy': 0.997}
prob_fever: {'r2': 0.933054757391828, 'rmse': np.float64(0.02062857737984902)}
prob_cold: {'r2': 0.03465544730997416, 'rmse': np.float64(0.020251440090017156)}
prob_diarrhea: {'r2': 0.79279085806656

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier
import joblib

# ====================================
# Load Preschool Data
# ====================================
preschool_df = pd.read_csv("preschool_synthetic_data.csv")  # Your training CSV

# ====================================
# Define Targets
# ====================================
targets = [
    'growth_percentile',           # continuous
    'nutrition_flag',              # binary
    'prob_fever',                  # continuous
    'prob_cold',                   # continuous
    'prob_diarrhea',               # continuous
    'milestone_speech_delay_prob',
    'milestone_social_play_delay_prob'
]

# ====================================
# Feature/Target Split
# ====================================
X = preschool_df.drop(columns=targets)
y = preschool_df[targets]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ====================================
# Check Nutrition Flag Distribution
# ====================================
print("\n‚úÖ Nutrition Flag Distribution (Full Dataset):")
print(preschool_df['nutrition_flag'].value_counts())
print("\nPercentage distribution:")
print(preschool_df['nutrition_flag'].value_counts(normalize=True) * 100)

# ====================================
# Train Models
# ====================================
models = {}
metrics = {}

for target in targets:
    print(f"\nTraining model for: {target}")

    if target == 'nutrition_flag':
        model = XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            scale_pos_weight=(len(y_train[y_train[target]==0]) /
                              len(y_train[y_train[target]==1])),
            random_state=42,
            eval_metric='logloss'
        )
    else:
        model = XGBRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            random_state=42
        )

    # Train
    model.fit(X_train, y_train[target])
    models[target] = model

    # Predict
    y_pred = model.predict(X_test)

    # Metrics
    if target == 'nutrition_flag':
        acc = accuracy_score(y_test[target], y_pred.round())
        metrics[target] = {'accuracy': acc}
        print(f"Accuracy: {acc:.4f}")

        # Confusion matrix & classification report
        cm = confusion_matrix(y_test[target], y_pred.round())
        cr = classification_report(y_test[target], y_pred.round())
        print("\nConfusion Matrix:")
        print(cm)
        print("\nClassification Report:")
        print(cr)

    else:
        r2 = r2_score(y_test[target], y_pred)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        metrics[target] = {'r2': r2, 'rmse': rmse}
        print(f"R2: {r2:.4f}, RMSE: {rmse:.4f}")

    # Save model
    joblib.dump(model, f"preschool_{target}_xgb_model.pkl")

# ====================================
# Summary
# ====================================
print("\n‚úÖ All Preschool models trained and evaluated successfully.\n")
print("Performance Summary:")
for target, vals in metrics.items():
    print(f"{target}: {vals}")



‚úÖ Nutrition Flag Distribution (Full Dataset):
nutrition_flag
1    7631
0    2369
Name: count, dtype: int64

Percentage distribution:
nutrition_flag
1    76.31
0    23.69
Name: proportion, dtype: float64

Training model for: growth_percentile
R2: 0.8847, RMSE: 5.2656

Training model for: nutrition_flag
Accuracy: 0.9970

Confusion Matrix:
[[ 457    3]
 [   3 1537]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       460
           1       1.00      1.00      1.00      1540

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Training model for: prob_fever
R2: 0.8934, RMSE: 0.0207

Training model for: prob_cold
R2: 0.8772, RMSE: 0.0205

Training model for: prob_diarrhea
R2: 0.8943, RMSE: 0.0203

Training model for: milestone_speech_delay_prob
R2: 0.9588, RMSE: 0.0377

Training model for: milestone_so

In [26]:
"""
School-age Training Pipeline
Author: Ashwin Solanki
Purpose: Train XGBoost models to predict health, nutrition, and developmental milestones for school-age children (6‚Äì10 yrs).
Targets include percentile, nutrition flag, illness probabilities, and milestone delays.
"""

# -----------------------------
# 1Ô∏è‚É£ Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier
import joblib

# -----------------------------
# 2Ô∏è‚É£ Load School-Age Data
# -----------------------------
school_df = pd.read_csv("schoolage_synthetic_data.csv")  # replace with your CSV

# -----------------------------
# 3Ô∏è‚É£ Define Targets
# -----------------------------
targets = [
    'growth_percentile',              # continuous
    'nutrition_flag',                 # binary
    'prob_fever',                     # continuous
    'prob_cold',                      # continuous
    'prob_diarrhea',                  # continuous
    'milestone_learning_delay_prob',  # continuous
    'milestone_social_skill_delay_prob'     # continuous
]

# -----------------------------
# 4Ô∏è‚É£ Split Features & Labels
# -----------------------------
X = school_df.drop(columns=targets)
y = school_df[targets]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 5Ô∏è‚É£ Train Models per Target
# -----------------------------
models = {}
metrics = {}

for target in targets:
    print(f"\nTraining model for: {target}")

    # Classification model for nutrition_flag
    if target == 'nutrition_flag':
        model = XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            scale_pos_weight=(len(y_train[y_train[target]==0]) /
                              len(y_train[y_train[target]==1])),
            random_state=42,
            eval_metric='logloss'
        )
    else:
        model = XGBRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=6,
            random_state=42
        )

    # Train
    model.fit(X_train, y_train[target])
    models[target] = model

    # Predict
    y_pred = model.predict(X_test)

    # Metrics & Evaluation
    if target == 'nutrition_flag':
        acc = accuracy_score(y_test[target], y_pred.round())
        cm = confusion_matrix(y_test[target], y_pred.round())
        cr = classification_report(y_test[target], y_pred.round())
        metrics[target] = {'accuracy': acc, 'confusion_matrix': cm, 'classification_report': cr}
        print(f"Accuracy for {target}: {acc:.4f}")
        print("Confusion Matrix:\n", cm)
        print("Classification Report:\n", cr)
    else:
        r2 = r2_score(y_test[target], y_pred)
        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
        metrics[target] = {'r2': r2, 'rmse': rmse}
        print(f"R2 for {target}: {r2:.4f}, RMSE: {rmse:.4f}")

    # Save model
    joblib.dump(model, f"schoolage_{target}_xgb_model.pkl")

print("\n‚úÖ All School-age models trained and evaluated successfully.")

# -----------------------------
# 6Ô∏è‚É£ Summary
# -----------------------------
print("\nPerformance Summary:")
for target, vals in metrics.items():
    print(f"{target}: {vals}")



Training model for: growth_percentile
R2 for growth_percentile: 0.8907, RMSE: 5.0987

Training model for: nutrition_flag
Accuracy for nutrition_flag: 0.9975
Confusion Matrix:
 [[1244    4]
 [   1  751]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1248
           1       0.99      1.00      1.00       752

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Training model for: prob_fever
R2 for prob_fever: 0.8429, RMSE: 0.0208

Training model for: prob_cold
R2 for prob_cold: 0.7941, RMSE: 0.0204

Training model for: prob_diarrhea
R2 for prob_diarrhea: 0.8677, RMSE: 0.0203

Training model for: milestone_learning_delay_prob
R2 for milestone_learning_delay_prob: 0.9601, RMSE: 0.0439

Training model for: milestone_social_skill_delay_prob
R2 for milestone_social_skill_delay_prob: 0.9513, RMSE: 0.05

In [27]:
"""
Infant Prediction Pipeline (Production-ready)
Author: Ashwin Solanki
Purpose: Predict multiple health & nutrition targets for infants (0‚Äì1 years).
Handles feature alignment, missing columns, and realistic input data.
"""

# -----------------------------
# 1Ô∏è‚É£ Imports
# -----------------------------
import pandas as pd
import joblib
import os
import numpy as np

# -----------------------------
# 2Ô∏è‚É£ Configurations
# -----------------------------
MODEL_DIR = "/content"  # Change to your deployed model directory
OUTPUT_FILE = "infant_predictions.csv"

# Targets
TARGETS = [
    'growth_percentile',
    'nutrition_flag',
    'prob_fever',
    'prob_cold',
    'prob_diarrhea',
    'milestone_sit_delay_prob'
]

# Map targets to model paths
MODEL_FILES = {t: os.path.join(MODEL_DIR, f"infant_{t}_xgb_model.pkl") for t in TARGETS}

# -----------------------------
# 3Ô∏è‚É£ Training Feature Order (from your trained models)
# -----------------------------
TRAIN_FEATURES = [
    'is_existing', 'age_days', 'sex', 'weight_kg', 'height_cm','muac_cm',
    'weight_zscore', 'height_zscore', 'feeding_type', 'feeding_frequency',
    'vaccination_status', 'sleep_hours', 'illness_fever', 'illness_cold',
    'illness_diarrhea', 'milestone_smile', 'milestone_roll', 'milestone_sit',
    'avg_weight_gain', 'weight_velocity', 'illness_freq_trend'
]

# -----------------------------
# 4Ô∏è‚É£ Simulated Realistic Infant Data
# -----------------------------
X_new = pd.DataFrame({
    'is_existing': [1, 1, 0, 0, 1],
    'age_days': [300, 270, 250, 320, 290],
    'sex': [0, 1, 0, 1, 0],
    'weight_kg': [7.0, 8.2, 9.5, 10.5, 8.0],
    'height_cm': [68.0, 70.0, 75.0, 78.0, 71.0],
    'weight_zscore': [-2.0, -1.2, 0.0, 0.5, -0.5],
    'height_zscore': [-1.5, -0.8, 0.2, 0.6, -0.3],
    'muac_cm': [12.0, 12.5, 13.0, 13.2, 12.1],
    'feeding_type': [0, 0, 1, 1, 2],
    'feeding_frequency': [5, 6, 7, 8, 5],
    'vaccination_status': [0, 0, 1, 1, 0],
    'sleep_hours': [12, 13, 14, 15, 13],
    'illness_fever': [1, 0, 0, 0, 1],
    'illness_cold': [1, 0, 0, 0, 0],
    'illness_diarrhea': [0, 0, 0, 0, 0],
    'milestone_smile': [1, 1, 1, 1, 1],
    'milestone_roll': [0, 1, 1, 1, 0],
    'milestone_sit': [0, 0, 1, 1, 0],
    'avg_weight_gain': [0.1, 0.3, 0.5, 0.8, 0.2],
    'weight_velocity': [0.02, 0.05, 0.12, 0.18, 0.04],
    'illness_freq_trend': [0.5, 0.3, 0.1, 0.05, 0.4]
})

print("‚úÖ Incoming infant data loaded.")
print("Shape:", X_new.shape)
print("Columns:", list(X_new.columns))

# -----------------------------
# 4Ô∏è‚É£1 Auto Feature Alignment
# -----------------------------
# Add missing columns with default 0
for col in TRAIN_FEATURES:
    if col not in X_new.columns:
        print(f"‚ö†Ô∏è Column '{col}' missing in input. Adding default 0 values.")
        X_new[col] = 0

# Reorder columns to match training
X_model = X_new[TRAIN_FEATURES]

# -----------------------------
# 5Ô∏è‚É£ Run Predictions
# -----------------------------
predictions = pd.DataFrame(index=X_model.index)

for target in TARGETS:
    print(f"\nüîπ Predicting: {target}")

    model_path = MODEL_FILES[target]
    if not os.path.exists(model_path):
        print(f"‚ö†Ô∏è Model file not found for target: {target}. Skipping.")
        continue

    # Load model
    model = joblib.load(model_path)

    try:
        y_pred = model.predict(X_model)

        if target == 'nutrition_flag':
            try:
                y_prob = model.predict_proba(X_model)[:, 1]
                predictions[f"{target}_pred"] = (y_prob > 0.5).astype(int)
                predictions[f"{target}_prob"] = y_prob
            except:
                predictions[f"{target}_pred"] = y_pred
        else:
            predictions[f"{target}_pred"] = y_pred

    except Exception as e:
        print(f"‚ùå Error predicting {target}: {e}")
        predictions[f"{target}_pred"] = np.nan

# -----------------------------
# 6Ô∏è‚É£ Combine Results
# -----------------------------
final_results = pd.concat([X_new.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1)

# -----------------------------
# 7Ô∏è‚É£ Save Predictions
# -----------------------------
final_results.to_csv(OUTPUT_FILE, index=False)
print(f"\n‚úÖ Predictions completed successfully! Saved to: {OUTPUT_FILE}")

# -----------------------------
# 8Ô∏è‚É£ Display Sample
# -----------------------------
print("\nSample predictions:")
print(final_results.head())

# -----------------------------
# 9Ô∏è‚É£ Deployment Notes
# -----------------------------
# - Incoming data columns can be in any order; missing columns are auto-added with 0.
# - MODEL_DIR must point to the deployed model folder.
# - Column alignment avoids feature mismatch errors.
# - Optional: Add SHAP explanations for doctor-facing dashboards.


‚úÖ Incoming infant data loaded.
Shape: (5, 21)
Columns: ['is_existing', 'age_days', 'sex', 'weight_kg', 'height_cm', 'weight_zscore', 'height_zscore', 'muac_cm', 'feeding_type', 'feeding_frequency', 'vaccination_status', 'sleep_hours', 'illness_fever', 'illness_cold', 'illness_diarrhea', 'milestone_smile', 'milestone_roll', 'milestone_sit', 'avg_weight_gain', 'weight_velocity', 'illness_freq_trend']

üîπ Predicting: growth_percentile

üîπ Predicting: nutrition_flag

üîπ Predicting: prob_fever

üîπ Predicting: prob_cold

üîπ Predicting: prob_diarrhea

üîπ Predicting: milestone_sit_delay_prob

‚úÖ Predictions completed successfully! Saved to: infant_predictions.csv

Sample predictions:
   is_existing  age_days  sex  weight_kg  height_cm  weight_zscore  \
0            1       300    0        7.0       68.0           -2.0   
1            1       270    1        8.2       70.0           -1.2   
2            0       250    0        9.5       75.0            0.0   
3            0      

In [28]:
"""
Toddler Prediction Pipeline (Production-ready)
Author: Ashwin Solanki
Purpose: Predict multiple health & nutrition targets for toddlers (1‚Äì3 years).
Handles feature alignment, missing columns, and realistic input data.
"""

# -----------------------------
# 1Ô∏è‚É£ Imports
# -----------------------------
import pandas as pd
import joblib
import os
import numpy as np

# -----------------------------
# 2Ô∏è‚É£ Configurations
# -----------------------------
MODEL_DIR = "/content"   # Change to your deployed model directory
OUTPUT_FILE = "toddler_predictions.csv"

# Targets (as trained)
TARGETS = [
    'growth_percentile',
    'nutrition_flag',
    'prob_fever',
    'prob_cold',
    'prob_diarrhea',
    'milestones_walking_delay_prob',
    'milestones_language_delay_prob'
]

# Map targets to model paths
MODEL_FILES = {t: os.path.join(MODEL_DIR, f"toddler_{t}_xgb_model.pkl") for t in TARGETS}

# -----------------------------
# 3Ô∏è‚É£ Training Feature Order (from your trained models)
# -----------------------------
# This must match exactly the columns used during training
TRAIN_FEATURES = [
    'is_existing', 'age_months', 'sex', 'weight_kg', 'height_cm','muac_cm', 'bmi',
    'weight_zscore', 'height_zscore', 'feeding_type', 'feeding_frequency', 'vaccination_status',
    'sleep_hours', 'illness_fever', 'illness_cold', 'illness_diarrhea',
    'milestones_language', 'milestones_walking', 'avg_weight_gain',
    'weight_velocity', 'illness_freq_trend'
]

# -----------------------------
# 4Ô∏è‚É£ Simulated Realistic Toddler Data
# -----------------------------
X_new = pd.DataFrame({
    'is_existing': [1, 0, 1, 1, 0],
    'age_months': [14, 18, 24, 30, 34],
    'sex': [0, 1, 0, 1, 1],
    'weight_kg': [9.8, 10.5, 12.2, 13.0, 11.5],
    'height_cm': [78.0, 83.0, 89.0, 93.0, 86.0],
    'muac_cm': [13.0, 13.5, 14.2, 15.0, 14.0],
    'bmi': [16.1, 15.2, 15.4, 15.0, 15.5],  # weight / (height_m)^2
    'weight_zscore': [-1.8, -0.7, 0.1, 0.6, -0.5],
    'height_zscore': [-1.2, -0.6, 0.2, 0.7, -0.4],
    'feeding_type': [0, 1, 1, 0, 2],       # 0=FamilyFood, 1=Mixed, 2=Bottle
    'feeding_frequency': [4, 5, 5, 4, 6],
    'vaccination_status': [0, 1, 2, 1, 0],
    'sleep_hours': [12, 11, 13, 12, 11],
    'illness_fever': [0, 1, 0, 0, 1],
    'illness_cold': [1, 0, 1, 1, 0],
    'illness_diarrhea': [0, 0, 1, 0, 0],
    'milestones_language': [0, 1, 1, 1, 0],   # corresponds to milestone_talk
    'milestones_walking': [1, 1, 1, 1, 0],    # corresponds to milestone_walk
    'avg_weight_gain': [0.18, 0.22, 0.25, 0.3, 0.2],
    'weight_velocity': [0.05, 0.08, 0.1, 0.12, 0.06],
    'illness_freq_trend': [0.2, 0.4, 0.3, 0.25, 0.5]
})

print("‚úÖ Incoming toddler data loaded.")
print("Shape:", X_new.shape)
print("Columns:", list(X_new.columns))

# -----------------------------
# 4Ô∏è‚É£1 Auto Feature Alignment
# -----------------------------
# Add missing columns
for col in TRAIN_FEATURES:
    if col not in X_new.columns:
        print(f"‚ö†Ô∏è Column '{col}' missing in input. Adding default 0 values.")
        X_new[col] = 0

# Reorder columns to match training
X_model = X_new[TRAIN_FEATURES]

# -----------------------------
# 5Ô∏è‚É£ Run Predictions
# -----------------------------
predictions = pd.DataFrame(index=X_model.index)

for target in TARGETS:
    print(f"\nüîπ Predicting: {target}")

    model_path = MODEL_FILES[target]
    if not os.path.exists(model_path):
        print(f"‚ö†Ô∏è Model file not found for target: {target}. Skipping.")
        continue

    # Load model
    model = joblib.load(model_path)

    # Predict
    try:
        y_pred = model.predict(X_model)

        # Classifier handling
        if target == 'nutrition_flag':
            try:
                y_prob = model.predict_proba(X_model)[:, 1]
                predictions[f"{target}_pred"] = (y_prob > 0.5).astype(int)
                predictions[f"{target}_prob"] = y_prob
            except:
                predictions[f"{target}_pred"] = y_pred
        else:
            predictions[f"{target}_pred"] = y_pred

    except Exception as e:
        print(f"‚ùå Error predicting {target}: {e}")
        predictions[f"{target}_pred"] = np.nan

# -----------------------------
# 6Ô∏è‚É£ Combine Results
# -----------------------------
final_results = pd.concat([X_new.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1)

# -----------------------------
# 7Ô∏è‚É£ Save Predictions
# -----------------------------
final_results.to_csv(OUTPUT_FILE, index=False)
print(f"\n‚úÖ Predictions completed successfully! Saved to: {OUTPUT_FILE}")

# -----------------------------
# 8Ô∏è‚É£ Display Sample
# -----------------------------
print("\nSample predictions:")
print(final_results.head())

# -----------------------------
# 9Ô∏è‚É£ Deployment Notes
# -----------------------------
# - Incoming data columns can be in any order; missing columns are automatically added with 0.
# - MODEL_DIR must point to the deployed model folder.
# - Column alignment ensures XGBoost feature mismatch errors are avoided.
# - Optional: add SHAP explainability for doctor-facing dashboards.


‚úÖ Incoming toddler data loaded.
Shape: (5, 21)
Columns: ['is_existing', 'age_months', 'sex', 'weight_kg', 'height_cm', 'muac_cm', 'bmi', 'weight_zscore', 'height_zscore', 'feeding_type', 'feeding_frequency', 'vaccination_status', 'sleep_hours', 'illness_fever', 'illness_cold', 'illness_diarrhea', 'milestones_language', 'milestones_walking', 'avg_weight_gain', 'weight_velocity', 'illness_freq_trend']

üîπ Predicting: growth_percentile

üîπ Predicting: nutrition_flag

üîπ Predicting: prob_fever

üîπ Predicting: prob_cold

üîπ Predicting: prob_diarrhea

üîπ Predicting: milestones_walking_delay_prob

üîπ Predicting: milestones_language_delay_prob

‚úÖ Predictions completed successfully! Saved to: toddler_predictions.csv

Sample predictions:
   is_existing  age_months  sex  weight_kg  height_cm  muac_cm   bmi  \
0            1          14    0        9.8       78.0     13.0  16.1   
1            0          18    1       10.5       83.0     13.5  15.2   
2            1          24  

In [29]:
"""
Preschool Prediction Pipeline (Production-ready)
Author: Ashwin Solanki
Purpose: Predict multiple health & nutrition targets for preschoolers (3‚Äì6 years).
Handles feature alignment, missing columns, and realistic input data.
"""

# -----------------------------
# 1Ô∏è‚É£ Imports
# -----------------------------
import pandas as pd
import joblib
import os
import numpy as np

# -----------------------------
# 2Ô∏è‚É£ Configurations
# -----------------------------
MODEL_DIR = "/content"   # Change to your deployed model directory
OUTPUT_FILE = "preschool_predictions.csv"

# Targets (as trained)
TARGETS = [
    'growth_percentile',
    'nutrition_flag',
    'prob_fever',
    'prob_cold',
    'prob_diarrhea',
    'milestone_speech_delay_prob',
    'milestone_social_play_delay_prob'
]

# Map targets to model paths
MODEL_FILES = {t: os.path.join(MODEL_DIR, f"preschool_{t}_xgb_model.pkl") for t in TARGETS}

# -----------------------------
# 3Ô∏è‚É£ Training Feature Order (from your trained models)
# -----------------------------
TRAIN_FEATURES = [
    'is_existing', 'age_months', 'sex', 'weight_kg', 'height_cm','muac_cm', 'bmi',
    'weight_zscore', 'height_zscore', 'feeding_type', 'feeding_frequency','vaccination_status',
    'sleep_hours', 'illness_fever', 'illness_cold', 'illness_diarrhea',
    'milestone_speech_clarity','milestone_social_play', 'avg_weight_gain',
    'weight_velocity', 'illness_freq_trend'
]

# -----------------------------
# 4Ô∏è‚É£ Simulated Realistic Preschool Data
# -----------------------------
X_new = pd.DataFrame({
    'is_existing': [1, 0, 1, 0, 1],                     # Existing patient or new
    'age_months': [42, 48, 50, 60, 65],                # 3.5 to 5.5 years
    'sex': [0, 1, 0, 1, 0],                            # 0=Male, 1=Female
    'weight_kg': [15.2, 16.5, 17.0, 19.0, 18.5],       # realistic weight
    'height_cm': [95.0, 98.0, 100.0, 105.0, 108.0],    # realistic height
    'muac_cm': [15.8, 16.3, 17.0, 17.8, 18.0],
    'bmi': [16.8, 17.1, 17.0, 17.2, 15.9],             # BMI
    'weight_zscore': [-0.5, 0.0, 0.2, 0.5, 0.1],       # growth z-score
    'height_zscore': [-0.3, 0.1, 0.2, 0.6, 0.4],       # growth z-score
    'feeding_type': [0, 1, 0, 1, 0],                   # 0=FamilyFood, 1=Mixed
    'feeding_frequency': [3, 4, 4, 5, 3],              # meals per day
    'vaccination_status': [2, 1, 2, 2, 1],

    'sleep_hours': [10, 11, 10, 9, 10],                # avg sleep
    'illness_fever': [0, 1, 0, 0, 1],                  # 0=no, 1=yes
    'illness_cold': [1, 0, 1, 1, 0],
    'illness_diarrhea': [0, 0, 0, 0, 0],
    'milestone_speech_clarity': [1, 0, 1, 1, 1],      # 1=on track, 0=delayed
    'milestone_social_play': [1, 0, 1, 1, 1],         # 1=on track, 0=delayed
    'avg_weight_gain': [0.18, 0.22, 0.25, 0.30, 0.20],# kg/month
    'weight_velocity': [0.08, 0.10, 0.09, 0.12, 0.11],
    'illness_freq_trend': [0.20, 0.40, 0.30, 0.25, 0.35] # 0=low, 1=high
})

print("‚úÖ Incoming preschool data loaded.")
print("Shape:", X_new.shape)
print("Columns:", list(X_new.columns))

# -----------------------------
# 4Ô∏è‚É£1 Auto Feature Alignment
# -----------------------------
# Add missing columns with default 0 values
for col in TRAIN_FEATURES:
    if col not in X_new.columns:
        print(f"‚ö†Ô∏è Column '{col}' missing in input. Adding default 0 values.")
        X_new[col] = 0

# Reorder columns to match training
X_model = X_new[TRAIN_FEATURES]

# -----------------------------
# 5Ô∏è‚É£ Run Predictions
# -----------------------------
predictions = pd.DataFrame(index=X_model.index)

for target in TARGETS:
    print(f"\nüîπ Predicting: {target}")

    model_path = MODEL_FILES[target]
    if not os.path.exists(model_path):
        print(f"‚ö†Ô∏è Model file not found for target: {target}. Skipping.")
        continue

    # Load model
    model = joblib.load(model_path)

    # Predict
    try:
        y_pred = model.predict(X_model)

        # Classifier handling
        if target == 'nutrition_flag':
            try:
                y_prob = model.predict_proba(X_model)[:, 1]
                predictions[f"{target}_pred"] = (y_prob > 0.5).astype(int)
                predictions[f"{target}_prob"] = y_prob
            except:
                predictions[f"{target}_pred"] = y_pred
        else:
            predictions[f"{target}_pred"] = y_pred

    except Exception as e:
        print(f"‚ùå Error predicting {target}: {e}")
        predictions[f"{target}_pred"] = np.nan

# -----------------------------
# 6Ô∏è‚É£ Combine Results
# -----------------------------
final_results = pd.concat([X_new.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1)

# -----------------------------
# 7Ô∏è‚É£ Save Predictions
# -----------------------------
final_results.to_csv(OUTPUT_FILE, index=False)
print(f"\n‚úÖ Predictions completed successfully! Saved to: {OUTPUT_FILE}")

# -----------------------------
# 8Ô∏è‚É£ Display Sample
# -----------------------------
print("\nSample predictions:")
print(final_results.head())

# -----------------------------
# 9Ô∏è‚É£ Deployment Notes
# -----------------------------
# - Incoming data columns can be in any order; missing columns are automatically added with 0.
# - MODEL_DIR must point to the deployed model folder.
# - Column alignment ensures XGBoost feature mismatch errors are avoided.
# - Optional: add SHAP explainability for doctor-facing dashboards.


‚úÖ Incoming preschool data loaded.
Shape: (5, 21)
Columns: ['is_existing', 'age_months', 'sex', 'weight_kg', 'height_cm', 'muac_cm', 'bmi', 'weight_zscore', 'height_zscore', 'feeding_type', 'feeding_frequency', 'vaccination_status', 'sleep_hours', 'illness_fever', 'illness_cold', 'illness_diarrhea', 'milestone_speech_clarity', 'milestone_social_play', 'avg_weight_gain', 'weight_velocity', 'illness_freq_trend']

üîπ Predicting: growth_percentile

üîπ Predicting: nutrition_flag

üîπ Predicting: prob_fever

üîπ Predicting: prob_cold

üîπ Predicting: prob_diarrhea

üîπ Predicting: milestone_speech_delay_prob

üîπ Predicting: milestone_social_play_delay_prob

‚úÖ Predictions completed successfully! Saved to: preschool_predictions.csv

Sample predictions:
   is_existing  age_months  sex  weight_kg  height_cm  muac_cm   bmi  \
0            1          42    0       15.2       95.0     15.8  16.8   
1            0          48    1       16.5       98.0     16.3  17.1   
2            1  

In [30]:
"""
School-Age Prediction Pipeline (Production-ready)
Author: Ashwin Solanki
Purpose: Predict multiple health & nutrition targets for school-age children (6‚Äì10 years).
Handles feature alignment, missing columns, realistic input data, and age in years.
"""

# -----------------------------
# 1Ô∏è‚É£ Imports
# -----------------------------
import pandas as pd
import joblib
import os
import numpy as np

# -----------------------------
# 2Ô∏è‚É£ Configurations
# -----------------------------
MODEL_DIR = "/content"   # Change to your deployed model directory
OUTPUT_FILE = "schoolage_predictions.csv"

# Targets (as trained)
TARGETS = [
    'growth_percentile',
    'nutrition_flag',
    'prob_fever',
    'prob_cold',
    'prob_diarrhea',
    'milestone_learning_delay_prob',
    'milestone_social_skill_delay_prob'
]

# Map targets to model paths
MODEL_FILES = {t: os.path.join(MODEL_DIR, f"schoolage_{t}_xgb_model.pkl") for t in TARGETS}

# -----------------------------
# 3Ô∏è‚É£ Training Feature Order
# -----------------------------
# Must match exactly the columns used during training
TRAIN_FEATURES = [
    'is_existing', 'age_years', 'sex', 'weight_kg', 'height_cm','muac_cm', 'bmi',
    'weight_zscore', 'height_zscore', 'feeding_type', 'feeding_frequency','vaccination_status',
    'sleep_hours', 'illness_fever', 'illness_cold', 'illness_diarrhea',
    'milestone_learning_skill', 'milestone_social_skill', 'avg_weight_gain',
    'weight_velocity', 'illness_freq_trend'
]

# -----------------------------
# 4Ô∏è‚É£ Realistic Synthetic School-Age Data
# -----------------------------
X_new = pd.DataFrame({
    'is_existing': [1, 0, 1, 0, 1],
    'age_years': [6, 7, 8, 9, 10],
    'sex': [0, 1, 0, 1, 0],
    'weight_kg': [20.5, 23.0, 25.5, 28.0, 32.0],
    'height_cm': [115.0, 120.0, 125.0, 130.0, 138.0],
    'muac_cm': [18.5, 19.0, 19.8, 20.5, 21.0],  # Optional

    'bmi': [15.5, 16.0, 16.4, 16.5, 16.8],
    'weight_zscore': [-0.2, 0.1, 0.3, 0.5, 0.6],
    'height_zscore': [-0.1, 0.2, 0.3, 0.6, 0.7],
    'feeding_type': [0, 1, 0, 1, 0],          # 0=Home, 1=Mixed
    'feeding_frequency': [3, 4, 4, 5, 3],
    'vaccination_status': [2, 1, 2, 2, 1],
    'sleep_hours': [10, 9, 9, 8, 8],
    'illness_fever': [0, 0, 1, 0, 1],
    'illness_cold': [1, 0, 1, 1, 0],
    'illness_diarrhea': [0, 0, 0, 1, 0],
    'milestone_learning_skill': [1, 1, 1, 1, 0],
    'milestone_social_skill': [1, 1, 1, 1, 0],
    'avg_weight_gain': [0.3, 0.35, 0.4, 0.5, 0.45],
    'weight_velocity': [0.08, 0.10, 0.09, 0.12, 0.11],
    'illness_freq_trend': [0.2, 0.3, 0.25, 0.35, 0.4]
})

print("‚úÖ Incoming school-age data loaded.")
print("Shape:", X_new.shape)
print("Columns:", list(X_new.columns))

# -----------------------------
# 4Ô∏è‚É£1 Auto Feature Alignment
# -----------------------------
# Add missing columns
for col in TRAIN_FEATURES:
    if col not in X_new.columns:
        print(f"‚ö†Ô∏è Column '{col}' missing in input. Adding default 0 values.")
        X_new[col] = 0

# Reorder columns to match training
X_model = X_new[TRAIN_FEATURES]

# -----------------------------
# 5Ô∏è‚É£ Run Predictions
# -----------------------------
predictions = pd.DataFrame(index=X_model.index)

for target in TARGETS:
    print(f"\nüîπ Predicting: {target}")

    model_path = MODEL_FILES[target]
    if not os.path.exists(model_path):
        print(f"‚ö†Ô∏è Model file not found for target: {target}. Skipping.")
        continue

    # Load model
    model = joblib.load(model_path)

    # Predict
    try:
        y_pred = model.predict(X_model)

        # Classifier handling
        if target == 'nutrition_flag':
            try:
                y_prob = model.predict_proba(X_model)[:, 1]
                predictions[f"{target}_pred"] = (y_prob > 0.5).astype(int)
                predictions[f"{target}_prob"] = y_prob
            except:
                predictions[f"{target}_pred"] = y_pred
        else:
            predictions[f"{target}_pred"] = y_pred

    except Exception as e:
        print(f"‚ùå Error predicting {target}: {e}")
        predictions[f"{target}_pred"] = np.nan

# -----------------------------
# 6Ô∏è‚É£ Combine Results
# -----------------------------
final_results = pd.concat([X_new.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1)

# -----------------------------
# 7Ô∏è‚É£ Save Predictions
# -----------------------------
final_results.to_csv(OUTPUT_FILE, index=False)
print(f"\n‚úÖ Predictions completed successfully! Saved to: {OUTPUT_FILE}")

# -----------------------------
# 8Ô∏è‚É£ Display Sample
# -----------------------------
print("\nSample predictions:")
print(final_results.head())

# -----------------------------
# 9Ô∏è‚É£ Deployment Notes
# -----------------------------
# - Incoming data columns can be in any order; missing columns are automatically added with 0.
# - MODEL_DIR must point to the deployed model folder.
# - Column alignment ensures XGBoost feature mismatch errors are avoided.
# - Age is now in years.
# - Optional: add SHAP explainability for doctor-facing dashboards.


‚úÖ Incoming school-age data loaded.
Shape: (5, 21)
Columns: ['is_existing', 'age_years', 'sex', 'weight_kg', 'height_cm', 'muac_cm', 'bmi', 'weight_zscore', 'height_zscore', 'feeding_type', 'feeding_frequency', 'vaccination_status', 'sleep_hours', 'illness_fever', 'illness_cold', 'illness_diarrhea', 'milestone_learning_skill', 'milestone_social_skill', 'avg_weight_gain', 'weight_velocity', 'illness_freq_trend']

üîπ Predicting: growth_percentile

üîπ Predicting: nutrition_flag

üîπ Predicting: prob_fever

üîπ Predicting: prob_cold

üîπ Predicting: prob_diarrhea

üîπ Predicting: milestone_learning_delay_prob

üîπ Predicting: milestone_social_skill_delay_prob

‚úÖ Predictions completed successfully! Saved to: schoolage_predictions.csv

Sample predictions:
   is_existing  age_years  sex  weight_kg  height_cm  muac_cm   bmi  \
0            1          6    0       20.5      115.0     18.5  15.5   
1            0          7    1       23.0      120.0     19.0  16.0   
2            1 