## Loading Data

In [5]:
import pandas as pd
import numpy as np  
import sklearn

In [6]:
# Read the extracted CSV (two directories up) into the parent frame used by later cells.
dataset_parent = pd.read_csv(filepath_or_buffer='../../helix_extracted.csv')

In [7]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
dataset_parent

Unnamed: 0,ID,h_abs_ratio_preg_Log,h_no2_ratio_preg_Log,h_pm10_ratio_preg_None,h_pm25_ratio_preg_None,hs_no2_dy_hs_h_Log,hs_no2_wk_hs_h_Log,hs_no2_yr_hs_h_Log,hs_pm10_dy_hs_h_None,hs_pm10_wk_hs_h_None,...,hs_globalexp2_None,hs_smk_parents_None,h_distinvnear1_preg_Log,h_trafload_preg_pow1over3,h_trafnear_preg_pow1over3,hs_trafload_h_pow1over3,hs_trafnear_h_pow1over3,h_bro_preg_Log,h_clf_preg_Log,h_thm_preg_Log
0,1,0.896711,2.872304,25.948498,17.433798,2.530279,2.583284,2.612098,22.535828,20.850005,...,exposure,both,-5.629052,0.345830,21.213987,135.995760,24.914911,-2.975930,-0.536713,0.288073
1,2,0.892538,2.980008,25.897739,18.470850,1.928600,2.652479,2.761064,14.077763,29.141274,...,no exposure,neither,-5.045588,0.345830,22.649684,97.884416,27.905292,-2.975930,1.078750,3.195478
2,3,0.778723,3.056501,26.087347,18.711547,2.882591,2.591756,2.356163,46.859096,31.530981,...,no exposure,one,-4.204225,145.572759,25.549436,127.536143,29.299706,-1.604450,1.188758,3.023924
3,4,0.089056,3.089157,14.991380,16.409771,1.390750,2.456717,2.403247,29.817442,25.232778,...,exposure,one,-3.088010,187.120021,22.743850,114.868483,22.743850,2.748819,3.636813,3.839335
4,5,0.604781,3.848211,35.197296,14.889958,3.204449,3.499594,3.307663,29.817442,24.891465,...,no exposure,neither,-1.459250,158.489039,29.872070,205.774781,14.131454,4.262137,3.027843,4.572734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,1297,-0.099819,2.254548,35.827999,12.954850,1.635889,2.109734,2.393117,16.596676,22.231124,...,no exposure,one,-2.859290,141.413361,0.000000,0.000000,0.000000,1.854156,-4.419126,-0.815583
1297,1298,1.043402,3.819085,26.850889,15.318656,3.129574,3.281948,3.547043,19.266689,25.232778,...,no exposure,neither,-1.883697,0.345830,22.649684,72.862204,14.059610,4.781072,2.183305,4.808214
1298,1299,0.864024,2.872304,24.950790,18.562578,2.709897,2.664830,2.636456,43.036453,47.726555,...,no exposure,neither,-4.089047,82.098064,19.439435,86.124973,19.439435,-1.604450,-0.515838,0.337320
1299,1300,0.270579,3.288932,18.258001,13.714355,2.945648,3.724925,3.467167,13.025550,21.158041,...,no exposure,neither,-2.995108,55.909636,7.937005,163.491616,19.492218,-0.126698,2.862258,2.919491


In [8]:
# Columns retained for predicting pediatric metabolic health, grouped by domain.
exposure_features = [
    # Identifier column
    'ID',

    # Air pollution
    'h_no2_ratio_preg_Log',
    'h_pm25_ratio_preg_None',
    'hs_no2_yr_hs_h_Log',
    'hs_pm25_yr_hs_h_None',
    'hs_pm25abs_yr_hs_h_Log',

    # Urban and built environment
    'h_walkability_mean_preg_None',
    'hs_walkability_mean_h_None',
    'h_ndvi100_preg_None',
    'hs_ndvi100_h_None',

    # Socioeconomic
    'FAS_cat_None',
    'hs_contactfam_3cat_num_None',
    'hs_hm_pers_None',

    # Lifestyle (diet and activity)
    'h_fruit_preg_Ter',
    'h_fish_preg_Ter',
    'h_veg_preg_Ter',
    'hs_total_fruits_Ter',
    'hs_total_veg_Ter',
    'hs_total_meat_Ter',
    'hs_mvpa_prd_alt_None',

    # Endocrine disruptors / heavy metals
    'hs_pfoa_c_Log2',
    'hs_pfos_c_Log2',
    'hs_pbde47_cadj_Log2',
    'hs_as_c_Log2',
    'hs_cd_c_Log2',
    'hs_hg_c_Log2',
    'hs_mn_c_Log2',
]

# Keep only the listed columns (in list order); a missing column raises KeyError.
filtered_exposures = dataset_parent.loc[:, exposure_features]

# Write the subset to disk without the index so later cells can reload it.
filtered_exposures.to_csv('filtered_exposures.csv', index=False)

print("✅ Filtered exposures saved to 'filtered_exposures.csv'")


✅ Filtered exposures saved to 'filtered_exposures.csv'


In [9]:
# Reload the filtered table from disk to check what was written.
data = pd.read_csv(filepath_or_buffer='filtered_exposures.csv')

In [10]:
# Bare expression: the notebook renders the reloaded filtered frame as this cell's output.
data

Unnamed: 0,ID,h_no2_ratio_preg_Log,h_pm25_ratio_preg_None,hs_no2_yr_hs_h_Log,hs_pm25_yr_hs_h_None,hs_pm25abs_yr_hs_h_Log,h_walkability_mean_preg_None,hs_walkability_mean_h_None,h_ndvi100_preg_None,hs_ndvi100_h_None,...,hs_total_veg_Ter,hs_total_meat_Ter,hs_mvpa_prd_alt_None,hs_pfoa_c_Log2,hs_pfos_c_Log2,hs_pbde47_cadj_Log2,hs_as_c_Log2,hs_cd_c_Log2,hs_hg_c_Log2,hs_mn_c_Log2
0,1,2.872304,17.433798,2.612098,18.395672,0.321064,0.175,0.375,0.447003,0.465392,...,"(8.5,Inf]","(6,9]",47.892043,-0.053154,-0.383378,-8.972840,-4.282809,-4.140507,-2.152003,3.459432
1,2,2.980008,18.470850,2.761064,17.735640,0.281494,0.200,0.200,0.481251,0.523970,...,"(6,8.5]","(0,6]",31.831901,0.627456,-0.043164,-2.424321,-6.430280,-4.254289,-1.300448,2.881665
2,3,3.056501,18.711547,2.356163,16.808644,0.098720,0.150,0.250,0.457100,0.548659,...,"(8.5,Inf]","(9,Inf]",117.576830,0.838311,0.557685,-2.506691,-7.978291,-4.051698,-0.910502,2.998196
3,4,3.089157,16.409771,2.403247,12.729050,0.177725,0.350,0.525,0.287486,0.361057,...,"(8.5,Inf]","(0,6]",-2.033474,1.073065,3.065071,-3.446691,0.632268,-5.259045,1.480265,3.215679
4,5,3.848211,14.889958,3.307663,13.414330,0.220464,0.275,0.300,0.146271,0.427771,...,"(0,6]","(6,9]",-7.854284,0.503681,0.167947,-1.268688,0.650765,-3.752437,2.861955,2.879706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,1297,2.254548,12.954850,2.393117,13.630536,0.173676,0.200,0.275,0.213159,0.248622,...,"(8.5,Inf]","(0,6]",32.995693,0.767097,1.109848,-0.580756,1.722466,-3.158429,1.344828,2.643856
1297,1298,3.819085,15.318656,3.547043,13.414330,0.489045,0.325,0.450,0.169334,0.187567,...,"(6,8.5]","(9,Inf]",44.808196,0.589529,0.726324,-0.077749,1.731183,-3.803897,2.726831,2.929791
1298,1299,2.872304,18.562578,2.636456,18.152457,0.355094,0.150,0.200,0.506035,0.482565,...,"(6,8.5]","(9,Inf]",9.568474,0.235076,2.376664,-8.723587,-4.636304,-3.293359,-2.395929,3.218781
1299,1300,3.288932,13.714355,3.467167,11.779130,0.047575,0.350,0.475,0.330000,0.275326,...,"(8.5,Inf]","(0,6]",46.206956,0.521966,1.627522,-2.654748,0.731183,-3.000000,0.659925,3.102658


In [11]:
from sklearn.model_selection import train_test_split

# Re-read the filtered table from disk so this cell does not depend on kernel state.
filtered_exposures = pd.read_csv(filepath_or_buffer='filtered_exposures.csv')

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
train_expo, test_expo = train_test_split(
    filtered_exposures,
    random_state=42,
    test_size=0.10,
)

# Persist both partitions without the index column.
test_expo.to_csv('test_exposures.csv', index=False)
train_expo.to_csv('train_exposures.csv', index=False)

print("✅ Train/test split done and saved as 'train_exposures.csv' and 'test_exposures.csv'")


✅ Train/test split done and saved as 'train_exposures.csv' and 'test_exposures.csv'


## Risk Score Computation

In [12]:
import pandas as pd
import numpy as np

# Fixed cut-offs, all expressed on the natural (un-logged) scale.
NO2_LIMIT = 30      # µg/m³; log-scale columns are exponentiated before comparison
PM25_LIMIT = 5.5    # µg/m³; untransformed ("_None") columns are compared directly
NDVI_LIMIT = 0.3    # greenness below this counts as risk
MVPA_LIMIT = 60     # activity below this counts as risk

# Tertile-coded diet columns. They hold interval labels such as "(6,9]".
diet_cols = ['h_fruit_preg_Ter', 'h_fish_preg_Ter', 'h_veg_preg_Ter',
             'hs_total_fruits_Ter', 'hs_total_veg_Ter', 'hs_total_meat_Ter']
# Diet columns where the LOWEST tertile is the risk category.
low_intake_risk_cols = ['h_fruit_preg_Ter', 'h_veg_preg_Ter',
                        'hs_total_fruits_Ter', 'hs_total_veg_Ter']
# Diet columns where the HIGHEST tertile is the risk category.
high_intake_risk_cols = ['hs_total_meat_Ter']

# Toxicant columns: the top 25% of the reference distribution is high risk.
toxins = ['hs_pfoa_c_Log2', 'hs_pfos_c_Log2', 'hs_pbde47_cadj_Log2',
          'hs_as_c_Log2', 'hs_cd_c_Log2', 'hs_hg_c_Log2', 'hs_mn_c_Log2']


def _interval_lower_bound(labels):
    """Return the numeric lower bound of interval labels such as '(6,8.5]'.

    Values that are already numeric pass through unchanged. Anything that is
    neither numeric nor an interval label becomes NaN.
    """
    as_number = pd.to_numeric(labels, errors='coerce')
    parsed = pd.to_numeric(
        labels.astype(str).str.extract(r'^\s*[\(\[]\s*([^,\s]+)\s*,', expand=False),
        errors='coerce',
    )
    return as_number.fillna(parsed)


def _tertile_codes(labels, reference_labels):
    """Map tertile labels to ordinal codes 1..k, ordered by interval lower bound.

    The ordering is taken from ``reference_labels`` so that codes mean the same
    thing in every data set scored against that reference.

    Returns:
        (codes, k): the coded Series (NaN for labels absent from the reference
        or unparseable) and the number of distinct categories in the reference.
    """
    ordered = np.sort(_interval_lower_bound(reference_labels).dropna().unique())
    code_of = {bound: code for code, bound in enumerate(ordered, start=1)}
    return _interval_lower_bound(labels).map(code_of), len(ordered)


def compute_risk_score(frame, reference=None):
    """Return a copy of ``frame`` with an additive ``risk_score`` column.

    Each of the 25 scored exposures adds one point when it falls on the risky
    side of its cut-off. Missing values never add a point.

    Args:
        frame: exposures to score (columns as written by the filtering cell).
        reference: frame from which the data-derived cut-offs (medians,
            75th percentiles, tertile ordering) are taken. Defaults to
            ``frame`` itself.

    Returns:
        A new DataFrame. The socioeconomic, activity and toxicant columns are
        coerced to numeric, the diet columns are replaced by ordinal tertile
        codes, and ``risk_score`` is appended as the last column.
    """
    ref = frame if reference is None else reference
    scored = frame.copy()
    score = pd.Series(0, index=scored.index)

    def numeric(table, col):
        return pd.to_numeric(table[col], errors='coerce')

    # 1. Air pollution
    for col in ('h_no2_ratio_preg_Log', 'hs_no2_yr_hs_h_Log'):
        score += (np.exp(scored[col]) >= NO2_LIMIT).astype(int)

    # These two columns are NOT log-transformed (suffix "_None"), so they are
    # compared with the natural-scale cut-off. The previous threshold of 1.7
    # was the log of that cut-off and flagged essentially every row.
    for col in ('h_pm25_ratio_preg_None', 'hs_pm25_yr_hs_h_None'):
        score += (scored[col] >= PM25_LIMIT).astype(int)

    # NOTE(review): this column looks like PM2.5 absorbance rather than mass
    # concentration; the sample rows exponentiate to roughly 1-2, far below the
    # cut-off, so this item may never fire. Confirm the intended threshold.
    score += (np.exp(scored['hs_pm25abs_yr_hs_h_Log']) >= PM25_LIMIT).astype(int)

    # 2. Built environment
    for col in ('h_walkability_mean_preg_None', 'hs_walkability_mean_h_None'):
        score += (scored[col] < ref[col].median()).astype(int)
    for col in ('h_ndvi100_preg_None', 'hs_ndvi100_h_None'):
        score += (scored[col] < NDVI_LIMIT).astype(int)

    # 3. Socioeconomic
    # NOTE(review): if FAS_cat_None or hs_contactfam_3cat_num_None hold text
    # categories, the coercion below yields NaN and these items never score.
    # Verify the raw values.
    for col in ('FAS_cat_None', 'hs_contactfam_3cat_num_None', 'hs_hm_pers_None'):
        scored[col] = numeric(scored, col)
    score += (scored['FAS_cat_None'] < 2).astype(int)
    score += (scored['hs_contactfam_3cat_num_None'] < 2).astype(int)
    crowding_cut = numeric(ref, 'hs_hm_pers_None').quantile(0.75)
    score += (scored['hs_hm_pers_None'] > crowding_cut).astype(int)

    # 4. Diet and activity
    # The diet columns are interval labels; pd.to_numeric would turn every one
    # into NaN (silently disabling these items), so they are mapped to ordinal
    # tertile codes instead. With only three categories, the finest usable rule
    # is lowest tertile (fruit/veg) or highest tertile (meat).
    for col in diet_cols:
        scored[col], n_levels = _tertile_codes(scored[col], ref[col])
        if col in low_intake_risk_cols:
            score += (scored[col] == 1).astype(int)
        elif col in high_intake_risk_cols:
            score += (scored[col] == n_levels).astype(int)

    # NOTE(review): the rule assumes minutes per day, but the sample rows
    # contain negative values, so the unit should be confirmed.
    scored['hs_mvpa_prd_alt_None'] = numeric(scored, 'hs_mvpa_prd_alt_None')
    score += (scored['hs_mvpa_prd_alt_None'] < MVPA_LIMIT).astype(int)

    # 5. Toxins: above the reference 75th percentile is high risk
    for tox in toxins:
        scored[tox] = numeric(scored, tox)
        score += (scored[tox] > numeric(ref, tox).quantile(0.75)).astype(int)

    scored['risk_score'] = score
    return scored


# Score the training split against its own distribution and save the result.
train = compute_risk_score(pd.read_csv('train_exposures.csv'))
train.to_csv('train_with_risk_score_validated.csv', index=False)
print("✅ Biomedically validated risk score saved to 'train_with_risk_score_validated.csv'")


✅ Biomedically validated risk score saved to 'train_with_risk_score_validated.csv'


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [13]:
import pandas as pd
import numpy as np

# Everything needed for scoring is defined in this cell so that it runs on its
# own from a fresh kernel.

# Fixed cut-offs, all expressed on the natural (un-logged) scale.
NO2_LIMIT = 30      # µg/m³; log-scale columns are exponentiated before comparison
PM25_LIMIT = 5.5    # µg/m³; untransformed ("_None") columns are compared directly
NDVI_LIMIT = 0.3    # greenness below this counts as risk
MVPA_LIMIT = 60     # activity below this counts as risk

# Tertile-coded diet columns. They hold interval labels such as "(6,9]".
diet_cols = ['h_fruit_preg_Ter', 'h_fish_preg_Ter', 'h_veg_preg_Ter',
             'hs_total_fruits_Ter', 'hs_total_veg_Ter', 'hs_total_meat_Ter']
# Diet columns where the LOWEST tertile is the risk category.
low_intake_risk_cols = ['h_fruit_preg_Ter', 'h_veg_preg_Ter',
                        'hs_total_fruits_Ter', 'hs_total_veg_Ter']
# Diet columns where the HIGHEST tertile is the risk category.
high_intake_risk_cols = ['hs_total_meat_Ter']

# Toxicant columns: the top 25% of the reference distribution is high risk.
toxins = ['hs_pfoa_c_Log2', 'hs_pfos_c_Log2', 'hs_pbde47_cadj_Log2',
          'hs_as_c_Log2', 'hs_cd_c_Log2', 'hs_hg_c_Log2', 'hs_mn_c_Log2']


def _interval_lower_bound(labels):
    """Return the numeric lower bound of interval labels such as '(6,8.5]'.

    Values that are already numeric pass through unchanged. Anything that is
    neither numeric nor an interval label becomes NaN.
    """
    as_number = pd.to_numeric(labels, errors='coerce')
    parsed = pd.to_numeric(
        labels.astype(str).str.extract(r'^\s*[\(\[]\s*([^,\s]+)\s*,', expand=False),
        errors='coerce',
    )
    return as_number.fillna(parsed)


def _tertile_codes(labels, reference_labels):
    """Map tertile labels to ordinal codes 1..k, ordered by interval lower bound.

    The ordering is taken from ``reference_labels`` so that codes mean the same
    thing in every data set scored against that reference.

    Returns:
        (codes, k): the coded Series (NaN for labels absent from the reference
        or unparseable) and the number of distinct categories in the reference.
    """
    ordered = np.sort(_interval_lower_bound(reference_labels).dropna().unique())
    code_of = {bound: code for code, bound in enumerate(ordered, start=1)}
    return _interval_lower_bound(labels).map(code_of), len(ordered)


def compute_risk_score(frame, reference=None):
    """Return a copy of ``frame`` with an additive ``risk_score`` column.

    Each of the 25 scored exposures adds one point when it falls on the risky
    side of its cut-off. Missing values never add a point.

    Args:
        frame: exposures to score (columns as written by the filtering cell).
        reference: frame from which the data-derived cut-offs (medians,
            75th percentiles, tertile ordering) are taken. Defaults to
            ``frame`` itself.

    Returns:
        A new DataFrame. The socioeconomic, activity and toxicant columns are
        coerced to numeric, the diet columns are replaced by ordinal tertile
        codes, and ``risk_score`` is appended as the last column.
    """
    ref = frame if reference is None else reference
    scored = frame.copy()
    score = pd.Series(0, index=scored.index)

    def numeric(table, col):
        return pd.to_numeric(table[col], errors='coerce')

    # 1. Air pollution
    for col in ('h_no2_ratio_preg_Log', 'hs_no2_yr_hs_h_Log'):
        score += (np.exp(scored[col]) >= NO2_LIMIT).astype(int)

    # These two columns are NOT log-transformed (suffix "_None"), so they are
    # compared with the natural-scale cut-off. The previous threshold of 1.7
    # was the log of that cut-off and flagged essentially every row.
    for col in ('h_pm25_ratio_preg_None', 'hs_pm25_yr_hs_h_None'):
        score += (scored[col] >= PM25_LIMIT).astype(int)

    # NOTE(review): this column looks like PM2.5 absorbance rather than mass
    # concentration; the sample rows exponentiate to roughly 1-2, far below the
    # cut-off, so this item may never fire. Confirm the intended threshold.
    score += (np.exp(scored['hs_pm25abs_yr_hs_h_Log']) >= PM25_LIMIT).astype(int)

    # 2. Built environment
    for col in ('h_walkability_mean_preg_None', 'hs_walkability_mean_h_None'):
        score += (scored[col] < ref[col].median()).astype(int)
    for col in ('h_ndvi100_preg_None', 'hs_ndvi100_h_None'):
        score += (scored[col] < NDVI_LIMIT).astype(int)

    # 3. Socioeconomic
    # NOTE(review): if FAS_cat_None or hs_contactfam_3cat_num_None hold text
    # categories, the coercion below yields NaN and these items never score.
    # Verify the raw values.
    for col in ('FAS_cat_None', 'hs_contactfam_3cat_num_None', 'hs_hm_pers_None'):
        scored[col] = numeric(scored, col)
    score += (scored['FAS_cat_None'] < 2).astype(int)
    score += (scored['hs_contactfam_3cat_num_None'] < 2).astype(int)
    crowding_cut = numeric(ref, 'hs_hm_pers_None').quantile(0.75)
    score += (scored['hs_hm_pers_None'] > crowding_cut).astype(int)

    # 4. Diet and activity
    # The diet columns are interval labels; pd.to_numeric would turn every one
    # into NaN (silently disabling these items), so they are mapped to ordinal
    # tertile codes instead. With only three categories, the finest usable rule
    # is lowest tertile (fruit/veg) or highest tertile (meat).
    for col in diet_cols:
        scored[col], n_levels = _tertile_codes(scored[col], ref[col])
        if col in low_intake_risk_cols:
            score += (scored[col] == 1).astype(int)
        elif col in high_intake_risk_cols:
            score += (scored[col] == n_levels).astype(int)

    # NOTE(review): the rule assumes minutes per day, but the sample rows
    # contain negative values, so the unit should be confirmed.
    scored['hs_mvpa_prd_alt_None'] = numeric(scored, 'hs_mvpa_prd_alt_None')
    score += (scored['hs_mvpa_prd_alt_None'] < MVPA_LIMIT).astype(int)

    # 5. Toxins: above the reference 75th percentile is high risk
    for tox in toxins:
        scored[tox] = numeric(scored, tox)
        score += (scored[tox] > numeric(ref, tox).quantile(0.75)).astype(int)

    scored['risk_score'] = score
    return scored


# Score the held-out split with cut-offs taken from the TRAINING split, so a
# given exposure value earns the same points in both sets and no statistic is
# estimated from the test data.
test = compute_risk_score(
    pd.read_csv('test_exposures.csv'),
    reference=pd.read_csv('train_exposures.csv'),
)

# Save updated test set
test.to_csv('test_with_risk_score_validated.csv', index=False)
print("✅ Biomedically validated risk score saved to 'test_with_risk_score_validated.csv'")


✅ Biomedically validated risk score saved to 'test_with_risk_score_validated.csv'


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
