# Asess Healthy Eating Index Scores

## Set Working Directory

In [None]:
import os
import re

try:
  from google.colab import drive
  drive.mount('/content/drive')
  os.chdir('/content/drive/MyDrive/ds1_nhanes/')
except:
  from pathlib import Path
  if not re.search(r'ds1_nhanes$', str(os.getcwd())):
    os.chdir(Path(os.getcwd()).parent)

print(os.getcwd())

Mounted at /content/drive
/content/drive/MyDrive/ds1_nhanes


## Load Libraries and Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# read in cleaned dataframe from folder

# df = pd.read_csv('data/clean/nhanes_2017_2023_forHEI.csv')
df = pd.read_csv('data/clean/nhanes_2017_2023_clustered.csv')


print(df.head())
# print columns
print(df.columns)


     SEQN  weight_2d     grams   satfat  monofat  polyfat  sodium  \
0  109266    2994.10   9866.51   44.307   45.510   31.964    5440   
1  109271    7988.83  13587.90  107.088  113.156   68.257   11603   
2  109273   28255.51   3707.63   42.654   39.373   24.282    4939   
3  109274    6187.41   5679.36   45.402   55.548   39.762   10156   
4  109282   25233.38   5262.90  101.404   60.175   21.043    6286   

   f_total_(cup_eq)  f_citmlb_(cup_eq)  f_other_(cup_eq)  ...  gender  age  \
0           2.05053            1.67730          0.373230  ...  Female   29   
1           1.35389            0.00000          1.329896  ...    Male   49   
2           0.00000            0.00000          0.000000  ...    Male   36   
3           0.83470            0.83160          0.000000  ...    Male   68   
4           2.87094            0.03402          1.855420  ...    Male   76   

             race                  education  income_ratio  total_cholesterol  \
0  Other or Multi  College graduate

In [None]:
# Create column for whole fruit
# sum of Citrus, Melons and Berries with Other Fruits to generate Whole Fruit (non-juice) consumption

df['fwholefrt'] = df['f_citmlb_(cup_eq)'] + df['f_other_(cup_eq)']

In [None]:
# Create column for Total Vegetables
# sum of vegetables and legumes

df['vtotalleg'] = df['v_total_(cup_eq)'] + df['v_legumes_(cup_eq)']

In [None]:
# Create column for Greens and Beans
# sum of dark green begetables and legumes

df['vdrkgrleg'] = df['v_drkgr_(cup_eq)'] + df['v_legumes_(cup_eq)']

In [None]:
# Create column for Total Protein Foods
# sum of all animal and plant proteins including meat, poultry, fish, eggs, nuts, seeds, soy, legumes

df['pfallprotleg'] = df['pf_mps_total_(oz_eq)'] + df['pf_eggs_(oz_eq)'] + df['pf_nutsds_(oz_eq)'] + df['pf_soy_(oz_eq)'] + df['pf_legumes_(oz_eq)']

In [None]:
# Create column for Seafood and Plant Proteins
# sum of all fish and plant proteins, including fish, nuts, seeds, soy, and legumes

df['pfseaplantleg'] = df['pf_seafd_hi_(oz_eq)'] + df['pf_seafd_low_(oz_eq)'] + df['pf_nutsds_(oz_eq)'] + df['pf_soy_(oz_eq)'] + df['pf_legumes_(oz_eq)']

In [None]:
# Create column for Total Fatty Acids
# sum of monounsaturated and polyunsaturated fatty acids
df['monopoly'] = df['oils_(grams)'] + df['solid_fats_(grams)']

In [None]:
# Fatty acid variable
# (PUFA + MUFA) / SFA
df['fatty_acid_ratio'] = (df['polyfat'] + df['monofat']) / df['satfat']

In [None]:
print(df.columns)

Index(['SEQN', 'weight_2d', 'grams', 'satfat', 'monofat', 'polyfat', 'sodium',
       'f_total_(cup_eq)', 'f_citmlb_(cup_eq)', 'f_other_(cup_eq)',
       'f_juice_(cup_eq)', 'v_total_(cup_eq)', 'v_drkgr_(cup_eq)',
       'v_redor_total_(cup_eq)', 'v_redor_tomato_(cup_eq)',
       'v_redor_other_(cup_eq)', 'v_starchy_total_(cup_eq)',
       'v_starchy_potato_(cup_eq)', 'v_starchy_other_(cup_eq)',
       'v_other_(cup_eq)', 'v_legumes_(cup_eq)', 'g_total_(oz_eq)',
       'g_whole_(oz_eq)', 'g_refined_(oz_eq)', 'pf_total_(oz_eq)',
       'pf_mps_total_(oz_eq)', 'pf_meat_(oz_eq)', 'pf_curedmeat_(oz_eq)',
       'pf_organ_(oz_eq)', 'pf_poult_(oz_eq)', 'pf_seafd_hi_(oz_eq)',
       'pf_seafd_low_(oz_eq)', 'pf_eggs_(oz_eq)', 'pf_soy_(oz_eq)',
       'pf_nutsds_(oz_eq)', 'pf_legumes_(oz_eq)', 'd_total_(cup_eq)',
       'd_milk_(cup_eq)', 'd_yogurt_(cup_eq)', 'd_cheese_(cup_eq)',
       'oils_(grams)', 'solid_fats_(grams)', 'add_sugars_(tsp_eq)',
       'a_drinks_(no._of_drinks)', 'kcal_d1', 'k

## Divide by kcal

In [None]:
df['total_fruit_density'] = df['f_total_(cup_eq)'] / df['kcal_2day'] * 1000
df['whole_fruit_density'] = df['fwholefrt'] / df['kcal_2day'] * 1000
df['total_vegetables_density'] = df['vtotalleg'] / df['kcal_2day'] * 1000
df['greens_and_beans_density'] = df['vdrkgrleg'] / df['kcal_2day'] * 1000
df['whole_grains_density'] = df['g_whole_(oz_eq)'] / df['kcal_2day'] * 1000
df['dairy_density'] = df['d_total_(cup_eq)'] / df['kcal_2day'] * 1000
df['total_protein_density'] = df['pfallprotleg'] / df['kcal_2day'] * 1000
df['sea_plant_protein_density'] = df['pfseaplantleg'] / df['kcal_2day'] * 1000
df['refined_grains_density'] = df['g_refined_(oz_eq)'] / df['kcal_2day'] * 1000
df['sodium_density'] = df['sodium'] / df['kcal_2day']
# NOTE: not multiplying sodium by 1000

df['added_sugar_kcal'] = df['add_sugars_(tsp_eq)'] * 16
df['fatty_acid_kcal'] = df['satfat'] * 9

In [None]:
df.loc[:, 'total_fruit_density':'fatty_acid_kcal'].head()

Unnamed: 0,total_fruit_density,whole_fruit_density,total_vegetables_density,greens_and_beans_density,whole_grains_density,dairy_density,total_protein_density,sea_plant_protein_density,refined_grains_density,sodium_density,added_sugar_kcal,fatty_acid_kcal
0,0.570543,0.570543,1.249988,0.128068,1.059373,1.033466,1.228868,0.702905,3.153831,1.513634,296.897152,398.763
1,0.214937,0.211128,0.588022,0.0,0.020003,0.686047,3.101699,0.01143,3.073916,1.842038,772.05296,963.792
2,0.0,0.0,0.840237,0.048161,0.0,1.151529,2.304807,0.0,3.022084,1.398358,768.041184,383.886
3,0.22639,0.225549,1.356596,0.0,0.225549,0.289983,2.027574,0.0,5.587152,2.754543,152.444704,408.618
4,0.57661,0.379482,0.569813,0.0,0.559148,0.702428,1.450382,0.108606,2.969437,1.262503,751.81448,912.636


In [None]:
df.columns

Index(['SEQN', 'weight_2d', 'grams', 'satfat', 'monofat', 'polyfat', 'sodium',
       'f_total_(cup_eq)', 'f_citmlb_(cup_eq)', 'f_other_(cup_eq)',
       'f_juice_(cup_eq)', 'v_total_(cup_eq)', 'v_drkgr_(cup_eq)',
       'v_redor_total_(cup_eq)', 'v_redor_tomato_(cup_eq)',
       'v_redor_other_(cup_eq)', 'v_starchy_total_(cup_eq)',
       'v_starchy_potato_(cup_eq)', 'v_starchy_other_(cup_eq)',
       'v_other_(cup_eq)', 'v_legumes_(cup_eq)', 'g_total_(oz_eq)',
       'g_whole_(oz_eq)', 'g_refined_(oz_eq)', 'pf_total_(oz_eq)',
       'pf_mps_total_(oz_eq)', 'pf_meat_(oz_eq)', 'pf_curedmeat_(oz_eq)',
       'pf_organ_(oz_eq)', 'pf_poult_(oz_eq)', 'pf_seafd_hi_(oz_eq)',
       'pf_seafd_low_(oz_eq)', 'pf_eggs_(oz_eq)', 'pf_soy_(oz_eq)',
       'pf_nutsds_(oz_eq)', 'pf_legumes_(oz_eq)', 'd_total_(cup_eq)',
       'd_milk_(cup_eq)', 'd_yogurt_(cup_eq)', 'd_cheese_(cup_eq)',
       'oils_(grams)', 'solid_fats_(grams)', 'add_sugars_(tsp_eq)',
       'a_drinks_(no._of_drinks)', 'kcal_d1', 'k

## Scale with Min and Max

This gives us scores for each of 13 categories

In [None]:
# 0.8 is perfect score of 5
# 0 is a worst score of 0
# if total fruit density is greater than 0.8, fruit score is 5, else it is / 0.8
df['total_fruit_score'] = df['total_fruit_density'].apply(lambda x: 5 if x >= 0.8 else x / 0.8)

def get_subscore(df, col_name, max_val, constant):
    col = df[col_name]
    return np.where(col >= constant, max_val, col / constant)

df['whole_fruit_score'] = get_subscore(df, 'whole_fruit_density', 5, 0.4)
df['total_vegetables_score'] = get_subscore(df, 'total_vegetables_density', 5, 1.1)
df['greens_and_beans_score'] = get_subscore(df, 'greens_and_beans_density', 5, 0.2)
df['whole_grains_score'] = get_subscore(df, 'whole_grains_density', 10, 1.5)
df['dairy_score'] = get_subscore(df, 'dairy_density', 10, 1.3)
df['total_protein_score'] = get_subscore(df, 'total_protein_density', 5, 2.5)
df['sea_plant_protein_score'] = get_subscore(df, 'sea_plant_protein_density', 5, 0.8)
df['fatty_acid_adequacy_score'] = df['fatty_acid_ratio'].apply(lambda x: 10 if x >= 2.5 else (10 * (x - 1.2) / 1.3))

# moderation components
df['refined_grains_score'] = df['refined_grains_density'].apply(lambda x: 10 if x <= 1.8 else 0 if x > 4.3 else 10 - ((10 * (x - 1.8)) / 2.5))
df['sodium_score'] = df['sodium_density'].apply(lambda x: 10 if x <= 1.1 else 0 if x > 2 else 10 - ((10 * (x - 1.1)) / 0.9))

#
df['added_sugar_pct'] = (df['added_sugar_kcal'] / df['kcal_2day']) * 100
df['added_sugar_score'] = df['added_sugar_pct'].apply(lambda x: 0 if x >= 26 else 10 if x < 6.5 else 10 - ((10 * (x - 6.5)) / 19.5))

df['fatty_acid_mod_pct'] = (df['fatty_acid_kcal'] / df['kcal_2day']) * 100
df['fatty_acid_mod_score'] = df['fatty_acid_mod_pct'].apply(lambda x: 0 if x >= 16 else 10 if x < 8 else 10 - ((10 * (x - 8)) / 8))

df.loc[:, df.columns.str.contains('score')]

Unnamed: 0,total_fruit_score,whole_fruit_score,total_vegetables_score,greens_and_beans_score,whole_grains_score,dairy_score,total_protein_score,sea_plant_protein_score,fatty_acid_adequacy_score,refined_grains_score,sodium_score,added_sugar_score,fatty_acid_mod_score
0,0.713178,5.000000,5.000000,0.640338,0.706249,0.794974,0.491547,0.878631,4.219788,4.584677,5.404069,9.096969,6.130947
1,0.268672,0.527820,0.534565,0.000000,0.013335,0.527728,5.000000,0.014288,3.800426,4.904334,1.755129,7.047818,0.874107
2,0.000000,0.000000,0.763852,0.240804,0.000000,0.885791,0.921923,0.000000,2.248901,5.111665,6.684913,2.181938,6.414001
3,0.282988,0.563873,5.000000,0.000000,0.150366,0.223064,0.811030,0.000000,6.917283,0.000000,0.000000,10.000000,6.146664
4,0.720762,0.948705,0.518012,0.000000,0.372766,0.540329,0.580153,0.135758,-3.069732,5.322253,8.194417,5.589893,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11389,0.892035,5.000000,0.336192,0.000000,0.000000,0.298654,0.989149,5.000000,8.198859,10.000000,7.402841,2.329810,8.652898
11390,5.000000,0.518232,0.579098,0.644591,10.000000,0.952441,5.000000,0.673029,1.369992,4.234961,0.000000,10.000000,7.013398
11391,0.230810,0.423540,0.828499,5.000000,0.502867,0.812300,0.561124,0.085693,10.000000,6.243107,3.719312,5.570151,8.943370
11392,0.000000,0.000000,0.924605,0.000000,0.000000,0.919422,0.596292,0.837595,0.714358,0.000000,1.222552,10.000000,2.915478


## Add scores to get total

In [None]:
df.columns

Index(['SEQN', 'weight_2d', 'grams', 'satfat', 'monofat', 'polyfat', 'sodium',
       'f_total_(cup_eq)', 'f_citmlb_(cup_eq)', 'f_other_(cup_eq)',
       'f_juice_(cup_eq)', 'v_total_(cup_eq)', 'v_drkgr_(cup_eq)',
       'v_redor_total_(cup_eq)', 'v_redor_tomato_(cup_eq)',
       'v_redor_other_(cup_eq)', 'v_starchy_total_(cup_eq)',
       'v_starchy_potato_(cup_eq)', 'v_starchy_other_(cup_eq)',
       'v_other_(cup_eq)', 'v_legumes_(cup_eq)', 'g_total_(oz_eq)',
       'g_whole_(oz_eq)', 'g_refined_(oz_eq)', 'pf_total_(oz_eq)',
       'pf_mps_total_(oz_eq)', 'pf_meat_(oz_eq)', 'pf_curedmeat_(oz_eq)',
       'pf_organ_(oz_eq)', 'pf_poult_(oz_eq)', 'pf_seafd_hi_(oz_eq)',
       'pf_seafd_low_(oz_eq)', 'pf_eggs_(oz_eq)', 'pf_soy_(oz_eq)',
       'pf_nutsds_(oz_eq)', 'pf_legumes_(oz_eq)', 'd_total_(cup_eq)',
       'd_milk_(cup_eq)', 'd_yogurt_(cup_eq)', 'd_cheese_(cup_eq)',
       'oils_(grams)', 'solid_fats_(grams)', 'add_sugars_(tsp_eq)',
       'a_drinks_(no._of_drinks)', 'kcal_d1', 'k

In [None]:
df['hei_score'] = df.loc[:, df.columns.str.contains('score')].sum(axis=1)

print(df['hei_score'].min())
print(df['hei_score'].max())
df.loc[:, df.columns.str.contains('score')]

5.865052113199992
94.91008956059333


Unnamed: 0,total_fruit_score,whole_fruit_score,total_vegetables_score,greens_and_beans_score,whole_grains_score,dairy_score,total_protein_score,sea_plant_protein_score,fatty_acid_adequacy_score,refined_grains_score,sodium_score,added_sugar_score,fatty_acid_mod_score,hei_score
0,0.713178,5.000000,5.000000,0.640338,0.706249,0.794974,0.491547,0.878631,4.219788,4.584677,5.404069,9.096969,6.130947,43.661367
1,0.268672,0.527820,0.534565,0.000000,0.013335,0.527728,5.000000,0.014288,3.800426,4.904334,1.755129,7.047818,0.874107,25.268223
2,0.000000,0.000000,0.763852,0.240804,0.000000,0.885791,0.921923,0.000000,2.248901,5.111665,6.684913,2.181938,6.414001,25.453787
3,0.282988,0.563873,5.000000,0.000000,0.150366,0.223064,0.811030,0.000000,6.917283,0.000000,0.000000,10.000000,6.146664,30.095266
4,0.720762,0.948705,0.518012,0.000000,0.372766,0.540329,0.580153,0.135758,-3.069732,5.322253,8.194417,5.589893,0.000000,19.853316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11389,0.892035,5.000000,0.336192,0.000000,0.000000,0.298654,0.989149,5.000000,8.198859,10.000000,7.402841,2.329810,8.652898,49.100436
11390,5.000000,0.518232,0.579098,0.644591,10.000000,0.952441,5.000000,0.673029,1.369992,4.234961,0.000000,10.000000,7.013398,45.985743
11391,0.230810,0.423540,0.828499,5.000000,0.502867,0.812300,0.561124,0.085693,10.000000,6.243107,3.719312,5.570151,8.943370,42.920773
11392,0.000000,0.000000,0.924605,0.000000,0.000000,0.919422,0.596292,0.837595,0.714358,0.000000,1.222552,10.000000,2.915478,18.130301


## Last Minute Wrangling

Better off in wrangle script but alas

In [None]:
df.columns

Index(['SEQN', 'weight_2d', 'grams', 'satfat', 'monofat', 'polyfat', 'sodium',
       'f_total_(cup_eq)', 'f_citmlb_(cup_eq)', 'f_other_(cup_eq)',
       'f_juice_(cup_eq)', 'v_total_(cup_eq)', 'v_drkgr_(cup_eq)',
       'v_redor_total_(cup_eq)', 'v_redor_tomato_(cup_eq)',
       'v_redor_other_(cup_eq)', 'v_starchy_total_(cup_eq)',
       'v_starchy_potato_(cup_eq)', 'v_starchy_other_(cup_eq)',
       'v_other_(cup_eq)', 'v_legumes_(cup_eq)', 'g_total_(oz_eq)',
       'g_whole_(oz_eq)', 'g_refined_(oz_eq)', 'pf_total_(oz_eq)',
       'pf_mps_total_(oz_eq)', 'pf_meat_(oz_eq)', 'pf_curedmeat_(oz_eq)',
       'pf_organ_(oz_eq)', 'pf_poult_(oz_eq)', 'pf_seafd_hi_(oz_eq)',
       'pf_seafd_low_(oz_eq)', 'pf_eggs_(oz_eq)', 'pf_soy_(oz_eq)',
       'pf_nutsds_(oz_eq)', 'pf_legumes_(oz_eq)', 'd_total_(cup_eq)',
       'd_milk_(cup_eq)', 'd_yogurt_(cup_eq)', 'd_cheese_(cup_eq)',
       'oils_(grams)', 'solid_fats_(grams)', 'add_sugars_(tsp_eq)',
       'a_drinks_(no._of_drinks)', 'kcal_d1', 'k

In [None]:
# Keep only the columns we are interested in for analysis
# Also throwing in alcohol out of curiosity
cols = [
    'SEQN',
    'weight_2d',
    'pf_total_(oz_eq)',
    'pf_legumes_(oz_eq)',
    'pf_nutsds_(oz_eq)',
    'pf_soy_(oz_eq)',
    'psu',
    'strata',
    'gender',
    'age',
    'race',
    'education',
    'income_ratio',
    'total_cholesterol',
    'blood_mercury',
    'avg_systolic_bp',
    'avg_diastolic_bp',
    'cluster',
    'hei_score',
    'a_drinks_(no._of_drinks)',
    'kcal_2day'
]

# Also add HEI component densities to explore our clusters
hei_components = [col for col in df.columns if col.endswith('_density')] \
  + ['fatty_acid_ratio'] + ['added_sugar_pct'] + ['fatty_acid_mod_pct']

# Put them together
all_cols = cols + hei_components
print(all_cols)

# Reduce df to those cols
df = df.loc[:, all_cols]
df.columns

['SEQN', 'weight_2d', 'pf_total_(oz_eq)', 'pf_legumes_(oz_eq)', 'pf_nutsds_(oz_eq)', 'pf_soy_(oz_eq)', 'psu', 'strata', 'gender', 'age', 'race', 'education', 'income_ratio', 'total_cholesterol', 'blood_mercury', 'avg_systolic_bp', 'avg_diastolic_bp', 'cluster', 'hei_score', 'a_drinks_(no._of_drinks)', 'kcal_2day', 'total_fruit_density', 'whole_fruit_density', 'total_vegetables_density', 'greens_and_beans_density', 'whole_grains_density', 'dairy_density', 'total_protein_density', 'sea_plant_protein_density', 'refined_grains_density', 'sodium_density', 'fatty_acid_ratio', 'added_sugar_pct', 'fatty_acid_mod_pct']


Index(['SEQN', 'weight_2d', 'pf_total_(oz_eq)', 'pf_legumes_(oz_eq)',
       'pf_nutsds_(oz_eq)', 'pf_soy_(oz_eq)', 'psu', 'strata', 'gender', 'age',
       'race', 'education', 'income_ratio', 'total_cholesterol',
       'blood_mercury', 'avg_systolic_bp', 'avg_diastolic_bp', 'cluster',
       'hei_score', 'a_drinks_(no._of_drinks)', 'kcal_2day',
       'total_fruit_density', 'whole_fruit_density',
       'total_vegetables_density', 'greens_and_beans_density',
       'whole_grains_density', 'dairy_density', 'total_protein_density',
       'sea_plant_protein_density', 'refined_grains_density', 'sodium_density',
       'fatty_acid_ratio', 'added_sugar_pct', 'fatty_acid_mod_pct'],
      dtype='object')

Change clusters 0-3 to 1-4

In [None]:
df['cluster'].value_counts()

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
2,4708
1,3255
3,2295
0,1136


In [None]:
df['cluster'] = df['cluster'] + 1
df['cluster'].value_counts()

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
3,4708
2,3255
4,2295
1,1136


#### Proportion of Protein from Plants

In [None]:
# Get proportion of protein from all plants, as well as each source individually
df['prop_plant_protein'] = (df['pf_legumes_(oz_eq)'] + df['pf_nutsds_(oz_eq)'] + df['pf_soy_(oz_eq)']) / (df['pf_total_(oz_eq)'] + df['pf_legumes_(oz_eq)'])
df['prop_legume_protein'] = df['pf_legumes_(oz_eq)'] / (df['pf_total_(oz_eq)'] + df['pf_legumes_(oz_eq)'])
df['prop_soy_protein'] = df['pf_nutsds_(oz_eq)'] / (df['pf_total_(oz_eq)'] + df['pf_legumes_(oz_eq)'])
df['prop_nutseed_protein'] = df['pf_soy_(oz_eq)'] / (df['pf_total_(oz_eq)'] + df['pf_legumes_(oz_eq)'])
df.loc[:, df.columns.str.contains('^prop_')].describe()

Unnamed: 0,prop_plant_protein,prop_legume_protein,prop_soy_protein,prop_nutseed_protein
count,11378.0,11378.0,11378.0,11378.0
mean,0.195182,0.071279,0.108954,0.014949
std,0.236676,0.151533,0.175667,0.065954
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.107587,0.0,0.013357,0.0
75%,0.309851,0.071906,0.1549,0.0
max,1.0,1.0,1.0,1.0


In [None]:
# solve for protein density for each type of proteins source
df['pf_legumes_density'] = df['pf_legumes_(oz_eq)'] / df['kcal_2day'] * 1000
df['pf_nutsds_density'] = df['pf_nutsds_(oz_eq)'] / df['kcal_2day'] * 1000
df['pf_soy_density'] = df['pf_soy_(oz_eq)'] / df['kcal_2day'] * 1000



In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11394 entries, 0 to 11393
Data columns (total 41 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   SEQN                       11394 non-null  int64  
 1   weight_2d                  11394 non-null  float64
 2   pf_total_(oz_eq)           11394 non-null  float64
 3   pf_legumes_(oz_eq)         11394 non-null  float64
 4   pf_nutsds_(oz_eq)          11394 non-null  float64
 5   pf_soy_(oz_eq)             11394 non-null  float64
 6   psu                        11394 non-null  int64  
 7   strata                     11394 non-null  int64  
 8   gender                     11394 non-null  object 
 9   age                        11394 non-null  int64  
 10  race                       11394 non-null  object 
 11  education                  11394 non-null  object 
 12  income_ratio               11394 non-null  float64
 13  total_cholesterol          11394 non-null  int

### Fix Racial Injustice

This is the ugliest thing I've ever done. Bringing the race category from the clean.csv and joining here to avoid running through project because Colab is the worst thing I have ever experienced.

In [None]:
new_demos = pd.read_csv('data/clean/nhanes_2017_2023_clean.csv')
new_demos = new_demos[['SEQN', 'race']]
new_demos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11394 entries, 0 to 11393
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SEQN    11394 non-null  int64 
 1   race    11394 non-null  object
dtypes: int64(1), object(1)
memory usage: 178.2+ KB


In [None]:
# Replace demos in actual DF with those from clean
df.drop(columns=['race'], inplace=True)
df = df.merge(new_demos, on='SEQN', how='left')
print(df.info())
print(df['race'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11394 entries, 0 to 11393
Data columns (total 41 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   SEQN                       11394 non-null  int64  
 1   weight_2d                  11394 non-null  float64
 2   pf_total_(oz_eq)           11394 non-null  float64
 3   pf_legumes_(oz_eq)         11394 non-null  float64
 4   pf_nutsds_(oz_eq)          11394 non-null  float64
 5   pf_soy_(oz_eq)             11394 non-null  float64
 6   psu                        11394 non-null  int64  
 7   strata                     11394 non-null  int64  
 8   gender                     11394 non-null  object 
 9   age                        11394 non-null  int64  
 10  education                  11394 non-null  object 
 11  income_ratio               11394 non-null  float64
 12  total_cholesterol          11394 non-null  int64  
 13  blood_mercury              11394 non-null  flo

### Save

In [None]:
df.to_csv('data/clean/nhanes_2017_2023_hei.csv', index=False)