In [1]:
import pandas as pd

In [3]:
# Read the raw Kenya survey export into a DataFrame.
raw_csv_path = "kenya_rsrc.csv"
kenya = pd.read_csv(raw_csv_path)

In [4]:
# Render the raw frame as the cell output for a first look at its columns and
# missing values (pandas truncates the rendering of large frames).
kenya

Unnamed: 0,Community_Id,Household_Id,Participant_Id,Timepoint,Household_Observation_Id,Observation_Id,Age (days) [EUPATH_0000579],Bicycle [ENVO_01000614],"BMI-for-age z-score, using median weight and median length or height [EUPATH_0035071]","Bruising during the last 2 days, caregiver report [EUPATH_0035108]",...,Television [ENVO_01000579],Time to water source (min) [EUPATH_0037013],Underweight [EUPATH_0035066],Wall material type [EUPATH_0025167],Wasted [EUPATH_0035064],Water and soap at hand washing location [EUPATH_0037012],Week [EUPATH_0035096],"Weight-for-age z-score, using median weight [EUPATH_0035073]","Weight-for-length or -height z-score, using median weight and median length or height [EUPATH_0035069]",Year [EUPATH_0010374]
0,c_2253550,h_22242830,1,0,ho_22242830_0,o_1_0,,,,,...,,,,,,No,,,,
1,c_2253550,h_22242830,1,1,ho_22242830_1,o_1_1,,,,,...,,,,,,No,,,,
2,c_2253550,h_22242830,1,2,ho_22242830_2,o_1_2,,,,,...,,,,,,,,,,
3,c_2353580,h_23243130,10,0,ho_23243130_0,o_10_0,,,,,...,,,,,,No,,,,
4,c_2353580,h_23243130,10,1,,o_10_1,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40717,c_7296540,h_72672730,98,1,ho_72672730_1,o_98_1,,,,,...,,,,,,,,,,
40718,c_7296540,h_72672730,98,2,,o_98_2,,,,,...,,,,,,,,,,
40719,c_7311530,h_72822630,99,0,ho_72822630_0,o_99_0,,,,,...,,,,,,No,,,,
40720,c_7311530,h_72822630,99,1,ho_72822630_1,o_99_1,,,,,...,,,,,,,,,,


# Extracting Observations

Observations are features that were generally recorded during the baseline survey (`Timepoint == 0`) and can be measured before childbirth.

In [None]:
# Columns kept as baseline observations: the participant key, the timepoint
# used for filtering, and the predictor variables.
observations = [
    "Participant_Id",
    "Timepoint",
    "Median mother's height (cm) [EUPATH_0035110]",
    "Mother's age (years) [EUPATH_0035109]",
    "Mother's education level [EUPATH_0036220]",
    "Multiple birth [EUPATH_0035113]",
    "Sex [PATO_0000047]",
    "Household hunger scale (HHS) [EUPATH_0037006]",
    "Persons 18yrs and younger living in house [EUPATH_0035045]",
    "Cluster study arm [EUPATH_0044124]",
]

In [6]:
# Project the raw frame onto the observation columns (all rows kept).
observations_df = kenya.loc[:, observations]

In [7]:
# Keep only baseline rows (Timepoint 0) and discard any row with a missing value.
is_baseline = observations_df["Timepoint"] == 0
observations_df = observations_df[is_baseline].dropna()

In [8]:
# Show the observation rows left after the Timepoint == 0 filter and dropna().
observations_df

Unnamed: 0,Participant_Id,Timepoint,Median mother's height (cm) [EUPATH_0035110],Mother's age (years) [EUPATH_0035109],Mother's education level [EUPATH_0036220],Multiple birth [EUPATH_0035113],Sex [PATO_0000047],Household hunger scale (HHS) [EUPATH_0037006],Persons 18yrs and younger living in house [EUPATH_0035045],Cluster study arm [EUPATH_0044124]
222,2003043530,0,147.7,19.89,1. Incomplete primary,No,Male,1. Little to none,1.0,"Nutrition, water, sanitation, and handwashing"
225,2003044530,0,162.0,27.59,1. Incomplete primary,No,Female,1. Little to none,5.0,"Nutrition, water, sanitation, and handwashing"
228,2003053530,0,157.5,27.72,3. Any secondary,No,Female,1. Little to none,4.0,"Nutrition, water, sanitation, and handwashing"
237,2003063530,0,159.5,27.26,3. Any secondary,No,Female,3. Severe,2.0,"Nutrition, water, sanitation, and handwashing"
246,2003093530,0,161.0,41.36,1. Incomplete primary,No,Male,1. Little to none,6.0,"Nutrition, water, sanitation, and handwashing"
...,...,...,...,...,...,...,...,...,...,...
40623,7913093530,0,153.5,22.85,2. Complete primary,No,Female,1. Little to none,0.0,Active control
40629,7913103530,0,160.9,30.26,1. Incomplete primary,No,Female,1. Little to none,2.0,Active control
40638,7913123530,0,153.9,28.42,2. Complete primary,No,Male,1. Little to none,2.0,Active control
40641,7913133530,0,150.1,20.79,1. Incomplete primary,No,Male,1. Little to none,0.0,Active control


# Asset Scores

This section extracts the baseline household asset variables that serve as indicators of socioeconomic wealth.

In [9]:
# Columns used for the asset analysis: the participant key, the timepoint used
# for filtering, the household asset indicators/counts, and the column that
# distinguishes the target child from siblings/neighbors.
asset_vars = [
    "Participant_Id",
    "Timepoint",
    "Bicycle [ENVO_01000614]",
    "Car or truck [EUPATH_0000171]",
    "Cattle count [EUPATH_0022195]",
    "Chicken count [EUPATH_0022197]",
    "Clock [ENVO_01000596]",
    "Electricity [EUPATH_0021084]",
    "Gas cooker or stove [EUPATH_0037014]",
    "Goat count [EUPATH_0037005]",
    "Mobile phone [ENVO_01000581]",
    "Motorcycle or scooter [ENVO_01000615]",
    "Television [ENVO_01000579]",
    "Target child or sibling/neighbor [EUPATH_0035112]",
]

In [10]:
# Baseline (Timepoint 0) rows restricted to the asset columns, keeping only
# rows where every one of those columns is present.
asset_df = kenya.loc[kenya["Timepoint"] == 0, asset_vars].dropna()

In [11]:
# Keep only rows flagged as the target child, then drop the flag column since
# it is constant after the filter.
child_flag_col = "Target child or sibling/neighbor [EUPATH_0035112]"
is_target_child = asset_df[child_flag_col] == 'Target child'
asset_df = asset_df[is_target_child].drop(columns=[child_flag_col])

# Target Variable Extraction

The target variable for this study is stunting observed in the target child at `Timepoint == 2`, i.e. the second follow-up after the baseline survey (`Timepoint == 0`).

In [12]:
# Participant key, timepoint used for filtering, and the stunting outcome.
target_vars = [
    "Participant_Id",
    "Timepoint",
    "Stunted [EUPATH_0035062]",
]

In [13]:
# Outcome rows recorded at Timepoint 2, keeping only rows with no missing value.
target_df = kenya.loc[kenya["Timepoint"] == 2, target_vars].dropna()

In [15]:
# Print row count, dtypes and non-null counts of the target frame.
target_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6801 entries, 212 to 40652
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Participant_Id            6801 non-null   int64 
 1   Timepoint                 6801 non-null   int64 
 2   Stunted [EUPATH_0035062]  6801 non-null   object
dtypes: int64(2), object(1)
memory usage: 212.5+ KB


Merge the observations and target dataframes on `Participant_Id`, keeping only participants present in both.

In [14]:
# Inner-join the baseline observations with the stunting outcome per participant.
# Timepoint is removed from both sides first, so the merge never produces the
# suffixed Timepoint_x / Timepoint_y columns that would otherwise need dropping.
kenya_final_df = pd.merge(
    observations_df.drop(columns=["Timepoint"]),
    target_df.drop(columns=["Timepoint"]),
    on="Participant_Id",
)

In [15]:
# Show the merged frame: baseline observations plus the Stunted outcome column.
kenya_final_df

Unnamed: 0,Participant_Id,Median mother's height (cm) [EUPATH_0035110],Mother's age (years) [EUPATH_0035109],Mother's education level [EUPATH_0036220],Multiple birth [EUPATH_0035113],Sex [PATO_0000047],Household hunger scale (HHS) [EUPATH_0037006],Persons 18yrs and younger living in house [EUPATH_0035045],Cluster study arm [EUPATH_0044124],Stunted [EUPATH_0035062]
0,2003043530,147.7,19.89,1. Incomplete primary,No,Male,1. Little to none,1.0,"Nutrition, water, sanitation, and handwashing",Yes
1,2003044530,162.0,27.59,1. Incomplete primary,No,Female,1. Little to none,5.0,"Nutrition, water, sanitation, and handwashing",Yes
2,2003053530,157.5,27.72,3. Any secondary,No,Female,1. Little to none,4.0,"Nutrition, water, sanitation, and handwashing",No
3,2003063530,159.5,27.26,3. Any secondary,No,Female,3. Severe,2.0,"Nutrition, water, sanitation, and handwashing",No
4,2003093530,161.0,41.36,1. Incomplete primary,No,Male,1. Little to none,6.0,"Nutrition, water, sanitation, and handwashing",No
...,...,...,...,...,...,...,...,...,...,...
6189,7913093530,153.5,22.85,2. Complete primary,No,Female,1. Little to none,0.0,Active control,No
6190,7913103530,160.9,30.26,1. Incomplete primary,No,Female,1. Little to none,2.0,Active control,No
6191,7913123530,153.9,28.42,2. Complete primary,No,Male,1. Little to none,2.0,Active control,No
6192,7913133530,150.1,20.79,1. Incomplete primary,No,Male,1. Little to none,0.0,Active control,No


# Feature Encoding

The following code encodes the non-numerical data into numerical form. It also multiplies each asset indicator by its weight and sums the weighted values into a new `Assets Score` feature.

In [16]:
# Lookup tables that turn categorical survey answers into numbers.

# Yes/No answers -> 1/0.
yn_encoder = {
    "Yes": 1,
    "No": 0,
}

# Sex -> 1 for Female, 0 for Male.
mf_encoder = {
    "Female": 1,
    "Male": 0,
}

# Mother's education level -> ordinal code starting at 0.
edu_encoder = {
    "1. Incomplete primary": 0,
    "2. Complete primary": 1,
    "3. Any secondary": 2,
}

# Household hunger scale -> ordinal code; the "4. Missing" label maps to None
# so that it is treated as a missing value.
hhs_encoder = {
    "1. Little to none": 1,
    "2. Moderate": 2,
    "3. Severe": 3,
    "4. Missing": None,
}

In [17]:
# Apply the matching lookup table to each categorical column of the merged frame.
column_encoders = {
    "Mother's education level [EUPATH_0036220]": edu_encoder,
    "Multiple birth [EUPATH_0035113]": yn_encoder,
    "Sex [PATO_0000047]": mf_encoder,
    "Household hunger scale (HHS) [EUPATH_0037006]": hhs_encoder,
    "Stunted [EUPATH_0035062]": yn_encoder,
}
for column_name, encoder in column_encoders.items():
    kenya_final_df[column_name] = kenya_final_df[column_name].replace(encoder)

In [18]:
# Discard every row that contains a missing value (for example hunger-scale
# answers that hhs_encoder mapped to None), then summarise what is left.
kenya_final_df = kenya_final_df.dropna(axis="index", how="any")
kenya_final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6181 entries, 0 to 6193
Data columns (total 10 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Participant_Id                                              6181 non-null   int64  
 1   Median mother's height (cm) [EUPATH_0035110]                6181 non-null   float64
 2   Mother's age (years) [EUPATH_0035109]                       6181 non-null   float64
 3   Mother's education level [EUPATH_0036220]                   6181 non-null   int64  
 4   Multiple birth [EUPATH_0035113]                             6181 non-null   int64  
 5   Sex [PATO_0000047]                                          6181 non-null   int64  
 6   Household hunger scale (HHS) [EUPATH_0037006]               6181 non-null   float64
 7   Persons 18yrs and younger living in house [EUPATH_0035045]  6181 non-null   float64
 8 

In [19]:
# Yes/No asset indicators to convert to 1/0 with yn_encoder; the count columns
# (cattle, chicken, goat) are left as they are.
yes_no_asset_columns = [
    "Clock [ENVO_01000596]",
    "Bicycle [ENVO_01000614]",
    "Car or truck [EUPATH_0000171]",
    "Electricity [EUPATH_0021084]",
    "Gas cooker or stove [EUPATH_0037014]",
    "Mobile phone [ENVO_01000581]",
    "Television [ENVO_01000579]",
    "Motorcycle or scooter [ENVO_01000615]",
]
for column_name in yes_no_asset_columns:
    asset_df[column_name] = asset_df[column_name].replace(yn_encoder)

In [20]:
# Print row count, dtypes and non-null counts of the asset frame after encoding.
asset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7333 entries, 222 to 40650
Data columns (total 13 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Participant_Id                         7333 non-null   int64  
 1   Timepoint                              7333 non-null   int64  
 2   Bicycle [ENVO_01000614]                7333 non-null   int64  
 3   Car or truck [EUPATH_0000171]          7333 non-null   int64  
 4   Cattle count [EUPATH_0022195]          7333 non-null   float64
 5   Chicken count [EUPATH_0022197]         7333 non-null   float64
 6   Clock [ENVO_01000596]                  7333 non-null   int64  
 7   Electricity [EUPATH_0021084]           7333 non-null   int64  
 8   Gas cooker or stove [EUPATH_0037014]   7333 non-null   int64  
 9   Goat count [EUPATH_0037005]            7333 non-null   float64
 10  Mobile phone [ENVO_01000581]           7333 non-null   int64  
 11  M

In [21]:
# Echo the asset column list to check its contents and order.
asset_vars

['Participant_Id',
 'Timepoint',
 'Bicycle [ENVO_01000614]',
 'Car or truck [EUPATH_0000171]',
 'Cattle count [EUPATH_0022195]',
 'Chicken count [EUPATH_0022197]',
 'Clock [ENVO_01000596]',
 'Electricity [EUPATH_0021084]',
 'Gas cooker or stove [EUPATH_0037014]',
 'Goat count [EUPATH_0037005]',
 'Mobile phone [ENVO_01000581]',
 'Motorcycle or scooter [ENVO_01000615]',
 'Television [ENVO_01000579]',
 'Target child or sibling/neighbor [EUPATH_0035112]']

In [22]:
# Weight applied to each asset column when building the score. Keys are the
# explicit column names (rather than positions in asset_vars) so the mapping
# cannot silently shift if the column list is reordered.
asset_weights = {
    "Bicycle [ENVO_01000614]": 0.25,
    "Car or truck [EUPATH_0000171]": 2,
    "Cattle count [EUPATH_0022195]": 0.2,
    "Chicken count [EUPATH_0022197]": 0.1,
    "Clock [ENVO_01000596]": 0.1,
    "Electricity [EUPATH_0021084]": 1.5,
    "Gas cooker or stove [EUPATH_0037014]": 0.5,
    "Goat count [EUPATH_0037005]": 0.2,
    "Mobile phone [ENVO_01000581]": 0.75,
    "Motorcycle or scooter [ENVO_01000615]": 1,
}

# Television is not part of asset_weights; it seeds the score with weight 0.5.
asset_df["Assets Score"] = asset_df["Television [ENVO_01000579]"] * 0.5

# Add every weighted asset. Iterating over the dict itself guarantees that each
# weighted column contributes; the previous range(len(asset_weights) - 1) loop
# stopped one entry early and never added "Motorcycle or scooter".
for asset_column, weight in asset_weights.items():
    asset_df["Assets Score"] += asset_df[asset_column] * weight

In [23]:
# Inspect the asset frame together with the newly added "Assets Score" column.
asset_df

Unnamed: 0,Participant_Id,Timepoint,Bicycle [ENVO_01000614],Car or truck [EUPATH_0000171],Cattle count [EUPATH_0022195],Chicken count [EUPATH_0022197],Clock [ENVO_01000596],Electricity [EUPATH_0021084],Gas cooker or stove [EUPATH_0037014],Goat count [EUPATH_0037005],Mobile phone [ENVO_01000581],Motorcycle or scooter [ENVO_01000615],Television [ENVO_01000579],Assets Score
222,2003043530,0,0,0,0.0,0.0,0,0,0,0.0,1,0,0,0.75
225,2003044530,0,1,0,0.0,9.0,1,0,0,0.0,0,0,0,1.25
228,2003053530,0,0,0,0.0,2.0,1,0,0,0.0,1,0,0,1.05
237,2003063530,0,0,0,1.0,0.0,0,0,0,0.0,1,0,0,0.95
246,2003093530,0,1,0,3.0,8.0,0,0,0,0.0,1,0,0,2.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40635,7913113530,0,0,0,0.0,1.0,0,0,0,0.0,1,0,0,0.85
40638,7913123530,0,1,0,0.0,6.0,0,0,0,0.0,1,0,0,1.60
40641,7913133530,0,1,0,0.0,0.0,1,0,0,0.0,1,0,0,1.10
40644,7913143530,0,1,0,3.0,10.0,0,0,0,0.0,1,0,0,2.60


In [24]:
# Keep just the participant key and the computed score for the upcoming merge.
asset_temp = asset_df.loc[:, ["Participant_Id", "Assets Score"]]

In [25]:
# Show the two-column (Participant_Id, Assets Score) frame.
asset_temp

Unnamed: 0,Participant_Id,Assets Score
222,2003043530,0.75
225,2003044530,1.25
228,2003053530,1.05
237,2003063530,0.95
246,2003093530,2.40
...,...,...
40635,7913113530,0.85
40638,7913123530,1.60
40641,7913133530,1.10
40644,7913143530,2.60


In [26]:
# Attach the asset score to each participant; the default inner join keeps only
# participants that appear in both frames.
kenya_final_df = pd.merge(kenya_final_df, asset_temp, on="Participant_Id")

In [27]:
# Show the encoded frame with the "Assets Score" column appended by the merge.
kenya_final_df

Unnamed: 0,Participant_Id,Median mother's height (cm) [EUPATH_0035110],Mother's age (years) [EUPATH_0035109],Mother's education level [EUPATH_0036220],Multiple birth [EUPATH_0035113],Sex [PATO_0000047],Household hunger scale (HHS) [EUPATH_0037006],Persons 18yrs and younger living in house [EUPATH_0035045],Cluster study arm [EUPATH_0044124],Stunted [EUPATH_0035062],Assets Score
0,2003043530,147.7,19.89,0,0,0,1.0,1.0,"Nutrition, water, sanitation, and handwashing",1,0.75
1,2003044530,162.0,27.59,0,0,1,1.0,5.0,"Nutrition, water, sanitation, and handwashing",1,1.25
2,2003053530,157.5,27.72,2,0,1,1.0,4.0,"Nutrition, water, sanitation, and handwashing",0,1.05
3,2003063530,159.5,27.26,2,0,1,3.0,2.0,"Nutrition, water, sanitation, and handwashing",0,0.95
4,2003093530,161.0,41.36,0,0,0,1.0,6.0,"Nutrition, water, sanitation, and handwashing",0,2.40
...,...,...,...,...,...,...,...,...,...,...,...
6123,7913093530,153.5,22.85,1,0,1,1.0,0.0,Active control,0,0.35
6124,7913103530,160.9,30.26,0,0,1,1.0,2.0,Active control,0,0.00
6125,7913123530,153.9,28.42,1,0,0,1.0,2.0,Active control,0,1.60
6126,7913133530,150.1,20.79,0,0,0,1.0,0.0,Active control,0,1.10


In [28]:
# One-hot encode the study arm; drop_first=True omits the first level so it
# acts as the reference category.
# dtype=int pins the indicator columns to integer 0/1. The default dummy dtype
# differs between pandas releases (bool in pandas 2.x, uint8 before), which
# would otherwise let the exported CSV contain True/False instead of 0/1.
kenya_final_data = pd.get_dummies(
    kenya_final_df,
    columns=["Cluster study arm [EUPATH_0044124]"],
    drop_first=True,
    dtype=int,
)

In [29]:
# Write the model-ready table to disk without the DataFrame index.
output_csv_path = "kenya_stunted_data.csv"
kenya_final_data.to_csv(output_csv_path, index=False)