### Importing packages and reading dataframes

In [1]:
import os
import pandas as pd
import numpy as np
import pathlib
import shapefile
import glob
import pyreadstat
import pickle
import re
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Lift pandas' display truncation so printed frames and series show every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [2]:

# Load the pickled dataset. The context manager closes the file handle as soon
# as unpickling has finished instead of leaving it open for the kernel's lifetime.
# NOTE: pickle can execute arbitrary code while loading; only unpickle files you trust.
with open('Processed_Datasets/CD_cleaned.pkl', 'rb') as cleaned_file:
    CD_data = pickle.load(cleaned_file)


### Keeping only the last births that ended in child mortality

In [3]:
# Step 1: keep only the last births (rows where midx equals 1).
is_last_birth = CD_data['midx'] == 1
CD_data1 = CD_data.loc[is_last_birth]

# Step 2: of those, keep only the births that ended in the death of the child,
# renumber the rows from 0 and drop the 'AgeOfChildAtDeath' column.
# (To keep every last birth regardless of outcome, start the chain from CD_data1
# without the ChildDied filter.)
CD_dataset = (
    CD_data1
    .loc[CD_data1['ChildDied'] == 1]
    .reset_index(drop=True)
    .drop(columns=['AgeOfChildAtDeath'])
)


In [4]:
# Overview of the filtered dataset: number of rows, column names, non-null counts and dtypes.
CD_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9307 entries, 0 to 9306
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   midx                            9307 non-null   object 
 1   country                         9307 non-null   object 
 2   Cluster's latitude coordinate   9307 non-null   float64
 3   Cluster's longitude coordinate  9307 non-null   float64
 4   stratum                         9307 non-null   int64  
 5   psu                             9307 non-null   int64  
 6   v005                            9307 non-null   int64  
 7   ChildDied                       9307 non-null   int64  
 8   ChildGender_Male                9307 non-null   int64  
 9   PreceedingBirthInterval(33+)    9307 non-null   int64  
 10  WasBreastfed                    9252 non-null   Int64  
 11  HealthFacilityDelivery          9291 non-null   Int64  
 12  SkilledDeliveryCareProvider     92

## Handling missingness

In [5]:
# Share of missing entries per column, in percent, ordered from the most complete
# column to the least complete one.
missing_percent = CD_dataset.isna().mean().mul(100).sort_values()
print("Percentage of missing values per column:", missing_percent, sep="\n")

Percentage of missing values per column:
midx                              0.000000
DecisionMaking_Participation      0.000000
MaternalParity                    0.000000
AgeAtFirstBirth                   0.000000
MaritalStatus                     0.000000
MothersCurrentAge                 0.000000
MothersEducationalLevel           0.000000
WealthIndex                       0.000000
UrbanResidence                    0.000000
NonTobaccoSmoker                  0.000000
weights                           0.000000
rescaled_weights                  0.000000
PreceedingBirthInterval(33+)      0.000000
Cluster's longitude coordinate    0.000000
stratum                           0.000000
psu                               0.000000
v005                              0.000000
ChildDied                         0.000000
ChildGender_Male                  0.000000
Cluster's latitude coordinate     0.000000
country                           0.000000
MothersEmploymentStatus           0.161169
HealthFacilit

In [6]:

def impute_data(df, continuous_features):
    """Return a copy of ``df`` in which missing values are imputed column by column.

    Columns listed in ``continuous_features`` are filled with their median and
    cast to ``float``. Every other column is treated as categorical: it is
    filled with its most frequent value (the first mode) and cast to ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied, never modified.
    continuous_features : list of str or None
        Names of the columns to impute with the median. ``None`` or an empty
        list means that every column is treated as categorical.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``df`` with the same columns and index.

    Raises
    ------
    KeyError
        If a name in ``continuous_features`` is not a column of ``df``.

    Notes
    -----
    * Every column that is not listed in ``continuous_features`` ends up with
      ``object`` dtype, numeric ones included. This is kept for backward
      compatibility with existing callers.
      NOTE(review): confirm that identifier/weight columns are meant to be
      cast to ``object`` as well.
    * A column without any observed value has no mode; it is left unfilled
      (but still cast to ``object``) instead of raising.
    """
    data = df.copy()

    # Accept None as "no continuous columns" so the categorical pass below
    # does not fail on a membership test against None.
    continuous_features = list(continuous_features or [])

    # Continuous columns: median imputation, stored as float.
    for column in continuous_features:
        median_value = data[column].median()
        data[column] = data[column].fillna(median_value).astype(float)

    # All remaining columns: mode imputation, stored as object.
    categorical_features = [col for col in data.columns if col not in continuous_features]
    for column in categorical_features:
        modes = data[column].mode()
        # mode() is empty when the column holds no observed values; there is
        # nothing to impute with in that case, so the column is left as is.
        if not modes.empty:
            data[column] = data[column].fillna(modes.iloc[0])
        data[column] = data[column].astype(object)

    return data


In [7]:
# Opt in to pandas' future behaviour: no silent downcasting of object columns
# (relevant for the fillna calls made while imputing).
pd.set_option('future.no_silent_downcasting', True)

# These columns are imputed with their median; all other columns with their mode.
continuous_columns = ['MothersCurrentAge', 'AgeAtFirstBirth', 'MaternalParity']
df_processed = impute_data(CD_dataset, continuous_columns)

# To skip imputation, use instead: df_processed = CD_dataset
# NOTE(review): that alternative keeps rows with missing values; a complete-case
# analysis would additionally need dropna() - confirm the intent.
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9307 entries, 0 to 9306
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   midx                            9307 non-null   object 
 1   country                         9307 non-null   object 
 2   Cluster's latitude coordinate   9307 non-null   object 
 3   Cluster's longitude coordinate  9307 non-null   object 
 4   stratum                         9307 non-null   object 
 5   psu                             9307 non-null   object 
 6   v005                            9307 non-null   object 
 7   ChildDied                       9307 non-null   object 
 8   ChildGender_Male                9307 non-null   object 
 9   PreceedingBirthInterval(33+)    9307 non-null   object 
 10  WasBreastfed                    9307 non-null   object 
 11  HealthFacilityDelivery          9307 non-null   object 
 12  SkilledDeliveryCareProvider     93

In [8]:
# Pickle the prepared dataset (df_processed). The context manager guarantees the
# file is flushed and closed even if pickle.dump raises.
with open('Processed_Datasets/CD_prepared.pkl', 'wb') as f:
    pickle.dump(df_processed, f)

In [9]:
# Export the prepared dataset as CSV (without the index column).
# The destination folder can be overridden with the UNDER_FIVE_MORTALITY_DIR
# environment variable so the notebook also runs on other machines; when the
# variable is unset, the original location is used.
csv_export_dir = pathlib.Path(
    os.environ.get(
        'UNDER_FIVE_MORTALITY_DIR',
        '/Users/clairenajjuuko/Library/CloudStorage/Box-Box/Under_Five_Mortality',
    )
)
df_processed.to_csv(csv_export_dir / 'CD_data.csv', index=False)

## Demographic and SES Characteristics

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the 'MothersCurrentAge' column.
df_processed.loc[:, 'MothersCurrentAge'].describe()

count    9307.000000
mean       29.783067
std         8.237939
min        15.000000
25%        23.000000
50%        29.000000
75%        36.000000
max        49.000000
Name: MothersCurrentAge, dtype: float64

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the 'MaternalParity' column.
df_processed.loc[:, 'MaternalParity'].describe()

count    9307.000000
mean        4.190072
std         2.854234
min         1.000000
25%         2.000000
50%         4.000000
75%         6.000000
max        17.000000
Name: MaternalParity, dtype: float64

In [12]:
# Frequency table for 'ChildGender_Male': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
child_gender = df_processed['ChildGender_Male']
print(child_gender.value_counts(dropna=False))
print(child_gender.value_counts(normalize=True).mul(100))

ChildGender_Male
1    5074
0    4233
Name: count, dtype: int64
ChildGender_Male
1    54.518105
0    45.481895
Name: proportion, dtype: float64


In [13]:
# Frequency table for 'MaritalStatus': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
marital_status = df_processed['MaritalStatus']
print(marital_status.value_counts(dropna=False))
print(marital_status.value_counts(normalize=True).mul(100))

MaritalStatus
married/living with partner                             7612
widowed/divorced/no longer living together/separated     965
never in union                                           730
Name: count, dtype: int64
MaritalStatus
married/living with partner                             81.787902
widowed/divorced/no longer living together/separated    10.368540
never in union                                           7.843559
Name: proportion, dtype: float64


In [14]:
# Frequency table for 'MothersEmploymentStatus': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
employment_status = df_processed['MothersEmploymentStatus']
print(employment_status.value_counts(dropna=False))
print(employment_status.value_counts(normalize=True).mul(100))

MothersEmploymentStatus
1    5916
0    3391
Name: count, dtype: int64
MothersEmploymentStatus
1    63.565059
0    36.434941
Name: proportion, dtype: float64


In [15]:
# Frequency table for 'MothersEducationalLevel': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
education_level = df_processed['MothersEducationalLevel']
print(education_level.value_counts(dropna=False))
print(education_level.value_counts(normalize=True).mul(100))

MothersEducationalLevel
No Education    3962
Primary         3046
Secondary       2067
Higher           232
Name: count, dtype: int64
MothersEducationalLevel
No Education    42.570109
Primary         32.728054
Secondary       22.209090
Higher           2.492747
Name: proportion, dtype: float64


In [16]:
# Frequency table for 'WealthIndex': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
wealth_index = df_processed['WealthIndex']
print(wealth_index.value_counts(dropna=False))
print(wealth_index.value_counts(normalize=True).mul(100))

WealthIndex
1    2481
2    2180
3    1917
4    1568
5    1161
Name: count, dtype: int64
WealthIndex
1    26.657355
2    23.423230
3    20.597400
4    16.847534
5    12.474482
Name: proportion, dtype: float64


In [17]:
# Frequency table for 'UrbanResidence': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
urban_residence = df_processed['UrbanResidence']
print(urban_residence.value_counts(dropna=False))
print(urban_residence.value_counts(normalize=True).mul(100))

UrbanResidence
0    6622
1    2685
Name: count, dtype: int64
UrbanResidence
0    71.150747
1    28.849253
Name: proportion, dtype: float64


## RMNCH Indicators

In [18]:

# Frequency table for 'DecisionMaking_Participation': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
decision_making = df_processed['DecisionMaking_Participation']
print(decision_making.value_counts(dropna=False))
print(decision_making.value_counts(normalize=True).mul(100))

DecisionMaking_Participation
0    4683
1    4624
Name: count, dtype: int64
DecisionMaking_Participation
0    50.316966
1    49.683034
Name: proportion, dtype: float64


In [19]:

# Frequency table for 'OfAgeMarriage': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
of_age_marriage = df_processed['OfAgeMarriage']
print(of_age_marriage.value_counts(dropna=False))
print(of_age_marriage.value_counts(normalize=True).mul(100))

OfAgeMarriage
0    5310
1    3997
Name: count, dtype: int64
OfAgeMarriage
0    57.05383
1    42.94617
Name: proportion, dtype: float64


In [20]:

# Frequency table for 'NonTobaccoSmoker': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
non_tobacco_smoker = df_processed['NonTobaccoSmoker']
print(non_tobacco_smoker.value_counts(dropna=False))
print(non_tobacco_smoker.value_counts(normalize=True).mul(100))

NonTobaccoSmoker
1    9085
0     222
Name: count, dtype: int64
NonTobaccoSmoker
1    97.614699
0     2.385301
Name: proportion, dtype: float64


In [21]:

# Frequency table for 'WasBreastfed': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
was_breastfed = df_processed['WasBreastfed']
print(was_breastfed.value_counts(dropna=False))
print(was_breastfed.value_counts(normalize=True).mul(100))

WasBreastfed
1    6526
0    2781
Name: count, dtype: int64
WasBreastfed
1    70.119265
0    29.880735
Name: proportion, dtype: float64


In [22]:

# Frequency table for 'BabyPostnatalCheck': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
baby_postnatal_check = df_processed['BabyPostnatalCheck']
print(baby_postnatal_check.value_counts(dropna=False))
print(baby_postnatal_check.value_counts(normalize=True).mul(100))

BabyPostnatalCheck
0    7455
1    1852
Name: count, dtype: int64
BabyPostnatalCheck
0    80.100999
1    19.899001
Name: proportion, dtype: float64


In [23]:

# Frequency table for 'MaternalPostpartumHealthCheck': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
postpartum_check = df_processed['MaternalPostpartumHealthCheck']
print(postpartum_check.value_counts(dropna=False))
print(postpartum_check.value_counts(normalize=True).mul(100))

MaternalPostpartumHealthCheck
0    7494
1    1813
Name: count, dtype: int64
MaternalPostpartumHealthCheck
0    80.520039
1    19.479961
Name: proportion, dtype: float64


In [24]:

# Frequency table for 'SkilledDeliveryCareProvider': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
skilled_provider = df_processed['SkilledDeliveryCareProvider']
print(skilled_provider.value_counts(dropna=False))
print(skilled_provider.value_counts(normalize=True).mul(100))

SkilledDeliveryCareProvider
1    5678
0    3629
Name: count, dtype: int64
SkilledDeliveryCareProvider
1    61.007844
0    38.992156
Name: proportion, dtype: float64


In [25]:

# Frequency table for 'HealthFacilityDelivery': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
facility_delivery = df_processed['HealthFacilityDelivery']
print(facility_delivery.value_counts(dropna=False))
print(facility_delivery.value_counts(normalize=True).mul(100))

HealthFacilityDelivery
1    5724
0    3583
Name: count, dtype: int64
HealthFacilityDelivery
1    61.502095
0    38.497905
Name: proportion, dtype: float64


In [26]:

# Frequency table for 'NeonatalTetanusProtection(2+)': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
tetanus_protection = df_processed['NeonatalTetanusProtection(2+)']
print(tetanus_protection.value_counts(dropna=False))
print(tetanus_protection.value_counts(normalize=True).mul(100))

NeonatalTetanusProtection(2+)
0    5029
1    4278
Name: count, dtype: int64
NeonatalTetanusProtection(2+)
0    54.034598
1    45.965402
Name: proportion, dtype: float64


In [27]:

# Frequency table for 'IronPillsDuringPregnancy': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
iron_pills = df_processed['IronPillsDuringPregnancy']
print(iron_pills.value_counts(dropna=False))
print(iron_pills.value_counts(normalize=True).mul(100))

IronPillsDuringPregnancy
1    6773
0    2534
Name: count, dtype: int64
IronPillsDuringPregnancy
1    72.773181
0    27.226819
Name: proportion, dtype: float64


In [28]:

# Frequency table for 'AntenatalCare(4+)': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
antenatal_care = df_processed['AntenatalCare(4+)']
print(antenatal_care.value_counts(dropna=False))
print(antenatal_care.value_counts(normalize=True).mul(100))

AntenatalCare(4+)
1    4814
0    4493
Name: count, dtype: int64
AntenatalCare(4+)
1    51.724508
0    48.275492
Name: proportion, dtype: float64


In [29]:

# Frequency table for 'MetNeedFamilyPlanning': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
family_planning = df_processed['MetNeedFamilyPlanning']
print(family_planning.value_counts(dropna=False))
print(family_planning.value_counts(normalize=True).mul(100))

MetNeedFamilyPlanning
2    5542
0    1980
1    1785
Name: count, dtype: int64
MetNeedFamilyPlanning
2    59.546578
0    21.274310
1    19.179112
Name: proportion, dtype: float64


In [30]:

# Frequency table for 'PreceedingBirthInterval(33+)': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
birth_interval = df_processed['PreceedingBirthInterval(33+)']
print(birth_interval.value_counts(dropna=False))
print(birth_interval.value_counts(normalize=True).mul(100))

PreceedingBirthInterval(33+)
0    3718
1    3535
2    2054
Name: count, dtype: int64
PreceedingBirthInterval(33+)
0    39.948426
1    37.982164
2    22.069410
Name: proportion, dtype: float64


In [31]:

# Frequency table for 'ProtectedDrinkingWaterSource': absolute counts (missing values,
# if any, get their own row), followed by the percentage share of each observed value.
protected_water = df_processed['ProtectedDrinkingWaterSource']
print(protected_water.value_counts(dropna=False))
print(protected_water.value_counts(normalize=True).mul(100))

ProtectedDrinkingWaterSource
1    6110
0    3197
Name: count, dtype: int64
ProtectedDrinkingWaterSource
1    65.649511
0    34.350489
Name: proportion, dtype: float64


In [32]:
# Frequency table for 'ImprovedToiletFacility': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
improved_toilet = df_processed['ImprovedToiletFacility']
print(improved_toilet.value_counts(dropna=False))
print(improved_toilet.value_counts(normalize=True).mul(100))

ImprovedToiletFacility
0    5424
1    3883
Name: count, dtype: int64
ImprovedToiletFacility
0    58.278715
1    41.721285
Name: proportion, dtype: float64


In [33]:
# Frequency table for 'CleanCookingFuel': absolute counts (missing values, if any,
# get their own row), followed by the percentage share of each observed value.
clean_cooking_fuel = df_processed['CleanCookingFuel']
print(clean_cooking_fuel.value_counts(dropna=False))
print(clean_cooking_fuel.value_counts(normalize=True).mul(100))

CleanCookingFuel
0    8581
1     726
Name: count, dtype: int64
CleanCookingFuel
0    92.19942
1     7.80058
Name: proportion, dtype: float64
