## Import packages and read dataset

In [1]:
import os
import pandas as pd
import numpy as np
import pathlib
import shapefile
import pyreadstat
import pickle
import re
from tqdm import tqdm

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# loading data
BRGE_df = pickle.load(open('Processed_Datasets/BR_processing.pkl', 'rb'))
BR_metadata = pickle.load(open('Processed_Datasets/BR_processing_metadata.pkl', 'rb'))


## Child / Birth characteristics

In [3]:
BRGE_df = BRGE_df.copy()

# b5 # child is alive - Dependent variable
BRGE_df['b5'] = BRGE_df['b5'].replace({1: 0, 0: 1})
BRGE_df['ChildDied'] = BRGE_df['b5']
print(BRGE_df['ChildDied'].value_counts(dropna=False))

# age at death of child (calculated in months)
BRGE_df['AgeOfChildAtDeath'] = BRGE_df['b7']
BRGE_df['AgeOfChildAtDeath'] = BRGE_df['AgeOfChildAtDeath'].astype('Int64')
print(BRGE_df['AgeOfChildAtDeath'].value_counts(dropna=False))
print(BRGE_df['AgeOfChildAtDeath'].describe().round(2))

ChildDied
0    304889
1     19952
Name: count, dtype: int64
AgeOfChildAtDeath
<NA>    304889
0         8953
24        1462
1         1037
2          807
12         774
3          726
6          608
36         565
9          533
4          489
7          485
8          476
5          418
11         342
10         337
13         318
18         289
14         264
15         215
48         167
16         150
17         114
20         110
19          97
23          72
21          71
22          57
55           2
47           2
34           2
56           1
39           1
42           1
46           1
27           1
54           1
37           1
43           1
58           1
35           1
Name: count, dtype: Int64
count    19952.0
mean        6.72
std         9.81
min          0.0
25%          0.0
50%          1.0
75%         11.0
max         58.0
Name: AgeOfChildAtDeath, dtype: Float64


In [4]:
# sex of child: Female - 0, Male - 1
BRGE_df['b4']= BRGE_df['b4'].replace(2, 0)
BRGE_df['ChildGender_Male'] = BRGE_df['b4']
print(BRGE_df['ChildGender_Male'].value_counts(dropna=False))
print((BRGE_df['ChildGender_Male'].value_counts(dropna=False, normalize=True) * 100).round(2))


ChildGender_Male
1    164516
0    160325
Name: count, dtype: int64
ChildGender_Male
1    50.65
0    49.35
Name: proportion, dtype: float64


## Reproductive, Maternal, Newborn, and ChildHealth Intervention coverage indicators

In [5]:
# preceeding birth interval - calculated as the difference in months between the current birth and the previous birth, counting twins as one birth
# use this # consider this < 33 months - not recommeded by WHO

# Define the conditions and choices
conditions = [
    BRGE_df['b11'].isna(),
    BRGE_df['b11'] < 33,          
    BRGE_df['b11'] >= 33  
]
choices = [
    2, # First birth 
    0, 
    1  
]
BRGE_df['PreceedingBirthInterval(33+)'] = np.select(conditions, choices)
print(BRGE_df['PreceedingBirthInterval(33+)'].value_counts(dropna=False))
print((BRGE_df['PreceedingBirthInterval(33+)'].value_counts(dropna=False, normalize=True) * 100).round(2))


PreceedingBirthInterval(33+)
1    136456
0    115933
2     72452
Name: count, dtype: int64
PreceedingBirthInterval(33+)
1    42.01
0    35.69
2    22.30
Name: proportion, dtype: float64


In [6]:
# The calculated months of breastfeeding gives the duration of breastfeeding, with the duration calculated if the respondent is still breastfeeding the child or the child was breastfed until it died
# 93 - 'ever breastfed, not currently breastfeeding'
BRGE_df['m5'] = BRGE_df['m5'].replace(94, 0) # 'never breastfed'
BRGE_df['m5'] = BRGE_df['m5'].replace(97, np.nan) # 'inconsistent'
BRGE_df['m5'] = BRGE_df['m5'].replace(98, np.nan) # "don't know"
BRGE_df['DurationOfBreastfeeding'] = BRGE_df['m5']
print(BRGE_df['DurationOfBreastfeeding'].value_counts(dropna=False))

# was breastfed
BRGE_df['WasBreastfed'] = np.where(BRGE_df['DurationOfBreastfeeding'] == 0, 0, 
                                    np.where(BRGE_df['DurationOfBreastfeeding'].isna(), np.nan, 1))
BRGE_df['WasBreastfed'] = BRGE_df['WasBreastfed'].astype('Int64')
print(BRGE_df['WasBreastfed'].value_counts(dropna=False))
print((BRGE_df['WasBreastfed'].value_counts(dropna=False, normalize=True) * 100).round(2))


DurationOfBreastfeeding
93.0    186168
0.0      21870
2.0       6098
6.0       5992
4.0       5976
3.0       5804
1.0       5713
5.0       5683
7.0       5635
12.0      5604
8.0       5542
9.0       5369
13.0      5356
10.0      5063
11.0      5043
14.0      5026
15.0      4522
16.0      4497
17.0      4067
18.0      3953
19.0      3218
20.0      2904
21.0      2406
22.0      2100
23.0      1554
24.0      1408
NaN       1031
25.0       978
26.0       760
27.0       698
28.0       577
29.0       550
30.0       463
31.0       450
32.0       392
33.0       368
34.0       319
35.0       242
36.0       163
38.0       115
37.0       115
39.0       108
40.0        97
42.0        85
43.0        77
41.0        69
44.0        66
46.0        55
45.0        51
48.0        48
47.0        47
50.0        44
49.0        38
51.0        35
52.0        34
54.0        31
56.0        30
53.0        29
55.0        27
59.0        27
57.0        26
58.0        25
Name: count, dtype: int64
WasBreastfed
1      

  BRGE_df['m5'] = BRGE_df['m5'].replace(94, 0) # 'never breastfed'


In [7]:
# place of delivery
BRGE_df['m15'] = BRGE_df['m15'].replace(10, 'home') # home
BRGE_df['m15'] = BRGE_df['m15'].replace(11, 'home') # respondent's home
BRGE_df['m15'] = BRGE_df['m15'].replace(12, 'home') # other home
BRGE_df['m15'] = BRGE_df['m15'].replace(13, 'home') # tba premises
BRGE_df['m15'] = BRGE_df['m15'].replace(14, 'home') # on the way to the hospital
BRGE_df['m15'] = BRGE_df['m15'].replace(20, 'health facility') # public sector
BRGE_df['m15'] = BRGE_df['m15'].replace(21, 'health facility') # government hospital
BRGE_df['m15'] = BRGE_df['m15'].replace(22, 'health facility') # government health center
BRGE_df['m15'] = BRGE_df['m15'].replace(23, 'health facility') # medical center with surgical antenna (cma)/ district hospital (hd)
BRGE_df['m15'] = BRGE_df['m15'].replace(24, 'health facility') # health center/post
BRGE_df['m15'] = BRGE_df['m15'].replace(25, 'health facility') # maternity
BRGE_df['m15'] = BRGE_df['m15'].replace(26, 'health facility') # other public sector / maternity
BRGE_df['m15'] = BRGE_df['m15'].replace(27, 'health facility') # dispensary
BRGE_df['m15'] = BRGE_df['m15'].replace(28, 'health facility') # other public sector
BRGE_df['m15'] = BRGE_df['m15'].replace(30, 'health facility') # private sector
BRGE_df['m15'] = BRGE_df['m15'].replace(31, 'health facility') # private hospital/clinic
BRGE_df['m15'] = BRGE_df['m15'].replace(32, 'health facility') # medical center
BRGE_df['m15'] = BRGE_df['m15'].replace(33, 'health facility') # medical post
BRGE_df['m15'] = BRGE_df['m15'].replace(34, 'health facility') # maternity clinic
BRGE_df['m15'] = BRGE_df['m15'].replace(35, 'health facility') # nursing cabinet
BRGE_df['m15'] = BRGE_df['m15'].replace(36, 'health facility') # other private sector
BRGE_df['m15'] = BRGE_df['m15'].replace(40, 'health facility') # ngo sector
BRGE_df['m15'] = BRGE_df['m15'].replace(41, 'health facility') # ngo hospital
BRGE_df['m15'] = BRGE_df['m15'].replace(42, 'health facility') # ngo clinic
BRGE_df['m15'] = BRGE_df['m15'].replace(43, 'health facility') # fbo/mission hospital
BRGE_df['m15'] = BRGE_df['m15'].replace(44, 'health facility') # fbo/mission clinic
BRGE_df['m15'] = BRGE_df['m15'].replace(45, 'health facility') # religious/voluntary dispensary
BRGE_df['m15'] = BRGE_df['m15'].replace(46, 'health facility') # ngo other
BRGE_df['m15'] = BRGE_df['m15'].replace(47, 'health facility') # other fbo medical sector
BRGE_df['m15'] = BRGE_df['m15'].replace(96, 'home') # other
BRGE_df['PlaceOfDelivery'] = BRGE_df['m15']

BRGE_df['HealthFacilityDelivery'] = np.where(BRGE_df['PlaceOfDelivery'] == "home", 0, 
                                    np.where(BRGE_df['PlaceOfDelivery'].isna(), np.nan, 1))

BRGE_df['HealthFacilityDelivery'] = BRGE_df['HealthFacilityDelivery'].astype('Int64')
                                          
print(BRGE_df['HealthFacilityDelivery'].value_counts(dropna=False))
print((BRGE_df['HealthFacilityDelivery'].value_counts(dropna=False, normalize=True) * 100).round(2))


HealthFacilityDelivery
1       215552
0       109052
<NA>       237
Name: count, dtype: Int64
HealthFacilityDelivery
1       66.36
0       33.57
<NA>     0.07
Name: proportion, dtype: Float64


In [8]:
# delivery care: skilled, unskilled, or no one
BRGE_df['DeliveryCareProvider'] = pd.NA
BRGE_df.loc[(BRGE_df['m3a'] == 1) | (BRGE_df['m3b'] == 1) | (BRGE_df['m3c'] == 1) | (BRGE_df['m3d'] == 1) | (BRGE_df['m3e'] == 1) | (BRGE_df['m3f'] == 1), 'DeliveryCareProvider'] = 'Skilled'
BRGE_df.loc[(BRGE_df['m3g'] == 1) | (BRGE_df['m3h'] == 1) | (BRGE_df['m3i'] == 1) | (BRGE_df['m3j'] == 1) | (BRGE_df['m3k'] == 1), 'DeliveryCareProvider'] = 'Unskilled/No one'
BRGE_df.loc[BRGE_df['m3n'] == 1, 'DeliveryCareProvider'] = 'Unskilled/No one'

BRGE_df['SkilledDeliveryCareProvider'] = np.where(BRGE_df['DeliveryCareProvider'] == "Unskilled/No one", 0, 
                                    np.where(BRGE_df['DeliveryCareProvider'].isna(), np.nan, 1))

BRGE_df['SkilledDeliveryCareProvider'] = BRGE_df['SkilledDeliveryCareProvider'].astype('Int64')
print(BRGE_df['SkilledDeliveryCareProvider'].value_counts(dropna=False))
print((BRGE_df['SkilledDeliveryCareProvider'].value_counts(dropna=False, normalize=True) * 100).round(2))


SkilledDeliveryCareProvider
1       211928
0       112637
<NA>       276
Name: count, dtype: Int64
SkilledDeliveryCareProvider
1       65.24
0       34.67
<NA>     0.08
Name: proportion, dtype: Float64


In [9]:
# number of antenatal visits - last birth
pd.set_option('future.no_silent_downcasting', True)
BRGE_df['m14']= BRGE_df['m14'].replace(98, np.nan) # don't know # other NaN, becuase question was asked to those with births in last five years
BRGE_df['AntenatalCare'] = BRGE_df['m14']

# Replace values based on condition
BRGE_df['AntenatalCare(4+)'] = np.where(BRGE_df['m14'] >= 4, 1, np.where(BRGE_df['m14'] < 4, 0, np.nan))

BRGE_df['AntenatalCare(4+)'] = BRGE_df['AntenatalCare(4+)'].astype('Int64')

print(BRGE_df['AntenatalCare(4+)'].value_counts(dropna=False))
print((BRGE_df['AntenatalCare(4+)'].value_counts(dropna=False, normalize=True) * 100).round(2))


AntenatalCare(4+)
1       130628
<NA>     97523
0        96690
Name: count, dtype: Int64
AntenatalCare(4+)
1       40.21
<NA>    30.02
0       29.77
Name: proportion, dtype: Float64


In [10]:
### Neonatal tetanus protection
# number of tetanus injections before birth - last birth
BRGE_df['m1'] = BRGE_df['m1'].replace(8, np.nan) # don't know
BRGE_df['NumberOfTTBeforeBirth'] = BRGE_df['m1'] 

print(BRGE_df['NumberOfTTBeforeBirth'].value_counts(dropna=False))
print(BRGE_df['NumberOfTTBeforeBirth'].describe().round(2))

BRGE_df['NeonatalTetanusProtection(2+)'] = np.where(BRGE_df['NumberOfTTBeforeBirth'] >= 2, 1, 
                                                np.where(BRGE_df['NumberOfTTBeforeBirth'] < 2, 0, np.nan))

BRGE_df['NeonatalTetanusProtection(2+)'] = BRGE_df['NeonatalTetanusProtection(2+)'].astype('Int64')

print(BRGE_df['NeonatalTetanusProtection(2+)'].value_counts(dropna=False))
print((BRGE_df['NeonatalTetanusProtection(2+)'].value_counts(dropna=False, normalize=True) * 100).round(2))


NumberOfTTBeforeBirth
NaN    97244
2      73261
1      61968
0      53756
3      30296
4       4623
5       3140
6        352
7        201
Name: count, dtype: int64
count     227597
unique         8
top            2
freq       73261
Name: NumberOfTTBeforeBirth, dtype: int64
NeonatalTetanusProtection(2+)
0       115724
1       111873
<NA>     97244
Name: count, dtype: Int64
NeonatalTetanusProtection(2+)
0       35.62
1       34.44
<NA>    29.94
Name: proportion, dtype: Float64


In [11]:
# during pregnancy, given or bought iron tablets/syrup
BRGE_df['m45'] = BRGE_df['m45'].replace(8, np.nan)
BRGE_df['IronPillsDuringPregnancy'] = BRGE_df['m45']
BRGE_df['IronPillsDuringPregnancy'] = BRGE_df['IronPillsDuringPregnancy'].astype('Int64')

print(BRGE_df['IronPillsDuringPregnancy'].value_counts(dropna=False))
print((BRGE_df['IronPillsDuringPregnancy'].value_counts(dropna=False, normalize=True) * 100).round(2))


IronPillsDuringPregnancy
1       181075
<NA>     94849
0        48917
Name: count, dtype: Int64
IronPillsDuringPregnancy
1       55.74
<NA>     29.2
0       15.06
Name: proportion, dtype: Float64


In [12]:
# Baby postnatal check within 2 months - for the last birth
BRGE_df['m70'] = BRGE_df['m70'].replace(8, np.nan) # don't know
BRGE_df['m70'] = BRGE_df['m70'].replace(3, 0) #child dies in hospital

BRGE_df['BabyPostnatalCheck'] = BRGE_df['m70']
BRGE_df['BabyPostnatalCheck'] = BRGE_df['BabyPostnatalCheck'].astype('Int64')
print(BRGE_df['BabyPostnatalCheck'].value_counts(dropna=False))
print((BRGE_df['BabyPostnatalCheck'].value_counts(dropna=False, normalize=True) * 100).round(2))


BabyPostnatalCheck
0       154789
<NA>     95020
1        75032
Name: count, dtype: Int64
BabyPostnatalCheck
0       47.65
<NA>    29.25
1        23.1
Name: proportion, dtype: Float64


In [13]:
# Postpartum check-up of mother after discharge or delivery at home (m66)
BRGE_df['m66'] = BRGE_df['m66'].replace(8, np.nan) # don't know

BRGE_df['MaternalPostpartumHealthCheck'] = BRGE_df['m66']
BRGE_df['MaternalPostpartumHealthCheck'] = BRGE_df['MaternalPostpartumHealthCheck'].astype('Int64')
print(BRGE_df['MaternalPostpartumHealthCheck'].value_counts(dropna=False))
print((BRGE_df['MaternalPostpartumHealthCheck'].value_counts(dropna=False, normalize=True) * 100).round(2))


MaternalPostpartumHealthCheck
0       164329
<NA>    105146
1        55366
Name: count, dtype: Int64
MaternalPostpartumHealthCheck
0       50.59
<NA>    32.37
1       17.04
Name: proportion, dtype: Float64


In [14]:
# Met need for family planning
BRGE_df['v626a'] = BRGE_df['v626a'].replace({0: np.nan, 1: 0, 2: 0, 3: 1, 4: 1, 7: 2, 8: 2, 9: 2})

BRGE_df['MetNeedFamilyPlanning'] = BRGE_df['v626a']
BRGE_df['MetNeedFamilyPlanning'] = BRGE_df['MetNeedFamilyPlanning'].astype('Int64')
print(BRGE_df['MetNeedFamilyPlanning'].value_counts(dropna=False))
print((BRGE_df['MetNeedFamilyPlanning'].value_counts(dropna=False, normalize=True) * 100).round(2))

MetNeedFamilyPlanning
2       134997
0        92754
1        91308
<NA>      5782
Name: count, dtype: Int64
MetNeedFamilyPlanning
2       41.56
0       28.55
1       28.11
<NA>     1.78
Name: proportion, dtype: Float64


In [15]:
# type of cooking fuel
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8394069/
# Change 5 to clean, and 6 to clean - BF, CI, GH, TZ
# Change 14 and 12 to clean - KE
# Change 14 to clean - ZA

BRGE_df['v161'] = BRGE_df['v161'].replace(1, "clean") # electricity
BRGE_df['v161'] = BRGE_df['v161'].replace(2, "clean") # lpg # solar
BRGE_df['v161'] = BRGE_df['v161'].replace(3, "clean") # natural gas # lpg
BRGE_df['v161'] = BRGE_df['v161'].replace(4, "clean") # biogas # piped natural gas

# Replace 5 and 6 with "clean" or the specified countries : biogas, alcohol/ethanol
BRGE_df.loc[BRGE_df['country'].isin(['BF', 'CI', 'GH', 'TZ']), 'v161'] = BRGE_df.loc[BRGE_df['country'].isin(['BF', 'CI', 'GH', 'TZ']), 'v161'].replace({5: "clean", 6: "clean"})
# Replace 5 and 6 with "solid" or the specified countries : petroleum/kerosene, coal, lignite
BRGE_df.loc[~BRGE_df['country'].isin(['BF', 'CI', 'GH', 'TZ']), 'v161'] = BRGE_df.loc[~BRGE_df['country'].isin(['BF', 'CI', 'GH', 'TZ']), 'v161'].replace({5: "solid", 6: "solid"})

BRGE_df['v161'] = BRGE_df['v161'].replace(7, "solid") # charcoal # gasoline/diesel
BRGE_df['v161'] = BRGE_df['v161'].replace(8, "solid") # wood # kerosene/paraffin
BRGE_df['v161'] = BRGE_df['v161'].replace(9, "solid") # straw/shrub/grass # coal/lignite
BRGE_df['v161'] = BRGE_df['v161'].replace(10, "solid") # agricultural crops # charcoal
BRGE_df['v161'] = BRGE_df['v161'].replace(11, "solid") # animal dung/waste # wood # sawdust

# Replace 12 and 14 with "clean" or the specified countries : alcohol/ethanol, solar power
BRGE_df.loc[BRGE_df['country'].isin(['KE']), 'v161'] = BRGE_df.loc[BRGE_df['country'].isin(['KE']), 'v161'].replace({12: "clean", 14: "clean"})
# Replace 12 and 14 with "solid" or the specified countries : cardboard/paper # straw/shrubs/grass # briquette # electricity from generator, animal dung/waste 
BRGE_df.loc[~BRGE_df['country'].isin(['KE']), 'v161'] = BRGE_df.loc[~BRGE_df['country'].isin(['KE']), 'v161'].replace({12: "solid", 14: "solid"})

# Replace 14 with "clean" or the specified countries : solar power
BRGE_df.loc[BRGE_df['country'].isin(['ZA']), 'v161'] = BRGE_df.loc[BRGE_df['country'].isin(['ZA']), 'v161'].replace({14: "clean"})
# Replace 12 and 14 with "solid" or the specified countries : animal dung/waste 
BRGE_df.loc[~BRGE_df['country'].isin(['ZA']), 'v161'] = BRGE_df.loc[~BRGE_df['country'].isin(['ZA']), 'v161'].replace({14: "solid"})

BRGE_df['v161'] = BRGE_df['v161'].replace(13, "solid") # agricultural crop # saw dust # electricity from other source
BRGE_df['v161'] = BRGE_df['v161'].replace(15, "solid") # processed biomass (pellets) or woodchips
BRGE_df['v161'] = BRGE_df['v161'].replace(16, "solid") # garbage/plastic
BRGE_df['v161'] = BRGE_df['v161'].replace(17, "solid") # sawdust
BRGE_df['v161'] = BRGE_df['v161'].replace(95, "solid") # no food cooked in house
BRGE_df['v161'] = BRGE_df['v161'].replace(96, "solid") # other
BRGE_df['v161'] = BRGE_df['v161'].replace(97, np.nan) # not a dejure resident

BRGE_df['CleanCookingFuel'] = BRGE_df['v161']

BRGE_df['CleanCookingFuel'] = BRGE_df['CleanCookingFuel'].replace({'clean': 1, 'solid': 0})
BRGE_df['CleanCookingFuel'] = BRGE_df['CleanCookingFuel'].astype('Int64')
print(BRGE_df['CleanCookingFuel'].value_counts(dropna=False))
print((BRGE_df['CleanCookingFuel'].value_counts(dropna=False, normalize=True) * 100).round(2))


CleanCookingFuel
0       286176
1        32331
<NA>      6334
Name: count, dtype: Int64
CleanCookingFuel
0       88.1
1       9.95
<NA>    1.95
Name: proportion, dtype: Float64


In [16]:
# source of drinking water
BRGE_df['v113'] = BRGE_df['v113'].replace(10, 'Protected') ## piped water
BRGE_df['v113'] = BRGE_df['v113'].replace(11, 'Protected') ## piped into dwelling
BRGE_df['v113'] = BRGE_df['v113'].replace(12, 'Protected') ## piped to yard/plot
BRGE_df['v113'] = BRGE_df['v113'].replace(13, 'Protected') ## piped to neighbour
BRGE_df['v113'] = BRGE_df['v113'].replace(14, 'Protected') ## public tap/standpipe
BRGE_df['v113'] = BRGE_df['v113'].replace(15, 'Protected') ## standpipe
BRGE_df['v113'] = BRGE_df['v113'].replace(16, 'Protected') ## autonomous water station
BRGE_df['v113'] = BRGE_df['v113'].replace(20, 'Protected') ## tube well water
BRGE_df['v113'] = BRGE_df['v113'].replace(21, 'Protected') ## tube well or borehole
BRGE_df['v113'] = BRGE_df['v113'].replace(22, 'Protected') ## borehole
BRGE_df['v113'] = BRGE_df['v113'].replace(33, 'Protected') ## borehole with pump
BRGE_df['v113'] = BRGE_df['v113'].replace(30, 'Protected') ## dug well (open/protected)
BRGE_df['v113'] = BRGE_df['v113'].replace(31, 'Protected') ## protected well
BRGE_df['v113'] = BRGE_df['v113'].replace(32, 'Unprotected') ## unprotected well
BRGE_df['v113'] = BRGE_df['v113'].replace(40, 'Unprotected') ## surface from spring
BRGE_df['v113'] = BRGE_df['v113'].replace(41, 'Protected') ## protected spring
BRGE_df['v113'] = BRGE_df['v113'].replace(42, 'Unprotected') ## unprotected spring
BRGE_df['v113'] = BRGE_df['v113'].replace(43, 'Unprotected') ## river/dam/lake/ponds/stream/canal/irrigation channel
BRGE_df['v113'] = BRGE_df['v113'].replace(51, 'Unprotected') ## rainwater
BRGE_df['v113'] = BRGE_df['v113'].replace(61, 'Protected') ## tanker truck
BRGE_df['v113'] = BRGE_df['v113'].replace(62, 'Protected') ## cart with small tank
BRGE_df['v113'] = BRGE_df['v113'].replace(63, 'Protected') ## bicycle with jerrycans
BRGE_df['v113'] = BRGE_df['v113'].replace(71, 'Protected') ## bottled water
BRGE_df['v113'] = BRGE_df['v113'].replace(72, 'Unprotected') ## sachet water
BRGE_df['v113'] = BRGE_df['v113'].replace(92, 'Unprotected') ## sachet water
BRGE_df['v113'] = BRGE_df['v113'].replace(82, 'Unprotected') ## sachet water
BRGE_df['v113'] = BRGE_df['v113'].replace(73, 'Protected') ## water kiosk/domestic points
BRGE_df['v113'] = BRGE_df['v113'].replace(96, 'Unprotected') ## other
BRGE_df['v113'] = BRGE_df['v113'].replace(97, np.nan) ## not a dejure resident

BRGE_df['DrinkingWaterSource'] = BRGE_df['v113']

BRGE_df['ProtectedDrinkingWaterSource'] = np.where(BRGE_df['DrinkingWaterSource'] == "Unprotected", 0, 
                                    np.where(BRGE_df['DrinkingWaterSource'].isna(), np.nan, 1))

BRGE_df['ProtectedDrinkingWaterSource'] = BRGE_df['ProtectedDrinkingWaterSource'].astype('Int64')

print(BRGE_df['ProtectedDrinkingWaterSource'].value_counts(dropna=False))
print((BRGE_df['ProtectedDrinkingWaterSource'].value_counts(dropna=False, normalize=True) * 100).round(2))


ProtectedDrinkingWaterSource
1       213136
0       105358
<NA>      6347
Name: count, dtype: Int64
ProtectedDrinkingWaterSource
1       65.61
0       32.43
<NA>     1.95
Name: proportion, dtype: Float64


In [17]:
# type of toilet facility
BRGE_df['v116'] = BRGE_df['v116'].replace(10, 'Improved') ## flush toilet
BRGE_df['v116'] = BRGE_df['v116'].replace(11, 'Improved') ## flush to piped sewer system / indoors: flush to piped public system
BRGE_df['v116'] = BRGE_df['v116'].replace(12, 'Improved') ## flush to septic tank / indoors: flush to septic tank
# Replace 'v116' 13 and 14 values where 'country' column is "AO"
BRGE_df.loc[BRGE_df['country'] == "AO", 'v116'] = BRGE_df.loc[BRGE_df['country'] == "AO", 'v116'].replace(13, 'Unimproved') #indoors: flush to open pit (ditch or river)
BRGE_df.loc[BRGE_df['country'] == "AO", 'v116'] = BRGE_df.loc[BRGE_df['country'] == "AO", 'v116'].replace(14, 'Improved')   #iinside yard: flush to piped public system
# Replace 'v116' 13 and 14 values where 'country' column is not "AO"
BRGE_df.loc[BRGE_df['country'] != "AO", 'v116'] = BRGE_df.loc[BRGE_df['country'] != "AO", 'v116'].replace(13, 'Improved')   #flush to pit latrine
BRGE_df.loc[BRGE_df['country'] != "AO", 'v116'] = BRGE_df.loc[BRGE_df['country'] != "AO", 'v116'].replace(14, 'Unimproved')  #flush to somewhere else
BRGE_df['v116'] = BRGE_df['v116'].replace(15, 'Improved') ## flush, don't know where / inside yard: flush to septic tank
BRGE_df['v116'] = BRGE_df['v116'].replace(16, 'Unimproved') ## inside yard: flush to open pit (ditch or river)
BRGE_df['v116'] = BRGE_df['v116'].replace(17, 'Improved') ## out of yard: flush to piped public system
BRGE_df['v116'] = BRGE_df['v116'].replace(18, 'Improved') ## out of yard: flush to septic tank
BRGE_df['v116'] = BRGE_df['v116'].replace(19, 'Unimproved') ## out of yard: flush to open pit (ditch or river)
BRGE_df['v116'] = BRGE_df['v116'].replace(20, 'Improved') ## pit toilet latrine
BRGE_df['v116'] = BRGE_df['v116'].replace(21, 'Improved') ## ventilated improved pit latrine (vip) / indoors: latrine to piped public system
BRGE_df['v116'] = BRGE_df['v116'].replace(22, 'Improved') ## pit latrine with slab / indoors: latrine to septic tank
BRGE_df['v116'] = BRGE_df['v116'].replace(23, 'Unimproved') ## pit latrine without slab/open pit
BRGE_df['v116'] = BRGE_df['v116'].replace(24, 'Improved') ## inside yard: latrine to piped public system
BRGE_df['v116'] = BRGE_df['v116'].replace(25, 'Improved') ## inside yard: latrine to septic tank
BRGE_df['v116'] = BRGE_df['v116'].replace(26, 'Unimproved') ## inside yard: latrine to open pit (ditch or river)
BRGE_df['v116'] = BRGE_df['v116'].replace(27, 'Improved') ## out of yard: latrine to piped public system
BRGE_df['v116'] = BRGE_df['v116'].replace(28, 'Improved') ## out of yard: latrine to septic tank
BRGE_df['v116'] = BRGE_df['v116'].replace(29, 'Unimproved') ## out of yard: latrine to open pit (ditch or river)
BRGE_df['v116'] = BRGE_df['v116'].replace(30, 'Unimproved') ## no facility
BRGE_df['v116'] = BRGE_df['v116'].replace(31, 'Unimproved') ## no facility/bush/field
BRGE_df['v116'] = BRGE_df['v116'].replace(41, 'Improved') ## composting toilet/ecosan
BRGE_df['v116'] = BRGE_df['v116'].replace(42, 'Unimproved') ## bucket toilet / bucket/potty/other container
BRGE_df['v116'] = BRGE_df['v116'].replace(43, 'Unimproved') ## hanging toilet/latrine
BRGE_df['v116'] = BRGE_df['v116'].replace(44, 'Improved') ## chemical toilet
BRGE_df['v116'] = BRGE_df['v116'].replace(96, 'Unimproved') ## other
BRGE_df['v116'] = BRGE_df['v116'].replace(97, np.nan) ## not a dejure resident

BRGE_df['TypeOfToiletFacility'] = BRGE_df['v116']

BRGE_df['ImprovedToiletFacility'] = np.where(BRGE_df['TypeOfToiletFacility'] == "Unimproved", 0, 
                                    np.where(BRGE_df['TypeOfToiletFacility'].isna(), np.nan, 1))

BRGE_df['ImprovedToiletFacility'] = BRGE_df['ImprovedToiletFacility'].astype('Int64')

print(BRGE_df['ImprovedToiletFacility'].value_counts(dropna=False))
print((BRGE_df['ImprovedToiletFacility'].value_counts(dropna=False, normalize=True) * 100).round(2))



ImprovedToiletFacility
0       170667
1       147784
<NA>      6390
Name: count, dtype: Int64
ImprovedToiletFacility
0       52.54
1       45.49
<NA>     1.97
Name: proportion, dtype: Float64


In [18]:
## Mother smokes tobacco
BRGE_df['v463a'] = BRGE_df['v463a'].replace(np.nan, 0)
BRGE_df['v463b'] = BRGE_df['v463b'].replace(np.nan, 0)
BRGE_df['v463c'] = BRGE_df['v463c'].replace(np.nan, 0)
BRGE_df['v463d'] = BRGE_df['v463d'].replace(np.nan, 0)
BRGE_df['v463e'] = BRGE_df['v463e'].replace(np.nan, 0)
BRGE_df['v463f'] = BRGE_df['v463f'].replace(np.nan, 0)
BRGE_df['v463g'] = BRGE_df['v463g'].replace(np.nan, 0)
BRGE_df['v463x'] = BRGE_df['v463x'].replace(np.nan, 0)
 
tobacco_vars = ['v463a', 'v463b', 'v463c', 'v463d', 'v463e', 'v463f', 'v463g', 'v463x']
BRGE_df['MaternalTobaccoUse'] = BRGE_df[tobacco_vars].any(axis=1).astype(int)
print(BRGE_df['MaternalTobaccoUse'].value_counts(dropna=False))
print((BRGE_df['MaternalTobaccoUse'].value_counts(dropna=False, normalize=True) * 100).round(2))

BRGE_df['NonTobaccoSmoker'] = 1 - BRGE_df['MaternalTobaccoUse']
print(BRGE_df['NonTobaccoSmoker'].value_counts(dropna=False))
print((BRGE_df['NonTobaccoSmoker'].value_counts(dropna=False, normalize=True) * 100).round(2))

MaternalTobaccoUse
0    319655
1      5186
Name: count, dtype: int64
MaternalTobaccoUse
0    98.4
1     1.6
Name: proportion, dtype: float64
NonTobaccoSmoker
1    319655
0      5186
Name: count, dtype: int64
NonTobaccoSmoker
1    98.4
0     1.6
Name: proportion, dtype: float64


## Social Economic Status

### Community characteristics

In [19]:
# v025 # Type of place of residence # dummycode # 0: rural, 1: urban
BRGE_df['v025'] = BRGE_df['v025'].replace({2:0, 1:1})
BRGE_df['UrbanResidence'] = BRGE_df['v025']

print(BRGE_df['UrbanResidence'].value_counts(dropna=False))
print((BRGE_df['UrbanResidence'].value_counts(dropna=False, normalize=True) * 100).round(2))


UrbanResidence
0    223570
1    101271
Name: count, dtype: int64
UrbanResidence
0    68.82
1    31.18
Name: proportion, dtype: float64


### Household characteristics

In [20]:
# wealth index - categorical variable
BRGE_df['v190'] = BRGE_df['v190'].replace(1, 'Low') # Lowest
BRGE_df['v190'] = BRGE_df['v190'].replace(2, 'Low') # Second
BRGE_df['v190'] = BRGE_df['v190'].replace(3, 'Middle') # Middle
BRGE_df['v190'] = BRGE_df['v190'].replace(4, 'High') # Fourth
BRGE_df['v190'] = BRGE_df['v190'].replace(5, 'High') # Highest
BRGE_df['WealthIndex'] = BRGE_df['v190']

print(BRGE_df['WealthIndex'].value_counts(dropna=False))
print((BRGE_df['WealthIndex'].value_counts(dropna=False, normalize=True) * 100).round(2))

# wealth index factor score combined 
BRGE_df.rename(columns={'v191': 'WealthIndexFactorScore'}, inplace=True)
print(BRGE_df['WealthIndexFactorScore'].value_counts(dropna=False))


WealthIndex
Low       154051
High      105218
Middle     65572
Name: count, dtype: int64
WealthIndex
Low       47.42
High      32.39
Middle    20.19
Name: proportion, dtype: float64
WealthIndexFactorScore
-87520      22
 118832     21
-80220      21
 7928       21
-87058      20
-16161      20
-79142      19
 17866      19
-30755      18
 78242      18
-88270      17
-74668      17
 57420      17
-38503      17
-31625      16
-90814      16
-82594      16
-60724      15
-68542      15
 16288      15
-86122      15
-107154     15
 83882      15
-55074      15
-61992      15
-58225      15
-59095      15
-69177      15
-54125      15
 129130     14
-94308      14
-20220      14
-75612      14
-38170      14
-81120      14
-52061      14
-76203      14
-66243      14
-5069       14
-45181      14
-60439      14
-104992     14
-59725      14
 48100      14
 86687      14
-69881      14
-61675      14
-58968      13
-61021      13
-64204      13
-68632      13
-91522      13
-50462      13


### Individual: maternal

In [21]:
# Mother's employment status - Whether the respondent is currently working
BRGE_df['MothersEmploymentStatus'] = BRGE_df['v714']
BRGE_df['MothersEmploymentStatus'] = BRGE_df['MothersEmploymentStatus'].astype('Int64')

print(BRGE_df['MothersEmploymentStatus'].value_counts(dropna=False))
print((BRGE_df['MothersEmploymentStatus'].value_counts(dropna=False, normalize=True) * 100).round(2))


MothersEmploymentStatus
1       198377
0       126150
<NA>       314
Name: count, dtype: Int64
MothersEmploymentStatus
1       61.07
0       38.83
<NA>      0.1
Name: proportion, dtype: Float64


In [22]:
# v106 # Mother's Highest educational level # 6 NaN
BRGE_df['v106'] = BRGE_df['v106'].replace(0, 'No Education') # 'No Education'
BRGE_df['v106'] = BRGE_df['v106'].replace(1, 'Primary') # 'Primary'
BRGE_df['v106'] = BRGE_df['v106'].replace(2, 'Secondary') # 'Secondary'
BRGE_df['v106'] = BRGE_df['v106'].replace(3, 'Higher') # 'Higher'
BRGE_df['v106'] = BRGE_df['v106'].replace(6, np.nan)
BRGE_df['MothersEducationalLevel'] = BRGE_df['v106']

print(BRGE_df['MothersEducationalLevel'].value_counts(dropna=False))
print((BRGE_df['MothersEducationalLevel'].value_counts(dropna=False, normalize=True) * 100).round(2))


MothersEducationalLevel
No Education    125146
Primary         107303
Secondary        80814
Higher           11576
NaN                  2
Name: count, dtype: int64
MothersEducationalLevel
No Education    38.53
Primary         33.03
Secondary       24.88
Higher           3.56
NaN              0.00
Name: proportion, dtype: float64


## Demographics

### Individual: maternal 

In [23]:
# v012 # Mother's Current age
BRGE_df['MothersCurrentAge'] = BRGE_df['v012']
print(BRGE_df['MothersCurrentAge'].value_counts(dropna=False))
print(BRGE_df['MothersCurrentAge'].describe().round(2))

MothersCurrentAge
25    22417
30    21205
28    18008
27    16635
22    15896
26    15864
20    15827
23    15437
35    14924
24    14757
32    13581
29    13318
21    11954
31    11393
33    11075
34    10082
36     9326
38     8728
19     8460
37     8454
40     8197
18     7179
39     6312
42     4161
41     4153
17     3329
43     3040
45     2932
44     2188
16     1587
46     1368
47     1014
48      885
15      611
49      544
Name: count, dtype: int64
count    324841.00
mean         28.98
std           6.97
min          15.00
25%          24.00
50%          28.00
75%          34.00
max          49.00
Name: MothersCurrentAge, dtype: float64


In [24]:
# mother's marital status
BRGE_df['v501'] = BRGE_df['v501'].replace(0, 'never in union')
BRGE_df['v501'] = BRGE_df['v501'].replace(1, 'married/living with partner')
BRGE_df['v501'] = BRGE_df['v501'].replace(2, 'married/living with partner')
BRGE_df['v501'] = BRGE_df['v501'].replace(3, 'widowed/divorced/no longer living together/separated')
BRGE_df['v501'] = BRGE_df['v501'].replace(4, 'widowed/divorced/no longer living together/separated')
BRGE_df['v501'] = BRGE_df['v501'].replace(5, 'widowed/divorced/no longer living together/separated')
BRGE_df['MaritalStatus'] = BRGE_df['v501']

print(BRGE_df['MaritalStatus'].value_counts(dropna=False))
print((BRGE_df['MaritalStatus'].value_counts(dropna=False, normalize=True) * 100).round(2))


MaritalStatus
married/living with partner                             282113
widowed/divorced/no longer living together/separated     21941
never in union                                           20787
Name: count, dtype: int64
MaritalStatus
married/living with partner                             86.85
widowed/divorced/no longer living together/separated     6.75
never in union                                           6.40
Name: proportion, dtype: float64


In [25]:
# Age of respondent at first birth
BRGE_df['AgeAtFirstBirth'] = BRGE_df['v212']
BRGE_df['AgeAtFirstBirth'] = pd.to_numeric(BRGE_df['AgeAtFirstBirth'], errors='coerce')
print(BRGE_df['AgeAtFirstBirth'].value_counts(dropna=False))
print(BRGE_df['AgeAtFirstBirth'].describe().round(2))


AgeAtFirstBirth
18    41000
17    40082
19    38209
20    32136
16    32131
21    24735
15    21639
22    19177
23    14187
14    11426
24    10818
25     8244
26     6001
13     5144
27     4638
28     3207
12     2778
29     2558
30     1852
31     1397
32      968
33      634
34      509
35      349
36      249
11      174
37      166
38      114
10       76
39       66
40       51
41       37
44       23
42       20
43       19
45       10
9         5
48        3
8         3
46        2
47        2
6         2
Name: count, dtype: int64
count    324841.00
mean         19.33
std           3.86
min           6.00
25%          17.00
50%          19.00
75%          21.00
max          48.00
Name: AgeAtFirstBirth, dtype: float64


In [26]:
# age at first marriage
BRGE_df['AgeAtFirstMarriage'] = BRGE_df['v511']
BRGE_df['AgeAtFirstMarriage'] = pd.to_numeric(BRGE_df['AgeAtFirstMarriage'], errors='coerce')
print(BRGE_df['AgeAtFirstMarriage'].value_counts(dropna=False))
print(BRGE_df['AgeAtFirstMarriage'].describe().round(2))

## early marriage
BRGE_df['OfAgeMarriage'] = np.where(BRGE_df['v511'] >= 18, 1, 
                                    np.where(BRGE_df['v511'] < 18, 0, np.nan))
BRGE_df['OfAgeMarriage'] = BRGE_df['OfAgeMarriage'].astype('Int64')
print(BRGE_df['OfAgeMarriage'].value_counts(dropna=False))
print((BRGE_df['OfAgeMarriage'].value_counts(dropna=False, normalize=True) * 100).round(2))


AgeAtFirstMarriage
17.0    34486
16.0    33581
18.0    32379
15.0    31589
19.0    27038
20.0    22994
14.0    22037
NaN     20787
21.0    16793
22.0    13606
13.0    12416
23.0    10478
24.0     8120
25.0     6448
12.0     6442
26.0     4878
27.0     3815
11.0     3225
28.0     2818
29.0     2238
10.0     2059
30.0     1796
31.0     1265
32.0      944
33.0      649
34.0      529
35.0      372
36.0      276
38.0      170
37.0      164
9.0        89
39.0       87
40.0       57
8.0        56
41.0       45
42.0       31
43.0       25
7.0        20
44.0       14
6.0         8
45.0        7
46.0        4
4.0         3
47.0        2
5.0         1
Name: count, dtype: int64
count    304054.00
mean         18.39
std           4.27
min           4.00
25%          15.00
50%          18.00
75%          20.00
max          47.00
Name: AgeAtFirstMarriage, dtype: float64
OfAgeMarriage
1       158042
0       146012
<NA>     20787
Name: count, dtype: Int64
OfAgeMarriage
1       48.65
0       44.95
<NA> 

### Household

### Mother's individual health

In [27]:
# total children ever born - parity
BRGE_df['MaternalParity'] = BRGE_df['v201']
print(BRGE_df['MaternalParity'].value_counts(dropna=False))
print(BRGE_df['MaternalParity'].describe().round(2))


MaternalParity
2     63218
3     56198
1     49455
4     44986
5     35383
6     26475
7     19338
8     13029
9      7743
10     4856
11     2381
12     1155
13      399
14      143
15       48
16       28
17        4
18        1
21        1
Name: count, dtype: int64
count    324841.00
mean          3.90
std           2.41
min           1.00
25%           2.00
50%           3.00
75%           5.00
max          21.00
Name: MaternalParity, dtype: float64


## Women's empowerment

In [28]:

# participation of mothers in the decision of their health (v) or making large household purchases (v743b) is a proxy for mother's empowerment
BRGE_df['v743a'] = BRGE_df['v743a'].replace({1: 2, 2: 1, 3: 1, 4: 0, 5: 0, 6: 0})
BRGE_df['v743b'] = BRGE_df['v743b'].replace({1: 2, 2: 1, 3: 1, 4: 0, 5: 0, 6: 0})

BRGE_df['DecisionMaking_Participation'] = BRGE_df[['v743a', 'v743b']].sum(axis=1).apply(lambda x: 1 if x > 0 else 0)
print(BRGE_df['DecisionMaking_Participation'].value_counts(dropna=False))
print((BRGE_df['DecisionMaking_Participation'].value_counts(dropna=False, normalize=True) * 100).round(2))


DecisionMaking_Participation
1    179773
0    145068
Name: count, dtype: int64
DecisionMaking_Participation
1    55.34
0    44.66
Name: proportion, dtype: float64


## Geospatial features

In [29]:
# LATNUM # 
BRGE_df.rename(columns={'LATNUM': 'Cluster\'s latitude coordinate'}, inplace=True)

# LONGNUM # Current age
BRGE_df.rename(columns={'LONGNUM': 'Cluster\'s longitude coordinate'}, inplace=True)


## Creating dataset

In [30]:
BRGE_df['ChildDied'].value_counts()

ChildDied
0    304889
1     19952
Name: count, dtype: int64

In [31]:

CD_dataset = BRGE_df[['midx','country','Cluster\'s latitude coordinate', 'Cluster\'s longitude coordinate',
                      'ChildDied', 'AgeOfChildAtDeath', 'ChildGender_Male', 'PreceedingBirthInterval(33+)', 'WasBreastfed',
                      'HealthFacilityDelivery', 'SkilledDeliveryCareProvider', 'AntenatalCare(4+)', 'NeonatalTetanusProtection(2+)',
                      'IronPillsDuringPregnancy', 'BabyPostnatalCheck', 'MaternalPostpartumHealthCheck', 'MetNeedFamilyPlanning', 'CleanCookingFuel', 
                      'ProtectedDrinkingWaterSource', 'ImprovedToiletFacility', 'NonTobaccoSmoker', 'OfAgeMarriage',
                      'UrbanResidence', 'WealthIndex', 'MothersEducationalLevel', 'MothersEmploymentStatus','MothersCurrentAge', 
                      'MaritalStatus', 'AgeAtFirstBirth', 'MaternalParity', 'DecisionMaking_Participation']]


In [32]:

CD_dataset.shape


(324841, 31)

In [33]:
# pickling the BRGE_modified file
f = open('Processed_Datasets/CD_cleaned.pkl', 'wb')
pickle.dump(CD_dataset, f)
f.close()

# End