# Data Understanding: SDI 2014 Tanzania – Facility-Level Data

This notebook explores the facility-level dataset from the 2014 Tanzania Service Delivery Indicators (SDI) Health Survey.
The objective is to understand the structure, key variables, and data quality before cleaning and analysis.

This dataset will later be used to analyze facility characteristics associated with potential delays in healthcare service delivery.


In [20]:
import pandas as pd
import numpy as np
import os

# Folder that receives the processed outputs; create it up front
# (exist_ok=True makes this a no-op when the folder is already there).
processed_dir = r"C:\Users\Students\Desktop\flax_projects\SDI_Tanzania_Healthcare_Delays\data\processed"
os.makedirs(name=processed_dir, exist_ok=True)

# Raise pandas' display limits to 100 columns and 100 rows.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 100)

In [21]:
# Read the facility-level CSV into a DataFrame.
facility_path = r"C:\Users\Students\Pictures\Screenshots\flax_datasets\TZA_2014_SDI-H_v01_M_v02_A_PUF_CSV\TZA_2014_SDI-H_v01_M_v02_A_PUF_CSV\facility_level.csv"
facility_df = pd.read_csv(filepath_or_buffer=facility_path)

# Preview the first five rows.
facility_df.head(n=5)


Unnamed: 0,facility_id,country,year,ruralurban,publicprivate,m0s0q0,facility_level,transport_central_a,transport_central_hr,transport_central_mn,hours_opt_mn,num_outpatient,has_inpatient,num_inpatient,beds_total,has_births,has_maternity_a,has_maternity_b,has_antibiotics,has_ocytocics,has_anticonvuls,has_placenta,has_poc,has_delivery,has_resusc,has_bloodtrans,has_csection,num_births,transfers_csection,power_a,power_b,power_d,power_c,power_e,power_f,power_g,power_h,power_i,water_a,water_b,water_c,water_d,water_e,water_f,water_g,toilet_opt_a,toilet_opt_d,toilet_opt_e,toilet_ipt_a,toilet_ipt_d,...,zinctabs_a,zinctabs_b,act_a,act_b,artesunate_a,artesunate_b,albendazole_a,albendazole_b,mebendazole_a,mebendazole_b,treat_guidelines,treat_observed,has_gloves,has_mcondoms,has_fcondoms,rdt_a,hivtest_a,glucometer_a,tbtest_a,has_bednet,has_tapemeasure,has_length,does_vaccines,vaccine_store1,has_fridge,fridge_temp,fridge_tempmon,fridge_power,measles_vac,polio_vac,dpt_vac,pentavalent_vac,pneumococ_vac,bcg_vac,hepb_vac,tetanus_vac,has_dispsyringe,has_vaccarrier,has_autosyringe,has_icepacks,has_sharps,vaccine_store2,final_wt,ipw,abs_prob,abs_wt,vign_prob,vign_wt,has_weights,has_facility
0,1,TANZANIA,2014,3,1,1,3.0,1,2,30,30,2.0,0,,,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1,,,,,,,,,2,0.0,,,,0,5,6,2.0,0.0,1,,...,1,1.0,1,1.0,5,,1,1.0,1,1.0,1.0,1.0,1,0,0,1,1,0,0,0,1,0,1,1.0,3.0,4.444445,2.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,0.0,44.826572,44.826572,1.0,44.826572,1.0,44.826572,1,1
1,10,TANZANIA,2014,2,1,1,2.0,1,3,0,0,2.0,1,2.0,1.0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,2,1.0,96.0,30.0,10.0,3.0,1.0,1.0,1.0,2,0.0,,,,0,0,9,2.0,0.0,9,1.0,...,4,,1,1.0,1,1.0,1,1.0,1,1.0,1.0,1.0,1,1,1,1,1,1,1,1,1,1,1,1.0,3.0,6.111111,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,2.0,2.0,0.0,7.443255,7.443255,1.0,7.443255,0.5,14.886511,1,1
2,100,TANZANIA,2014,1,1,1,2.0,1,1,30,0,3.0,1,1.0,1.0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1,,,,,,,,,3,1.0,0.0,0.0,0.0,0,20,4,2.0,1.0,1,,...,1,1.0,1,1.0,5,,1,1.0,4,,1.0,1.0,1,1,1,1,1,1,1,1,1,1,1,1.0,3.0,8.888889,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,6.820906,6.820906,0.909091,7.502997,0.333333,20.462719,1,1
3,101,TANZANIA,2014,1,3,1,3.0,1,2,0,0,2.0,1,,1.0,1,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2,0.0,,,,1.0,1.0,1.0,1.0,3,1.0,72.0,0.0,3.0,1,0,4,1.0,1.0,10,1.0,...,1,1.0,4,,5,,1,1.0,1,1.0,1.0,0.0,1,0,0,1,1,1,0,1,1,1,1,1.0,3.0,11.111111,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,5.647542,5.647542,1.0,5.647542,0.5,11.295084,1,1
4,102,TANZANIA,2014,2,5,1,3.0,4,0,15,0,3.0,0,,,0,,,,,,,,,,,,,0.0,2,1.0,8.0,30.0,7.0,5.0,1.0,1.0,1.0,3,1.0,84.0,0.0,7.0,0,20,10,1.0,1.0,1,,...,1,1.0,1,1.0,5,,1,1.0,1,1.0,0.0,,1,1,0,1,1,1,0,1,0,0,1,1.0,3.0,6.666667,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,3.928568,3.928568,0.769231,5.107138,0.333333,11.785705,1,1


## Basic inspections

In [22]:
# Dimensions of the facility table as a (rows, columns) tuple.
facility_df.shape

(403, 296)

In [23]:
# Column labels of the facility table.
facility_df.columns

Index(['facility_id', 'country', 'year', 'ruralurban', 'publicprivate',
       'm0s0q0', 'facility_level', 'transport_central_a',
       'transport_central_hr', 'transport_central_mn',
       ...
       'has_sharps', 'vaccine_store2', 'final_wt', 'ipw', 'abs_prob', 'abs_wt',
       'vign_prob', 'vign_wt', 'has_weights', 'has_facility'],
      dtype='str', length=296)

In [24]:
# Print a summary of the frame: index range, column count, dtypes and memory usage.
facility_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Columns: 296 entries, facility_id to has_facility
dtypes: float64(206), int64(89), str(1)
memory usage: 932.1 KB


In [25]:
# Summary statistics for every column (numeric and non-numeric), transposed
# so that each variable occupies one row.
facility_df.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
facility_id,403.0,,,,242.327543,139.969709,1.0,114.5,251.0,364.5,536.0
country,403,1,TANZANIA,403,,,,,,,
year,403.0,,,,2014.0,0.0,2014.0,2014.0,2014.0,2014.0,2014.0
ruralurban,403.0,,,,1.550868,0.690568,1.0,1.0,1.0,2.0,3.0
publicprivate,403.0,,,,1.945409,1.486924,1.0,1.0,1.0,3.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...
abs_wt,401.0,,,,29.852601,71.612078,1.240716,7.906569,15.814888,27.664261,1207.66687
vign_prob,401.0,,,,0.533669,0.313078,0.005393,0.333333,0.5,1.0,1.0
vign_wt,401.0,,,,46.845903,90.12404,2.200269,13.397341,25.071518,47.433136,1307.132568
has_weights,403.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# Count rows that are exact repeats of an earlier row (all columns compared).
facility_df.duplicated(keep="first").sum()

np.int64(0)

## 🎯 What our project will actually measure

We will analyze potential treatment-delay risk, driven by:

- Facility type and ownership
- Rural vs. urban location
- Staffing and infrastructure availability
- Equipment and logistics readiness

In [27]:
# Fraction of missing values in each column, highest first.
missing = facility_df.isnull().mean().sort_values(ascending=False)

# Show the five columns with the largest share of missing values.
missing.head(5)

incinerator_c    1.0
incinerator_b    1.0
incinerator_a    1.0
nonelectric_c    1.0
thermometer_d    1.0
dtype: float64

In [28]:
# Frequency of each facility_level code; dropna=False keeps missing values
# as their own category in the tally.
facility_df.loc[:, 'facility_level'].value_counts(dropna=False)

facility_level
3.0    272
2.0     84
1.0     27
NaN     20
Name: count, dtype: int64

In [29]:
# Frequency of each publicprivate code, counting missing values too.
facility_df.loc[:, 'publicprivate'].value_counts(dropna=False)

publicprivate
1    269
5     62
3     59
2     12
4      1
Name: count, dtype: int64

In [30]:
# Frequency of each ruralurban code, counting missing values too.
facility_df.loc[:, 'ruralurban'].value_counts(dropna=False)

ruralurban
1    227
2    130
3     46
Name: count, dtype: int64

In [31]:
# Columns carried forward into the core facility table.
core_cols = [
    'facility_id', 'facility_level', 'publicprivate', 'ruralurban',
    'transport_central_a', 'transport_central_hr', 'transport_central_mn',
    'has_sharps', 'vaccine_store2',
]

# Take an independent copy of just those columns, then preview it.
facility_core = facility_df.loc[:, core_cols].copy()
facility_core.head()


Unnamed: 0,facility_id,facility_level,publicprivate,ruralurban,transport_central_a,transport_central_hr,transport_central_mn,has_sharps,vaccine_store2
0,1,3.0,1,3,1,2,30,2.0,0.0
1,10,2.0,1,2,1,3,0,2.0,0.0
2,100,2.0,1,1,1,1,30,2.0,1.0
3,101,3.0,3,1,1,2,0,2.0,1.0
4,102,3.0,5,2,4,0,15,2.0,1.0


## Initial Observations

The facility-level dataset contains a wide range of variables related to ownership, location, infrastructure, and logistics.
Initial exploration focuses on identifying core facility characteristics that may influence service readiness and potential delays in healthcare delivery.


In [44]:
# Write the core facility subset to the processed-data folder (no index column).
# Reuses `processed_dir` from the setup cell instead of repeating the absolute
# path, so the output location is defined in exactly one place and is the same
# folder that os.makedirs() already created.
facility_core.to_csv(os.path.join(processed_dir, 'facility_core.csv'), index=False)



In [45]:
# Load provider_facility.csv from the processed-data folder, addressed through
# `processed_dir` (setup cell) rather than a second hard-coded absolute path.
# NOTE(review): this file is not written anywhere in this notebook — presumably
# it is produced by a separate notebook that must be run first; confirm.
provider_facility = pd.read_csv(os.path.join(processed_dir, 'provider_facility.csv'))

In [47]:
# Combine the core facility columns with provider_facility on facility_id.
# how='inner' keeps only facility_ids that appear in both tables.
merged_facility = facility_core.merge(provider_facility, on='facility_id', how='inner')

# An inner join discards unmatched facilities without any message, so report
# how many core facilities had no partner row before the result is saved.
n_unmatched = (~facility_core['facility_id'].isin(provider_facility['facility_id'])).sum()
print(f"{n_unmatched} of {len(facility_core)} facilities have no match in provider_facility and were dropped")

# Save the merged table to the processed-data folder (no index column), using
# `processed_dir` from the setup cell instead of repeating the absolute path.
merged_facility.to_csv(os.path.join(processed_dir, 'facility_full.csv'), index=False)
