# Under-five mortality (DHS data)

## Resources

https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Early_Childhood_Mortality.htm

Under-5 Mortality Rate (U5MR)
The under-5 mortality rate is the probability (expressed as a rate per 1,000 live births) of a child exposed in a specific period dying before reaching the age of five years.


Coverage:
Population base: Live births to respondents (BR file)

Time period: Five-year or ten-year periods of time preceding the survey (v008-1 to v008-60 or v008-120 months), excluding the month of interview

Numerators:
Number of deaths to live-born children during specified age range and specified time period
Under-5 mortality: Deaths at ages 0 to 4 years, including deaths reported at ages 0 to 59 months and 0 to 99 days

Denominator: Number of surviving children at beginning of specified age range during the specified time period

Variables: BR file.

b3 Date of birth of child (CMC)

b5 Child is alive (1 = Yes, 0 = No)

b7 Age at death in months (imputed)

v008 Date of interview (CMC)

v005 Woman’s individual sample weight

## Preliminary steps

In [None]:
# Notebook setup: clean the session, define project paths, load helpers and
# packages, bind the OpenHEXA SDK and read the SNT configuration.

# Start from an empty global environment so objects left over from a previous
# run of the notebook cannot leak into this one.
rm(list = ls())

# Strongly penalise scientific notation when printing numbers
options(scipen=999)

# Global paths: data directories for PROJ and GDAL in the conda environment
Sys.setenv(PROJ_LIB = "/opt/conda/share/proj")
Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal")

# Paths
ROOT_PATH <- '~/workspace'
CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')
CODE_PATH <- file.path(ROOT_PATH, 'code')
DATA_PATH <- file.path(ROOT_PATH, 'data')
DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')
OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'mortality')

# Load the shared helper functions used throughout this notebook
source(file.path(CODE_PATH, "snt_utils.r"))

# List required packages
required_packages <- c("haven", "sf", "glue", "survey", "data.table", "stringi", "jsonlite", "httr", "reticulate", "arrow", "DHS.rates")

# NOTE(review): install_and_load() is expected to come from snt_utils.r and to
# install any missing package before attaching it - confirm in that file.
install_and_load(required_packages)

# Select the conda Python for reticulate, show which interpreter was picked,
# then import the OpenHEXA SDK module.
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
openhexa <- import("openhexa.sdk")

# Load SNT config; on failure, print the reason and abort the notebook
CONFIG_FILE_NAME <- "SNT_config.json"
config_json <- tryCatch({ fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME)) },
                        error = function(e) {
                          # Separator added so the prefix and the underlying
                          # error message no longer run together.
                          msg <- paste0("Error while loading configuration: ", conditionMessage(e))
                          cat(msg)
                          stop(msg)
                        })

msg <- paste0("SNT configuration loaded from  : ", file.path(CONFIG_PATH, CONFIG_FILE_NAME)) 
log_msg(msg)

# Set config variables
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE

In [None]:
# Administrative level of the analysis and the matching ID / NAME column names
admin_level <- "ADM1"
admin_id_col <- glue("{admin_level}_ID")
admin_name_col <- glue("{admin_level}_NAME")
admin_cols <- c(admin_id_col, admin_name_col)

## Geo/admin data

In [None]:
# Fetch the country shapes file from the formatted DHIS2 dataset
dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
spatial_data_filename <- paste(COUNTRY_CODE, "shapes.geojson", sep = "_")

# Alternative loader reading the file from the workspace instead of the dataset:
# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))
spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)
log_msg(glue("File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}"))

# Work on an sf object, then merge the geometries by the admin columns
spatial_data <- st_as_sf(spatial_data)
spatial_data <- aggregate_geometry(
  sf_data = spatial_data,
  admin_id_colname = admin_id_col,
  admin_name_colname = admin_name_col
)

# Re-apply the sf class after the aggregation
spatial_data <- st_as_sf(spatial_data)

# Country-specific cleaning of the admin names (COD only)
if (COUNTRY_CODE == "COD") {
  spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])
}

# Attribute table without geometry, as a data.table, for later merges
admin_data <- setDT(st_drop_geometry(spatial_data))

## Import DHS data

In [None]:
# Import the DHS household (HR) and births (BR) recodes, check that they belong
# to the same survey release, and build the DHS admin code/name lookup table.

indicator_u5mr <- 'U5MR_PERMIL' # to be computed

data_source <- 'DHS'
household_recode <- 'HR'
births_recode <- 'BR'
# NOTE(review): 'SV' is presumably the DHS file-format code of the SPSS
# downloads - confirm against extract_latest_dhs_recode_filename().
target_file_type <- 'SV'

# Clear previously extracted files so that only the zip archives remain
delete_otherextension_files(DHS_DATA_PATH, extension_to_retain=".zip")

dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)
unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir=DHS_DATA_PATH)

dhs_br_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, births_recode, target_file_type)
unzip(file.path(DHS_DATA_PATH, dhs_br_zip_filename), exdir=DHS_DATA_PATH)

# # Remove existing output files
# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)
# files_to_delete <- files[grepl('U5_MORT', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]
# file.remove(files_to_delete)

# Locate the extracted data files (case-insensitive match on recode + extension)
data_extension <- '.SAV'
dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(".*", household_recode, ".*\\", data_extension, "$"), ignore.case=TRUE)
dhs_br_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(".*", births_recode, ".*\\", data_extension, "$"), ignore.case=TRUE)

# The patterns are loose: an unsuccessful unzip yields no match and a file name
# containing the recode letters elsewhere (e.g. in the country code) yields
# several. Fail here with a clear message instead of a cryptic reader error.
if (length(dhs_hr_filename) != 1L || length(dhs_br_filename) != 1L) {
  stop(paste0(
    "Expected exactly one ", household_recode, " and one ", births_recode,
    " '", data_extension, "' file in ", DHS_DATA_PATH, " but found ",
    length(dhs_hr_filename), " and ", length(dhs_br_filename),
    " respectively. Check available data before rerunning."
  ))
}

if(!check_dhs_same_version(dhs_hr_filename, dhs_br_filename)){
  stop("The input DHS data do not have the same version/issue. Check available data before rerunning.")
}

dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode
dhs_hr_dt <- setDT(dhs_hr_dt)

dhs_br_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_br_filename)) # births recode
dhs_br_dt <- setDT(dhs_br_dt)

# Make admin codes and names dataframe (for future merging)

# Smallest HV007 value found in the household recode
dhs_beginning_year <- as.integer(dhs_hr_dt[, min(HV007)])

dhs_admin_dt <- make_dhs_admin_df(
  input_dhs_df=dhs_hr_dt,
  original_admin_column="HV024",
  new_admin_name_colname=admin_name_col,
  new_admin_code_colname='DHS_ADM1_CODE'
)

# format the names to be like DHIS2 names
dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]

# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space
dhs_admin_dt[get(admin_name_col) == "MAI NDOMBE", (admin_name_col) := "MAINDOMBE"]

# Check that all regions can be matched with DHIS2 pyramid
if(!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)){
  stop("The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.")
}

rm(dhs_hr_dt) # free up resources

## Preprocess DHS data

In [None]:
# Births-recode columns needed for the mortality computation
household_id_cols <- c("CASEID", "V000", "V001", "V002")
household_sampling_cols <- c("V005", "V021", "V022", "V023", "V024")
birth_date_col <- "B3" # Date of birth of child (CMC)
alive_col <- "B5" # Child is alive (1 = Yes, 0 = No)
death_age_col <- "B7" # Age at death in months (imputed)
end_date_col <- "V008" # Date of interview (CMC)

# Cast the date / age columns to plain integers in one pass (by reference)
integer_cols <- c(birth_date_col, death_age_col, end_date_col)
dhs_br_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]

# Keep only the identification, sampling and mortality-related columns
retained_cols <- c(
  household_id_cols,
  household_sampling_cols,
  birth_date_col,
  alive_col,
  death_age_col,
  end_date_col
)
dhs_br_dt <- dhs_br_dt[, retained_cols, with = FALSE]

## Compute indicator

In [None]:
# One births table per DHS region (V024), each passed to the per-region
# mortality helper; the per-region results are then stacked.
region_dt_list <- split(dhs_br_dt, by = "V024")
u5mort_table <- rbindlist(lapply(region_dt_list, make_dhs_adm1_u5mort_dt))

# Output column names, all prefixed with the upper-cased indicator name
indicator_prefix <- toupper(indicator_u5mr)
lower_bound_col <- glue("{indicator_prefix}_CI_LOWER_BOUND")
upper_bound_col <- glue("{indicator_prefix}_CI_UPPER_BOUND")
sample_avg_col <- glue("{indicator_prefix}_SAMPLE_AVERAGE")

# Attach the DHS admin names (full join on the DHS region code)
u5mort_table <- merge(dhs_admin_dt, u5mort_table, by = "DHS_ADM1_CODE", all = TRUE)

# Rename the estimate and its confidence bounds; other columns keep their names
setnames(
  u5mort_table,
  old = c("R", "LCI", "UCI"),
  new = c(sample_avg_col, lower_bound_col, upper_bound_col),
  skip_absent = TRUE
)

# Attach the DHIS2 admin identifiers, then keep only the output columns
u5mort_table <- merge(admin_data, u5mort_table, by = admin_name_col)
output_cols <- c(admin_cols, sample_avg_col, lower_bound_col, upper_bound_col)
u5mort_table <- u5mort_table[, output_cols, with = FALSE]

# Negative lower bounds (possible with small numbers) are capped at 0
u5mort_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]

# Export the indicator table as CSV and Parquet under OUTPUT_DATA_PATH.

# The output folder is not created anywhere else in this notebook; create it
# (with parents) so the writes below cannot fail on a fresh workspace.
# Nothing happens if it already exists.
dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)

filename_without_extension <- glue("{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5mr)}")
write.csv(u5mort_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)
write_parquet(u5mort_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))