# Careseeking behavior upon child fever (DHS data)

## Resources

https://dhsprogram.com/data/Guide-to-DHS-Statistics/Fever_and_Careseeking.htm?rhtocid=_13_3_0#Percentage_of_children4

1)      Percentage of children under age 5 years with fever in the 2 weeks preceding the survey.
2)      Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought.
3)      Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought the same or next day following the onset of fever.
4)      Among children under age 5 with fever in the 2 weeks preceding the survey, percentage who took antibiotic drugs.

Coverage:

Population base: Living children under age 5 years (KR file)

Time period: Two weeks preceding the survey

Numerators:
1)      Number of living children under age 5 years with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 & h22 = 1)
2)      Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 except traditional practitioner (usually h32t))
3)      Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview for whom advice or treatment was sought the same day or next day following the onset of fever (b5 = 1 & b19 < 60 & h22 = 1 & any of h32a – x = 1 excluding advice or treatment from a traditional practitioner (usually h32t) & h46b in 0:1)
4)      Number of living children under age 5 years with a fever at any time during the 2 weeks preceding the interview who took antibiotic drugs (b5 = 1 & h22 = 1 & (h37i = 1 or h37j = 1 or h37n= 1 or h37o = 1) [or ml13i = 1 or ml13j = 1 or ml13n = 1 or ml13o = 1])

Denominators:
- Numerator 1: Number of living children under age 5 (b5 = 1 & b19 < 60)

- Numerators 2, 3, and 4: Number of living children under age 5 with fever at any time during the 2 weeks preceding the survey (b5 = 1 & b19 < 60 and h22 = 1).

This project uses indicator 2, split by public/private source of care: "Among children under age 5 years with fever in the 2 weeks preceding the survey, percentage for whom advice or treatment was sought."

## Preliminary steps

In [None]:
# Drop every object currently in the workspace so the notebook starts clean
rm(list = ls())

# Large scipen penalty: numbers are printed in fixed rather than scientific notation
options(scipen = 999)

In [None]:
# Environment variables for the PROJ and GDAL data directories of the conda install
Sys.setenv(
  PROJ_LIB = "/opt/conda/share/proj",
  GDAL_DATA = "/opt/conda/share/gdal"
)

# Project directory layout, everything hangs off the workspace root
ROOT_PATH <- '~/workspace'
CONFIG_PATH <- file.path(ROOT_PATH, 'configuration')
CODE_PATH <- file.path(ROOT_PATH, 'code')
DATA_PATH <- file.path(ROOT_PATH, 'data')
DHS_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'raw')
OUTPUT_DATA_PATH <- file.path(DATA_PATH, 'dhs', 'indicators', 'careseeking')

# The indicator tables are written into OUTPUT_DATA_PATH further down; create it
# up front (no-op when it already exists) so those writes cannot fail on a
# fresh workspace
dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)

# Load the shared helper functions (install_and_load() used below comes from there)
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages needed by this notebook
required_packages <- c(
  "haven", "sf", "glue", "survey", "data.table",
  "stringi", "jsonlite", "httr", "reticulate", "arrow"
)

# Install whatever is missing and attach everything
install_and_load(required_packages)

In [None]:
# Tell reticulate which Python interpreter to use, then import the OpenHEXA SDK
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
openhexa <- import("openhexa.sdk")

# Load SNT config
CONFIG_FILE_NAME <- "SNT_config.json"
config_json <- tryCatch(
  {
    fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))
  },
  error = function(e) {
    # Separator added so the underlying cause is readable in the message
    msg <- paste0("Error while loading configuration: ", conditionMessage(e))
    cat(msg, "\n", sep = "")
    stop(msg)
  }
)

msg <- paste0("SNT configuration loaded from  : ", file.path(CONFIG_PATH, CONFIG_FILE_NAME))
log_msg(msg)

# Set config variables
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE

# Fail here with a clear message rather than later in file-name building or
# in the `COUNTRY_CODE == "COD"` test, which errors on a zero-length value
if (is.null(COUNTRY_CODE) || length(COUNTRY_CODE) != 1L || is.na(COUNTRY_CODE)) {
  stop("SNT_CONFIG$COUNTRY_CODE is missing from the configuration file: ",
       file.path(CONFIG_PATH, CONFIG_FILE_NAME))
}

## Spatial/admin data

In [None]:
# Administrative level at which the indicators are produced
admin_level <- "ADM1"

# Column names derived from the level: "<level>_ID" and "<level>_NAME"
admin_id_col <- glue("{admin_level}_ID")
admin_name_col <- glue("{admin_level}_NAME")
admin_cols <- c(admin_id_col, admin_name_col)

In [None]:
# Load the shapes file from the formatted DHIS2 dataset

# Dataset identifier as declared in the SNT configuration
dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED

# File name pattern: "<COUNTRY_CODE>_shapes.geojson"
spatial_data_filename <- paste(COUNTRY_CODE, "shapes.geojson", sep = "_")
# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))
spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)
log_msg(glue("File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}"))

spatial_data <- st_as_sf(spatial_data)

# Aggregate geometries by the admin id/name columns, then re-assert the sf
# class on the aggregated result
spatial_data <- st_as_sf(
  aggregate_geometry(
    sf_data = spatial_data,
    admin_id_colname = admin_id_col,
    admin_name_colname = admin_name_col
  )
)

# Admin names are passed through clean_admin_names() for COD only
if (COUNTRY_CODE == "COD") {
  spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])
}

# Attribute table without geometry, as a data.table
admin_data <- st_drop_geometry(spatial_data)
setDT(admin_data)

## Import DHS data

In [None]:
# Source tag used in the output file names
data_source <- "DHS"

# Names of the three careseeking indicators computed below
indicator_public_care <- "PCT_PUBLIC_CARE"
indicator_private_care <- "PCT_PRIVATE_CARE"
indicator_no_care <- "PCT_NO_CARE"

In [None]:
# Recode identifiers and file-type tag used to locate the DHS archives
household_recode <- "HR"
kid_recode <- "KR"
target_file_type <- "SV"

# Keep only the .zip archives in the raw DHS folder
delete_otherextension_files(DHS_DATA_PATH, extension_to_retain = ".zip")

# Household recode: find the latest archive and extract it in place
dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)
unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir = DHS_DATA_PATH)

# Kid recode: same procedure
dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)
unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir = DHS_DATA_PATH)

# # Remove existing output files
# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)
# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]
# file.remove(files_to_delete)

# Locate the extracted data files: name contains the recode and ends with the extension
data_extension <- ".SAV"
dhs_hr_filename <- list.files(
  path = DHS_DATA_PATH,
  pattern = sprintf(".*%s.*\\%s$", household_recode, data_extension),
  ignore.case = TRUE
)
dhs_kr_filename <- list.files(
  path = DHS_DATA_PATH,
  pattern = sprintf(".*%s.*\\%s$", kid_recode, data_extension),
  ignore.case = TRUE
)

# Both recodes must come from the same DHS version/issue
same_dhs_version <- check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)
if (!same_dhs_version) {
  stop("The necessary DHS data do not have the same version/issue. Check available data before rerunning.")
}

# Read both recodes and convert them to data.table
dhs_hr_dt <- setDT(read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename))) # household recode
dhs_kr_dt <- setDT(read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename))) # kid recode

### Make admin codes and names dataframe (for future merging)

In [None]:
# Earliest HV007 value found in the household recode
dhs_beginning_year <- as.integer(min(dhs_hr_dt[["HV007"]]))

# Lookup table between the DHS region code (HV024) and the admin name
dhs_admin_dt <- make_dhs_admin_df(
  input_dhs_df = dhs_hr_dt,
  original_admin_column = "HV024",
  new_admin_name_colname = admin_name_col,
  new_admin_code_colname = "DHS_ADM1_CODE"
)

# Format the names the same way as the DHIS2 names
dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]

# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space
dhs_admin_dt[get(admin_name_col) == "MAI NDOMBE", (admin_name_col) := "MAINDOMBE"]

# Every DHS region has to be matched with the DHIS2 pyramid; abort otherwise
names_match_pyramid <- check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)
if (!names_match_pyramid) {
  stop("The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.")
}

In [None]:
# The household recode is not used further in this notebook; drop it
rm(list = "dhs_hr_dt")

### Relevant columns

In [None]:
# Identification and sampling-design columns of the kid recode
kid_id_cols <- c("CASEID", "V000", "V001", "V002")
household_sampling_cols <- c("V005", "V021", "V022", "V023", "V024")

# Child status columns: alive (B5), age (B19), fever (H22)
kid_alive_col <- "B5"
kid_age_col <- "B19"
kid_fever_col <- "H22"

# Source-of-care columns (H32*), grouped as this project classifies them
# grep("^H32", names(dhs_kr_dt), value = TRUE)
kid_public_care_cols <- paste0("H32", LETTERS[1:9])    # H32A ... H32I
kid_private_care_cols <- paste0(
  "H32",
  c(LETTERS[10:18], paste0("N", LETTERS[1:5]))         # H32J ... H32R, H32NA ... H32NE
)
kid_other_care_cols <- paste0("H32", c("S", "W", "X"))

## Preprocess kid file

In [None]:
# Columns carried over from the kid recode
kr_keep_cols <- c(
  kid_id_cols,
  household_sampling_cols,
  kid_alive_col,
  kid_age_col,
  kid_fever_col,
  kid_other_care_cols,
  kid_public_care_cols,
  kid_private_care_cols
)

# Keep children with a known age under 60 months, alive, who had fever.
# Rows where the combined condition evaluates to NA are dropped by the subset.
kr_dt <- dhs_kr_dt[
  !is.na(get(kid_age_col)) &
    get(kid_age_col) < 60 &
    get(kid_alive_col) == 1 &
    get(kid_fever_col) == 1,
  .SD,
  .SDcols = kr_keep_cols
]

# Sampling weight: V005 divided by one million
kr_dt[, wt := V005 / 1e6]

# Attach admin names through the DHS region code.
# NOTE(review): all = TRUE is a full outer join, so an admin unit without any
# retained child would yield a row of NAs (including wt) — confirm this is intended.
kr_dt <- merge.data.table(
  dhs_admin_dt,
  kr_dt,
  by.x = "DHS_ADM1_CODE",
  by.y = "V024",
  all = TRUE
)

In [None]:
# Create the numerators

# 1 when at least one of the given answer columns equals 1, else 0 (NAs ignored)
flag_any_source_used <- function(source_answers) {
  as.integer(rowSums(source_answers == 1, na.rm = TRUE) > 0)
}

kr_dt[, (indicator_public_care) := flag_any_source_used(.SD), .SDcols = kid_public_care_cols]
kr_dt[, (indicator_private_care) := flag_any_source_used(.SD), .SDcols = kid_private_care_cols]

# 1 when no public or private column holds a non-zero value (NAs ignored).
# NOTE(review): kid_other_care_cols are not part of this test, so a child whose
# only reported source is in that group is flagged as "no care" — confirm.
kr_dt[
  ,
  (indicator_no_care) := as.integer(rowSums(.SD != 0, na.rm = TRUE) == 0),
  .SDcols = c(kid_public_care_cols, kid_private_care_cols)
]

# check: cross-tabulation of the three flags
xtabs(~ kr_dt[[indicator_public_care]] + kr_dt[[indicator_private_care]] + kr_dt[[indicator_no_care]])

### Sampling design

In [None]:
# Survey design object: clustering, stratification and weights
# (for means, proportions, regression models, etc.)
# The former `num_p = 1` argument was dropped: it is not a parameter of
# survey::svydesign() and was only absorbed by `...`, without any effect.
kr_design_sampling <- svydesign(
  ids = ~ V021,    # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)
  data = kr_dt,    # dataset
  strata = ~ V023, # groupings of primary sampling units
  weights = ~ wt,  # the sampling weights variable
  nest = TRUE      # the primary sampling units are nested within the strata
)

## Compute indicators

Build one table per indicator and save it, then add the sample estimate of the average proportion to a summary table.

In [None]:
# The summary table starts as a copy of the admin units; one estimate column
# per indicator is merged into it inside the loop
summary_table <- copy(admin_data)

for (indicator_name in c(indicator_public_care, indicator_private_care, indicator_no_care)) {

  # Variable name under which this indicator's table is stored at the end
  table_name <- glue("{tolower(indicator_name)}_table")

  # Survey-weighted mean of the 0/1 indicator per admin unit, with 95% CI
  table_content <- svyby(
    formula = reformulate(indicator_name),
    by = reformulate(admin_name_col),
    design = kr_design_sampling,
    FUN = svymean,
    vartype = "ci",
    level = 0.95,
    na.rm = TRUE,
    influence = TRUE  # which observations have a substantial change in the results of the analysis
  )

  # make it into data.table
  setDT(table_content)

  # Output column names, all prefixed with the upper-cased indicator name
  lower_bound_col <- glue("{toupper(indicator_name)}_CI_LOWER_BOUND")
  upper_bound_col <- glue("{toupper(indicator_name)}_CI_UPPER_BOUND")
  sample_avg_col <- glue("{toupper(indicator_name)}_SAMPLE_AVERAGE")

  # names(table_content) <- toupper(names(table_content))
  # Rename the svyby output columns; a column that is absent is simply skipped
  setnames(
    table_content,
    old = c("ci_l", "ci_u", indicator_name),
    new = as.character(c(lower_bound_col, upper_bound_col, sample_avg_col)),
    skip_absent = TRUE
  )

  # Cap the CI's between 0 and 1 (in case of small sample => large CI's)
  table_content[, (lower_bound_col) := pmax(get(lower_bound_col), 0)]
  table_content[, (upper_bound_col) := pmin(get(upper_bound_col), 1)]

  # Convert proportions to percentages
  for (pct_col in as.character(c(lower_bound_col, upper_bound_col, sample_avg_col))) {
    table_content[, (pct_col) := get(pct_col) * 100]
  }

  # Admin name plus the sample-average column, to be merged into the summary
  summary_cols <- c(
    admin_name_col,
    grep('SAMPLE_AVERAGE', names(table_content), value = TRUE)
  )
  indicator_estimation_table <- table_content[, .SD, .SDcols = summary_cols]

  summary_table <- merge.data.table(summary_table, indicator_estimation_table, by = admin_name_col)

  # Write the full table (estimate + CI bounds) to .csv and .parquet
  filename_without_extension <- glue("{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_name)}")
  csv_path <- file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv'))
  parquet_path <- file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet'))
  write.csv(table_content, file = csv_path, row.names = FALSE)
  write_parquet(table_content, parquet_path)

  # Keep the table available under its own variable name
  assign(table_name, table_content)
}

In [None]:
# Strip the "_SAMPLE_AVERAGE" suffix so summary columns carry the bare indicator names
setnames(summary_table, gsub('_SAMPLE_AVERAGE', '', names(summary_table)))

In [None]:
# Write the summary table next to the per-indicator tables, as .csv and .parquet
summary_filename_without_extension <- glue("{COUNTRY_CODE}_{data_source}_{admin_level}_PCT_CARESEEKING_SAMPLE_AVERAGE")

summary_csv_path <- file.path(OUTPUT_DATA_PATH, paste0(summary_filename_without_extension, '.csv'))
summary_parquet_path <- file.path(OUTPUT_DATA_PATH, paste0(summary_filename_without_extension, '.parquet'))

write.csv(summary_table, file = summary_csv_path, row.names = FALSE)
write_parquet(summary_table, summary_parquet_path)