# Under-5 Prevalence of Malaria (DHS data)

## Resources

https://dhsprogram.com/data/Guide-to-DHS-Statistics/index.htm#t=Prevalence_of_Malaria_in_Children.htm%23Percentage_of_children22bc-1&rhtocid=_15_13_0

Numerators:
1) Number of de facto children tested using RDT who are positive for malaria (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 = 1)
2) Number of de facto children tested using microscopy who are positive for malaria (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 = 1)

Denominators:
a) Number of de facto children tested using RDT (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml35 in 0,1)
b) Number of de facto children tested using microscopy (hv042 = 1 & hv103 = 1 & hc1 in 6:59 & hml32 in 0,1,6)

**This project uses the RDT-based definition (numerator 1, denominator a).**

## Preliminary steps

In [None]:
# Start from an empty workspace so objects from a previous run cannot leak in
rm(list = ls())

# Strongly prefer fixed over scientific notation when printing numbers
options(scipen = 999)

# Point PROJ_LIB / GDAL_DATA at the conda share directories
Sys.setenv(
  PROJ_LIB = "/opt/conda/share/proj",
  GDAL_DATA = "/opt/conda/share/gdal"
)

# Workspace layout: configuration, code, raw DHS input and indicator output
ROOT_PATH <- "~/workspace"
CONFIG_PATH <- file.path(ROOT_PATH, "configuration")
CODE_PATH <- file.path(ROOT_PATH, "code")
DATA_PATH <- file.path(ROOT_PATH, "data")
DHS_DATA_PATH <- file.path(DATA_PATH, "dhs", "raw")
OUTPUT_DATA_PATH <- file.path(DATA_PATH, "dhs", "indicators", "prevalence")

# Project helper functions
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages this notebook relies on
required_packages <- c(
  "haven", "sf", "glue", "survey", "data.table",
  "stringi", "jsonlite", "httr", "reticulate", "arrow"
)
install_and_load(required_packages)

# Use the conda Python with reticulate, show the interpreter in use,
# then import the OpenHEXA SDK module
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
openhexa <- import("openhexa.sdk")

# Load the SNT configuration (JSON) from CONFIG_PATH.
# Any read/parse failure is echoed with cat() and then re-raised, so the
# notebook stops instead of continuing with a missing configuration.
CONFIG_FILE_NAME <- "SNT_config.json"
config_json <- tryCatch(
  {
    fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))
  },
  error = function(e) {
    # Separate the prefix from the underlying condition message
    msg <- paste0("Error while loading configuration: ", conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)

msg <- paste0("SNT configuration loaded from  : ", file.path(CONFIG_PATH, CONFIG_FILE_NAME))
log_msg(msg)

# Set config variables
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE

# COUNTRY_CODE is used below to build file names and in scalar comparisons,
# so fail early with a clear message when it is missing or malformed.
if (length(COUNTRY_CODE) != 1L || is.na(COUNTRY_CODE) || !nzchar(COUNTRY_CODE)) {
  stop("SNT_CONFIG$COUNTRY_CODE is missing or empty in ", CONFIG_FILE_NAME, ".")
}


## Get data

In [None]:
# Administrative level used for aggregation, and the matching
# "<level>_ID" / "<level>_NAME" column names
admin_level <- "ADM1"
admin_id_col <- glue("{admin_level}_ID")
admin_name_col <- glue("{admin_level}_NAME")
admin_cols <- c(admin_id_col, admin_name_col)

In [None]:
# Fetch the country's shapes file from the formatted DHIS2 dataset
dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
spatial_data_filename <- paste0(COUNTRY_CODE, "_shapes.geojson")

# Local-file alternative, kept for reference:
# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))
spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)
log_msg(glue("File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}"))

# Aggregate geometries by the admin ID / NAME columns.
# st_as_sf() is applied before and after so the object keeps its sf class.
spatial_data <- st_as_sf(
  aggregate_geometry(
    sf_data = st_as_sf(spatial_data),
    admin_id_colname = admin_id_col,
    admin_name_colname = admin_name_col
  )
)

# Country-specific clean-up of the admin names (COD only)
if (COUNTRY_CODE == "COD") {
  spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])
}

# Attribute table without geometry, as a data.table
admin_data <- st_drop_geometry(spatial_data)
setDT(admin_data)

### Import 

In [None]:
# Name of the indicator produced by this notebook
indicator_u5prev <- "PCT_U5_PREV_RDT" # to be computed

# Identifiers used to select the DHS archives
data_source <- "DHS"
household_recode <- "HR"
person_recode <- "PR"
target_file_type <- "SV"

# Keep only the .zip archives in the raw folder before extracting again
delete_otherextension_files(DHS_DATA_PATH, extension_to_retain = ".zip")

# Household recode: look up the archive name, then extract it in place
dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(
  DHS_DATA_PATH, household_recode, target_file_type
)
unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir = DHS_DATA_PATH)

# Person recode: same two steps
dhs_pr_zip_filename <- extract_latest_dhs_recode_filename(
  DHS_DATA_PATH, person_recode, target_file_type
)
unzip(file.path(DHS_DATA_PATH, dhs_pr_zip_filename), exdir = DHS_DATA_PATH)

# # Remove existing output files
# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)
# files_to_delete <- files[grepl('U5_PREV', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]
# file.remove(files_to_delete)

# Locate the extracted household (HR) and person (PR) recode data files.
# The pattern is "<anything><recode><anything>.SAV", matched case-insensitively.
data_extension <- ".SAV"
dhs_hr_filename <- list.files(
  path = DHS_DATA_PATH,
  pattern = paste0(".*", household_recode, ".*\\", data_extension, "$"),
  ignore.case = TRUE
)
dhs_pr_filename <- list.files(
  path = DHS_DATA_PATH,
  pattern = paste0(".*", person_recode, ".*\\", data_extension, "$"),
  ignore.case = TRUE
)

# The file names are used as single paths below, so require exactly one match
# per recode instead of failing later with an unrelated error.
if (length(dhs_hr_filename) != 1L || length(dhs_pr_filename) != 1L) {
  stop(
    "Expected exactly one ", data_extension, " file per recode in ", DHS_DATA_PATH,
    " but found ", length(dhs_hr_filename), " for ", household_recode,
    " and ", length(dhs_pr_filename), " for ", person_recode, "."
  )
}

# Both recodes must come from the same DHS version/issue
if (!check_dhs_same_version(dhs_hr_filename, dhs_pr_filename)) {
  stop("The input DHS data do not have the same version/issue. Check available data before rerunning.")
}

# Household recode, converted to a data.table by reference
dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename))
setDT(dhs_hr_dt)

# Person recode, converted to a data.table by reference
dhs_pr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_pr_filename))
setDT(dhs_pr_dt)

### Make admin dataframe (for future merging)

In [None]:
# Build the DHS admin lookup from the household recode's HV024 column
dhs_admin_dt <- make_dhs_admin_df(
  input_dhs_df = dhs_hr_dt,
  original_admin_column = "HV024",
  new_admin_name_colname = admin_name_col,
  new_admin_code_colname = "DHS_ADM1_CODE"
)

# Bring the DHS names into the same format as the DHIS2 names
dhs_admin_dt[, (admin_name_col) := format_names(get(admin_name_col))]

# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space
dhs_admin_dt[get(admin_name_col) == "MAI NDOMBE", (admin_name_col) := "MAINDOMBE"]

# Stop unless every DHS region can be matched with the DHIS2 pyramid
regions_match <- check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)
if (!regions_match) {
  stop("The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.")
}

# The household recode is not used past this point; release it
rm(dhs_hr_dt)

### Preprocess

In [None]:
# Person-recode columns needed for the indicator
household_id_cols <- c("HHID", "HV000", "HV001", "HV002")
# HV005 = sample weight, HV021 = PSU, HV023 = strata, HV024 = region (as used below)
household_sampling_cols <- c("HV005", "HV021", "HV022", "HV023", "HV024")
hemoglobin_selection_col <- "HV042" # household selected for hemoglobin test

# "Slept last night in household" column, looked up by its HV103 prefix
person_slept_col <- grep("^HV103", names(dhs_pr_dt), value = TRUE)
# It is used as a single column name below (via get()), so require exactly one match
if (length(person_slept_col) != 1L) {
  stop(
    "Expected exactly one column starting with 'HV103' in the person recode, found ",
    length(person_slept_col), "."
  )
}

kid_age_col <- "HC1" # child's age in months
smear_result_col <- "HML32" # smear test (GE)
rdt_result_col <- "HML35" # rapid diagnostic test (RDT / TDR)

In [None]:
# sapply(kid_age_cols, function(i) table(dhs_pr_dt[[i]], useNA = 'always'))

# table(dhs_pr_dt$HC1, useNA = 'ifany')
# table(dhs_pr_dt$HV103, useNA = 'ifany')
# table(dhs_pr_dt$HV042, useNA = 'ifany')

# Columns carried forward from the person recode
pr_keep_cols <- c(
  household_id_cols,
  household_sampling_cols,
  hemoglobin_selection_col,
  person_slept_col,
  kid_age_col,
  smear_result_col,
  rdt_result_col
)

# Eligible children: known age of 6-59 months, slept in the household last
# night, and living in a household selected for the hemoglobin test
pr_dt <- dhs_pr_dt[
  !is.na(get(kid_age_col)) &
    get(kid_age_col) >= 6 &
    get(kid_age_col) <= 59 &
    get(person_slept_col) == 1 &
    get(hemoglobin_selection_col) == 1,
  .SD,
  .SDcols = pr_keep_cols
]

# Rescale the stored weight HV005 by a factor of one million
pr_dt[, wt := HV005 / 1e6]

# Attach the admin names; a full outer join keeps unmatched rows from both sides
pr_dt <- merge.data.table(
  dhs_admin_dt,
  pr_dt,
  by.x = "DHS_ADM1_CODE",
  by.y = "HV024",
  all = TRUE
)

## Rapid Diagnostic Test Indicator

In [None]:
# Distribution of the RDT result among eligible children, NA included
xtabs(~ get(rdt_result_col), data = pr_dt, addNA = TRUE)

# Denominator: children tested with a positive (1) or negative (0) result
rdt_dt <- pr_dt[get(rdt_result_col) %in% c(0, 1)]

# Complex survey design: clustering, stratification and weights
# (for means, proportions, regression models, etc.).
# The former `num_p = 1` argument was dropped: it is not a svydesign()
# parameter and was silently absorbed by `...`.
rdt_design_sampling <- svydesign(
  ids = ~HV021, # primary sampling unit / cluster ids
  strata = ~HV023, # groupings of primary sampling units
  weights = ~wt, # the sampling weights variable
  data = rdt_dt, # dataset
  nest = TRUE # the primary sampling units are nested within the strata
)

In [None]:
# Formulas are built from the column names so that the target variable and
# the grouping column stay configurable
rdt_formula <- as.formula(paste("~", rdt_result_col))
admin_grouping <- reformulate(admin_name_col) # e.g. by = ~ ADM1

# Survey-weighted mean of the RDT result per admin unit, with confidence intervals
malaria_rdt_table <- svyby(
  formula = rdt_formula,
  by = admin_grouping,
  design = rdt_design_sampling, # the weights, strata, clusters
  FUN = svymean, # compute survey mean
  vartype = "ci", # report variability as confidence intervals
  # NOTE(review): `level` is not a documented svyby() argument; it is passed
  # on through `...` - confirm it actually affects the interval width
  level = 0.95,
  na.rm = TRUE, # remove the NA's in the calculation
  # NOTE(review): presumably requests influence functions - confirm it is needed
  influence = TRUE
)

In [None]:
# Convert the svyby() result to a data.table by reference, so the `:=`
# column updates in the following cells can be applied to it
setDT(malaria_rdt_table)

In [None]:
# Output column names, all prefixed with the upper-cased indicator name
indicator_prefix <- toupper(indicator_u5prev)
lower_bound_col <- glue("{indicator_prefix}_CI_LOWER_BOUND")
upper_bound_col <- glue("{indicator_prefix}_CI_UPPER_BOUND")
sample_avg_col <- glue("{indicator_prefix}_SAMPLE_AVERAGE")

# names(malaria_rdt_table) <- toupper(names(malaria_rdt_table))
# Rename the svyby() output columns; names that are absent are left untouched
setnames(
  malaria_rdt_table,
  old = c("ci_l", "ci_u", rdt_result_col),
  new = as.character(c(lower_bound_col, upper_bound_col, sample_avg_col)),
  skip_absent = TRUE
)

# Cap the CI's between 0 and 1 (in case of small sample => large CI's)
malaria_rdt_table[get(lower_bound_col) < 0, (lower_bound_col) := 0]
malaria_rdt_table[get(upper_bound_col) > 1, (upper_bound_col) := 1]

In [None]:
# Express the estimate and both CI bounds as percentages (proportion * 100)
pct_cols <- as.character(c(lower_bound_col, upper_bound_col, sample_avg_col))
malaria_rdt_table[, (pct_cols) := lapply(.SD, function(x) x * 100), .SDcols = pct_cols]

In [None]:
# Attach the DHIS2 admin columns; the full outer join keeps admin units
# that have no estimate (their indicator columns are NA)
malaria_rdt_table <- merge.data.table(admin_data, malaria_rdt_table, by = admin_name_col, all = TRUE)

# Make sure the output folder exists; write.csv() and write_parquet() do not create it
if (!dir.exists(OUTPUT_DATA_PATH)) {
  dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)
}

# Write the indicator table as CSV and Parquet under
# <COUNTRY>_<SOURCE>_<LEVEL>_<INDICATOR>.<ext>
filename_without_extension <- glue("{COUNTRY_CODE}_{data_source}_{admin_level}_{toupper(indicator_u5prev)}")
write.csv(malaria_rdt_table, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.csv')), row.names = FALSE)
write_parquet(malaria_rdt_table, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, '.parquet')))