# DTP Vaccination rates and attrition using DHS data

## Preliminaries

In [None]:
# Start from an empty workspace: remove every object returned by ls().
rm(list = ls())

# Raise the penalty on scientific notation so numbers print in fixed notation.
options(scipen = 999)

In [None]:
# Environment variables pointing at the PROJ and GDAL resource directories.
Sys.setenv(
  PROJ_LIB = "/opt/conda/share/proj",
  GDAL_DATA = "/opt/conda/share/gdal"
)

In [None]:
# Project directory layout; every path is derived from ROOT_PATH.
ROOT_PATH <- "~/workspace"

# First-level folders
CONFIG_PATH <- file.path(ROOT_PATH, "configuration")
CODE_PATH <- file.path(ROOT_PATH, "code")
DATA_PATH <- file.path(ROOT_PATH, "data")

# DHS input (raw survey archives) and output (vaccination indicators) folders
DHS_DATA_PATH <- file.path(DATA_PATH, "dhs", "raw")
OUTPUT_DATA_PATH <- file.path(DATA_PATH, "dhs", "indicators", "vaccination")

In [None]:
# Source the shared helper script (it provides install_and_load(), among others)
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages this notebook depends on
required_packages <- c(
  "readr", "haven", "glue", "survey", "data.table", "sf",
  "ggplot2", "stringi", "reticulate", "jsonlite", "httr", "arrow"
)

# Hand the list over to the helper from snt_utils.r
install_and_load(required_packages)

In [None]:
# Point reticulate at the Python interpreter to use, then show which one it
# actually resolved (the value is printed as the cell output).
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python

# Python module handle for the OpenHEXA SDK
openhexa <- reticulate::import("openhexa.sdk")

In [None]:
# Load the SNT configuration. If the JSON file cannot be read or parsed, the
# error is printed and re-raised with an explicit prefix.
CONFIG_FILE_NAME <- "SNT_config.json"
config_json <- tryCatch(
  {
    fromJSON(file.path(CONFIG_PATH, CONFIG_FILE_NAME))
  },
  error = function(e) {
    # Prefix and underlying error text are separated by ": " so the two parts
    # stay readable in the final message.
    msg <- paste0("Error while loading configuration: ", conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)

# Record which file the configuration came from
msg <- paste0("SNT configuration loaded from  : ", file.path(CONFIG_PATH, CONFIG_FILE_NAME))
log_msg(msg)

In [None]:
# Set config variables
# `[[` is used instead of `$` so that only an exact key match is accepted
# (`$` on a list silently falls back to partial matching).
COUNTRY_CODE <- config_json[["SNT_CONFIG"]][["COUNTRY_CODE"]]

# Fail fast: the country code is used below to build file names and in a
# scalar comparison, so it must be a single non-empty string.
if (!is.character(COUNTRY_CODE) || length(COUNTRY_CODE) != 1L ||
    is.na(COUNTRY_CODE) || !nzchar(COUNTRY_CODE)) {
  stop("SNT_CONFIG$COUNTRY_CODE is missing or is not a single non-empty string in the SNT configuration.")
}

## Geo data

In [None]:
# Administrative level at which the indicators are produced, and the names of
# the matching identifier / name columns ("<level>_ID" and "<level>_NAME").
admin_level <- "ADM1"
admin_id_col <- glue("{admin_level}_ID")
admin_name_col <- glue("{admin_level}_NAME")
admin_cols <- c(admin_id_col, admin_name_col)

In [None]:
# Load the spatial file from the formatted DHIS2 dataset
dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
spatial_data_filename <- paste(COUNTRY_CODE, "shapes.geojson", sep = "_")

# Alternative read from the local data folder, kept for reference:
# spatial_data <- read_sf(file.path(DATA_PATH, 'dhis2', 'formatted', spatial_data_filename))
spatial_data <- get_latest_dataset_file_in_memory(dhis2_dataset, spatial_data_filename)
log_msg(glue("File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}"))

# Coerce to sf, aggregate the geometries by the admin columns, and coerce the
# aggregated result back to sf so the class is kept.
spatial_data <- st_as_sf(
  aggregate_geometry(
    sf_data = st_as_sf(spatial_data),
    admin_id_colname = admin_id_col,
    admin_name_colname = admin_name_col
  )
)

# DRC provinces need their names cleaned
if (COUNTRY_CODE == "COD") {
  spatial_data[[admin_name_col]] <- clean_admin_names(spatial_data[[admin_name_col]])
}

# Attribute table without geometry, as a data.table
admin_data <- setDT(st_drop_geometry(spatial_data))

## Import DHS data

In [None]:
# DTP doses to process, and the name prefixes of the two indicator families
# (coverage per dose, and dropout between doses).
vaccination_doses <- c(1, 2, 3)
indicator_access <- "PCT_DTP"
indicator_attrition <- "PCT_DROPOUT_DTP"

In [None]:
# Labels used to locate the DHS archives and to name the output files
data_source <- "DHS"
household_recode <- "HR"
kid_recode <- "KR"
target_file_type <- "SV"

# Clean the raw DHS folder, retaining only the ".zip" files
delete_otherextension_files(DHS_DATA_PATH, extension_to_retain = ".zip")

In [None]:
# Unpack the latest household (HR) and kid (KR) recode archives into the raw
# DHS folder.
dhs_hr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, household_recode, target_file_type)
unzip(file.path(DHS_DATA_PATH, dhs_hr_zip_filename), exdir = DHS_DATA_PATH)

dhs_kr_zip_filename <- extract_latest_dhs_recode_filename(DHS_DATA_PATH, kid_recode, target_file_type)
unzip(file.path(DHS_DATA_PATH, dhs_kr_zip_filename), exdir = DHS_DATA_PATH)

# # Remove existing output files
# files <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)
# files_to_delete <- files[grepl('DTP', basename(files), ignore.case = TRUE) & grepl(COUNTRY_CODE, basename(files), ignore.case = TRUE)]
# file.remove(files_to_delete)

# Locate the extracted data files: the name must contain the recode label and
# end with the data extension (case-insensitive).
data_extension <- '.SAV'
dhs_hr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(".*", household_recode, ".*\\", data_extension, "$"), ignore.case = TRUE)
dhs_kr_filename <- list.files(path = DHS_DATA_PATH, pattern = paste0(".*", kid_recode, ".*\\", data_extension, "$"), ignore.case = TRUE)

# The rest of the cell builds a single path per recode, so zero or several
# matches are reported here with an explicit message rather than surfacing
# later inside the file reader.
if (length(dhs_hr_filename) != 1L || length(dhs_kr_filename) != 1L) {
  stop(paste0(
    "Expected exactly one ", data_extension, " file per recode in ", DHS_DATA_PATH,
    "; found ", length(dhs_hr_filename), " for ", household_recode,
    " and ", length(dhs_kr_filename), " for ", kid_recode, "."
  ))
}

if (!check_dhs_same_version(dhs_hr_filename, dhs_kr_filename)) {
  stop("The necessary DHS data do not have the same version/issue. Check available data before rerunning.")
}

dhs_hr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_hr_filename)) # household recode
dhs_hr_dt <- setDT(dhs_hr_dt)

dhs_kr_dt <- read_spss(file.path(DHS_DATA_PATH, dhs_kr_filename)) # kid recode
dhs_kr_dt <- setDT(dhs_kr_dt)

## Preprocess DHS data

### Extract DHS admin data

In [None]:
# Build the table of DHS admin codes and names (used later for merging)

# Smallest HV007 value found in the household recode, as an integer
dhs_beginning_year <- as.integer(
  dhs_hr_dt[, min(HV007)]
)

# Codes come from HV024; the name column is called like the DHIS2 one
dhs_admin_dt <- make_dhs_admin_df(
  input_dhs_df = dhs_hr_dt,
  original_admin_column = "HV024",
  new_admin_name_colname = admin_name_col,
  new_admin_code_colname = "DHS_ADM1_CODE"
)

# Bring the names to the DHIS2 naming format
dhs_admin_dt[
  ,
  (admin_name_col) := format_names(get(admin_name_col))
]

# TODO this should be changed in the formatting of DHIS2 data; the correct name should be with a space
dhs_admin_dt[
  get(admin_name_col) == "MAI NDOMBE",
  (admin_name_col) := "MAINDOMBE"
]

# Stop unless every region can be matched with the DHIS2 pyramid
if (!check_perfect_match(dhs_admin_dt, admin_name_col, admin_data, admin_name_col)) {
  stop("The DHS data provided does not fully match DHIS2 pyramid data. Please check input data before retrying.")
}

# The household recode is not needed any more: free up resources
rm(dhs_hr_dt)

### Filter rows and columns

In [None]:
# Column groups kept from the kid recode
household_id_cols <- c("V000", "V001", "V002")
kid_id_cols <- c("CASEID", "BIDX")
kid_dpt1_cols <- c("H3", "H3D", "H3M", "H3Y")
kid_dpt2_cols <- c("H5", "H5D", "H5M", "H5Y")
kid_dpt3_cols <- c("H7", "H7D", "H7M", "H7Y")
kid_sampling_cols <- c("V005", "V021", "V023", "V024")

# Row filter (B5 == 1, B8 >= 1, B19 < 36): drop dead children, keep only
# children aged 1 or more (avoids left censoring for vaccination) and respect
# the base of the 'h' variables. Only the column groups above are retained.
kr_dt <- dhs_kr_dt[
  B5 == 1 & B8 >= 1 & B19 < 36,
  c(
    household_id_cols, kid_id_cols, kid_sampling_cols,
    kid_dpt1_cols, kid_dpt2_cols, kid_dpt3_cols
  ),
  with = FALSE
]

# # check i didn't omit any crucial variable
# stopifnot(nrow(kr_dt[duplicated(kr_dt)]) == 0)

### New features

Add the region labels, to subsequently match DHIS2 data

In [None]:
# Attach the admin names to the children: DHS_ADM1_CODE (admin table) is
# matched against V024 (kid recode). all = TRUE keeps unmatched rows from both
# sides.
# NOTE(review): an admin unit without any child row would be kept with NA
# survey variables - confirm this is intended before building the design.
kr_dt <- merge.data.table(
  x = dhs_admin_dt,
  y = kr_dt,
  by.x = "DHS_ADM1_CODE",
  by.y = "V024",
  all = TRUE
)

Create the target features (whether or not the kid was vaccinated, for each dose)

In [None]:
# One 0/1 indicator per DTP dose, read from H3 (dose 1), H5 (dose 2) and
# H7 (dose 3): code 0 gives 0, codes 1, 2 and 3 give 1, anything else
# (missing included) gives NA.
kr_dt[, DTP1 := fcase(H3 == 0L, 0L, H3 %in% c(1L, 2L, 3L), 1L, default = NA)]
kr_dt[, DTP2 := fcase(H5 == 0L, 0L, H5 %in% c(1L, 2L, 3L), 1L, default = NA)]
kr_dt[, DTP3 := fcase(H7 == 0L, 0L, H7 %in% c(1L, 2L, 3L), 1L, default = NA)]

# Make the doses consistent with each other: a child recorded with a later
# dose is also counted as having received the earlier ones.
kr_dt[DTP2 == 1, DTP1 := 1]
kr_dt[DTP3 == 1, `:=`(DTP1 = 1, DTP2 = 1)]

### Create the survey design

In [None]:
# Sampling weight per row: V005 divided by one million
kr_dt[, wt := V005 / 1e6]

In [None]:
# Survey design object, so that means, proportions, regression models, etc.
# account for the sampling strategy (clustering, stratification, weights).
# `<-` and TRUE are used instead of `=` and `T` (`T` is an ordinary variable
# that can be reassigned).
dtp_design <- svydesign(
  ids = ~ V021, # primary sampling unit / cluster ids (cluster number and/or ultimate area unit)
  data = kr_dt, # dataset
  strata = ~ V023, # groupings of primary sampling units
  weights = ~ wt, # the sampling weights variable
  nest = TRUE # the primary sampling units are nested within the strata
)

## Vaccination proportion indicator

For each vaccine dose:
- compute the proportions of vaccinated per region
- compute the CIs
- add the admin units and save to .csv and parquet

In [None]:
# Summary table for the dropout computation: starts as a copy of the admin
# table and receives the columns of every dose table produced below.
DTP_DROPOUT <- copy(admin_data)

for (dose_number in vaccination_doses) {
  # Names for this dose: source indicator column, output table, output columns
  vaccine_colname <- glue("DTP{dose_number}")
  table_name <- glue("{toupper(indicator_access)}{dose_number}")
  lower_bound_col <- glue("{table_name}_CI_LOWER_BOUND")
  upper_bound_col <- glue("{table_name}_CI_UPPER_BOUND")
  sample_avg_col <- glue("{table_name}_SAMPLE_AVERAGE")

  # Survey-weighted mean of the dose indicator per admin unit, with 95% CIs
  computed_proportions <- svyby(
    formula = as.formula(paste("~", vaccine_colname)),
    by = reformulate(admin_name_col),
    design = dtp_design,
    FUN = svymean,
    vartype = "ci",
    level = 0.95,
    na.rm = TRUE,
    influence = TRUE
  )

  # Make sure the second column carries the plain indicator name
  names(computed_proportions)[2] <- vaccine_colname

  # Keep the raw result available under the table name as well
  assign(table_name, computed_proportions)

  # Work on the result under its final column names, as a data.table
  df <- computed_proportions
  names(df)[names(df) == "ci_l"] <- lower_bound_col
  names(df)[names(df) == "ci_u"] <- upper_bound_col
  names(df)[names(df) == vaccine_colname] <- sample_avg_col
  setDT(df)

  # Keep the CI bounds within [0, 1] (small samples can give wide intervals)
  df[, (lower_bound_col) := pmax(get(lower_bound_col), 0)]
  df[, (upper_bound_col) := pmin(get(upper_bound_col), 1)]

  # Express bounds and average as percentages
  pct_cols <- as.character(c(lower_bound_col, upper_bound_col, sample_avg_col))
  df[, (pct_cols) := lapply(.SD, function(x) x * 100), .SDcols = pct_cols]

  # Add the admin units (every admin unit is kept, matched or not)
  df <- merge.data.table(admin_data, df, by = admin_name_col, all.x = TRUE)

  # Save the dose table as .csv and .parquet
  filename_without_extension <- glue("{COUNTRY_CODE}_{data_source}_{admin_level}_{table_name}")
  fwrite(df, file = file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, ".csv")))
  write_parquet(df, file.path(OUTPUT_DATA_PATH, paste0(filename_without_extension, ".parquet")))

  # Append this dose to the summary table (used later for the dropout rates)
  DTP_DROPOUT <- merge.data.table(DTP_DROPOUT, df, by = admin_cols)
}
  

## Dropout rate indicator

Compute the dropout rates and their plots. For each vaccine dose:
- make the dropout rates
- add them to the summary file and save it as .csv and parquet
- make plots and save them

In [None]:
# Drop every column whose name contains "BOUND", i.e. the CI columns
# (as requested)
DTP_DROPOUT[
  ,
  (names(DTP_DROPOUT)[grepl("BOUND", names(DTP_DROPOUT))]) := NULL
]

In [None]:
# Dropout (attrition) between every pair of doses: for a reference dose r and
# a later dose c, the value is (1 - coverage_c / coverage_r) * 100.
for (current_dose in vaccination_doses) {
  # Reference doses are the processed doses that come before the current one.
  # Taking them from vaccination_doses (instead of 1:(current_dose - 1), which
  # yields c(1, 0) for the first dose) gives an empty set when there is no
  # earlier dose and only refers to coverage columns that were computed.
  reference_doses <- vaccination_doses[
    vaccination_doses >= 1 & vaccination_doses < current_dose
  ]

  for (reference_dose in reference_doses) {
    attrition_col <- glue("{toupper(indicator_attrition)}_{reference_dose}_{current_dose}")
    print(glue('Computing attrition for {attrition_col}'))
    numerator_colname <- glue("{toupper(indicator_access)}{current_dose}_SAMPLE_AVERAGE")
    denominator_colname <- glue("{toupper(indicator_access)}{reference_dose}_SAMPLE_AVERAGE")
    # percentages instead of rates, as requested
    DTP_DROPOUT[, (attrition_col) := (1 - get(numerator_colname) / get(denominator_colname)) * 100]
  }
}

# The coverage columns were only needed for the computation above
DTP_DROPOUT[, grep("SAMPLE_AVERAGE", names(DTP_DROPOUT), value = TRUE) := NULL]

In [None]:
# Save the dropout table as .csv and .parquet under a common base name
dtp_dropout_filename_without_extension <- glue("{COUNTRY_CODE}_{data_source}_{admin_level}_{indicator_attrition}")

fwrite(
  DTP_DROPOUT,
  file = file.path(OUTPUT_DATA_PATH, paste0(dtp_dropout_filename_without_extension, ".csv"))
)
write_parquet(
  DTP_DROPOUT,
  file.path(OUTPUT_DATA_PATH, paste0(dtp_dropout_filename_without_extension, ".parquet"))
)