# Outliers Detection PATH method

--------------------

**Input**: 
* **routine DHIS2** data formatted.
    * from Dataset "**snt-dhis2-formatted**", `XXX_routine.parquet`

**Output**: 
All outputs saved to Dataset **snt-outliers-imputation**, with the following .parquet files:
* **outliers table** with flags for outlier data points:
    *  cols: YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE, **OUTLIER_TREND**
    *  Filename: `XXX_routine_outliers-trend_detection.parquet`
* **Routine data imputed** Original routine data with imputed value:
    *   cols: PERIOD, YEAR, MONTH, ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, `DHIS2_INDICATORS`
    *   Filename: `XXX_routine_outliers-trend_imputed.parquet`
* **Routine data removed** Original routine data with outliers values removed:
    *   cols: PERIOD, YEAR, MONTH, ADM1_ID, ADM1_NAME, ADM2_ID, ADM2_NAME, OU_ID, OU_NAME, `DHIS2_INDICATORS`
    *   Filename: `XXX_routine_outliers-trend_removed.parquet`
* **DB Table** 🐘 in OpenHEXA WS **Database** with added cols needed for 📊 Shiny App: SNT Outliers Explorer
    *   cols: YEAR, MONTH, ADM1_ID, ADM2_ID, OU_ID, INDICATOR, VALUE, **OUTLIER_TREND**
    *   Table name: `outliers_detection_results`

---------------------

In [None]:
# Parameters
# MEAN_DEVIATION <- 10
# EPI_THRESHOLD <- 5 

## 1. Setup

In [None]:
# Workspace layout: every folder is resolved relative to the workspace root
ROOT_PATH <- "~/workspace"
CODE_PATH <- file.path(ROOT_PATH, "code")
CONFIG_PATH <- file.path(ROOT_PATH, "configuration")
DATA_PATH <- file.path(ROOT_PATH, "data")

# Shared helper script (sourced before anything else in this notebook runs)
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages needed by this notebook
required_packages <- c(
    "arrow",
    "tidyverse",
    "jsonlite",
    "DBI",
    "RPostgres",
    "reticulate",
    "glue"
)
install_and_load(required_packages)

# Environment variables (geospatial data folders and the Python used by reticulate)
Sys.setenv(
    PROJ_LIB = "/opt/conda/share/proj",
    GDAL_DATA = "/opt/conda/share/gdal",
    RETICULATE_PYTHON = "/opt/conda/bin/python"
)

# OpenHEXA SDK, imported as a Python module
openhexa <- import("openhexa.sdk")

### 1.1. Validate parameters

In [None]:
# Fall back to the default thresholds when no value was supplied upstream
# (e.g. by the commented-out parameters cell at the top of the notebook).
if (!exists("MEAN_DEVIATION")) {
    MEAN_DEVIATION <- 10
}
if (!exists("EPI_THRESHOLD")) {
    EPI_THRESHOLD <- 5
}

### 1.2. Load and check `SNT_config` file

In [None]:
# Read the SNT configuration file.
# Any failure is logged through log_msg() and then re-raised, which stops the run.
handle_config_error <- function(e) {
    msg <- glue("[ERROR] Error while loading configuration {conditionMessage(e)}")
    log_msg(msg)
    stop(msg)
}

config_json <- tryCatch(
    fromJSON(file.path(CONFIG_PATH, "SNT_config.json")),
    error = handle_config_error
)

log_msg(glue("SNT configuration loaded from  : {file.path(CONFIG_PATH, 'SNT_config.json')}"))

In [None]:
# Every key listed here must exist in the SNT_CONFIG section; the first one
# found missing is logged and aborts the run.
required_conf <- c("COUNTRY_CODE", "DHIS2_ADMINISTRATION_1", "DHIS2_ADMINISTRATION_2")
for (conf in required_conf) {
    print(glue("{conf} : {config_json$SNT_CONFIG[conf]}"))
    if (!is.null(config_json$SNT_CONFIG[[conf]])) {
        next
    }
    msg <- paste("Missing configuration input:", conf)
    log_msg(msg)
    stop(msg)
}

# Expose the configuration values used by the rest of the notebook
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)
ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)

# Indicator names are the keys of the indicator definitions, e.g. for PATH: c("TEST", "CONF", "PRES")
indicator_definitions <- config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS
DHIS2_INDICATORS <- names(indicator_definitions)

## 2. Load Data

### 2.1. **Routine** data (DHIS2) 

Formatted & aggregated data stored in OpenHEXA Dataset "**SNT_DHIS2_FORMATTED**"

In [None]:
# Fetch the formatted routine file of this country from the configured dataset.
# A load failure is logged and re-raised so the notebook stops here.
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
routine_filename <- paste0(COUNTRY_CODE, "_routine.parquet")

handle_routine_load_error <- function(e) {
    msg <- glue("[ERROR] Error while loading DHIS2 routine data file for {COUNTRY_CODE} : {conditionMessage(e)}")
    log_msg(msg)
    stop(msg)
}

dhis2_routine <- tryCatch(
    get_latest_dataset_file_in_memory(dataset_name, routine_filename),
    error = handle_routine_load_error
)

# Report where the data came from and how large it is
log_msg(glue("DHIS2 routine data loaded from dataset : {dataset_name}"))
log_msg(glue("DHIS2 routine data loaded has dimensions: {nrow(dhis2_routine)} rows, {ncol(dhis2_routine)} columns."))
print(dim(dhis2_routine))
head(dhis2_routine, 4)

🔍 **Assert indicators are present**

In [None]:
# Stop with an error naming the first configured indicator that has no
# matching column in the routine data.
missing_indicators <- DHIS2_INDICATORS[!(DHIS2_INDICATORS %in% colnames(dhis2_routine))]
if (length(missing_indicators) > 0) {
    msg <- paste("[ERROR] Missing indicator column in routine data: ", missing_indicators[1])
    log_msg(msg)
    stop(msg)
}

## 3. Outliers Detection

### 3.1. Transform routine data  

* **Pivot longer**: indicator columns become rows (one row per `OU_ID` x `PERIOD` x `INDICATOR`)

In [None]:
# Reshape to long format: keep the location / period keys and turn each
# indicator column into an (INDICATOR, VALUE) pair.
routine_key_cols <- c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")

dhis2_routine_long <- pivot_longer(
    select(dhis2_routine, all_of(c(routine_key_cols, DHIS2_INDICATORS))),
    cols = all_of(DHIS2_INDICATORS),
    names_to = "INDICATOR",
    values_to = "VALUE"
)

print(dim(dhis2_routine_long))
head(dhis2_routine_long, 2)

🔍 **Remove duplicated values**

In [None]:
# Key combinations (location x period x indicator) that occur more than once
duplicated <- dhis2_routine_long %>%
    count(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR, name = "n") %>%
    filter(n > 1L)

# When duplicates exist, keep only the first row of each key combination
if (nrow(duplicated) > 0) {
    log_msg(glue("Removing {nrow(duplicated)} duplicated values."))
    dhis2_routine_long <- distinct(
        dhis2_routine_long,
        ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR,
        .keep_all = TRUE
    )
    head(duplicated)
}

### 3.2. Calculate **summary stats**
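For each `OU_ID` (Health Facility) x `INDICATOR`, using only values greater than zero, calculate:
* `MEAN_80`: mean of the central 80% of values (those ranked between the 10th and 90th percentiles), rounded up
* `SD_80`: standard deviation of the same central 80% of values, rounded up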

In [None]:
# Trimmed statistics (PATH method): MEAN_80 and SD_80 per facility x indicator
log_msg(glue("Computing trend outliers (PATH Method) over: {paste(DHIS2_INDICATORS, collapse=', ')}."))

# Zeros are dropped first: they are more likely non-reporting periods than true zeros.
# Sorting happens before grouping; arrange() ignores groups by default, so the
# within-group order (ascending VALUE) is the same either way.
routine_mean_80 <- dhis2_routine_long %>%
    filter(VALUE > 0) %>%
    arrange(VALUE) %>%
    group_by(ADM1_ID, ADM2_ID, OU_ID, INDICATOR) %>%
    # keep the middle 80% of each group (ranks above 10% and up to 90% of the
    # group size) to reduce the effect of extreme values
    filter(row_number() > n() * 0.1, row_number() <= n() * 0.9) %>%
    summarise(
        MEAN_80 = ceiling(mean(VALUE, na.rm = TRUE)),
        SD_80 = ceiling(sd(VALUE, na.rm = TRUE)),
        .groups = "drop"
    )

# Attach the facility-level statistics to every raw observation
dhis2_routine_stats <- left_join(
    dhis2_routine_long,
    routine_mean_80,
    by = c("ADM1_ID", "ADM2_ID", "OU_ID", "INDICATOR")
)

### 3.3. Flag outliers

In [None]:
# A value is an outlier when it exceeds MEAN_80 + MEAN_DEVIATION * SD_80.
# If VALUE, MEAN_80 or SD_80 is missing the comparison is NA, which is turned
# into FALSE (not an outlier).
dhis2_routine_outliers <- dhis2_routine_stats %>%
    mutate(OUTLIER_TREND = coalesce(VALUE > (MEAN_80 + MEAN_DEVIATION * SD_80), FALSE))

dim(dhis2_routine_outliers)
head(dhis2_routine_outliers, 2)

## 4. Exceptions

### 4.1. Detect possible stock-outs: 
1) If presumed cases (`PRES`) suddenly jump up at a time when testing (`TEST`) is below its usual level, this indicates a possible RDT stock-out. In that case the `PRES` value is not treated as an outlier, provided it stays below the upper outlier limit computed for tests.

In [None]:
# Per facility and period: was testing below its MEAN_80, and what is the
# upper outlier limit for tests (presumed cases may not exceed it)?
low_testing_periods <- dhis2_routine_outliers %>%
    filter(INDICATOR == "TEST") %>%
    transmute(
        ADM1_ID,
        ADM2_ID,
        OU_ID,
        PERIOD,
        low_testing = coalesce(VALUE < MEAN_80, FALSE),
        upper_limit_tested = MEAN_80 + MEAN_DEVIATION * SD_80
    )

# Possible stock-out: a flagged PRES value during a low-testing period that
# still stays below the upper limit for tests.
possible_stockout <- dhis2_routine_outliers %>%
    filter(OUTLIER_TREND == TRUE) %>%
    left_join(low_testing_periods, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>%
    # rows where any part of the condition is NA are dropped by filter()
    filter(low_testing == TRUE & INDICATOR == "PRES" & VALUE < upper_limit_tested) %>%
    transmute(ADM1_ID, ADM2_ID, OU_ID, PERIOD, POSSIBLE_STKOUT = TRUE)

In [None]:
# Number of distinct facilities with at least one possible stock-out period;
# only logged when there is something to report.
stockouts_n <- possible_stockout$OU_ID %>%
    unique() %>%
    length()

if (stockouts_n > 0) {
    log_msg(glue("There are {length(unique(possible_stockout$OU_ID))} Health Facilities (OUs) with both high presumed cases (within a reasonable range) and low testing in the routine_data."))
}

head(possible_stockout, 2)

### 4.2. Detect possible epidemic: 

2) Sometimes tests and confirmed cases both jump up together - this can be due to a genuine epidemic event, or because there was a lag in reporting in previous months.  
   Identify Health Facilities (OUs) that are flagged as outliers for both tests and confirmed cases in the same period, and where at least one of the two values is extreme (at least **EPI_THRESHOLD** times the outlier threshold); these could be legitimate epidemic outbreaks.

In [None]:
# Flag possible epidemics: facility-periods where TEST and CONF are both
# flagged as outliers and at least one of them reaches EPI_THRESHOLD times
# its outlier threshold.
#
# The TEST and CONF rows are matched with an inner join instead of
# pivot_wider(): pivot_wider() only creates the total_CONF / total_TEST columns
# when flagged rows exist for that indicator, so the later filter failed with
# "object not found" whenever one indicator had no outliers. With the join an
# empty input simply yields an empty result.
epidemic_keys <- c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")

# Flagged TEST / CONF rows together with their outlier threshold
epidemic_candidates <- dhis2_routine_outliers %>%
    filter(OUTLIER_TREND == TRUE, INDICATOR %in% c("TEST", "CONF")) %>%
    mutate(max_value = MEAN_80 + MEAN_DEVIATION * SD_80)

conf_spikes <- epidemic_candidates %>%
    filter(INDICATOR == "CONF") %>%
    select(all_of(epidemic_keys), total_CONF = VALUE, max_value_CONF = max_value)

test_spikes <- epidemic_candidates %>%
    filter(INDICATOR == "TEST") %>%
    select(all_of(epidemic_keys), total_TEST = VALUE, max_value_TEST = max_value)

possible_epidemic <- conf_spikes %>%
    # facility-periods flagged for both indicators
    inner_join(test_spikes, by = epidemic_keys) %>%
    filter(total_CONF >= max_value_CONF & total_TEST >= max_value_TEST) %>%
    # at least one of the two values is extreme
    filter(total_CONF >= EPI_THRESHOLD * max_value_CONF | total_TEST >= EPI_THRESHOLD * max_value_TEST) %>%
    mutate(POSSIBLE_EPID = TRUE) %>%
    select(all_of(c(epidemic_keys, "POSSIBLE_EPID")))

# Report through log_msg(), like the stock-out summary above
epidemic_n <- length(unique(possible_epidemic$OU_ID))
if (epidemic_n > 0) {
    log_msg(glue("There are {epidemic_n} health facilities (OUs) where both the number of tests and confirmed cases increased sharply at the same time, sign of possible epidemic."))
}
head(possible_epidemic, 2)

### 4.3. Join exception tables and correct outlier flags

Join corrected outlier columns into the final table:

**OUTLIER_TREND**: Indicates whether the value is outside a reasonable range.  
**OUTLIER_TREND_01** (exception 1): Corrects the flag for presumed cases when an RDT stock-out is possible.  
**OUTLIER_TREND_02** (exception 2): Accounts for potential epidemics; it is renamed to **OUTLIER_TREND** in the final table.

In [None]:
# Join columns and correct outliers column
# Attaches the stock-out and epidemic exception flags (matched on facility and
# period) to every row, derives the corrected outlier flag and keeps the
# columns used by the following steps.
routine_data_outliers_clean <- dhis2_routine_outliers %>% 
    left_join(possible_stockout, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>%
    # Exception 1: a flagged PRES value with a possible stock-out is no longer an outlier
    mutate(OUTLIER_TREND_01 = case_when(OUTLIER_TREND == TRUE & INDICATOR =="PRES" & POSSIBLE_STKOUT == TRUE ~ FALSE, TRUE ~ OUTLIER_TREND)) %>%
    left_join(possible_epidemic, by = c("ADM1_ID", "ADM2_ID", "OU_ID", "PERIOD")) %>%
    # Exception 2.
    # NOTE(review): the first branch maps an already-TRUE flag to TRUE, so this
    # step never changes any value (OUTLIER_TREND_02 always equals
    # OUTLIER_TREND_01). If possible epidemics are meant to be un-flagged the
    # branch should presumably yield FALSE -- confirm the intended rule.
    mutate(OUTLIER_TREND_02 = case_when(OUTLIER_TREND_01 == TRUE & INDICATOR %in% c("CONF", "TEST") & POSSIBLE_EPID == TRUE ~ TRUE, TRUE ~ OUTLIER_TREND_01)) %>%    
    # The corrected flag replaces the original OUTLIER_TREND column
    select(-OUTLIER_TREND) %>%
    rename(OUTLIER_TREND = OUTLIER_TREND_02) %>% 
    # PERIOD is split by character position: 1-4 -> YEAR, 5-6 -> MONTH
    mutate(
        YEAR = as.integer(substr(PERIOD, 1, 4)),
        MONTH = as.integer(substr(PERIOD, 5, 6))) %>%
    # POSSIBLE_STKOUT / POSSIBLE_EPID are NA on rows without a match in the exception tables
    select(all_of(
        c(
          "PERIOD",
          "YEAR",
          "MONTH",
          "ADM1_ID",
          "ADM2_ID",
          "OU_ID",          
          "INDICATOR",
          "VALUE",
          "MEAN_80",
          "SD_80",
          "OUTLIER_TREND",
          "POSSIBLE_STKOUT",          
          "POSSIBLE_EPID"          
        )
    ))

print(dim(routine_data_outliers_clean))
head(routine_data_outliers_clean, 2)

In [None]:
# routine_data_outliers_clean %>% filter(OUTLIER_TREND==TRUE) %>% head(3)
# TEST VALUE : OU_ID=="a6ajNA9VZ7z" & PERIOD=="202201" (TEST) 52 -> 25.1

## 5. Routine data imputation

We consider the flagged outliers in the corrected column **OUTLIER_TREND** (i.e. **OUTLIER_TREND_02** after the exceptions above) and replace their values with **MEAN_80**.

### 5.1. Impute outliers with MEAN_80

In [None]:
# Replace flagged outliers by MEAN_80 and keep the original value alongside.
# Result columns: VALUE_OLD (original), VALUE_IMPUTED (MEAN_80 for outliers,
# original value otherwise) and the OUTLIER_TREND flag.
routine_data_outliers_imputed <- routine_data_outliers_clean %>%
    rename(VALUE_OLD = VALUE) %>%
    # replace outliers with the mean 80% value
    mutate(VALUE_IMPUTED = ifelse(OUTLIER_TREND == TRUE, MEAN_80, VALUE_OLD)) %>%
    # Sort by the actual columns. The previous call passed quoted names, which
    # arrange() treats as constant strings, so the rows were never reordered.
    arrange(ADM1_ID, ADM2_ID, OU_ID, PERIOD, INDICATOR) %>%
    select(all_of(c("PERIOD", "YEAR", "MONTH", "ADM1_ID", "ADM2_ID", "OU_ID", "INDICATOR", "VALUE_OLD", "VALUE_IMPUTED", "OUTLIER_TREND")))

print(dim(routine_data_outliers_imputed))
head(routine_data_outliers_imputed, 2)

In [None]:
# Dimensions of the flagged rows, then of the non-flagged rows
is_flagged <- routine_data_outliers_imputed$OUTLIER_TREND == TRUE
dim(routine_data_outliers_imputed[is_flagged, ])
dim(routine_data_outliers_imputed[!is_flagged, ])

### 5.2. Format final `imputed` and `removed` routine data

**Imputed**: This table contains the routine data where outliers have been imputed.  
**Removed**: This table contains the routine data with outliers removed from the dataset.

In [None]:
# Lookup table of names for each ADM1 / ADM2 / OU id, taken from the routine data.
# The name cleaning targets DRC naming conventions (a two-letter code prefix and
# the " PROVINCE" / " ZONE DE SANTE" labels).
# NOTE(review): if a unit appears with more than one spelling, distinct() keeps
# one row per spelling and the later joins on the ids will duplicate rows --
# confirm names are unique per id.
pyramid_names <- dhis2_routine %>%
    distinct(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID, OU_ID, OU_NAME) %>%
    # Simplify strings.
    # The prefix pattern ends with a word boundary so that only a standalone
    # two-letter leading token is removed; without it the first two letters of
    # any upper-case name were stripped (e.g. "KINSHASA" -> "NSHASA").
    mutate(
        ADM1_NAME = stringr::str_trim(stringr::str_remove_all(ADM1_NAME, "^[A-Z]{2}\\b| PROVINCE")),
        ADM2_NAME = stringr::str_trim(stringr::str_remove_all(ADM2_NAME, "^[A-Z]{2}\\b| ZONE DE SANTE"))
    )

In [None]:
# Wide routine table in which flagged values have been replaced by MEAN_80:
# one row per facility and period, one column per indicator, with names attached.
imputed_output_cols <- c(
    "PERIOD", "YEAR", "MONTH",
    "ADM1_NAME", "ADM1_ID",
    "ADM2_NAME", "ADM2_ID",
    "OU_ID", "OU_NAME",
    DHIS2_INDICATORS
)

dhis2_routine_outliers_imputed <- routine_data_outliers_imputed %>%
    select(-all_of(c("VALUE_OLD", "OUTLIER_TREND"))) %>%
    pivot_wider(names_from = INDICATOR, values_from = VALUE_IMPUTED) %>%
    mutate(
        YEAR = as.integer(substr(PERIOD, 1, 4)),
        MONTH = as.integer(substr(PERIOD, 5, 6))
    ) %>%
    left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) %>%
    select(all_of(imputed_output_cols))

print(dim(dhis2_routine_outliers_imputed))
head(dhis2_routine_outliers_imputed, 2)

In [None]:
# Wide routine table with flagged values removed: outlier rows are dropped
# before pivoting (so they become NA in the wide table), and facility-periods
# left without any indicator value are discarded.
removed_output_cols <- c(
    "PERIOD", "YEAR", "MONTH",
    "ADM1_NAME", "ADM1_ID",
    "ADM2_NAME", "ADM2_ID",
    "OU_ID", "OU_NAME",
    DHIS2_INDICATORS
)

dhis2_routine_outliers_removed <- routine_data_outliers_imputed %>%
    filter(OUTLIER_TREND == FALSE) %>%
    select(-all_of(c("VALUE_OLD", "OUTLIER_TREND"))) %>%
    pivot_wider(names_from = INDICATOR, values_from = VALUE_IMPUTED) %>%
    mutate(
        YEAR = as.integer(substr(PERIOD, 1, 4)),
        MONTH = as.integer(substr(PERIOD, 5, 6))
    ) %>%
    filter(!if_all(all_of(DHIS2_INDICATORS), is.na)) %>%
    left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID")) %>%
    select(all_of(removed_output_cols))

print(dim(dhis2_routine_outliers_removed))
head(dhis2_routine_outliers_removed, 2)

In [None]:
# Log how many values were flagged and what share of all values that is
flagged_rows <- routine_data_outliers_clean$OUTLIER_TREND == TRUE
nr_of_outliers <- nrow(routine_data_outliers_clean[flagged_rows, ])
total_values <- nrow(routine_data_outliers_clean)
perc_outliers <- nr_of_outliers / total_values * 100
log_msg(glue("Using PATH outliers detection method {nr_of_outliers} outliers were identified ({sprintf('%.3f', perc_outliers)} % of values).")) 

## 6. Export Output tables
Export tables as .parquet files to `data/` folder

In [None]:
# Folder receiving all output files of this notebook.
# It is created when missing so that write_parquet() does not fail on a fresh workspace.
output_path <- file.path(DATA_PATH, "dhis2", "outliers_imputation")
if (!dir.exists(output_path)) {
    dir.create(output_path, recursive = TRUE)
}

# Save routine outliers table (parquet)
outliers_parquet <- file.path(output_path, paste0(COUNTRY_CODE, "_routine_outliers-trend_detection.parquet"))

# Drop the intermediate statistics / exception columns, add a DATE column (first
# day of the month) and attach the location names.
routine_outliers_db_table <- routine_data_outliers_clean %>%
    select(-c("MEAN_80", "SD_80", "POSSIBLE_STKOUT", "POSSIBLE_EPID")) %>%
    # namespaced call: lubridate is only attached by library(tidyverse) from tidyverse 2.0 on
    mutate(DATE = lubridate::make_date(year = YEAR, month = MONTH, day = 1L)) %>%
    left_join(pyramid_names, by = c("ADM1_ID", "ADM2_ID", "OU_ID"))

write_parquet(routine_outliers_db_table, outliers_parquet)
log_msg(glue("Outliers detection table saved under: {outliers_parquet}"))

In [None]:
# Write the imputed routine table to the output folder as parquet
imputed_filename <- paste0(COUNTRY_CODE, "_routine_outliers-trend_imputed.parquet")
imputed_parquet <- file.path(output_path, imputed_filename)
write_parquet(dhis2_routine_outliers_imputed, imputed_parquet)
log_msg(glue("Routine data outliers imputed saved under: {imputed_parquet}"))

In [None]:
# Write the routine table with outliers removed to the output folder as parquet
removed_filename <- paste0(COUNTRY_CODE, "_routine_outliers-trend_removed.parquet")
removed_parquet <- file.path(output_path, removed_filename)
write_parquet(dhis2_routine_outliers_removed, removed_parquet)
log_msg(glue("Routine data outliers removed saved under: {removed_parquet}"))