# **DRC DHIS2 Data Quality Analysis and Incidence Calculations**


In [None]:
# ---- Workspace paths ----
# Root of the SNT workspace plus the folders holding shared code and configuration
SNT_ROOT_PATH  <- "~/workspace"
CODE_PATH      <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH    <- file.path(SNT_ROOT_PATH, "configuration")

# Load shared utility functions. install_and_load() and
# get_latest_dataset_file_in_memory() are not defined in this notebook;
# presumably they come from this file -- verify against snt_utils.r.
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages required by this notebook
required_packages <- c("dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate", "viridis", "patchwork", "zoo", "purrr", "arrow", "sf", "reticulate") 

# Install (if needed) and attach the packages
install_and_load(required_packages)

# Set environment to load openhexa.sdk from the right environment
Sys.setenv(PROJ_LIB = "/opt/conda/share/proj")
Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal")
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")

# Show which Python interpreter reticulate resolved, then load openhexa.sdk.
# The namespace-qualified call matches py_config() above and does not rely on
# reticulate being attached.
reticulate::py_config()$python
openhexa <- reticulate::import("openhexa.sdk")

# Load SNT config; on failure, print and re-raise with a readable message
config_json <- tryCatch({ jsonlite::fromJSON(file.path(CONFIG_PATH, "SNT_config.json"))},
    error = function(e) {
        # ": " separates the fixed text from the underlying condition message
        msg <- paste0("Error while loading configuration: ", conditionMessage(e))
        cat(msg)   
        stop(msg) 
    })

# Configuration variables
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)

In [None]:
# Print the dimensions of a table-like object.
#   df   : object supporting nrow() and ncol() (e.g. a data frame or matrix)
#   name : label shown in the message; defaults to the expression passed as `df`
# Writes one line to the console followed by a blank line.
printdim <- function(df, name = deparse(substitute(df))) {
  n_rows <- nrow(df)
  n_cols <- ncol(df)
  cat("Dimensions of", name, ":", n_rows, "rows x", n_cols, "columns\n\n")
}

# 2. Reporting Completeness of Composite Indicators

In [None]:
# Load the latest "<COUNTRY_CODE>_routine.parquet" file of the formatted DHIS2
# dataset; on failure, print the error and stop the notebook.
data <- tryCatch(
  {
    get_latest_dataset_file_in_memory(
      dataset_name,
      paste0(COUNTRY_CODE, "_routine.parquet")
    )
  },
  error = function(e) {
    msg <- paste("Error while loading DHIS2 Routine data for: ", COUNTRY_CODE, conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)
printdim(data)

In [None]:
# Working copy of the routine data with a Date column set to the first day
# of each YEAR-MONTH period
rds <- mutate(
  data,
  Date = as.Date(paste(YEAR, MONTH, "01", sep = "-"))
)

# Quick check of the resulting table: size and available columns
printdim(rds)
colnames(rds)

## 2.1 Proportion of health facilities and districts reporting values above zero, "0" values or NA values.

In [None]:
# Facility-level reporting status: for every indicator and month, the share of
# health facilities (OU) whose value is missing, zero, or positive.

# --- STEP 1: Indicator columns = names of the indicator definitions in the SNT config
indicator_cols <- names(config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS) 

# --- STEP 2: Reshape data to long format (one row per OU x Date x Indicator)
data_long <- rds %>%
  select(OU, Date, all_of(indicator_cols)) %>%
  pivot_longer(cols = all_of(indicator_cols), names_to = "Indicator", values_to = "value") %>%
  mutate(value = as.numeric(value))

# --- STEP 3: Create full grid (OU × Indicator × Date)
# Every facility is expected for every indicator and month, so facility-months
# with no row in the data show up as missing after the join below.
full_grid <- expand_grid(
  OU = unique(data_long$OU),
  Indicator = unique(data_long$Indicator),
  Date = unique(data_long$Date)
)

# --- STEP 4: Join and detect reporting status
# The three flags are mutually exclusive: NA -> missing, 0 -> zero, > 0 -> positive.
reporting_check <- full_grid %>%
  left_join(data_long, by = c("OU", "Indicator", "Date")) %>%
  mutate(
    is_missing = is.na(value),
    is_zero = value == 0 & !is.na(value),
    is_positive = value > 0 & !is.na(value)
  )

# --- STEP 5: Summarise by Indicator and Date
reporting_summary <- reporting_check %>%
  group_by(Indicator, Date) %>%
  summarise(
    n_total = n_distinct(OU),
    n_missing = sum(is_missing),
    n_zero = sum(is_zero),
    n_positive = sum(is_positive),
    pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),
    pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),
    pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),
    .groups = "drop"
  )

# --- STEP 6: Reshape for stacked bar plot
plot_data <- reporting_summary %>%
  pivot_longer(
    cols = starts_with("pct_"),
    names_to = "Status", values_to = "Percentage"
  ) %>%
  mutate(Status = recode(Status,
                         pct_missing = "Missing",
                         pct_zero = "Zero reported",
                         pct_positive = "Positive reported")) %>%
  complete(Indicator, Date, Status, fill = list(Percentage = 0))

# --- STEP 7: Plot
options(repr.plot.width = 15, repr.plot.height = 10)
ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +
  geom_col(position = "stack") +
  facet_wrap(~ Indicator, scales = "free_y") +
  # Zoom with coord_cartesian() rather than scale limits: scale limits censor
  # out-of-range values after stacking, so a stack whose three percentages sum
  # to slightly more than 100 (floating-point rounding) would lose its top
  # segment. coord_cartesian() only changes the visible range.
  coord_cartesian(ylim = c(0, 100)) +
  scale_fill_manual(values = c(
    "Missing" = "tomato",
    "Zero reported" = "skyblue",
    "Positive reported" = "green"
  )) +
  labs(
    title = "Health Facility Reporting Status by Indicator",
    x = "Month", y = "% of Facilities",
    fill = "Reporting Status"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    strip.text = element_text(size = 14),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )

**Conclusions** Over 75% of health facilities report on CONF, SUSP, TEST and MALTREAT.

In [None]:
# District-level reporting status: for every indicator and month, the share of
# districts (ADM2) where all facility values are missing, all reported values
# are zero, or at least one facility reported a positive value.

# --- STEP 2: Long format and keep ADM2
data_long <- rds %>%
  select(ADM2, OU, Date, all_of(indicator_cols)) %>%
  pivot_longer(cols = all_of(indicator_cols), names_to = "Indicator", values_to = "value") %>%
  mutate(value = as.numeric(value))

# --- STEP 3: Full expected grid at ADM2 level
full_grid <- expand_grid(
  ADM2 = unique(data_long$ADM2),
  Indicator = unique(data_long$Indicator),
  Date = unique(data_long$Date)
)

# --- STEP 4: Detect if any OU reported per ADM2 × Indicator × Date
# Note: all(value == 0, na.rm = TRUE) is also TRUE when every value is NA
# (all() of an empty vector is TRUE); STEP 6 excludes those rows from the
# zero count with `!is_missing`.
reporting_check <- data_long %>%
  group_by(ADM2, Indicator, Date) %>%
  summarise(
    is_missing = all(is.na(value)),
    is_zero = all(value == 0, na.rm = TRUE),
    is_positive = any(value > 0, na.rm = TRUE),
    .groups = "drop"
  )

# --- STEP 5: Join with full grid to account for non-reporting ADM2s
# ADM2 x Indicator x Date combinations absent from the data count as missing.
reporting_full <- full_grid %>%
  left_join(reporting_check, by = c("ADM2", "Indicator", "Date")) %>%
  mutate(
    is_missing = replace_na(is_missing, TRUE),
    is_zero = replace_na(is_zero, FALSE),
    is_positive = replace_na(is_positive, FALSE)
  )

# --- STEP 6: Summary per Indicator and Date
reporting_summary <- reporting_full %>%
  group_by(Indicator, Date) %>%
  summarise(
    n_total = n_distinct(ADM2),
    n_missing = sum(is_missing),
    n_zero = sum(is_zero & !is_missing),
    n_positive = sum(is_positive),
    pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),
    pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),
    pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),
    .groups = "drop"
  )

# --- STEP 7: Reshape for plotting
plot_data <- reporting_summary %>%
  pivot_longer(cols = starts_with("pct_"), names_to = "Status", values_to = "Percentage") %>%
  mutate(Status = recode(Status,
                         pct_missing = "Missing",
                         pct_zero = "Zero reported",
                         pct_positive = "Positive reported")) %>%
  complete(Indicator, Date, Status, fill = list(Percentage = 0))

# --- STEP 8: Plot
ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +
  geom_col(position = "stack") +
  facet_wrap(~ Indicator, scales = "free_y") +
  # Zoom with coord_cartesian() rather than scale limits: scale limits censor
  # out-of-range values after stacking, so a stack whose three percentages sum
  # to slightly more than 100 (floating-point rounding) would lose its top
  # segment. coord_cartesian() only changes the visible range.
  coord_cartesian(ylim = c(0, 100)) +
  scale_fill_manual(values = c(
    "Missing" = "tomato",
    "Zero reported" = "skyblue",
    "Positive reported" = "green"
  )) +
  labs(
    title = "District-Level Reporting Status by Indicator",
    x = "Month", y = "% of Districts (ADM2)",
    fill = "Reporting Status"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    strip.text = element_text(size = 14),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )



**Conclusion** When aggregating at the health district level (ADM2), we observe that virtually all districts report on CONF, MALTREAT, SUSP, TEST and MALADM.

## 2.2 Reporting Rate

To accurately measure data completeness, we calculate the monthly reporting rate per health district (ADM2) as the **proportion of facility–months with a non-missing value for confirmed malaria cases (CONF)**. For each ADM2, we expect one report per facility per month. For example, if an ADM2 has 25 facilities, we expect 25 reports for a given month. If only 21 of those facilities report a CONF value that month (zero counts included), the reporting rate is 21/25 = 84%.

This method improves over simple binary completeness flags by accounting for both spatial (facility coverage) and temporal (monthly timeliness) dimensions. A facility-month is considered reported if the CONF value is not missing, which serves as a proxy for overall completeness of malaria indicators. We use the presence of CONF (confirmed malaria cases) as the condition for marking a facility-month as reported because it is a core indicator consistently tracked across the dataset. While other indicators (e.g. TEST, PRES) could be used, CONF offers a reliable and relevant proxy for report completeness in the context of malaria surveillance. This choice ensures alignment with the structure of the incidence calculation, which is also based on confirmed cases.

In [None]:
# Flag each facility-month row: 1 when a CONF value is present, 0 when it is NA
rds_reporting <- mutate(rds, REPORTED_CONF = as.numeric(!is.na(CONF)))

# Aggregate at ADM2 x YEAR x MONTH level.
# The denominator is the number of distinct facilities with a row in that
# district-month (one expected report per facility per month).
reporting_rate_monthly <- summarise(
  group_by(rds_reporting, ADM2, YEAR, MONTH),
  n_facilities = n_distinct(OU),
  n_reports = sum(REPORTED_CONF, na.rm = TRUE),
  n_expected = n_facilities,
  reporting_rate = n_reports / n_expected,
  .groups = "drop"
)

In [None]:
# Add a plotting date (first day of the month) and turn ADM2 into a factor
reporting_rate_monthly <- mutate(
  reporting_rate_monthly,
  date = as.Date(paste(YEAR, MONTH, "01", sep = "-")),
  ADM2 = factor(ADM2)
)

# Heatmap: one tile per district (rows) and month (columns), filled by the
# reporting rate expressed in percent
options(repr.plot.width = 15, repr.plot.height = 10)
ggplot(
  reporting_rate_monthly,
  aes(x = date, y = ADM2, fill = reporting_rate * 100)
) +
  geom_tile() +
  scale_fill_viridis_c(
    name = "Reporting rate (%)",
    option = "C",
    direction = 1,  # dark end of the palette = low, yellow = high
    limits = c(0, 100)
  ) +
  labs(
    x = "Month",
    y = "Health District",
    title = "Monthly Reporting Rate by Health District",
    subtitle = "Each tile represents the reporting completeness per district per month"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),
    axis.text.y = element_text(size = 9),
    panel.grid = element_blank(),
    legend.position = "right"
  )

In [None]:
# Yearly reporting-rate maps per ADM2

# Load the latest "<COUNTRY_CODE>_shapes.geojson" file of the formatted DHIS2
# dataset; on failure, print the error and stop the notebook.
shapesdata <- tryCatch(
  {
    get_latest_dataset_file_in_memory(
      dataset_name,
      paste0(COUNTRY_CODE, "_shapes.geojson")
    )
  },
  error = function(e) {
    msg <- paste("Error while loading DHIS2 shapes data for: ", COUNTRY_CODE, conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)

# Yearly rate = total monthly reports / total monthly expected reports, so each
# month contributes its own denominator
reporting_rate_yearly <- summarise(
  group_by(reporting_rate_monthly, ADM2, YEAR),
  n_reports = sum(n_reports, na.rm = TRUE),
  n_expected = sum(n_expected, na.rm = TRUE),
  reporting_rate = n_reports / n_expected,
  .groups = "drop"
)

# Attach the yearly rates to the district shapes.
# NOTE(review): shapes without a matching ADM2 get YEAR = NA after the left
# join and would be drawn in a separate "NA" facet -- confirm this is intended.
map_data <- st_as_sf(
  left_join(shapesdata, reporting_rate_yearly, by = "ADM2")
)

# One map per year, districts filled by reporting rate
options(repr.plot.width = 15, repr.plot.height = 10)
ggplot(map_data) +
  geom_sf(aes(fill = reporting_rate)) +
  facet_wrap(~ YEAR) +
  scale_fill_viridis_c(option = "C") +
  labs(
    fill = "Reporting Rate",
    title = "Reporting Rate per ADM2 by Year"
  ) +
  theme_minimal(base_size = 14)

### Important Note on Reporting Rate Calculation (Monthly vs Yearly):
When calculating reporting rates, aggregating directly at the yearly level assumes a fixed set of facilities throughout the year, which can overestimate the number of expected reports and underestimate the reporting rate. In contrast, calculating rates per month and then aggregating accounts for the actual presence of facilities in each period, better reflecting real reporting dynamics — including facilities entering or leaving the system. This approach aligns more closely with how DHIS2 data behaves and avoids underestimating completeness.