# **Analyse de la qualité des données de routine**


## Configuration

In [None]:
# Workspace layout: every SNT path is derived from a single root folder.
SNT_ROOT_PATH <- "~/workspace"
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")

# Shared helpers (presumably where install_and_load() and the dataset loader
# used further down are defined -- confirm in snt_utils.r).
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages this notebook relies on.
required_packages <- c(
  "dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate", "viridis",
  "patchwork", "zoo", "purrr", "arrow", "sf", "reticulate", "knitr"
)
install_and_load(required_packages)

# Select the conda interpreter before reticulate is asked for its Python
# configuration, so that openhexa.sdk is imported from that environment.
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
openhexa <- reticulate::import("openhexa.sdk")

# Load the SNT configuration file. A failure is fatal: the message is echoed
# to the cell output and then re-raised as an error.
config_json <- tryCatch(
  jsonlite::fromJSON(file.path(CONFIG_PATH, "SNT_config.json")),
  error = function(e) {
    # The ": " separator keeps the fixed prefix and the underlying error
    # message from running into each other.
    msg <- paste0("Error while loading configuration: ", conditionMessage(e))
    cat(msg, "\n")
    stop(msg, call. = FALSE)
  }
)

# Values read from the configuration and used throughout the notebook.
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)

In [None]:
# Print the number of rows and columns of a table on one line, followed by a
# blank line.
#
# df:   object whose nrow()/ncol() are reported.
# name: label shown in the message; defaults to the expression passed as `df`.
printdim <- function(df, name = deparse(substitute(df))) {
  label <- name
  n_rows <- nrow(df)
  n_cols <- ncol(df)
  cat("Dimensions of", label, ":", n_rows, "rows x", n_cols, "columns\n\n")
}

## 1. Complétude du rapport des établissements de santé de routine par élément de donnée

In [None]:
# Load the three raw DHIS2 extracts (analytics, pyramid, reporting) from the
# latest version of the dataset and print their dimensions.

# Fetch "<COUNTRY_CODE>_dhis2_raw_<kind>.parquet" from the dataset.
# On failure the error is echoed and re-raised with the name of the extract
# that could not be loaded, so the message always matches the file involved.
load_dhis2_extract <- function(kind) {
  file_name <- paste0(COUNTRY_CODE, "_dhis2_raw_", kind, ".parquet")
  tryCatch(
    get_latest_dataset_file_in_memory(dataset_name, file_name),
    error = function(e) {
      msg <- paste0(
        "Error while loading DHIS2 ", kind, " file for: ", COUNTRY_CODE,
        " (", conditionMessage(e), ")"
      )
      cat(msg, "\n")
      stop(msg, call. = FALSE)
    }
  )
}

data <- load_dhis2_extract("analytics")
pyramid_data <- load_dhis2_extract("pyramid")
reporting_data <- load_dhis2_extract("reporting")

printdim(data)
printdim(pyramid_data)
printdim(reporting_data)

## 1.1 Liste des éléments de donnée

In [None]:
# Indicator definitions from the configuration: each named entry holds the DX
# codes that belong to one category.
indicator_defs <- config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS
category_elements <- map(indicator_defs, identity) # one list entry per category
category_names <- names(category_elements)

# Distinct (DX, DX_NAME) pairs present in the analytics extract.
data_elements <- distinct(select(data, DX, DX_NAME))

# Lookup table: every data element tagged with each category whose DX list
# contains it.
classified_elements <- category_names %>%
  map(function(category) {
    data_elements %>%
      filter(DX %in% category_elements[[category]]) %>%
      mutate(Categorie = category)
  }) %>%
  bind_rows()

# Show the lookup ordered by category, then element name.
# TODO (kept from the original): rename the "Catégorie" header to something
# like "Indicateur agrégé" so it reflects what is done later in formatting.
kable(
  arrange(classified_elements, Categorie, DX_NAME),
  caption = "Liste des éléments de données classés par catégorie",
  col.names = c("ID de l'élément", "Nom de l'élément", "Catégorie")
)

## 1.2 Nombre de formations sanitaires
L’"**activité**" a été définie comme le moment où une formation sanitaire a rapporté au moins une information sur un quelconque élément de donnée. Cela a été évalué en vérifiant si au moins un élément de donnée avait été rapporté durant la période spécifiée.

In [None]:
# Preview the first three rows of the pyramid extract.
head(pyramid_data, n = 3)

In [None]:
# Count the facilities listed in the pyramid and how many of them reported at
# least one value in the analytics extract.

# Pyramid level that holds the facilities, as configured.
facility_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL
print(glue::glue("Note: `formation sanitaire` correspond to pyramid level == {facility_level}"))

# Fail early with a readable message when the configured level has no ID
# column in the pyramid, instead of an opaque column-lookup error.
facility_id_col <- paste0("LEVEL_", facility_level, "_ID")
if (!facility_id_col %in% names(pyramid_data)) {
  stop(
    "Column `", facility_id_col, "` not found in pyramid data",
    call. = FALSE
  )
}
total_facilities <- n_distinct(pyramid_data[[facility_id_col]])

# A facility is "active" in a period if any value is reported (including 0).
activity <- data %>%
  group_by(OU, PE) %>%
  summarise(active = any(!is.na(VALUE)), .groups = "drop")

# Facilities that were active in at least one period.
active_facilities <- activity %>%
  filter(active) %>%
  pull(OU) %>%
  n_distinct()

# Share of pyramid facilities that were ever active; NA when the pyramid is
# empty, so the division can never produce NaN.
proportion_active <- if (total_facilities > 0) {
  100 * active_facilities / total_facilities
} else {
  NA_real_
}

cat("Parmi ", total_facilities, " formations sanitaires uniques, ",
    active_facilities,
    " ont rapporté au moins une donnée sur un élément quelconque pendant la période spécifiée (",
    round(proportion_active, 1), "%).\n", sep = "")

## 1.3 Période de couverture des données

In [None]:
# First and last period in the extract, and number of distinct periods.
cat("Premier mois pour lequel les données ont été extraites :", min(data$PE), "\n")
cat("Dernier mois pour lequel les données ont été extraites :", max(data$PE), "\n")
cat("Nombre total de mois couverts par les données :", length(unique(data$PE)), "\n")

# Every month expected between the first and last period. PE is pasted with
# "01" and parsed by ymd(), i.e. it is treated as YYYYMM (monthly data).
all_months <- seq(ymd(paste0(min(data$PE), "01")),
                  ymd(paste0(max(data$PE), "01")),
                  by = "1 month") %>%
              format("%Y%m")

# Compare the expected months with those present in the data and report any
# gap (the sequence above was previously built but never checked).
missing_months <- setdiff(all_months, as.character(unique(data$PE)))
if (length(missing_months) == 0) {
  cat("Aucun mois manquant entre le premier et le dernier mois.\n")
} else {
  cat("Mois sans aucune donnée :", paste(missing_months, collapse = ", "), "\n")
}

## 1.4 Proportion de formations sanitaires ayant rapporté une valeur zéro (0), une valeur manquante ou une valeur positive pour chaque élément de données

In [None]:
# Restrict the plot MIME types offered by the Jupyter kernel to PNG.
options(jupyter.plot_mimetypes = "image/png")

In [None]:
# # --- STEP 1: Convert 'PE' to proper DATE
# data <- data %>%
#   mutate(
#     PE = as.character(PE),
#     DATE = as.Date(paste0(substr(PE, 1, 4), "-", substr(PE, 5, 6), "-01")), # TO DO: 💡 simplify ... mutate(YEAR = PE %/% 100, MONTH = PE %% 100)
#     INDICATOR = DX_NAME  # alias for clarity
#   )

# # --- STEP 2: Build expected full grid (OU × INDICATOR × DATE)
# full_grid <- expand_grid(
#   OU = unique(data$OU),
#   INDICATOR = unique(data$INDICATOR),
#   DATE = unique(data$DATE)
# )

# # --- STEP 3: Join to detect missing / zero / positive
# reporting_check <- full_grid %>%
#   left_join(
#     data %>% select(OU, INDICATOR, DATE, VALUE),
#     by = c("OU", "INDICATOR", "DATE")
#   ) %>%
#   mutate(
#     is_missing = is.na(VALUE),
#     is_zero = VALUE == 0 & !is.na(VALUE),
#     is_positive = VALUE > 0 & !is.na(VALUE)
#   )

# # --- STEP 4: Summarise by INDICATOR and date
# reporting_summary <- reporting_check %>%
#   group_by(INDICATOR, DATE) %>%
#   summarise(
#     n_total = n_distinct(OU),
#     n_missing = sum(is_missing),
#     n_zero = sum(is_zero),
#     n_positive = sum(is_positive),
#     pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),
#     pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),
#     pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),
#     .groups = "drop"
#   )

# # --- STEP 5: Reshape for stacked plot
# plot_data <- reporting_summary %>%
#   pivot_longer(
#     cols = starts_with("pct_"),
#     names_to = "Status", values_to = "Percentage"
#   ) %>%
#   mutate(
#     Status = recode(Status,
#                     pct_missing = "Valeur manquante",
#                     pct_zero = "Valeur nulle rapportée",
#                     pct_positive = "Valeur positive rapportée")
#   ) %>%
#   complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))

In [None]:
# --- STEP 1: collapse the extract to one VALUE per facility (OU), period (PE)
# and data element (DX_NAME), summing over the finer breakdown present in the
# raw rows (presumably category-option combinations -- confirm).

# Sum the reported values of one group. The result is NA only when no row of
# the group holds a value: a partly reported group keeps the sum of what was
# reported instead of being turned into "missing" by a single NA.
sum_reported <- function(x) {
  x <- as.numeric(x)
  if (all(is.na(x))) NA_real_ else sum(x, na.rm = TRUE)
}

data <- data %>%
  group_by(OU, PE, DX_NAME) %>%
  summarise(VALUE = sum_reported(VALUE), .groups = "drop") %>%
  mutate(INDICATOR = DX_NAME) # INDICATOR is an alias of DX_NAME

In [None]:
# --- STEP 2: expected reporting grid -- every facility (OU) crossed with
# every indicator and every period (PE) observed in the data.
full_grid <- expand_grid(
  OU = unique(data[["OU"]]),
  INDICATOR = unique(data[["INDICATOR"]]),
  PE = unique(data[["PE"]])
)


In [None]:
# --- STEP 3: attach the reported values to the expected grid and flag each
# cell as missing (no value), zero, or positive.
reporting_check <- left_join(
  full_grid,
  select(data, OU, INDICATOR, PE, VALUE),
  by = c("OU", "INDICATOR", "PE")
) %>%
  mutate(
    is_missing = is.na(VALUE),
    is_zero = !is_missing & VALUE == 0,
    is_positive = !is_missing & VALUE > 0
  )

In [None]:
# Preview the first three rows of the flagged grid.
head(reporting_check, n = 3)

In [None]:
# --- STEP 4: per indicator and period, count facilities in each reporting
# state, then express the counts as percentages of the facilities in the group.
reporting_summary <- reporting_check %>%
  group_by(INDICATOR, PE) %>%
  summarise(
    n_total = n_distinct(OU),
    n_missing = sum(is_missing),
    n_zero = sum(is_zero),
    n_positive = sum(is_positive),
    .groups = "drop"
  ) %>%
  mutate(
    # The three percentages are expected to add up to 100 for every row.
    pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),
    pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),
    pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0)
  )

In [None]:
# Preview the first three rows of the summary.
head(reporting_summary, n = 3)

In [None]:
# --- STEP 5: long format for the stacked bars -- one row per indicator,
# period and reporting status, with human-readable status labels.
plot_data <- reporting_summary %>%
  pivot_longer(
    cols = starts_with("pct_"),
    names_to = "Status",
    values_to = "Percentage"
  ) %>%
  mutate(
    Status = case_when(
      Status == "pct_missing" ~ "Valeur manquante",
      Status == "pct_zero" ~ "Valeur 0 rapportée",
      Status == "pct_positive" ~ "Valeur positive rapportée",
      TRUE ~ Status
    )
  ) %>%
  # Make every indicator/period/status combination explicit, at 0 when absent.
  complete(INDICATOR, PE, Status, fill = list(Percentage = 0))

In [None]:
# Attach the category lookup to each indicator, matching the indicator name
# against DX_NAME; indicators absent from the lookup get NA.
plot_data <- left_join(
  plot_data,
  classified_elements,
  by = c("INDICATOR" = "DX_NAME")
)

In [None]:
# Preview the first three rows of the plotting table.
head(plot_data, n = 3)

In [None]:
# Categories to plot. Indicators that matched no category carry NA after the
# join; `Categorie == NA` never matches any row, so an NA category would only
# yield an empty facetted plot. It is dropped here.
categories <- unique(plot_data$Categorie)
categories <- categories[!is.na(categories)]

# One stacked-bar plot per category, one facet per indicator.
plots_by_category <- map(categories, function(category) {
  plot_data %>%
    filter(Categorie == category) %>%
    # factor(PE) gives a discrete axis: periods are evenly spaced even when PE
    # is stored as a number such as 202312 / 202401.
    ggplot(aes(x = factor(PE), y = Percentage, fill = Status)) +
    geom_col(position = "stack") +
    geom_hline(yintercept = c(25, 50, 75), color = "white", linewidth = 0.25) +
    facet_wrap(~ INDICATOR, scales = "free_y", nrow = 1) +
    scale_fill_manual(values = c(
      "Valeur manquante" = "tomato",
      "Valeur 0 rapportée" = "skyblue",
      "Valeur positive rapportée" = "green"
    )) +
    labs(
      title = paste("État de rapportage - Catégorie:", category),
      subtitle = "Proportion des valeurs rapportées par mois",
      x = NULL,
      y = "% des formations sanitaires",
      fill = "Statut du rapportage"
    ) +
    theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(face = "bold", size = 16),
      strip.text = element_text(size = 10),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 10),
      # Tilted labels keep the period codes readable.
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
    )
})


In [None]:
# Set the notebook plot size, then display every per-category plot.
options(
  repr.plot.width = 15,
  repr.plot.height = 5
)
plots_by_category


## 1.5 Proportion de mois rapportés pour chaque élément de données par formation sanitaire

In [None]:
# # Max available column
# admin_levels <- names(data)
# name_cols <- grep("LEVEL_\\d+_NAME", admin_levels, value = TRUE)
# max_level <- max(as.numeric(gsub("LEVEL_(\\d+)_NAME", "\\1", name_cols)))
# max_admin_col_name <- paste0("LEVEL_", max_level, "_NAME")

# # Count number of months reported for each indicator per facility
# facility_coverage <- data %>%
#   group_by(OU, !!sym(max_admin_col_name), DX_NAME) %>%
#   summarise(N_VALUES = sum(!is.na(VALUE)), .groups = "drop") %>%
#   pivot_wider(names_from = DX_NAME, 
#               values_from = N_VALUES, 
#               values_fill = 0)

# # Turn wide data back to long for plotting
# facility_long <- facility_coverage %>%
#   pivot_longer(
#     cols = -c(OU, !!sym(max_admin_col_name)),
#     names_to = "indicator",
#     values_to = "months_reported"
#   ) %>%
#   mutate(percent_reported = (months_reported / length(unique(data$PE))) * 100) %>% 
#   left_join(
#     data %>% 
#       select(OU, !!sym(ADM_2)) %>% 
#       distinct(),
#     by = "OU"
#   )

In [None]:
# Preview the first three rows of the aggregated data.
head(data, n = 3)

In [None]:
# Number of non-missing values per facility (OU) and data element. Only OU is
# needed as the facility key. Spreading to wide format makes the
# facility/element pairs that have no rows explicit, with a count of 0.
facility_coverage <- data %>%
  group_by(OU, DX_NAME) %>%
  summarise(N_VALUES = sum(!is.na(VALUE)), .groups = "drop") %>%
  pivot_wider(
    names_from = DX_NAME,
    values_from = N_VALUES,
    values_fill = 0
  )

# Back to long format for plotting, with the count expressed as a percentage
# of the distinct periods present in the data.
facility_long <- facility_coverage %>%
  pivot_longer(
    cols = -OU,
    names_to = "indicator",
    values_to = "months_reported"
  ) %>%
  mutate(percent_reported = months_reported / n_distinct(data$PE) * 100)

In [None]:
# Preview the first three rows of the per-facility coverage table.
head(facility_long, n = 3)

In [None]:
# Heatmap of reporting completeness: one column per facility, one row per data
# element, colour = share of periods reported.
options(repr.plot.width = 18, repr.plot.height = 10)

facility_long %>%
  ggplot(aes(x = OU, y = indicator, fill = percent_reported)) +
  geom_tile() +
  scale_fill_viridis_c(name = "% Rapporté", limits = c(0, 100)) +
  labs(
    title = "Complétude des rapports par formation sanitaire, sur toutes les périodes.",
    x = "Formation sanitaire",
    y = "Elément de données"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.y = element_text(size = 12),
    # Facility labels and ticks are hidden: there are too many to be legible.
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid = element_blank()
  )