## **Extraction des données de routine**

In [None]:
## CONFIGURATION ##

In [None]:
# Workspace layout: code and configuration folders live under the SNT root
SNT_ROOT_PATH <- "~/workspace"
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")

# Shared helper functions
# NOTE(review): install_and_load() is presumably defined in snt_utils.r -- confirm
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages used by this notebook, installed/attached through the helper
required_packages <- c(
  "dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate",
  "viridis", "patchwork", "zoo", "scales", "purrr", "arrow",
  "sf", "reticulate", "knitr", "glue"
)
install_and_load(required_packages)

# Select the Python interpreter for reticulate, display the one in use,
# then import the OpenHEXA SDK module
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
openhexa <- import("openhexa.sdk")

In [None]:
# Load the SNT configuration (JSON). On failure the message is echoed to the
# cell output and the error is re-raised so that the notebook stops here.
config_json <- tryCatch(
  {
    jsonlite::fromJSON(file.path(CONFIG_PATH, "SNT_config.json"))
  },
  error = function(e) {
    # A separator is required here: paste0() would otherwise glue the prefix
    # directly onto the underlying error text.
    msg <- paste0("Error while loading configuration: ", conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)

# Values read from the configuration and used throughout the notebook
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS
indicator_defs <- config_json$DHIS2_DATA_DEFINITIONS$DHIS2_INDICATOR_DEFINITIONS
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
# Administrative column names are upper-cased before being matched against
# pyramid columns later on
ADM_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)
ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)
facility_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL

In [None]:
# Print the number of rows and columns of a data frame.
#   df   : object accepted by nrow() / ncol()
#   name : label shown in the message; defaults to the expression passed as `df`
# Called for its printed output (the result of cat() is returned invisibly).
printdim <- function(df, name = deparse(substitute(df))) {
  label <- name
  n_rows <- nrow(df)
  n_cols <- ncol(df)
  cat("Dimensions of", label, ":", n_rows, "rows x", n_cols, "columns\n\n")
}

In [None]:
# Load the raw DHIS2 extracts: analytics values and organisation-unit pyramid.
# A loading failure is echoed to the cell output and re-raised.
routine_data <- tryCatch(
  {
    get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_dhis2_raw_analytics.parquet"))
  },
  error = function(err) {
    msg <- paste("Error while loading DHIS2 analytics file for: ", COUNTRY_CODE, conditionMessage(err))
    cat(msg)
    stop(msg)
  }
)

pyramid_data <- tryCatch(
  {
    get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_dhis2_raw_pyramid.parquet"))
  },
  error = function(err) {
    msg <- paste("Error while loading DHIS2 organisation units data for: ", COUNTRY_CODE, conditionMessage(err))
    cat(msg)
    stop(msg)
  }
)

# Disabled: optional load of the reporting-rates extract
#reporting_data <- tryCatch({ get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_dhis2_raw_reporting.parquet")) }, 
#                  error = function(e) {
#                      msg <- paste("Error loading " , COUNTRY_CODE , " DHIS2 reporting rates data : " , 
#                                   paste0(COUNTRY_CODE, "_dhis2_raw_reporting.parquet"), " can not be loaded.")
#                      cat(msg)
#                      log_msg(msg, "warning")                      
#                      return(NULL)
#                      })

# Show the size of each loaded table
printdim(routine_data)
printdim(pyramid_data)
#printdim(reporting_data)

### 1. Liste des éléments de donnée extraits

In [None]:
# 1. Indicator categories and the data-element identifiers configured for each
category_elements <- map(indicator_defs, ~ .x)
category_names <- names(category_elements)

# 2. Distinct data elements (identifier + name) present in the extract
data_elements <- routine_data %>%
  select(DX, DX_NAME) %>%
  distinct()

# 3. Lookup table: one row per (data element, category) pair.
#    A definition may be written "DX" or "DX.CO" (the trends section of this
#    notebook parses both forms); the part after the first dot is dropped so
#    that "DX.CO" definitions still match the DX column.
classified_elements <- bind_rows(lapply(category_names, function(cat_name) {
  ids <- sub("\\..*$", "", as.character(unlist(category_elements[[cat_name]])))
  data_elements %>%
    filter(DX %in% ids) %>%
    mutate(Categorie = cat_name)
}))

# 4. Display, sorted by category then element name
classified_elements %>%
  arrange(Categorie, DX_NAME) %>%
  kable(
    caption = "Liste des √©l√©ments de donn√©es extraits, class√©s par indicateur",
    col.names = c("ID de l'√©l√©ment", "Nom de l'√©l√©ment de donn√©e", "Indicateur")
  )

In [None]:
# One row per data element, listing its distinct disaggregation names
# (sorted, "; "-separated); a dash is shown when the element has none.
disaggregations_table <- routine_data %>%
  distinct(DX, DX_NAME, CO_NAME) %>%
  group_by(DX, DX_NAME) %>%
  summarise(
    `D√©sagr√©gations` = paste(
      sort(unique(na.omit(CO_NAME))),
      collapse = "; "
    ),
    .groups = "drop"
  ) %>%
  mutate(
    `D√©sagr√©gations` = ifelse(
      `D√©sagr√©gations` == "",
      "‚Äî",
      `D√©sagr√©gations`
    )
  ) %>%
  arrange(DX_NAME)

# Show the table
disaggregations_table

### 2. Période de couverture des données

In [None]:
# First and last period (PE, formatted YYYYMM) and number of distinct periods
cat("Premier mois pour lequel les donn√©es ont √©t√© extraites :", min(routine_data$PE), "\n")
cat("Dernier mois pour lequel les donn√©es ont √©t√© extraites :", max(routine_data$PE), "\n")
cat("Nombre total de mois couverts par les donn√©es :", length(unique(routine_data$PE)), "\n")

# Expected monthly sequence between the first and last period
# (assumes monthly data between min and max)
all_months <- seq(ymd(paste0(min(routine_data$PE), "01")),
                  ymd(paste0(max(routine_data$PE), "01")),
                  by = "1 month") %>%
              format("%Y%m")

# Missing-month check: the expected sequence was previously built but never
# compared with the periods actually present, so gaps went unreported.
missing_months <- setdiff(all_months, as.character(unique(routine_data$PE)))
if (length(missing_months) > 0) {
  cat("Mois manquants entre le premier et le dernier mois :",
      paste(missing_months, collapse = ", "), "\n")
} else {
  cat("Aucun mois manquant entre le premier et le dernier mois.\n")
}

### 3. Résumé hiérarchique

In [None]:
# Column names holding the IDs of the configured ADM_1 / ADM_2 levels:
# kept as-is when already ending in "_ID", otherwise "_NAME" is swapped for "_ID"
adm1_id <- ifelse(str_ends(ADM_1, "_ID"), ADM_1, str_replace(ADM_1, "_NAME$", "_ID"))
adm2_id <- ifelse(str_ends(ADM_2, "_ID"), ADM_2, str_replace(ADM_2, "_NAME$", "_ID"))

# LEVEL_<n>_ID columns available in the pyramid, ordered by level number
level_id_cols <- grep("^LEVEL_\\d+_ID$", names(pyramid_data), value = TRUE)
level_order <- as.integer(str_match(level_id_cols, "^LEVEL_(\\d+)_ID$")[, 2])
level_id_cols <- level_id_cols[order(level_order)]

# One row per level: number of distinct non-missing IDs, plus a role label
# (ADM_1, ADM_2 or facility level) where applicable
level_summary <- tibble(Column = level_id_cols) %>%
  mutate(
    Level = as.integer(str_match(Column, "^LEVEL_(\\d+)_ID$")[, 2]),
    `Nombre d'unit√©s` = map_int(
      Column,
      \(col) n_distinct(pyramid_data[[col]], na.rm = TRUE)
    )
  ) %>%
  arrange(Level) %>%
  mutate(
    R√¥le = case_when(
      Column == adm1_id ~ "ADM_1 (administration 1)",
      Column == adm2_id ~ "ADM_2 (administration 2)",
      Level  == facility_level ~ glue("Niveau des FOSA (L{facility_level})"),
      TRUE ~ ""
    )
  )

# Formatted table
level_summary %>%
  transmute(
    Niveau = paste0("L", Level),
    Column,
    `Nombre d'unit√©s`,
    R√¥le
  ) %>%
  kable(caption = "R√©sum√© hi√©rarchique: nombre d‚Äôunit√©s (IDs) uniques par niveau (pyramid_data)")

# Explanatory note on the column mapping
cat(glue(
  "\nNote : ADM_1 est mapp√© sur `{ADM_1}` ‚Üí `{adm1_id}`, ADM_2 sur `{ADM_2}` ‚Üí `{adm2_id}`. ",
  "Le niveau op√©rationnel des formations sanitaires est L{facility_level}.\n"
))

### 4. Nombre et activité des formations sanitaires

In [None]:
# Total number of unique health facilities, taken from the pyramid column that
# holds the IDs of the configured facility level.
facility_id_col <- paste0("LEVEL_", facility_level, "_ID")
if (!facility_id_col %in% names(pyramid_data)) {
  stop("Column ", facility_id_col, " not found in pyramid_data", call. = FALSE)
}

# Missing IDs are excluded, consistently with the per-level summary which also
# counts distinct IDs with na.rm = TRUE; counting NA as a facility would
# inflate the total by one whenever a row has no ID at this level.
total_facilities <- n_distinct(pyramid_data[[facility_id_col]], na.rm = TRUE)

cat(glue::glue(
  "Les √©tablissements sont identifi√©s de mani√®re unique par leur identifiant d‚Äôunit√© organisationnelle issu de la pyramide, ",
  "c‚Äôest-√†-dire le niveau {facility_level} de la hi√©rarchie sanitaire. ",
  "Au total, {total_facilities} formations sanitaires uniques ont √©t√© identifi√©es √† ce niveau."
))

In [None]:
# Activity check: a facility (OU) counts as "active" for a period when it has
# at least one non-missing value (zeros included) in that period.
activity <- routine_data %>%
  group_by(OU, PE) %>%
  summarise(active = any(!is.na(VALUE)), .groups = "drop")

# Facilities that were active in at least one period
active_facilities <- activity %>%
  filter(active) %>%
  distinct(OU) %>%
  nrow()

# Share of pyramid facilities that reported at least once
proportion_active <- 100 * active_facilities / total_facilities

# Period bounds of the routine extract, used in the summary sentence
period_start <- min(routine_data$PE)
period_end <- max(routine_data$PE)

cat(glue(
  "Sur un total de {total_facilities} formations sanitaires uniques identifi√©es dans la pyramide, ",
  "{active_facilities} ont rapport√© au moins une donn√©e sur un √©l√©ment au cours de la p√©riode sp√©cifi√©e ",
  "dans les donn√©es de routine ({period_start}‚Äì{period_end}), ",
  "soit {round(proportion_active, 1)} % d‚Äô√©tablissements ayant effectivement transmis des donn√©es."
))

In [None]:
# Calendar years covered by the routine extract (first four characters of PE)
yrs_rout <- routine_data$PE %>%
  substr(1, 4) %>%
  as.integer() %>%
  unique() %>%
  sort()
years <- seq(min(yrs_rout, na.rm = TRUE), max(yrs_rout, na.rm = TRUE), by = 1)

# Count pyramid rows considered open during year `y`: opened on or before
# 31 December of `y` and either never closed or closed on/after 1 January of `y`.
# Returns a one-row table with columns Annee and Ouvertes_pyramide.
open_in_year <- function(df, y) {
  y <- as.integer(y)
  first_day <- as.Date(sprintf("%s-01-01", y))
  last_day <- as.Date(sprintf("%s-12-31", y))
  open_rows <- filter(
    df,
    as.Date(OPENING_DATE) <= last_day,
    is.na(CLOSED_DATE) | as.Date(CLOSED_DATE) >= first_day
  )
  summarise(open_rows, Annee = y, Ouvertes_pyramide = n(), .groups = "drop")
}

open_per_year <- years %>%
  map(\(yr) open_in_year(pyramid_data, yr)) %>%
  bind_rows() %>%
  mutate(Annee = as.integer(Annee))

# Per year: number of facilities with at least one non-missing value
reported_per_year <- routine_data %>%
  mutate(Annee = as.integer(substr(PE, 1, 4))) %>%
  filter(Annee %in% years, !is.na(OU)) %>%
  group_by(Annee, OU) %>%
  summarise(any_value = any(!is.na(VALUE)), .groups = "drop") %>%
  group_by(Annee) %>%
  summarise(Ayant_rapporte_routine = sum(any_value, na.rm = TRUE), .groups = "drop")

# Side-by-side comparison; years without any reporting facility get 0
reconciliation <- left_join(open_per_year, reported_per_year, by = "Annee") %>%
  mutate(
    Ayant_rapporte_routine = tidyr::replace_na(Ayant_rapporte_routine, 0L),
    `Pct_rapportant_(%)` = dplyr::if_else(
      Ouvertes_pyramide > 0,
      round(100 * Ayant_rapporte_routine / Ouvertes_pyramide, 1),
      NA_real_
    )
  ) %>%
  arrange(Annee)

# NOTE(review): the text below says "not closed before 31/12" whereas the
# filter keeps facilities closed on/after 1 January of the year -- confirm
# which definition is intended.
cat(glue(
  "L‚Äôactivit√© structurelle des formations sanitaires est √©valu√©e via les dates d‚Äôouverture/fermeture de la pyramide. ",
  "Une formation est consid√©r√©e ouverte pour une ann√©e si elle a √©t√© inaugur√©e avant/pendant cette ann√©e ",
  "et non ferm√©e avant le 31/12. Le tableau pr√©sente, pour chaque ann√©e disponible dans l‚Äôextraction routine, ",
  "le nombre de formations ouvertes et celles ayant rapport√© au moins une valeur."
))

kable(
  reconciliation,
  caption = "Ouverture (pyramide) vs. rapportage effectif (routine), par ann√©e"
)


In [None]:
# Coerce VALUE to numeric; entries that cannot be converted become NA
# (conversion warnings are silenced)
routine_data <- routine_data %>%
  mutate(VALUE = suppressWarnings(as.numeric(VALUE)))

# One row per facility x period: did the facility report any non-missing value?
per_ou_pe <- routine_data %>%
  group_by(OU, PE) %>%
  summarise(any_value = any(!is.na(VALUE)), .groups = "drop") %>%
  mutate(year = substr(PE, 1, 4))

# ----- Fixed universes, built from routine data only -----
# A) Facilities that reported at least once over the whole period
active_ou_all <- per_ou_pe %>%
  group_by(OU) %>%
  summarise(active_ever = any(any_value), .groups = "drop") %>%
  filter(active_ever) %>%
  pull(OU)

denom_all <- length(active_ou_all)

# B) Per year: number of facilities that reported at least once in that year
active_by_year <- per_ou_pe %>%
  filter(any_value) %>%
  group_by(year) %>%
  summarise(denom_year = n_distinct(OU), .groups = "drop")

# ----- Monthly reporting against each universe -----
# A) Denominator = facilities active over the whole period
monthly_reporting_all <- per_ou_pe %>%
  filter(OU %in% active_ou_all) %>%
  group_by(PE) %>%
  summarise(
    n_reporting   = sum(any_value),
    denom         = denom_all,
    pct_reporting = 100 * n_reporting / denom,
    .groups = "drop"
  ) %>%
  arrange(PE)

# B) Denominator = facilities active within the same year
monthly_reporting_by_year <- per_ou_pe %>%
  group_by(year, PE) %>%
  summarise(n_reporting = sum(any_value), .groups = "drop") %>%
  left_join(active_by_year, by = "year") %>%
  mutate(pct_reporting = 100 * n_reporting / denom_year) %>%
  arrange(PE) %>%
  group_by(year) %>%
  mutate(denom_line = first(denom_year)) %>%  # constant per year, drawn as a reference line
  ungroup() %>%
  mutate(PE = factor(PE, levels = sort(unique(PE))))  # preserve chronological order on the axis

In [None]:
# Number of facilities reporting each month, one panel per year, with the
# yearly count of active facilities as a dashed reference line.
# `denom_line` and the ordered `PE` factor are already part of
# monthly_reporting_by_year when it is built, so they are not recomputed here
# (the former recomputation was an exact duplicate of those steps).
options(repr.plot.width = 13, repr.plot.height = 8)
ggplot(monthly_reporting_by_year, aes(x = PE)) +
  geom_line(aes(y = n_reporting, color = "Formations rapportant", group = 1), linewidth = 1) +
  geom_point(aes(y = n_reporting, color = "Formations rapportant"), size = 1.2) +
  geom_line(aes(y = denom_line, color = "Total actif dans l'ann√©e", group = 1),
            linewidth = 1, linetype = "dashed") +
  facet_wrap(~ year, scales = "free_x") +
  scale_color_manual(values = c(
    "Formations rapportant"     = "steelblue",
    "Total actif dans l'ann√©e"  = "grey40"
  )) +
  labs(
    title = "√âvolution du nombre de formations sanitaires rapportant des donn√©es",
    subtitle = "Ligne pointill√©e : total des formations sanitaires qui ont d√©clar√© au moins une fois un √©l√©ment de donn√©e au cours de l'ann√©e",
    x = NULL, y = "Nombre de formations sanitaires", color = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


In [None]:
# Monthly share of facilities reporting at least one value, one panel per year
# (denominator: facilities active within that year)
options(repr.plot.width = 13, repr.plot.height = 8)
ggplot(
  monthly_reporting_by_year,
  aes(x = PE, y = pct_reporting)
) +
  geom_col(fill = "darkgreen", alpha = 0.8) +
  scale_y_continuous(limits = c(0, 100)) +
  facet_wrap(~ year, scales = "free_x") +
  labs(
    title = "Proportion de formations sanitaires ayant rapport√© au moins une valeur",
    subtitle = "Par mois, avec d√©nominateur fix√© √† l'ann√©e de r√©f√©rence",
    x = NULL,
    y = "% des formations sanitaires"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )


In [None]:
if (config_json$SNT_CONFIG$COUNTRY_CODE == "NER") {

  # Classify a facility name into a type from its leading keyword
  # (Niger-specific naming). The first matching rule wins, so "CSI" is
  # tested before "CS".
  norm_fosa_type <- function(x) {
    label <- str_to_upper(str_squish(x))
    case_when(
      str_detect(label, "^HD\\b") ~ "HD (h√¥pital de district)",
      str_detect(label, "^CSI\\b") ~ "CSI (centre de sant√© int√©gr√©)",
      str_detect(label, "^CS\\b") ~ "CS (case de sant√©)",
      str_detect(label, "^(SS\\b|SALLE\\b|SALLE D'ACCOUCHEMENT\\b)") ~ "SS / Salle (soins/maternit√©)",
      str_detect(label, "^(CLINIQUE|POLYCLINIQUE)\\b") ~ "Clinique (priv√©)",
      str_detect(label, "^CABINET\\b") ~ "Cabinet (priv√©)",
      str_detect(label, "^(INFIRMERIE|INFIRM)\\b") ~ "Infirmerie (priv√©)",
      str_detect(label, "^CNSS\\b") ~ "CNSS",
      TRUE ~ "Autre"
    )
  }

  # Number of pyramid rows per facility type, most frequent first
  fosa_counts <- pyramid_data %>%
    mutate(fosa_type = norm_fosa_type(LEVEL_6_NAME)) %>%
    count(fosa_type, sort = TRUE)

  # Grand total across types, appended as a final "Total" row
  total_l6 <- sum(fosa_counts$n)
  fosa_counts <- add_row(fosa_counts, fosa_type = "Total", n = total_l6)

  # Summary table (kept as the last expression of the branch)
  knitr::kable(fosa_counts, caption = "R√©partition des formations sanitaires par type (niveau 6)")

} else {
  cat("Cette section n'est pas applicable : la structure pyramidale diff√®re pour ce pays.")
}

### 5. Complétude de l'extraction des données de routine au niveau des formations sanitaires

Cette section présente la distribution des valeurs extraites pour chaque élément de donnée du SNIS, mois par mois, au niveau des formations sanitaires incluses dans la pyramide sanitaire.

Pour chaque élément, trois situations sont distinguées :
- Valeur positive rapportée : au moins une valeur supérieure à zéro a été déclarée
- Valeur zéro rapportée : uniquement des valeurs égales à zéro ont été enregistrées
- Valeur manquante : aucune donnée n’a été rapportée pour le mois considéré

Le nombre total de formations sanitaires reste constant, correspondant à celles ayant transmis au moins une donnée sur la période d’analyse.

Les graphiques ci-dessous illustrent, pour chaque indicateur SNIS, la proportion relative de ces trois types de valeurs au fil du temps, permettant d’évaluer la complétude et la cohérence des données extraites avant tout traitement analytique.

In [None]:
# Set the jupyter.plot_mimetypes option so that only "image/png" is listed
options(jupyter.plot_mimetypes = c("image/png"))

In [None]:
# --- STEP 1: collapse disaggregations -- one VALUE per facility (OU), period
# (PE) and data element (DX_NAME), obtained by summing over all other rows of
# the group.
# The sum ignores missing components: a group is NA only when none of its rows
# has a usable value. A plain sum() would mark the whole facility-month as
# missing as soon as one disaggregation is NA, even though other values were
# reported, which contradicts the positive / zero / missing rules described
# for this section.
routine_data <- routine_data %>%
  group_by(OU, PE, DX_NAME) %>%  # DX_NAME == INDICATOR
  summarise(
    VALUE = if (any(!is.na(as.numeric(VALUE)))) {
      sum(as.numeric(VALUE), na.rm = TRUE)
    } else {
      NA_real_
    },
    .groups = "drop"
  ) %>%
  mutate(INDICATOR = DX_NAME)

In [None]:
# --- STEP 2: expected grid -- every combination of facility (OU), indicator
# and period (PE) observed anywhere in the aggregated data
full_grid <- with(
  routine_data,
  expand_grid(
    OU = unique(OU),
    INDICATOR = unique(INDICATOR),
    PE = unique(PE)
  )
)

In [None]:
# --- STEP 3: attach the observed values to the expected grid and flag each
# cell as missing (no value), zero, or positive
reporting_check <- left_join(
  full_grid,
  select(routine_data, OU, INDICATOR, PE, VALUE),
  by = c("OU", "INDICATOR", "PE")
) %>%
  mutate(
    is_missing = is.na(VALUE),
    is_zero = !is.na(VALUE) & VALUE == 0,
    is_positive = !is.na(VALUE) & VALUE > 0
  )

In [None]:
# --- STEP 4: per indicator and period, count facilities in each status and
# express the counts as percentages of the facilities in the grid
reporting_summary <- reporting_check %>%
  group_by(INDICATOR, PE) %>%
  summarise(
    n_total = n_distinct(OU),
    n_missing = sum(is_missing),
    n_zero = sum(is_zero),
    n_positive = sum(is_positive),
    # percentages default to 0 when a group has no facility
    pct_missing = if (n_total > 0) 100 * n_missing / n_total else 0,
    pct_zero = if (n_total > 0) 100 * n_zero / n_total else 0,
    pct_positive = if (n_total > 0) 100 * n_positive / n_total else 0,
    .groups = "drop"
  )

In [None]:
# --- STEP 5: long format for the stacked bars -- one row per indicator,
# period and status, with display labels for the three statuses
plot_data <- reporting_summary %>%
  pivot_longer(
    cols = starts_with("pct_"),
    names_to = "Status",
    values_to = "Percentage"
  ) %>%
  mutate(
    Status = case_when(
      Status == "pct_missing" ~ "Valeur manquante",
      Status == "pct_zero" ~ "Valeur 0 rapport√©e",
      Status == "pct_positive" ~ "Valeur positive rapport√©e",
      TRUE ~ Status
    )
  ) %>%
  # make sure every indicator x period x status combination has a row
  complete(INDICATOR, PE, Status, fill = list(Percentage = 0))

In [None]:
# Attach the indicator category to each data element. A data-element name can
# map to several categories, hence the explicit many-to-many relationship.
plot_data <- left_join(
  plot_data,
  distinct(classified_elements, DX_NAME, Categorie),
  by = c("INDICATOR" = "DX_NAME"),
  relationship = "many-to-many"
)

In [None]:
# Categories present in the plot data (rows without a category are ignored)
categories <- unique(plot_data$Categorie[!is.na(plot_data$Categorie)])

# One stacked-bar figure per category, with one panel per data element
plots_by_category <- map(categories, function(category_label) {
  category_rows <- filter(plot_data, Categorie == category_label)

  ggplot(category_rows, aes(x = PE, y = Percentage, fill = Status)) +
    geom_col(position = "stack") +
    # thin white guides at 25 / 50 / 75 %
    geom_hline(yintercept = c(25, 50, 75), color = "white", linewidth = 0.25) +
    facet_wrap(~ INDICATOR, scales = "free_y", nrow = 1) +
    scale_fill_manual(
      values = c(
        "Valeur manquante" = "tomato",
        "Valeur 0 rapport√©e" = "skyblue",
        "Valeur positive rapport√©e" = "green"
      )
    ) +
    labs(
      title = paste("Distribution des valeurs extraites - Indicateur :", category_label),
      subtitle = "Proportion de formations sanitaires ayant rapport√© des valeurs manquantes, nulles ou positives par mois",
      x = NULL,
      y = "% des formations sanitaires",
      fill = "Type de valeur extraite"
    ) +
    theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(face = "bold", size = 16),
      strip.text = element_text(size = 10),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 10),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
    )
})


In [None]:
# Print every per-category plot (not only the first one), after setting the
# plot size options to 15 x 5
options(repr.plot.width = 15, repr.plot.height = 5)
walk(plots_by_category, print)

### 6. Disponibilité des données par formation sanitaire (sur la période analysée)

Cette section évalue la disponibilité des données de routine pour chaque formation sanitaire sur l’ensemble de la période analysée.

- Pour chaque élément de donnée, le graphique montre le pourcentage de mois avec au moins une valeur non manquante (c’est-à-dire une donnée rapportée, qu’elle soit nulle ou positive).
- Chaque colonne correspond à une formation sanitaire, et chaque ligne à un élément de donnée.
- Les couleurs vont du jaune (100 %), indiquant une disponibilité complète, au violet (0 %), indiquant une absence totale de données sur la période.

Ce diagnostic permet d’identifier les formations sanitaires avec des problèmes chroniques de rapportage ou des interruptions prolongées dans la saisie des données.

In [None]:
# Number of distinct periods in the analysis window
n_months <- n_distinct(routine_data$PE)

# --- 1) Coverage per facility x data element ----------------------------------
# First reduce to one row per OU x DX_NAME x PE (has any non-missing value?),
# then count the periods with data and express them as a share of the window.
facility_cov <- routine_data %>%
  group_by(OU, DX_NAME, PE) %>%
  summarise(has_value = any(!is.na(VALUE)), .groups = "drop") %>%
  group_by(OU, DX_NAME) %>%
  summarise(
    months_reported = sum(has_value),
    pct_reported = 100 * months_reported / n_months,
    .groups = "drop"
  )

# Facilities ordered from most to least complete (mean over data elements)
ou_order <- facility_cov %>%
  group_by(OU) %>%
  summarise(pct_overall = mean(pct_reported, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(pct_overall)) %>%
  pull(OU)

# Data elements ordered by name
ind_order <- facility_cov %>%
  distinct(DX_NAME) %>%
  arrange(DX_NAME) %>%
  pull(DX_NAME)

# Apply both orderings through factor levels
plot_df <- mutate(
  facility_cov,
  OU = factor(OU, levels = ou_order),
  DX_NAME = factor(DX_NAME, levels = ind_order)
)

# --- 2) Heatmap: facilities on the x axis, data elements on the y axis --------
# Larger canvas to keep the figure readable
options(repr.plot.width = 15, repr.plot.height = 9)

ggplot(
  plot_df,
  aes(x = OU, y = DX_NAME, fill = pct_reported)
) +
  geom_tile() +
  scale_fill_viridis_c(name = "% rapport√©", limits = c(0, 100)) +
  labs(
    title = "Disponibilit√© des donn√©es par formation sanitaire (sur la p√©riode analys√©e)",
    subtitle = paste0(
      "Pour chaque √©l√©ment, % de mois avec une valeur non manquante ‚Ä¢ Fen√™tre: ",
      n_months,
      " mois"
    ),
    x = "Formation sanitaire",
    y = "√âl√©ment de donn√©es"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.x = element_blank(),  # too many facilities to label
    axis.ticks.x = element_blank(),
    axis.text.y = element_text(size = 11),
    panel.grid = element_blank(),
    legend.position = "right"
  )

### 7. Tendances nationales et mensuelles par élément de données

Cette section présente l’évolution temporelle des valeurs mensuelles totales pour chaque indicateur de paludisme au cours de la période analysée. Les courbes montrent la somme des valeurs rapportées à travers toutes les formations sanitaires et toutes les désagrégations.
- Chaque graphique correspond à un indicateur agrégé (par exemple, cas confirmés, cas présumés, décès, etc.).
- L’axe horizontal représente le temps (mois), et l’axe vertical le total des valeurs rapportées pour l’ensemble du pays.

Ces tendances permettent de visualiser les fluctuations saisonnières et d’identifier d’éventuelles anomalies ou ruptures dans la dynamique des cas rapportés.

In [None]:
# Reload the raw analytics extract: routine_data was aggregated in place by the
# completeness section, and this section needs the original DX / CO rows again.
routine_data <- tryCatch(
  {
    get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_dhis2_raw_analytics.parquet"))
  },
  error = function(err) {
    msg <- paste("Error while loading DHIS2 analytics file for: ", COUNTRY_CODE, conditionMessage(err))
    cat(msg)
    stop(msg)
  }
)

In [None]:
# Blank out extreme outliers per data element (DX): a value is set to NA when
# its distance to the element's median exceeds 50 times the element's median
# absolute deviation (MAD, unscaled: constant = 1).
#
# The rule is applied only when the MAD is strictly positive. When more than
# half of an element's values are identical (typically zeros, e.g. for rare
# events) the MAD is 0; dividing by "MAD + 1e-9" then gave every value that
# differs from the median an enormous score, so all of them were erased.
# With a zero MAD no value is treated as an outlier.
routine_data <- routine_data %>%
  mutate(VALUE = suppressWarnings(as.numeric(VALUE))) %>%  # ensure numeric
  group_by(DX) %>%
  mutate(
    med = median(VALUE, na.rm = TRUE),
    mad_val = mad(VALUE, constant = 1, na.rm = TRUE),
    # never NA: missing values and groups without a usable MAD are not outliers
    is_outlier = !is.na(VALUE) & !is.na(mad_val) & mad_val > 0 &
      abs(VALUE - med) / mad_val > 50,
    VALUE = if_else(is_outlier, NA_real_, VALUE)
  ) %>%
  ungroup() %>%
  select(-med, -mad_val, -is_outlier)


In [None]:
### 1. Mapping table: one row per configured definition, split into the data
###    element (text before the first ".") and the optional category-option
###    combo (text after the first "."; NA when absent)
indicator_map <- imap_dfr(indicator_defs, function(definitions, category_name) {
  tibble(raw = definitions) %>%
    transmute(
      Categorie = category_name,
      DX = str_extract(raw, "^[^\\.]+"),
      CO = str_extract(raw, "(?<=\\.).+")
    )
})

### 2. Routine data with numeric values and a date for the first day of each
###    period; rows without a valid period or data element are dropped
rd_clean <- routine_data %>%
  mutate(
    VALUE = suppressWarnings(as.numeric(VALUE)),
    month = ymd(paste0(PE, "01"))
  ) %>%
  filter(!is.na(month), !is.na(DX))

### 3. Attach categories, handling "DX only" and "DX + CO" definitions separately

# (a) Definitions without CO: every disaggregation of the DX is included
map_dx_only <- indicator_map %>%
  filter(is.na(CO)) %>%
  distinct(Categorie, DX)

joined_dx_only <- inner_join(
  rd_clean,
  map_dx_only,
  by = "DX",
  relationship = "many-to-many"  # intentional: a DX may belong to several categories
)

# (b) Definitions with CO: only the exact DX + CO pair is included
map_dx_co <- indicator_map %>%
  filter(!is.na(CO)) %>%
  distinct(Categorie, DX, CO)

joined_dx_co <- inner_join(
  rd_clean,
  map_dx_co,
  by = c("DX", "CO"),
  relationship = "many-to-many"  # intentional, as above
)

# (c) Both sets stacked
rd_cat <- bind_rows(joined_dx_only, joined_dx_co)

# Attach DX_NAME when the joined data does not already carry it
if (!"DX_NAME" %in% names(rd_cat)) {
  dx_names <- distinct(routine_data, DX, DX_NAME)
  rd_cat <- left_join(rd_cat, dx_names, by = "DX")
}

### 4. Monthly totals per category and data element (missing values ignored)
monthly_by_dx <- rd_cat %>%
  group_by(Categorie, DX_NAME, month) %>%
  summarise(total = sum(VALUE, na.rm = TRUE), .groups = "drop")

### 5. Fill gaps: each (Categorie, DX_NAME) series gets every month between
###    its own first and last month, with a total of 0 for the added months
monthly_by_dx_complete <- monthly_by_dx %>%
  group_by(Categorie, DX_NAME) %>%
  complete(
    month = seq(
      min(month, na.rm = TRUE),
      max(month, na.rm = TRUE),
      by = "1 month"
    ),
    fill = list(total = 0)
  ) %>%
  ungroup()

### 6. One line chart per category, one coloured series per data element

# Common x-axis range: first and last period of the routine data
min_month <- min(ymd(paste0(routine_data$PE, "01")), na.rm = TRUE)
max_month <- max(ymd(paste0(routine_data$PE, "01")), na.rm = TRUE)

plots_by_cat <- split(monthly_by_dx_complete, monthly_by_dx_complete$Categorie) %>%
  imap(function(series, category_label) {
    ggplot(series, aes(x = month, y = total, color = DX_NAME, group = DX_NAME)) +
      geom_line(linewidth = 1, alpha = 0.9) +
      geom_point(size = 1.8, alpha = 0.9) +
      # same x-axis for all categories, one tick per year
      scale_x_date(
        limits = c(min_month, max_month),
        date_breaks = "1 year",
        date_labels = "%Y"
      ) +
      # automatic y-axis with abbreviated labels
      scale_y_continuous(
        labels = scales::label_number(scale_cut = scales::cut_short_scale())
      ) +
      labs(
        title = paste0(category_label, " ‚Äî s√©ries mensuelles par √©l√©ment de donn√©e"),
        subtitle = "Somme des valeurs mensuelles (toutes UO et d√©sagr√©gations confondues)",
        x = "Temps",
        y = "Total national rapport√©",
        color = "√âl√©ment de donn√©e"
      ) +
      guides(color = guide_legend(nrow = 2, byrow = TRUE)) +
      theme_minimal(base_size = 14) +
      theme(
        plot.title = element_text(face = "bold", size = 20),
        plot.subtitle = element_text(size = 13, margin = margin(b = 10)),
        axis.title.x = element_text(size = 14, face = "bold", margin = margin(t = 10)),
        axis.title.y = element_text(size = 14, face = "bold", margin = margin(r = 10)),
        axis.text.x = element_text(size = 11, angle = 45, hjust = 1, vjust = 1),
        axis.text.y = element_text(size = 11),
        panel.grid.minor = element_blank(),
        legend.position = "bottom",
        legend.box = "vertical",
        legend.title = element_text(size = 13, face = "bold"),
        legend.text = element_text(size = 12),
        legend.key.width = unit(1.2, "lines"),
        legend.key.height = unit(0.8, "lines"),
        legend.spacing.y = unit(2, "pt")
      )
  })

In [None]:
# Set the canvas size, then print each category plot in turn
options(repr.plot.width = 12, repr.plot.height = 6)

purrr::walk(plots_by_cat, print)