# **Complétude et cohérence des indicateurs composites**


In [None]:
# Set SNT paths: workspace root plus the shared code and configuration folders
SNT_ROOT_PATH  <- "~/workspace"
CODE_PATH      <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH    <- file.path(SNT_ROOT_PATH, "configuration")

# Load util functions
# NOTE(review): install_and_load(), get_latest_dataset_file_in_memory() and
# log_msg() are not defined in this notebook; presumably they come from this file.
source(file.path(CODE_PATH, "snt_utils.r"))

# List required packages
required_packages <- c("dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate", "viridis", "patchwork", "zoo", "purrr", "arrow", "sf", "reticulate", "tidyverse")

# Install (if needed) and attach the packages
install_and_load(required_packages)

# Set environment to load openhexa.sdk from the right environment
Sys.setenv(PROJ_LIB = "/opt/conda/share/proj")
Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal")
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")

# Load openhexa.sdk (display the Python binary in use, then import the module).
# import() is namespace-qualified so it does not depend on reticulate being attached.
reticulate::py_config()$python
openhexa <- reticulate::import("openhexa.sdk")

# Load SNT config; abort the notebook with a readable message if it cannot be read
config_json <- tryCatch(
    {
        jsonlite::fromJSON(file.path(CONFIG_PATH, "SNT_config.json"))
    },
    error = function(e) {
        # Separate the fixed text from the underlying error details
        msg <- paste0("Error while loading configuration: ", conditionMessage(e))
        cat(msg)
        stop(msg)
    }
)

# Configuration variables read from the SNT config
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME
ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)

In [None]:
# Print the number of rows and columns of an object on the console.
#   df   : object whose dimensions are reported (typically a data frame)
#   name : label shown in the message; defaults to the expression passed as `df`
printdim <- function(df, name = deparse(substitute(df))) {
  label <- name
  dims <- dim(df)
  cat("Dimensions of", label, ":", dims[1], "rows x", dims[2], "columns\n\n")
}

# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators

In [None]:
# Load the formatted DHIS2 routine data. This input is mandatory: any loading
# failure stops the notebook with an explicit message.
routine_data <- tryCatch(
  get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_routine.parquet")),
  error = function(e) {
    stop(paste0("[DATA NOT FOUND] Error while loading DHIS2 Routine data for: " , COUNTRY_CODE, " the report cannot be executed. [ERROR DETAILS] ", conditionMessage(e)))
  }
)

# Population data is optional: on failure a warning is logged and the object is NULL
population_data <- tryCatch(
  get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_population.parquet")),
  error = function(e) {
    log_msg(
      paste0(COUNTRY_NAME , " Population data is not available in dataset : " , dataset_name, " last version."),
      "warning"
    )
    NULL
  }
)

# Shapes data is optional as well and follows the same fallback
shapes_data <- tryCatch(
  get_latest_dataset_file_in_memory(dataset_name, paste0(COUNTRY_CODE, "_shapes.geojson")),
  error = function(e) {
    log_msg(
      paste0(COUNTRY_NAME , " Shapes data is not available in dataset : " , dataset_name, " last version."),
      "warning"
    )
    NULL
  }
)

# Report the size of the routine table
printdim(routine_data)

In [None]:
# Preview the first rows of the routine data
head(routine_data)

## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur

In [None]:
# Steps 0-1: start from the routine data and derive DATE (first day of the
# month) from PERIOD
data <- routine_data %>%
  mutate(DATE = ymd(paste0(PERIOD, "01")))

# Step 2: every column that is not an identifier, a name or a time field is
# treated as an indicator and moved to long format (INDICATOR / VALUE)
indicator_vars <- setdiff(
  names(data),
  c(
    "OU_ID", "OU_NAME", "ADM1_ID", "ADM1_NAME", "ADM2_ID", "ADM2_NAME",
    "PERIOD", "YEAR", "MONTH", "DATE"
  )
)

long_data <- pivot_longer(
  data,
  cols = all_of(indicator_vars),
  names_to = "INDICATOR",
  values_to = "VALUE"
)
long_data <- rename(long_data, OU = OU_ID)

# Step 3: expected grid = every OU x every INDICATOR x every DATE seen in the data
full_grid <- expand_grid(
  OU = unique(long_data$OU),
  INDICATOR = unique(long_data$INDICATOR),
  DATE = unique(long_data$DATE)
)

# Step 4: attach the reported values to the grid and classify each cell as
# missing, zero or positive
reporting_check <- left_join(
  full_grid,
  select(long_data, OU, INDICATOR, DATE, VALUE),
  by = c("OU", "INDICATOR", "DATE")
) %>%
  mutate(
    is_missing = is.na(VALUE),
    is_zero = !is.na(VALUE) & VALUE == 0,
    is_positive = !is.na(VALUE) & VALUE > 0
  )

# Step 5: counts and percentages of each status per indicator and month
reporting_summary <- reporting_check %>%
  group_by(INDICATOR, DATE) %>%
  summarise(
    n_total = n_distinct(OU),
    n_missing = sum(is_missing),
    n_zero = sum(is_zero),
    n_positive = sum(is_positive),
    pct_missing = if (n_total > 0) 100 * n_missing / n_total else 0,
    pct_zero = if (n_total > 0) 100 * n_zero / n_total else 0,
    pct_positive = if (n_total > 0) 100 * n_positive / n_total else 0,
    .groups = "drop"
  )

# Step 6: one row per indicator x month x status with a display label, filling
# absent combinations with 0 so every bar has the three statuses
plot_data <- reporting_summary %>%
  pivot_longer(
    cols = starts_with("pct_"),
    names_to = "Status",
    values_to = "Percentage"
  ) %>%
  mutate(
    Status = recode(
      Status,
      pct_missing = "Valeur manquante",
      pct_zero = "Valeur nulle rapportée",
      pct_positive = "Valeur positive rapportée"
    )
  ) %>%
  complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))


In [None]:
# Notebook figure size for this plot
options(repr.plot.width = 17, repr.plot.height = 10)

# Stacked bars of reporting status (% of health facilities) per month,
# one facet per indicator
ggplot(plot_data, aes(x = DATE, y = Percentage, fill = Status)) +
  geom_col(position = "stack") +
  facet_wrap(~ INDICATOR, scales = "free_y", ncol = 4) +
  scale_fill_manual(
    values = c(
      "Valeur manquante" = "tomato",
      "Valeur nulle rapportée" = "skyblue",
      "Valeur positive rapportée" = "green"
    )
  ) +
  scale_y_continuous() +
  theme_minimal(base_size = 16) +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16),
    strip.text = element_text(size = 16),
    plot.title = element_text(face = "bold", size = 20)
  ) +
  labs(
    title = "Taux de rapportage par indicateur (niveau formation sanitaire)",
    subtitle = "Proportion des valeurs rapportées par mois et par indicateur",
    x = "Mois",
    y = "% des formations sanitaires",
    fill = "Statut du rapportage"
  )


## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur

In [None]:
# Steps 0-1: start again from the routine data and derive Date (first day of
# the month) from PERIOD
data <- routine_data %>%
  mutate(Date = ymd(paste0(PERIOD, "01")))

# Step 2: indicator columns = everything except identifiers, names and time fields
indicator_cols <- setdiff(
  names(data),
  c(
    "OU_ID", "OU_NAME", "ADM1_ID", "ADM1_NAME", "ADM2_ID", "ADM2_NAME",
    "PERIOD", "YEAR", "MONTH", "Date"
  )
)

# Step 3: long format (one row per facility x month x indicator), values as numeric
data_long <- data %>%
  select(ADM2_ID, OU_ID, Date, all_of(indicator_cols)) %>%
  pivot_longer(
    cols = all_of(indicator_cols),
    names_to = "Indicator",
    values_to = "value"
  ) %>%
  mutate(value = as.numeric(value))

# Step 4: expected grid = every ADM2 x every indicator x every month seen in the data
full_grid <- expand_grid(
  ADM2_ID = unique(data_long$ADM2_ID),
  Indicator = unique(data_long$Indicator),
  Date = unique(data_long$Date)
)

# Step 5: district-level status per indicator and month:
#   is_missing  - every facility value is NA
#   is_zero     - every non-NA facility value is 0 (also TRUE when all are NA;
#                 that case is excluded later with !is_missing)
#   is_positive - at least one facility value is > 0
reporting_check <- data_long %>%
  group_by(ADM2_ID, Indicator, Date) %>%
  summarise(
    is_missing = all(is.na(value)),
    is_zero = all(value == 0, na.rm = TRUE),
    is_positive = any(value > 0, na.rm = TRUE),
    .groups = "drop"
  )

# Step 6: grid cells without any row in the data count as missing
reporting_full <- left_join(
  full_grid,
  reporting_check,
  by = c("ADM2_ID", "Indicator", "Date")
) %>%
  mutate(
    is_missing = coalesce(is_missing, TRUE),
    is_zero = coalesce(is_zero, FALSE),
    is_positive = coalesce(is_positive, FALSE)
  )

# Step 7: counts and percentages of districts per status, indicator and month
reporting_summary <- reporting_full %>%
  group_by(Indicator, Date) %>%
  summarise(
    n_total = n_distinct(ADM2_ID),
    n_missing = sum(is_missing),
    n_zero = sum(!is_missing & is_zero),
    n_positive = sum(is_positive),
    pct_missing = if (n_total > 0) 100 * n_missing / n_total else 0,
    pct_zero = if (n_total > 0) 100 * n_zero / n_total else 0,
    pct_positive = if (n_total > 0) 100 * n_positive / n_total else 0,
    .groups = "drop"
  )

# Step 8: one row per indicator x month x status with a display label, filling
# absent combinations with 0
plot_data <- reporting_summary %>%
  pivot_longer(
    cols = starts_with("pct_"),
    names_to = "Status",
    values_to = "Percentage"
  ) %>%
  mutate(
    Status = recode(
      Status,
      pct_missing = "Valeur manquante",
      pct_zero = "Valeur nulle rapportée",
      pct_positive = "Valeur positive rapportée"
    )
  ) %>%
  complete(Indicator, Date, Status, fill = list(Percentage = 0))

# Step 9: Plot the district-level reporting status as stacked bars
# (% of districts per month), one facet per indicator
ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +
  geom_col(position = "stack") +
  facet_wrap(~ Indicator, scales = "free_y") +
  # Fix the visible range with coord_cartesian() instead of scale limits:
  # scale limits censor out-of-range values, so a stack whose total is
  # 100 + floating-point error would lose its top segment.
  coord_cartesian(ylim = c(0, 100)) +
  scale_fill_manual(values = c(
    "Valeur manquante" = "tomato",
    "Valeur nulle rapportée" = "skyblue",
    "Valeur positive rapportée" = "green"
  )) +
  labs(
    title = "Taux de rapportage par indicateur (niveau district)",
    subtitle = "Proportion des districts (ADM2_ID) rapportant chaque mois",
    x = "Mois", y = "% des districts",
    fill = "Statut du rapportage"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    strip.text = element_text(size = 14),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )


# 2. Cohérence interne des indicateurs composites

## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence

Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs.

In [None]:
# Flag outliers with the median +/- k * MAD rule.
#   data_long      : long-format table with columns OU, indicator, YEAR and value
#   deviation      : multiplier k applied to the MAD (default 15)
#   outlier_column : name of the logical flag column to create (default "mad_flag")
# Returns the ungrouped input with median_val, mad_val and the flag column added.
# Median and MAD are computed per OU x indicator x YEAR, ignoring NA values.
# When the MAD of a group is 0, any value different from the median is flagged.
detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = "mad_flag") {
  grouped <- group_by(data_long, OU, indicator, YEAR)
  flagged <- mutate(
    grouped,
    median_val = median(value, na.rm = TRUE),
    mad_val = mad(value, na.rm = TRUE),
    "{outlier_column}" := value < (median_val - deviation * mad_val) |
      value > (median_val + deviation * mad_val)
  )
  ungroup(flagged)
}

In [None]:
# Step 0: Core indicators checked for consistency
target_indicators <- c("SUSP", "TEST", "CONF", "MALTREAT", "PRES")

# Step 1: Long format restricted to the core indicators; PERIOD is stored as
# character and OU duplicates OU_ID so the join keys below are explicit
routine_long <- pivot_longer(
  routine_data,
  cols = all_of(target_indicators),
  names_to = "indicator",
  values_to = "value"
)
routine_long <- mutate(
  routine_long,
  PERIOD = as.character(PERIOD),
  OU = OU_ID
)

# Step 2: Keep only the indicators of interest
routine_long_filtered <- filter(routine_long, indicator %in% target_indicators)

# Step 3: First pass - flag values beyond median +/- 15 * MAD
mad15_data <- routine_long_filtered %>%
  detect_mad_outliers(deviation = 15, outlier_column = "mad15")

# Step 4: Second pass - on non-missing values not flagged by MAD15 (flag FALSE
# or NA), recompute the statistics and flag values beyond median +/- 10 * MAD
mad10_flags <- mad15_data %>%
  filter((is.na(mad15) | !mad15) & !is.na(value)) %>%
  detect_mad_outliers(deviation = 10, outlier_column = "mad10")

# Step 5: Bring the MAD10 flag back next to the MAD15 results
mad_combined <- left_join(
  mad15_data,
  select(mad10_flags, PERIOD, OU, indicator, mad10),
  by = c("PERIOD", "OU", "indicator")
)

In [None]:
# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE).
# outlier_keys holds PERIOD as text and is used for the join below;
# outlier_flags keeps the numeric PERIOD, OU and indicator columns.
outlier_keys <- mad_combined %>%
  filter(mad15 == TRUE | mad10 == TRUE) %>%
  transmute(PERIOD_KEY = as.character(PERIOD), OU, indicator)

outlier_flags <- outlier_keys %>%
  transmute(PERIOD = as.numeric(PERIOD_KEY), OU, indicator)

# Step 7: Reshape routine_data to long format for filtering
routine_long_all <- routine_data %>%
  pivot_longer(
    cols = all_of(target_indicators),
    names_to = "indicator",
    values_to = "value"
  ) %>%
  mutate(OU = OU_ID)

# Step 8: Remove outliers.
# The join uses a temporary text version of PERIOD on both sides, so it works
# whether routine_data stores PERIOD as integer, double or character; the
# PERIOD column itself keeps its original type.
routine_long_clean <- routine_long_all %>%
  mutate(PERIOD_KEY = as.character(PERIOD)) %>%
  anti_join(outlier_keys, by = c("PERIOD_KEY", "OU", "indicator")) %>%
  select(-PERIOD_KEY)

# Step 9: Reshape back to wide format (one column per indicator). A removed
# value becomes NA when other indicators remain for the same row.
routine_data_clean <- routine_long_clean %>%
  select(-OU) %>%
  pivot_wider(names_from = indicator, values_from = value)


## 2.2 Cohérence des indicateurs

In [None]:
# Step 1: Derive YEAR / MONTH from PERIOD and total each core indicator per
# district (ADM2_ID) and month, ignoring NA values
routine_hd_month <- routine_data_clean %>%
  mutate(
    YEAR = substr(PERIOD, 1, 4),
    MONTH = substr(PERIOD, 5, 6)
  ) %>%
  group_by(ADM2_ID, YEAR, MONTH) %>%
  summarise(
    across(
      all_of(c("SUSP", "TEST", "CONF", "MALTREAT", "PRES")),
      ~ sum(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  )

# Step 2: Pairwise scatter plots; the dashed red line is y = x
options(repr.plot.width = 14, repr.plot.height = 6)

# Suspected vs tested
p1 <- ggplot(routine_hd_month, aes(x = SUSP, y = TEST)) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal(base_size = 16) +
  labs(title = "Suspectés vs Testés", x = "Cas suspectés", y = "Cas testés")

# Tested vs confirmed
p2 <- ggplot(routine_hd_month, aes(x = TEST, y = CONF)) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal(base_size = 16) +
  labs(title = "Testés vs Confirmés", x = "Cas testés", y = "Cas confirmés")

# Confirmed vs treated
p3 <- ggplot(routine_hd_month, aes(x = CONF, y = MALTREAT)) +
  geom_point(color = "purple", alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal(base_size = 16) +
  labs(title = "Confirmés vs Traités", x = "Cas confirmés", y = "Cas traités")

# Step 3: Show the three panels side by side
(p1 | p2 | p3) + plot_layout(guides = "collect")


In [None]:
# Step 1: Totals per month over all rows of the cleaned data (NA ignored);
# DATE is the first day of each month, built from PERIOD
rds_clean_month <- routine_data_clean %>%
  mutate(
    YEAR = substr(PERIOD, 1, 4),
    MONTH = substr(PERIOD, 5, 6),
    DATE = as.Date(paste(YEAR, MONTH, "01", sep = "-"))
  ) %>%
  group_by(YEAR, MONTH, DATE) %>%
  summarise(
    across(
      all_of(c("SUSP", "TEST", "CONF", "PRES")),
      ~ sum(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  )

# Step 2: One line per indicator over time
options(repr.plot.width = 14, repr.plot.height = 6)
rds_clean_month %>%
  pivot_longer(
    cols = all_of(c("SUSP", "TEST", "CONF", "PRES")),
    names_to = "Indicator"
  ) %>%
  ggplot(aes(x = DATE, y = value, color = Indicator)) +
  geom_line(linewidth = 1.2) +
  theme_minimal(base_size = 16) +
  theme(
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 16),
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 16),
    plot.title = element_text(face = "bold", size = 20)
  ) +
  labs(
    title = "Tendances mensuelles nationales des indicateurs composites (après suppression des outliers)",
    x = "Mois",
    y = "Nombre de cas",
    color = "Indicateur"
  )


## 2.3 Carte des populations par district sanitaire (DS)

In [None]:
# Draw the population map only when both the population table and the district
# shapes are available (each is NULL when its loading failed).
# `&&` is the scalar, short-circuiting operator intended for `if` conditions.
if (!is.null(population_data) && !is.null(shapes_data)) {
    # Join population to spatial shapes on ADM2_ID
    # NOTE(review): if population_data holds several rows per ADM2_ID (e.g. one
    # per year), this join repeats each geometry - confirm against the dataset.
    map_data <- shapes_data %>%
      left_join(population_data, by = "ADM2_ID")
    
    # Plot population per district (DS); `linewidth` sets the border width
    # of the polygons
    ggplot(map_data) +
      geom_sf(aes(fill = POPULATION), color = "white", linewidth = 0.2) +
      scale_fill_viridis_c(option = "C", name = "Population") +
      labs(
        title = "Population totale par district sanitaire (DS)",
        subtitle = "Données DHIS2",
        caption = "Source: NMDR / DHIS2"
      ) +
      theme_minimal(base_size = 14)
} else {
    print("Population or shapes data not available.")
}