In [None]:
# 💡 Comments / Questions & To Do's:
# - Filter by YEAR (keep only 2022-2024):
#     1. Why these years? Arbitrary choice? Based on what? Linked to what?
#     2. Is this a parameter in some other pipeline? If so, it should be integrated here somehow
# - Missing data: why do we have NA values for population? Are these real NA (missing data) or 0?
# - OUTLIERS: there are clear outliers (e.g., DS AGADEZ): shall we do some simple data cleaning here?
# - Population categories (breaks): do we have a specific scale in mind
#   (i.e., use same as another country) or can I set it based on the data?

In [None]:
# TO DO / FINISH:
# - add safety "if" logic so nb does not fail if data is missing or wrong path ...
#   - (maybe) also add meaningful messages
# - Add code to export PNG files of relevant figures
# - Set dynamic boundaries for POPULATION categories? (so can use same code in different countries)
# - Clean code to avoid redundancies (especially ggplot stuff, a lot of copy pasted ...)

## 0. Paths and Config

In [None]:
# Root folders of the SNT workspace
SNT_ROOT_PATH <- "~/workspace"
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")

# Folder holding this reporting notebook and its outputs
REPORTING_NB_PATH <- file.path(SNT_ROOT_PATH, "pipelines", "snt_dhis2_formatting", "reporting")

# Make sure the figures folder exists before anything tries to write into it
# (done before loading the utils)
figures_dir <- file.path(REPORTING_NB_PATH, "outputs", "figures")
if (!dir.exists(figures_dir)) {
  dir.create(path = figures_dir, recursive = TRUE)
  print(paste0("Created figures directory: ", figures_dir))
}

In [None]:
# Load util functions
source(file.path(CODE_PATH, "snt_utils.r"))

In [None]:
required_packages <- c(
    "tidyverse", 
    "arrow", 
    "sf", 
    "reticulate",
    "patchwork"
) 

# Execute function
install_and_load(required_packages)

In [None]:
# Set environment to load openhexa.sdk from the right environment
Sys.setenv(PROJ_LIB = "/opt/conda/share/proj")
Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal")
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")

# Load openhexa.sdk
reticulate::py_config()$python
openhexa <- import("openhexa.sdk")

In [None]:
# Load the SNT configuration file (SNT_config.json) from CONFIG_PATH.
# The notebook cannot run without it, so any read/parse failure is reported
# and then re-raised as an error.
config_json <- tryCatch(
  jsonlite::fromJSON(file.path(CONFIG_PATH, "SNT_config.json")),
  error = function(e) {
    # Separate the prefix from the underlying message so the output is readable
    msg <- paste0("Error while loading configuration: ", conditionMessage(e))
    cat(msg, "\n", sep = "")
    stop(msg, call. = FALSE)
  }
)

In [None]:
# Configuration variables read from the loaded SNT_config.json
# Identifier of the formatted DHIS2 dataset (used below to fetch the input files)
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
# Country code: used as file-name prefix and to switch on country-specific code
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
# Country name: used in log messages
COUNTRY_NAME <- config_json$SNT_CONFIG$COUNTRY_NAME
# Upper-cased value of the DHIS2_ADMINISTRATION_2 setting
# NOTE(review): not referenced in the visible part of this notebook - confirm it is still needed
ADM_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)

In [None]:
# Print the dimensions (rows x columns) of a table-like object.
#
# Args:
#   df:   Object to describe, typically a data frame. Objects without
#         dimensions (e.g. NULL returned by a failed optional data load)
#         are reported as such instead of printing blank counts.
#   name: Label used in the message; defaults to the expression passed as `df`.
#
# Returns:
#   NULL, invisibly. Called for its printed side effect.
printdim <- function(df, name = deparse(substitute(df))) {
  if (is.null(dim(df))) {
    cat("Dimensions of", name, ": not available (object has no dimensions)\n\n")
    return(invisible(NULL))
  }
  cat("Dimensions of", name, ":", nrow(df), "rows x", ncol(df), "columns\n\n")
}

## 1. Import data

In [None]:
# Import the formatted DHIS2 routine data.
# This input is mandatory: if it cannot be loaded the error is re-raised with
# a descriptive message and the notebook stops.
routine_data <- tryCatch(
  get_latest_dataset_file_in_memory(
    dataset_name,
    paste0(COUNTRY_CODE, "_routine.parquet")
  ),
  error = function(e) {
    stop(paste0(
      "[WARNING] Error while loading DHIS2 Routine data for: ", COUNTRY_CODE,
      " the report cannot be executed. [ERROR DETAILS] ", conditionMessage(e)
    ))
  }
)

printdim(routine_data)

In [None]:
# Import the population table (optional input).
# On failure a warning is logged and population_data is set to NULL, so that
# later cells can test for availability.
population_data <- tryCatch(
  get_latest_dataset_file_in_memory(
    dataset_name,
    paste0(COUNTRY_CODE, "_population.parquet")
  ),
  error = function(e) {
    log_msg(
      paste0(COUNTRY_NAME, " Population data is not available in dataset : ", dataset_name, " last version."),
      "warning"
    )
    # The handler's value becomes the value of tryCatch()
    NULL
  }
)

printdim(population_data)

In [None]:
# Import the district shapes (optional input).
# On failure a warning is logged and shapes_data is set to NULL, so that
# later cells can test for availability.
shapes_data <- tryCatch(
  get_latest_dataset_file_in_memory(
    dataset_name,
    paste0(COUNTRY_CODE, "_shapes.geojson")
  ),
  error = function(e) {
    log_msg(
      paste0(COUNTRY_NAME, " Shapes data is not available in dataset : ", dataset_name, " last version."),
      "warning"
    )
    # The handler's value becomes the value of tryCatch()
    NULL
  }
)

printdim(shapes_data)

# **Complétude des indicateurs composites**


# 1. Complétude du rapportage des indicateurs composites / Reporting Completeness of Composite Indicators

In [None]:
# Preview the first rows of the routine data (shown as the cell output)
head(routine_data)

## 1.1 Proportion de formations sanitaires ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur

In [None]:
# --- Facility-level reporting status per indicator and month ---

# Steps 0-1: work on a copy of the routine data and derive a monthly DATE
# (PERIOD is pasted with "01" and parsed as year-month-day)
data <- routine_data |>
  mutate(DATE = ymd(paste0(PERIOD, "01")))

# Step 2: every column that is not an identifier/time column is treated as an
# indicator and stacked into INDICATOR / VALUE pairs
indicator_vars <- setdiff(
  names(data),
  c(
    "PERIOD", "YEAR", "MONTH", "OU_ID", "OU_NAME",
    "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "DATE"
  )
)

long_data <- pivot_longer(
  data,
  cols = all_of(indicator_vars),
  names_to = "INDICATOR",
  values_to = "VALUE"
) |>
  rename(OU = OU_ID)

# Step 3: expected grid with every OU x INDICATOR x DATE combination
full_grid <- expand_grid(
  OU = unique(long_data$OU),
  INDICATOR = unique(long_data$INDICATOR),
  DATE = unique(long_data$DATE)
)

# Step 4: attach reported values to the grid and classify each cell;
# combinations absent from the data end up with VALUE = NA (missing)
reporting_check <- left_join(
  full_grid,
  select(long_data, OU, INDICATOR, DATE, VALUE),
  by = c("OU", "INDICATOR", "DATE")
) |>
  mutate(
    is_missing = is.na(VALUE),
    is_zero = !is_missing & VALUE == 0,
    is_positive = !is_missing & VALUE > 0
  )

# Step 5: counts and percentages per indicator and month
reporting_summary <- reporting_check |>
  group_by(INDICATOR, DATE) |>
  summarise(
    n_total = n_distinct(OU),
    n_missing = sum(is_missing),
    n_zero = sum(is_zero),
    n_positive = sum(is_positive),
    pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),
    pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),
    pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),
    .groups = "drop"
  )

# Step 6: one row per indicator x month x status, with display labels;
# complete() adds any absent combination with a percentage of 0
plot_data <- reporting_summary |>
  pivot_longer(
    cols = starts_with("pct_"),
    names_to = "Status",
    values_to = "Percentage"
  ) |>
  mutate(
    Status = recode(
      Status,
      pct_missing = "Valeur manquante",
      pct_zero = "Valeur nulle rapport√©e",
      pct_positive = "Valeur positive rapport√©e"
    )
  ) |>
  complete(INDICATOR, DATE, Status, fill = list(Percentage = 0))


In [None]:
# Stacked bars of reporting status per month, one panel per indicator
# (facility level). Figure size is set for the notebook output.
options(repr.plot.width = 17, repr.plot.height = 10)

ggplot(data = plot_data, mapping = aes(x = DATE, y = Percentage, fill = Status)) +
  geom_col(position = "stack") +
  scale_fill_manual(
    values = c(
      "Valeur manquante" = "tomato",
      "Valeur nulle rapport√©e" = "skyblue",
      "Valeur positive rapport√©e" = "green"
    )
  ) +
  scale_y_continuous() +
  facet_wrap(vars(INDICATOR), scales = "free_y", ncol = 4) +
  labs(
    title = "Taux de rapportage par indicateur (niveau formation sanitaire)",
    subtitle = "Proportion des valeurs rapport√©es par mois et par indicateur",
    x = "Mois",
    y = "% des formations sanitaires",
    fill = "Statut du rapportage"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold", size = 20),
    strip.text = element_text(size = 16),
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 16)
  )


## 1.2 Proportion des districts ayant rapporté des valeurs nulles, manquantes (NULL) ou positives pour chaque indicateur.

In [None]:
# --- District-level (ADM2) reporting status per indicator and month ---
# A district x indicator x month is classified as:
#   - missing : every facility value is NA (or the district has no rows)
#   - zero    : at least one value reported and all reported values are 0
#   - positive: at least one reported value is > 0

# Step 0: Rename for convenience
data <- routine_data

# Step 1: Convert PERIOD to a proper Date (first day of the month)
data <- data %>%
  mutate(Date = ymd(paste0(PERIOD, "01")))

# Step 2: Identify indicator columns (everything that is not an id/time column)
indicator_cols <- setdiff(names(data), c(
  "PERIOD", "YEAR", "MONTH", "OU_ID", "OU_NAME",
  "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", "Date"
))

# Step 3: Reshape to long format
data_long <- data %>%
  select(ADM2_ID, OU_ID, Date, all_of(indicator_cols)) %>%
  pivot_longer(cols = all_of(indicator_cols),
               names_to = "Indicator", values_to = "value") %>%
  mutate(value = as.numeric(value))

# Step 4: Full expected grid at ADM2 level (ADM2 x Indicator x Date)
full_grid <- expand_grid(
  ADM2_ID = unique(data_long$ADM2_ID),
  Indicator = unique(data_long$Indicator),
  Date = unique(data_long$Date)
)

# Step 5: Classify each district x indicator x date from its facility values.
# Note: all(..., na.rm = TRUE) is TRUE when every value is NA, so is_zero must
# be combined with !is_missing before counting (done in Step 7).
reporting_check <- data_long %>%
  group_by(ADM2_ID, Indicator, Date) %>%
  summarise(
    is_missing = all(is.na(value)),
    is_zero = all(value == 0, na.rm = TRUE),
    is_positive = any(value > 0, na.rm = TRUE),
    .groups = "drop"
  )

# Step 6: Join with full grid; combinations without any row count as missing
reporting_full <- full_grid %>%
  left_join(reporting_check, by = c("ADM2_ID", "Indicator", "Date")) %>%
  mutate(
    is_missing = replace_na(is_missing, TRUE),
    is_zero = replace_na(is_zero, FALSE),
    is_positive = replace_na(is_positive, FALSE)
  )

# Step 7: Summarise by Indicator and Date
reporting_summary <- reporting_full %>%
  group_by(Indicator, Date) %>%
  summarise(
    n_total = n_distinct(ADM2_ID),
    n_missing = sum(is_missing),
    n_zero = sum(is_zero & !is_missing),
    n_positive = sum(is_positive),
    pct_missing = ifelse(n_total > 0, 100 * n_missing / n_total, 0),
    pct_zero = ifelse(n_total > 0, 100 * n_zero / n_total, 0),
    pct_positive = ifelse(n_total > 0, 100 * n_positive / n_total, 0),
    .groups = "drop"
  )

# Step 8: Reshape for plotting (status labels must match the fill palette below)
plot_data <- reporting_summary %>%
  pivot_longer(cols = starts_with("pct_"),
               names_to = "Status", values_to = "Percentage") %>%
  mutate(Status = recode(Status,
                         pct_missing = "Valeur manquante",
                         pct_zero = "Valeur nulle rapportée",
                         pct_positive = "Valeur positive rapportée")) %>%
  complete(Indicator, Date, Status, fill = list(Percentage = 0))

# Step 9: Plot
# The 0-100 range is set with coord_cartesian() rather than scale limits:
# scale limits censor any stacked segment whose cumulative top exceeds 100,
# which can happen through floating-point rounding of the three percentages
# and would silently drop that bar segment.
ggplot(plot_data, aes(x = Date, y = Percentage, fill = Status)) +
  geom_col(position = "stack") +
  facet_wrap(~ Indicator, scales = "free_y") +
  coord_cartesian(ylim = c(0, 100)) +
  scale_fill_manual(values = c(
    "Valeur manquante" = "tomato",
    "Valeur nulle rapportée" = "skyblue",
    "Valeur positive rapportée" = "green"
  )) +
  labs(
    title = "Taux de rapportage par indicateur (niveau district)",
    subtitle = "Proportion des districts (ADM2_ID) rapportant chaque mois",
    x = "Mois", y = "% des districts",
    fill = "Statut du rapportage"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    strip.text = element_text(size = 14),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )


# 2. Cohérence interne des indicateurs composites

## 2.1 Filtrage préliminaire des valeurs aberrantes pour l’analyse de cohérence

Avant d’évaluer la cohérence entre les indicateurs composites, nous éliminons d’abord les valeurs aberrantes les plus extrêmes. Cette étape ne modifie pas définitivement le jeu de données et ne vise pas à détecter toutes les valeurs aberrantes ; elle permet simplement d’exclure les cas extrêmes afin de faciliter une évaluation plus fiable de la cohérence entre les indicateurs.

In [None]:
# Flag outliers with the median absolute deviation (MAD) rule.
#
# Within each OU x indicator x YEAR group, a value is flagged when it lies
# more than `deviation` MADs above or below the group median.
#
# Args:
#   data_long:      Long table with columns OU, indicator, YEAR and value.
#   deviation:      Number of MADs away from the median that marks an outlier.
#   outlier_column: Name of the logical flag column to create.
#
# Returns:
#   The input (ungrouped) with added columns median_val, mad_val and the flag.
detect_mad_outliers <- function(data_long, deviation = 15, outlier_column = "mad_flag") {
  by_series <- group_by(data_long, OU, indicator, YEAR)

  flagged <- mutate(
    by_series,
    median_val = median(value, na.rm = TRUE),
    mad_val = mad(value, na.rm = TRUE),
    "{outlier_column}" := value > (median_val + deviation * mad_val) |
      value < (median_val - deviation * mad_val)
  )

  ungroup(flagged)
}

In [None]:
# --- Two-pass MAD outlier flagging on the core indicators ---

# Step 0: core indicators assessed for outliers
target_indicators <- c("SUSP", "TEST", "CONF", "MALTREAT", "PRES")

# Step 1: stack the indicator columns into indicator / value pairs;
# PERIOD is stored as character and OU is an alias of OU_ID (join keys below)
routine_long <- pivot_longer(
  routine_data,
  cols = all_of(target_indicators),
  names_to = "indicator",
  values_to = "value"
) |>
  mutate(
    PERIOD = as.character(PERIOD),
    OU = OU_ID
  )

# Step 2: keep only the indicators of interest
routine_long_filtered <- filter(routine_long, indicator %in% target_indicators)

# Step 3: first pass, 15 MADs
mad15_data <- detect_mad_outliers(
  routine_long_filtered,
  deviation = 15,
  outlier_column = "mad15"
)

# Step 4: second pass, 10 MADs, on non-missing values not flagged by pass one
# (rows whose mad15 flag is NA are kept as well)
mad10_flags <- mad15_data |>
  filter(is.na(mad15) | !mad15, !is.na(value)) |>
  detect_mad_outliers(deviation = 10, outlier_column = "mad10")

# Step 5: bring the mad10 flag back next to the mad15 flag
mad_combined <- left_join(
  mad15_data,
  select(mad10_flags, PERIOD, OU, indicator, mad10),
  by = c("PERIOD", "OU", "indicator")
)

In [None]:
# --- Remove the flagged outliers from the routine data ---

# Step 6: Identify outliers (MAD15 or MAD10 flagged as TRUE).
# Rows where both flags are NA/FALSE are dropped by filter().
outlier_flags <- mad_combined %>%
  filter(mad15 == TRUE | mad10 == TRUE) %>%
  mutate(PERIOD = as.numeric(PERIOD)) %>%
  select(PERIOD, OU, indicator)

# Step 7: Reshape routine_data to long format for filtering
routine_long_all <- routine_data %>%
  pivot_longer(
    cols = all_of(target_indicators),
    names_to = "indicator",
    values_to = "value"
  ) %>%
  mutate(OU = OU_ID)

# Step 8: Remove outliers.
# The join uses a temporary numeric copy of PERIOD so that it works whatever
# type PERIOD has in routine_data (numeric, integer or character); joining a
# character PERIOD directly against the numeric key would fail with
# incompatible types. The original PERIOD column is left untouched.
routine_long_clean <- routine_long_all %>%
  mutate(.period_key = as.numeric(as.character(PERIOD))) %>%
  anti_join(
    outlier_flags,
    by = c(".period_key" = "PERIOD", "OU", "indicator")
  ) %>%
  select(-.period_key)

# Step 9: Reshape back to wide format (removed values become NA)
routine_data_clean <- routine_long_clean %>%
  select(-OU) %>%
  pivot_wider(names_from = indicator, values_from = value)


## 2.2 Cohérence des indicateurs

In [None]:
# --- Pairwise consistency of indicators, district x month totals ---

# Step 1: derive YEAR / MONTH from PERIOD and sum each indicator per district-month
routine_hd_month <- routine_data_clean |>
  mutate(
    YEAR = substr(PERIOD, 1, 4),
    MONTH = substr(PERIOD, 5, 6)
  ) |>
  group_by(ADM2_ID, YEAR, MONTH) |>
  summarise(
    across(c(SUSP, TEST, CONF, MALTREAT, PRES), \(x) sum(x, na.rm = TRUE)),
    .groups = "drop"
  )

# Step 2: scatter plots; the dashed red line is the identity line (y = x)
options(repr.plot.width = 14, repr.plot.height = 6)

p1 <- ggplot(data = routine_hd_month, mapping = aes(x = SUSP, y = TEST)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_point(alpha = 0.5, color = "blue") +
  theme_minimal(base_size = 16) +
  labs(title = "Suspect√©s vs Test√©s", x = "Cas suspect√©s", y = "Cas test√©s")

p2 <- ggplot(data = routine_hd_month, mapping = aes(x = TEST, y = CONF)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_point(alpha = 0.5, color = "darkgreen") +
  theme_minimal(base_size = 16) +
  labs(title = "Test√©s vs Confirm√©s", x = "Cas test√©s", y = "Cas confirm√©s")

p3 <- ggplot(data = routine_hd_month, mapping = aes(x = CONF, y = MALTREAT)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  geom_point(alpha = 0.5, color = "purple") +
  theme_minimal(base_size = 16) +
  labs(title = "Confirm√©s vs Trait√©s", x = "Cas confirm√©s", y = "Cas trait√©s")

# Step 3: show the three panels side by side
(p1 | p2 | p3) + plot_layout(guides = "collect")


In [None]:
# --- National monthly totals of the composite indicators ---

# Step 1: derive YEAR / MONTH / DATE from PERIOD and sum over all rows per month
rds_clean_month <- routine_data_clean |>
  mutate(
    YEAR = substr(PERIOD, 1, 4),
    MONTH = substr(PERIOD, 5, 6),
    DATE = as.Date(paste(YEAR, MONTH, "01", sep = "-"))
  ) |>
  group_by(YEAR, MONTH, DATE) |>
  summarise(
    across(c(SUSP, TEST, CONF, PRES), \(x) sum(x, na.rm = TRUE)),
    .groups = "drop"
  )

# Step 2: one line per indicator over time
options(repr.plot.width = 14, repr.plot.height = 6)

rds_clean_month |>
  pivot_longer(cols = c(SUSP, TEST, CONF, PRES), names_to = "Indicator") |>
  ggplot(mapping = aes(x = DATE, y = value, color = Indicator)) +
  geom_line(linewidth = 1.2) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold", size = 20),
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 16),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 16)
  ) +
  labs(
    title = "Tendances mensuelles nationales des indicateurs composites (apr√®s suppression des outliers)",
    x = "Mois",
    y = "Nombre de cas",
    color = "Indicateur"
  )


# 3. Carte des populations par district sanitaire (DS)

## 3.1. Carte de la Population pour ADM2 

In [None]:
# Code from previous version of the notebook
# Uses a continuous colour scale for population

# Run only if both population and shapes data were loaded
# (scalar condition, hence the short-circuit && operator)
if (!is.null(population_data) && !is.null(shapes_data)) {
    # Join population to spatial shapes
    # NOTE(review): if population_data holds several years per district, each
    # district polygon is drawn once per year on top of itself - confirm
    # whether a single year should be selected here.
    map_data <- shapes_data %>%
      left_join(population_data, by = "ADM2_ID")

    # Plot population per district (DS); `linewidth` sets the border width,
    # as in the other maps of this notebook
    plot <- ggplot(map_data) +
      geom_sf(aes(fill = POPULATION), color = "white", linewidth = 0.2) +
      scale_fill_viridis_c(option = "C", name = "Population") +
      labs(
        title = "Population totale par district sanitaire (DS)",
        subtitle = "Données DHIS2",
        caption = "Source: NMDR / DHIS2"
      ) +
      theme_minimal(base_size = 14)

    print(plot)

} else {
    print("Population or shapes data not available.")
}


## ⚠️ 3.2. Carte de la Population Désagrégée (spécifique au pays)
Le code suivant est spécifique à chaque pays et repose sur une population désagrégée. 

### 🇳🇪 NER specific code 
Made ad hoc to allow comparison with data from other or previous analyses. Namely:
* only years 2022 to 2024
* specific palette (yellowish to brick red)
* specific intervals
* looks at **disaggregated** population <- this is sometimes country-specific!

In [None]:
# Default: use the population table as loaded; the country-specific code
# below may narrow it down.
population_data_filtered <- population_data
if (COUNTRY_CODE == "NER") {
    print("🇳🇪 Executing NER specific code ... ")

    # --- Filter data to keep only 2022-2024 ---
    # Skipped when the population data could not be loaded (NULL), so the
    # cell does not fail on missing input.
    years_to_keep <- 2022:2024
    if (!is.null(population_data)) {
        population_data_filtered <- population_data |> filter(YEAR %in% years_to_keep)
    }

    # --- Read data from SNT_metadata.json ---
    metadata_json <- tryCatch(
        jsonlite::fromJSON(file.path(CONFIG_PATH, "SNT_metadata.json")),
        error = function(e) {
            # Separate the prefix from the underlying message for readability
            msg <- paste0("Error while loading metadata: ", conditionMessage(e))
            cat(msg, "\n", sep = "")
            stop(msg, call. = FALSE)
        }
    )

    # --- Assign population breaks from metadata ---
    # Each SCALE entry is itself parsed with fromJSON()
    value_breaks_tot <- jsonlite::fromJSON(metadata_json$POPULATION_TOTAL$SCALE)
    value_breaks_u5 <- jsonlite::fromJSON(metadata_json$POPULATION_U5$SCALE)
    value_breaks_fe <- jsonlite::fromJSON(metadata_json$POPULATION_PREGNANT$SCALE)

    # --- Build category labels (in thousands, "k") from a vector of breaks ---
    # For k breaks this returns k + 1 labels:
    #   "< b1k", "b1 - b2k", ..., "> bkk"
    # which matches the k + 1 intervals of cut(x, c(0, breaks, Inf)).
    create_dynamic_labels <- function(breaks) {
        fmt <- function(x) {
            format(x / 1000, big.mark = "'", scientific = FALSE, trim = TRUE)
        }

        n_breaks <- length(breaks)

        # With a single break there is no middle interval; pasting the empty
        # vectors would otherwise produce a spurious " - k" label
        middle <- if (n_breaks > 1) {
            paste0(fmt(breaks[-n_breaks]), " - ", fmt(breaks[-1]), "k")
        } else {
            character(0)
        }

        c(
            paste0("< ", fmt(breaks[1]), "k"),          # First label
            middle,                                     # Middle labels
            paste0("> ", fmt(breaks[n_breaks]), "k")    # Last label
        )
    }

    # --- Create dynamic labels based on breaks ---
    labels_tot <- create_dynamic_labels(value_breaks_tot)
    labels_u5 <- create_dynamic_labels(value_breaks_u5)
    labels_fe <- create_dynamic_labels(value_breaks_fe)

}

In [None]:
# Five-step colour ramp for the population categories. The placeholder names
# "1".."5" are replaced by the descriptive category labels before plotting.
NER_palette_population <- setNames(
  c("#fae6db", "#f1b195", "#ea7354", "#cc3f32", "#972620"),
  as.character(seq_len(5))
)


### Population Totales

In [None]:
# NER only: choropleth of total population per district, one panel per year,
# using the categories defined by value_breaks_tot / labels_tot.
if (COUNTRY_CODE == "NER") {

  # Skip (with a message) instead of failing when an input is missing,
  # in line with the guards used by the population quality cells.
  if (is.null(population_data_filtered) || is.null(shapes_data) ||
      !("POPULATION" %in% names(population_data_filtered))) {

    print("Population or shapes data not available: total population map skipped.")

  } else {

    # IMPORTANT: the palette vector MUST be renamed with the (dynamic) descriptive labels
    names(NER_palette_population) <- labels_tot

    plot <- population_data_filtered %>%
      mutate(
        CATEGORY_POPULATION = cut(
          POPULATION,
          breaks = c(0, value_breaks_tot, Inf),
          labels = labels_tot,
          right = TRUE,
          include.lowest = TRUE
        )
      ) %>%
      left_join(shapes_data,
                by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>%
      ggplot() +
      geom_sf(aes(geometry = geometry,
                  fill = CATEGORY_POPULATION),
              color = "black",
              linewidth = 0.25,
              show.legend = TRUE
      ) +
      labs(
        title = "Population totale par district sanitaire (DS)",
        subtitle = "Source: NMDR / DHIS2"
      ) +
      scale_fill_manual(
        values = NER_palette_population,
        limits = labels_tot,
        drop = FALSE # Prevents dropping empty levels from legend
      ) +
      facet_wrap(~YEAR, ncol = 3) +
      theme_void() +
      theme(
        plot.title = element_text(face = "bold"),
        plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),
        legend.position = "bottom",
        legend.title = element_blank(),
        strip.text = element_text(face = "bold"),
        legend.key.height = unit(0.5, "line"),
        legend.margin = margin(10, 0, 0, 0)
      )

    print(plot)

    # Export in high resolution; the plot is passed explicitly rather than
    # relying on the implicit "last plot"
    ggsave(
      filename = file.path(REPORTING_NB_PATH, "outputs", "figures", paste0(COUNTRY_CODE, "_choropleth_population_totals.png")),
      plot = plot,
      width = 14,
      height = 8,
      dpi = 300
    )
  }
}


### Population Femmes Enceintes (FE)

In [None]:
# NER only: choropleth of the pregnant-women population (POPULATION_FE) per
# district, one panel per year, using value_breaks_fe / labels_fe.
if (COUNTRY_CODE == "NER") {

  # Skip (with a message) instead of failing when an input is missing,
  # in line with the guards used by the population quality cells.
  if (is.null(population_data_filtered) || is.null(shapes_data) ||
      !("POPULATION_FE" %in% names(population_data_filtered))) {

    print("POPULATION_FE or shapes data not available: pregnant women population map skipped.")

  } else {

    # The palette vector must carry the descriptive labels as names
    names(NER_palette_population) <- labels_fe

    plot <- population_data_filtered %>%
      mutate(
        CATEGORY_POPULATION = cut(
          POPULATION_FE,
          breaks = c(0, value_breaks_fe, Inf),
          labels = labels_fe,
          right = TRUE,
          include.lowest = TRUE
        )
      ) %>%
      left_join(shapes_data,
                by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>%
      ggplot() +
      geom_sf(aes(geometry = geometry,
                  fill = CATEGORY_POPULATION),
              color = "black",
              linewidth = 0.25,
              show.legend = TRUE
      ) +
      labs(
        title = "Population des femmes enceintes par district sanitaire (DS)",
        subtitle = "Source: NMDR / DHIS2"
      ) +
      scale_fill_manual(
        values = NER_palette_population,
        limits = labels_fe,
        drop = FALSE # Prevents dropping empty levels from legend
      ) +
      facet_wrap(~YEAR, ncol = 3) +
      theme_void() +
      theme(
        plot.title = element_text(face = "bold"),
        plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),
        legend.position = "bottom",
        legend.title = element_blank(),
        strip.text = element_text(face = "bold"),
        legend.key.height = unit(0.5, "line"),
        legend.margin = margin(10, 0, 0, 0)
      )

    print(plot)

    # Export in high resolution; the plot is passed explicitly rather than
    # relying on the implicit "last plot"
    ggsave(
      filename = file.path(REPORTING_NB_PATH, "outputs", "figures", paste0(COUNTRY_CODE, "_choropleth_population_fe.png")),
      plot = plot,
      width = 14,
      height = 8,
      dpi = 300
    )
  }

}


### Population Enfants moins de 5 ans (U5)

In [None]:
# NER only: choropleth of the under-5 population (POPULATION_U5) per
# district, one panel per year, using value_breaks_u5 / labels_u5.
if (COUNTRY_CODE == "NER") {

  # Skip (with a message) instead of failing when an input is missing,
  # in line with the guards used by the population quality cells.
  if (is.null(population_data_filtered) || is.null(shapes_data) ||
      !("POPULATION_U5" %in% names(population_data_filtered))) {

    print("POPULATION_U5 or shapes data not available: under-5 population map skipped.")

  } else {

    # The palette vector must carry the descriptive labels as names
    names(NER_palette_population) <- labels_u5

    plot <- population_data_filtered %>%
      mutate(
        CATEGORY_POPULATION = cut(
          POPULATION_U5,
          breaks = c(0, value_breaks_u5, Inf),
          labels = labels_u5,
          right = TRUE,
          include.lowest = TRUE
        )
      ) %>%
      left_join(shapes_data,
                by = join_by(ADM1_NAME, ADM1_ID, ADM2_NAME, ADM2_ID)) %>%
      ggplot() +
      geom_sf(aes(geometry = geometry,
                  fill = CATEGORY_POPULATION),
              color = "black",
              linewidth = 0.25,
              show.legend = TRUE
      ) +
      labs(
        title = "Population des enfants de moins de 5 ans par district sanitaire (DS)",
        subtitle = "Source: NMDR / DHIS2"
      ) +
      scale_fill_manual(
        values = NER_palette_population,
        limits = labels_u5,
        drop = FALSE # Prevents dropping empty levels from legend
      ) +
      facet_wrap(~YEAR, ncol = 3) +
      theme_void() +
      theme(
        plot.title = element_text(face = "bold"),
        plot.subtitle = element_text(margin = margin(5, 0, 20, 0)),
        legend.position = "bottom",
        legend.title = element_blank(),
        strip.text = element_text(face = "bold"),
        legend.key.height = unit(0.5, "line"),
        legend.margin = margin(10, 0, 0, 0)
      )

    print(plot)

    # Export PNG; the plot is passed explicitly rather than relying on the
    # implicit "last plot"
    ggsave(
      filename = file.path(REPORTING_NB_PATH, "outputs", "figures", paste0(COUNTRY_CODE, "_choropleth_population_u5.png")),
      plot = plot,
      width = 14,
      height = 8,
      dpi = 300
    )
  }

}

## 3.3. <b>Complétude et qualité</b> des données de la Population

#### Population Totale

In [None]:
# Histogram of total population (filtered table).
# Guarded like the FE / U5 histograms so the cell is skipped, rather than
# failing, when the population data or the column is missing.
# hist(population_data$POPULATION)
if ("POPULATION" %in% names(population_data_filtered)) {
    hist(population_data_filtered$POPULATION)
}

In [None]:
# Dot plot of total population per district (one point per year), districts
# ordered by population and grouped by ADM1 in facet rows; also exported as PNG.
# Guarded so the cell is skipped when the population data or column is missing.
if ("POPULATION" %in% names(population_data_filtered)) {

  plot <- ggplot(population_data_filtered) +
    geom_point(aes(x = POPULATION,
                   y = fct_reorder(ADM2_NAME, POPULATION),
                   color = factor(YEAR))
               ) +
    # `scales` spelled out in full (no reliance on partial argument matching)
    facet_grid(rows = "ADM1_NAME",
               scales = "free_y",
               space = "free_y",
               switch = "y") +
    scale_x_continuous(breaks = c(0, 2e+05, 4e+05, 6e+05, 8e+05, 1e+06, 1.5e+06),
                       labels = scales::comma) +
    scale_color_viridis_d(option = "mako", end = 0.8) +
    labs(color = "Année") +
    theme_minimal() +
    theme(
      axis.text = element_text(size = 7),
      axis.title.x = element_text(size = 7),
      axis.title.y = element_blank(),
      strip.placement = "outside",
      panel.grid.minor.x = element_blank(),
      legend.position = "bottom"
    )

  # Inside an if-block the plot is no longer the cell's visible value,
  # so it is printed explicitly
  print(plot)

  # Export PNG (plot passed explicitly rather than relying on the last plot)
  ggsave(
    filename = file.path(REPORTING_NB_PATH, "outputs", "figures", "hist_population_totale.png"),
    plot = plot,
    units = "cm",
    width = 15,
    height = 23,
    bg = "white"
  )
}

#### Population Femmes Enceintes (FE)

In [None]:
# Histogram of the pregnant-women population; skipped when the
# POPULATION_FE column is absent from the filtered table
if (is.element("POPULATION_FE", names(population_data_filtered))) {
  hist(population_data_filtered$POPULATION_FE)
}

In [None]:
# Dot plot of the pregnant-women population per district (one point per
# year), districts ordered by population and grouped by ADM1 in facet rows.
# Skipped when the POPULATION_FE column is absent.
if ("POPULATION_FE" %in% names(population_data_filtered)) {

ggplot(population_data_filtered) +
  geom_point(aes(x = POPULATION_FE,
                 y = fct_reorder(ADM2_NAME, POPULATION_FE),
                 color = factor(YEAR))
             ) +
  # `scales` spelled out in full (no reliance on partial argument matching)
  facet_grid(rows = "ADM1_NAME",
             scales = "free_y",
             space = "free_y",
             switch = "y") +
  # Evenly scaled breaks (same grid as the U5 plot). The previous vector jumped
  # from 6e+04 to 8e+05 / 1e+06 / 1.5e+06, left over from the total-population plot.
  scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),
                     labels = scales::comma) +
  scale_color_viridis_d(option = "mako", end = 0.8) +
  labs(
      # title = ""
      color = "Année") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 7),
    axis.title.x = element_text(size = 7),
    axis.title.y = element_blank(),
    strip.placement = "outside",
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  )

}

#### Population Enfants moins de 5 ans (U5)

In [None]:
# Histogram of the under-5 population; skipped when the POPULATION_U5
# column is absent from the filtered table
if (is.element("POPULATION_U5", names(population_data_filtered))) {
  hist(population_data_filtered$POPULATION_U5)
}

In [None]:
# Dot plot of the under-5 population per district (one point per year),
# districts ordered by population and grouped by ADM1 in facet rows.
# Skipped when the POPULATION_U5 column is absent.
if ("POPULATION_U5" %in% names(population_data_filtered)) {

ggplot(population_data_filtered) +
  geom_point(aes(x = POPULATION_U5,
                 # .na_rm = FALSE: NA values are not removed before the
                 # summary used for ordering is computed
                 y = fct_reorder(ADM2_NAME, POPULATION_U5, .na_rm = FALSE),
                 color = factor(YEAR))
             ) +
  # `scales` spelled out in full (no reliance on partial argument matching)
  facet_grid(rows = "ADM1_NAME",
             scales = "free_y",
             space = "free_y",
             switch = "y") +
  scale_x_continuous(breaks = c(0, 2e+04, 4e+04, 6e+04, 8e+04, 1e+05, 1.5e+05),
                     labels = scales::comma) +
  scale_color_viridis_d(option = "mako", end = 0.8) +
  labs(
      # title = ""
      color = "Année") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 7),
    axis.title.x = element_text(size = 7),
    axis.title.y = element_blank(),
    strip.placement = "outside",
    panel.grid.minor.x = element_blank(),
    legend.position = "bottom"
  )

}