# Estimations de l’incidence brute et ajustée

## 1. Setup

In [None]:
# Set SNT paths (all derived from the workspace root)
SNT_ROOT_PATH  <- "~/workspace"
CODE_PATH      <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH    <- file.path(SNT_ROOT_PATH, "configuration")
DATA_PATH      <- file.path(SNT_ROOT_PATH, "data", "dhis2")

# Load shared utility functions (provides install_and_load() and the dataset helpers used below)
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages required by this notebook
required_packages <- c(
  "dplyr", "tidyr", "terra", "ggplot2", "stringr", "lubridate", "viridis",
  "patchwork", "zoo", "purrr", "arrow", "sf", "reticulate", "leaflet"
)

# Install (if needed) and attach the packages
install_and_load(required_packages)

# Point reticulate at the conda Python so that openhexa.sdk is imported from the right environment.
# This must be set before Python is initialised by py_config() / import().
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
openhexa <- import("openhexa.sdk")

# Load the SNT configuration; abort with an explicit message if it cannot be read
config_json <- tryCatch(
  {
    jsonlite::fromJSON(file.path(CONFIG_PATH, "SNT_config.json"))
  },
  error = function(e) {
    # Separator added so the underlying error is not glued to the prefix
    msg <- paste0("Error while loading configuration: ", conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)

# Environment variables required by the sf package (PROJ / GDAL data directories)
Sys.setenv(PROJ_LIB = "/opt/conda/share/proj")
Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal")

In [None]:
# Values read from the SNT configuration
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
DATASET_NAME <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_INCIDENCE

# Pyramid columns holding the administrative level 1 and 2 names (upper-cased config values)
ADMIN_1_NAME <- config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1 %>% toupper()
ADMIN_2_NAME <- config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2 %>% toupper()

# Matching ID columns: same column name with the first "_NAME" swapped for "_ID"
ADMIN_1_ID <- sub("_NAME", "_ID", ADMIN_1_NAME, fixed = TRUE)
ADMIN_2_ID <- sub("_NAME", "_ID", ADMIN_2_NAME, fixed = TRUE)

In [None]:
# Echo the resolved administrative-level column names (ID and NAME for levels 1 and 2)
# so they can be checked visually before they are used to select pyramid columns.
ADMIN_1_ID
ADMIN_2_ID
ADMIN_1_NAME
ADMIN_2_NAME

In [None]:
# Print the dimensions of a table, labelled with its name.
#
# Args:
#   df:   object accepted by nrow() / ncol() (e.g. a data frame).
#   name: label to print; defaults to the expression passed as `df`.
#
# Output (via cat): "Dimensions of <name> : <rows> rows x <cols> columns", then a blank line.
#
# The counts are formatted with `scientific = FALSE` because cat() would otherwise
# print an integer such as 100000 as "1e+05".
printdim <- function(df, name = deparse(substitute(df))) {
  n_rows <- format(nrow(df), scientific = FALSE)
  n_cols <- format(ncol(df), scientific = FALSE)
  cat("Dimensions of", name, ":", n_rows, "rows x", n_cols, "columns\n\n")
}

## 2. Load data

#### 2.1. Shapes

In [None]:
# Fetch the country's shapes file (geojson) from the latest version of the
# formatted DHIS2 dataset; on failure, print the message and abort.
DATASET_DHIS2 <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
shapes_data <- tryCatch(
  get_latest_dataset_file_in_memory(
    DATASET_DHIS2,
    paste0(COUNTRY_CODE, "_shapes.geojson")
  ),
  error = function(err) {
    failure_text <- paste("Error while loading DHIS2 Shapes data for: ", COUNTRY_CODE, conditionMessage(err))
    cat(failure_text)
    stop(failure_text)
  }
)

In [None]:
# List the columns available in the shapes data
names(shapes_data)

#### 2.2. Pyramid
Needed to add back `*_NAME` cols

In [None]:

# Fetch the country's pyramid file (parquet) from the latest version of the
# formatted DHIS2 dataset; on failure, print the message and abort.
pyramid_data <- tryCatch(
  {
    get_latest_dataset_file_in_memory(DATASET_DHIS2, paste0(COUNTRY_CODE, "_pyramid.parquet"))
  },
  error = function(e) {
    # Message corrected: this cell loads the pyramid, not the shapes
    msg <- paste("Error while loading DHIS2 Pyramid data for: ", COUNTRY_CODE, conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)

In [None]:
# Build the ADM1/ADM2 lookup table: keep only the ID and NAME columns of the two
# configured administrative levels, renamed to the standard ADM*_ID / ADM*_NAME
# names, with duplicate rows removed.
#
# ADMIN_*_ID / ADMIN_*_NAME are character variables holding column names, so they
# are wrapped in all_of(): passing an external vector bare to select() is
# deprecated and ambiguous if the data ever has a column with the variable's name.
pyramid <- pyramid_data %>%
  select(
    ADM1_ID = all_of(ADMIN_1_ID),
    ADM1_NAME = all_of(ADMIN_1_NAME),
    ADM2_ID = all_of(ADMIN_2_ID),
    ADM2_NAME = all_of(ADMIN_2_NAME)
  ) %>%
  distinct()

head(pyramid, 3)

#### 2.3. Monthly cases
Needed for <b>coherence checks</b>:
* **TPR** at monthly level over time 
    * Explain changes (or lack thereof) between Crude and Adj1
    * Useful to monitor resistance
* **Reporting Rate**
    * Explain changes (or lack thereof) between Adj1 and Adj2
* **Indicators** coherence:
    * SUSP > TEST
    * TEST > CONF
    * ... (check and add more ...)


⚠️ Note: **Import** from 📁`/data/` folder (not OH Dataset) - TBD if this intermediate file should also be stored in the dataset "SNT_DHIS2_INCIDENCE" ...

In [None]:
# Read the monthly cases parquet file from the local data folder
# (<DATA_PATH>/incidence/<COUNTRY_CODE>_monthly_cases.parquet)
monthly_cases <- file.path(DATA_PATH, "incidence", paste0(COUNTRY_CODE, "_monthly_cases.parquet")) %>%
  arrow::read_parquet()

# Quick look at the size and first rows
dim(monthly_cases)
head(monthly_cases, n = 3)

In [None]:
# Bring in ADM1_NAME / ADM2_NAME from the pyramid lookup, matching on both ID columns
monthly_cases <- monthly_cases %>%
  left_join(y = pyramid, by = join_by(ADM1_ID, ADM2_ID))

In [None]:
# Preview the first rows after the join with the pyramid
head(monthly_cases, 3)

#### 2.4. Yearly Incidence

In [None]:
# Identify the parquet file in the latest version of the incidence dataset
# (looked up dynamically, not hard-coded).

dataset_last_version <- openhexa$workspace$get_dataset(DATASET_NAME)$latest_version

files_iter <- dataset_last_version$files

# Drain the Python iterator into an R list.
# Any error raised while fetching or converting the next element is treated as
# "no more files" (same stop condition as before).
files <- list()
repeat {
  file <- tryCatch(
    py_to_r(iter_next(files_iter)),
    error = function(e) NULL
  )

  if (is.null(file)) break

  files[[length(files) + 1L]] <- file
}

# File names, always as a character vector (also when the dataset has no files)
filenames <- vapply(files, function(f) as.character(f$filename), character(1))

# Files whose name ends with the literal extension ".parquet"
# (the dot is escaped and the pattern anchored, so e.g. "x_parquet.csv" no longer matches)
parquet_index <- which(grepl("\\.parquet$", filenames))

# Exactly one candidate is expected; fail with an explicit message otherwise
# instead of a cryptic subscript error from `files[[parquet_index]]`.
if (length(parquet_index) != 1L) {
  stop(
    "Expected exactly one .parquet file in dataset '", DATASET_NAME,
    "' but found ", length(parquet_index),
    if (length(parquet_index) > 1L) paste0(": ", paste(filenames[parquet_index], collapse = ", ")),
    call. = FALSE
  )
}

# Assign filename to variable
filename_to_import <- files[[parquet_index]]$filename

print(paste0("Identified incidence file to be imported: ", filename_to_import))

In [None]:
# Load the yearly incidence file identified above from the latest version of the
# incidence dataset; on failure, print the message and abort.
yearly_incidence <- tryCatch(
  {
    get_latest_dataset_file_in_memory(DATASET_NAME, filename_to_import)
  },
  error = function(e) {
    # Message corrected: this cell loads the incidence file, not a seasonality file
    msg <- paste("Error while loading yearly incidence file for: ", COUNTRY_CODE, conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)

# Quick look at the size and first rows
dim(yearly_incidence)
head(yearly_incidence, 3)

## Coherence checks - resume here ❌❌❌
WIP by GP 20251219

See Jira: https://bluesquare.atlassian.net/browse/SNT25-272

#### 1. TPR (monthly) over time

In [None]:
# Monthly Test Positivity Rate (TPR): one line per ADM2, faceted by YEAR (columns)
# and ADM1_NAME (rows). The y axis is shown as a percentage and limited to [0, 1].
# NOTE(review): the accented character in the title looks mis-encoded; kept as-is here.
tpr_plot <- ggplot(monthly_cases) +
  geom_line(
    aes(x = MONTH, y = TPR, group = ADM2_NAME),
    alpha = 0.75
  ) +
  facet_grid(
    cols = vars(YEAR), rows = vars(ADM1_NAME),
    switch = "y"
  ) +
  scale_x_continuous(breaks = seq(1, 12, 1)) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1L), limits = c(0, 1)) +
  geom_hline(
    yintercept = 0,
    color = "grey21",
    linewidth = 0.5
  ) +
  labs(
    title = "Taux de Positivit√© des Tests (TPR) pour ADM2 et mois"
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 0.5),
    strip.placement = "outside",
    # No trailing comma after the last argument: an empty argument in theme()
    # is not accepted by every ggplot2 version
    axis.title.y = element_blank()
  )

# Display the plot in the notebook
tpr_plot

# Save the plot as an A4-sized PNG under the workspace root.
# The path is built from SNT_ROOT_PATH instead of a hard-coded home directory,
# the output folder is created if missing, and the plot object is passed
# explicitly rather than relying on the last plot displayed.
tpr_output_path <- path.expand(
  file.path(SNT_ROOT_PATH, "pipelines", "snt_dhis2_incidence", "reporting", "outputs", "TPR.png")
)
dir.create(dirname(tpr_output_path), recursive = TRUE, showWarnings = FALSE)

ggsave(
  filename = tpr_output_path,
  plot = tpr_plot,
  units = "cm",
  width = 21,
  height = 29.7,
  dpi = 200
)

#### 2. RR

#### 3. Check on indicators (scatterplots)
Check other nb, sources, notes ... 

## Incidence du paludisme par année par district sanitaire
⚠️ This was not written by GP: make sure it doesn't break ...

In [None]:
# Attach administrative names to the yearly incidence table.
#
# The join uses the de-duplicated `pyramid` lookup (built from the configured
# ADMIN_1 / ADMIN_2 columns) instead of the raw `pyramid_data` with a hard-coded
# "LEVEL_3_ID" key: the hard-coded level ignores the configuration, and the raw
# pyramid may hold several rows per ADM2 (it needs distinct() when `pyramid` is
# built), which would multiply the incidence rows.
# Only lookup columns not already present in `yearly_incidence` are added, and
# the lookup is reduced to one row per ADM2_ID so the row count is preserved.
# NOTE(review): `incidence_data` no longer carries the other pyramid columns;
# the cells below only use ADM2_ID, YEAR, POPULATION and INCIDENCE_* columns.
incidence_data <- yearly_incidence %>%
  left_join(
    pyramid %>%
      select(ADM2_ID, any_of(setdiff(names(pyramid), names(yearly_incidence)))) %>%
      distinct(ADM2_ID, .keep_all = TRUE),
    by = "ADM2_ID"
  )

printdim(incidence_data)
head(incidence_data)

In [None]:
# Yearly incidence maps: one panel per incidence type (rows) and year (columns).
# NOTE(review): the accented characters in the labels below look mis-encoded.
# They are kept unchanged because the same strings are used as factor levels and
# as the names of `incidence_colors`, which a later cell also relies on.

# Step 1: Reshape to long format (one row per ADM2 x year x incidence type)
# and add a display label for each incidence type
incidence_long <- incidence_data %>%
  select(ADM2_ID, YEAR, POPULATION,
         INCIDENCE_CRUDE,
         INCIDENCE_ADJ_TESTING,
         INCIDENCE_ADJ_REPORTING,
         INCIDENCE_ADJ_CARESEEKING) %>%
  pivot_longer(
    cols = starts_with("INCIDENCE"),
    names_to = "INCIDENCE_TYPE",
    values_to = "incidence"
  ) %>%
  mutate(
    incidence_type_label = case_when(
      INCIDENCE_TYPE == "INCIDENCE_CRUDE"             ~ "Brute",
      INCIDENCE_TYPE == "INCIDENCE_ADJ_TESTING"       ~ "Ajust√©e 1 (Test)",
      INCIDENCE_TYPE == "INCIDENCE_ADJ_REPORTING"     ~ "Ajust√©e 2 (Test + Compl√©tude)",
      INCIDENCE_TYPE == "INCIDENCE_ADJ_CARESEEKING"   ~ "Ajust√©e 3 (Test + Compl√©tude + Soins)",
      TRUE ~ INCIDENCE_TYPE
    )
  )

# Step 2: Join with the shapes (shapes on the left, so every polygon is kept)
map_data_long <- shapes_data %>%
  left_join(incidence_long, by = "ADM2_ID")

# Step 3: Categorize incidence for coloring
# The third cut-off is 450, matching the category labels ("250 ... 450",
# "450 ... 1000"); the previous code used 500, so values in [450, 500) were
# labelled as below 450.
# NOTE(review): confirm 450 (and not 500 with relabelled categories) is intended.
map_data_long <- map_data_long %>%
  mutate(
    incidence_cat = case_when(
      is.na(incidence)       ~ "NA",
      incidence < 100        ~ "0 √† 100",
      incidence < 250        ~ "100 √† 250",
      incidence < 450        ~ "250 √† 450",
      incidence < 1000       ~ "450 √† 1000",
      TRUE                   ~ "> √† 1000"
    ),
    # Fix the legend order (lowest to highest, missing last)
    incidence_cat = factor(incidence_cat, levels = c(
      "0 √† 100", "100 √† 250", "250 √† 450", "450 √† 1000", "> √† 1000", "NA"
    )),
    # Fix the facet order (crude first, then successive adjustments)
    incidence_type_label = factor(incidence_type_label, levels = c(
      "Brute",
      "Ajust√©e 1 (Test)",
      "Ajust√©e 2 (Test + Compl√©tude)",
      "Ajust√©e 3 (Test + Compl√©tude + Soins)"
    ))
  )

# Step 4: Fill colors, keyed by incidence category (also used by the mean-incidence map)
incidence_colors <- c(
  "0 √† 100"      = "#bdd7e7",
  "100 √† 250"   = "#fcae91",
  "250 √† 450"   = "#fb6a4a",
  "450 √† 1000"  = "#cb181d",
  "> √† 1000"    = "#67000d",
  "NA"          = "#000000"
)

# Step 5: Plot
# `linewidth` (not `size`) sets the polygon border width, consistent with the
# `linewidth` usage elsewhere in this notebook.
options(repr.plot.width = 20, repr.plot.height = 12)
ggplot(map_data_long) +
  geom_sf(aes(fill = incidence_cat), color = "white", linewidth = 0.2) +
  facet_grid(rows = vars(incidence_type_label), cols = vars(YEAR)) +
  scale_fill_manual(values = incidence_colors, name = "Incidence (pour 1000)") +
  labs(
    title = "Incidence annuelle du paludisme par district sanitaire",
    subtitle = "Brute et ajust√©e selon les √©tapes OMS"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    strip.text = element_text(face = "bold", size = 12),
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 13),
    legend.position = "right"
  )


In [None]:
# Mean incidence per ADM2 and incidence type, averaged over all available years.
# NOTE(review): the accented characters in the labels below look mis-encoded.
# They are kept unchanged because they must match the names of `incidence_colors`.

# Step 1: Compute mean incidence across years (unweighted mean, missing values ignored)
# POPULATION is no longer selected: it was dropped by summarise() and never used.
mean_incidence <- incidence_data %>%
  select(ADM2_ID,
         INCIDENCE_CRUDE,
         INCIDENCE_ADJ_TESTING,
         INCIDENCE_ADJ_REPORTING,
         INCIDENCE_ADJ_CARESEEKING) %>%
  pivot_longer(
    cols = starts_with("INCIDENCE"),
    names_to = "INCIDENCE_TYPE",
    values_to = "incidence"
  ) %>%
  group_by(ADM2_ID, INCIDENCE_TYPE) %>%
  summarise(incidence = mean(incidence, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    incidence_type_label = case_when(
      INCIDENCE_TYPE == "INCIDENCE_CRUDE"             ~ "Brute",
      INCIDENCE_TYPE == "INCIDENCE_ADJ_TESTING"       ~ "Ajust√©e 1 (Test)",
      INCIDENCE_TYPE == "INCIDENCE_ADJ_REPORTING"     ~ "Ajust√©e 2 (Test + Compl√©tude)",
      INCIDENCE_TYPE == "INCIDENCE_ADJ_CARESEEKING"   ~ "Ajust√©e 3 (Test + Compl√©tude + Soins)",
      TRUE ~ INCIDENCE_TYPE
    )
  )

# Step 2: Join with the shapes (shapes on the left, so every polygon is kept)
map_data_mean <- shapes_data %>%
  left_join(mean_incidence, by = "ADM2_ID")

# Step 3: Categorize
# The third cut-off is 450, matching the category labels ("250 ... 450",
# "450 ... 1000"); the previous code used 500, so values in [450, 500) were
# labelled as below 450.
# NOTE(review): confirm 450 (and not 500 with relabelled categories) is intended.
map_data_mean <- map_data_mean %>%
  mutate(
    incidence_cat = case_when(
      is.na(incidence)       ~ "NA",
      incidence < 100        ~ "0 √† 100",
      incidence < 250        ~ "100 √† 250",
      incidence < 450        ~ "250 √† 450",
      incidence < 1000       ~ "450 √† 1000",
      TRUE                   ~ "> √† 1000"
    ),
    # Fix the legend order (lowest to highest, missing last)
    incidence_cat = factor(incidence_cat, levels = c(
      "0 √† 100", "100 √† 250", "250 √† 450", "450 √† 1000", "> √† 1000", "NA"
    )),
    # Fix the facet order (crude first, then successive adjustments)
    incidence_type_label = factor(incidence_type_label, levels = c(
      "Brute",
      "Ajust√©e 1 (Test)",
      "Ajust√©e 2 (Test + Compl√©tude)",
      "Ajust√©e 3 (Test + Compl√©tude + Soins)"
    ))
  )

In [None]:
# Step 4: Plot mean incidence, one panel per incidence type.
# Fill colors come from `incidence_colors`, defined in the yearly-map cell.
# `linewidth` (not `size`) sets the polygon border width, consistent with the
# `linewidth` usage elsewhere in this notebook.
ggplot(map_data_mean) +
  geom_sf(aes(fill = incidence_cat), color = "white", linewidth = 0.2) +
  facet_wrap(~ incidence_type_label) +
  scale_fill_manual(values = incidence_colors, name = "Incidence moyenne (pour 1000)") +
  labs(
    title = "Incidence moyenne du paludisme par district sanitaire",
    subtitle = "Moyenne annuelle (toutes ann√©es confondues)",
    x = NULL, y = NULL
  ) +
  theme_minimal(base_size = 16) +
  theme(
    strip.text = element_text(face = "bold", size = 16),
    plot.title = element_text(face = "bold", size = 20),
    plot.subtitle = element_text(size = 16),
    legend.position = "right"
  )