# Accès aux soins de santé  

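Proportion of the population living within a given radius of a health facility (HF). The default radius is 5 km (roughly a 60-minute walk) and can be changed with the `RADIUS_METERS` parameter.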

In [None]:
# Timestamp at the start of the notebook run
Sys.time()

## 1. Setup

In [None]:
# Workspace root: every other location is derived from it
ROOT_PATH <- "~/workspace"

# Shared folders (code, configuration, data)
CODE_PATH <- file.path(ROOT_PATH, "code")
CONFIG_PATH <- file.path(ROOT_PATH, "configuration")
DATA_PATH <- file.path(ROOT_PATH, "data")

# Folder of this pipeline
PROJECT_PATH <- file.path(ROOT_PATH, "pipelines/snt_healthcare_access")

# Where the results (tables and plots) are written
OUTPUT_DATA_PATH <- file.path(DATA_PATH, "healthcare_access")
OUTPUT_PLOTS_PATH <- file.path(PROJECT_PATH, "reporting", "outputs")

**Validate parameters**

In [None]:
# Parameters: values already defined in the session are kept,
# otherwise the defaults below are used.

# Optional file (full path) with health facility locations
if (!exists("FOSA_FILE")) {
  FOSA_FILE <- NULL
}

# Population raster file to use (full path)
if (!exists("POP_FILE")) {
  POP_FILE <- NULL
}

# Radius around each health facility, in meters
if (!exists("RADIUS_METERS")) {
  RADIUS_METERS <- 5000
}

In [None]:
# Echo the run parameters.
# glue() returns a zero-length result when an interpolated value is NULL, so
# the optional file parameters (NULL by default) are replaced by a readable
# placeholder before printing; otherwise nothing would be shown for them.
fosa_file_label <- if (is.null(FOSA_FILE)) "not provided" else FOSA_FILE
pop_file_label <- if (is.null(POP_FILE)) "not provided" else POP_FILE

print(glue::glue("FOSA file: {fosa_file_label}"))
print(glue::glue("Population file: {pop_file_label}"))
print(glue::glue("Radius meters: {RADIUS_METERS}"))

In [None]:
# Placeholder cell: the FOSA geolocation quality check is executed later,
# after fosa_dt is loaded. NULL keeps this cell a valid no-op.
NULL

In [None]:
# Global settings: print numbers without scientific notation
options(scipen = 999)

# Load the shared SNT helper functions (install_and_load, log_msg, ...)
source(file.path(CODE_PATH, "snt_utils.r"))

# Required packages (each one listed once)
required_packages <- c(
  "jsonlite", "dplyr", "data.table", "ggplot2", "arrow", "glue",
  "sf", "terra", "httr", "reticulate", "stringr"
)
install_and_load(required_packages)

# Let terra use at most half of the available RAM
terraOptions(memfrac = 0.5)

# OpenHEXA SDK, accessed through reticulate
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
openhexa <- import("openhexa.sdk")

# Load the SNT configuration; abort with an explicit message on failure
config_file <- file.path(CONFIG_PATH, "SNT_config.json")
config_json <- tryCatch(
  fromJSON(config_file),
  error = function(e) {
    # A separator is needed between the fixed text and the error details
    msg <- paste0("Error while loading configuration: ", conditionMessage(e))
    cat(msg)
    stop(msg)
  }
)

log_msg(glue("SNT configuration loaded from: {config_file}"))

In [None]:
# Values read from the SNT configuration
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
ORG_UNITS_LEVEL <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL
print(paste("Country code: ", COUNTRY_CODE))

# Names of the columns used throughout the notebook
longitude_col <- "LONGITUDE"
latitude_col <- "LATITUDE"
# sf expects x (longitude) first, then y (latitude)
coordinate_cols <- c(longitude_col, latitude_col)
# Column holding the closing date of a facility (FOSA status)
status_closed_col <- "CLOSED_DATE"

# Global variables
admin_col <- "ADM2_ID"
# WorldPop available data until 2020
reference_year <- 2020
# Geographic CRS (degrees), used for plotting
country_epsg_degrees <- 4326
# Projected CRS (meters), used to build the buffer areas
# NOTE(review): 32630 is hard-coded for every country - confirm it is
# appropriate for each COUNTRY_CODE this notebook is run with.
country_epsg_meters <- 32630

## 2. Load data

### 2.1. Load spatial administrative unit data

In [None]:
# Identifier of the formatted DHIS2 dataset that holds the shapes file
dhis2_formatted_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED

# Load the administrative shapes (vector data); abort with details on failure
spatial_units_data <- tryCatch(
  get_latest_dataset_file_in_memory(
    dhis2_formatted_dataset,
    paste0(COUNTRY_CODE, "_shapes.geojson")
  ),
  error = function(e) {
    stop(glue("[ERROR] Error while loading DHIS2 Shapes data for: {paste0(COUNTRY_CODE, '_shapes.geojson')} [ERROR DETAILS] {conditionMessage(e)}"))
  }
)

log_msg(glue("Shapes file downloaded successfully from dataset: {dhis2_formatted_dataset}"))

Make the related data objects: the admin data and the country polygon

In [None]:
# Reproject the shapes to the geographic CRS (degrees)
spatial_units_data <- reproject_epsg(spatial_units_data, country_epsg_degrees)

# Attribute table of the administrative units, without geometry.
# setDT() converts in place, so a single call is enough.
admin_data <- setDT(st_drop_geometry(spatial_units_data))

# Polygon of the whole country (union of all administrative units)
all_country <- st_union(spatial_units_data)

### 2.2. Load population data

In [None]:
# Load the population raster (.tif).
# - POP_FILE not provided: use the most recently modified WorldPop raster
#   found in <DATA_PATH>/worldpop/raw for this country.
# - POP_FILE provided: use that file.
# In both cases reference_year is updated when a 4-digit year can be read
# from the file name.
if (is.null(POP_FILE)) {
    worldpop_dir <- file.path(DATA_PATH, "worldpop", "raw")
    # Human-readable pattern, used in the error message only
    worldpop_pattern <- glue("{COUNTRY_CODE}_worldpop_ppp_*.tif")
    worldpop_candidates <- list.files(
        worldpop_dir,
        pattern = glue("^{COUNTRY_CODE}_worldpop_ppp_[0-9]{{4}}\\.tif$"),
        full.names = TRUE
    )

    if (length(worldpop_candidates) == 0) {
        stop(glue("No WorldPop raster found in {worldpop_dir} for pattern {worldpop_pattern}"))
    }

    # Pick latest extraction by file modification time
    file_info <- file.info(worldpop_candidates)
    latest_idx <- which.max(file_info$mtime)
    pop_filename <- worldpop_candidates[latest_idx]

    # Year from the file name, falling back to the modification year
    reference_year <- as.integer(stringr::str_extract(basename(pop_filename), "[0-9]{4}"))
    if (is.na(reference_year)) {
        reference_year <- as.integer(format(as.Date(file_info$mtime[latest_idx]), "%Y"))
    }

    # The file is selected by modification time, not by year: say so in the log
    pop_loaded_msg <- glue("Population raster data loaded: {pop_filename} (auto-selected most recently modified file, year: {reference_year})")
} else {
    pop_filename <- POP_FILE

    # Try to infer year for reporting file names when user provides custom raster.
    year_from_pop_file <- as.integer(stringr::str_extract(basename(POP_FILE), "[0-9]{4}"))
    if (!is.na(year_from_pop_file)) {
        reference_year <- year_from_pop_file
    }

    pop_loaded_msg <- glue("Population raster data loaded: {POP_FILE}")
}

# Single loading point for both cases, with one shared error message
pop_data <- tryCatch(
    rast(pop_filename),
    error = function(e) {
        stop(glue("Error while loading population raster: {conditionMessage(e)}"))
    }
)
log_msg(pop_loaded_msg)

### 2.3. Load data to use for FOSA locations

Import the locations (points) of the healthcare units.  
- If the input file (`FOSA_FILE`) is not provided, the available DHIS2 pyramid data is loaded instead.

In [None]:
# Import the health facility (FOSA) table through the shared helper:
# FOSA_FILE when provided, otherwise the pipeline DHIS2 dataset.
fosa_dt <- import_fosa_data(
  input_file_path = FOSA_FILE,
  pipeline_dhis2_dataset = dhis2_formatted_dataset,
  pipeline_country_code = COUNTRY_CODE,
  latitude_colname = latitude_col,
  longitude_colname = longitude_col
)

# Work with a data.table from here on, and preview the first rows
setDT(fosa_dt)
head(fosa_dt, 3)

## 3. FOSA data pre-processing  
  
Determine active health facilities:  
- Remove closed health facilities from the FOSA table using the `CLOSED_DATE` column (if available).  

Clean coordinates:  
- Remove health facilities with incomplete coordinates (longitude, latitude) from the FOSA table.

In [None]:
# Rename the coordinate columns to the names expected by the notebook.
# Columns are looked up case-insensitively (e.g. "longitude" -> "LONGITUDE").
input_longitude_col <- grep(glue::glue("^{longitude_col}$"), names(fosa_dt), ignore.case = TRUE, value = TRUE)
input_latitude_col <- grep(glue::glue("^{latitude_col}$"), names(fosa_dt), ignore.case = TRUE, value = TRUE)

# Exactly one match per coordinate is required; fail with an explicit message
# instead of a length-mismatch error raised by setnames().
if (length(input_longitude_col) != 1 || length(input_latitude_col) != 1) {
    stop(glue::glue(
        "Expected exactly one '{longitude_col}' and one '{latitude_col}' column (case-insensitive) in the FOSA data, ",
        "found {length(input_longitude_col)} and {length(input_latitude_col)}."
    ))
}

setnames(fosa_dt, old = c(input_longitude_col, input_latitude_col), new = c(longitude_col, latitude_col))

# Display the (lower-cased) column names for inspection
tolower(names(fosa_dt))

In [None]:
# Quality check: count facilities with geographic coordinates
# (run after fosa_dt has been loaded and coordinate columns standardized)
total_facilities <- nrow(fosa_dt)

# An empty table would give NaN percentages and a cryptic failure in the
# comparison below, so stop early with an explicit message.
if (total_facilities == 0) {
    stop("[ERROR] The FOSA table is empty: no health facility available to compute healthcare access.")
}

facilities_with_coords <- fosa_dt[(!is.na(get(longitude_col))) & (!is.na(get(latitude_col))), .N]
facilities_missing_coords <- total_facilities - facilities_with_coords
pct_with_coords <- round(100 * facilities_with_coords / total_facilities, 2)
pct_missing_coords <- round(100 * facilities_missing_coords / total_facilities, 2)

log_msg(glue("FOSA geolocation quality check:"))
log_msg(glue("  Total facilities: {total_facilities}"))
log_msg(glue("  Facilities with longitude AND latitude: {facilities_with_coords} ({pct_with_coords}%)"))
log_msg(glue("  Facilities missing coordinates: {facilities_missing_coords} ({pct_missing_coords}%)"))

# Warn (without stopping) when fewer than half of the facilities are geolocated
if (pct_with_coords < 50) {
    log_msg(glue("[WARNING] Less than 50% of facilities have geographic coordinates. Data quality may be insufficient for reliable healthcare access analysis."), level = "warning")
}

print(glue("Geolocation coverage: {facilities_with_coords}/{total_facilities} ({pct_with_coords}%)"))

### 3.1. Remove health facilities with **CLOSED_DATE** (if present)

In [None]:
# If the data contains a closing-date column, keep only the units without a
# closing date (i.e. still in operation).
# The column is detected case-insensitively, so the name actually present in
# the table must be used to read it (get() is case-sensitive).
closed_col_matches <- names(fosa_dt)[tolower(names(fosa_dt)) == tolower(status_closed_col)]
if (length(closed_col_matches) > 0) {
    # Prefer the exact expected name when several spellings are present
    closed_col <- if (status_closed_col %in% closed_col_matches) status_closed_col else closed_col_matches[1]

    initial_rows <- nrow(fosa_dt)
    fosa_dt <- fosa_dt[is.na(get(closed_col)), ]
    filtered_rows <- nrow(fosa_dt)
    removed_rows <- initial_rows - filtered_rows
    log_msg(glue("Removed {removed_rows} observations, FOSA which are no longer in operation."))
}

### 3.2. Specific case for **Niger (NER)**, we include two extra filters for health facilities:  
  
#### 3.2.1 Remove closed health facilities from the FOSA table where the name contains `(clôture)` or `(fermé)`.  
  
- Names are compared case-insensitively and without regard to accents.

In [None]:
# Niger only: remove health facilities whose name marks them as closed
# ("clôture" / "fermé"), ignoring case and accents.
if (COUNTRY_CODE == "NER") {
    # Accent-tolerant patterns: match both the accented and unaccented spelling
    # (\p{M}* also accepts a combining accent stored as a separate character).
    closing_patterns <- c("cl[oô]\\p{M}*ture", "ferm[eé]")
    ou_column_name <- as.character(glue("LEVEL_{ORG_UNITS_LEVEL}_NAME"))

    if (!(ou_column_name %in% names(fosa_dt))) {
        # Without the name column the filter cannot run; skipping avoids
        # selecting with a zero-length logical vector.
        log_msg(glue("[WARNING] Column '{ou_column_name}' not found in FOSA data; skipping the name-based closed facility filter."), level = "warning")
    } else {
        # Logical vector to select rows
        remove_rows <- str_detect(
            fosa_dt[[ou_column_name]],
            regex(paste0(closing_patterns, collapse = "|"), ignore_case = TRUE)
        )
        # A missing name is not evidence of closure: keep those rows
        # (an NA in the selector would otherwise drop them silently).
        remove_rows <- !is.na(remove_rows) & remove_rows

        rows_removed <- fosa_dt[remove_rows, ]
        fosa_dt <- fosa_dt[!remove_rows, ]

        print(glue("FOSA Dimensions: {paste(dim(fosa_dt), collapse=',')}"))

        # removed rows (displayed as the cell output):
        rows_removed
    }
}

#### 3.2.2 Remove closed health facilities that belong to the `Structures Clôturées` organisation unit group.  

In [None]:
# Niger only: use the organisation unit group "Structures Clôturées", which
# lists the ids of the closed health facilities, to remove them from the table.

ou_groups <- file.path(DATA_PATH, "dhis2/extracts_raw/organisation_unit_groups/NER_organisation_unit_groups.parquet")
# Scalar condition: use the short-circuit operator
if (COUNTRY_CODE == "NER" && file.exists(ou_groups)) {
    log_msg(glue("Filtering NER organisation units with: {ou_groups}"))
    org_units <- read_parquet(ou_groups)
    log_msg(glue("FOSA table dimensions: {paste(dim(fosa_dt), collapse=', ')}"))

    # Select list: Structures Clôturées (oshkuclYJAw).
    # Guard against the group being absent from the extract.
    closed_group_rows <- which(org_units$id == "oshkuclYJAw")
    structures_cloturees <- if (length(closed_group_rows) > 0) {
        org_units$organisation_units[[closed_group_rows[1]]]
    } else {
        character(0)
    }

    if (length(structures_cloturees) > 0) {
        # The group lists organisation unit ids, so match on the ID column.
        # The NAME column is still checked to stay compatible with the
        # previous behaviour.
        ou_column_id <- as.character(glue("LEVEL_{ORG_UNITS_LEVEL}_ID"))
        ou_column_name <- as.character(glue("LEVEL_{ORG_UNITS_LEVEL}_NAME"))
        match_cols <- intersect(c(ou_column_id, ou_column_name), names(fosa_dt))

        is_closed <- rep(FALSE, nrow(fosa_dt))
        for (match_col in match_cols) {
            is_closed <- is_closed | (fosa_dt[[match_col]] %in% structures_cloturees)
        }

        # Filter fosa_dt: keep rows that are NOT in structures_cloturees
        fosa_dt <- fosa_dt[!is_closed, ]
    }

    log_msg(glue("FOSA list filtered using organisation units list Structures Clôturées, FOSA table dimensions: {paste(dim(fosa_dt), collapse=', ')}"))
}

### 3.3. Remove units which are missing either latitude or longitude

In [None]:
# Keep the distinct coordinate pairs of the facilities that have both a
# longitude and a latitude; every other column is dropped.
has_both_coords <- !is.na(fosa_dt[[longitude_col]]) & !is.na(fosa_dt[[latitude_col]])
fosa_dt <- unique(fosa_dt[has_both_coords, coordinate_cols, with = FALSE])
print(dim(fosa_dt))

In [None]:
# Convert the FOSA data table to an sf point layer using the coordinate columns
fosa_vect <- st_as_sf(fosa_dt, coords = coordinate_cols, crs = country_epsg_degrees)

# Keep only the points located within the country polygon
fosa_vect_filtered <- sf::st_filter(fosa_vect, all_country, .predicate = st_within)

# Report the number of points actually kept (after the boundary filter)
log_msg(glue("Using {nrow(fosa_vect_filtered)} distinct observations, which have geographic coordinates within the country boundaries."))

In [None]:
# Drop the intermediate objects that are no longer needed, then collect garbage
rm(fosa_vect, all_country, fosa_dt)
gc()

## 4. Start health access computation

In [None]:
# Build one buffer of RADIUS_METERS around each health facility
overlapping_coverage_vect <- make_coverage_radii_sf(
  input_vect = fosa_vect_filtered,
  radius_meters = RADIUS_METERS,
  coordinate_colnames = coordinate_cols,
  epsg_value_meters = country_epsg_meters,
  epsg_value_degrees = country_epsg_degrees
)

In [None]:
# Merge all the overlapping buffers into a single geometry, stored as sf
coverage_vect <- st_as_sf(st_union(overlapping_coverage_vect))

In [None]:
# Map of the administrative units, the health facilities and the covered area
coverage_plot <- make_overlaid_sf_plot(
  admin_unit_vect = spatial_units_data,
  points_sf_vect = fosa_vect_filtered,
  buffer_vect = coverage_vect,
  epsg_value_degrees = country_epsg_degrees,
  plot_title = glue("Healthcare coverage in {COUNTRY_CODE}")
)

# Make sure the output folder exists before saving (no-op when it already does)
dir.create(OUTPUT_PLOTS_PATH, recursive = TRUE, showWarnings = FALSE)

# Time-stamped file name so that successive runs do not overwrite each other
timestamp <- format(Sys.time(), "%Y-%m-%d_%H%M%S")
coverage_plot_filename <- glue("{COUNTRY_CODE}_coverage_plot_{timestamp}.png")
invisible(ggsave(filename = file.path(OUTPUT_PLOTS_PATH, coverage_plot_filename), plot = coverage_plot))
log_msg(glue("{COUNTRY_CODE} Coverage map saved: {file.path(OUTPUT_PLOTS_PATH, coverage_plot_filename)}"))

In [None]:
# The individual buffers and the facility points are no longer needed
rm(overlapping_coverage_vect, fosa_vect_filtered)
gc()

In [None]:
log_msg("Computing radii around each healthcare unit. This will take a few minutes..")

# Flag each cell of the population grid:
# 1 when it falls inside the covered area, 0 otherwise
pop_healthcare_rast <- make_rasterized_inclusion_data(
  raster_data = pop_data,
  buffer_vect = coverage_vect,
  value_inside = 1,
  value_outside = 0,
  epsg_value_degrees = country_epsg_degrees
)

log_msg("Radii around each healthcare unit done.")

In [None]:
# Stack the population layer with the 0/1 coverage layer and name both layers
layer_names <- c("POP_TOTAL", "COVERED")
pop_healthcare_data <- c(pop_data, pop_healthcare_rast)
names(pop_healthcare_data) <- layer_names

In [None]:
# Everything needed is now held in pop_healthcare_data: release the inputs
rm(coverage_vect, pop_healthcare_rast, pop_data)
gc()

In [None]:
log_msg("Computing the population inside the buffers. This is computationally intensive and will take a few minutes..")

# Make sure the output folder exists before writing (no-op when it already does)
dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)

# Covered population = population x coverage flag (1 inside, 0 outside).
# The result is written to a hidden working file in the output data folder,
# built from OUTPUT_DATA_PATH like the other outputs of this notebook.
pop_covered_path <- file.path(OUTPUT_DATA_PATH, ".pop_covered_healthcare.tif")
pop_covered_healthcare <- writeRaster(
  pop_healthcare_data$POP_TOTAL * pop_healthcare_data$COVERED,
  pop_covered_path,
  overwrite = TRUE
)

names(pop_covered_healthcare) <- "POP_COVERED"
log_msg("Population buffer counts done.")

In [None]:
# Free up resources: run the garbage collector before the next heavy step
gc()

In [None]:
# Prepare the administrative units for rasterization:
# drop Z/M dimensions, then repair invalid geometries
spatial_units_clean <- sf::st_make_valid(
  sf::st_zm(spatial_units_data, drop = TRUE, what = "ZM")
)

# Geometry collections cannot be rasterized as such: keep their polygon parts
unit_geometry_types <- as.character(sf::st_geometry_type(spatial_units_clean))
has_collections <- any(grepl("GEOMETRYCOLLECTION", unit_geometry_types, ignore.case = TRUE))
if (has_collections) {
  spatial_units_clean <- suppressWarnings(sf::st_collection_extract(spatial_units_clean, "POLYGON"))
}

# The admin identifier column is mandatory
admin_col_found <- admin_col %in% names(spatial_units_clean)
if (!admin_col_found) {
  stop(glue("Column '{admin_col}' not found in spatial units data for rasterization."))
}

# Integer zone id per admin unit, used as the raster cell value
spatial_units_clean$ZONE_ID <- as.integer(as.factor(spatial_units_clean[[admin_col]]))

# sf -> terra SpatVector; on failure, retry through an sp object
spatial_units_vect <- tryCatch(
  terra::vect(spatial_units_clean),
  error = function(e) {
    log_msg(glue("[WARNING] Direct sf->terra conversion failed: {conditionMessage(e)}. Trying via sp."), "warning")
    terra::vect(methods::as(spatial_units_clean, "Spatial"))
  }
)

# Rasterize the zones on the population grid
log_msg("Rasterizing the spatial units, based on the population grid.")
adm2_raster <- terra::rasterize(
  spatial_units_vect,
  pop_healthcare_data$POP_TOTAL,
  field = "ZONE_ID"
)
log_msg("Rasterization done.")

In [None]:
# Sum of the total population per zone (admin unit)
pop_total_by_adm2 <- terra::zonal(pop_healthcare_data$POP_TOTAL, adm2_raster, fun = "sum", na.rm = TRUE)
log_msg("Aggregated the total population by spatial units.")

# Sum of the covered population per zone (admin unit)
pop_cov_by_adm2 <- terra::zonal(pop_covered_healthcare, adm2_raster, fun = "sum", na.rm = TRUE)
log_msg("Aggregated the covered population by spatial units.")

adm2_pop_total <- setDT(as.data.frame(pop_total_by_adm2))
adm2_pop_covered <- setDT(as.data.frame(pop_cov_by_adm2))

# The zonal results have two columns: the zone first, the aggregated value
# second. Rename them by position to fixed names.
setnames(adm2_pop_total, old = 1:2, new = c("ZONE_ID", "POP_TOTAL"))
setnames(adm2_pop_covered, old = 1:2, new = c("ZONE_ID", "POP_COVERED"))

# Lookup table: integer zone id -> admin unit identifier
zone_lookup <- unique(
  as.data.table(sf::st_drop_geometry(spatial_units_clean))[, .SD, .SDcols = c("ZONE_ID", admin_col)]
)

# Attach the admin identifier to each aggregated table
adm2_pop_total <- merge(adm2_pop_total, zone_lookup, by = "ZONE_ID", all.x = TRUE)
adm2_pop_covered <- merge(adm2_pop_covered, zone_lookup, by = "ZONE_ID", all.x = TRUE)

# One row per admin unit with both the total and the covered population
output_df <- merge(
  adm2_pop_total[, .SD, .SDcols = c(admin_col, "POP_TOTAL")],
  adm2_pop_covered[, .SD, .SDcols = c(admin_col, "POP_COVERED")],
  by = admin_col,
  all = TRUE
)

# An empty result means the aggregation did not produce anything usable
if (nrow(output_df) == 0) {
  stop("Error: There was an error when computing covered population.")
}

In [None]:
# Share of the population covered, in percent
output_df[["PCT_HEALTH_ACCESS"]] <- 100 * output_df[["POP_COVERED"]] / output_df[["POP_TOTAL"]]

# Attach the results to the full list of admin units (all of them are kept)
output_df <- merge(admin_data, output_df, by = admin_col, all.x = TRUE)
head(output_df)

## 5. Save output results

In [None]:
# Save the results as CSV and Parquet, using the same file name stem
output_df_filename_stem <- glue("{COUNTRY_CODE}_population_covered_health")
output_csv_path <- file.path(OUTPUT_DATA_PATH, glue("{output_df_filename_stem}.csv"))
output_parquet_path <- file.path(OUTPUT_DATA_PATH, glue("{output_df_filename_stem}.parquet"))

fwrite(output_df, output_csv_path)
write_parquet(output_df, output_parquet_path)
log_msg(glue("Health access coverage saved: {file.path(OUTPUT_DATA_PATH, glue('{output_df_filename_stem}.parquet'))}"))

In [None]:
# Timestamp at the end of the notebook run
print(Sys.time())