## Specific fix for mixed organisation units Niger hierarchy  

These transformations make sense given the original format used to extract organisation units from DHIS2.  
Each row corresponds to a DHIS2 organisation unit with its complete hierarchy.

**Steps:**  
  
- Load the raw pyramid extracted from DHIS2.
- Load the organisation unit groups that define the target health facilities that should be at level 6 (this is our **priority list**).
- Parent references for OUs moved from level 3 to level 6 must be updated (district location); for this we use coordinates.
- Missing coordinates at level 3 are completed manually (coordinates extracted from Google Maps as lat, lon).
- We update any OUs from the **priority list** at level 4 whose parent was modified by the level 3 changes (set district), and move them to level 6.
- We update any OUs from the **priority list** at level 5 whose parent was modified by the level 3 changes (set district), and move them to level 6.
- Add the newly built **level 6** to the final table (includes all previous OUs moved to level 6).
- Save outputs to the expected output filename (parameter).


In [None]:
# PROJECT PATHS
# Root of the workspace and location of the shared code folder
SNT_ROOT_PATH <- "/home/hexa/workspace"  
CODE_PATH <- file.path(SNT_ROOT_PATH, 'code') 

# Load snt utils
# NOTE(review): helpers used in this notebook (install_and_load, log_msg, get_updated_children)
# are presumably defined in this sourced file -- confirm
source(file.path(CODE_PATH, "snt_utils.r"))

# Load libraries
required_packages <- c("arrow", "dplyr", "tidyverse", "jsonlite", "reticulate", "glue", "sf")
install_and_load(required_packages)

# Point reticulate at the conda Python interpreter, then import the OpenHEXA Python modules
# NOTE(review): RETICULATE_PYTHON is set after the packages above are loaded -- confirm it still takes effect
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
openhexa <- import("openhexa.sdk")
openhexa_toolbox <- import("openhexa.toolbox")

### Read the raw pyramid file (parquet)

In [None]:
# Stop right away when a required notebook parameter has not been defined in the current environment
if (!is.element("INPUT_PATH", ls())) {
  stop("[WARNING] Input parameter 'INPUT_PATH' not found. Exiting notebook.")
}
if (!is.element("OUTPUT_PATH", ls())) {
  stop("[WARNING] Input parameter 'OUTPUT_PATH' not found. Exiting notebook.")
}

In [None]:
# Read the raw pyramid from the parquet file given by the INPUT_PATH parameter.
# On failure the notebook stops; the underlying reader error is appended to the
# message so the actual cause is not lost.
pyramid_df <- tryCatch(
  read_parquet(file.path(INPUT_PATH)),
  error = function(e) {
    msg <- glue::glue("[WARNING] Error reading Parquet file at {INPUT_PATH}: {conditionMessage(e)}")
    stop(msg, call. = FALSE)
  }
)

print(dim(pyramid_df))
head(pyramid_df %>% select(-geometry), 3)  # geometry column is too long to print.
log_msg(glue::glue("NER organisation units transformation: {nrow(pyramid_df)} Total organisation units."))

### Load organisation units groups  

For details about this **priority list** (`liste_groupes_prioritaires`), see Stephan's code in pipelines/snt_dhis2_extract/dev/etl_extract_orgUnit_v2.ipynb.
  
**Ticket reference**: https://bluesquare.atlassian.net/browse/SNT25-253

In [None]:
# Ids of the organisation unit groups that make up the priority list
liste_groupes_prioritaires <- c(
  "EDbDMbIQtPD", "iGLtZMdDGMD", "KX9EuY75nGE", "bZlIiMRLRbA",
  "Ag0dMMJp4mH", "S6YdxQgX8SO", "sB6YOzTUHkF", "oihgQahh9LH",
  "DrQFMU6RoCG", "pwD7FU7Qfyz", "dgDPQhxqOcJ", "Gox5G2BIGBf"
)

# Read the raw organisation unit groups (hardcoded location under the workspace root)
ou_groups_path <- file.path(SNT_ROOT_PATH, "data/dhis2/extracts_raw/organisation_unit_groups/NER_organisation_unit_groups_raw.parquet")
ou_groups <- read_parquet(ou_groups_path)

# Expand the organisation_units column so that each row holds a single organisation unit
ou_groups_exploded <- ou_groups %>% unnest(cols = c(organisation_units))

# Keep the rows belonging to the selected groups, then pick the matching pyramid rows
in_priority_group <- ou_groups_exploded$id %in% liste_groupes_prioritaires
ou_selection <- ou_groups_exploded[in_priority_group, ]
priority_ou_ids <- unique(ou_selection$organisation_units)
group_prioritaires_table <- pyramid_df[pyramid_df$id %in% priority_ou_ids, ]

### Complete coordinates for missing facilities (Manual fix)

In [None]:
# Priority-list org units at level 3 that have no geometry (NA)
priority_rows_level_3 <- group_prioritaires_table[group_prioritaires_table$level == 3, ]
prioritaires_geo_na <- priority_rows_level_3 %>% filter(is.na(geometry))
dim(prioritaires_geo_na)
head(prioritaires_geo_na, 3)

In [None]:
# Build a GeoJSON Point string from a latitude / longitude pair.
# Arguments are given as (lat, lon); the coordinates are written as [lon, lat]
# with six decimal places (sprintf "%f").
make_point_geojson <- function(lat, lon) {
  geojson_template <- '{"type": "Point", "coordinates": [%f, %f]}'
  sprintf(geojson_template, lon, lat)
}

# Manually collected coordinates, stored as c(latitude, longitude) and keyed by org unit id
manual_coordinates <- list(
  xMqXanPgczy = c(14.212177799561589, 1.4625739941131144),  # Centre Sante Mère Enfant Tillaberi
  sgO4yBg59SJ = c(13.485271755127068, 7.143422105623865),   # HRM Maradi
  oHRvIBeR5xH = c(13.551421362165923, 2.116344191939423),   # Hopital Cure
  TVaP0vBLvat = c(13.509657990942971, 2.1473435456528174),  # Hopital ophtalmo Makka
  evMtQ7bLFYI = c(13.586255600670649, 2.0918749136394097),  # Hôpital Général de Référence
  u3xCSh4hG9Q = c(13.509793678687808, 2.147386518669057),   # Hôpital Ophtalmologique Makkah
  P1oyCQT39rj = c(13.535431049590938, 2.09186651126039)     # Hôpital de l'Amıtıé Nıger Turquıe
)

# Write a GeoJSON point into the geometry column of each listed org unit
for (ou_id in names(manual_coordinates)) {
  lat_lon <- manual_coordinates[[ou_id]]
  group_prioritaires_table[group_prioritaires_table$id == ou_id, ]$geometry <-
    make_point_geojson(lat_lon[1], lat_lon[2])
}

### Start building the re-arranged pyramid

In [None]:
# Start from an empty table with the pyramid's columns and stack levels 1 and 2 unchanged
new_pyramid <- pyramid_df[0, ]
level_1_rows <- pyramid_df[pyramid_df$level == 1, ]
level_2_rows <- pyramid_df[pyramid_df$level == 2, ]
new_pyramid_1 <- rbind(new_pyramid, level_1_rows)
new_pyramid_2 <- rbind(new_pyramid_1, level_2_rows)

# Level 3: only rows whose name contains "DS" (districts) are kept
is_district_row <- pyramid_df$level == 3 & grepl("DS", pyramid_df$name)
new_pyramid_3 <- rbind(new_pyramid_2, pyramid_df[is_district_row, ])

# Priority-list org units currently at level 3 (these are the ones being moved to level 6)
group_prioritaires_level_3 <- group_prioritaires_table[group_prioritaires_table$level == 3, ]

Prepare points and polygons to match

In [None]:
# Take all level 3 rows and make sure geometries are plain character (GeoJSON) strings
polygons_level3 <- pyramid_df[pyramid_df$level == 3, ]
polygons_level3$geometry <- as.character(polygons_level3$geometry)
group_prioritaires_level_3$geometry <- as.character(group_prioritaires_level_3$geometry)

# Keep only non-empty, non-NA geometries
polygons_level3 <- polygons_level3[!is.na(polygons_level3$geometry) & polygons_level3$geometry != "", ]
points_level_6 <- group_prioritaires_level_3[!is.na(group_prioritaires_level_3$geometry) & group_prioritaires_level_3$geometry != "", ]

# Disable S2 (assume planar coordinates).
# sf_use_s2(FALSE) switches the spherical geometry engine off; passing TRUE would enable it,
# which is the opposite of what is intended here.
sf_use_s2(FALSE)

# Convert to sf (GeoJSON strings parsed as geometries, CRS 4326) and repair invalid polygons
polygons_sf <- st_as_sf(polygons_level3, geometry = st_as_sfc(polygons_level3$geometry, GeoJSON = TRUE), crs = 4326)
points_sf <- st_as_sf(points_level_6, geometry = st_as_sfc(points_level_6$geometry, GeoJSON = TRUE), crs = 4326)
polygons_sf$geometry <- st_make_valid(polygons_sf$geometry) 

In [None]:
# Warn when the number of level 3 priority org units differs from the number of points converted to sf
n_priority_level_3 <- nrow(group_prioritaires_level_3)
n_points_converted <- nrow(points_sf)
if (n_priority_level_3 != n_points_converted) {
  log_msg(glue("Check whether all organisation units in the priority list have valid coordinates."), "warning")
}

Find the corresponding district (DS) at level 3 using the coordinates

In [None]:
# Dense logical matrix from st_within: one row per point, one column per polygon
inside_matrix <- st_within(points_sf, polygons_sf, sparse = FALSE)

# For every point, record the first containing polygon whose name starts with "DS";
# when there is none, the entry keeps the point name with NA polygon id and name.
point_polygon_dict <- list()
for (i in seq_len(nrow(points_sf))) {
  point_id <- points_sf$id[[i]]
  point_name <- points_sf$name[[i]]

  # Polygons containing this point, narrowed down to districts
  containing_polygons <- polygons_sf[which(inside_matrix[i, ]), ]
  district_matches <- containing_polygons[grepl("^DS", containing_polygons$name), ]

  # Guard clause: no district polygon contains the point
  if (nrow(district_matches) == 0) {
    point_polygon_dict[[point_id]] <- list(point_name = point_name, polygon_id = NA, polygon_name = NA)
    cat("Point:", point_id, "is not inside any district (DS) polygon\n")
    next
  }

  # Several matches are possible; the first one is used
  polygon_id <- district_matches$id[1]
  polygon_name <- district_matches$name[1]
  point_polygon_dict[[point_id]] <- list(
    point_name = point_name,
    polygon_id = polygon_id,
    polygon_name = polygon_name
  )
  print(glue("Point: {point_name} ({point_id}) is inside polygon: {polygon_name} ({polygon_id})"))
}


Set the facilities under the corresponding district 

In [None]:
# Seed the level 6 table (health facilities) with the priority org units found at level 3
new_level_6 <- rbind(pyramid_df[0, ], group_prioritaires_level_3)

# For each matched point: it becomes its own level 6 entry and its level 3 reference
# is replaced by the polygon recorded in point_polygon_dict (NA when no polygon was found)
for (point_id in names(point_polygon_dict)) {
  entry <- point_polygon_dict[[point_id]]
  print(glue("Setting {entry$point_name} ({point_id} ) to district : {entry$polygon_name} ({entry$polygon_id})"))

  is_target_row <- new_level_6$id == point_id
  new_level_6[is_target_row, ]$level_6_id <- point_id
  new_level_6[is_target_row, ]$level_6_name <- entry$point_name
  new_level_6[is_target_row, ]$level_3_id <- entry$polygon_id
  new_level_6[is_target_row, ]$level_3_name <- entry$polygon_name
}

# Every row in this table is now a level 6 org unit
new_level_6$level <- 6
dim(new_level_6)
head(new_level_6, 3)

### Fixes for level 4 facilities that got their parent moved to level 6

Add the level 4 to the `new_pyramid` and update level 4 parents

In [None]:
# Priority-list org units currently at level 4
group_prioritaires_level_4 <- group_prioritaires_table[group_prioritaires_table$level == 4, ]

# Level 4 rows that stay in place: everything at level 4 that is not in the priority list
stays_at_level_4 <- pyramid_df$level == 4 & !(pyramid_df$id %in% group_prioritaires_level_4$id)
new_pyramid_4 <- rbind(new_pyramid_3, pyramid_df[stays_at_level_4, ])

# Run check and fix
# NOTE(review): get_updated_children is defined outside this notebook; presumably it returns the
# level 4 org units whose level 3 parent was relocated, with updated references -- confirm
child_lvl_4 <- get_updated_children(
  new_level_table = new_level_6,
  group_table = group_prioritaires_level_4,
  level = 4,
  target_level = 6,
  parent_level = 3
)

In [None]:
# Report whether any level 4 facility had its parent reference changed
if (nrow(child_lvl_4) > 0) {
  log_msg(glue("{nrow(child_lvl_4)} facilities had a parent reference updated."))
} else {
  log_msg(glue("There are no facilities at level 4 that needed an update to their parent reference."))
}

# Move level 4 to level 6: the level 4 id/name become the level 6 id/name and the level 4 slots are emptied
group_prioritaires_level_4 <- group_prioritaires_level_4 %>%
  mutate(
    level = 6,
    level_6_id = level_4_id,
    level_6_name = level_4_name,
    level_4_id = NA,
    level_4_name = NA
  )

# Rows returned by the parent check replace their counterparts in the moved group
has_updated_parent <- group_prioritaires_level_4$id %in% child_lvl_4$id
group_prioritaires_level_4_clean <- group_prioritaires_level_4[!has_updated_parent, ]
group_prioritaires_level_4_updated <- rbind(group_prioritaires_level_4_clean, child_lvl_4)

# Append the relocated org units to the level 6 table
new_level_6 <- rbind(new_level_6, group_prioritaires_level_4_updated)

dim(new_level_6)
tail(new_level_6, 3)

### Run check for level 5

In [None]:
# Priority-list org units currently at level 5 (to be moved to level 6)
group_prioritaires_level_5 <- group_prioritaires_table[group_prioritaires_table$level == 5, ]

# Level 5 rows that stay in place: everything at level 5 that is not in the priority list
stays_at_level_5 <- pyramid_df$level == 5 & !(pyramid_df$id %in% group_prioritaires_level_5$id)
new_pyramid_5 <- rbind(new_pyramid_4, pyramid_df[stays_at_level_5, ])

# Run check and fix
# NOTE(review): get_updated_children is defined outside this notebook; presumably it returns the
# level 5 org units whose level 3 parent was relocated, with updated references -- confirm
child_lvl_5 <- get_updated_children(
  new_level_table = new_level_6,
  group_table = group_prioritaires_level_5,
  level = 5,
  target_level = 6,
  parent_level = 3
)

In [None]:
# Display the dimensions (rows, columns) of the pyramid built up to level 5
dim(new_pyramid_5)

In [None]:
# Report whether any level 5 facility had its parent reference changed
if (nrow(child_lvl_5) > 0) {
  log_msg(glue("{nrow(child_lvl_5)} facilities had a parent reference updated."))
} else {
  log_msg(glue("There are no facilities at level 5 that needed an update to their parent reference."))
}

# Move level 5 to level 6: the level 5 id/name become the level 6 id/name and the level 5 slots are emptied
group_prioritaires_level_5 <- group_prioritaires_level_5 %>%
  mutate(
    level = 6,
    level_6_id = level_5_id,
    level_6_name = level_5_name,
    level_5_id = NA,
    level_5_name = NA
  )

# Rows returned by the parent check replace their counterparts in the moved group
has_updated_parent <- group_prioritaires_level_5$id %in% child_lvl_5$id
group_prioritaires_level_5_clean <- group_prioritaires_level_5[!has_updated_parent, ]
group_prioritaires_level_5_updated <- rbind(group_prioritaires_level_5_clean, child_lvl_5)

# Append the relocated org units to the level 6 table
new_level_6 <- rbind(new_level_6, group_prioritaires_level_5_updated)

dim(new_level_6)
tail(new_level_6, 3)

### Select and add level 6 org units

In [None]:
# Priority-list org units that are already at level 6
group_prioritaires_level_6 <- group_prioritaires_table[group_prioritaires_table$level == 6, ]

# Add the level 6 org units that are NOT in the priority list to the new pyramid
# Comment this line if we want only to include the org units in the list
not_priority_level_6 <- pyramid_df$level == 6 & !(pyramid_df$id %in% group_prioritaires_level_6$id)
new_pyramid_6 <- rbind(new_pyramid_5, pyramid_df[not_priority_level_6, ])

# Add the priority level 6 org units to the level 6 table, keeping the first row for each id
new_level_6 <- rbind(new_level_6, group_prioritaires_level_6)
is_repeated_id <- duplicated(new_level_6$id)
new_level_6 <- new_level_6[!is_repeated_id, ]

dim(new_level_6)
tail(new_level_6, 3)

### Create final pyramid

In [None]:
# Final pyramid: the re-arranged levels followed by the rebuilt level 6 table
final_pyramid <- new_pyramid_6 %>% rbind(new_level_6)
dim(final_pyramid)
final_pyramid %>% tail(3)

In [None]:
# Print the number of org units found at each level of the original pyramid
original_levels <- unique(pyramid_df$level)
for (level in original_levels) {
  rows_at_level <- pyramid_df[pyramid_df$level == level, ]
  count <- nrow(rows_at_level)
  print(glue("Original pyramid OU at level {level}: {count}"))
}

In [None]:
# Print the number of org units found at each level of the re-arranged pyramid
final_levels <- unique(final_pyramid$level)
for (level in final_levels) {
  rows_at_level <- final_pyramid[final_pyramid$level == level, ]
  count <- nrow(rows_at_level)
  print(glue("New pyramid OU at level {level}: {count}"))
}

### Save output

In [None]:
# Log the destination, then write the final pyramid as parquet to OUTPUT_PATH
save_message <- glue::glue("NER organisation units transformation: Saving transformed organisation units under {OUTPUT_PATH}.")
log_msg(save_message)
output_file <- file.path(OUTPUT_PATH)
write_parquet(final_pyramid, output_file)