## Setup start 

In [None]:
# Parameters
# SNT_ROOT_PATH   <- '~/workspace'   # SNT root 

In [None]:
# Derive the project folders from the SNT root
# (SNT_ROOT_PATH is expected to be defined before this cell runs)
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")
FORMATTED_DATA_PATH <- file.path(
    SNT_ROOT_PATH, "data", "dhis2", "extracts_formatted"
)

**Load functions**

In [None]:
source(file.path(CODE_PATH, "snt_utils.r"))

**Check and load required libraries**  

In [None]:
# Packages needed by this notebook
# TODO: trim this list down to the libraries that are really required
required_packages <- c(
    "arrow", "dplyr", "tidyr", "stringr",
    "stringi", "jsonlite", "httr", "reticulate"
)

# Hand the list to install_and_load() (presumably defined in the sourced snt_utils.r)
install_and_load(required_packages)

In [None]:
# Point reticulate at the conda interpreter so that openhexa.sdk is
# imported from the right Python environment
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")

# Display the Python binary reticulate resolved
reticulate::py_config()$python

# Handle on the OpenHEXA SDK Python module
openhexa <- reticulate::import("openhexa.sdk")

### Load SNT configuration


In [None]:
# Load the SNT configuration (SNT_config.json under CONFIG_PATH).
# On failure the message is echoed with cat() and the notebook is aborted.
config_json <- tryCatch(
    {
        fromJSON(file.path(CONFIG_PATH, "SNT_config.json"))
    },
    error = function(e) {
        # ": " keeps the prefix and the underlying error text from running together
        msg <- paste0("Error while loading configuration: ", conditionMessage(e))
        cat(msg)
        stop(msg)
    }
)

# print(config_json$SNT_CONFIG)
msg <- paste0("SNT configuration loaded from  : ", file.path(CONFIG_PATH, "SNT_config.json"))
log_msg(msg)

# Keep the configuration values used by the rest of the notebook:
# - COUNTRY_CODE prefixes the input and output file names
# - ADMIN_1 / ADMIN_2 are upper-cased and later used as pyramid column names
# - extracts_dataset_id identifies the dataset holding the raw DHIS2 extracts
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)
ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)
extracts_dataset_id <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS

### Load DHIS2 organisation units data

- Load DHIS2 organisation units from the latest dataset version


In [None]:
# Fetch the raw pyramid parquet for this country from the extracts dataset.
# Any failure is echoed with cat() and then re-raised with the same text.
dhis2_pyramid_data <- tryCatch(
    {
        get_latest_dataset_file_in_memory(
            extracts_dataset_id,
            paste0(COUNTRY_CODE, "_dhis2_raw_pyramid.parquet")
        )
    },
    error = function(e) {
        failure_text <- paste("Error while loading DHIS2 organisation units file for: ", COUNTRY_CODE, conditionMessage(e))
        cat(failure_text)
        stop(failure_text)
    }
)

# Log where the data came from together with its dimensions
msg <- paste0(
    "DHIS2 organisation units data loaded from dataset : ", extracts_dataset_id,
    " dataframe dimensions: ", paste(dim(dhis2_pyramid_data), collapse = ", ")
)
log_msg(msg)

# Quick look at the loaded pyramid
print(dim(dhis2_pyramid_data))
head(dhis2_pyramid_data, 3)

In [None]:
# The name columns come straight from the configuration; the matching ID
# columns are derived by replacing "_NAME" with "_ID"
adm_1_name_col <- ADMIN_1
adm_2_name_col <- ADMIN_2
adm_1_id_col <- gsub("_NAME", "_ID", adm_1_name_col)
adm_2_id_col <- gsub("_NAME", "_ID", adm_2_name_col)

# Keep only the admin 1 / admin 2 name and ID columns of the pyramid,
# one row per distinct combination
dhis2_pyramid_selection <- dhis2_pyramid_data %>%
    select(all_of(c(adm_1_name_col, adm_1_id_col, adm_2_name_col, adm_2_id_col))) %>%
    distinct()

print(dim(dhis2_pyramid_selection))
head(dhis2_pyramid_selection)

### Load DHIS2 population data

- Load DHIS2 population from the latest dataset version


In [None]:
# Fetch the raw population parquet for this country from the extracts dataset.
# Any failure is echoed with cat() and then re-raised with the same text.
dhis2_data <- tryCatch(
    {
        get_latest_dataset_file_in_memory(
            extracts_dataset_id,
            paste0(COUNTRY_CODE, "_dhis2_raw_population.parquet")
        )
    },
    error = function(e) {
        failure_text <- paste("Error while loading DHIS2 population file for: ", COUNTRY_CODE, conditionMessage(e))
        cat(failure_text)
        stop(failure_text)
    }
)

# Log where the data came from together with its dimensions
msg <- paste0(
    "DHIS2 population data loaded from dataset : ", extracts_dataset_id,
    " dataframe dimensions: ", paste(dim(dhis2_data), collapse = ", ")
)
log_msg(msg)

In [None]:
# Show the dimensions of the population extract
dim(dhis2_data)
# List the distinct DX values present in the extract
unique(dhis2_data$DX)

## SNT total population aggregation

**Build Population indicators based on definitions**

In [None]:
# Build the population template: every distinct year found in the PE column
# of the population extract, combined with every selected pyramid row
pop_template <- tidyr::crossing(
    YEAR = unique(as.integer(dhis2_data$PE)),
    dhis2_pyramid_selection
)

# Quick look at the template
print(dim(pop_template))
head(pop_template, 3)

In [None]:
# Population indicator definitions from the SNT configuration.
# Each entry is named after the indicator and exposes the DX ids to sum in `ids`.
indicators <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[["POPULATION_INDICATORS"]]

# Add one column per indicator to the population template
for (ind_name in names(indicators)) {

    log_msg(glue::glue("Building DHIS2 population indicator: {ind_name}."))

    # Keep only the rows whose DX belongs to this indicator
    indicator_data <- dhis2_data[dhis2_data$DX %in% indicators[[ind_name]]$ids, ]

    # Coerce types BEFORE grouping, so the grouping key (PE) is not modified
    # on an already grouped table, then sum VALUE per period and org unit
    # (several DX / CO rows may contribute to the same PE-OU pair).
    indicator_aggregated <- indicator_data %>%
        mutate(
            VALUE = as.integer(as.numeric(VALUE)),
            PE = as.numeric(PE)
        ) %>%
        group_by(PE, OU) %>%
        summarise(VALUE = sum(VALUE, na.rm = TRUE), .groups = "drop") %>%
        # OU is renamed to the admin 2 ID column so it can be joined on the template
        rename(
            !!ind_name := VALUE,
            !!adm_2_id_col := OU,
            YEAR = PE
        )

    # Combine with the previously built indicators; template rows without
    # data for this indicator get NA
    pop_template <- left_join(pop_template, indicator_aggregated, by = c("YEAR", adm_2_id_col))
}

# Sort by year, then admin 1 name, then admin 2 name
population_table <- pop_template %>%
    arrange(YEAR, !!sym(adm_1_name_col), !!sym(adm_2_name_col))


In [None]:
# Show the dimensions and the first rows of the aggregated population table
print(dim(population_table))
head(population_table, 3)

## Format SNT population data

### SNT format 

Apply standard SNT formatting for the final table

In [None]:
# Normalise the admin 1 and admin 2 name columns with format_names() (from snt_utils.r)
population_table[[ADMIN_1]] <- format_names(population_table[[ADMIN_1]])
population_table[[ADMIN_2]] <- format_names(population_table[[ADMIN_2]])

# Map the configured admin columns onto the standard SNT column names
population_table_formatted <- rename(
    population_table,
    all_of(c(
        ADM1_NAME = adm_1_name_col,
        ADM1_ID = adm_1_id_col,
        ADM2_NAME = adm_2_name_col,
        ADM2_ID = adm_2_id_col
    ))
)

print(dim(population_table_formatted))
head(population_table_formatted)

### Output formatted population data

In [None]:
# Write the formatted table as a parquet file
write_parquet(
    population_table_formatted,
    file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_population.parquet"))
)

# Write the same table as a CSV file, without row names
write.csv(
    population_table_formatted,
    file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_population.csv")),
    row.names = FALSE
)

# Log the location of the CSV output
out_msg <- paste0(
    "Population data saved under: ",
    file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_population.csv"))
)
log_msg(out_msg)

### Data Summary 

In [None]:
# Data summary: print per-column summary statistics of the formatted population table
print(summary(population_table_formatted))