## Setup start 

In [None]:
# Parameters

# Root folder of the SNT workspace; the project paths are built from it.
SNT_ROOT_PATH <- "~/workspace"

In [None]:
# Project folders, all resolved relative to the SNT root:
#   CODE_PATH            shared R code
#   CONFIG_PATH          SNT configuration file(s)
#   FORMATTED_DATA_PATH  formatted DHIS2 outputs written by this notebook
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")
FORMATTED_DATA_PATH <- file.path(SNT_ROOT_PATH, "data", "dhis2", "formatted")

**Load functions**

In [None]:
# Load the shared helper functions from snt_utils.r in the code folder
# (format_names() is one of them; the other helpers called in this notebook
# are presumably defined there too -- verify against snt_utils.r).
source(file.path(CODE_PATH, "snt_utils.r"))

**Check and load required libraries**  

In [None]:
# Packages passed to install_and_load().
# TODO: check which of these libraries are really required.
required_packages <- c(
    "arrow",
    "dplyr",
    "tidyr",
    "stringr",
    "stringi",
    "jsonlite",
    "httr",
    "reticulate"
)

# Hand the list to the helper
install_and_load(required_packages)

In [None]:
# Point reticulate at the conda interpreter so that openhexa.sdk is loaded
# from the right environment
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python # display the interpreter that was picked up
# Namespace-qualified, like py_config() above, so the call does not depend on
# reticulate having been attached by install_and_load()
openhexa <- reticulate::import("openhexa.sdk")

### Load SNT configuration


In [None]:
# Load the SNT configuration (JSON) from the configuration folder.
# Any read/parse failure is echoed with cat() and then re-raised with stop(),
# so the notebook halts here instead of continuing without a configuration.
config_file <- file.path(CONFIG_PATH, "SNT_config.json")
config_json <- tryCatch({
        fromJSON(config_file)
    },
    error = function(e) {
        # ": " separator added so the underlying error text is not glued
        # directly onto the prefix
        msg <- paste0("Error while loading configuration: ", conditionMessage(e))
        cat(msg)
        stop(msg)
    })

# print(config_json$SNT_CONFIG)
msg <- paste0("SNT configuration loaded from  : ", config_file)
log_msg(msg)

**Checks for SNT mandatory configuration fields**

In [None]:
# Verify that every mandatory SNT configuration field is present.
# Each field is printed as it is checked; the first missing one stops the run.
snt_config_mandatory <- c("COUNTRY_CODE", "DHIS2_ADMINISTRATION_1", "DHIS2_ADMINISTRATION_2") # , "ORG_UNITS_LEVELS_SELECTION")
snt_settings <- config_json$SNT_CONFIG
for (field in snt_config_mandatory) {
    print(paste(field, ":", snt_settings[field]))
    field_is_missing <- is.null(snt_settings[[field]])
    if (field_is_missing) {
        msg <- paste("Missing configuration input:", field)
        cat(msg)
        stop(msg)
    }
}

# Keep the country code and the (upper-cased) administrative level column names
COUNTRY_CODE <- snt_settings$COUNTRY_CODE
ADMIN_1 <- toupper(snt_settings$DHIS2_ADMINISTRATION_1)
ADMIN_2 <- toupper(snt_settings$DHIS2_ADMINISTRATION_2)

### Load DHIS2 population data

- Load the DHIS2 population data from the latest dataset version


In [None]:
# Identifier of the DHIS2 extracts dataset, read from the SNT configuration
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS

# Fetch the country's raw population parquet file from that dataset.
# A failure is echoed with cat() and re-raised with stop().
population_file <- paste0(COUNTRY_CODE, "_dhis2_raw_population.parquet")
dhis2_data <- tryCatch(
    get_latest_dataset_file_in_memory(dataset_name, population_file),
    error = function(e) {
        msg <- paste("Error while loading DHIS2 population file for: ", COUNTRY_CODE, conditionMessage(e))
        cat(msg)
        stop(msg)
    }
)

# Log the source dataset and the size of what was loaded
loaded_dims <- paste(dim(dhis2_data), collapse = ", ")
msg <- paste0("DHIS2 population data loaded from dataset : ", dataset_name, " dataframe dimensions: ", loaded_dims)
log_msg(msg)

## SNT total population computation

In [None]:
# log
# msg <- paste0("Formatting total population for SNT.")
# log_msg(msg)

In [None]:
# Administrative name columns come from the configuration; the matching ID
# columns are derived by swapping the "_NAME" suffix for "_ID"
adm_1_name_col <- ADMIN_1
adm_2_name_col <- ADMIN_2
adm_1_id_col <- gsub("_NAME", "_ID", adm_1_name_col)
adm_2_id_col <- gsub("_NAME", "_ID", adm_2_name_col)

# Administrative columns, in the order ID/name for level 1 then level 2
admin_columns <- c(adm_1_id_col, adm_1_name_col, adm_2_id_col, adm_2_name_col)

# Sum the population values per data element (DX), category option (CO),
# period (PE) and administrative unit; NA values are ignored in the sum
grouping_columns <- c("DX", "CO", "PE", admin_columns)
df_aggregated <- dhis2_data %>%
    mutate(VALUE = as.numeric(VALUE)) %>%
    group_by(!!!syms(grouping_columns)) %>%
    summarise(POPULATION = sum(VALUE, na.rm = TRUE), .groups = "drop")

# Clean the admin 1 and admin 2 name strings (format_names() in snt_utils.r)
df_aggregated[[adm_1_name_col]] <- format_names(df_aggregated[[adm_1_name_col]])
df_aggregated[[adm_2_name_col]] <- format_names(df_aggregated[[adm_2_name_col]])

### Pivot dhis2 population table

In [None]:
# Pivot to wide format: one row per period and administrative unit, one
# column per available DX/CO combination holding the summed population
population_data <- df_aggregated %>%
    pivot_wider(
        id_cols = all_of(c("PE", admin_columns)),
        names_from = c("DX", "CO"),
        values_from = "POPULATION"
    )

pivot_dims <- paste0(dim(population_data), collapse = ", ")
print(paste("Population data pivot : ", pivot_dims))

### Build indicator definitions

In [None]:
# Get list of indicator definitions from SNT configuration
pop_indicator_definitions <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_INDICATOR_DEFINITIONS
# NOTE(review): this assumes the configuration holds exactly one population
# definition -- confirm; it is always exposed as POPULATION
names(pop_indicator_definitions) <- "POPULATION" # we call it POPULATION

# Columns produced by the pivot, captured once so that indicator columns
# added inside the loop can never be matched as a source column
pivoted_columns <- colnames(population_data)

# Build one column per indicator definition:
#   - "DX.CO" / "DX_CO" entries select that exact pivoted column,
#   - bare "DX" entries select every pivoted column containing that identifier,
#   - the selected columns are summed row-wise (NA when all of them are NA).
for (indicator in names(pop_indicator_definitions)) {

    data_element_uids <- pop_indicator_definitions[[indicator]]

    # No data element configured: keep an empty (NA) indicator column
    if (length(data_element_uids) == 0) {
        population_data[indicator] <- NA

        # logs
        msg <- paste0("Building indicator : ", indicator, " -> column selection : NULL")
        log_msg(msg)
        next
    }

    # Resolve each configured entry to pivoted column names.
    # unique() prevents a column from being counted twice when it is matched
    # both explicitly (DX.CO) and through its data element (DX).
    col_names <- unique(unlist(lapply(data_element_uids, function(dx) {
        dx_co <- gsub("\\.", "_", dx)
        if (grepl("_", dx_co)) {
            # Use the underscore form: that is how the pivoted columns are
            # named (the dotted form from the configuration never matches)
            dx_co
        } else {
            # Identifiers are literal strings, not regular expressions
            pivoted_columns[grepl(dx, pivoted_columns, fixed = TRUE)]
        }
    })))

    # logs
    msg <- paste0("Building indicator : ", indicator, " -> column selection : ", paste(col_names, collapse = ", "))
    log_msg(msg)

    # Fail with an explicit message when a configured DX/CO combination is
    # not present in the data (instead of an obscure subsetting error)
    missing_cols <- setdiff(col_names, pivoted_columns)
    if (length(missing_cols) > 0) {
        msg <- paste0(
            "Columns not found in population data for indicator ", indicator,
            " : ", paste(missing_cols, collapse = ", ")
        )
        cat(msg)
        stop(msg)
    }

    if (length(col_names) == 0) {
        # Nothing in the data matches the definition: empty indicator
        population_data[indicator] <- NA
    } else if (length(col_names) == 1) {
        population_data[[indicator]] <- population_data[[col_names]]
    } else {
        selected <- population_data[, col_names]
        sums <- rowSums(selected, na.rm = TRUE)
        all_na <- rowSums(!is.na(selected)) == 0
        sums[all_na] <- NA # keep NA when every selected column is NA
        population_data[[indicator]] <- sums
    }
}

In [None]:
# Preview the first rows of the pivoted table, now including the indicator column(s)
head(population_data)

## Format SNT population data

### SNT format 

In [None]:
# Keep only the period, the administrative columns and the indicator column(s)
indicator_names <- names(pop_indicator_definitions)
population_data_selection <- population_data[, c("PE", admin_columns, indicator_names)]

# SNT output column names (left) mapped onto the source columns (right);
# the order of this vector is the output column order
snt_column_map <- c(
    YEAR = "PE",
    ADM1 = adm_1_name_col,
    ADM1_ID = adm_1_id_col,
    ADM2 = adm_2_name_col,
    ADM2_ID = adm_2_id_col
)

# Select and rename in one step, indicators last
population_data_formatted <- population_data_selection %>%
    select(all_of(snt_column_map), all_of(indicator_names))

# Normalise the column names with clean_column_names()
# (per the original note this upper-cases them)
colnames(population_data_formatted) <- clean_column_names(population_data_formatted)

# Sort rows by period, compared numerically
year_order <- order(as.numeric(population_data_formatted$YEAR))
population_data_formatted <- population_data_formatted[year_order, ]
print(dim(population_data_formatted))

head(population_data_formatted)

### Output formatted population data

In [None]:
# Output files for the formatted population table (parquet + csv)
population_parquet_file <- file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_population.parquet"))
population_csv_file <- file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_population.csv"))

# Message logged by the next cell once the files have been written
out_msg <- paste0("Population data saved under: ", population_parquet_file)

# Make sure the output folder exists so the writes do not fail on a fresh
# workspace; this is a no-op when the folder is already there
dir.create(FORMATTED_DATA_PATH, recursive = TRUE, showWarnings = FALSE)

# write parquet file
write_parquet(population_data_formatted, population_parquet_file)

# write csv file
write.csv(population_data_formatted, population_csv_file, row.names = FALSE)

In [None]:
# Log the "saved under" message (out_msg) prepared in the output cell
log_msg(out_msg)

### Data Summary 

In [None]:
# Data summary: print summary() of every column of the formatted population table
print(summary(population_data_formatted))