## Setup start 

In [None]:
# Project folder layout: every path below hangs off the SNT root
SNT_ROOT_PATH <- "~/workspace"
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")
FORMATTED_DATA_PATH <- file.path(
    SNT_ROOT_PATH, "data", "dhis2", "extracts_formatted"
)

# Source the project utility script
# (presumably provides install_and_load() and the other helpers used in this
# notebook - confirm in snt_utils.r)
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages this notebook relies on
required_packages <- c(
    "arrow", "dplyr", "tidyr", "stringr", "stringi",
    "jsonlite", "httr", "reticulate", "glue"
)
install_and_load(required_packages)

# Point reticulate at the conda interpreter before Python is first used,
# so that openhexa.sdk is imported from that environment
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
reticulate::py_config()$python
openhexa <- import("openhexa.sdk")

### Load SNT configuration


In [None]:
# Load the SNT configuration file; abort with an explicit message on failure
config_file <- file.path(CONFIG_PATH, "SNT_config.json")
config_json <- tryCatch(
    {
        fromJSON(config_file)
    },
    error = function(e) {
        # Separator added so the underlying error is readable in the message
        msg <- paste0("Error while loading configuration: ", conditionMessage(e))
        cat(msg)
        stop(msg)
    }
)

msg <- paste0("SNT configuration loaded from  : ", config_file)
log_msg(msg)

# Keep the country code and the configured administrative level columns
# (upper-cased) for the rest of the notebook
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)
ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)

### Load DHIS2 reporting rates data

- Load DHIS2 reporting data from the latest dataset version


In [None]:
# Identifier of the dataset holding the DHIS2 extracts (from the SNT config)
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS

# Raw reporting file expected for this country
reporting_file_name <- paste0(COUNTRY_CODE, "_dhis2_raw_reporting.parquet")

# Fetch the file from the dataset; on failure, print and re-raise with context
dhis2_data <- tryCatch(
    get_latest_dataset_file_in_memory(dataset_name, reporting_file_name),
    error = function(e) {
        msg <- paste("Error while loading DHIS2 reporting file for: ", COUNTRY_CODE, conditionMessage(e))
        cat(msg)
        stop(msg)
    }
)

# Log where the data came from together with its dimensions
data_dims <- paste(dim(dhis2_data), collapse = ", ")
msg <- glue("DHIS2 reporting data loaded from dataset : {dataset_name}, dataframe dimensions: {data_dims}")
log_msg(msg)
head(dhis2_data, 3)

## Reporting rates dataset formatting

### Format (clean) pyramid and dataset names

In [None]:
# Name columns are those whose header contains "_NAME"
name_columns <- grep("_NAME", colnames(dhis2_data), value = TRUE)

# Work on a copy so the raw extract is left untouched
dhis2_data_clean <- dhis2_data
for (name_col in name_columns) {
    print(paste0("Format : ", name_col))
    # Clean the strings of this column with the project helper
    dhis2_data_clean[[name_col]] <- format_names(dhis2_data_clean[[name_col]])
}

# Replace the headers with the output of clean_column_names()
# (presumably upper-cases them - confirm in snt_utils.r)
colnames(dhis2_data_clean) <- clean_column_names(dhis2_data_clean)

### Column selection

In [None]:
# Administrative columns configured for this country.
# The ID column name is derived from the NAME column by swapping the suffix.
adm_1_id_col <- gsub("_NAME", "_ID", ADMIN_1)
adm_1_name_col <- ADMIN_1
adm_2_id_col <- gsub("_NAME", "_ID", ADMIN_2)
adm_2_name_col <- ADMIN_2

# Org-unit columns for the analytics level set in the configuration
ou_level <- config_json$SNT_CONFIG$ANALYTICS_ORG_UNITS_LEVEL
adm_ou_id_col <- glue("LEVEL_{ou_level}_ID")
adm_ou_name_col <- glue("LEVEL_{ou_level}_NAME")

# Administrative columns list
admin_columns <- c(
    adm_1_id_col,
    adm_1_name_col,
    adm_2_id_col,
    adm_2_name_col,
    adm_ou_id_col,
    adm_ou_name_col
)

# Select relevant columns for SNT
fixed_cols <- c("PE", "VALUE", "PRODUCT_UID", "PRODUCT_NAME", "PRODUCT_METRIC")

# unique(): the analytics org-unit level may be the same level as ADMIN_1 or
# ADMIN_2, in which case the same column would otherwise be selected twice
selected_cols <- unique(c(fixed_cols, admin_columns))

# Fail early with the list of absent columns instead of a generic
# "undefined columns selected" error
missing_cols <- setdiff(selected_cols, colnames(dhis2_data_clean))
if (length(missing_cols) > 0) {
    msg <- paste0(
        "Missing expected column(s) in DHIS2 reporting data: ",
        paste(missing_cols, collapse = ", ")
    )
    cat(msg)
    stop(msg)
}

dhis2_data_selection <- dhis2_data_clean[selected_cols]

print(dim(dhis2_data_selection))
head(dhis2_data_selection, 3)

## Format SNT reporting data

### SNT format 

In [None]:
# Mapping from SNT output names (vector names) to source columns (values)
snt_column_map <- c(
    PERIOD = "PE",
    YEAR = "YEAR",
    MONTH = "MONTH",
    ADM1_NAME = adm_1_name_col,
    ADM1_ID = adm_1_id_col,
    ADM2_NAME = adm_2_name_col,
    ADM2_ID = adm_2_id_col,
    OU_ID = adm_ou_id_col,
    OU_NAME = adm_ou_name_col
)

# Convert period and value to numeric, derive YEAR / MONTH from the first
# four and next two characters of the period, then select and rename
dhis2_data_formatted <- dhis2_data_selection %>%
    mutate(
        PE = as.numeric(PE),
        YEAR = as.numeric(substr(PE, 1, 4)),
        MONTH = as.numeric(substr(PE, 5, 6)),
        VALUE = as.numeric(VALUE)
    ) %>%
    select(
        all_of(snt_column_map),
        all_of(fixed_cols)
    )

# Reorder the rows by ascending period
period_order <- order(as.numeric(dhis2_data_formatted[["PERIOD"]]))
dhis2_data_formatted <- dhis2_data_formatted[period_order, ]

print(dim(dhis2_data_formatted))
head(dhis2_data_formatted, 3)

## Output formatted reporting data

In [None]:
# Output locations for the formatted reporting data (built once, reused below)
output_parquet <- file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_reporting.parquet"))
output_csv <- file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_reporting.csv"))
out_msg <- paste0("Formatted reporting data saved under: ", output_parquet)

# Make sure the output folder exists so the writers do not fail on a fresh
# workspace; this is a no-op when the folder is already there
dir.create(FORMATTED_DATA_PATH, recursive = TRUE, showWarnings = FALSE)

# write parquet file
write_parquet(dhis2_data_formatted, output_parquet)

# write csv file
write.csv(dhis2_data_formatted, output_csv, row.names = FALSE)

# log (only after both files were written)
log_msg(out_msg)

### Data Summary 

In [None]:
# Per-column summary of the formatted reporting data
data_summary <- summary(dhis2_data_formatted)
print(data_summary)