## Setup start 

In [None]:
# Parameters

# SNT_ROOT_PATH   <- '~/workspace'   # SNT root

In [None]:
# Derive the project sub-folders from the SNT root directory
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")  # folder the helper scripts are sourced from
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")  # folder holding SNT_config.json
FORMATTED_DATA_PATH <- file.path(
    SNT_ROOT_PATH, "data", "dhis2_formatted"
)  # folder the formatted routine outputs are written to

**Load functions**

In [None]:
# Load the shared SNT helper functions from the code folder.
# Helpers used further down (e.g. log_msg(), install_and_load(), format_names())
# are presumably defined in this file -- confirm against snt_utils.r.
source(file.path(CODE_PATH, "snt_utils.r"))
# Second helper file, currently not loaded:
# source(file.path(CODE_PATH, "snt_functions.r"))

**Check and load required libraries**  

In [None]:
# Packages needed by this notebook
# TODO: check which of these libraries are really required
required_packages <- c(
    "lubridate", "zoo", "arrow", "dplyr", "tidyr",
    "stringr", "stringi", "jsonlite", "httr", "reticulate"
)

# Hand the list to the project helper (presumably installs what is missing
# and attaches everything -- confirm in the helper's definition)
install_and_load(required_packages)

In [None]:
# Point reticulate at the conda Python so that openhexa.sdk is loaded from the right environment
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
# Display the Python binary that reticulate resolved
reticulate::py_config()[["python"]]
# Handle on the OpenHEXA Python SDK
openhexa <- reticulate::import("openhexa.sdk")

### Load SNT configuration


In [None]:
# Load the SNT configuration (JSON) from the configuration folder.
# Any read/parse failure is printed and re-raised with an explicit message.
config_file_path <- file.path(CONFIG_PATH, "SNT_config.json")
config_json <- tryCatch(
    {
        fromJSON(config_file_path)
    },
    error = function(e) {
        # ": " keeps the fixed text and the underlying error message apart
        msg <- paste0("Error while loading configuration: ", conditionMessage(e))
        cat(msg)
        stop(msg)
    }
)

# print(config_json$SNT_CONFIG)
msg <- paste0("SNT configuration loaded from  : ", config_file_path)
log_msg(msg)

**Checks for SNT mandatory configuration fields**

In [None]:
# CHECK SNT configuration
# Every mandatory field must be present AND hold a usable value: a field that is
# NULL, empty, NA or a blank string is reported as missing and stops the notebook.
snt_config_mandatory <- c("COUNTRY_CODE", "DHIS2_ADMINISTRATION_1", "DHIS2_ADMINISTRATION_2") #, "ORG_UNITS_LEVELS_SELECTION")
for (conf in snt_config_mandatory) {
    print(paste(conf, ":", config_json$SNT_CONFIG[conf]))
    conf_value <- config_json$SNT_CONFIG[[conf]]
    # length 0 covers NULL; the second test covers NA and ""/whitespace-only values
    conf_is_empty <- length(conf_value) == 0 ||
        all(is.na(conf_value) | !nzchar(trimws(as.character(conf_value))))
    if (conf_is_empty) {
        msg <- paste("Missing configuration input:", conf)
        cat(msg)
        stop(msg)
    }
}

# Keep the validated settings in variables used by the rest of the notebook.
# The two administration settings are upper-cased because they are used as
# column names of the DHIS2 metadata further down.
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)
ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)

### Load DHIS2 analytics data

- Load DHIS2 analytics from the latest dataset version


In [None]:
# Identifier of the dataset that holds the DHIS2 extracts (read from the SNT configuration)
dataset_name <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_EXTRACTS

# Fetch the raw analytics parquet file of this country from the dataset;
# on failure the error is printed and re-raised with context
dhis2_data <- tryCatch(
    {
        get_latest_dataset_file_in_memory(
            dataset_name,
            paste0(COUNTRY_CODE, "_dhis2_raw_analytics.parquet")
        )
    },
    error = function(e) {
        msg <- paste("Error while loading DHIS2 analytics file for: ", COUNTRY_CODE, conditionMessage(e))
        cat(msg)
        stop(msg)
    }
)

# Log the source dataset and the size of what was loaded
msg <- paste0(
    "DHIS2 analytics data loaded from dataset : ", dataset_name,
    " dataframe dimensions: ", toString(dim(dhis2_data))
)
log_msg(msg)

## SNT Indicators computation

### Select dhis2 metadata  

In [None]:
# Announce the start of the indicator computation in the logs
msg <- "Computing SNT indicators."
log_msg(msg)

In [None]:
# Metadata table: the OU column plus every column whose name contains "LEVEL_",
# reduced to distinct rows (much smaller than the full extract)
administrative_cols <- grep("LEVEL_", colnames(dhis2_data), value = TRUE)
dhis2_metadata <- distinct(dhis2_data[, c("OU", administrative_cols)])
dim(dhis2_metadata)

In [None]:
# Deepest administrative level available in the data
# (presumably the level that matches the OU column -- confirm).
# The pattern is anchored so that only columns named exactly LEVEL_<n>_NAME count.
name_cols <- grep("^LEVEL_\\d+_NAME$", administrative_cols, value = TRUE)
if (length(name_cols) == 0) {
    # Without this guard max() would return -Inf and build "LEVEL_-Inf_NAME"
    msg <- "No LEVEL_<n>_NAME column found in the DHIS2 analytics data."
    cat(msg)
    stop(msg)
}
max_level <- max(as.numeric(gsub("^LEVEL_(\\d+)_NAME$", "\\1", name_cols)))
max_admin_col_name <- paste0("LEVEL_", max_level, "_NAME")

# Result
print(max_admin_col_name)

In [None]:
# Clean strings for admin 1 and admin 2
# ADMIN_1 / ADMIN_2 hold the names of the metadata columns to clean (taken from
# the SNT configuration); each column is replaced by its formatted version.
# NOTE(review): the exact cleaning rules live in format_names() -- see snt_utils.r.
dhis2_metadata[[ADMIN_1]] <- format_names(dhis2_metadata[[ADMIN_1]]) # (format_names() in snt_utils.r)
dhis2_metadata[[ADMIN_2]] <- format_names(dhis2_metadata[[ADMIN_2]])

### Select dhis2 values data  

In [None]:
# dhis2 Values table
# Keep only the five columns used by the pivot below: DX and CO become the new
# column names, OU and PE identify a row, VALUE fills the cells.
dhis2_values <- dhis2_data[ , c("DX", "CO", "OU", "PE", "VALUE")]
# Preview the first rows
head(dhis2_values)

### Pivot dhis2 value table

In [None]:
# Coerce the VALUE column to numeric before reshaping
dhis2_values[["VALUE"]] <- as.numeric(dhis2_values[["VALUE"]])

# Wide table: one row per (OU, PE), one column per available DX / CO combination
routine_data <- dhis2_values %>%
    pivot_wider(
        id_cols = all_of(c("OU", "PE")),
        names_from = c("DX", "CO"),
        values_from = "VALUE"
    )

print(paste("Routine data pivot : ", toString(dim(routine_data))))

### Build indicator definitions

In [None]:
# Work on a copy so the pivoted table itself is left unchanged
routine_data_ind <- routine_data

# Get list of indicator definitions from SNT configuration:
# indicator name -> data element ids, each written either as "DX" (all of its
# category option combos) or as "DX.CO" (one specific combination).
dhis_indicator_definitions <- config_json$DHIS2_DATA$DHIS2_INDICATOR_DEFINITIONS
if (is.null(dhis_indicator_definitions)) {
    msg <- "Missing configuration input: DHIS2_INDICATOR_DEFINITIONS"
    cat(msg)
    stop(msg)
}
names(dhis_indicator_definitions) <- toupper(names(dhis_indicator_definitions))

# Value columns created by the pivot (named "DX_CO"). Definitions are resolved
# against these only, so an indicator column added by an earlier iteration can
# never be picked up by a later definition.
pivot_value_cols <- setdiff(colnames(routine_data), c("OU", "PE"))

# loop over the definitions
for (indicator in names(dhis_indicator_definitions)) {

    data_element_uids <- dhis_indicator_definitions[[indicator]]
    col_names <- character(0)

    for (dx in data_element_uids) {
        dx_co <- gsub("\\.", "_", dx)
        if (grepl("_", dx_co)) {
            # Explicit DX.CO pair -> exactly one pivot column
            col_names <- c(col_names, dx_co)
        } else {
            # Data element only -> every pivot column containing this id
            # (fixed = TRUE: the id is matched literally, not as a regex)
            col_names <- c(col_names, pivot_value_cols[grepl(dx, pivot_value_cols, fixed = TRUE)])
        }
    }
    # A column listed twice must not be counted twice in the sum
    col_names <- unique(col_names)

    # Explicitly requested columns that are absent from the data are reported and skipped
    missing_cols <- setdiff(col_names, pivot_value_cols)
    if (length(missing_cols) > 0) {
        msg <- paste0("Building indicator : ", indicator, " -> columns not found in routine data (ignored) : ", paste(missing_cols, collapse = ", "))
        log_msg(msg)
        col_names <- setdiff(col_names, missing_cols)
    }

    if (length(col_names) > 0) {
        # logs
        msg <- paste0("Building indicator : ", indicator, " -> column selection : ", paste(col_names, collapse = ", "))
        log_msg(msg)

        # Row-wise sum over the selected columns (works for one or many columns)
        selected_values <- routine_data[, col_names, drop = FALSE]
        sums <- rowSums(selected_values, na.rm = TRUE)
        all_na <- rowSums(!is.na(selected_values)) == 0
        sums[all_na] <- NA  # Keep NA when every selected value of the row is NA
        routine_data_ind[[indicator]] <- unname(sums)
    } else {
        # Nothing to aggregate: numeric NA column, same type as the computed indicators
        routine_data_ind[[indicator]] <- rep(NA_real_, nrow(routine_data_ind))

        # logs
        msg <- paste0("Building indicator : ", indicator, " -> column selection : NULL")
        if (length(data_element_uids) > 0) {
            msg <- paste0(msg, " (no matching column found in routine data)")
        }
        log_msg(msg)
    }
}

In [None]:
# Dimensions of the pivoted table (the one without the indicator columns).
# NOTE(review): routine_data_ind (pivot + indicators) may have been intended here -- confirm.
dim(routine_data)

In [None]:
# # Manual check
# my_string <- "nRm30I4w9En_yI0WfOFcgSc nRm30I4w9En_brxxCYkQqcd nRm30I4w9En_r5lWfJh2t2l nRm30I4w9En_xxMINnPGqUg nRm30I4w9En_xCV9NGB897u MALTREAT"
# head(routine_data[, c("ou", "pe", strsplit(my_string, " ")[[1]])])

## Format SNT routine data

### SNT format 

In [None]:
# Keep the row identifiers and the computed indicator columns only
routine_data_selection <- routine_data_ind[, c("OU", "PE", names(dhis_indicator_definitions))]

# Attach the org unit metadata (left join on OU: every routine row is kept)
routine_data_merged <- merge(
    x = routine_data_selection,
    y = dhis2_metadata,
    by = "OU",
    all.x = TRUE
)

# Administrative columns: the name columns come from the configuration,
# the id columns are derived by swapping the "_NAME" suffix for "_ID"
adm_1_name_col <- ADMIN_1
adm_2_name_col <- ADMIN_2
adm_1_id_col <- gsub("_NAME", "_ID", adm_1_name_col)
adm_2_id_col <- gsub("_NAME", "_ID", adm_2_name_col)

# Split the period into YEAR (characters 1-4) and MONTH (characters 5-6),
# then pick and rename the columns in the output order
routine_data_formatted <- select(
    mutate(
        routine_data_merged,
        YEAR = substr(PE, 1, 4),
        MONTH = substr(PE, 5, 6)
    ),
    PERIOD = PE,
    YEAR,
    MONTH,
    OU,
    OU_NAME = all_of(max_admin_col_name),
    ADM1 = all_of(adm_1_name_col),
    ADM1_ID = all_of(adm_1_id_col),
    ADM2 = all_of(adm_2_name_col),
    ADM2_ID = all_of(adm_2_id_col),
    all_of(names(dhis_indicator_definitions))
)

# Normalise the column names with the project helper
# (presumably upper-cases them -- confirm in clean_column_names())
colnames(routine_data_formatted) <- clean_column_names(routine_data_formatted)

# Order the rows by period, compared as a number
routine_data_formatted <- routine_data_formatted[
    order(as.numeric(routine_data_formatted$PERIOD)),
]
print(dim(routine_data_formatted))

### Output formatted routine data

In [None]:
# Output locations of the formatted routine data (same base name, two formats)
routine_parquet_path <- file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_routine.parquet"))
routine_csv_path <- file.path(FORMATTED_DATA_PATH, paste0(COUNTRY_CODE, "_routine.csv"))

# Message logged once the files are written (typo "Rountine" fixed)
out_msg <- paste0("Routine data saved under: ", routine_parquet_path)

# Make sure the output folder exists so the writes below cannot fail on a missing directory
if (!dir.exists(FORMATTED_DATA_PATH)) {
    dir.create(FORMATTED_DATA_PATH, recursive = TRUE)
}

# write parquet file
write_parquet(routine_data_formatted, routine_parquet_path)

# write csv file
write.csv(routine_data_formatted, routine_csv_path, row.names = FALSE)

In [None]:
# Log the location of the saved routine data.
# out_msg is built in the output cell, next to the write calls.
log_msg(out_msg)

### Data Summary 

In [None]:
# Data summary
# Print the per-column summary() of the formatted routine data as a final visual check.
print(summary(routine_data_formatted))