## Setup start 

In [None]:
# Parameters
# NOTE(review): SNT_ROOT_PATH and ADJUST_WITH_WORLDPOP are both read further
# down in this notebook but are only shown commented out here — presumably they
# are injected by the pipeline runner at execution time; confirm. Uncomment the
# two lines below to run the notebook manually.
# SNT_ROOT_PATH   <- '~/workspace'   # SNT root
# ADJUST_WITH_WORLDPOP <- FALSE

In [None]:
# Project folder layout, every path resolved relative to the SNT root
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")
POPULATION_DATA_PATH <- file.path(
    SNT_ROOT_PATH,
    "data", "dhis2", "population_transformed"
)

**Load functions**

In [None]:
# Load the shared SNT helper script from the code folder.
# NOTE(review): presumably this is where install_and_load(), log_msg() and
# get_latest_dataset_file_in_memory() used below are defined — confirm.
source(file.path(CODE_PATH, "snt_utils.r"))

**Check and load required libraries**  

In [None]:
# Packages this notebook relies on.
# TODO: review this list and keep only the packages that are really required.
required_packages <- c(
    "arrow", "dplyr", "tidyr", "stringr", "stringi",
    "jsonlite", "httr", "glue", "reticulate"
)

# Hand the list over to the install/load helper
install_and_load(required_packages)

In [None]:
# Point reticulate at the conda Python so that openhexa.sdk is imported from
# the right environment
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")

# Show which Python interpreter reticulate is configured with
reticulate::py_config()$python

# Python module handle for the OpenHEXA SDK
openhexa <- reticulate::import("openhexa.sdk")

### Load SNT configuration


In [None]:
# Location of the SNT configuration file
config_path <- file.path(CONFIG_PATH, "SNT_config.json")

# Parse the configuration. Every later step reads from it, so a failure is
# fatal: log it (including the underlying cause) and stop the notebook.
config_json <- tryCatch(
    fromJSON(config_path),
    error = function(e) {
        msg <- glue("Error while loading configuration: {config_path} ({conditionMessage(e)})")
        log_msg(msg, "error")
        stop(msg, call. = FALSE)
    }
)

# print(config_json$SNT_CONFIG)
msg <- paste0("SNT configuration loaded from  : ", config_path)
log_msg(msg)

# Settings reused throughout the notebook
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
# Administrative level names, upper-cased
ADMIN_1 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_1)
ADMIN_2 <- toupper(config_json$SNT_CONFIG$DHIS2_ADMINISTRATION_2)
# Identifier of the dataset holding the formatted DHIS2 files
format_dataset_id <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED

### Load DHIS2 population data (formatted dataset)

- Load the DHIS2 population from the latest dataset version.


In [None]:
# Fetch the country's population file from the formatted DHIS2 dataset.
# A failure here is fatal: it is logged as an error and the notebook stops.
population_filename <- paste0(COUNTRY_CODE, "_population.parquet")

dhis2_population <- tryCatch(
    get_latest_dataset_file_in_memory(format_dataset_id, population_filename),
    error = function(e) {
        msg <- paste("[ERROR] Error while loading DHIS2 population file for: " , COUNTRY_CODE, conditionMessage(e))
        log_msg(msg, "error")
        stop(msg)
    }
)

# Report where the data came from and how large the table is
msg <- glue("DHIS2 population data loaded from dataset : {format_dataset_id} dataframe dimensions: [{paste(dim(dhis2_population), collapse=', ')}]")
log_msg(msg)

In [None]:
# Quick look at the loaded table: its dimensions and its first three rows
dim(dhis2_population)
head(dhis2_population, 3)

## SNT population scaling

Adjust the DHIS2 population using WorldPop data as the scaling factor (optional).
If this option is **not selected**, we try adjusting using the total population reference from the configuration.

Details:  
- To adjust using population from WorldPop UN estimates data, you **must** run the *B.2 WorldPop Extract pipeline* first!
- We assume we have WorldPop population data for only one year (the latest).
- If the WorldPop data is not available, we continue the process and try with the total population reference from the configuration (if set).
- The scaled population will be stored in the column "POPULATION" (the original POPULATION column will be replaced/updated).

In [None]:
# Defaults: no WorldPop table and no reference total until one is found below
wpop_population <- NULL
total_population_reference <- NULL

# Option 1: derive the reference total from the WorldPop extract
if (ADJUST_WITH_WORLDPOP) {

    worldpop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$WORLDPOP_DATASET_EXTRACT  # WorldPop dataset identifier
    wpop_unadj_filename <- paste0(COUNTRY_CODE, "_worldpop_population.parquet")  # WorldPop file
    log_msg(glue("Adjusting DHIS2 population data with {wpop_unadj_filename} from dataset {worldpop_dataset}"))

    # Loading is best-effort: on failure a warning is logged and NULL is kept,
    # so that the configuration fallback below can take over.
    wpop_population <- tryCatch(
        get_latest_dataset_file_in_memory(worldpop_dataset, wpop_unadj_filename),
        error = function(e) {
            msg <- paste0(
                "[WARNING] Error while loading WorldPop population file for: ", COUNTRY_CODE,
                " (", conditionMessage(e), ").",
                " Please execute B.2 WorldPop Extract pipeline to ensure the data is available in the snt-worldpop-extract dataset."
            )
            log_msg(msg, "warning")
            NULL
        }
    )

    if (!is.null(wpop_population)) {
        # Only the last available WorldPop year is used as reference (!)
        wpop_year_max <- max(wpop_population$YEAR, na.rm = TRUE)
        is_latest_year <- !is.na(wpop_population$YEAR) & wpop_population$YEAR == wpop_year_max
        total_population_reference <- sum(wpop_population$POPULATION_UNADJ[is_latest_year], na.rm = TRUE)
        log_msg(glue("Total UN-adjusted WorldPop population: {total_population_reference}"))
    }
}

# Option 2: total population reference from the config file. This runs whenever
# no reference was obtained above, i.e. when WorldPop adjustment is switched off
# and also when it was requested but the WorldPop file could not be loaded.
if (is.null(total_population_reference)) {
    total_population_reference <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[["TOTAL_POPULATION_REF"]]
    if (is.null(total_population_reference)) {
        log_msg(glue("No total population reference found in 'snt_config'. Adjustment will be skipped."), "warning")
    } else {
        log_msg(glue("Total reference population: {total_population_reference}"))
    }
}

Scale population using total population reference (if available)

In [None]:
# Rescale each DHIS2 year so that its total matches the reference total.
# Skipped entirely when no reference total is available.
if (!is.null(total_population_reference)) {

    # One row per year: DHIS2 total and the factor mapping it onto the reference
    year_totals <- dhis2_population %>%
        group_by(YEAR) %>%
        summarise(total_year_pop = sum(POPULATION, na.rm = TRUE)) %>%
        mutate(scaling_factor = total_population_reference / total_year_pop)

    # Attach the yearly factor to every row, scale, then drop the helper columns
    dhis2_population <- left_join(dhis2_population, year_totals, by = "YEAR") %>%
        mutate(POPULATION_SCALED = round(POPULATION * scaling_factor)) %>%
        select(-total_year_pop, -scaling_factor)

    # Log original vs. scaled totals for each year
    for (i in seq_len(nrow(year_totals))) {
        row <- year_totals[i, ]
        in_year <- dhis2_population$YEAR == row$YEAR
        dhis2_total <- sum(dhis2_population[in_year, "POPULATION"], na.rm = TRUE)
        dhis2_total_scd <- sum(dhis2_population[in_year, "POPULATION_SCALED"], na.rm = TRUE)
        log_msg(glue("DHIS2 population year {row$YEAR} ({dhis2_total}) scaled: {dhis2_total_scd} (scaling_factor={round(row$scaling_factor, 3)})."))
    }

    # Preview of the table including the new POPULATION_SCALED column
    head(dhis2_population, 3)
}

## SNT Population projection and back-calculation using a growth factor

Apply a growth factor (if defined in the snt config file).

- Projects the population size backward and forward in time (years) using the growth rate.
- For the computation, we consider only one population column as the initial value: "POPULATION", or "POPULATION_SCALED" if it was computed in the previous steps.

In [None]:
# Optional settings from the config file (NULL when not defined)
growth_factor <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[["GROWTH_FACTOR"]]
reference_year <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[["REFERENCE_YEAR"]]

# Which population column to use? POPULATION_SCALED when the scaling step
# produced it, the raw POPULATION otherwise (!)
population_column <- if ("POPULATION_SCALED" %in% colnames(dhis2_population)) {
    "POPULATION_SCALED"
} else {
    "POPULATION"
}
columns_selection <- c("YEAR", "ADM1_NAME", "ADM1_ID", "ADM2_NAME", "ADM2_ID", population_column)

if (!is.null(growth_factor)) {

    # Fall back to the last year present in the data when the reference year is
    # not configured or has no data
    last_year <- max(dhis2_population$YEAR, na.rm = TRUE)
    if (is.null(reference_year)) {
        reference_year <- last_year
    } else if (!(reference_year %in% dhis2_population$YEAR)) {
        not_found <- reference_year
        reference_year <- last_year
        log_msg(glue("Reference year {not_found} is not present in the population data, using last year: {reference_year}."), "warning")
    }

    log_msg(glue("Applying growth factor {growth_factor} to project {tolower(population_column)} from reference year {reference_year}."))

    n_years_future <- 6 # n_years to the future
    n_years_past <- 6 # n_years to the past
    projection_years_backward <- seq(reference_year - 1, reference_year - n_years_past, by = -1)
    projection_years_forward <- seq(reference_year + 1, reference_year + n_years_future)

    # which() drops rows whose YEAR is NA instead of turning them into NA rows
    reference_rows <- which(dhis2_population$YEAR == reference_year)
    dhis2_population_reference <- dhis2_population[reference_rows, columns_selection]

    # One table per year, collected in a preallocated list and bound once at the
    # end (avoids growing a data frame with rbind() inside the loops)
    yearly_tables <- vector("list", 1 + n_years_future + n_years_past)
    yearly_tables[[1]] <- dhis2_population_reference
    slot <- 1

    # --- Forward projection ---
    # Each year is derived from the previous (already rounded) year
    population_forward <- dhis2_population_reference
    for (year in projection_years_forward) {
        population_forward[["YEAR"]] <- year
        population_forward[[population_column]] <- round(population_forward[[population_column]] * (1 + growth_factor))
        slot <- slot + 1
        yearly_tables[[slot]] <- population_forward
    }

    # --- Backward projection ---
    # Same scheme, dividing by the growth rate to go back in time
    population_backward <- dhis2_population_reference
    for (year in projection_years_backward) {
        population_backward[["YEAR"]] <- year
        population_backward[[population_column]] <- round(population_backward[[population_column]] / (1 + growth_factor))
        slot <- slot + 1
        yearly_tables[[slot]] <- population_backward
    }

    pop_result <- do.call(rbind, yearly_tables)
    pop_result <- pop_result[order(pop_result$YEAR), ]

} else {
    # No growth factor: keep the input years, but in the same table format
    pop_result <- dhis2_population[order(dhis2_population$YEAR), columns_selection]
}

In [None]:
# Sanity check: print the total population of every year in the result
years_in_result <- sort(unique(pop_result$YEAR))
for (year in years_in_result) {
    rows_of_year <- pop_result$YEAR == year
    tot_pop <- sum(pop_result[rows_of_year, population_column], na.rm = TRUE)
    print(glue("Total population {year} : {tot_pop}"))
}

In [None]:
# Expose the selected population column under the output name POPULATION
pop_result <- rename(pop_result, POPULATION = all_of(population_column))

# Dimensions and first rows of the renamed table
print(dim(pop_result))
head(pop_result, 3)

## SNT Population disaggregations

Any defined disaggregations will be computed from the 'POPULATION_DISAGGREGATIONS' in the configuration file and included as additional columns in the final table.

In [None]:
# Disaggregation factors from the config file (name -> factor applied to POPULATION)
pop_disagg <- config_json$DHIS2_DATA_DEFINITIONS$POPULATION_DEFINITIONS[["POPULATION_DISAGGREGATIONS"]]

if (is.null(pop_disagg) || length(pop_disagg) == 0) {
    # Nothing configured: the table is left untouched
    message("No population disaggregations defined.")
} else {
    # One extra column per disaggregation, named after it in upper case
    for (name in names(pop_disagg)) {
        value <- pop_disagg[[name]]
        log_msg(glue::glue("Adding disaggregation: {name}, Factor: {value}"))
        pop_result[[toupper(name)]] <- round(pop_result[["POPULATION"]] * value)
    }
}

In [None]:
# Dimensions and first three rows of the result after the disaggregation step
print(dim(pop_result))
head(pop_result, 3)

### Output formatted population data

In [None]:
# Output files: same base name, one parquet and one csv
output_basename <- paste0(COUNTRY_CODE, "_population")
parquet_path <- file.path(POPULATION_DATA_PATH, paste0(output_basename, ".parquet"))
csv_path <- file.path(POPULATION_DATA_PATH, paste0(output_basename, ".csv"))

out_msg <- paste0("Transformed population data saved under: ", csv_path)

# Make sure the output folder exists, otherwise both writes below fail
if (!dir.exists(POPULATION_DATA_PATH)) {
    dir.create(POPULATION_DATA_PATH, recursive = TRUE)
}

# write parquet file
write_parquet(pop_result, parquet_path)

# write csv file
write.csv(pop_result, csv_path, row.names = FALSE)

# log
log_msg(out_msg)

### Data Summary 

In [None]:
# Data summary: print summary() of the final population table
print(summary(pop_result))