# **WorldPop versus DHIS2 population comparison analysis**


In [None]:
# system("conda install -c conda-forge libgdal-hdf5 -y")

In [None]:
# Root of the SNT workspace and the folders holding code and configuration
SNT_ROOT_PATH <- "~/workspace"
CODE_PATH <- file.path(SNT_ROOT_PATH, "code")
CONFIG_PATH <- file.path(SNT_ROOT_PATH, "configuration")

# Bring the shared SNT helper functions into the session
source(file.path(CODE_PATH, "snt_utils.r"))

# Packages this notebook relies on
required_packages <- c(
  "tidyr", "terra", "arrow", "sf",
  "dplyr", "ggplot2", "reticulate"
)

# install_and_load() is not defined in this notebook; presumably it comes from
# snt_utils.r sourced above (confirm there what it installs/attaches)
install_and_load(required_packages)

In [None]:
# Point PROJ, GDAL and reticulate at the conda installation before the first
# Python call below, so openhexa.sdk is loaded from that environment
Sys.setenv(PROJ_LIB = "/opt/conda/share/proj")
Sys.setenv(GDAL_DATA = "/opt/conda/share/gdal")
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")

# Show which Python interpreter reticulate resolved
print(reticulate::py_config()$python)

# Load openhexa.sdk; on failure, echo the message and abort the notebook.
# The import is namespaced like py_config() above so it does not depend on
# reticulate being attached.
openhexa <- tryCatch(
    reticulate::import("openhexa.sdk"),
    error = function(e) {
        # ": " keeps the prefix and the underlying error text readable
        msg <- paste0("Error while loading openhexa.sdk: ", conditionMessage(e))
        cat(msg, "\n", sep = "")
        stop(msg)
    }
)

In [None]:
# Load the SNT configuration; on failure, echo the message and abort
config_json <- tryCatch(
    jsonlite::fromJSON(file.path(CONFIG_PATH, "SNT_config.json")),
    error = function(e) {
        # ": " keeps the prefix and the underlying error text readable
        msg <- paste0("Error while loading configuration: ", conditionMessage(e))
        cat(msg, "\n", sep = "")
        stop(msg)
    }
)

# Dataset identifiers (WorldPop extract, formatted DHIS2) and country code
worldpop_dataset <- config_json$SNT_DATASET_IDENTIFIERS$WORLDPOP_DATASET_EXTRACT
dhis2_formatted_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE

### Load data 

In [None]:
# Look up the latest version of the WorldPop dataset and stop if there is none
dataset_last_version <- openhexa$workspace$get_dataset(worldpop_dataset)$latest_version
if (is.null(dataset_last_version)) {
    stop("No version available in SNT WorldPop dataset. Process stopped.")
}

# Walk through the files of that version; every *.parquet entry overwrites the
# previous candidate, so the last one listed wins
parquet_file <- NULL
files_list <- reticulate::iterate(dataset_last_version$files)
for (file in files_list) {
    if (!endsWith(file$filename, ".parquet")) {
        next
    }
    # Drop the first four characters of the name and put "<COUNTRY_CODE>_" in
    # front, to make sure we select the country file.
    # NOTE(review): this presumably relies on a 3-letter code + "_" prefix — confirm
    parquet_file <- paste0(COUNTRY_CODE, "_", substring(file$filename, 5))
    print(paste0("Parquet file found: ", parquet_file))
}

if (is.null(parquet_file)) {
    stop("No *.parquet file available in SNT WorldPop dataset. Process stopped.")
}

In [None]:
# Read the WorldPop parquet selected above; on failure, echo the message and abort
worldpop_population <- tryCatch(
    get_latest_dataset_file_in_memory(worldpop_dataset, parquet_file),
    error = function(e) {
        msg <- paste(
            "Error while loading WorldPop population file ", parquet_file, " for: ",
            COUNTRY_CODE, conditionMessage(e)
        )
        cat(msg)
        stop(msg)
    }
)

# Report what was loaded, including the table dimensions
msg <- paste0(
    "WorldPop population file ", parquet_file,
    " loaded from dataset : ", worldpop_dataset,
    " dataframe dimensions: ", toString(dim(worldpop_population))
)
log_msg(msg)

In [None]:
# Read "<COUNTRY_CODE>_population.parquet" from the formatted DHIS2 dataset;
# on failure, echo the message and abort
dhis2_population <- tryCatch(
    get_latest_dataset_file_in_memory(
        dhis2_formatted_dataset,
        paste0(COUNTRY_CODE, "_population.parquet")
    ),
    error = function(e) {
        msg <- paste(
            "Error while loading DHIS2 population file for: ",
            COUNTRY_CODE, conditionMessage(e)
        )
        cat(msg)
        stop(msg)
    }
)

# Report what was loaded, including the table dimensions
msg <- paste0(
    "DHIS2 population data loaded from dataset : ", dhis2_formatted_dataset,
    " dataframe dimensions: ", toString(dim(dhis2_population))
)
log_msg(msg)

In [None]:
# Read "<COUNTRY_CODE>_shapes.geojson" from the formatted DHIS2 dataset;
# on failure, echo the message and abort
shapes_data <- tryCatch(
    get_latest_dataset_file_in_memory(
        dhis2_formatted_dataset,
        paste0(COUNTRY_CODE, "_shapes.geojson")
    ),
    error = function(e) {
        msg <- paste(
            "Error while loading DHIS2 Shapes data for: ",
            COUNTRY_CODE, conditionMessage(e)
        )
        cat(msg)
        stop(msg)
    }
)

# Report what was loaded, including the table dimensions
msg <- paste0(
    "DHIS2 shapes data loaded from dataset : ", dhis2_formatted_dataset,
    " dataframe dimensions: ", toString(dim(shapes_data))
)
log_msg(msg)

## Zone de sante comparison

In [None]:
# Select the comparison years (Worldpop does not update data since 2020).
# WorldPop: the earliest year available.
worldpop_year <- min(worldpop_population$YEAR, na.rm = TRUE)

# DHIS2: the available year closest to the WorldPop year. An exact match has a
# gap of 0 and is therefore selected; a WorldPop year below/above the DHIS2
# range resolves to the first/last DHIS2 year; a WorldPop year falling in a gap
# of the DHIS2 series resolves to its nearest neighbour (earlier year on ties,
# because the candidates are sorted ascending).
dhis2_years <- sort(unique(dhis2_population$YEAR))  # sort() also drops NA
if (length(dhis2_years) == 0) {
    stop("No YEAR available in DHIS2 population data. Process stopped.")
}
year_gap <- abs(as.numeric(dhis2_years) - as.numeric(worldpop_year))
dhis2_year <- dhis2_years[which.min(year_gap)]

print(paste0("Comparison years DHIS2: ",dhis2_year, " Worldpop : ", worldpop_year))

In [None]:
# DHIS2 population of the comparison year, one value column named after the source
dhis2_pop_renamed <- select(
    filter(dhis2_population, YEAR == dhis2_year),
    ADM2_ID,
    dhis2_population = POPULATION
)

# WorldPop population of the comparison year, same layout
worldpop_pop_renamed <- select(
    filter(worldpop_population, YEAR == worldpop_year),
    ADM2_ID,
    worldpop_population = POPULATION
)

# Attach both population columns to the shapes, matching on ADM2_ID.
# Left joins keep every shape; a shape without a match gets NA.
comparison_df <- shapes_data %>%
    left_join(dhis2_pop_renamed[, c("ADM2_ID", "dhis2_population")], by = "ADM2_ID") %>%
    left_join(worldpop_pop_renamed[, c("ADM2_ID", "worldpop_population")], by = "ADM2_ID")

In [None]:
# Scatter of WorldPop against DHIS2 per ADM2; the dashed grey line is y = x
ggplot(
  data = comparison_df,
  mapping = aes(x = dhis2_population, y = worldpop_population)
) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "gray", linetype = "dashed") +
  labs(
    title = "Comparison per ADM2",
    x = "DHIS2 Population",
    y = "WorldPop Population"
  ) +
  theme_minimal()

In [None]:
# Pearson correlation between the two sources at ADM2 level.
# comparison_df is built with left joins, so an ADM2 missing from either source
# carries NA; cor() would then return NA for the whole statistic. Only pairs
# with both values present are used.
pop_correlation <- cor(
    comparison_df$dhis2_population,
    comparison_df$worldpop_population,
    method = "pearson",
    use = "pairwise.complete.obs"
)
print(paste0("Correlation : ", round(pop_correlation, 2)))

In [None]:
# Map of the absolute difference (WorldPop minus DHIS2) per ADM2;
# diverging palette centred on 0: blue = WorldPop lower, red = WorldPop higher
ggplot(data = comparison_df) +
  geom_sf(mapping = aes(fill = worldpop_population - dhis2_population)) +
  scale_fill_gradient2(midpoint = 0, low = "blue", mid = "white", high = "red") +
  labs(
    fill = "Pop. Diff",
    title = "Difference: WorldPop - DHIS2 Population (ADM2)"
  ) +
  theme_minimal()

In [None]:
# Side-by-side summary() of the two ADM2 population columns
sapply(
    list(
        DHIS2_population = comparison_df$dhis2_population,
        WPOP_population = comparison_df$worldpop_population
    ),
    summary
)

The table above shows that some WorldPop (WPOP) values appear less plausible: some Zones de Santé (ZS) have 0 inhabitants, while the largest ZS, which is likely one of the districts of Kinshasa, has nearly 22 million inhabitants, more than the total population of Kinshasa (17 million).

In [None]:
# Keep the ID and both population columns, then derive the comparison metrics
# (DHIS2 is the reference in the ratio and in the relative difference)
comp.pop <- select(comparison_df, ADM2_ID, worldpop_population, dhis2_population)
comp.pop <- mutate(
  comp.pop,
  diff = worldpop_population - dhis2_population,
  ratio = worldpop_population / dhis2_population,
  relative_diff = (worldpop_population - dhis2_population) / dhis2_population
)

In [None]:
# Long table for the histogram: one row per ADM2 and source ("WPOP" / "DHIS2").
# Geometry is dropped first; select() renames the value columns while picking them.
hist_pop <- comp.pop %>%
    st_drop_geometry() %>%
    select(
        ADM2_ID,
        WPOP = worldpop_population,
        DHIS2 = dhis2_population
    ) %>%
    pivot_longer(
        cols = c(WPOP, DHIS2),
        names_to = "source",
        values_to = "population"
    )

In [None]:
# Overlaid outline histograms of ADM2 population, one colour per source,
# with bins of 10,000 inhabitants
ggplot(data = hist_pop, mapping = aes(x = population, color = source)) +
  geom_histogram(
    binwidth = 10000,
    position = "identity",
    fill = NA,
    alpha = 0.5
  ) +
  theme_minimal()

WorldPop data appears to have more variability and some extreme outliers to the right.

In [None]:
# Horizontal boxplot of the relative difference (DHIS2 as reference) across ADM2
ggplot(data = comp.pop, mapping = aes(y = relative_diff)) +
  geom_boxplot() +
  coord_flip() +
  theme_minimal()

The plot above shows that for 75% of Zones de Santé (ZS), the difference between WorldPop and DHIS2 is within 50% of the DHIS2 population, which already indicates a significant deviation between the two sources. For most of the remaining ZS (excluding the outliers shown as dots), the difference amounts to 0.5–1.5 times the DHIS2 population.

In [None]:
# Map of the difference relative to DHIS2 per ADM2;
# diverging palette centred on 0: blue = WorldPop lower, red = WorldPop higher
ggplot(data = comparison_df) +
  geom_sf(
    mapping = aes(fill = (worldpop_population - dhis2_population)/dhis2_population)
  ) +
  scale_fill_gradient2(midpoint = 0, low = "blue", mid = "white", high = "red") +
  labs(
    fill = "Relative Pop. Diff",
    title = "ADM2 Relative difference: WorldPop - DHIS2 Population (reference)"
  ) +
  theme_minimal()

## Province comparison

In [None]:
# Group by province (ADM1) and dissolve the ADM2 geometries into one per province
dhis2_shapes_provinces <- shapes_data %>%
  group_by(ADM1_ID) %>%
  summarise(geometry = st_union(geometry), .groups = "drop")

# DHIS2 population per province.
# Reads dhis2_population (the table loaded earlier; `dhis2_pop` is never defined
# in this notebook) and keeps only the comparison year, as the ADM2 section
# does, so several years are not summed together.
# Assumes dhis2_population carries ADM1_NAME / ADM1_ID columns — TODO confirm.
dhis2_pop_prov <- dhis2_population %>%
  filter(YEAR == dhis2_year) %>%
  group_by(ADM1_NAME, ADM1_ID) %>%
  summarise(dhis2_value = sum(POPULATION, na.rm = TRUE), .groups = "drop")

# WorldPop population per province for its comparison year.
# Grouped by ADM1_ID only: the province name already comes from the DHIS2
# table, which avoids ADM1_NAME.x / ADM1_NAME.y duplicates after the join.
worldpop_pop_prov <- worldpop_population %>%
  filter(YEAR == worldpop_year) %>%
  group_by(ADM1_ID) %>%
  summarise(worldpop_value = sum(POPULATION, na.rm = TRUE), .groups = "drop")

# Attach both province totals to the dissolved shapes (left joins keep every
# province; a province missing from a source gets NA)
comparison_df_prov <- left_join(dhis2_shapes_provinces, dhis2_pop_prov, by = c("ADM1_ID"))
comparison_df_prov <- left_join(comparison_df_prov, worldpop_pop_prov, by = c("ADM1_ID"))

In [None]:
# Scatter of WorldPop against DHIS2 per province (ADM1); the dashed grey line
# is y = x. The title names ADM1 because this plot uses the province table.
ggplot(comparison_df_prov, aes(x = dhis2_value, y = worldpop_value)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray") +
  labs(x = "DHIS2 Population", y = "WorldPop Population", 
       title = "Comparison per ADM1") +
  theme_minimal()

In [None]:
# Pearson correlation between the two sources at province (ADM1) level.
# comparison_df_prov is built with left joins, so a province missing from
# either source carries NA; cor() would then return NA for the whole statistic.
# Only pairs with both values present are used.
pop_correlation_prov <- cor(
    comparison_df_prov$dhis2_value,
    comparison_df_prov$worldpop_value,
    method = "pearson",
    use = "pairwise.complete.obs"
)
print(paste0("Correlation : ", round(pop_correlation_prov, 2)))

In [None]:
# Map of the absolute difference (WorldPop minus DHIS2) per province;
# diverging palette centred on 0: blue = WorldPop lower, red = WorldPop higher
ggplot(data = comparison_df_prov) +
  geom_sf(mapping = aes(fill = worldpop_value - dhis2_value)) +
  scale_fill_gradient2(midpoint = 0, low = "blue", mid = "white", high = "red") +
  labs(
    fill = "Pop. Diff",
    title = "Difference: WorldPop - DHIS2 Population (ADM1)"
  ) +
  theme_minimal()

In [None]:
# Side-by-side summary() of the two province-level population columns
sapply(
    list(
        DHIS2_population = comparison_df_prov$dhis2_value,
        WPOP_population = comparison_df_prov$worldpop_value
    ),
    summary
)