## Preliminaries

In [None]:
# install.packages("fpp3", repos = "https://cloud.r-project.org")

In [None]:
# Session-wide settings
# A large scipen penalty makes R prefer fixed over scientific notation when printing.
options(scipen = 999)

# Environment variables for the PROJ and GDAL data directories under /opt/conda.
Sys.setenv(
  PROJ_LIB = "/opt/conda/share/proj",
  GDAL_DATA = "/opt/conda/share/gdal"
)

In [None]:
# Folder layout: every location is derived from ROOT_PATH
ROOT_PATH <- "~/workspace"
CONFIG_PATH <- file.path(ROOT_PATH, "configuration")
CODE_PATH <- file.path(ROOT_PATH, "code")
DATA_PATH <- file.path(ROOT_PATH, "data")
# Folder that receives the outputs written by this notebook
OUTPUT_DATA_PATH <- file.path(DATA_PATH, "seasonality_cases")

In [None]:
# Run the project's helper script so its definitions are available in this session
source(file = file.path(CODE_PATH, "snt_utils.r"))

In [None]:
# Packages this notebook depends on
required_packages <- c(
  "jsonlite", "data.table", "ggplot2", "fpp3", "arrow",
  "glue", "sf", "RColorBrewer", "httr", "reticulate"
)

# Pass the list to install_and_load()
# (not defined in this notebook; presumably provided by snt_utils.r)
install_and_load(required_packages)

In [None]:
# Point reticulate at the interpreter under /opt/conda before Python is initialised
Sys.setenv(RETICULATE_PYTHON = "/opt/conda/bin/python")
# Show which Python binary reticulate resolved
reticulate::py_config()$python
# Python module handle for the OpenHEXA SDK
openhexa <- reticulate::import("openhexa.sdk")

In [None]:
# Load SNT config
CONFIG_FILE_NAME <- "SNT_config.json"
config_file_path <- file.path(CONFIG_PATH, CONFIG_FILE_NAME)

# Parse the JSON configuration; on failure, print and re-raise with a readable message
config_json <- tryCatch(
    fromJSON(config_file_path),
    error = function(e) {
        # Separator added so the prefix and the underlying error text do not run together
        msg <- paste0("Error while loading configuration: ", conditionMessage(e))
        cat(msg)
        stop(msg)
    })

msg <- paste0("SNT configuration loaded from  : ", config_file_path)
log_msg(msg)

# Set config variables
COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE
dhis2_dataset <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED

# Fail early: a NULL value here would silently produce wrong file names further down
if (is.null(COUNTRY_CODE) || is.null(dhis2_dataset)) {
    stop(
        "SNT configuration is missing SNT_CONFIG$COUNTRY_CODE or ",
        "SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED",
        call. = FALSE
    )
}

print(paste("Country code: ", COUNTRY_CODE))

## Globals and parameters

**Parameters**

In [None]:
# Data-sufficiency thresholds used by the checks before the analysis
# Minimum number of year-month periods required
minimum_periods <- 36L
# Maximum share of missing target values tolerated over the whole table
maximum_proportion_missings_overall <- 0.1
# Maximum share of missing periods tolerated for a single administrative unit
maximum_proportion_missings_per_district <- 0.2

In [None]:
# Seasonality parameters
# Smallest and largest month-block sizes to evaluate
minimum_month_block_size <- 3L
maximum_month_block_size <- 5L
# Proportion threshold passed to the row-level seasonality computation
threshold_for_seasonality <- 0.6
# Threshold on the proportion of seasonal years, passed to process_seasonality()
threshold_proportion_seasonal_years <- 0.5

**Fixed routine formatting columns**

In [None]:
# Analysis-wide constants
type_of_seasonality <- "cases"
# Threshold rendered as a whole-number percentage, e.g. 0.6 -> "60%"
formatted_threshold_for_seasonality <- sprintf(
  "%d%%",
  round(threshold_for_seasonality * 100)
)
data_source <- "DHIS2"
# Column holding the values analysed for seasonality
original_values_col <- "CONF"

# Space and time columns
admin_level <- "ADM2"
admin_id_col <- paste(admin_level, "ID", sep = "_")
admin_name_col <- paste(admin_level, "NAME", sep = "_")
year_col <- "YEAR"
month_col <- "MONTH"
period_cols <- c(year_col, month_col)

In [None]:
# Every block size from the minimum to the maximum, as integers
possible_month_block_sizes <- as.integer(
  seq.int(minimum_month_block_size, maximum_month_block_size)
)
# Threshold rendered as a whole-number percentage for display
formatted_threshold_for_seasonality <- sprintf(
  "%d%%",
  round(threshold_for_seasonality * 100)
)
print(paste("Formatted threshold :", formatted_threshold_for_seasonality))

## Load data

In [None]:
# Fetch the country's shapes file (<COUNTRY_CODE>_shapes.geojson) from the dataset
spatial_data_filename <- paste(COUNTRY_CODE, "shapes.geojson", sep = "_")
spatial_data <- get_latest_dataset_file_in_memory(
  dhis2_dataset,
  spatial_data_filename
)
log_msg(
  glue("File {spatial_data_filename} successfully loaded from dataset version: {dhis2_dataset}")
)

In [None]:
# Fetch the country's routine file (<COUNTRY_CODE>_routine.parquet) from the dataset
case_data_filename <- paste(COUNTRY_CODE, "routine.parquet", sep = "_")
original_dt <- get_latest_dataset_file_in_memory(
  dhis2_dataset,
  case_data_filename
)
log_msg(
  glue("File {case_data_filename} successfully loaded from dataset version: {dhis2_dataset}")
)

In [None]:
# Columns formatting
# Attribute table of the shapes, without geometry, as a data.table
admin_data <- st_drop_geometry(spatial_data)
setDT(admin_data)
# copy() detaches the vector from the table, so later by-reference column
# additions to admin_data cannot change common_cols
common_cols <- copy(names(admin_data))

# Output column names. Built with paste(), like the other names in this file,
# so they are plain character strings rather than glue objects.
seasonality_suffix <- toupper(type_of_seasonality)
seasonality_col <- paste("SEASONALITY", seasonality_suffix, sep = "_")
season_duration_col <- paste("SEASONAL_BLOCK_DURATION", seasonality_suffix, sep = "_")
season_start_month_col <- paste("SEASONAL_BLOCK_START_MONTH", seasonality_suffix, sep = "_")
cases_proportion_col <- "CASES_PROPORTION"

# Column order of the final table: admin columns first, then the seasonality outputs
final_table_cols <- c(
  common_cols,
  seasonality_col,
  season_duration_col,
  season_start_month_col,
  cases_proportion_col
)
print(final_table_cols)

**Create the containers for the data**

In [None]:
# Placeholder result: the admin table with every seasonality output column set to NA
# NOTE(review): empty_dt is not referenced anywhere else in the visible code;
# confirm whether it should be written out when the analysis is abandoned.
seasonality_cols <- c(
  seasonality_col,
  season_duration_col,
  season_start_month_col,
  cases_proportion_col
)
empty_dt <- copy(admin_data)
empty_dt[, (seasonality_cols) := NA]

## Preprocess input data

In [None]:
# Format table: convert in place to data.table and enforce column types
setDT(original_dt)
integer_cols <- c(year_col, month_col)
numeric_cols <- c(original_values_col)
# Year and month become integers
original_dt[, (integer_cols) := lapply(.SD, as.integer), .SDcols = integer_cols]
# numeric_cols was declared but never applied; convert the target values to numeric
original_dt[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
# head(original_dt)

In [None]:
# Keep only the useful columns and sum the target values per admin unit and period
# NOTE(review): sum(..., na.rm = TRUE) returns 0 for a group whose values are all NA,
# so such groups are not counted as missing by the checks below. Confirm this is intended.
original_dt <- original_dt[,
                           setNames(list(sum(get(original_values_col), na.rm = TRUE)), original_values_col), 
                           by = c(admin_id_col, period_cols)
                           ]

# Call the helper once and reuse both elements of its result:
# element 1 is used as the number of periods, element 2 as the full set of rows
cartesian_admin_period <- make_cartesian_admin_period(original_dt, admin_id_col, year_col, month_col)
num_periods <- cartesian_admin_period[[1]]
all_rows <- cartesian_admin_period[[2]]

# Stop when the series is shorter than the required minimum
if (num_periods < minimum_periods){    
    log_msg(glue("Data is not reliable: 
                    at least {minimum_periods} year-month periods of data are required for the case analysis; 
                    the data only contains {num_periods} periods. Abandoning analysis.")
           , level="error")
    stop("ERROR 1")
}

# Inject the (possibly missing) rows into the data
original_dt <- make_full_time_space_data(
  input_dt=original_dt,
  full_rows_dt=all_rows,
  target_colname=original_values_col,
  admin_colname=admin_id_col,
  year_colname=year_col,
  month_colname=month_col)

# Stop when the overall share of missing target values exceeds the allowed maximum
n_missing_overall <- sum(is.na(original_dt[[original_values_col]]))
if (n_missing_overall > (maximum_proportion_missings_overall * nrow(original_dt))) {
    log_msg("There are too many missing values in the data overall. Abandoning analysis.", level="error")
    stop("ERROR 2")   
}

### Imputation of missings

**Remove previously imputed files (if any)**

In [None]:
# Remove existing imputation files
filename_imputed_dt <- paste(COUNTRY_CODE, type_of_seasonality, 'imputed.csv', sep = '_')
files_in_folder <- list.files(OUTPUT_DATA_PATH, full.names = TRUE)

# Case-insensitive literal match on the file name.
# fixed = TRUE stops "." in the name from acting as a regex wildcard.
is_imputed_file <- grepl(
  tolower(filename_imputed_dt),
  tolower(basename(files_in_folder)),
  fixed = TRUE
)
files_to_remove <- files_in_folder[is_imputed_file]

# file.remove() returns one logical per file; report only the files actually deleted
was_removed <- file.remove(files_to_remove)
deleted_files <- files_to_remove[was_removed]

# toString() gives a readable list; str() printed as a side effect and returned NULL
print(glue("Deleted files: {toString(basename(deleted_files))}"))

In [None]:
# Name of the column which will store the imputed/estimated values
imputed_col <- paste(original_values_col, 'EST', sep = '_')

# If any target value is missing, impute it per administrative unit with
# fill_missing_cases_ts(); otherwise the estimate column is a copy of the original values
if (anyNA(original_dt[[original_values_col]])) {
    log_msg("There is missing data. Proceeding to impute them.", level="warning")
    
    # Keep only the administrative units which have missing values for original_values_col
    missing_dt <- extract_dt_with_missings(original_dt, target_colname = original_values_col, id_colname = admin_id_col)
    # Build the period from the configured year/month columns (previously hard-coded YEAR/MONTH)
    missing_dt <- missing_dt[, PERIOD := make_yearmonth(year = get(year_col), month = get(month_col))]
    missing_dt <- missing_dt[, .SD, .SDcols = c(admin_id_col, 'PERIOD', original_values_col)]
    
    # Count missing rows per administrative unit, largest first
    missings_by_admin_unit <- missing_dt[, .(missing_count = sum(is.na(get(original_values_col)))), by = admin_id_col][order(-missing_count)]
    
    # Abandon if any unit misses more than the allowed share of periods (maybe should be stricter - to discuss)
    if(missings_by_admin_unit[, max(missing_count)] > maximum_proportion_missings_per_district * num_periods){
      log_msg("Some administrative units have too many missing values in the target data. Abandoning analysis.", level="error")
      stop("ERROR 3")
    }
    
    # One table per administrative unit, so each time series is imputed on its own
    missing_districts_list <- split(missing_dt, by = admin_id_col)
    
    # Seasonal ARIMA to estimate missing cases: apply the helper to each table, then bind the results
    filled_missings_dt <- rbindlist(
    lapply(missing_districts_list,
           fill_missing_cases_ts,
           original_values_colname=original_values_col,
           estimated_values_colname=imputed_col,
           admin_colname=admin_id_col,
           period_colname='PERIOD',
           threshold_for_missing = 0.0)
    )
    
    # Add the imputed ("_EST") values to the original data
    # NOTE(review): this merge assumes fill_missing_cases_ts() returns the year and month
    # columns alongside the admin id - confirm against the helper.
    imputed_dt <- merge.data.table(original_dt, filled_missings_dt[, .SD, .SDcols = !(original_values_col)], by = c(admin_id_col, year_col, month_col), all.x = TRUE)
    
    # Wherever an original value exists, the estimate is that value;
    # if data is large, this could be made faster by only copying from the districts which are not in the missing_dt
    imputed_dt[!is.na(get(original_values_col)), (imputed_col) := get(original_values_col)]

    # Save the imputed file, only if it was computed; make sure the output folder exists first
    dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)
    fwrite(imputed_dt, file = file.path(OUTPUT_DATA_PATH, filename_imputed_dt))
    
} else {
    imputed_dt <- copy(original_dt)
    imputed_dt[, (imputed_col) := get(original_values_col)]
}

## Seasonality

In [None]:
# Row-level seasonality (one row per period and admin unit) -----------------

row_seasonality_dt <- compute_month_seasonality(
  input_dt = imputed_dt,
  indicator = type_of_seasonality,
  values_colname = imputed_col,
  vector_of_durations = possible_month_block_sizes,
  admin_colname = admin_id_col,
  year_colname = year_col,
  month_colname = month_col,
  proportion_threshold = threshold_for_seasonality
)

# Admin-unit-level seasonality, irrespective of year -------------------------

seasonality_source_dt <- process_seasonality(
  input_dt = row_seasonality_dt,
  indicator = type_of_seasonality,
  vector_of_durations = possible_month_block_sizes,
  admin_colname = admin_id_col,
  year_colname = year_col,
  month_colname = month_col,
  proportion_seasonal_years_threshold = threshold_proportion_seasonal_years
)

# Keep the admin id plus the columns named SEASONALITY_<INDICATOR>_<n>_MTH
check_pattern_seasonality <- paste("^SEASONALITY", toupper(type_of_seasonality), "[0-9]+_MTH$", sep = "_")
seasonality_source_dt <- seasonality_source_dt[
  ,
  c(admin_id_col, grep(check_pattern_seasonality, names(seasonality_source_dt), value = TRUE)),
  with = FALSE
]

## Result file

### Long format

This format, until further notice, is not saved.

In [None]:
# Reshape to long format: one row per admin unit and block-size column.
# Identifier columns are all columns whose name does not match the seasonality pattern.
seasonality_long_dt <- melt(
  seasonality_source_dt,
  id.vars = names(seasonality_source_dt)[
    !grepl(check_pattern_seasonality, names(seasonality_source_dt))
  ],
  variable.name = "MONTH_BLOCK_SIZE",
  value.name = seasonality_col
)

In [None]:
# Replace the melted column name (e.g. SEASONALITY_CASES_3_MTH) by its block size.
# The size is parsed from the name itself, so the result does not depend on the
# column order matching possible_month_block_sizes.
seasonality_long_dt[, MONTH_BLOCK_SIZE := as.integer(
  sub("^.*_([0-9]+)_MTH$", "\\1", as.character(MONTH_BLOCK_SIZE))
)]

# Add the remaining admin unit columns; all = TRUE keeps units present on either side
admin_seasonality_long_dt <- merge.data.table(admin_data, seasonality_long_dt, by = c(admin_id_col), all = TRUE)

In [None]:
# Column order: admin columns first, then the columns specific to this table
specific_cols <- setdiff(names(admin_seasonality_long_dt), names(admin_data))
admin_seasonality_long_dt <- admin_seasonality_long_dt[
  ,
  c(common_cols, specific_cols),
  with = FALSE
]

In [None]:
# Keeping for now.
# # filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, gsub("\\.", "", as.character(threshold_for_seasonality)), type_of_seasonality, 'seasonality_long.csv', sep = '_')
# filename_admin_seasonality_long_dt <- paste(COUNTRY_CODE, data_source, admin_level, type_of_seasonality, 'seasonality_long.csv', sep = '_')
# fwrite(admin_seasonality_long_dt, file.path(OUTPUT_DATA_PATH, filename_admin_seasonality_long_dt))

### Transform to wide format

In [None]:
# Derive the seasonal block size per admin unit; the result is stored in
# season_duration_col. A flag value of 1 is passed as the valid value.
seasonality_wide_dt <- compute_min_seasonality_block(
  input_dt = seasonality_source_dt,
  seasonality_column_pattern = check_pattern_seasonality,
  vector_of_possible_month_block_sizes = possible_month_block_sizes,
  # indicator = toupper(type_of_seasonality),
  seasonal_blocksize_colname = season_duration_col,
  valid_value = 1
)

In [None]:
# Overall seasonality flag: 1 when at least one per-block-size column equals 1, else 0.
# The per-block-size columns are dropped afterwards.
seasonality_pattern_cols <- grep(check_pattern_seasonality, names(seasonality_wide_dt), value = TRUE)
if (length(seasonality_pattern_cols) == 0L) {
  # No per-block-size column available: the flag is unknown
  seasonality_wide_dt[, (seasonality_col) := NA_integer_]
} else {
  seasonality_wide_dt[
    ,
    (seasonality_col) := ifelse(rowSums(.SD == 1, na.rm = TRUE) > 0, 1L, 0L),
    .SDcols = seasonality_pattern_cols
  ]
  seasonality_wide_dt[, (seasonality_pattern_cols) := NULL]
}

# CASES_PROPORTION: share of the ANNUAL total that falls in the seasonal block.
# It is kept only for seasonal admin units (flag = 1).

# Step 1: annual totals per admin unit and year, from the imputed values
annual_totals_dt <- imputed_dt[
  ,
  .(ANNUAL_TOTAL = sum(.SD[[1L]], na.rm = TRUE)),
  by = c(admin_id_col, year_col),
  .SDcols = imputed_col
]

# Step 2: Function to compute proportion = max block sum / annual total
#
# For one administrative unit: take, per year, the largest value of the
# "<column_prefix>_SUM_<block_duration>_MTH_FW" column, divide it by that year's
# ANNUAL_TOTAL, and return the mean of these ratios across years.
#
# Arguments:
#   admin_id        id of the administrative unit to process
#   block_duration  block size in months; NA or infinite yields NA
#   row_data        data.frame/data.table with the admin id, year and block-sum columns
#   annual_data     data.frame/data.table with the admin id, year and ANNUAL_TOTAL columns
#   admin_col       name of the admin id column in both tables
#   year_column     name of the year column in both tables
#   column_prefix   prefix of the block-sum column (default "CASES", as before)
#
# Returns a single double, or NA_real_ when the block-sum column is absent, the unit
# has no rows, no year has a finite maximum, or no matching annual total is > 0.
#
# Only vector operations are used, so a plain data.frame works as well as a
# data.table and column names cannot shadow the function's arguments.
#
# NOTE(review): if the block sum is forward-looking across the year boundary, the
# ratio for a year can exceed 1 - confirm against compute_month_seasonality().
compute_cases_proportion <- function(admin_id, block_duration, row_data, annual_data, admin_col, year_column,
                                     column_prefix = "CASES") {
  if (is.na(block_duration) || is.infinite(block_duration)) return(NA_real_)

  # Column with the N-month block sum
  sum_col <- paste(column_prefix, "SUM", block_duration, "MTH_FW", sep = "_")
  if (!sum_col %in% names(row_data)) return(NA_real_)

  # which() drops NA comparisons, so rows with a missing admin id never match
  row_idx <- which(row_data[[admin_col]] == admin_id)
  annual_idx <- which(annual_data[[admin_col]] == admin_id)
  if (length(row_idx) == 0L || length(annual_idx) == 0L) return(NA_real_)

  block_sums <- row_data[[sum_col]][row_idx]
  block_years <- row_data[[year_column]][row_idx]

  # Only rows with a known block sum and a known year can contribute
  usable_rows <- !is.na(block_sums) & !is.na(block_years)
  if (!any(usable_rows)) return(NA_real_)

  # Largest block sum per year; names of the result are the years as text
  yearly_max_block <- tapply(block_sums[usable_rows], block_years[usable_rows], max)
  yearly_max_block <- yearly_max_block[is.finite(yearly_max_block)]
  if (length(yearly_max_block) == 0L) return(NA_real_)

  # Look up each year's annual total; years without a total are dropped
  annual_years <- annual_data[[year_column]][annual_idx]
  annual_totals <- annual_data[["ANNUAL_TOTAL"]][annual_idx]
  matched_totals <- annual_totals[match(names(yearly_max_block), as.character(annual_years))]

  # Only years with a positive annual total give a meaningful ratio
  usable_years <- !is.na(matched_totals) & matched_totals > 0
  if (!any(usable_years)) return(NA_real_)

  # Ratio per year, then average across years
  yearly_proportion <- as.numeric(yearly_max_block)[usable_years] / matched_totals[usable_years]
  mean(yearly_proportion, na.rm = TRUE)
}

# Compute the proportion for each admin unit (one row of seasonality_wide_dt each).
# vapply() always yields a plain, unnamed double vector with one value per row,
# including for an empty table; mapply() would return a list in that case.
cases_proportion_values <- vapply(
  seq_len(nrow(seasonality_wide_dt)),
  function(row_index) {
    compute_cases_proportion(
      admin_id = seasonality_wide_dt[[admin_id_col]][row_index],
      block_duration = seasonality_wide_dt[[season_duration_col]][row_index],
      row_data = row_seasonality_dt,
      annual_data = annual_totals_dt,
      admin_col = admin_id_col,
      year_column = year_col
    )
  },
  numeric(1)
)
seasonality_wide_dt[, (cases_proportion_col) := cases_proportion_values]

# Set CASES_PROPORTION to NA for non-seasonal admin units
seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (cases_proportion_col) := NA_real_]

# Compute SEASONAL_BLOCK_START_MONTH: first month of the seasonal block
# Only for seasonal admin units (SEASONALITY_CASES = 1)

# Function to find the most frequent starting month for a given admin unit and block duration
#
# Looks at the rows of one administrative unit where the column
# "<column_prefix>_<block_duration>_MTH_ROW_SEASONALITY" equals 1 and returns the
# month that occurs most often among them. On ties the first entry of the
# table() result wins, i.e. the smallest month for numeric months.
#
# Arguments:
#   admin_id        id of the administrative unit to process
#   block_duration  block size in months; NA or infinite yields NA
#   row_data        data.frame/data.table with the admin id, month and flag columns
#   admin_col       name of the admin id column
#   year_column     accepted for interface compatibility; not used by the computation
#   month_column    name of the month column
#   column_prefix   prefix of the row-level flag column (default "CASES", as before)
#
# Returns a single integer, or NA_integer_ when the flag column is absent, the unit
# has no rows, or no row is flagged with a non-missing month.
#
# Only vector operations are used, so a plain data.frame works as well as a data.table.
compute_start_month <- function(admin_id, block_duration, row_data, admin_col, year_column, month_column,
                                column_prefix = "CASES") {
  if (is.na(block_duration) || is.infinite(block_duration)) return(NA_integer_)

  # Column with the row-level seasonality indicator for this block duration
  seasonality_row_col <- paste(column_prefix, block_duration, "MTH_ROW_SEASONALITY", sep = "_")
  if (!seasonality_row_col %in% names(row_data)) return(NA_integer_)

  # which() drops NA comparisons, so rows with a missing admin id never match
  row_idx <- which(row_data[[admin_col]] == admin_id)
  if (length(row_idx) == 0L) return(NA_integer_)

  # Months of the rows flagged as seasonal (flag == 1); NA flags are ignored
  row_flags <- row_data[[seasonality_row_col]][row_idx]
  row_months <- row_data[[month_column]][row_idx]
  seasonal_months <- row_months[which(row_flags == 1)]
  if (length(seasonal_months) == 0L) return(NA_integer_)

  # table() ignores NA months; if nothing is left there is no mode to report
  month_counts <- table(seasonal_months)
  if (length(month_counts) == 0L) return(NA_integer_)

  # Most frequent month (mode)
  as.integer(names(month_counts)[which.max(month_counts)])
}

# Compute the start month for each admin unit (one row of seasonality_wide_dt each).
# vapply() always yields a plain integer vector with one value per row, including
# for an empty table; mapply() would return a list in that case.
season_start_month_values <- vapply(
  seq_len(nrow(seasonality_wide_dt)),
  function(row_index) {
    start_month <- compute_start_month(
      admin_id = seasonality_wide_dt[[admin_id_col]][row_index],
      block_duration = seasonality_wide_dt[[season_duration_col]][row_index],
      row_data = row_seasonality_dt,
      admin_col = admin_id_col,
      year_column = year_col,
      month_column = month_col
    )
    # Anything other than exactly one value is treated as "no start month"
    if (length(start_month) == 1L) as.integer(start_month) else NA_integer_
  },
  integer(1)
)
seasonality_wide_dt[, (season_start_month_col) := season_start_month_values]

# Set SEASONAL_BLOCK_START_MONTH to NA for non-seasonal admin units
seasonality_wide_dt[get(seasonality_col) == 0 | is.na(get(seasonality_col)), (season_start_month_col) := NA_integer_]

In [None]:
# Attach the admin unit columns (all = TRUE keeps units present on either side),
# then keep the admin columns followed by the seasonality outputs, in that order
admin_seasonality_wide_dt <- merge.data.table(
  admin_data,
  seasonality_wide_dt,
  by = c(admin_id_col),
  all = TRUE
)
admin_seasonality_wide_dt <- admin_seasonality_wide_dt[
  ,
  c(common_cols, seasonality_cols),
  with = FALSE
]
# head(admin_seasonality_wide_dt)

**Save output**

In [None]:
# Output file names: <COUNTRY_CODE>_<type_of_seasonality>_seasonality.{csv,parquet}
file_stem <- paste(COUNTRY_CODE, type_of_seasonality, 'seasonality', sep = '_')
# paste0() keeps the names as plain character strings
filename_csv <- paste0(file_stem, ".csv")
filename_parquet <- paste0(file_stem, ".parquet")

# Make sure the output folder exists before writing (no-op when it already does)
dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)

# Write the same table in both formats
fwrite(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_csv))
write_parquet(admin_seasonality_wide_dt, file.path(OUTPUT_DATA_PATH, filename_parquet))
log_msg(paste0("Case seasonality results saved in folder ", OUTPUT_DATA_PATH))

In [None]:
# fwrite(row_seasonality_dt, file.path(OUTPUT_DATA_PATH, "row_seasonality.csv"))

In [None]:
# fwrite(seasonality_source_dt, file.path(OUTPUT_DATA_PATH, "processed_seasonality.csv"))