In [1]:

## versions:
## Python    : 3.11.5
## numpy     : 1.26.0
## torch     : 2.1.0
## pandas    : 2.1.1

# licensed under the Creative Commons - Attribution-NonCommercial 4.0
# International license (CC BY-NC 4.0):
# https://creativecommons.org/licenses/by-nc/4.0/. 

import os
import io
import sys
import shutil
import datetime
from typing import Dict, List, Optional
from copy import deepcopy

import numpy as np
import pandas as pd
import torch as t
from torch.utils.data import DataLoader
from scipy import stats
import matplotlib.pyplot as plt

from common.torch.ops import empty_gpu_cache
from common.sampler import ts_dataset
from common.torch.snapshots import SnapshotManager
from experiments.trainer import trainer_var
from experiments.model import generic_dec_var
from models.exog import TCN_encoder

from data_utils.forecast import tryJSON, Struct, read_config, default_settings, str_indexed_csv
from data_utils.forecast import init_target_data, load_exog_data, make_training_fn, generate_quantiles
from data_utils.forecast import pickle_results, read_pickle, output_figs
from data_utils.covid_hub import domain_defaults, domain_defaults_pretrain, specify_ensemble, output_df
from data_utils.covid_hub import read_covid_weekly, read_weather_data, download_forecast_hub, download_covid_surveil, download_weather



In [2]:
import warnings
# inline figures are rendered in SVG format
%config InlineBackend.figure_formats = ["svg"]
plt.style.use("dark_background")
# shorten warning output to a single "Category: message" line
warnings.formatwarning = lambda message, category, *args, **kwargs: "{}: {}\n".format(category.__name__, message)
# FutureWarning messages are suppressed for the whole session
warnings.filterwarnings("ignore",category=FutureWarning)
#%load_ext watermark
#%watermark -n -u -v -iv -w

(if needed) read latest data

In [4]:
dest = os.path.join("storage","download")
#download_forecast_hub(dest)
#download_covid_surveil(dest)
#download_weather(2024, dest)

idx, _ = read_covid_weekly()
read_weather_data(idx)

`read_config()` returns configuration settings that don't change between models within an ensemble

values are read from the JSON file passed to it (e.g. `config_covid.json`), if that file is available

see comments in `data_utils/forecast.py` for an explanation of entries

In [None]:
#rstate = read_config("config_covid.json")
rstate = read_config("config_covid_pretrain.json") ## pretraining dataset (aggregated by hhs region)

In [None]:
rstate

you can change the settings here or in `config.json`

e.g., `rstate.cut` sets the train/test split index (None = train on all data)

In [None]:
rstate.cut = None  # 166 # 170 # 176 #   (126 = end of 2022, 178 = end of 2023)

`default_settings()` returns settings that can be changed between models within an ensemble

defaults are read from the JSON file passed to it (e.g. `settings_covid.json`), if that file is available

see comments in `data_utils/forecast.py` for an explanation of entries

In [None]:
settings = default_settings("settings_covid.json")

can change settings in json file or here

In [3]:
## try adjusting the amount of training based on the amount of training data history
## (lowering learning rate seems to work better than decreasing # of iterations)
def adapt_iter(x):
    """Return an iteration count that scales linearly with the split index ``x``.

    The count is 200 at ``x == 126`` and changes by 14/3 for every unit of
    ``x`` above or below that point; the result is rounded and returned as
    a plain ``int``.
    """
    extra = (x - 126) * 14.0 / 3.0
    total = np.round(200 + extra)
    return int(total)

def adapt_lr(x):
    """Return a learning rate that scales linearly with the split index ``x``.

    The rate is 0.00005 at ``x == 126`` and changes by 1.5e-6 per unit of
    ``x``; the result is rounded to 7 decimal places.
    """
    base_lr = 0.00005
    lr_per_step = 1.5e-6
    ## earlier schedule, kept for reference: np.round(0.0001 + (x - 126) * 2.8e-6, 7)
    return np.round(base_lr + (x - 126) * lr_per_step, 7)

In [None]:
#try increasing the learning rate when there's more training data
if rstate.cut is not None:
    settings.init_LR = adapt_lr(rstate.cut)

we will change `settings.exog_vars` below, to specify which exogenous predictors to use

In [None]:
settings

`domain_defaults()` is meant to be a user-defined function

returns a struct with instructions for reading or generating exogenous variables

see `data_utils/covid_hub.py` for an example/explanation

In [None]:
#domain_specs = domain_defaults()
domain_specs = domain_defaults_pretrain() ## pretraining dataset (aggregated by hhs region)

`exog_vars` specifies which exogenous predictors to use by default

the predictors in `var_names` are loaded/generated and available to use

In [None]:
domain_specs

`init_target_data()` reads in and optionally transforms target data

sets timepoint indices and series identifiers; writes data to `rstate`


In [None]:
rstate, settings = init_target_data(rstate, settings)

`rstate.data_index` was set based on the index of `rstate.target_file`

for exogenous data, the files and functions specified in `domain_defaults()` must generate data frames with the same index

In [None]:
rstate.data_dir+"/"+rstate.target_file, rstate.data_index

`load_exog_data()` appends exogenous predictors to rstate, using the data index generated above

In [None]:
rstate, settings = load_exog_data(rstate, settings, domain_specs)

`settings.exog_vars` now has the defaults from domain_specs (if this was not set in `settings.json`)

In [None]:
settings.exog_vars

the data has been read into `rstate` as a dict keyed by series name

each series is a data frame with rows as timepoints and columns as variables

In [None]:
#rstate.series_dfs["24"]
rstate.series_dfs["Region 3"]

if the forecast targets are per-capita, series weights are needed to combine the individual series into a national (per-capita) forecast

In [None]:
if rstate.series_weights is not None:
    print(pd.DataFrame({"weight":rstate.series_weights.squeeze()},index=rstate.series_names)[:10])

the name of the target column was set automatically by `init_target_data()`

In [None]:
rstate.target_var

`make_training_fn()` returns a function that trains a model  (it closes over training data and config settings)

the resulting function takes `settings` and returns mean & variance forecasts

the forecasts are matrices with rows = series and columns = timepoints

the trained models are saved in `rstate.snapshot_dir`

the training function can be used on its own or called in a loop with different settings to generate an ensemble


In [None]:
training_fn = make_training_fn(rstate)

to use a snapshot/pretrained model with no additional training, set `settings.iterations` to 0

In [None]:

#settings.iterations = 0
#settings.iterations = 200


to train an ensemble of models, we will generate a list of `settings`, one for each model

`specify_ensemble` is a user-defined function that generates the list, based on info in `domain_specs`

see `data_utils/covid_hub.py` for an example

In [None]:
## maybe we don't really need 5 random reps
domain_specs.random_reps = 5

## generate a list of settings structs having the desired variation for ensemble
## save the list to rstate for posterity
rstate.settings_list = specify_ensemble(settings, domain_specs)


can also define some other ensemble:

In [4]:
## setting size of hidden layer based on size of lookback window:
def custom_ensemble(template, specs):
    """Build one settings object per (repetition, lookback option) pair.

    Every entry is an independent deep copy of ``template`` whose
    ``lookback`` is set to the option and whose ``nbeats_hidden_dim`` is
    set to ``option * 2 * 6 * 5``.  The list is ordered repetition by
    repetition, cycling through ``specs.lookback_opts`` within each one.

    Args:
        template: settings object to copy; it is not modified.
        specs: object providing ``random_reps`` and ``lookback_opts``.

    Returns:
        list of settings objects, ``random_reps * len(lookback_opts)`` long.
    """
    def _variant(window):
        ## copy first so that the template and the other variants stay untouched
        cfg = deepcopy(template)
        cfg.lookback = window
        cfg.nbeats_hidden_dim = window * 2 * 6 * 5
        return cfg

    return [_variant(opt)
            for _ in range(specs.random_reps)
            for opt in specs.lookback_opts]


In [None]:
rstate.settings_list = custom_ensemble(settings, domain_specs)
rstate.settings_list[3]

(optional) a pretrained model file for each model in the ensemble

each must have the same structure (lookback window, hidden dims, etc.) as the corresponding ensemble entry

In [None]:

def pretrained_list(pretrain_dir, specs, prefix="nbxd", suffix="1266"):
    """List the pretrained model file for every member of the ensemble.

    One path is produced per (repetition, lookback option) pair, numbered
    consecutively from 0 in the order repetition-major, option-minor:
    ``<pretrain_dir>/<prefix>_<i>_<suffix>/model``.

    Args:
        pretrain_dir: directory that holds the per-model snapshot folders.
        specs: object providing ``random_reps`` and ``lookback_opts``.
        prefix: leading part of each snapshot folder name.
        suffix: trailing part of each snapshot folder name; anything that
            can be formatted with ``str()`` is accepted.

    Returns:
        list of paths, ``random_reps * len(lookback_opts)`` long.
    """
    ## only the number of ensemble members matters for the file names
    slots = [opt for _ in range(specs.random_reps) for opt in specs.lookback_opts]
    return [os.path.join(pretrain_dir, "{}_{}_{}".format(prefix, i, suffix), "model")
            for i in range(len(slots))]


In [None]:
## set to a directory to start from pretrained weights, e.g.
## os.path.join("storage","pretrained_hhs_surveil_weekly")
pretrain_dir = None

## default: no pretrained file for any ensemble member
rstate.pretrained_models = [None for _ in rstate.settings_list]

if pretrain_dir is not None:
    rstate.pretrained_models = pretrained_list(pretrain_dir, domain_specs)

rstate.pretrained_models

empty dicts for storing the forecasts from each model:

In [None]:
mu_fc={}
var_fc={}

In [None]:
empty_gpu_cache() ## just in case?

train each model in the ensemble and write its forecast to `mu_fc` and `var_fc` (keyed w/ a semi-descriptive name):

In [None]:

## ensemble loop: train each member and keep its mean / variance forecast
for i, set_i in enumerate(rstate.settings_list):
    ## the name ends with the split index, or the last data index when cut is None
    if rstate.cut is not None:
        model_suffix = str(rstate.cut)
    else:
        model_suffix = str(rstate.data_index[-1])
    model_name = "_".join([rstate.output_prefix, str(i), model_suffix])
    print("training ",model_name)
    mu_fc[model_name], var_fc[model_name] = training_fn(model_name, set_i, rstate.pretrained_models[i])


forecast shape for each model is [series, time]

ensemble the dict values using median across models

write results to `rstate`

In [None]:

## median across models (stack axis 0 = model).
## An "ensemble" entry left over from an earlier run of this cell is skipped,
## otherwise re-running the cell would count the old ensemble as one more model.
mu_fc["ensemble"] = np.median(np.stack([v for k, v in mu_fc.items() if k != "ensemble"]),axis=0)
var_fc["ensemble"] = np.median(np.stack([v for k, v in var_fc.items() if k != "ensemble"]),axis=0)

rstate.mu_fc = mu_fc
rstate.var_fc = var_fc


`generate_quantiles()` goes through each entry in `rstate.mu_fc` and `rstate.var_fc`

and generates dicts containing forecast quantiles for each model (and "ensemble")

see comments in `data_utils/forecast.py` for details

In [None]:
rstate = generate_quantiles(rstate)

optional: save rstate, which contains all training data, forecasts, and ensemble settings

`pickle_results()` writes it to output dir

In [None]:
pickle_results(rstate)

plot some forecasts

In [None]:
#output_figs(rstate, settings.horizon, [20, 4], 60)
output_figs(rstate, settings.horizon, [2, 8], 60)

delete the trained models if we no longer need them:

In [None]:
## best-effort cleanup: a failure here must not stop the notebook,
## but it is reported instead of being silently discarded
if rstate.delete_models:
    try:
        shutil.rmtree(rstate.snapshot_dir)
    except FileNotFoundError:
        pass  ## nothing to delete
    except Exception as err:
        ## Exception (not a bare except) so that KeyboardInterrupt still propagates
        warnings.warn("could not delete model snapshots: {}".format(err))


automate the above

In [5]:
def init_rstate(configfile, cut, settings, domain_specs, ensemble_fn=specify_ensemble, cut_weight_fn=None):
    """Read the run configuration, load all data and set up the ensemble.

    Args:
        configfile: file name passed to ``read_config``.
        cut: train/test split index stored as ``rstate.cut``.
        settings: model settings; passed through ``init_target_data`` and
            ``load_exog_data``, which return the (possibly updated) object.
        domain_specs: specification used for loading exogenous data and for
            building the ensemble.
        ensemble_fn: called as ``ensemble_fn(settings, domain_specs)`` to
            produce the list of per-model settings.
        cut_weight_fn: optional; when given, its result for
            ``(rstate, settings)`` is stored in ``settings.cut_weights``
            before the ensemble list is built.

    Returns:
        ``(rstate, settings)``; ``rstate.pretrained_models`` holds one
        ``None`` per ensemble member.
    """
    rstate = read_config(configfile)
    rstate.cut = cut

    ## target data first, then the exogenous predictors
    rstate, settings = init_target_data(rstate, settings)
    rstate, settings = load_exog_data(rstate, settings, domain_specs)

    if cut_weight_fn is not None:
        settings.cut_weights = cut_weight_fn(rstate, settings)

    rstate.settings_list = ensemble_fn(settings, domain_specs)
    ## no pretrained weights unless the caller fills this in afterwards
    rstate.pretrained_models = [None for _ in rstate.settings_list]

    return rstate, settings


def generate_ensemble(rstate, ens_fn=np.median):
    """Train every model in ``rstate.settings_list`` and combine the forecasts.

    Each model's mean and variance forecast is stored under its model name;
    the combined forecast is stored under the key ``"ensemble"``.  Both
    dicts are written to ``rstate`` (``mu_fc`` / ``var_fc``) before
    ``generate_quantiles`` is applied.

    Args:
        rstate: run state with ``settings_list``, ``pretrained_models``,
            ``output_prefix``, ``cut`` and ``data_index``.
        ens_fn: reduction called as ``ens_fn(stacked, axis=0)``, where axis 0
            of ``stacked`` indexes the models.

    Returns:
        the value returned by ``generate_quantiles(rstate)``.
    """
    empty_gpu_cache()
    training_fn = make_training_fn(rstate)
    mu_fc, var_fc = {}, {}

    ## ensemble loop
    for i, set_i in enumerate(rstate.settings_list):
        ## the name ends with the split index, or the last data index when cut is None
        if rstate.cut is not None:
            model_suffix = str(rstate.cut)
        else:
            model_suffix = str(rstate.data_index[-1])
        model_name = "_".join([rstate.output_prefix, str(i), model_suffix])
        print("training ",model_name)
        mu_fc[model_name], var_fc[model_name] = training_fn(model_name, set_i, rstate.pretrained_models[i])

    ## each stack is built before its "ensemble" key is added, so only the models are reduced
    mu_fc["ensemble"] = ens_fn(np.stack(list(mu_fc.values())), axis=0)
    var_fc["ensemble"] = ens_fn(np.stack(list(var_fc.values())), axis=0)

    rstate.mu_fc = mu_fc
    rstate.var_fc = var_fc

    return generate_quantiles(rstate)


def delete_model_dir(rstate):
    """Remove ``rstate.snapshot_dir`` when ``rstate.delete_models`` is truthy.

    The cleanup is best-effort: it never raises for an ordinary failure.
    A directory that is already gone is ignored; any other failure is
    reported with ``warnings.warn`` rather than silently discarded.
    """
    if not rstate.delete_models:
        return
    try:
        shutil.rmtree(rstate.snapshot_dir)
    except FileNotFoundError:
        pass  ## nothing to delete
    except Exception as err:
        ## Exception (not a bare except) so that KeyboardInterrupt still propagates
        warnings.warn("could not delete model snapshots: {}".format(err))


In [6]:


def run_test(configfile, settingsfile, cut, random_reps=None, ensemble_fn=specify_ensemble, series_figs=None, n_iter=None, pretrain_list_fn=None, cut_weight_fn=None, ens_reduce=np.median, adj_iter=False, adj_LR=True, init_LR=None, domain_fn=domain_defaults):
    """Train an ensemble end to end and return its forecast table.

    Steps: read settings, build the run state, train every ensemble member,
    pickle the results, draw the figures, build the output data frame and
    finally delete the model snapshots (if ``rstate.delete_models`` is set).

    Args:
        configfile: file name passed to ``read_config``.
        settingsfile: file name passed to ``default_settings``.
        cut: train/test split index, or None.
        random_reps: if given, overrides ``domain_specs.random_reps``.
        ensemble_fn: builds the list of per-model settings.
        series_figs: passed to ``output_figs``; None means an empty list.
        n_iter: if given, overrides ``settings.iterations`` (this takes
            precedence over ``adj_iter``).
        pretrain_list_fn: if given, called with ``domain_specs`` to produce
            the pretrained model file for each ensemble member.
        cut_weight_fn: forwarded to ``init_rstate``.
        ens_reduce: reduction used to combine the model forecasts.
        adj_iter: when True and ``cut`` is not None, set the iteration count
            with ``adapt_iter(cut)``.
        adj_LR: when True and ``cut`` is not None, set the initial learning
            rate with ``adapt_lr(cut)``.
        init_LR: if given, overrides ``settings.init_LR`` (this takes
            precedence over ``adj_LR``).
        domain_fn: zero-argument function returning the domain specs, e.g.
            ``domain_defaults`` or ``domain_defaults_pretrain``.

    Returns:
        ``(df, date_stamp)`` as returned by ``output_df(rstate, 14)``.
    """
    ## a fresh list per call instead of a shared mutable default
    if series_figs is None:
        series_figs = []

    ## if adj_*, train more when there is more data; otherwise use values from settings.json
    settings = default_settings(settingsfile)
    if cut is not None:
        if adj_iter: settings.iterations = adapt_iter(cut)
        if adj_LR: settings.init_LR = adapt_lr(cut)
    ## explicit values win over the adaptive ones
    if n_iter is not None: settings.iterations = n_iter
    if init_LR is not None: settings.init_LR = init_LR

    domain_specs = domain_fn()
    if random_reps is not None: domain_specs.random_reps = random_reps

    rstate, settings = init_rstate(configfile, cut, settings, domain_specs, ensemble_fn, cut_weight_fn)

    if pretrain_list_fn is not None:
        rstate.pretrained_models = pretrain_list_fn(domain_specs)

    rstate = generate_ensemble(rstate, ens_reduce)

    pickle_results(rstate)
    output_figs(rstate, rstate.settings_list[0].horizon, 
                series_figs, 
                60,
                colors=["white","yellow"],figsize=(5,3),plot_mean=True)

    ## NOTE(review): the meaning of the 14 is defined by output_df in data_utils/covid_hub.py
    df, date_stamp = output_df(rstate, 14)
    
    delete_model_dir(rstate)

    return (df, date_stamp) 


In [None]:
## pretrained model file for each model in the ensemble

def pre2023(specs):
    """List the model files under ``storage/weekly_models`` for an ensemble.

    One path per (repetition, lookback option) pair, numbered consecutively
    from 0: ``storage/weekly_models/nbxd_<i>_152/model``.
    (A ``_178`` folder suffix was used as an alternative.)

    Args:
        specs: object providing ``random_reps`` and ``lookback_opts``.

    Returns:
        list of paths, ``random_reps * len(lookback_opts)`` long.
    """
    pretrain_dir = os.path.join("storage","weekly_models")
    ## only the running member number enters the file name
    members = (opt for _ in range(specs.random_reps) for opt in specs.lookback_opts)
    return [os.path.join(pretrain_dir, "nbxd_" + str(n) + "_152", "model")
            for n, _ in enumerate(members)]


In [7]:

def pre2024(specs):
    """List the model files under ``storage/pretrained_hhs_surveil_weekly``.

    One path per (repetition, lookback option) pair, named
    ``covid2024_<lookback>H_<repetition>.pt`` with repetitions counted
    from 1.  The list is ordered repetition by repetition.

    Args:
        specs: object providing ``random_reps`` and ``lookback_opts``.

    Returns:
        list of paths, ``random_reps * len(lookback_opts)`` long.
    """
    pretrain_dir = os.path.join("storage","pretrained_hhs_surveil_weekly")
    return [os.path.join(pretrain_dir, "covid2024_"+str(opt)+"H_"+str(rep+1)+".pt")
            for rep in range(specs.random_reps)
            for opt in specs.lookback_opts]


In [None]:
df, date_stamp = run_test("config_covid.json", "settings_covid.json", 
                          None, 5, custom_ensemble, series_figs=[], n_iter=100, 
                          pretrain_list_fn=pre2024, adj_LR=False)

In [9]:
## first column of the csv, renamed "pop", plus a "US" entry holding the sum of all rows
pops = str_indexed_csv("storage/training_data/fips_pops.csv").iloc[:,0].rename("pop")
pops["US"] = pops.sum()

## attach "pop" to every forecast row; keep the original value as "per100k"
## and rescale "value" by pop / 100000
df_hub = df.merge(pops, left_on="series_name", right_index=True)
df_hub = df_hub.assign(
    per100k=df_hub["value"],
    value=(df_hub["value"] * (df_hub["pop"] / 100000.0)).round(2),
    output_type_id=pd.to_numeric(df_hub["output_type_id"], errors="coerce"),
)

keep_cols = [
    "reference_date",
    "target",
    "horizon",
    "target_end_date",
    "location",
    "output_type",
    "output_type_id",
    "value",
]

filename = date_stamp.strftime("%Y-%m-%d") + "-OHT_JHU-nbxd.csv"

## only quantile rows with horizon below 4 are written
hub_rows = df_hub["output_type"].eq("quantile") & df_hub["horizon"].lt(4)
df_hub.loc[hub_rows, keep_cols].to_csv(os.path.join("storage","output",filename),index=False)

graph training losses

note, ensembling not-quite-converged models seems to work better than running more iterations


In [None]:

def plot_losses(pickle_file,ylim=None):
    """Plot the training and validation losses of every model in a saved run.

    The figure has one row per ensemble member: training losses in the left
    column, validation losses in the right column.  It is saved as
    ``losses_<prefix>_<suffix>.png`` in ``rstate.output_dir``.

    Args:
        pickle_file: path of a pickled run state, read with ``read_pickle``.
        ylim: passed to ``set_ylim`` of each validation-loss panel.
    """
    rstate = read_pickle(pickle_file)
    model_prefix = rstate.output_prefix
    model_suffix = str(rstate.cut) if rstate.cut is not None else str(rstate.data_index[-1])
    n_models = len(rstate.settings_list)
    ## squeeze=False keeps ax two-dimensional, so ax[i,0] also works for a single model
    fig, ax = plt.subplots(nrows=n_models,ncols=2,figsize=[8,2*n_models],squeeze=False)
    for i, set_i in enumerate(rstate.settings_list):
        ## same naming scheme as the one used when the models were trained
        model_name =  model_prefix+"_"+str(i)+"_"+model_suffix
        total_iter = set_i.iterations
        snapshot_manager = SnapshotManager(snapshot_dir=os.path.join(rstate.snapshot_dir, model_name), total_iterations=total_iter)
        ldf = snapshot_manager.load_training_losses()
        vdf = snapshot_manager.load_validation_losses()
        ax[i,0].plot(ldf)
        ax[i,1].plot(vdf)
        ax[i,1].set_ylim(ylim)
    #plt.show()
    fig.savefig(os.path.join(rstate.output_dir , "losses_"+model_prefix+"_"+model_suffix+".png"))


In [None]:
plot_losses(os.path.join("storage", "output", "nbxd_166.pickle"))


In [None]:
#rstate.delete_models = True
#delete_model_dir(rstate)

In [None]:
## pull pretrained models out of snapshot directories

model_dir = "storage/model_snapshots"
n = 5
opts = [3,4,5,6]
## number given to the first repetition in the copied file names
## (named first_rep / dest_file so that the notebook-level idx and dest
##  set in the data-loading cell are not overwritten)
first_rep = 1

## snapshots are numbered consecutively across repetitions and lookback options
i = 0
for j in range(n):
    for opt in opts:
        filepath = os.path.join(model_dir,"nbxd_"+str(i)+"_2024-11-09","model")
        dest_file = os.path.join(model_dir,"covid2024_"+str(opt)+"H_"+str(j+first_rep)+".pt")
        shutil.copyfile(filepath, dest_file)
        i = i + 1
