In [1]:
from scipy.stats import truncnorm
import pandas as pd
import numpy as np
import itertools
import datetime
import tqdm
import sys
import os

def flatten_list(list_array):
    """Return a single flat list with the items of every iterable in `list_array`, in order."""
    return [item for inner in list_array for item in inner]

# Put the parent directory and the pompjax sources at the front of the import
# path so the project-local modules imported below and in later cells resolve.
sys.path.insert(0, "../")
sys.path.insert(0,"../pompjax/pompjax/")

# Project configuration object; directories are looked up by property name.
from global_config import config

# Output directories.
results_dir           = config.get_property('results_dir')
results2_dir           = config.get_property('results2_dir')

# Input directories.
data_dir              = config.get_property('data_dir')
paper_dir             = config.get_property('paper_dir')
data_db_dir           = config.get_property('data_db_dir')
# Folder with the 'long_files_8_25_2021' extract inside the data DB directory.
feb_hosp_records_path = os.path.join(data_db_dir, 'long_files_8_25_2021')
# NOTE: `path_to_save` is reassigned further down in this notebook before the
# synthetic inferences are run.
path_to_save          = os.path.join(results_dir, "real_testing", "community")

# Hex colour palette (four colours).
COLOR_LIST1           = ["#F8AFA8", "#FDDDA0", "#F5CDB4", "#74A089"]



In [2]:
from utils_data_metapop import create_population_data, create_time_transfers

# Ward-level input files (counts and transfers) from the 'long_files_8_25_2021' extract.
path_to_ward_counts = os.path.join(data_db_dir, "long_files_8_25_2021", "counts_ward.csv" )
path_to_ward_transf = os.path.join(data_db_dir, "long_files_8_25_2021", "transfers_ward.csv" )

# create_population_data returns five objects. Later cells convert A_df, D_df,
# H_df and tests_df to arrays indexed as [ward, time] and pass them to the
# process/observation models as A, D, N (=H) and num_tests.
# NOTE(review): A/D are presumably admissions/discharges — confirm in utils_data_metapop.
A_df, D_df, H_df, tests_df, Hmean_df = create_population_data(path_to_ward_counts)

# Hmean_df has one entry per ward; its index provides the ward names.
num_wards  = len(Hmean_df)
ward_names = list(Hmean_df.index)
# Transfers between wards; later indexed as M[:, :, t], i.e. a 3-D array despite the `_df` suffix.
M_df       = create_time_transfers(path_to_ward_transf, num_wards=num_wards, ward_names=ward_names)


In [None]:
# Buildings that receive their own id; the id is the position in this list.
selected_buildings = [
    'Allen Hospital-Allen',
    'Harkness Pavilion-Columbia',
    'Milstein Hospital-Columbia',
    'Mschony-Chony',
    'Presbyterian Hospital-Columbia',
]
building2id        = {name: position for position, name in enumerate(selected_buildings)}

def building2observation(building):
    """Map a building name to its observation id.

    Buildings listed in `selected_buildings` return their `building2id` entry;
    every other building returns the shared id 5.
    """
    if building not in selected_buildings:
        return 5
    return building2id[building]

# One row per ward: the ward name, the building part of the name (everything
# after the first "-"), the observation id of that building, and the position
# of the first row carrying that ward name.
ward_names_df                = pd.DataFrame(ward_names, columns=["ward"])
ward_names_df["building"]    = ward_names_df["ward"].apply(lambda ward: ward.partition("-")[2])
ward_names_df["buidling_id"] = ward_names_df["building"].apply(building2observation)
ward_names_df["ward_id"]     = ward_names_df.apply(
    lambda record: np.flatnonzero(ward_names_df.ward == record.ward)[0],
    axis=1,
)

# Lookup used by the observation model: ward position -> building observation id.
wardid2buildingid            = dict(zip(ward_names_df["ward_id"], ward_names_df["buidling_id"]))


In [None]:
from models import process_metapop, observe_metapop_cluster, init_metapop, simulate_metapop, simulate_metapop_observations
from utils_local.misc import amro2cute

# Plain-array views of the ward data; the model closures defined later index
# them as [ward, time] (e.g. H[:, [t]], A[:, [t]]).
delta = 1/120  # decolonization rate (NOTE(review): presumably per day, since dt = 1 — confirm)
A     = A_df.to_numpy()
D     = D_df.to_numpy()
H     = H_df.to_numpy()  # passed to the models as the ward population N
M     = M_df             # used as-is; later indexed as M[:, :, t]
tests = tests_df.to_numpy()


# Monte Carlo error
The question:
- Is the Monte Carlo error the source of bias in the inference?

In [None]:
from diagnostic_plots import convergence_plot
from utils import create_df_response
from ifeakf import ifeakf


def create_obs_infer(obs_sim, idx_infer, dates, model_settings, resample="W-Sun"):
    """Build the observation table used for inference from one simulated ensemble member.

    Parameters
    ----------
    obs_sim : array indexed as [k, T, m]
        Simulated observations (observed series, time, ensemble member), the
        layout required by pompjax.
    idx_infer : int
        Position along the last axis of `obs_sim` to extract.
    dates : sequence of timestamps
        Index of the un-aggregated series (one entry per time step).
    model_settings : dict
        Reads "k" (number of observed series) and "dates" (the last entry
        becomes the label of the last aggregated row).
    resample : str
        Pandas offset alias used to aggregate the series.

    Returns
    -------
    pandas.DataFrame
        Columns y1, oev1, ..., yk, oevk summed over each resampling bin, where
        oev = 1 + (0.2 * y)**2 is computed per time step before aggregation.
        The label of the last bin is replaced by model_settings["dates"][-1].
    """
    infer_df = pd.DataFrame(index=dates)
    for i in range(model_settings["k"]):
        y_col, oev_col    = f"y{i+1}", f"oev{i+1}"
        infer_df[y_col]   = obs_sim[i, :, idx_infer]
        infer_df[oev_col] = 1 + (0.2 * infer_df[y_col].values)**2
    infer_df = infer_df.resample(resample).sum()

    # Relabel the last bin with the final simulation date. The index is rebuilt
    # instead of writing into `index.values`: an Index is meant to be immutable
    # and mutating its backing array can leave cached lookups stale (and fails
    # outright when that array is read-only).
    new_index       = infer_df.index.tolist()
    new_index[-1]   = pd.Timestamp(model_settings["dates"][-1])
    infer_df.index  = pd.DatetimeIndex(new_index)
    return infer_df

In [None]:
def run_amro_synthetic_MCE(f, f0, g, fsim, model_settings, if_settings, id_run=0, path_to_save=None):
    """Simulate one synthetic truth and run IF-EAKF on several of its ensemble members.

    A synthetic ensemble is simulated with the parameters in
    model_settings["param_truth"]. The IF-EAKF is then run once per entry of
    `q_use`, each time using the observations of one randomly drawn ensemble
    member. A convergence plot is written per run and all results are stored in
    "<id_run>posterior.npz" inside `path_to_save`.

    Parameters
    ----------
    f : callable(t, x, θ)
        Process model, used both for the simulation and for the inference.
    f0 : callable(θ)
        Initial-state generator.
    g : callable(t, x, θ)
        Observation model.
    fsim : callable
        Simulator called as fsim(process_model=, observational_model=,
        init_state=, θsim=, model_settings=); must return (x_sim, y_sim) with
        the ensemble member on the last axis.
    model_settings : dict
        Reads "param_truth", "p", "m", "dates" (and whatever `fsim`, `ifeakf`
        and `create_obs_infer` read).
    if_settings : dict
        Iterated-filter settings; "Nif" is read here.
    id_run : int
        Zero-padded prefix of the output file names.
    path_to_save : str
        Existing directory for the plots and the .npz file (required).

    Notes
    -----
    Reads the module-level array `H` to bound the state space.
    The .npz keeps the original keys (`mle` and `posterior` hold the last run)
    and adds `mle_all` / `posterior_all` with every run stacked on a new first
    axis. `idxs_infer` stores the ensemble members that were actually used.
    """
    if path_to_save is None:
        # Fail before the expensive simulation/inference instead of at the first save.
        raise ValueError("path_to_save must be a directory path, got None.")

    # Same dates the simulation was configured with (create_obs_infer already
    # relies on model_settings["dates"] for the final label).
    dates        = model_settings["dates"]

    θtruth       = np.array([model_settings["param_truth"]]).T * np.ones((model_settings["p"], model_settings["m"]))
    # Use the callables received as arguments rather than same-named globals.
    x_sim, y_sim = fsim(process_model         = f,
                        observational_model   = g,
                        init_state            = f0,
                        θsim                  = θtruth,
                        model_settings        = model_settings)

    # Order ensemble members by decreasing total observed counts. The ordering
    # is computed once so that states and observations stay aligned.
    member_order = np.argsort(y_sim.sum(0).sum(0))[::-1]
    y_sim        = y_sim[:, :, member_order]
    x_sim        = x_sim[:, :, :, member_order]

    ρmin              = 0.01 # test sensitivity minimum
    ρmax              = 0.2  # test sensitivity maximum
    βmin              = 0.00 # transmission rate minimum
    βmax              = 0.5  # transmission rate maximum

    # Loop-invariant inference inputs.
    max_total_pop     = np.max(H.sum(axis=0))
    state_space_range = np.array([0, max_total_pop])
    parameters_range  = np.array([[ρmin, ρmax],    [βmin, βmax]])
    σ_perturb         = np.array([(ρmax - ρmin)/4, (βmax - βmin)/4]) # (gut feeling: 0.25 of the range may be too large)
    param_label       = ["ρ", "β"]

    # NOTE(review): the original loop derived quantile positions from `q_use`
    # but immediately replaced each one with a uniformly random ensemble member.
    # The random draw is kept; `q_use` only fixes the number of inference runs.
    q_use      = [5/100, 25/100, 50/100, 75/100, 95/100]

    idxs_used        = []
    tetas_infer      = []
    tetas_post_infer = []

    for _ in q_use:
        idx_infer = np.random.randint(model_settings["m"])
        idxs_used.append(idx_infer)
        obs_df    = create_obs_infer(y_sim.transpose(1, 0, 2), idx_infer, dates, model_settings, resample="W-Sun")

        θmle, θpost = ifeakf(process_model                = f,
                                state_space_initial_guess = f0,
                                observational_model       = g,
                                observations_df           = obs_df,
                                parameters_range          = parameters_range,
                                state_space_range         = state_space_range,
                                model_settings            = model_settings,
                                if_settings               = if_settings,
                                perturbation              = σ_perturb)

        tetas_infer.append(θmle)
        tetas_post_infer.append(θpost)

        # Average each parameter's posterior over the second-to-last axis before plotting.
        ρ_df = create_df_response(θpost[0, :, :, :].mean(-2).T, time=if_settings["Nif"])
        β_df = create_df_response(θpost[1, :, :, :].mean(-2).T, time=if_settings["Nif"])

        convergence_plot(θmle, [ρ_df, β_df], parameters_range, param_label, param_truth=list(θtruth[:, 0]),
                            path_to_save=os.path.join(path_to_save, f"{str(id_run).zfill(3)}convergence_{idx_infer}_.png"))

    np.savez_compressed(os.path.join(path_to_save, f"{str(id_run).zfill(3)}posterior.npz"),
                                    mle           = θmle,                       # last run (original key)
                                    posterior     = θpost,                      # last run (original key)
                                    mle_all       = np.stack(tetas_infer),      # every run
                                    posterior_all = np.stack(tetas_post_infer), # every run
                                    state_space   = x_sim,
                                    observations  = y_sim,
                                    teta_truth    = θtruth,
                                    idxs_infer    = np.array(idxs_used))



In [None]:
def process_model_wrapped(t, x, θ, f, model_settings):
    """Run a process model that works on a 3-D state from a flat (n, m) state.

    The state `x` is reshaped to (n / num_pop, num_pop, m), advanced with
    `f(t, x, θ)`, and reshaped back to (n, m).

    Parameters
    ----------
    t : time index forwarded to `f`.
    x : array with n * m elements.
    θ : parameters forwarded to `f`.
    f : callable(t, x, θ) returning an array with n * m elements.
    model_settings : dict; only "n", "m" and "num_pop" are read.

    Returns
    -------
    Array of shape (n, m).
    """
    n       = model_settings["n"]
    m       = model_settings["m"]
    num_pop = model_settings["num_pop"]

    x = np.reshape(x, (int(n/num_pop), num_pop, m))
    x = f(t, x, θ)
    return np.reshape(x, (n, m))

def obs_model_wrapped(t, x, θ, g, model_settings):
    """Run an observation model that works on a 3-D state from a flat (n, m) state.

    The state `x` is reshaped to (n / num_pop, num_pop, m) and passed to
    `g(t, x, θ)`; whatever `g` returns is returned unchanged.

    Parameters
    ----------
    t : time index forwarded to `g`.
    x : array with n * m elements.
    θ : parameters forwarded to `g`.
    g : callable(t, x, θ).
    model_settings : dict; only "n", "m" and "num_pop" are read.
    """
    n       = model_settings["n"]
    m       = model_settings["m"]
    num_pop = model_settings["num_pop"]

    x = np.reshape(x, (int(n/num_pop), num_pop, m))
    return g(t, x, θ)

def init_state_wrapped(θ, f0, model_settings):
    """Draw an initial state with `f0(θ)` and return it reshaped to (n, m).

    `n` and `m` are read from `model_settings` before `f0` is called.
    """
    flat_shape = (model_settings["n"], model_settings["m"])
    return np.reshape(f0(θ), flat_shape)

In [None]:
from utils_local.misc import amro2title

# Settings of the iterated filter (IF-EAKF).
if_settings = {
        "Nif"                : 30,          # number of iterations of the IF
        "type_cooling"       : "geometric", # type of cooling schedule
        "shrinkage_factor"   : 0.9,         # shrinkage factor for the cooling schedule
        "inflation"          : 1.01         # inflation factor for spreading the variance after the EAKF step
        }

# Settings shared by the simulator and the inference.
model_settings = {
    "param_name"  : ["ρ", "β"],       # ρ is passed as `rho` to the observation model, β as `beta` to the process model
    "p"           : 2,                # number of parameters
    "dt"          : 1,                # time step
    "m"           : 300,              # ensemble size (number of ensemble members)
    "stochastic"  : True              # is stochastic
    }

# Daily simulation grid; one population per ward.
dates_simulation = pd.date_range(start=pd.to_datetime("2020-02-01"), end=pd.to_datetime("2021-02-28"), freq="D")
num_pop          = num_wards

model_settings["n"]           = 3 * num_pop            # number of state variables / dimension of the state space
model_settings["T"]           = len(dates_simulation)  # time to run
model_settings["num_pop"]     = num_pop
model_settings["dates"]       = dates_simulation
model_settings["num_build"]   = len(np.unique(list(wardid2buildingid.values())))  # number of distinct building ids
model_settings["k"]           = model_settings["num_build"] # observing at the building aggregation

# Weekly (Sunday) assimilation dates; the last one is forced to the final simulation date.
assim_dates                       = list(pd.date_range(start=pd.to_datetime("2020-02-01"), end=pd.to_datetime("2021-02-28"), freq="W-Sun"))
assim_dates[-1]                   = dates_simulation[-1]
if_settings["assimilation_dates"] = assim_dates
id_run                            = 0


In [None]:
from utils_data_metapop import empirical_prevalence

amro_search  = ['ESCHERICHIA COLI', 'KLEBSIELLA PNEUMONIAE',  'PSEUDOMONAS AERUGINOSA',
                'METHICILLIN-SUSCEPTIBLE STAPHYLOCOCCUS AUREUS', 'METHICILLIN-RESISTANT STAPHYLOCOCCUS AUREUS',
                "STAPHYLOCOCCUS EPIDERMIDIS", 'ENTEROCOCCUS FAECALIS', 'ENTEROCOCCUS FAECIUM']

# Organism analysed in this notebook and its empirical prevalence.
amro         = amro_search[0]
gamma        = empirical_prevalence(amro, path_to_prev="../data/amro_prevalence.csv")

print("Running IF-EAKF for amro: ", amro2title(amro))

path_to_save = os.path.join(results2_dir, "synthetic_inferences", "metapopulation", f"{amro2cute(amro)}")
scenarios_df = pd.read_csv(os.path.join(path_to_save, "scenarios.csv"))

# Filter options are the same for every scenario, so they are set once.
if_settings["adjust_state_space"] = True
if_settings["shrink_variance"]    = True

# Column vector of ward means; computed once instead of on every process step.
ward_mean_pop = np.expand_dims(Hmean_df, -1)


# The model closures do not depend on the scenario: they read the shared
# `model_settings` dict, whose "param_truth" entry is updated inside the loop.
def f0(θ):
    """Initial state of the metapopulation model (θ is not used)."""
    return init_metapop(N0             = H[:, 0],
                        c0             = gamma,
                        model_settings = model_settings)


def init_state(θ):
    """Initial state reshaped to the flat (n, m) layout."""
    return init_state_wrapped(θ, f0, model_settings)


def f(t, x, θ):
    """One step of the metapopulation process model; β is the second row of θ."""
    return process_metapop(t, x,
                           gamma = gamma * np.ones(model_settings["m"]),
                           beta  = θ[1, :],
                           delta = delta,
                           Nmean = ward_mean_pop,
                           N     = H[:, [t]],
                           A     = A[:, [t]],
                           D     = D[:, [t]],
                           M     = M[:, :, t])


def process(t, x, θ):
    """Process model operating on the flat (n, m) state."""
    return process_model_wrapped(t, x, θ, f, model_settings)


def g(t, x, θ):
    """Building-level observation model; ρ is the first row of θ."""
    return observe_metapop_cluster(t, x,
                                   rho            = θ[0, :],
                                   N              = H[:, [t]],
                                   num_tests      = tests[:, [t]],
                                   model_settings = model_settings,
                                   ward2cluster   = wardid2buildingid)


def obs_model(t, x, θ):
    """Observation model operating on the flat (n, m) state."""
    return obs_model_wrapped(t, x, θ, g, model_settings)


for idx_row, row in scenarios_df.iterrows():
    model_settings["param_truth"] = [row["rho"], row["beta"]]

    # One output folder per scenario (1-based, following the row label of scenarios.csv).
    path_to_samples = os.path.join(path_to_save, "adjust_state_space", f"scenario{idx_row+1}", "MCE")
    os.makedirs(path_to_samples, exist_ok=True)

    run_amro_synthetic_MCE(process, init_state, obs_model, simulate_metapop, model_settings, if_settings, id_run=0, path_to_save=path_to_samples)
