In [1]:
from scipy.stats import truncnorm
import pandas as pd
import numpy as np
import itertools
import datetime
import tqdm
import sys
import os

def flatten_list(list_array):
    return list(itertools.chain(*list_array))

sys.path.insert(0, "../")
sys.path.insert(0,"../pompjax/pompjax/")

from global_config import config

results_dir           = config.get_property('results_dir')
results2_dir           = config.get_property('results2_dir')

data_dir              = config.get_property('data_dir')
paper_dir             = config.get_property('paper_dir')
data_db_dir           = config.get_property('data_db_dir')
feb_hosp_records_path = os.path.join(data_db_dir, 'long_files_8_25_2021')
path_to_save          = os.path.join(results_dir, "real_testing", "community")

COLOR_LIST1           = ["#F8AFA8", "#FDDDA0", "#F5CDB4", "#74A089"]

In [2]:
def create_population_data(path_to_file, date_start=pd.to_datetime("2020-02-01"), date_end=pd.to_datetime("2021-02-28")):

    dates_simulation = pd.date_range(start=date_start, end=date_end, freq="D")

    data_df  = pd.read_csv( path_to_file, parse_dates=['date'])
    data_df  = data_df[data_df.date.isin(dates_simulation)]
    A_df     = pd.pivot(data_df, index='ward', columns='date', values='num_admitted')
    D_df     = pd.pivot(data_df, index='ward', columns='date', values='num_discharged')
    H_df     = pd.pivot(data_df, index='ward', columns='date', values='num_hospitalized')
    tests_df = pd.pivot(data_df, index='ward', columns='date', values='num_tested')
    Hmean_df = H_df.mean(axis=1)

    return A_df, D_df, H_df, tests_df, Hmean_df

def create_time_transfers(path_to_file, num_wards, ward_names, date_start=pd.to_datetime("2020-02-01"), date_end=pd.to_datetime("2021-02-28")):

    dates_simulation = pd.date_range(start=date_start, end=date_end, freq="D")
    transfers_df     = pd.read_csv(path_to_file, parse_dates=['date'])
    transfers_df     = transfers_df[transfers_df.date.isin(dates_simulation)]
    M_df             = np.zeros((num_wards, num_wards, len(dates_simulation)+1))

    for i in range(num_wards):
        ward_from = ward_names[i]
        for j in range(num_wards):
            ward_to      = ward_names[j]
            transfers_ij = transfers_df[(transfers_df.ward_from==ward_from) & (transfers_df.ward_to==ward_to)]

            if(transfers_ij.shape[0] > 0) :
                dates_ij                = transfers_ij.date.values
                dates_ind               = np.where(np.in1d(dates_ij, dates_simulation))[0]
                transfered              = transfers_ij.num_transfered.values
                M_df[i, j, dates_ind-1] = transfered

    return M_df


In [3]:
path_to_ward_counts = os.path.join(data_db_dir, "long_files_8_25_2021", "counts_ward.csv" )
path_to_ward_transf = os.path.join(data_db_dir, "long_files_8_25_2021", "transfers_ward.csv" )


A_df, D_df, H_df, tests_df, Hmean_df = create_population_data(path_to_ward_counts)

num_wards  = len(Hmean_df)
ward_names = list(Hmean_df.index)
M_df       = create_time_transfers(path_to_ward_transf, num_wards=num_wards, ward_names=ward_names)

# we want to choose synthetic scenarios that overall reproduce the synthetic observations, so we are going to use the stuff above (in the GridSearch) to sample from the parameter space and create the synthetic scenarios randomly.

In [4]:
from scipy.interpolate import UnivariateSpline

def return_score_cutoff(score, cut_off_prob=0.05):
    freq, score = np.histogram(score, bins=100, density=True)
    freq_cum    = np.cumsum(freq); freq_cum = freq_cum/freq_cum[-1]
    score       = score[1:]
    f_cum       = UnivariateSpline(score, freq_cum, s=0.001)
    sc_range    = np.linspace(np.min(score), np.max(score), 1000)
    score_cut   = sc_range[np.argmin(np.abs(f_cum(sc_range) * 100 - cut_off_prob*100))]
    return score_cut


In [5]:
selected_buildings = ['Allen Hospital-Allen', 'Harkness Pavilion-Columbia', 'Milstein Hospital-Columbia', 'Mschony-Chony', 'Presbyterian Hospital-Columbia']
building2id        = {selected_buildings[i]: i for i in range(len(selected_buildings))}

def building2observation(building):
    if building in selected_buildings:
        return building2id[building]
    else:
        return 5

ward_names_df                = pd.DataFrame(ward_names, columns=["ward"])
ward_names_df["building"]    = ward_names_df["ward"].apply(lambda x: "-".join(x.split("-")[1:]))
ward_names_df["buidling_id"] = ward_names_df["building"].apply(lambda x: building2observation(x) )
ward_names_df["ward_id"]     = ward_names_df.apply(lambda x: np.where(ward_names_df.ward == x.ward)[0][0], axis=1)
wardid2buildingid            = {row.ward_id: row.buidling_id for i, row in ward_names_df.iterrows()}


In [6]:
from models import process_metapop, observe_metapop, init_metapop, simulate_metapop, simulate_metapop_observations
from misc import amro2cute


dates_simulation = pd.date_range(start=pd.to_datetime("2020-02-01"), end=pd.to_datetime("2021-02-28"), freq="D")
num_pop          = num_wards

if_settings = {
   "Nif"                : 20,          # number of iterations of the IF
   "type_cooling"       : "geometric", # type of cooling schedule
   "shrinkage_factor"   : 0.9,         # shrinkage factor for the cooling schedule
   "inflation"          : 1.01,        # inflation factor for spreading the variance after the EAKF step
}

model_settings = {
    "param_name"  : ["ρ", "β"],            # importation and transmission rate
    "p"           : 2,                     # number of parameters
    "k"           : num_pop,               # number of observations | We are just observing carriage
    "n"           : 3*num_pop,             # number of state variables / dimension of the state space
    "dt"          : 1,                     # time step
    "T"           : len(dates_simulation), # time to run
    "m"           : 300,                   # number of ensembles
    "stochastic"  : True,                  # is stochastic
    "num_pop"     : num_pop,
    "dates"       : dates_simulation
    }

delta = 1/120  # decolonization rate

A     = A_df.to_numpy()
D     = D_df.to_numpy()
H     = H_df.to_numpy()
M     = M_df
tests = tests_df.to_numpy()

# Process model for the ifeakf | model(x, gamma, beta, delta, rho, sigma, pop, m=1, stochastic=True)
process_model_gamma = lambda t, x, θ, gamma : process_metapop(t, x,
                                            gamma = gamma * np.ones(model_settings["m"]),
                                            beta  = θ[1, :],
                                            delta = delta,
                                            Nmean = np.expand_dims(Hmean_df, -1),
                                            N     = H[:, [t]],
                                            A     = A[:, [t]],
                                            D     = D[:, [t]],
                                            M     = M[:, :, t])

# f0 model for the ifeakf            | initial_condition(c0, pop=2000, m=300)
initial_guess_x0_gamma  = lambda θ, gamma:  init_metapop(
                                                N0             = H[:, 0],
                                                c0             = gamma, # importation rate
                                                model_settings = model_settings)

# Observational model for the ifeakf |  g(t, x, rho)
observational_model  = lambda t, x, θ: observe_metapop(t, x,
                                                rho            = θ[0, :],
                                                N              = H[:, [t]],
                                                num_tests      = tests[:, [t]],
                                                model_settings = model_settings,
                                                ward2cluster   = wardid2buildingid)

def observe_metapop_cluster(t, x, N, rho, num_tests, ward2cluster):
    """ Observational model
        Args:
            t (int):      Time
            x (np.array): State space
            rho (float):  Observation probability
        Returns:
            y (np.array): Observed carriers ~ Binomial(C, rho).
    """

    m         = model_settings["m"]
    num_pop   = model_settings["num_pop"]
    num_build = model_settings["num_build"]

    with np.errstate(divide='ignore', invalid='ignore'):
        c = np.clip(np.nan_to_num(x[0, :, :]/N), 0, 1)

    observed_colonized = np.random.binomial(list(num_tests * np.ones((num_pop, m))), rho * c)  # Shape [num_pop, m]
    # need to resample this to [num_buildings x m] (maybe using the same buildings that rami used)
    obs_col_building = np.zeros((num_build, m))

    for i in range(num_build):
        obs_col_building[ward2cluster[i], :] += observed_colonized[i, :]

    return obs_col_building


In [7]:
from diagnostic_plots import convergence_plot
from utils import create_df_response
from ifeakf import ifeakf


def run_amro_synthetic(amro, id_run=0):

    cut_off_prob = 5/100
    amro_prev_df = pd.read_csv(os.path.join("..", "data", "amro_prevalence.csv"))
    dates_infer  = pd.date_range(start=pd.to_datetime("2020-02-01"), end=pd.to_datetime("2021-02-28"), freq="D")
    gs_df        = pd.read_csv( os.path.join(results2_dir, "grid_search", "metapopulation", f"{amro2cute(amro)}.csv") ).drop(columns=["Unnamed: 0"])
    gamma        = amro_prev_df[amro_prev_df.amro==amro]["prevalence_mean1"].values[0]/100

    sc_cutoff    = return_score_cutoff(gs_df.crps, cut_off_prob=cut_off_prob)
    gs_df        = gs_df[gs_df.crps <= sc_cutoff].reset_index(drop=True)
    scenarios_df = gs_df.copy()
    scenarios_df = scenarios_df.sample(n=10); scenarios_df = scenarios_df[["rho", "beta", "crps", "calibration_score"]].reset_index(drop=True)

    path_to_save = os.path.join(results2_dir, "synthetic_inferences", "no_state_space", f"{amro2cute(amro)}")
    os.makedirs(path_to_save, exist_ok=True)

    scenarios_df.to_csv(os.path.join(path_to_save, f"scenarios{id_run}.csv"))

    for idx_row, row in scenarios_df.iterrows():

        model_settings["param_truth"] = [row["rho"], row["beta"]]
        model_settings["num_build"] = len(np.unique(list(wardid2buildingid.values())))
        model_settings["k"]         = model_settings["num_build"] # observing at that aggregation

        process_model        = lambda t, x, θ: process_model_gamma(t, x, θ, gamma=gamma)
        init_conditions      = lambda θ: initial_guess_x0_gamma(θ, gamma=gamma)
        observational_model  = lambda t, x, θ: observe_metapop_cluster(t, x,
                                                    N            = H[:, [t]],
                                                    rho          = θ[0, :],
                                                    num_tests    = tests[:, [t]],
                                                    ward2cluster = wardid2buildingid)

        θtruth       = np.array([model_settings["param_truth"]]).T * np.ones((model_settings["p"], model_settings["m"]))
        x_sim, y_sim = simulate_metapop(
                        process_model       = process_model,
                        observational_model = observational_model,
                        init_state          = init_conditions,
                        θsim                = θtruth,
                        model_settings      = model_settings)

        idx_infer      = np.random.randint(y_sim.shape[1])
        obs_infer      = y_sim[:, :, idx_infer].transpose(1, 0)

        obs_df = pd.DataFrame(index=dates_infer)
        for i in range(model_settings["num_build"]) :
            obs_df['y'+str(i+1)]   = obs_infer[i, :]
            obs_df['oev'+str(i+1)] = 1 +(0.2 * obs_df['y'+str(i+1)].values)**2
        obs_df                  = obs_df.resample("W-Sun").sum()
        obs_df.index.values[-1] = model_settings["dates"][-1]


        ρmin           = 0.01 # test sensitivity minimum
        ρmax           = 0.2  # test sensitivity maximum
        βmin           = 0.00 # transmission rate minimum
        βmax           = 0.5  # transmission rate maximum

        max_total_pop     = np.max(H.sum(axis=0))
        state_space_range = np.array([0, max_total_pop])
        parameters_range  = np.array([[ρmin, ρmax], [βmin, βmax]])
        σ_perturb         = np.array([(ρmax-ρmin)/4, (βmax-βmin)/4])

        if_settings["assimilation_dates"] = obs_df.index.values
        if_settings["adjust_state_space"] = False  # for comparing with the abm

        path_to_save_sce = os.path.join(results2_dir, "synthetic_inferences", "no_state_space", f"{amro2cute(amro)}", f"scenario{idx_row+1}")
        os.makedirs(path_to_save_sce, exist_ok=True)

        θmle, θpost = ifeakf(process_model                = process_model,
                                observational_model       = observational_model,
                                state_space_initial_guess = init_conditions,
                                observations_df           = obs_df,
                                parameters_range          = parameters_range,
                                state_space_range         = state_space_range,
                                model_settings            = model_settings,
                                if_settings               = if_settings,
                                perturbation              = σ_perturb)

        np.savez_compressed(os.path.join(path_to_save_sce, f"{str(id_run).zfill(3)}posterior.npz"),
                                        mle           = θmle,
                                        posterior     = θpost,
                                        state_space   = x_sim,
                                        observations  = y_sim,
                                        teta_truth    = θtruth,
                                        idx_infer     = idx_infer)


In [8]:
amro_search  = ['ESCHERICHIA COLI', 'KLEBSIELLA PNEUMONIAE',  'PSEUDOMONAS AERUGINOSA',
                'METHICILLIN-SUSCEPTIBLE STAPHYLOCOCCUS AUREUS',
                "STAPHYLOCOCCUS EPIDERMIDIS", 'ENTEROCOCCUS FAECALIS', 'ENTEROCOCCUS FAECIUM']

for amro in amro_search:
    run_amro_synthetic(amro)


                                               

FileNotFoundError: [Errno 2] No such file or directory: '/Users/chaosdonkey06/Dropbox/shaman-lab/amr-hospitals/results2/synthetic_inferences/no_state_space/e_coli/scenario1/000posterior.npz'