In [1]:
from scipy.stats import truncnorm
import pandas as pd
import numpy as np
import itertools
import datetime
import tqdm
import sys
import os

def flatten_list(list_array):
    """Return one flat list holding the items of every iterable in ``list_array``, in order."""
    return [item for sub_list in list_array for item in sub_list]

sys.path.insert(0,"../")
from global_config import config

# Directory locations, all looked up on the project-level `config` object.
results_dir           = config.get_property('results_dir')
results2_dir           = config.get_property('results2_dir')  # the grid-search CSVs further down are written to / read from here

data_dir              = config.get_property('data_dir')
paper_dir             = config.get_property('paper_dir')
data_db_dir           = config.get_property('data_db_dir')
# Sub-folder of data_db_dir that the read_csv calls below also point at.
feb_hosp_records_path = os.path.join(data_db_dir, 'long_files_8_25_2021')
path_to_save          = os.path.join(results_dir, "real_testing", "community")

# Hex colour palette (not referenced in the cells visible here).
COLOR_LIST1           = ["#F8AFA8", "#FDDDA0", "#F5CDB4", "#74A089"]

In [2]:
# Simulation calendar: one entry per day, both end points included.
date_min         = pd.to_datetime("2020-02-01")
date_max         = pd.to_datetime("2021-02-28")
dates_simulation = pd.date_range(start=date_min, end=date_max, freq="D")

# Per-ward daily counts, restricted to the simulation calendar.
adht_ward_df = pd.read_csv(os.path.join(data_db_dir, "long_files_8_25_2021", "counts_ward.csv"), parse_dates=['date'])
adht_ward_df = adht_ward_df[adht_ward_df["date"].isin(dates_simulation)]
#selected_ward = ['Allen Hospital', 'Harkness Pavilion', 'Milstein Hospital', 'Mschony', 'Presbyterian Hospital']

# One ward x date table per recorded quantity.
A_df     = adht_ward_df.pivot(index='ward', columns='date', values='num_admitted')
D_df     = adht_ward_df.pivot(index='ward', columns='date', values='num_discharged')
H_df     = adht_ward_df.pivot(index='ward', columns='date', values='num_hospitalized')
tests_df = adht_ward_df.pivot(index='ward', columns='date', values='num_tested')

# Mean number of hospitalised patients per ward over the window; its index fixes the ward order.
pop        = H_df.mean(axis=1)
ward_names = pop.index
num_pop    = len(pop)
ward_num   = len(ward_names)

# Ward-to-ward transfer records, restricted to the same calendar.
ward_transfers_df = pd.read_csv(os.path.join(data_db_dir, "long_files_8_25_2021", "transfers_ward.csv"), parse_dates=['date'])
ward_transfers_df = ward_transfers_df[ward_transfers_df["date"].isin(dates_simulation)]

# M_df[i, j, t]: number of patients transferred from ward i to ward j, indexed by
# position on the simulation calendar. The time axis carries one spare slot at the end.
M_df = np.zeros((ward_num, ward_num, len(dates_simulation)+1))

for i in range(ward_num):
    ward_from = ward_names[i]
    for j in range(ward_num):
        ward_to      = ward_names[j]
        transfers_ij = ward_transfers_df[(ward_transfers_df.ward_from==ward_from) & (ward_transfers_df.ward_to==ward_to)]

        if(transfers_ij.shape[0] > 0) :
            # Map every transfer date to its position in dates_simulation.
            # The previous np.where(np.in1d(...)) returned row positions inside
            # transfers_ij, which only coincide with calendar positions when the
            # pair has exactly one sorted row for every simulated day.
            dates_ind  = dates_simulation.get_indexer(transfers_ij.date)
            in_window  = dates_ind >= 0   # get_indexer marks dates outside the calendar with -1
            transfered = transfers_ij.num_transfered.values

            # NOTE(review): the one-step shift is kept from the original code; a transfer
            # dated on the first simulated day (position 0) wraps to index -1, i.e. the
            # spare last slot. Confirm this offset is intended.
            M_df[i, j, dates_ind[in_window]-1] = transfered[in_window]


In [3]:
from models import process_metapop, observe_metapop, init_metapop, simulate_metapop, simulate_metapop_observations

# Iterated-filter settings (not referenced again in the cells visible here).
if_settings = {
   "Nif"                : 50,          # number of iterations of the IF
   "type_cooling"       : "geometric", # type of cooling schedule
   "shrinkage_factor"   : 0.9,         # shrinkage factor for the cooling schedule
   "inflation"          : 1.01,        # inflation factor for spreading the variance after the EAKF step
}

# Settings dictionary handed to the model helpers imported from `models`.
model_settings = {
    "param_name"  : ["ρ", "β"],   # ρ is passed as `rho` to the observation model, β as `beta` to the process model
    "p"           : 2,              # number of parameters
    "k"           : num_pop,        # number of observations | We are just observing carriage
    "n"           : 3*num_pop,      # number of state variables / dimension of the state space
    "dt"          : 1,              # time step
    "T"           : len(dates_simulation), # number of time steps, one per entry of dates_simulation
    "m"           : 100,           # number of ensemble members
    "stochastic"  : True,           # is stochastic
    "num_pop"     : num_pop,        # number of wards (rows of the pivoted count tables)
    "dates"       : dates_simulation # calendar the time steps correspond to
    }

# Module-level shortcuts; `m` is read by the process-model wrapper defined below.
p = model_settings["p"]
m = model_settings["m"]
T = model_settings["T"]

delta = 1/120  # decolonization rate

# Ward x date inputs as plain NumPy arrays so the wrappers below can slice them by time step.
A = A_df.to_numpy()   # admissions
D = D_df.to_numpy()   # discharges
H = H_df.to_numpy()   # hospitalised patients
M = M_df              # ward-to-ward transfers, ward x ward x time

# Tests performed per ward and day. (A zero array was previously allocated here and
# immediately overwritten; that dead store has been removed.)
tests = tests_df.to_numpy()


def process_model_gamma(t, x, θ, gamma):
    """Process model for the ifeakf: forward the state ``x`` at step ``t`` to ``process_metapop``.

    Args:
        t:     time-step index used to slice the column ``t`` of H, A, D and the slab ``t`` of M.
        x:     current state, passed through unchanged.
        θ:     parameter array; row 1 is used as the transmission rate ``beta``.
        gamma: scalar, broadcast to one value per ensemble member (``m`` of them).

    Returns:
        Whatever ``process_metapop`` returns.
    """
    return process_metapop(t, x,
                           gamma = gamma * np.ones(m),
                           beta  = θ[1, :],
                           delta = delta,
                           Nmean = np.expand_dims(pop, -1),
                           N     = H[:, [t]],
                           A     = A[:, [t]],
                           D     = D[:, [t]],
                           M     = M[:, :, t])


def observational_model(t, x, θ):
    """Observational model for the ifeakf: forward the state ``x`` at step ``t`` to ``observe_metapop``.

    Args:
        t: time-step index used to slice the column ``t`` of H and ``tests``.
        x: current state, passed through unchanged.
        θ: parameter array; row 0 is used as ``rho``.

    Returns:
        Whatever ``observe_metapop`` returns.
    """
    return observe_metapop(t, x,
                           rho            = θ[0, :],
                           N              = H[:, [t]],
                           num_tests      = tests[:, [t]],
                           model_settings = model_settings)


def initial_guess_x0_gamma(θ, gamma):
    """Initial-state model for the ifeakf: build the starting state with ``init_metapop``.

    Args:
        θ:     accepted for interface compatibility; not used.
        gamma: passed as ``c0`` (labelled "importation rate" in the original code).

    Returns:
        Whatever ``init_metapop`` returns for the first column of H.
    """
    return init_metapop(N0             = H[:, 0],
                        c0             = gamma, # importation rate
                        model_settings = model_settings)


ρmin = 0.01 # test sensitivity minimum
ρmax = 0.5  # test sensitivity maximum

βmin = 0.001 # transmission rate minimum
βmax = 0.5   # transmission rate maximum

# Largest total number of hospitalised patients over all wards on any single day.
max_total_pop     = np.max(H.sum(axis=0))
state_space_range = np.array([0, max_total_pop])
parameters_range  = np.array([[ρmin, ρmax],
                              [βmin, βmax]])

# Perturbation scale per parameter: a quarter of each parameter's range.
σ_perturb         = np.array([(ρmax - ρmin) / 4,
                              (βmax - βmin) / 4])


In [4]:
def amro2cute(amro):
    """Translate a full organism name into its short snake_case tag.

    Returns ``None`` when ``amro`` is not one of the eight known organisms.
    """
    short_names = {
        'ESCHERICHIA COLI': "e_coli",
        'KLEBSIELLA PNEUMONIAE': "k_pneumoniae",
        "PSEUDOMONAS AERUGINOSA": "p_aeruginosa",
        "METHICILLIN-SUSCEPTIBLE STAPHYLOCOCCUS AUREUS": "mssa",
        "METHICILLIN-RESISTANT STAPHYLOCOCCUS AUREUS": "mrsa",
        "STAPHYLOCOCCUS EPIDERMIDIS": "s_epidermidis",
        "ENTEROCOCCUS FAECALIS": "e_faecalis",
        "ENTEROCOCCUS FAECIUM": "e_faecium",
    }
    for full_name, tag in short_names.items():
        if amro == full_name:
            return tag
    return None

In [5]:
import sys
sys.path.insert(0, "../pompjax/pompjax/")

from pyro.contrib.forecast import eval_crps
from eval import calibration


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def compute_evals(samples, obs, beta, rho,  name_var="beta"):
    """Score an ensemble of simulated time series against an observed series.

    Args:
        samples:  num_ensembles x num_times
        obs:      time series observation
        beta:     value stored in the column named by ``name_var``.
        rho:      value stored in the ``rho`` column.
        name_var: name of the column that receives ``beta`` (default ``"beta"``).

    Returns:
        pd.DataFrame: one-row dataframe with the continuous ranked probability score (crps),
        the average calibration score, and the ``name_var`` / ``rho`` values it was computed for.
    """

    # The calibration helper is given both inputs with an extra leading axis of length 1.
    cal_df = calibration.calibration(np.expand_dims(samples.T, 0), np.expand_dims(obs, 0), observation_index=0)
    # Calibration score: mean absolute gap between each quantile and the proportion reported inside it.
    sc     = np.mean(np.abs(cal_df.quantiles.values-cal_df.proportion_inside.values))

    df_response                      = pd.DataFrame(columns=['crps', 'calibration_score', name_var, "rho"])
    # Assigning a one-element list first gives the frame its single row; the scalar below is broadcast onto it.
    df_response['crps']              = [eval_crps(samples, obs)]
    df_response["calibration_score"] = sc
    df_response[name_var]            = [beta]
    df_response['rho']               = [rho]
    return df_response


In [7]:
# Patient movement records: keep a single row per patient (mrn) and day.
patient_df               = pd.read_csv(os.path.join(data_db_dir, "long_files_8_25_2021", "patient_movement_2022-Nov.csv"), parse_dates=['date'])
patient_df               = patient_df.drop_duplicates(['date','mrn'])

# Full ward identifier "<ward>-<building>-<place>", built column-wise instead of with a row-wise apply.
patient_df["ward_total"] = patient_df["ward"] + "-" + patient_df["building"] + "-" + patient_df["place"]

# Distinct ward identifiers in order of first appearance, and their integer ids.
# (ward2id was previously built from the frame's row index, which mapped row labels,
# not ward names, to positions.)
wards                    = patient_df.ward_total.unique()
ward2id                  = {w: i for i, w in enumerate(wards)}

# Repeated positive results for the same organism within one encounter are blanked out:
# only the first occurrence keeps its test flag and organism name.
#duplicated_pos_tests = (patient_df[['mrn','organism_name']].duplicated() & ~patient_df['organism_name'].isnull())
duplicated_pos_tests = (patient_df[['encounter_id','organism_name']].duplicated() & ~patient_df['organism_name'].isnull())

patient_df.loc[duplicated_pos_tests,'test']          = 0
patient_df.loc[duplicated_pos_tests,'organism_name'] = np.nan

amro_search = ['ESCHERICHIA COLI', 'KLEBSIELLA PNEUMONIAE', 'PSEUDOMONAS AERUGINOSA', 'METHICILLIN-SUSCEPTIBLE STAPHYLOCOCCUS AUREUS',
                'METHICILLIN-RESISTANT STAPHYLOCOCCUS AUREUS', 'STAPHYLOCOCCUS EPIDERMIDIS', 'ENTEROCOCCUS FAECALIS', 'ENTEROCOCCUS FAECIUM']


In [8]:
import torch

# Organisms covered by the analysis.
amro_search = [
    'ESCHERICHIA COLI',
    'KLEBSIELLA PNEUMONIAE',
    'PSEUDOMONAS AERUGINOSA',
    'METHICILLIN-SUSCEPTIBLE STAPHYLOCOCCUS AUREUS',
    'METHICILLIN-RESISTANT STAPHYLOCOCCUS AUREUS',
    'STAPHYLOCOCCUS EPIDERMIDIS',
    'ENTEROCOCCUS FAECALIS',
    'ENTEROCOCCUS FAECIUM',
]

# Set-up for a single organism (the first one): its prevalence, given in percent
# in the table, converted to a fraction, plus the per-ward organism records.
amro         = amro_search[0]
amro_prev_df = pd.read_csv(os.path.join("..", "data", "amro_prevalence.csv"))
is_this_amro = amro_prev_df.amro == amro
gamma        = amro_prev_df[is_this_amro]["prevalence_mean1"].values[0] / 100
amro_df      = pd.read_csv(os.path.join(data_db_dir, "long_files_8_25_2021", "amro_ward.csv"), parse_dates=["date"])


In [12]:
def generate_combination(arr1, arr2):
    """Return every ``[a1, a2]`` pair as rows of an array, with ``arr1`` varying slowest."""
    pairs = [[first, second] for first in arr1 for second in arr2]
    return np.array(pairs)

In [13]:
import numpy as np


# Grid-search bounds. These rebind the ρ/β limits defined earlier in the notebook.
ρmin    = 1/100
ρmax    = 20/100
ρstep   = 1/100

βmin    = 0.01
βmax    = 0.5
βstep   = 0.01

# Evenly spaced grids that include both end points. np.linspace with an explicit
# number of points is used instead of np.arange(start, stop + step, step), whose
# length with a floating-point step depends on rounding.
ρ_search = np.linspace(ρmin, ρmax, int(round((ρmax - ρmin) / ρstep)) + 1)
β_search = np.linspace(βmin, βmax, int(round((βmax - βmin) / βstep)) + 1)

In [12]:
from tqdm import tqdm

def grid_search(amro, gamma, amro_df, model_settings, previous_search=None):
    """Score simulated weekly detections against observations over a (ρ, β) grid.

    For every grid point the metapopulation observation model is simulated, the
    simulated counts are summed over wards and aggregated to weeks ending on Sunday,
    the first four weeks are dropped, and the result is scored with ``compute_evals``.

    Args:
        amro:            organism name used to filter ``amro_df`` on its ``amro`` column.
        gamma:           scalar forwarded to the process and initial-condition models.
        amro_df:         records with ``amro``, ``date`` and ``value`` columns.
        model_settings:  settings dictionary; ``"m"`` and ``"dates"`` are read here.
        previous_search: currently unused; kept so existing callers keep working.

    Returns:
        pd.DataFrame: one row per grid point with the columns produced by ``compute_evals``.
    """
    # Observed series: total per day, then per week, skipping the first four weeks.
    amro_daily = amro_df[amro_df.amro==amro].groupby("date").sum(numeric_only=True).reset_index()
    obs_amro   = amro_daily.set_index("date").resample("W-Sun").sum(numeric_only=True)["value"].values[4:]
    obs_t      = torch.tensor(list(obs_amro))   # identical for every grid point, so built once

    def process_model(t, x, θ):
        return process_model_gamma(t, x, θ, gamma=gamma)

    def initial_guess_x0(θ):
        return initial_guess_x0_gamma(θ, gamma=gamma)

    ρmin, ρmax, ρstep = 1/100, 20/100, 1/100
    βmin, βmax, βstep = 0.01, 0.5, 0.01

    # np.linspace expects the NUMBER of points as its third argument; the step sizes
    # that were passed there before (1/100 and 0.01) are not integers and raise a TypeError.
    ρ_search = np.linspace(ρmin, ρmax, int(round((ρmax - ρmin) / ρstep)) + 1)
    β_search = np.linspace(βmin, βmax, int(round((βmax - βmin) / βstep)) + 1)

    psearch  = generate_combination(ρ_search, β_search)

    num_ens   = model_settings["m"]
    sim_dates = pd.DatetimeIndex(model_settings["dates"])

    metric_df = []
    for ρsim, βsim in tqdm(psearch, total=len(psearch)):
        # Same parameter pair for every ensemble member.
        θsim         = np.array([[ρsim], [βsim]]) * np.ones((2, num_ens))
        y_sim        = simulate_metapop_observations(process_model, observational_model, initial_guess_x0, θsim, model_settings)
        observations = np.sum(y_sim, axis=1)

        # Rows are dates and columns are ensemble members (the layout the previous
        # flatten / date-repeat code relied on); aggregate straight to weekly sums.
        daily_df  = pd.DataFrame(np.asarray(observations).reshape(len(sim_dates), num_ens), index=sim_dates)
        weekly    = daily_df.resample("W-Sun").sum().to_numpy(dtype=float)
        samples_t = torch.tensor(weekly[4:, :].T)   # ensemble members x weeks, first four weeks dropped

        metric_df.append(compute_evals(samples_t, obs_t, βsim, ρsim))
    return pd.concat(metric_df)


In [13]:
amro_prev_df = pd.read_csv(os.path.join("..", "data", "amro_prevalence.csv"))
amro_df      = pd.read_csv(os.path.join(data_db_dir, "long_files_8_25_2021", "amro_ward.csv"), parse_dates=["date"])

# Make sure the output folder exists before the (long) search starts, so an hour of
# computation is not lost to a missing directory when the CSV is written.
grid_search_dir = os.path.join(results2_dir, "grid_search", "metapopulation")
os.makedirs(grid_search_dir, exist_ok=True)

# Run the grid search once per organism and store each result as its own CSV.
for amro in amro_search:
    print("grid search for ", amro, " ...")
    # Prevalence is stored in percent; the model takes a fraction.
    gamma        = amro_prev_df[amro_prev_df.amro==amro]["prevalence_mean1"].values[0]/100
    crps_amro_df = grid_search(amro, gamma, amro_df, model_settings, previous_search=None)
    crps_amro_df.to_csv( os.path.join(grid_search_dir, f"{amro2cute(amro)}.csv") )


656it [59:08,  5.70s/it]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

amro_search = [
    'ESCHERICHIA COLI',
    'KLEBSIELLA PNEUMONIAE',
    'PSEUDOMONAS AERUGINOSA',
    'METHICILLIN-SUSCEPTIBLE STAPHYLOCOCCUS AUREUS',
    'METHICILLIN-RESISTANT STAPHYLOCOCCUS AUREUS',
    'STAPHYLOCOCCUS EPIDERMIDIS',
    'ENTEROCOCCUS FAECALIS',
    'ENTEROCOCCUS FAECIUM',
]

# 2 x 4 grid of panels, one organism per panel, filled row by row.
fig, ax = plt.subplots(2, 4, figsize=(12, 5), sharey=True, sharex=True)

for idx_axi, axi in enumerate(ax.ravel()):
    amro         = amro_search[idx_axi]
    grid_amro_df = pd.read_csv(os.path.join(results2_dir, "grid_search", "metapopulation", f"{amro2cute(amro)}.csv"))

    # CRPS laid out as a beta (rows) x rho (columns) table.
    hm_crps_df = grid_amro_df.pivot(index='beta', columns='rho', values='crps')
    sns.heatmap(ax=axi, data=hm_crps_df, cmap='Reds')
    axi.set_ylabel(None)
    axi.set_xlabel(None)
    # Short tag turned into a display name, e.g. "e_coli" -> "E. coli".
    axi.set_title(amro2cute(amro).replace("_", ". ").capitalize())

# Axis labels only on the left column and the bottom row.
for left_axis in ax[:, 0]:
    left_axis.set_ylabel(r'$\beta$')

for i in range(4):
    ax[1, i].set_xlabel(r'$\rho$')

fig.suptitle("Metapopulation CRPS landscapes")
plt.tight_layout()

In [None]:
# 2 x 4 grid of panels, one organism per panel, filled row by row.
fig, ax = plt.subplots(2, 4, figsize=(12, 5), sharey=True, sharex=True)

for idx_axi, axi in enumerate(ax.ravel()):
    amro         = amro_search[idx_axi]
    grid_amro_df = pd.read_csv(os.path.join(results2_dir, "grid_search", "metapopulation", f"{amro2cute(amro)}.csv"))

    # Calibration score laid out as a beta (rows) x rho (columns) table.
    hm_cov_df  = grid_amro_df.pivot(index='beta', columns='rho', values='calibration_score')
    sns.heatmap(ax=axi, data=hm_cov_df, cmap='Reds')
    axi.set_ylabel(None)
    axi.set_xlabel(None)
    # Short tag turned into a display name, e.g. "e_coli" -> "E. coli".
    axi.set_title(amro2cute(amro).replace("_", ". ").capitalize())

# Axis labels only on the left column and the bottom row.
for left_axis in ax[:, 0]:
    left_axis.set_ylabel(r'$\beta$')

for i in range(4):
    ax[1, i].set_xlabel(r'$\rho$')

fig.suptitle("Metapopulation average calibration landscapes")
plt.tight_layout()