In [1]:
from scipy.stats import truncnorm
import pandas as pd
import numpy as np
import itertools
import datetime
import tqdm
import sys
import os

import matplotlib.pyplot as plt

def flatten_list(list_array):
    return list(itertools.chain(*list_array))

sys.path.insert(0,"../")
from global_config import config

results_dir           = config.get_property('results_dir')
data_dir              = config.get_property('data_dir')
paper_dir             = config.get_property('paper_dir')
data_db_dir           = config.get_property('data_db_dir')
feb_hosp_records_path = os.path.join(data_db_dir, 'long_files_8_25_2021')
path_to_save          = os.path.join(results_dir, "real_testing", "community")
results2_dir          = config.get_property('results2_dir')

COLOR_LIST1 = ["#F8AFA8", "#FDDDA0", "#F5CDB4", "#74A089"]

In [2]:
dates_simulation = pd.date_range(start="2020-02-01", end="2021-02-28", freq="D")

movement_df               = pd.read_csv(os.path.join(data_db_dir, "long_files_8_25_2021", 'patient_movement_2022-Nov.csv'), parse_dates=['date']).drop_duplicates(subset=["date", "mrn"], keep="first")
movement_df["ward_total"] = movement_df.apply(lambda x: x["ward"]+"-"+x["building"]+"-"+x["place"], axis=1)
movement_df               = movement_df[movement_df["date"].isin(dates_simulation)]

mrd2id  = {mrn: id for id, mrn in enumerate(movement_df.mrn.unique())}
ward2id = {ward_name: id for id, ward_name in enumerate(movement_df.ward_total.unique())}

movement_df["mrn_id"]        = movement_df.mrn.map(mrd2id)
movement_df["ward_id"]       = movement_df.ward_total.map(ward2id)

ward_size_df                 = movement_df.reset_index()
ward_size_df["ward_id"]      = ward_size_df["ward_total"].apply(lambda x: ward2id[x])
ward_size_df["num_patients"] = 1
ward_size_df                 = ward_size_df.groupby(["date", "ward", "ward_id"]).sum()[["num_patients"]].reset_index().drop(columns=["date"])
ward_size_df                 = ward_size_df.groupby(["ward", "ward_id"]).mean().reset_index().sort_values(by="num_patients")
ward2size                    = {r.ward_id: r.num_patients for idx_r, r in ward_size_df.iterrows()}

id2ward                      = dict((v, k) for k, v in ward2id.items())

###-###-###-###-###-###-###-###-###-###-###-###
cluster_diag_df              = pd.read_csv(os.path.join("..", "data", "infomap_nondiag.csv"), sep=" ").rename(columns={"node_id": "ward_id"})
cluster_diag_df["ward_name"] = cluster_diag_df["ward_id"].map(id2ward)
cluster_diag_df["cluster"]   = cluster_diag_df.apply(lambda x: int(str(x.path).split(":")[0]), axis=1)
cluster_diag_df              = cluster_diag_df[["cluster", "ward_id", "ward_name"]].sort_values(by="cluster")
cluster_diag_df['num_wards'] = cluster_diag_df["cluster"].apply(lambda x: np.sum(cluster_diag_df["cluster"] == x))

cluster_diag_df["cluster"][cluster_diag_df.cluster>=6] = 6
cluster_diag_df["cluster"] = cluster_diag_df["cluster"].map(lambda x: int(x-1))

cluster_diag_df['num_wards']                           = cluster_diag_df["cluster"].apply(lambda x: np.sum(cluster_diag_df["cluster"] == x))
###-###-###-###-###-###-###-###-###-###-###-###

wardid2cluster         = dict(zip(cluster_diag_df["ward_id"], cluster_diag_df["cluster"]))
movement_df["cluster"] = movement_df["ward_id"].map( wardid2cluster )
movement_df["cluster"] = movement_df["cluster"].fillna(cluster_diag_df.cluster.max())
movement_df["cluster"] = movement_df["cluster"].map(lambda x: int(x))


  ward_size_df                 = ward_size_df.groupby(["date", "ward", "ward_id"]).sum()[["num_patients"]].reset_index().drop(columns=["date"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_diag_df["cluster"][cluster_diag_df.cluster>=6] = 6


In [4]:
sys.path.insert(0, "../pompjax/pompjax/")

from pyro.contrib.forecast import eval_crps
from eval import calibration

def compute_evals(samples, obs, beta, rho,  name_var="beta"):
    """_summary_

    Args:
        samples (_type_): num_ensembles x num_times
        obs (_type_): time series observation

    Returns:
        _type_: _description_
    """

    cal_df = calibration.calibration(np.expand_dims(samples.T, 0), np.expand_dims(obs, 0), observation_index=0)
    sc     = np.mean(np.abs(cal_df.quantiles.values-cal_df.proportion_inside.values))

    df_response              = pd.DataFrame(columns=['crps', 'calibration_score', name_var, "rho"])
    df_response['crps']      = [eval_crps(samples, obs)]
    df_response["calibration_score"] = sc
    df_response[name_var]    = [beta]
    df_response['rho']       = [rho]

    return df_response


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def create_amro_obs(amro, model_settings, resample="W-Sun"):
    dates_resamp = pd.date_range(model_settings["dates"][0], model_settings["dates"][-1], freq=resample)

    test_df         = movement_df.copy(); test_df = test_df[test_df["test"]==1]
    amro_df         = test_df.copy()
    amro_df["keep"] = amro_df["organism_name"].apply(lambda x: amro in str(x))
    amro_df         = amro_df[amro_df["keep"]==True]
    amro_df         = amro_df.groupby(["date", "cluster"]).sum()[["test"]].unstack([1]).fillna(0).resample("W-Sun").sum()
    amro_df         = amro_df.xs("test", axis=1, drop_level=True)
    amro_df         = amro_df.sum(axis=1)
    amro_df         = amro_df.reindex(dates_resamp, fill_value=0)

    return amro_df

In [6]:
import torch

def generate_combination(arr1, arr2):
    a = []
    for a1 in arr1:
        for a2 in arr2:
            a.append([a1, a2])
    return np.array(a)

def grid_search(f, f0, g, obs_df, model_settings, previous_search=None):
    βmin  = 0.0
    βmax  = 0.1
    βstep = 0.005

    ρmin  = 0.01
    ρmax  = 0.1
    ρstep = 0.01

    ρ_search     = np.arange(ρmin, ρmax, ρstep)
    β_search     = np.arange(βmin, βmax + βstep, βstep)

    p_new_df  = pd.DataFrame(generate_combination(ρ_search, β_search), columns=["rho", "beta"])
    p_done_df = previous_search[["rho", "beta"]]
    p_df      = pd.concat([p_new_df, p_done_df]).drop_duplicates(subset=["rho", "beta"], keep=False)

    # sum across clusters
    obs_world  = obs_df.values

    metric_df = pd.DataFrame()
    for idx_row, row in p_df.iterrows():
        rho, beta = row["rho"], row["beta"]

        observations       = simulate_abm(f, f0, g, beta, rho, model_settings)
        observations       = np.sum(observations, axis=1)
        sim_df             = pd.DataFrame(columns=["date","ens_id", "values", "scenario"])
        sim_df["values"]   = observations.flatten()
        sim_df["date"]     = flatten_list([ [date]*model_settings["m"]  for date in  list(model_settings["dates"])])
        sim_df["ens_id"]   = list(range(model_settings["m"] )) * len(model_settings["dates"])
        sim_df["rho"]      = rho

        samples_t  = sim_df.set_index(["date", "ens_id", "rho"]).unstack([1, 2]).resample("W-Sun").sum().stack().stack().reset_index()
        samples_t  = pd.pivot(data=samples_t, index="date", columns="ens_id", values="values").to_numpy().T
        samples_t  = torch.tensor(samples_t);  obs_t  = torch.tensor(list(obs_world))
        df_metrics = compute_evals(samples_t, obs_t, beta, rho)
        metric_df  = pd.concat([metric_df, df_metrics])

    return  pd.concat([metric_df, previous_search])

def empirical_prevalence(amro, path_to_prev="../data/amro_prevalence.csv"):
    amro_prev_df = pd.read_csv(path_to_prev)
    gamma        = amro_prev_df[amro_prev_df.amro==amro]["prevalence_mean1"].values[0]/100
    return gamma


In [7]:
amro_prevalence_df                 = pd.read_csv(os.path.join("..", "data", "amro_prevalence.csv"))
model_settings                     = {}
model_settings["m"]                = 100
model_settings["p"]                = 2
model_settings["n"]                = movement_df.mrn_id.unique().shape[0]
model_settings["k"]                = movement_df.cluster.unique().shape[0]
model_settings["dates"]            = pd.date_range(start="2020-02-01", end="2021-02-28", freq="D")
model_settings["dates_simulation"] = pd.date_range(start="2020-02-01", end="2021-02-28", freq="D")

amro_search = ['ESCHERICHIA COLI', 'KLEBSIELLA PNEUMONIAE', 'PSEUDOMONAS AERUGINOSA', 'METHICILLIN-SUSCEPTIBLE STAPHYLOCOCCUS AUREUS',
                'METHICILLIN-RESISTANT STAPHYLOCOCCUS AUREUS', 'STAPHYLOCOCCUS EPIDERMIDIS', 'ENTEROCOCCUS FAECALIS', 'ENTEROCOCCUS FAECIUM']


In [None]:
from models import amr_abm, observe_cluster

def simulate_abm(f, f0, g, beta, rho, model_settings):
    dates_simulation = model_settings["dates_simulation"]

    θ = np.array([[rho], [beta]]) * np.ones((model_settings["p"], model_settings["m"]))
    x = f0(θ)

    observations          = np.full((len(dates_simulation), model_settings["k"], model_settings["m"]), np.nan)
    observations[0, :, :] = g(0, x, θ)

    for t, date in tqdm.tqdm(enumerate(dates_simulation[1:])):
        x                       = f(t, x, θ)
        observations[t+1, :, :] = g(t, x, θ)
    return observations

In [13]:


β    = 0.01
ρ    = 6/10
amro = amro_search[0]

gamma              = empirical_prevalence(amro) * np.ones(model_settings["m"])

alpha               = 1/120
init_state          = lambda θ:       amr_abm(t = 0,
                                                agents_state   = np.zeros((model_settings["n"], model_settings["m"])),
                                                gamma          = gamma,
                                                beta           = θ[1, :],
                                                alpha          = alpha,
                                                movement       = movement_df[movement_df["date"]==dates_simulation[0]],
                                                ward2size      = ward2size,
                                                model_settings = model_settings["n"])

process_model       = lambda t, x, θ: amr_abm(t = t,
                                                agents_state   = x,
                                                gamma          = gamma,
                                                beta           = θ[1, :],
                                                alpha          = alpha,
                                                movement       = movement_df[movement_df["date"]==dates_simulation[t]],
                                                ward2size      = ward2size,
                                                model_settings = model_settings["n"])

observational_model = lambda t, x, θ: observe_cluster(t   = t,
                                                    x              = x,
                                                    rho            = θ[0, :],
                                                    movement       = movement_df[movement_df["date"]==dates_simulation[t]],
                                                    model_settings = model_settings["k"])

θsim = np.array([[ρ], [β]]) * np.ones((model_settings["p"], model_settings["m"]))

x = init_state(θsim)

TypeError: amr_abm() got an unexpected keyword argument 'x'

In [10]:
from utils_local.misc import amro2cute
for amro in amro_search:
    previous_search_df = pd.read_csv( os.path.join("..", "preliminary_results", f"crps_{amro2cute(amro)}.csv"), index_col=None).drop(columns=['Unnamed: 0'])
    gamma              = empirical_prevalence(amro) * np.ones(model_settings["m"])

    alpha               = 1/120
    x                   = np.zeros((model_settings["n"], model_settings["m"]))
    init_state          = lambda θ:       amr_abm(0, x, gamma, θ[1, :], alpha, movement_df[movement_df["date"]==dates_simulation[0]], ward2size, model_settings["n"])
    process_model       = lambda t, x, θ: amr_abm(t, x, gamma, θ[1, :], alpha, movement_df[movement_df["date"]==dates_simulation[t]], ward2size, model_settings["n"])
    observational_model = lambda t, x, θ: observe_cluster(t, x, movement_df[movement_df["date"]==dates_simulation[t]], θ[0, :], model_settings["k"])

    obs_df              = create_amro_obs(amro, model_settings, resample="W-Sun")

    # init state space
    crps_amro_df = grid_search(process_model, init_state, observational_model,
                            obs_df, model_settings, previous_search=previous_search_df)

    crps_amro_df.to_csv( os.path.join(results2_dir, "grid_search", "metapopulation", f"{amro2cute(amro)}.csv") )


  amro_df         = amro_df.groupby(["date", "cluster"]).sum()[["test"]].unstack([1]).fillna(0).resample("W-Sun").sum()


TypeError: 'int' object is not subscriptable