# Empirical Data: Fit & Predict 2007 to 2019


## Package import

In [1]:
from SIR_gp import * #class implementation of the GP
from SIR_plot import * #functions for plotting of GP outputs 

import pandas as pd
import numpy as np
import torch 
from emukit.core import ParameterSpace, ContinuousParameter #emukit for LHS
from emukit.core.initial_designs.latin_design import LatinDesign #emukit for LHS
from scipy.stats import spearmanr

import warnings
warnings.filterwarnings("ignore") #only if you are really sure about this one 

## Global Variables

In [2]:
# --- File locations (relative to the notebook's working directory) ---
PATH_DATA = "../data/empirical/OpenDengue-epidemics-reformat.txt" #empirical outbreak data (tab-separated, read with pd.read_csv below)
PATH_GP_TRAIN = "../GPs/imax/sim-training-maxIncidence-round15.txt" #GP: path to training data
PATH_GP_TEST = "../data/sim/imax-duration/DD-AML-test-LHS-10000-condSim-logDuration.txt" #GP: path to test data (used for the RMSE sanity check)
PATH_GP_MODEL = "../GPs/imax/maxIncidence-round15-snap3.pth" #GP: path to model snapshot

# --- Input domain sampled by the Latin hypercube design ---
# alphaRest is NOT part of this space: it is swept on a regular grid (see
# PARAM_RANGES_ALPHA) and prepended as the first column of the candidate array.
# correctionFactor is the last column; predict_var strips it before calling the
# GP and multiplies the maxIncidence predictions by it.
PARAM_RANGES = ParameterSpace([ #parameter ranges, without alphaRest, including correction factor
    ContinuousParameter("alphaAmp", 0, 1),
    ContinuousParameter("alphaShift", 0, 1),
    ContinuousParameter("infTicksCount", 4, 6),
    ContinuousParameter("avgVisitsCount", 1, 5),
    ContinuousParameter("pVisits", 0.05, 0.95),
    ContinuousParameter("propSocialVisits", 0, 1),
    ContinuousParameter("locPerSGCount", 1, 20),
    ContinuousParameter("correctionFactor", 0, 0.1)
])

PARAM_RANGES_ALPHA = [0, 0.03] #[lower, upper] bound of the alphaRest grid (passed to np.linspace)
PARAM_STEPS_ALPHA = 50 #number of grid points for alphaRest

N_SAMPLE_LHS = 25000 #samples to draw from LHS
RANDOM_STATE_SEED = 42 #seed passed to np.random.seed before the fit/prediction split
                       #NOTE(review): whether LatinDesign also draws from the global NumPy RNG is not visible here - confirm

PROP_FIT = 0.67 #proportion of each municipality's outbreaks used for fitting

TOP_X = 250 #store TOP_X parameter combinations with lowest RMSE
N_SHUFFLE = 1000 #number of shuffling iterations per permutation test


## Functions

In [3]:
def filter_min_count_epidemics(df, mincount):
    """Keep only the outbreaks of municipalities with at least `mincount` outbreaks.

    Every row of `df` counts as one outbreak and municipalities are identified
    by the 'ADM2_PCODE' column. The returned frame is a row subset of `df`
    with the original index labels preserved.
    """
    n_outbreaks = df['ADM2_PCODE'].value_counts()  # outbreaks per municipality
    frequent_codes = n_outbreaks.loc[n_outbreaks >= mincount].index
    keep_row = df['ADM2_PCODE'].isin(frequent_codes)
    return df[keep_row]

def split_dataframe(df, p):
    """Randomly split the outbreaks of every municipality into two data frames.

    Rows are grouped by 'ADM2_PCODE'. Within each group of n rows,
    int(p * n) randomly chosen rows go to the first frame and the remaining
    rows to the second one.

    Randomness comes from the global NumPy RNG (np.random), so call
    np.random.seed(...) beforehand for a reproducible split.

    Returns:
        (df_a, df_b): two data frames with a fresh 0..n-1 index. Together they
        contain every row of `df` whose 'ADM2_PCODE' is not missing (groupby
        drops missing keys by default).
    """
    parts_a = []
    parts_b = []

    for _, group in df.groupby('ADM2_PCODE'):
        num_outbreaks = len(group)
        indices = np.arange(num_outbreaks)
        np.random.shuffle(indices)  # random order of the rows within the group
        split_index = int(p * num_outbreaks)

        parts_a.append(group.iloc[indices[:split_index]])
        parts_b.append(group.iloc[indices[split_index:]])

    if not parts_a:
        # No groups at all (e.g. empty input): return empty frames that still
        # carry the input columns.
        empty = df.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    # Concatenate once instead of growing the frames inside the loop.
    df_a = pd.concat(parts_a).reset_index(drop=True)
    df_b = pd.concat(parts_b).reset_index(drop=True)

    return df_a, df_b

def format_data(LHS_sample, epidemics, epidemic_id):
    """Build the GP input tensor for one epidemic.

    The time point of the epidemic (column 't' of `epidemics`, row position
    `epidemic_id`) is converted to a fraction of a 365-day year and added to
    column 2 of the parameter array; the sum is wrapped into [0, 1).
    `LHS_sample` itself is left untouched.

    Returns a contiguous float32 torch tensor with the shape of `LHS_sample`.
    """
    onset = epidemics['t'].iloc[epidemic_id]  # epidemic-specific time point
    points = LHS_sample.copy()  # work on a copy, never on the caller's array
    points[:, 2] = (points[:, 2] + onset / 365) % 1  # overall phase shift
    tensor = torch.from_numpy(points)
    return tensor.float().contiguous()

def calculate_rmse(epidemics, predictions, model_type):
    """Compute one RMSE per prediction row against the observed outbreaks.

    Args:
        epidemics: data frame with the empirical outbreaks; column 'imax' is
            read for model_type "maxIncidence", column 'duration' (compared
            on a log10 scale) for model_type "duration".
        predictions: 2-D array, one row per parameter combination and one
            column per outbreak in `epidemics`.
        model_type: "maxIncidence" or "duration".

    Returns:
        1-D array with one RMSE value per row of `predictions`.

    Raises:
        ValueError: if `model_type` is not one of the two supported values.
    """
    if model_type == "maxIncidence":
        observed = epidemics['imax'].values
    elif model_type == "duration":
        observed = np.log10(epidemics['duration'].values)
    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'maxIncidence' or 'duration'"
        )

    # View the observations as one identical row per prediction row; a
    # broadcast view gives the same shape as np.tile without copying the data.
    observed = np.broadcast_to(observed.reshape(1, -1), (predictions.shape[0], observed.size))
    rmse = np.sqrt(np.mean((observed - predictions) ** 2, axis=1))
    return rmse

def predict_var(LHS_sample, epidemics, GP_model, verbose = False):
    """Predict the GP's summary statistic for every parameter row and epidemic.

    For each epidemic the parameter rows (without their last column) are
    turned into an epidemic-specific input via format_data and passed to
    GP_model.predict_ys; only the first element of its return value is used.

    Args:
        LHS_sample: 2-D array of parameter combinations. The last column is
            the correction factor; it is never passed to the GP.
        epidemics: data frame with one row per epidemic (format_data reads
            its 't' column).
        GP_model: object exposing `model_type` ("maxIncidence" or "duration")
            and `predict_ys(parsed_data=...)`.
        verbose: if truthy, print progress every 10 epidemics.

    Returns:
        Array of shape (number of rows in LHS_sample, number of epidemics).
        For "maxIncidence" the predictions are multiplied by the correction
        factor and clipped to [0, 1]; for "duration" they are clipped to
        [0, 3] without correction.

    Raises:
        ValueError: if GP_model.model_type is not a supported value.
    """
    model_type = GP_model.model_type
    if model_type == "maxIncidence":
        clip_max = 1.0
    elif model_type == "duration":
        clip_max = 3.0
    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'maxIncidence' or 'duration'"
        )

    gp_inputs = LHS_sample[:, :-1]  # parameters seen by the GP
    correction = LHS_sample[:, -1]  # correction factor, one value per row

    num_predictions = LHS_sample.shape[0]
    num_epidemics = len(epidemics)

    res = np.empty((num_predictions, num_epidemics))

    if verbose:
        print(f"Predicting {num_predictions} points for {num_epidemics} epidemics - please be patient ...")

    for i in range(num_epidemics):
        if verbose and i % 10 == 0:
            print(f"{i} out of {num_epidemics} epidemics done")

        iter_points = format_data(LHS_sample=gp_inputs, epidemics=epidemics, epidemic_id=i)
        iter_pred, _, _ = GP_model.predict_ys(parsed_data = iter_points)
        iter_pred = iter_pred.numpy().flatten()

        if model_type == "maxIncidence":
            # only the incidence estimate is rescaled by the correction factor
            iter_pred = iter_pred * correction
        res[:, i] = iter_pred

    return np.clip(res, 0.0, clip_max)

def shuffle_var(emp_df, f_alpha_df, f_candidate_LHS, n_iter, shuffle_var, GP_model):
    """Run a permutation test and return the rank correlation of every iteration.

    In each iteration the selected column(s) of a working copy of `emp_df`
    are randomly permuted (global NumPy RNG), predictions are made per
    municipality with that municipality's alphaRest estimate, and the
    Spearman correlation between the 'imax' column and the predictions is
    recorded. Permutations accumulate: each iteration permutes the result of
    the previous one.

    Args:
        emp_df: empirical data frame (columns 't', 'ADM2_PCODE', 'imax').
        f_alpha_df: data frame with columns 'municipality' and 'alphaRest'
            (one row per municipality).
        f_candidate_LHS: remaining input parameters shared by all
            municipalities, with the correction factor last.
        n_iter: number of shuffling iterations.
        shuffle_var: what to shuffle:
            'day'          -> time of outbreak ('t')
            'municipality' -> municipality ('ADM2_PCODE')
            'both'         -> day and municipality
            (the parameter name shadows this function inside its body).
        GP_model: GP model used for the predictions.

    Returns:
        List of n_iter Spearman correlation coefficients.

    Raises:
        ValueError: if `shuffle_var` is not one of the three options.
    """
    if shuffle_var not in ('day', 'municipality', 'both'):
        raise ValueError(
            f"Unknown shuffle_var {shuffle_var!r}; expected 'day', 'municipality' or 'both'"
        )

    f_df = emp_df.copy()
    shared_params = np.asarray(f_candidate_LHS).ravel()

    # The parameter row of a municipality does not change between iterations,
    # so build all of them once: alphaRest first, then the shared parameters.
    params_by_municipality = [
        (m, np.hstack(([alpha], shared_params)).reshape(1, -1))
        for m, alpha in zip(f_alpha_df['municipality'], f_alpha_df['alphaRest'])
    ]

    # NOTE(review): the correlation is always computed against 'imax', also
    # when GP_model.model_type is "duration" - confirm this is intended.
    shuffle_stat = []

    for _ in range(n_iter):
        if shuffle_var in ('day', 'both'):
            f_df['t'] = np.random.permutation(f_df['t'].values)
        if shuffle_var in ('municipality', 'both'):
            f_df['ADM2_PCODE'] = np.random.permutation(f_df['ADM2_PCODE'].values)

        municipality_frames = []
        for m, f_params in params_by_municipality:
            f_df_m = f_df.loc[f_df['ADM2_PCODE'] == m].reset_index(drop = True)

            f_pred = predict_var(LHS_sample=f_params, epidemics=f_df_m, GP_model=GP_model)
            f_pred_df = pd.DataFrame(f_pred.flatten(), columns=['pred'])
            municipality_frames.append(pd.concat((f_df_m, f_pred_df), axis = 1))

        # one concatenation per iteration instead of growing a frame row block by row block
        f_predictions = pd.concat(municipality_frames, ignore_index=True)

        iter_stat = spearmanr(f_predictions['imax'], f_predictions['pred']).statistic
        shuffle_stat.append(iter_stat)

    return shuffle_stat


## Empirical Data

In [4]:
# Seed the global NumPy RNG: split_dataframe shuffles with np.random.shuffle,
# so the fit/prediction split below is reproducible.
np.random.seed(RANDOM_STATE_SEED) #for reproducibility

epidemics = pd.read_csv(PATH_DATA, sep="\t", header=0) #read all epidemics (tab-separated, first row = header)
epidemics_filtered = filter_min_count_epidemics(epidemics, 3) #filter for municipalities with at least 3 outbreaks
df_fit, df_pred = split_dataframe(epidemics_filtered, PROP_FIT) #split per municipality into two data frames: fit, prediction
municipalities = df_fit['ADM2_PCODE'].unique() #unique municipalities, in order of first appearance in df_fit

print(epidemics_filtered.shape)
print(df_fit.shape) #df_fit purpose: determine best fitting parameter combination
print(df_pred.shape) #df_pred purpose: assess predictive power given the best fitting parameter combination
print(len(municipalities))

(1186, 5)
(737, 5)
(449, 5)
173


## Input Domain

In [5]:
# Build the full candidate grid: every LHS sample is combined with every
# alphaRest grid value.
candidates_LHS = LatinDesign(PARAM_RANGES).get_samples(N_SAMPLE_LHS) #LHS sample
candidates_alpha = np.linspace(PARAM_RANGES_ALPHA[0], PARAM_RANGES_ALPHA[1], PARAM_STEPS_ALPHA) #alphaRest sample (regular grid, both bounds included)
candidates_alpha = candidates_alpha.reshape(candidates_alpha.shape[0], 1) #column vector

# Row layout relied on by later cells: each LHS row is repeated
# PARAM_STEPS_ALPHA times in a row, and the alphaRest grid is tiled alongside,
# so every consecutive block of PARAM_STEPS_ALPHA rows shares one LHS sample
# and sweeps the whole alphaRest grid.
candidates = candidates_LHS.repeat(candidates_alpha.shape[0], axis=0) #repeat LHS sample
candidates_alpha_tiled = np.tile(candidates_alpha, (candidates_LHS.shape[0],1)) #tile alphaRest sample
candidates = np.hstack((candidates_alpha_tiled, candidates)) #concatenate numpy arrays: alphaRest in 1st column

print(candidates_LHS.shape)
print(candidates_alpha.shape)
print(candidates.shape)


(25000, 8)
(50, 1)
(1250000, 9)


## Load: GP

In [6]:
# SIR_GP is not imported by name; presumably it comes from the wildcard
# import of SIR_gp - verify.
myGP = SIR_GP(training_data=PATH_GP_TRAIN, model_type="maxIncidence") #load the GP surrogate model
myGP.load(filename=PATH_GP_MODEL) #restore the trained snapshot
# NOTE(review): the test file name mentions "logDuration" while model_type is
# "maxIncidence" - confirm the file contains the matching target column.
myGP.get_rmse(PATH_GP_TEST) #sanity check: RMSE on the test data (displayed as cell output)

Model loaded. Loss: -1.7720226049423218


0.04204195387554575

## Predictions

In [7]:
# pred has one row per candidate (N_SAMPLE_LHS x PARAM_STEPS_ALPHA rows) and
# one column per epidemic in df_fit, in df_fit's row order.
pred = predict_var(candidates, GP_model=myGP, epidemics=df_fit, verbose=True) #perform N_SAMPLE_LHS x PARAM_STEPS_ALPHA predictions for all epidemics in df_fit
print(pred.shape)


Predicting 1250000 points for 737 epidemics - please be patient ...
0 out of 737 epidemics done
10 out of 737 epidemics done
20 out of 737 epidemics done
30 out of 737 epidemics done
40 out of 737 epidemics done
50 out of 737 epidemics done
60 out of 737 epidemics done
70 out of 737 epidemics done
80 out of 737 epidemics done
90 out of 737 epidemics done
100 out of 737 epidemics done
110 out of 737 epidemics done
120 out of 737 epidemics done
130 out of 737 epidemics done
140 out of 737 epidemics done
150 out of 737 epidemics done
160 out of 737 epidemics done
170 out of 737 epidemics done
180 out of 737 epidemics done
190 out of 737 epidemics done
200 out of 737 epidemics done
210 out of 737 epidemics done
220 out of 737 epidemics done
230 out of 737 epidemics done
240 out of 737 epidemics done
250 out of 737 epidemics done
260 out of 737 epidemics done
270 out of 737 epidemics done
280 out of 737 epidemics done
290 out of 737 epidemics done
300 out of 737 epidemics done
310 out of 73

## Calculate RMSE per municipality

In [8]:
# RMSE of every candidate parameter combination, separately per municipality.
num_municipalities = len(municipalities) #number of municipalities
num_predictions = candidates.shape[0] #number of predictions
rmse_municipality = np.empty((num_predictions, num_municipalities))

# Use row POSITIONS (not index labels) to select both the epidemics in df_fit
# and the matching columns of pred; this stays correct even if df_fit does not
# carry a 0..n-1 index.
municipality_codes = df_fit['ADM2_PCODE'].to_numpy()

for m, municipality in enumerate(municipalities):
    m_index = np.flatnonzero(municipality_codes == municipality) #positions of this municipality's epidemics
    rmse_iter = calculate_rmse(epidemics=df_fit.iloc[m_index], predictions=pred[:, m_index], model_type=myGP.model_type)
    rmse_municipality[:, m] = rmse_iter.flatten()

print(rmse_municipality.shape)

(1250000, 173)


## argmin(alpha) per municipality

In [9]:
# For every LHS sample and municipality, find the alphaRest grid value with
# the lowest RMSE.
#
# The rows of rmse_municipality come in consecutive blocks of
# PARAM_STEPS_ALPHA rows (same LHS sample, alphaRest grid in order), so the
# array can be viewed as (LHS sample, alphaRest step, municipality) and
# reduced along the alphaRest axis without a Python loop.
rmse_blocks = rmse_municipality.reshape(N_SAMPLE_LHS, PARAM_STEPS_ALPHA, num_municipalities)

best_alpha_index = rmse_blocks.argmin(axis=1) #index of the best alphaRest step (first one on ties)
alpha_municipalitites = candidates_alpha[:, 0][best_alpha_index] #corresponding alphaRest value
min_rmse_municipalities = rmse_blocks.min(axis=1) #smallest RMSE within each block

print(alpha_municipalitites.shape) #LHS x N_municipalities
print(min_rmse_municipalities.shape) #LHS x N_municipalities



(25000, 173)
(25000, 173)


## argmin(LHS) for df_fit 

In [10]:

# Score every LHS sample by the sum of its per-municipality minimum RMSEs and
# pick the sample with the lowest score.
rmse_sums = np.sum(min_rmse_municipalities, axis=1) #build RMSE sums across all municipalities
bestFitID = np.argmin(rmse_sums) #ID of best fit = lowest RMSE sum

print(rmse_sums.shape) #N_SAMPLE_LHS
print(candidates_LHS.shape)  #N_SAMPLE_LHS x 8 (no alphaRest)
print(candidates_LHS[bestFitID:(bestFitID+1),:]) #best fit, kept 2-D by slicing

alphaData={ #store argmin(alpha) for all municipalities, given the best-fitting LHS sample
    'municipality': municipalities,
    'alphaRest' : alpha_municipalitites[bestFitID]
}

alpha_df = pd.DataFrame(alphaData)
# NOTE(review): `describe` is not called here, so this prints the bound
# method's repr (which embeds the whole frame) rather than summary statistics.
print(alpha_df.describe)



(25000,)
(25000, 8)
[[0.15758  0.57682  4.6714   4.3892   0.465818 0.99326  4.54958  0.035294]]
<bound method NDFrame.describe of     municipality  alphaRest
0        CO05001   0.005510
1        CO05045   0.003673
2        CO05051   0.003673
3        CO05079   0.003673
4        CO05088   0.003673
..           ...        ...
168      CO86865   0.015306
169      CO91001   0.029388
170      CO95001   0.007347
171      CO97001   0.012857
172      CO99773   0.006735

[173 rows x 2 columns]>


## top X candidates for df_fit

In [11]:
top_indices = np.argsort(rmse_sums)[:TOP_X] #row IDs of the TOP_X LHS samples with lowest RMSE sum, best first

best_fits = candidates_LHS[top_indices, :] #get best TOP_X fits
print(best_fits.shape)  # (TOP_X, number of LHS parameters)
best_fits_df = pd.DataFrame(best_fits, columns=PARAM_RANGES.parameter_names)
# NOTE(review): `describe` is not called, so the bound method's repr (the
# frame itself) is printed instead of summary statistics.
print(best_fits_df.describe)


(250, 8)
<bound method NDFrame.describe of      alphaAmp  alphaShift  infTicksCount  avgVisitsCount   pVisits  \
0     0.15758     0.57682        4.67140         4.38920  0.465818   
1     0.05066     0.90394        4.04916         1.77752  0.775670   
2     0.08278     0.63590        4.66916         4.69256  0.941162   
3     0.00758     0.52826        4.23660         1.65576  0.699134   
4     0.06206     0.95290        5.69348         1.73160  0.489758   
..        ...         ...            ...             ...       ...   
245   0.07866     0.58714        5.24924         2.88104  0.273434   
246   0.07650     0.21110        4.19628         1.35864  0.646934   
247   0.07746     0.60634        5.35988         1.88936  0.423626   
248   0.12554     0.47910        4.07788         1.75128  0.125006   
249   0.02158     0.89882        4.78516         2.46808  0.159746   

     propSocialVisits  locPerSGCount  correctionFactor  
0             0.99326        4.54958          0.035294  
1 

In [12]:
# Histogram of the log10 RMSE sums of all LHS samples, saved to disk.
# `plt` is not imported explicitly in this notebook; presumably it is provided
# by the wildcard import of SIR_plot - verify.
plt.figure(figsize=(3, 2)) #store RMSE histogram
plt.hist(np.log10(rmse_sums), bins=1000)
# dashed line = largest RMSE sum among the TOP_X candidates, i.e. the cut-off
plt.axvline(x = np.log10(np.max(rmse_sums[top_indices])), color = 'black', lw = 0.5, linestyle = 'dashed')
plt.title('Histogram of RMSE values')
plt.xlabel('log10(RMSE)')
plt.ylabel('Count')
plt.savefig(f'../FitPredict/{myGP.model_type}-RMSE.png')
plt.close() #close the figure so it is not shown inline

In [13]:
# alphaRest estimates per municipality for each of the TOP_X candidates,
# stacked into one long data frame (rank 1 = lowest RMSE sum).
topX_frames = []

for rank, lhs_id in enumerate(top_indices, start=1):
    alpha_df_iter = pd.DataFrame({ #alphaRest estimates for this candidate
        'municipality': municipalities,
        'alphaRest': alpha_municipalitites[lhs_id],
    })
    alpha_df_iter['rmse'] = rmse_sums[lhs_id] #same RMSE sum repeated on every row
    alpha_df_iter['rank'] = rank
    topX_frames.append(alpha_df_iter)

# Concatenate once; the per-candidate row index (0..n_municipalities-1) is
# kept on purpose, as before.
alpha_df_topX = pd.concat(topX_frames)

# NOTE(review): `describe` is not called, so the bound method's repr (the
# frame itself) is printed instead of summary statistics.
print(alpha_df_topX.describe)


<bound method NDFrame.describe of     municipality  alphaRest      rmse  rank
0        CO05001   0.005510  0.556908     1
1        CO05045   0.003673  0.556908     1
2        CO05051   0.003673  0.556908     1
3        CO05079   0.003673  0.556908     1
4        CO05088   0.003673  0.556908     1
..           ...        ...       ...   ...
168      CO86865   0.009184  0.570453   250
169      CO91001   0.015306  0.570453   250
170      CO95001   0.006735  0.570453   250
171      CO97001   0.011020  0.570453   250
172      CO99773   0.006735  0.570453   250

[43250 rows x 4 columns]>


## Predict

In [14]:
# Predict the held-out epidemics (df_pred) per municipality, using the
# best-fitting LHS sample together with that municipality's alphaRest estimate.
best_fit_params = candidates_LHS[bestFitID] #shared parameters, correction factor last
prediction_frames = []

for m, alpha_m in zip(alpha_df['municipality'], alpha_df['alphaRest']):
    df_pred_m = df_pred.loc[df_pred['ADM2_PCODE'] == m].reset_index(drop = True) #municipality-specific epidemics
    pred_params = np.hstack(([alpha_m], best_fit_params)).reshape(1, -1) #one row: alphaRest first, then shared parameters
    pred_m = predict_var(LHS_sample=pred_params, epidemics=df_pred_m, GP_model=myGP, verbose=False)
    pred_m_df = pd.DataFrame(pred_m.flatten(), columns=['pred'])
    prediction_frames.append(pd.concat([df_pred_m, pred_m_df], axis = 1)) #attach predictions to the epidemics

# Concatenate once instead of growing the frame inside the loop.
my_predictions = pd.concat(prediction_frames, ignore_index=True)

## Permutation Tests



In [15]:
# Three permutation tests on the held-out data, N_SHUFFLE iterations each.
# All use the best-fitting LHS sample and the per-municipality alphaRest
# estimates; only the shuffled column(s) differ.
permut_day = shuffle_var(emp_df=df_pred, f_alpha_df=alpha_df, f_candidate_LHS=candidates_LHS[bestFitID], n_iter=N_SHUFFLE, shuffle_var='day', GP_model=myGP) #shuffle outbreak time only
permut_municipality = shuffle_var(emp_df=df_pred, f_alpha_df=alpha_df, f_candidate_LHS=candidates_LHS[bestFitID], n_iter=N_SHUFFLE, shuffle_var='municipality', GP_model=myGP) #shuffle municipality only
permut_both = shuffle_var(emp_df=df_pred, f_alpha_df=alpha_df, f_candidate_LHS=candidates_LHS[bestFitID], n_iter=N_SHUFFLE, shuffle_var='both', GP_model=myGP) #shuffle both

In [16]:
# One column of Spearman coefficients per permutation test, one row per iteration.
permut_df  = pd.DataFrame({'day': permut_day,
                           'municipality': permut_municipality,
                           'both': permut_both})

In [17]:
# Summary of the correlation coefficients with only the outbreak time shuffled.
permut_df['day'].describe()

count    1000.000000
mean        0.476932
std         0.009892
min         0.442987
25%         0.470417
50%         0.476721
75%         0.483508
max         0.505091
Name: day, dtype: float64

In [18]:
# Summary of the correlation coefficients with only the municipality shuffled.
permut_df['municipality'].describe()

count    1000.000000
mean       -0.024904
std         0.046158
min        -0.198912
25%        -0.058474
50%        -0.023948
75%         0.008134
max         0.101897
Name: municipality, dtype: float64

In [19]:
# Summary of the correlation coefficients with time and municipality shuffled.
permut_df['both'].describe()

count    1000.000000
mean        0.000788
std         0.049002
min        -0.172837
25%        -0.031850
50%        -0.000027
75%         0.034127
max         0.168697
Name: both, dtype: float64

In [20]:
# Spearman correlation between observed 'imax' and the predictions on the
# un-shuffled held-out data (reference value for the permutation tests above).
spearmanr(my_predictions['imax'], my_predictions['pred']).statistic

0.4581849943199259

## Store

In [21]:
# Write all results as tab-separated files with a header row and without the
# row index; file names are prefixed with the GP's model_type.
my_predictions.to_csv(f'../FitPredict/{myGP.model_type}-pred-localAlpha.csv', index=False, header=True, sep='\t') #held-out epidemics with predictions
alpha_df.to_csv(f'../FitPredict/{myGP.model_type}-alphaRest-fit-localAlpha.csv', index=False, header=True, sep='\t') #best fitting alphaRest per municipality
alpha_df_topX.to_csv(f'../FitPredict/{myGP.model_type}-alphaRest-top{TOP_X}-localAlpha.csv', index=False, header=True, sep='\t') #alphaRest per municipality for the top <TOP_X> candidates
best_fits_df.to_csv(f'../FitPredict/{myGP.model_type}-params-top{TOP_X}-localAlpha.csv', index=False, header=True, sep="\t") #top <TOP_X> LHS parameter combinations
permut_df.to_csv(f'../FitPredict/{myGP.model_type}-permutations-localAlpha.csv', index=False, header=True, sep='\t') #permutation test results