## Utilities
File containing utilities for the DengAI competition

### Environment Prep

In [2]:
#Get the current conda environment Jupyter is running in
#NOTE: `!conda info` is an IPython shell escape, so this cell only runs inside IPython/Jupyter
env=!conda info
#Lower-case the second line of the captured output
#(presumably the "active environment" line of `conda info` - TODO confirm)
env=str.lower(env[1])
#Collapse that line to a short tag; the first matching substring wins
if 'keras' in env:
    env='keras'
elif 'xgboost' in env:
    env='xgboost'
elif 'azureml' in env:
    env='AzureML'
else: 
    env='other env'
#NOTE(review): after this point env is always one of 'keras', 'xgboost', 'AzureML'
#or 'other env'; it can never equal 'base', which the XGBoost import guard below also tests for

#Import needed packages
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import numpy as np
import scipy as sp
import random
import time
from datetime import datetime

#Import statsmodels for ARIMA and other functions
#import statsmodels.api as sm
#from statsmodels.tsa.stattools import adfuller

#Preprocessing libraries
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Normalizer,RobustScaler,MaxAbsScaler,OneHotEncoder
from sklearn.metrics import mean_absolute_error

#Pipeline and gridsearch tasks
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#Set default grid behavior for Seaborn
sns.set(style='darkgrid')

#Import neural network building blocks from Keras/TensorFlow (only in the keras environment)
if env=='keras':
    import tensorflow as tf
    import keras
    from keras import models, layers
    from keras.callbacks import EarlyStopping, ModelCheckpoint
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.optimizers import SGD, Adam, Nadam
    from tensorflow.keras.optimizers import RMSprop
    from keras.layers import LSTM,Dense,GRU,TimeDistributed, Conv1D, InputLayer
    from keras.models import Sequential

#Import XGBoost
if env=='xgboost' or env=='base':
    from xgboost import XGBRegressor

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('Environment is: '+env)

Environment is: AzureML


### Data Prep

In [2]:
'''Data Loads'''
#Load data into dataframes for analysis
def load_input_data():
    """Read the training feature file 'inputdata/training_all.csv' into a dataframe."""
    #The original competition features are in 'inputdata/dengue_features_train.csv'
    return pd.read_csv('inputdata/training_all.csv')

def load_label_data():
    """Read the training label file 'inputdata/dengue_labels_train.csv' into a dataframe."""
    return pd.read_csv('inputdata/dengue_labels_train.csv')

def load_holdout_data():
    """Read the holdout feature file 'inputdata/holdout_all.csv' into a dataframe."""
    #The original competition test features are in 'inputdata/dengue_features_test.csv'
    return pd.read_csv('inputdata/holdout_all.csv')

def load_original_data():
    """Read the three original competition files.

    Returns a tuple (training features, training labels, test features).
    """
    source_files=(
        'inputdata/dengue_features_train.csv',
        'inputdata/dengue_labels_train.csv',
        'inputdata/dengue_features_test.csv',
    )
    return tuple(pd.read_csv(path) for path in source_files)

def load_all_data():
    """Read the three '*_all.csv' files from 'inputdata/'.

    Returns a tuple (training features, labels, holdout features).
    """
    source_files=(
        'inputdata/training_all.csv',
        'inputdata/labels_all.csv',
        'inputdata/holdout_all.csv',
    )
    return tuple(pd.read_csv(path) for path in source_files)


In [3]:
'''
Interpolate data
'''
#Interpolate missing data for each time series. This uses a simple linear interpolation for each series. Primarily ndvi measures
def interp(df):
    """Fill missing values in df by linear interpolation.

    The dataframe is modified in place, filling in the forward direction
    only, and the same object is returned for convenience.
    """
    df.interpolate(inplace=True,limit_direction='forward',method='linear')
    return df

'''
Create time lagged data horizontally
'''
#Builds shifted copies of the leading columns of a dataframe and joins them on as extra columns
def create_lag_features(df,lag,end_col=0):
    """Append shifted copies of the first end_col columns for every lag step.

    For each step s in 0..lag-1 the first end_col columns are shifted down by
    s rows and joined on with the suffix '_shift_<s>'. The first lag rows are
    then dropped and the index is renumbered from zero.
    """
    for step in range(lag):
        shifted=df.iloc[:,:end_col].shift(periods=step)
        df=df.join(shifted,rsuffix='_shift_'+str(step))

    return df.iloc[lag:,:].reset_index(drop=True)
    

In [4]:
'''Feature Scaling and Engineering'''

#create a one-hot encoding scheme for the week number to eliminate effects of week order
def week_encoder(df):
    """One-hot encode the 'weekofyear' column and append the indicator columns.

    Args:
        df: dataframe containing a 'weekofyear' column.

    Returns:
        (df, names): a new dataframe with one 'week_<encoder feature name>'
        column per distinct week value joined on, and the list of those
        column names.
    """
    encoder=OneHotEncoder()
    weeks=np.array(df['weekofyear']).reshape(-1,1)
    encoded=encoder.fit_transform(weeks).toarray()

    #get_feature_names was removed from scikit-learn in favour of
    #get_feature_names_out; use whichever this installation provides
    name_getter=getattr(encoder,'get_feature_names_out',None) or encoder.get_feature_names
    week_names=['week_'+str(name) for name in name_getter()]

    #Reuse df's index so the join lines rows up even when df does not carry a
    #default 0..n-1 index (e.g. after filtering rows)
    input_onehot=pd.DataFrame(encoded,columns=week_names,index=df.index)

    #return the df with weeks one-hot encoded and the list of week names
    return df.join(input_onehot), week_names


#Shared worker for the four scaling functions below
def _fit_and_apply_scaler(scaler,df_input,df_holdout,scale_list):
    """Fit scaler on df_input[scale_list] and overwrite those columns in both frames.

    The scaler is fitted on the training columns only and the holdout columns
    are transformed with the same fitted scaler. Both dataframes are modified
    in place.

    Returns:
        (df_input, df_holdout, scaler): the two (mutated) dataframes and the
        fitted scaler.
    """
    #fit on the training columns, then transform both sets with the same fit
    scaler=scaler.fit(df_input[scale_list])
    np_i_sc=scaler.transform(df_input[scale_list])
    np_h_sc=scaler.transform(df_holdout[scale_list])

    #update the original dataframes with the scaled values, column by column
    for position,column in enumerate(scale_list):
        df_input[column]=np_i_sc[:,position]
        df_holdout[column]=np_h_sc[:,position]

    return df_input, df_holdout, scaler

#Applies a standard scaler to the training and holdout data; returns both dfs and the fitted scaler
def scale_standard(df_input,df_holdout,scale_list):
    """Scale the scale_list columns of both frames with a StandardScaler fitted on df_input."""
    return _fit_and_apply_scaler(StandardScaler(),df_input,df_holdout,scale_list)

#Applies a MinMax scaler to the training and holdout data; returns both dfs and the fitted scaler
def scale_MinMax(df_input,df_holdout,scale_list,feature_range=(0,1)):
    """Scale the scale_list columns of both frames with a MinMaxScaler fitted on df_input.

    feature_range is passed straight to MinMaxScaler.
    """
    return _fit_and_apply_scaler(MinMaxScaler(feature_range=feature_range),df_input,df_holdout,scale_list)

#Applies a robust scaler to the training and holdout data; returns both dfs and the fitted scaler
def scale_robust(df_input,df_holdout,scale_list):
    """Scale the scale_list columns of both frames with a RobustScaler fitted on df_input."""
    return _fit_and_apply_scaler(RobustScaler(),df_input,df_holdout,scale_list)

#Applies a max absolute scaler to the training and holdout data; returns both dfs and the fitted scaler
def max_abs_scaler(df_input,df_holdout,scale_list):
    """Scale the scale_list columns of both frames with a MaxAbsScaler fitted on df_input."""
    return _fit_and_apply_scaler(MaxAbsScaler(),df_input,df_holdout,scale_list)


In [6]:
'''Data Splitting'''
#reduce df_labels to just the target value
def get_targets(df):
    """Return a single-column dataframe holding only the 'total_cases' values of df."""
    target_series=df['total_cases']
    return pd.DataFrame(target_series)


#Create function that only splits a dataset by city
def split_by_city(df):
    """Return two dataframes: the rows whose 'city' is 'sj', then those whose 'city' is 'iq'."""
    city=df['city']
    return df[city=='sj'], df[city=='iq']

In [14]:
'''Feature List Creation'''
#Create lists of features for each city and the lag for that feature
#The *_lags dictionaries map a feature name to its number of lag steps and are what
#get_feature_list() iterates over; entries prefixed with '#' are switched off.
#NOTE(review): sj_features/iq_features do not appear to be read by any function in this
#file - confirm whether other notebooks use them.
sj_features=[
    'year',
    'yearcount',
    'weekofyear',
    #'outbreak_severity',
    'station_max_temp_c',
    'station_min_temp_c',
    'cum_rain_prior_24_wks',
    'avg_max_temp_prior_22_wks',
    #'total_cases'
    #'avg_max_temp_prior_24_wks',
    #'avg_min_temp_prior_21_wks'
]

#Lag steps per sj feature; a value of 0 yields no '_shift_' names in get_feature_list()
sj_lags={
    'year':0,
    'yearcount':0,
    'weekofyear':0,
    #'outbreak_severity':0,
    'station_max_temp_c':0,
    'station_min_temp_c':0,
    'cum_rain_prior_24_wks':46,
    'avg_max_temp_prior_22_wks':0,
    #'total_cases':3
    #'avg_max_temp_prior_24_wks':0,
    #'avg_min_temp_prior_21_wks':0
}

iq_features=[
    'year',
    'yearcount',
    'weekofyear',
    #'outbreak_severity',
    'reanalysis_min_air_temp_k',
    'station_max_temp_c',
    'cum_rain_prior_22_wks',
    #'total_cases'
    #'avg_min_temp_prior_4_wks',
    #'avg_specific_humidity_prior_4_wks'
]

#Lag steps per iq feature
#NOTE(review): 'outbreak_severity' is still active here although it is commented out in
#iq_features and in sj_lags, so get_feature_list('iq',lag_names=False) returns it - confirm
#this is intended.
iq_lags={
    'year':0,
    'yearcount':0,
    'weekofyear':0,
    'outbreak_severity':0,
    'reanalysis_min_air_temp_k':0,
    'station_max_temp_c':0,
    'cum_rain_prior_22_wks':43,
    #'total_cases':3
    #'avg_min_temp_prior_4_wks':0,
    #'avg_specific_humidity_prior_4_wks':0
}

def get_feature_list(city,lag_names=True):
    """Return the feature names configured for a city in sj_lags / iq_lags.

    Args:
        city: 'sj' or 'iq'.
        lag_names: when True, return one '<feature>_shift_<i>' name for each
            i in range(lag) of every configured feature (features with a lag
            of 0 therefore contribute no names); otherwise return the plain
            feature names.

    Returns:
        A new list of feature name strings.

    Raises:
        ValueError: if city is neither 'sj' nor 'iq' (previously this fell
            through to an UnboundLocalError on the return statement).
    """
    lags_by_city={'sj':sj_lags,'iq':iq_lags}
    if city not in lags_by_city:
        raise ValueError("Unknown city "+repr(city)+"; expected 'sj' or 'iq'")
    city_lags=lags_by_city[city]

    #keep the strict ==True comparison so truthy non-boolean values behave as before
    if lag_names==True:
        return [str(name)+'_shift_'+str(i)
                for name,lag in city_lags.items()
                for i in range(lag)]
    return [str(name) for name in city_lags]

#returns a list of features to scale
def get_standard_scale_list():
    """Return a new list with the column names listed for standard scaling."""
    reanalysis_columns=[
        'reanalysis_air_temp_k',
        'reanalysis_avg_temp_k',
        'reanalysis_dew_point_temp_k',
        'reanalysis_max_air_temp_k',
        'reanalysis_min_air_temp_k',
        'reanalysis_specific_humidity_g_per_kg',
    ]
    station_columns=[
        'station_avg_temp_c',
        'station_max_temp_c',
        'station_min_temp_c',
    ]
    return reanalysis_columns+station_columns

def get_minmax_scale_list():
    """Return a new list with the column names listed for min-max scaling."""
    return [
        'year',
        'yearcount',
        'weekofyear',
        'total_cases',
        'station_diur_temp_rng_c',
        'reanalysis_tdtr_k',
        'precipitation_amt_mm',
        'reanalysis_precip_amt_kg_per_m2',
        'reanalysis_relative_humidity_percent',
        'reanalysis_sat_precip_amt_mm',
        'station_precip_mm',
        'cum_rain_prior_24_wks',
        'cum_rain_prior_22_wks',
        'avg_max_temp_prior_22_wks',
        'avg_specific_humidity_prior_4_wks',
        'avg_max_temp_prior_24_wks',
        'avg_min_temp_prior_21_wks',
        'avg_min_temp_prior_4_wks',
        'outbreak_severity',
    ]

def get_robust_scale_list():
    """Return a new list with the column names listed for robust scaling."""
    return [
        'year',
        'yearcount',
        'weekofyear',
        'outbreak_severity',
        'reanalysis_min_air_temp_k',
        'station_max_temp_c',
        'cum_rain_prior_22_wks',
        'station_min_temp_c',
        'cum_rain_prior_24_wks',
        'avg_max_temp_prior_22_wks',
    ]

def get_mas_scale_list():
    """Return a new list with the column names listed for max-abs scaling.

    The contents are currently the same as the robust scaling list.
    """
    mas_scale_list=[
        'year',
        'yearcount',
        'weekofyear',
        'outbreak_severity',
        'reanalysis_min_air_temp_k',
        'station_max_temp_c',
        'cum_rain_prior_22_wks',
        'station_min_temp_c',
        'cum_rain_prior_24_wks',
        'avg_max_temp_prior_22_wks',
    ]
    return mas_scale_list


In [8]:
def pre_process_data(city,scale_norm=True,lookback=50,train_split=.8,test_split=.1,valid_split=.1,xy_split=.2,time_series_split=True):
    """Load, scale, lag and split the data for one city.

    Args:
        city: city code to keep ('sj' or 'iq').
        scale_norm: kept for backward compatibility; not used.
        lookback: number of lag steps passed to create_lag_features; the same
            number of leading rows is dropped from the features and labels.
        train_split, test_split, valid_split: fractions of the lag-trimmed
            training rows assigned to the train, test and validation sets.
        xy_split: test_size passed to train_test_split when
            time_series_split is False.
        time_series_split: True cuts train/test/validation in row order;
            False takes the validation set off the end of the training rows
            and splits the remaining rows at random (random_state=43).

    Returns:
        x_train, y_train, x_test, y_test, x_valid, y_valid, x_holdout,
        y_holdout. y_holdout is the zero-filled 'total_cases' column of the
        holdout rows.
    """
    #Load data
    df_input, df_labels, df_holdout = load_all_data()

    #Interp data
    df_input=interp(df_input)
    df_holdout=interp(df_holdout)

    #Designate city to prep data for
    df_input=df_input[df_input['city']==city]
    #copy so the column writes below act on an independent frame, not a slice
    df_holdout=df_holdout[df_holdout['city']==city].copy()
    df_labels=df_labels[df_labels['city']==city]

    #Create consolidated training file: training rows first, holdout rows after
    #(DataFrame.append was removed in pandas 2.0; concat is the equivalent call)
    df_all=pd.concat([df_input,df_holdout],ignore_index=True)
    df_holdout['total_cases']=0
    df_holdout_labels=df_holdout['total_cases'].copy()

    #Remember the length of each df to break them back apart; the lag step
    #below drops the first lookback rows, so account for that here
    len_input=len(df_input)-lookback
    len_labels=len(df_labels)-lookback

    #Scale the data (the fitted scaler itself is not needed afterwards)
    df_all,df_holdout,_=scale_robust(df_all,df_holdout,get_robust_scale_list())

    #Get the list of features to train and keep only those columns
    training_feature_list=list(get_feature_list(city,lag_names=False))
    df_all_lag=df_all[training_feature_list]

    #Create lagged data
    df_all_lag=create_lag_features(df_all_lag,lag=lookback,end_col=df_all_lag.shape[1])
    df_labels=df_labels.iloc[lookback:,:].reset_index(drop=True)

    #Row positions at which the sets are cut
    train_end_x=int(len_input*train_split)
    test_end_x=int(len_input*(train_split+test_split))
    train_end_y=int(len_labels*train_split)
    test_end_y=int(len_labels*(train_split+test_split))

    #Everything after the training rows belongs to the holdout set
    df_x_holdout_city=df_all_lag.iloc[len_input:]
    df_y_holdout_city=df_holdout_labels

    #Split training data into train and test sets for each city
    if time_series_split==True: #Split the test and train set along a time series
        df_x_train_city=df_all_lag.iloc[:train_end_x,:]
        df_y_train_city=df_labels.iloc[:train_end_y,:].reset_index(drop=True)
        df_x_test_city=df_all_lag.iloc[train_end_x:test_end_x,:]
        #upper bound uses the label length (it previously used the feature length)
        df_y_test_city=df_labels.iloc[train_end_y:test_end_y,:]
        df_x_valid_city=df_all_lag.iloc[test_end_x:int(len_input*(train_split+test_split+valid_split)),:]
        df_y_valid_city=df_labels.iloc[test_end_y:,:]
    else: #split test and train randomly
        df_x_valid_city=df_all_lag.iloc[len_input-int(len_input*(valid_split)):len_input,:]
        df_y_valid_city=df_labels.iloc[-(int(len_labels*(valid_split))):,:]
        #both slices use the feature-based bound so x and y have equal length
        df_x_train_city, df_x_test_city, df_y_train_city, df_y_test_city=train_test_split(
            df_all_lag.iloc[:test_end_x,:],
            df_labels.iloc[:test_end_x,:],
            test_size=xy_split, random_state=43)

    return df_x_train_city, df_y_train_city, df_x_test_city, df_y_test_city, df_x_valid_city, df_y_valid_city, df_x_holdout_city, df_y_holdout_city


In [9]:
def pre_process_data_smote(city,lookback=50,train_split=.8,test_split=.2):
    """Prepare SMOTE-oversampled, scaled train/test data and a holdout frame for one city.

    Args:
        city: city code to keep ('sj' or 'iq').
        lookback: number of trailing training rows that are left out of the
            SMOTE input and returned at the start of the holdout frame.
        train_split: kept for backward compatibility; not used.
        test_split: test_size passed to train_test_split (random_state=42).

    Returns:
        x_train, y_train, x_test, y_test, df_holdout. The y values are the
        'total_cases' column of the resampled data taken before scaling.

    Side effects:
        Writes the pre-SMOTE training frame to 'temp.csv'.

    NOTE(review): the earlier version also built lag features and sliced the
    loaded labels, but neither result was ever read; those steps are removed.
    Confirm lag features were not meant to feed the SMOTE input.
    """
    #Load data (the separate label file is not needed: labels come from 'total_cases')
    df_input, _, df_holdout = load_all_data()

    #Interp data
    df_input=interp(df_input)
    df_holdout=interp(df_holdout)

    #Designate city to prep data for
    df_input=df_input[df_input['city']==city]
    df_holdout=df_holdout[df_holdout['city']==city]

    #Create consolidated file: training rows first, holdout rows after
    #(DataFrame.append was removed in pandas 2.0; concat is the equivalent call)
    df_all=pd.concat([df_input,df_holdout],ignore_index=True)

    #drop city and week start date for SMOTE
    df_all.drop(columns=['city','week_start_date'],inplace=True)

    #Number of training rows that go into SMOTE
    len_input=len(df_input)-lookback

    #split the holdout file away from the input/training file
    df_input=df_all.iloc[:len_input,:]
    #copy so the scaling step writes to an independent frame, not a slice of df_all
    df_holdout=df_all.loc[len_input:,:].copy()

    #NOTE(review): looks like a debugging artifact - kept because it is an observable side effect
    df_input.to_csv('temp.csv')

    #create smote records for df_input based on outbreak severity
    from imblearn.over_sampling import SMOTE
    smt=SMOTE()
    df_sev=df_input['outbreak_severity'] #for purposes of smote, the class variable outbreak severity is the y value
    #fit_sample was renamed fit_resample in imbalanced-learn; use whichever exists
    resample=getattr(smt,'fit_resample',None) or smt.fit_sample
    df_input, df_sev = resample(df_input, df_sev) #this should give us a balanced set of variables
    df_labels=df_input['total_cases'] #get total cases so the labels align with the input records

    #Scale and normalize
    standard_scale_list=get_standard_scale_list()
    minmax_scale_list=get_minmax_scale_list()
    df_input,df_holdout,scaler_standard=scale_standard(df_input,df_holdout,standard_scale_list)
    df_input,df_holdout,scaler_minmax=scale_MinMax(df_input,df_holdout,minmax_scale_list,feature_range=(0,1))

    #randomly create train and test as opposed to time series split
    x_train,x_test,y_train,y_test=train_test_split(df_input,df_labels,test_size=test_split,random_state=42)

    #return the datasets
    return x_train,y_train,x_test,y_test,df_holdout
    

In [10]:
'''
Submission File Creation
'''
def create_submit_file(y_pred_sj,y_pred_iq):
    """Write the submission csv from the predictions of the two city models.

    The sj predictions followed by the iq predictions are flattened into one
    array, rounded to the nearest integer, clipped at zero and stored in the
    'total_cases' column of 'inputdata/submission_format.csv'. The result is
    written to 'outputdata/submit_file.csv' (the directory is created when
    missing).

    Args:
        y_pred_sj: array-like of predictions for sj.
        y_pred_iq: array-like of predictions for iq.
    """
    import os

    #create a single flat array from the two results
    np_submit=np.append(y_pred_sj,y_pred_iq)

    #Round the results to the nearest integer value; astype(int) alone would
    #truncate toward zero (np.rint rounds exact halves to the even integer)
    np_submit=np.rint(np_submit).astype(int)

    #Replace any negative values with zero
    np_submit=np.clip(np_submit,0,None)

    #Open the submission file and create a df
    df_submit=pd.read_csv('inputdata/submission_format.csv')

    #update the target values (total_cases) with the predictions
    df_submit['total_cases']=np_submit

    #write the submission file to a csv
    os.makedirs('outputdata',exist_ok=True)
    df_submit.to_csv('outputdata/submit_file.csv',index=False)

In [11]:
    
'''
Evaluate results
'''
def evaluate_results(model_sj,model_iq,df_x_test_sj,df_x_test_iq,df_y_test_sj,df_y_test_sq,target):
    """Print the mean absolute error of each city model and of both combined.

    Args:
        model_sj, model_iq: fitted models exposing predict().
        df_x_test_sj, df_x_test_iq: feature sets passed to each model.
        df_y_test_sj: true labels for sj, indexed by target.
        df_y_test_sq: true labels for iq, indexed by target. The name is a
            typo for "iq"; it is kept so keyword callers continue to work.
        target: column name of the label to score.

    The per-city scores use the raw predictions; the combined score uses the
    predictions with negative values replaced by zero.
    """
    #The body used to read df_y_test_iq, which is not a parameter, so it only
    #worked when a global of that name happened to exist
    df_y_test_iq=df_y_test_sq

    y_pred_sj=model_sj.predict(df_x_test_sj)
    y_pred_iq=model_iq.predict(df_x_test_iq)
    y_pred_combined=np.append(y_pred_sj,y_pred_iq)
    y_pred_combined=np.where(y_pred_combined>0,y_pred_combined,0) #remove negative predictions
    y_target_combined=np.append(df_y_test_sj[target],df_y_test_iq[target])
    print('MAE of SJ: '+ str(mean_absolute_error(df_y_test_sj[target],y_pred_sj)))
    print('MAE of IQ: '+ str(mean_absolute_error(df_y_test_iq[target],y_pred_iq)))
    print('MAE of Combined: ' + str(mean_absolute_error(y_target_combined,y_pred_combined)))
    
    

In [13]:
'''
#One time procedure to create a combined file with engineered features for cumulative min/max temperature and humidity
#load and interpolate data
df_i,df_l,df_h = load_original_data()
df_i=interp(df_i)
df_h=interp(df_h)

df_i['total_cases']=df_l['total_cases']
df_h['total_cases']=0

#Begin SJ
#Break out source file and holdout file by city
df_sj=df_i[df_i['city']=='sj']
df_sj_h=df_h[df_h['city']=='sj']
len_sj=len(df_sj)
len_sj_h=len(df_sj_h)

#concat training and holdout data
df_sj=df_sj.append(df_sj_h)

#Create an outbreak label for SJ
df_sj['outbreak_severity']=0
df_sj.loc[df_sj['total_cases']<=50,'outbreak_severity']=0
df_sj.loc[df_sj['total_cases']>50,'outbreak_severity']=1
df_sj.loc[df_sj['total_cases']>100,'outbreak_severity']=2
df_sj.loc[df_sj['total_cases']>175,'outbreak_severity']=3
df_sj.loc[df_sj['total_cases']>300,'outbreak_severity']=4

#Get rolling totals and averages over trailing windows of 2 to 24 weeks
for i in range(2,25):
    df_sj['cum_rain_prior_'+str(i)+'_wks']=df_sj['precipitation_amt_mm'].rolling(i).sum()

for i in range(2,25):
    df_sj['avg_min_temp_prior_'+str(i)+'_wks']=df_sj['station_min_temp_c'].rolling(i).mean()
    
for i in range(2,25):
    df_sj['avg_max_temp_prior_'+str(i)+'_wks']=df_sj['station_max_temp_c'].rolling(i).mean()
    
for i in range(2,25):
    df_sj['avg_specific_humidity_prior_'+str(i)+'_wks']=df_sj['reanalysis_specific_humidity_g_per_kg'].rolling(i).mean()
    
for i in range(2,25):
    df_sj['avg_relative_humidity_prior_'+str(i)+'_wks']=df_sj['reanalysis_relative_humidity_percent'].rolling(i).mean()
    
for i in range(2,3):
    df_sj['avg_total_cases_'+str(i)+'_wks']=df_sj['total_cases'].rolling(i).mean()
    
for i in range(2,3):
    df_sj['cum_total_cases_'+str(i)+'_wks']=df_sj['total_cases'].rolling(i).sum()


#split the files back apart
df_sj_h=df_sj.iloc[len_sj:]
df_sj=df_sj.iloc[:len_sj]

#Begin IQ
#Break out source file and holdout file by city
df_iq=df_i[df_i['city']=='iq']
df_iq_h=df_h[df_h['city']=='iq']
len_iq=len(df_iq)
len_iq_h=len(df_iq_h)

#concat training and holdout data
df_iq=df_iq.append(df_iq_h)

#Create outbreak label for iq
df_iq['outbreak_severity']=0
df_iq.loc[df_iq['total_cases']<=20,'outbreak_severity']=0
df_iq.loc[df_iq['total_cases']>20,'outbreak_severity']=1
df_iq.loc[df_iq['total_cases']>39,'outbreak_severity']=2

#Get rolling totals and averages over trailing windows of 2 to 24 weeks
#Get cumulative rainfall totals over trailing windows of 2 to 24 weeks
for i in range(2,25):
    df_iq['cum_rain_prior_'+str(i)+'_wks']=df_iq['precipitation_amt_mm'].rolling(i).sum()

for i in range(2,25):
    df_iq['avg_min_temp_prior_'+str(i)+'_wks']=df_iq['station_min_temp_c'].rolling(i).mean()
    
for i in range(2,25):
    df_iq['avg_max_temp_prior_'+str(i)+'_wks']=df_iq['station_max_temp_c'].rolling(i).mean()
    
for i in range(2,25):
    df_iq['avg_specific_humidity_prior_'+str(i)+'_wks']=df_iq['reanalysis_specific_humidity_g_per_kg'].rolling(i).mean()
    
for i in range(2,25):
    df_iq['avg_relative_humidity_prior_'+str(i)+'_wks']=df_iq['reanalysis_relative_humidity_percent'].rolling(i).mean()
    
for i in range(2,3):
    df_iq['avg_total_cases_'+str(i)+'_wks']=df_iq['total_cases'].rolling(i).mean()
    
for i in range(2,3):
    df_iq['cum_total_cases_'+str(i)+'_wks']=df_iq['total_cases'].rolling(i).sum()

#split the files back apart
df_iq_h=df_iq.iloc[len_iq:]
df_iq=df_iq.iloc[:len_iq]

#Create train and holdout files
#Create the input and holdout files with engineered features
df_all=df_sj.append(df_iq)
df_all.to_csv('training_all_all.csv',index=False)
df_holdout=df_sj_h.append(df_iq_h)
df_holdout.to_csv('holdout_all_all.csv',index=False)
'''

"\n#One time procedure to create a combined file with engineered features for cumulative min/max temperature and humidity\n#load and interpolate data\ndf_i,df_l,df_h = load_original_data()\ndf_i=interp(df_i)\ndf_h=interp(df_h)\n\ndf_i['total_cases']=df_l['total_cases']\ndf_h['total_cases']=0\n\n#Begin SJ\n#Break out source file and holdout file by city\ndf_sj=df_i[df_i['city']=='sj']\ndf_sj_h=df_h[df_h['city']=='sj']\nlen_sj=len(df_sj)\nlen_sj_h=len(df_sj_h)\n\n#concat training and holdout data\ndf_sj=df_sj.append(df_sj_h)\n\n#Create an outbreak label for SJ\ndf_sj['outbreak_severity']=0\ndf_sj.loc[df_sj['total_cases']<=50,'outbreak_severity']=0\ndf_sj.loc[df_sj['total_cases']>50,'outbreak_severity']=1\ndf_sj.loc[df_sj['total_cases']>100,'outbreak_severity']=2\ndf_sj.loc[df_sj['total_cases']>175,'outbreak_severity']=3\ndf_sj.loc[df_sj['total_cases']>300,'outbreak_severity']=4\n\n#Get cumulative totals at various intervals - past 4 to past 25 weeks\nfor i in range(2,25):\n    df_sj['cum