In [8]:
# Start writing code here...
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  
from sklearn.preprocessing import StandardScaler

from sklearn.impute import KNNImputer
from datetime import datetime
from datetime import date
from tqdm import tqdm



In [5]:
# Prefix prepended to every "data/..." file name this notebook reads or writes.
# Leave it empty to resolve those names against the current working directory;
# otherwise include the trailing separator (e.g. "/some/dir/"), because the
# prefix is joined to the file name by plain string concatenation.
path = ""

In [6]:
# # If you are running on colab, uncomment these lines
# path = "/content/drive/MyDrive/209 Project/"
# from google.colab import drive 
# drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


## Load preprocessed data

In [9]:
# Read the aggregated temporal + static feature table.
temp_static = pd.read_csv(path + "data/temp_static.csv")

# Normalise the county code to a 5-character, zero-padded string
# (the sample row shown below this cell has it stored as the number 1001).
temp_static['fip'] = temp_static['fip'].astype(int).astype(str).str.zfill(5)

# Turn the 'YYYY-MM-DD' text into datetime.date objects so the column can be
# compared against the datetime.date split boundaries defined further down.
temp_static['date'] = pd.to_datetime(temp_static['date'], format='%Y-%m-%d').dt.date

print("Aggregated temporal and static data shape: ", temp_static.shape)
temp_static.head(1)

Aggregated temporal and static data shape:  (490806, 22)


Unnamed: 0,fip,County Name,State,date,daily_confirmed,new_cases,trip_ratio,short_trip_ratio,med_trip_ratio,long_trip_ratio,avg_stay_at_home_ratio,log_new_cases,mean_neighbor_daily_confirmed,mean_neightbor_new_cases,neighbor_log_new_cases,over_65_percent,no_diploma_percent,no_insurance_percent,unemployed_percent,poverty_percent,over_65_percent.1,beds_per_1000
0,1001,Autauga County,AL,2020-05-02,45,3.0,3.212118,0.776026,0.239752,0.020611,0.192281,1.386294,65.333333,4.666667,1.734601,-0.774663,-0.166213,-0.644388,-0.372038,-0.376802,-0.774663,-0.111074


## Split Train and Test

In [14]:
# Length of the look-back window: every sample carries this many lagged rows per feature.
timesteps = 28

# Distinct county codes, in order of first appearance in the table.
fips = temp_static["fip"].unique().tolist()
print("We have %s of counties now" % len(fips))

We have 2682 of counties now


In [15]:
def create_temporal_data(data, target_col, timesteps):
    """Build a matrix of lagged values of one column, lagging within each county.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'fip' column and `target_col`. Values are shifted in the
        frame's existing row order inside each 'fip' group, so the rows should
        already be in chronological order per county.
    target_col : str
        Name of the column to lag.
    timesteps : int
        Number of lags to produce (lags 1 through `timesteps`).

    Returns
    -------
    numpy.ndarray
        Array of shape (len(data), timesteps). Column k holds `target_col`
        shifted by k + 1 rows, i.e. column 0 is the most recent previous row.
        Positions without enough history inside their county are NaN.
    """
    # The grouping does not depend on the lag, so build it once rather than
    # once per lag.
    per_county = data.groupby('fip')[target_col]
    lagged = [per_county.shift(step) for step in range(1, timesteps + 1)]

    # The list is (timesteps, rows); transpose so that rows come first.
    return np.array(lagged).T

In [16]:
def split_train_val_test(data, splits):
    """Cut `data` into consecutive train / validation / test slices along its first axis.

    `splits` supplies the boundary positions: train is data[splits[0]:splits[1]],
    validation is data[splits[1]:splits[2]], and test runs from splits[2] to the
    end of `data`. Entries of `splits` beyond index 2 are not consulted, and
    anything located before splits[0] is left out of all three pieces.

    Returns the tuple (train, val, test).
    """
    train_start, val_start, test_start = splits[0], splits[1], splits[2]
    return (
        data[train_start:val_start],
        data[val_start:test_start],
        data[test_start:],
    )

In [17]:
# Calendar boundaries of the partitions: training covers [train_date, val_date),
# validation covers [val_date, test_date), and test is everything from
# test_date onwards.
train_date = date(2020, 6, 1)
val_date = date(2020, 9, 1)
test_date = date(2020, 10, 1)

n_counties_total = temp_static["fip"].nunique()


def _rows_per_county(frame):
    """Return the number of dated rows each county has in `frame`.

    The splits computed below are positional offsets that are applied to every
    county alike, which is only valid when all counties contribute the same
    number of rows. Raise ValueError instead of silently using the first
    count when that does not hold (or when a county is missing altogether).
    """
    counts = frame.groupby("fip")["date"].count()
    distinct = counts.unique()
    if len(counts) != n_counties_total or len(distinct) != 1:
        raise ValueError(
            "Counties do not all have the same number of rows in this date range: "
            "%d of %d counties present, distinct row counts %s"
            % (len(counts), n_counties_total, sorted(int(c) for c in distinct))
        )
    # Plain int (not a numpy scalar) so the offsets print and slice predictably.
    return int(distinct[0])


# Evaluate each date comparison once and combine the masks as needed.
on_or_after_train = temp_static["date"] >= train_date
before_val = temp_static["date"] < val_date
on_or_after_val = temp_static["date"] >= val_date
before_test = temp_static["date"] < test_date

total_len = _rows_per_county(temp_static)
target_len = _rows_per_county(temp_static[on_or_after_train])
train_range = _rows_per_county(temp_static[on_or_after_train & before_val])
val_range = _rows_per_county(temp_static[on_or_after_val & before_test])

# Rows dated before train_date are never used as samples; counting them gives
# the position at which the training slice starts within each county.
ignore_len = total_len - target_len
train_cutoff = ignore_len + train_range
val_cutoff = train_cutoff + val_range

# Positional boundaries within one county's rows:
# [train start, train end / val start, val end / test start, end of data]
splits = [ignore_len, train_cutoff, val_cutoff, total_len]
print(splits)

[30, 122, 152, 183]


In [18]:
# Columns that are lagged over the look-back window to form the temporal input.
temp_cols = [
    'log_new_cases',
    'trip_ratio',
    'short_trip_ratio',
    'avg_stay_at_home_ratio',
    'mean_neightbor_new_cases',
]
n_temp_features = len(temp_cols)

# Columns taken as-is from the row being predicted to form the static input.
# NOTE(review): 'over_65_percent.1' has the same value as 'over_65_percent' in
# the sample row shown above and looks like a duplicated column - confirm.
# NOTE(review): 'neighbor_log_new_cases' sits next to the other case-count
# columns in the table and is read from the same row as the target, so it may
# carry same-day information - confirm that this is intended.
static_cols = [
    'neighbor_log_new_cases',
    'over_65_percent',
    'no_diploma_percent',
    'no_insurance_percent',
    'unemployed_percent',
    'poverty_percent',
    'over_65_percent.1',
    'beds_per_1000',
]
n_static_features = len(static_cols)

print(n_temp_features, n_static_features)

5 8


In [19]:
# One bucket per partition, in the order [train, val, test]; every bucket
# collects one array per county.
X_temporal = [[], [], []]
X_static = [[], [], []]
y = [[], [], []]
lag = 3  # NOTE(review): not read anywhere in this cell - confirm whether a later cell still needs it

# Index the rows by county once instead of scanning the whole table with a
# boolean mask for every single county.
rows_by_county = temp_static.groupby("fip", sort=False)
expected_shape = (total_len, timesteps, n_temp_features)

for fip in tqdm(fips):
    # Both the lagging and the positional split below rely on the rows being in
    # chronological order, so enforce it (stable sort; no-op if already sorted).
    selected = rows_by_county.get_group(fip).sort_values("date", kind="mergesort")

    # temporal inputs: one (rows, timesteps) lag matrix per feature
    temp_lags = [create_temporal_data(selected, col, timesteps) for col in temp_cols]

    # Stack the per-feature matrices along a new last axis so that
    # temporal_inputs[row, t, f] is feature f at lag t + 1. Concatenating along
    # axis 1 and reshaping to (rows, timesteps, features) would instead
    # reinterpret the feature-major buffer and mix lags with features.
    # NOTE(review): index 0 of the timestep axis is the most recent lag, so the
    # sequence runs backwards in time - confirm the model expects that order.
    temporal_inputs = np.stack(temp_lags, axis=-1)
    if temporal_inputs.shape != expected_shape:
        raise ValueError(
            "County %s produced temporal inputs of shape %s, expected %s"
            % (fip, temporal_inputs.shape, expected_shape)
        )
    temp_train, temp_val, temp_test = split_train_val_test(temporal_inputs, splits)
    X_temporal[0].append(temp_train)
    X_temporal[1].append(temp_val)
    X_temporal[2].append(temp_test)

    # static inputs: taken from the same row as the response
    static_inputs = selected[static_cols].values
    static_train, static_val, static_test = split_train_val_test(static_inputs, splits)
    X_static[0].append(static_train)
    X_static[1].append(static_val)
    X_static[2].append(static_test)

    # response variable, kept 2-D with shape (rows, 1)
    response = selected[["log_new_cases"]].values
    resp_train, resp_val, resp_test = split_train_val_test(response, splits)
    y[0].append(resp_train)
    y[1].append(resp_val)
    y[2].append(resp_test)
    

100%|██████████| 2682/2682 [06:38<00:00,  6.73it/s]


In [20]:
# Every bucket currently holds one array per county. Stack them and merge the
# leading (county, row) axes into a single sample axis, leaving the remaining
# feature axes untouched.
for part in range(3):
    for bucket in (X_temporal, X_static, y):
        stacked = np.asarray(bucket[part])
        n_counties, n_rows = stacked.shape[0], stacked.shape[1]
        bucket[part] = stacked.reshape((n_counties * n_rows,) + stacked.shape[2:])

# Report the train / val / test shapes of each input and of the response.
for bucket in (X_temporal, X_static, y):
    print(bucket[0].shape, bucket[1].shape, bucket[2].shape)

(246744, 28, 5) (80460, 28, 5) (83142, 28, 5)
(246744, 8) (80460, 8) (83142, 8)
(246744, 1) (80460, 1) (83142, 1)


In [21]:
# Persist every array under the data folder; np.save adds the ".npy" suffix.
# Index 0 / 1 / 2 of each container is the train / val / test partition.
arrays_to_save = [
    ("data/train_temp", X_temporal[0]),
    ("data/val_temp", X_temporal[1]),
    ("data/test_temp", X_temporal[2]),
    ("data/train_static", X_static[0]),
    ("data/val_static", X_static[1]),
    ("data/test_static", X_static[2]),
    ("data/train_y", y[0]),
    ("data/val_y", y[1]),
    ("data/test_y", y[2]),
]
for relative_name, array in arrays_to_save:
    np.save(path + relative_name, array)

