In [1]:
import pandas as pd
import numpy as np
import multiprocessing

from fbprophet import Prophet
from datetime import datetime
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn.metrics import mean_absolute_error as mae
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import scipy.stats

In [2]:
def Var_Opt(features, model, X_train, y_train):
    model.fit(X_train[features], y_train)
    return {'features': features, 
            'train score': np.round(model.score(X_train[features], y_train),3),
            'CV score': np.round(model.best_score_, 3)}

In [3]:
def lagger(x_vec, y_vec, x_lag, y_forecast):
    y_vec = pd.Series(y_vec)
    x_vec = pd.Series(x_vec)
    
    best_corr = []  
    for i in range(x_lag+1):
        best_corr.append({'lag': i,
                          'corr': np.abs((y_vec.shift(-y_forecast).corr(x_vec.shift(i))))})
    return x_vec.shift(pd.DataFrame(best_corr).sort_values('corr')[-1:].lag.values[0])

In [4]:
def preprocess_data(data, x_lag, y_forecast, remove_rows):
    
    res = {}
    for state in data.state.unique():
        if state in ['MP', 
                     'AS',
                     'GU',
                     'PR', 
                     'VI']:
            continue
        print('State: ', state)
        tmp_state = data[data.state == state]
        tmp = []
        for date in tmp_state.date.unique():
            tmp_state_day = tmp_state[tmp_state.date == date]
            tmp.append({'total_deceased': sum(tmp_state_day.total_deceased),
                        'average_temperature': np.mean(tmp_state_day.average_temperature),
                        'minimum_temperature': np.mean(tmp_state_day.minimum_temperature),
                        'maximum_temperature': np.mean(tmp_state_day.maximum_temperature),
                        'rainfall': np.mean(tmp_state_day.rainfall),
                        'dew_point': np.mean(tmp_state_day.dew_point),
                        'relative_humidity': np.mean(tmp_state_day.relative_humidity),
                        'google': np.mean(tmp_state_day.google),
                        'new_confirmed': sum(tmp_state_day.new_confirmed)})
        tmp = pd.DataFrame(tmp)
        
        for col in tmp.columns:
            if col != 'date':
                tmp[col+'_lag'] = lagger(tmp[col], tmp['new_confirmed'], x_lag, y_forecast)
        
        tmp = tmp.iloc[remove_rows:]
        tmp.reset_index(drop=True, inplace = True)
        
        res[state] = {'All Data':tmp, 
                      'X': tmp.iloc[:-y_forecast],
                      'X_future': tmp.iloc[-y_forecast:],
                      'y': tmp['new_confirmed'].shift(-y_forecast).iloc[:-y_forecast],
                     }
        
    return(res)

In [5]:
start_date_back = 182
end_date_back = 2
x_lag = 14
y_forecast = 7
remove_rows = 30

df1 = pd.read_csv("https://storage.googleapis.com/covid19-open-data/v2/main.csv", low_memory=False)
df1 = df1.dropna(subset=['subregion1_code'])
df1 = df1[df1.country_code == 'US']
df1 = df1[['date',
           'subregion1_code',
           'total_deceased',
           'average_temperature', 
           'minimum_temperature',
           'maximum_temperature', 
           'rainfall', 
           'dew_point', 
           'relative_humidity',
           'new_confirmed']]
df1.columns = ['date',
               'state',
               'total_deceased',
               'average_temperature', 
               'minimum_temperature',
               'maximum_temperature', 
               'rainfall', 
               'dew_point', 
               'relative_humidity',
               'new_confirmed']
df1['days_back'] = [(datetime.now() - day).days for day in pd.to_datetime(df1.date)]
df1 = df1[df1['days_back']<=start_date_back]
df1 = df1[df1['days_back']>=end_date_back]
df1.fillna(0, inplace = True)

df2 =  pd.read_csv("./google_clean copy.csv")
df2 =  pd.read_csv("./google_clean copy.csv")
df2.columns = ['date', 
               'state', 
               'direction', 
               'lag', 
               'google']
df2 = df2[['date','state','google']]

df3 = pd.merge(df1, df2,  how='inner', on=['date', 'state'])
df3.sort_values(['days_back'], inplace = True, ascending = False)
df3.reset_index(drop = True, inplace = True)

In [6]:
data = preprocess_data(df3, x_lag, y_forecast, remove_rows)

State:  AK
State:  VA
State:  IL
State:  IN
State:  ID
State:  IA
State:  KY
State:  LA
State:  OR
State:  MD
State:  MA
State:  ME
State:  KS
State:  UT
State:  CA
State:  WV
State:  CT
State:  CO
State:  FL
State:  DE
State:  DC
State:  WY
State:  AL
State:  AR
State:  AZ
State:  GA
State:  WA
State:  HI
State:  VT
State:  WI
State:  SD
State:  NE
State:  SC
State:  NJ
State:  NH
State:  TN
State:  NC
State:  ND
State:  PA
State:  OK
State:  OH
State:  NY
State:  NV
State:  NM
State:  RI
State:  MO
State:  TX
State:  MI
State:  MN
State:  MT
State:  MS


In [7]:
for state in data.keys():
    rf = RandomForestRegressor()
    rf.fit(data[state]['X'], data[state]['y'])
    feature_importance = pd.DataFrame()
    feature_importance['feature'] = data[state]['X'].columns
    feature_importance['importance'] = rf.feature_importances_
    feature_importance.sort_values(['importance'], ascending=False, inplace=True)
    feature_importance.reset_index(drop = True, inplace = True)
    data[state]['Feature Importance'] = feature_importance[feature_importance['importance']>0]

In [8]:
pipe = Pipeline(steps=[('estimator', SVR())])
params_grid = [{
                'estimator':[SVR()],
                'estimator__C': [1, 10, 100, 1000],
                'estimator__gamma': [0.001, 0.0001],
                },
                {
                'estimator': [RandomForestRegressor()],
                'estimator__n_estimators': [25, 50, 100, 150, 200],
                'estimator__max_depth': [1,2,3,4,5],
                },
    
               {'estimator':[LinearRegression()],
                'estimator__fit_intercept':[True, False]},
    
               {'estimator':[ElasticNet(max_iter=2000)],
                'estimator__alpha':[0, .1, .2, .5, .7, .9, 1],
                'estimator__l1_ratio':[0, .1, .2, .5, .7, .9, 1]},
    
                {'estimator':[KNeighborsRegressor()],
                'estimator__n_neighbors':[3, 5, 7, 9],
                'estimator__weights':['uniform', 'distance']}
              ]




In [11]:
num_cores = multiprocessing.cpu_count()

for state in np.sort(list(data.keys())):
    print(state)
    gridCV = GridSearchCV(pipe, params_grid, scoring = 'neg_mean_absolute_error')
    features_list = tqdm([list(data[state]['Feature Importance'].feature[:i+1].values) for i in range(len(data[state]['Feature Importance']))])
    processed_list = Parallel(n_jobs=num_cores)(delayed(Var_Opt)(features, 
                                                                 gridCV, 
                                                                 data[state]['X'], 
                                                                 data[state]['y']) for features in features_list)
    best_features = pd.DataFrame(processed_list)
    best_features.sort_values('CV score', ascending = False, inplace = True)
    best_features.reset_index(drop=True, inplace=True)
    data[state]['Best Features'] = best_features



  0%|          | 0/18 [00:00<?, ?it/s]

AK


100%|██████████| 18/18 [00:00<00:00, 173.49it/s]
100%|██████████| 18/18 [00:00<00:00, 5520.84it/s]

AL



100%|██████████| 18/18 [00:00<00:00, 6296.70it/s]

AR



100%|██████████| 18/18 [00:00<00:00, 6242.04it/s]

AZ



100%|██████████| 18/18 [00:00<00:00, 6215.32it/s]

CA



100%|██████████| 18/18 [00:00<00:00, 4567.30it/s]

CO



100%|██████████| 18/18 [00:00<00:00, 4491.49it/s]

CT



100%|██████████| 18/18 [00:00<00:00, 10299.79it/s]

DC



100%|██████████| 18/18 [00:00<00:00, 6224.03it/s]

DE



100%|██████████| 18/18 [00:00<00:00, 5806.60it/s]

FL



100%|██████████| 18/18 [00:00<00:00, 6153.51it/s]

GA



100%|██████████| 18/18 [00:00<00:00, 6231.22it/s]

HI



100%|██████████| 18/18 [00:00<00:00, 6212.25it/s]

IA



100%|██████████| 18/18 [00:00<00:00, 10485.76it/s]

ID



100%|██████████| 18/18 [00:00<00:00, 12876.94it/s]

IL



100%|██████████| 18/18 [00:00<00:00, 6006.64it/s]

IN



100%|██████████| 18/18 [00:00<00:00, 5806.16it/s]

KS



100%|██████████| 18/18 [00:00<00:00, 6070.88it/s]

KY



100%|██████████| 18/18 [00:00<00:00, 12787.51it/s]

LA



100%|██████████| 18/18 [00:00<00:00, 5490.73it/s]

MA



100%|██████████| 18/18 [00:00<00:00, 12328.13it/s]

MD



100%|██████████| 18/18 [00:00<00:00, 4207.16it/s]

ME



100%|██████████| 18/18 [00:00<00:00, 5516.00it/s]

MI



100%|██████████| 18/18 [00:00<00:00, 9965.35it/s]

MN



100%|██████████| 18/18 [00:00<00:00, 6252.38it/s]

MO



100%|██████████| 18/18 [00:00<00:00, 4429.30it/s]

MS



100%|██████████| 18/18 [00:00<00:00, 6266.91it/s]

MT



100%|██████████| 18/18 [00:00<00:00, 5896.40it/s]

NC



100%|██████████| 18/18 [00:00<00:00, 5871.18it/s]

ND



100%|██████████| 18/18 [00:00<00:00, 4568.68it/s]

NE



100%|██████████| 18/18 [00:00<00:00, 12216.42it/s]

NH



100%|██████████| 18/18 [00:00<00:00, 5859.33it/s]

NJ



100%|██████████| 18/18 [00:00<00:00, 6492.73it/s]

NM



100%|██████████| 18/18 [00:00<00:00, 4468.63it/s]

NV



100%|██████████| 18/18 [00:00<00:00, 4584.77it/s]

NY



100%|██████████| 18/18 [00:00<00:00, 5686.76it/s]

OH



100%|██████████| 18/18 [00:00<00:00, 5450.69it/s]

OK



100%|██████████| 18/18 [00:00<00:00, 5975.74it/s]

OR



100%|██████████| 18/18 [00:00<00:00, 12595.51it/s]

PA



100%|██████████| 18/18 [00:00<00:00, 5893.18it/s]

RI



100%|██████████| 18/18 [00:00<00:00, 6128.54it/s]

SC



100%|██████████| 18/18 [00:00<00:00, 6006.64it/s]

SD



100%|██████████| 18/18 [00:00<00:00, 4691.32it/s]

TN



100%|██████████| 18/18 [00:00<00:00, 5964.41it/s]

TX



100%|██████████| 18/18 [00:00<00:00, 4775.30it/s]

UT



100%|██████████| 18/18 [00:00<00:00, 4576.44it/s]

VA



100%|██████████| 18/18 [00:00<00:00, 4896.71it/s]

VT



100%|██████████| 18/18 [00:00<00:00, 12778.85it/s]

WA



100%|██████████| 18/18 [00:00<00:00, 6212.76it/s]

WI



100%|██████████| 18/18 [00:00<00:00, 5938.14it/s]

WV



100%|██████████| 18/18 [00:00<00:00, 6183.76it/s]

WY





In [45]:
state_cv_scores = pd.DataFrame([{'State':state, 'CV Score':data[state]['Best Features']['CV score'].iloc[0]} for state in data.keys()])
state_cv_scores.sort_values(['CV Score'], inplace = True)


In [13]:
future_pred_res = []
for state in data.keys():
    top_feat = data[state]['Best Features']['features'].iloc[0]
    X_train = data[state]['X'][top_feat]
    y = data[state]['y']
    X_future = data[state]['X_future'][top_feat]

    future_pred_res.append({'State':state, 
                            'Average Future Prediction '+str(y_forecast)+' Days':np.mean(gridCV.fit(X_train,y).predict(X_future))})

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coord

In [14]:
last_known = []
for state in data.keys():
    last_known.append({'State':state, 
                       'Average Last 7 Days':np.mean(data[state]['y'].iloc[-7:])})


In [15]:
tmp1 = state_cv_scores.merge(pd.DataFrame(future_pred_res))

In [16]:
tmp2 = tmp1.merge(pd.DataFrame(last_known))

In [17]:
tmp2['difference'] = tmp2['Average Future Prediction '+str(y_forecast)+' Days'] - tmp2['Average Last 7 Days']

In [18]:
tmp2['Percent Change'] = np.round(tmp2['difference']/tmp2['Average Last 7 Days']*100)

In [19]:
tmp2.sort_values(['Percent Change'], ascending=False)

Unnamed: 0,State,CV Score,Average Future Prediction 7 Days,Average Last 7 Days,difference,Percent Change
7,AL,-780.356,4701.362208,3030.857143,1670.505065,55.0
5,TN,-883.601,5819.123492,4565.857143,1253.26635,27.0
40,NM,-111.652,1557.030261,1254.571429,302.458832,24.0
41,ND,-98.89,2036.394448,1653.428571,382.965876,23.0
16,PA,-452.375,4016.060643,3370.857143,645.2035,19.0
11,SC,-599.293,2220.721389,1879.714286,341.007104,18.0
19,MA,-371.299,1955.333333,1656.857143,298.47619,18.0
43,HI,-69.446,205.309283,176.571429,28.737854,16.0
35,NE,-208.633,2011.35761,1771.714286,239.643324,14.0
29,CO,-253.472,2707.333333,2384.285714,323.047619,14.0


In [20]:
tmp2.to_csv('./statepreds.csv')

In [44]:
data['DC']['Feature Importance']

Unnamed: 0,feature,importance
0,new_confirmed,0.155078
1,new_confirmed_lag,0.14091
2,total_deceased,0.111904
3,total_deceased_lag,0.081325
4,maximum_temperature_lag,0.07408
5,relative_humidity_lag,0.067401
6,dew_point_lag,0.043795
7,relative_humidity,0.042354
8,minimum_temperature,0.040794
9,maximum_temperature,0.037646
