In [1]:
#imports
import numpy as np

import pandas as pd
from sklearn import dummy, metrics
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import shap

In [2]:
#random seed for repeatability: reused by the undersampling step and passed to every model constructor below
random_seed = 18

In [3]:
#load the four pre-split CSV files (train/test features and labels) into pandas
csv_paths = [
    '../data/X_train.csv',
    '../data/y_train.csv',
    '../data/X_test.csv',
    '../data/y_test.csv',
]

#files are read in list order, one DataFrame per file
X_train, y_train, X_test, y_test = (pd.read_csv(path) for path in csv_paths)

#kept so `filepath` still ends up naming the last file that was read
filepath = csv_paths[-1]

In [4]:
#sanity check: list each training column with its non-null count and dtype
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7354 entries, 0 to 7353
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   DaysSinceSpray              7354 non-null   float64
 1   IsDayAfterSpray_1           7354 non-null   float64
 2   IsNearSprayZone_1           7354 non-null   float64
 3   IsNorthWestZone_1           7354 non-null   float64
 4   IsPeakSeason_1              7354 non-null   float64
 5   MoistureConditions_Optimal  7354 non-null   float64
 6   MoistureConditions_Poor     7354 non-null   float64
 7   NumMosquitos                7354 non-null   float64
 8   PrecipTotal_lag1            7354 non-null   float64
 9   PrecipTotal_lag23           7354 non-null   float64
 10  PrecipTotal_lag27           7354 non-null   float64
 11  PrecipTotal_lag28_mean      7354 non-null   float64
 12  RelHumidity_lag1            7354 non-null   float64
 13  RelHumidity_lag11           7354 

## Undersample the Training Data

In [5]:
#the count of the rarest label value decides how many negative rows are kept
undersample_size = y_train.value_counts().min()

#put features and label side by side so rows can be filtered on the label
us_data = pd.concat([X_train, y_train], axis=1)
label_col = us_data['WnvPresent']

#keep every positive row; draw a seeded random subset of the negative rows
pos_us_data = us_data[label_col == 1]
neg_us_data = us_data[label_col == 0].sample(n=undersample_size, random_state=random_seed)

#stack negatives first, then positives
us_data = pd.concat([neg_us_data, pos_us_data], axis=0)

#split back into features and a single-column label frame
X_train_us = us_data.drop(columns=['WnvPresent'])
y_train_us = us_data.loc[:, ['WnvPresent']]


## Results Table

In [6]:
#empty results table; each model section adds one row with its train and test AUC
result_columns = ['Model','TrainAUC','TestAUC']
df_test_results = pd.DataFrame(columns=result_columns)

## Baseline Model

In [7]:
#baseline classifier for comparison, fitted on the undersampled training rows
train_labels_us = np.ravel(y_train_us)
dummy_model = DummyClassifier(strategy='uniform', random_state=random_seed)
dummy_model.fit(X_train_us, train_labels_us)

#predict on the full (not undersampled) training set and on the test set
#NOTE(review): predict() returns class labels, not scores, so the AUC computed from these downstream reflects one decision threshold
y_train_pred, y_test_pred = (dummy_model.predict(features) for features in (X_train, X_test))

In [8]:
#ROC AUC on the full training set
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_train, y_score=y_train_pred, pos_label=1)
train_auc = metrics.auc(x=fpr, y=tpr)

#ROC AUC on the test set
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=y_test_pred, pos_label=1)
test_auc = metrics.auc(x=fpr, y=tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.5051133085072843
Test AUC:  0.5143723813291942


In [9]:
#record the baseline scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
dummy_row = pd.DataFrame([{'Model':'Dummy','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, dummy_row], ignore_index=True)

## Logistic Regression

In [10]:
#logistic regression with default settings, trained on the undersampled rows
lr_model = LogisticRegression(random_state=random_seed)
lr_model.fit(X_train_us, y_train_us.to_numpy().ravel())

#evaluate on the full training set and the test set
#NOTE(review): predict() returns class labels; predict_proba would give scores better suited to a ROC curve
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

In [11]:
#compute the area under the ROC curve for each split in turn (train first, then test)
auc_by_split = {}
for split_name, y_true, y_pred in (('train', y_train, y_train_pred), ('test', y_test, y_test_pred)):
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
    auc_by_split[split_name] = metrics.auc(fpr, tpr)

train_auc = auc_by_split['train']
test_auc = auc_by_split['test']

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.7838390004937448
Test AUC:  0.7571801036816102


In [12]:
#record the logistic regression scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
lr_row = pd.DataFrame([{'Model':'Logistic Regression','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, lr_row], ignore_index=True)

## Support Vector Machine

In [13]:
#support vector classifier with default settings, trained on the undersampled rows
svm_model = SVC(random_state=random_seed)
svm_model.fit(X=X_train_us, y=np.ravel(y_train_us))

#predict on the full training set first, then on the test set
#NOTE(review): predict() returns class labels; decision_function would give scores better suited to a ROC curve
y_train_pred, y_test_pred = map(svm_model.predict, (X_train, X_test))

In [14]:
#build both ROC curves first, then integrate each one
train_curve = metrics.roc_curve(y_train, y_train_pred, pos_label=1)
test_curve = metrics.roc_curve(y_test, y_test_pred, pos_label=1)

train_auc = metrics.auc(train_curve[0], train_curve[1])

#fpr/tpr/thresholds end up holding the test-set curve
fpr, tpr, thresholds = test_curve
test_auc = metrics.auc(fpr, tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.7844427969756637
Test AUC:  0.7368739284373699


In [15]:
#record the SVM scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
svm_row = pd.DataFrame([{'Model':'SVM','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, svm_row], ignore_index=True)

## Random Forest 

In [16]:
#random forest with default settings apart from the shared seed
rf_settings = {'random_state': random_seed}
rf_model = RandomForestClassifier(**rf_settings)

#train on the undersampled rows
rf_labels = np.ravel(y_train_us)
rf_model.fit(X_train_us, rf_labels)

#evaluate on the full training set and the test set
#NOTE(review): predict() returns class labels; predict_proba would give scores better suited to a ROC curve
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

In [17]:
#ROC AUC for the training predictions
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label=1)
train_auc = metrics.auc(fpr, tpr)

#ROC AUC for the test predictions (fpr/tpr/thresholds are overwritten here)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label=1)
test_auc = metrics.auc(fpr, tpr)

#report both numbers, training first
for caption, value in (('Training AUC: ', train_auc), ('Test AUC: ', test_auc)):
    print(caption, value)

Training AUC:  0.863111827272565
Test AUC:  0.7511367440728003


In [18]:
#record the random forest scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
rf_row = pd.DataFrame([{'Model':'Random Forest','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, rf_row], ignore_index=True)

## Gradient Boosting

In [19]:
#gradient boosting with default settings, trained on the undersampled rows
gb_model = GradientBoostingClassifier(random_state=random_seed)
gb_model.fit(X_train_us, y_train_us.values.ravel())

#evaluate on the full training set and the test set
#NOTE(review): predict() returns class labels; predict_proba would give scores better suited to a ROC curve
y_train_pred, y_test_pred = (gb_model.predict(features) for features in (X_train, X_test))

In [20]:
#ROC AUC on the full training set
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_train, y_score=y_train_pred, pos_label=1)
train_auc = metrics.auc(x=fpr, y=tpr)

#ROC AUC on the test set
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=y_test_pred, pos_label=1)
test_auc = metrics.auc(x=fpr, y=tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.832533476499527
Test AUC:  0.768597254770673


In [21]:
#record the gradient boosting scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
gb_row = pd.DataFrame([{'Model':'Gradient Boosting','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, gb_row], ignore_index=True)

## XG Boost

In [22]:
#XGBoost classifier; constructor arguments are unchanged from the original cell
xgb_model = XGBClassifier(
    use_label_encoder=False,
    seed=random_seed,
)

#train on the undersampled rows
xgb_labels = np.ravel(y_train_us)
xgb_model.fit(X_train_us, xgb_labels)

#evaluate on the full training set and the test set
#NOTE(review): predict() returns class labels; predict_proba would give scores better suited to a ROC curve
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)



In [23]:
#compute the area under the ROC curve for each split in turn (train first, then test)
auc_by_split = {}
for split_name, y_true, y_pred in (('train', y_train, y_train_pred), ('test', y_test, y_test_pred)):
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
    auc_by_split[split_name] = metrics.auc(fpr, tpr)

train_auc = auc_by_split['train']
test_auc = auc_by_split['test']

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.8536968406274723
Test AUC:  0.7482738330746366


In [24]:
#record the XGBoost scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
xgb_row = pd.DataFrame([{'Model':'XG Boost','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, xgb_row], ignore_index=True)

## LightGBM

In [25]:
#LightGBM classifier with default settings apart from the shared seed
lgb_model = LGBMClassifier(random_state=random_seed)

#train on the undersampled rows
lgb_model.fit(X=X_train_us, y=np.ravel(y_train_us))

#predict on the full training set first, then on the test set
#NOTE(review): predict() returns class labels; predict_proba would give scores better suited to a ROC curve
y_train_pred, y_test_pred = map(lgb_model.predict, (X_train, X_test))

In [26]:
#build both ROC curves first, then integrate each one
train_curve = metrics.roc_curve(y_train, y_train_pred, pos_label=1)
test_curve = metrics.roc_curve(y_test, y_test_pred, pos_label=1)

train_auc = metrics.auc(train_curve[0], train_curve[1])

#fpr/tpr/thresholds end up holding the test-set curve
fpr, tpr, thresholds = test_curve
test_auc = metrics.auc(fpr, tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.855350960423074
Test AUC:  0.7549867608120034


In [27]:
#record the LightGBM scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
lgb_row = pd.DataFrame([{'Model':'LightGBM','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, lgb_row], ignore_index=True)

## AdaBoost

In [28]:
#AdaBoost with default settings, trained on the undersampled rows
ada_labels = np.ravel(y_train_us)
ada_model = AdaBoostClassifier(random_state=random_seed)
ada_model.fit(X_train_us, ada_labels)

#evaluate on the full training set and the test set
#NOTE(review): predict() returns class labels; predict_proba would give scores better suited to a ROC curve
y_train_pred, y_test_pred = (ada_model.predict(features) for features in (X_train, X_test))

In [29]:
#ROC AUC for the training predictions
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label=1)
train_auc = metrics.auc(fpr, tpr)

#ROC AUC for the test predictions (fpr/tpr/thresholds are overwritten here)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label=1)
test_auc = metrics.auc(fpr, tpr)

#report both numbers, training first
for caption, value in (('Training AUC: ', train_auc), ('Test AUC: ', test_auc)):
    print(caption, value)

Training AUC:  0.8073591785988352
Test AUC:  0.7511022511692079


In [30]:
#record the AdaBoost scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
ada_row = pd.DataFrame([{'Model':'AdaBoost','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, ada_row], ignore_index=True)

## ExtraTrees Classifier

In [31]:
#extra-trees ensemble with default settings apart from the shared seed
et_settings = {'random_state': random_seed}
et_model = ExtraTreesClassifier(**et_settings)

#train on the undersampled rows
et_model.fit(X_train_us, y_train_us.to_numpy().ravel())

#evaluate on the full training set and the test set
#NOTE(review): predict() returns class labels; predict_proba would give scores better suited to a ROC curve
y_train_pred = et_model.predict(X_train)
y_test_pred = et_model.predict(X_test)

In [32]:
#ROC AUC on the full training set
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_train, y_score=y_train_pred, pos_label=1)
train_auc = metrics.auc(x=fpr, y=tpr)

#ROC AUC on the test set
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=y_test_pred, pos_label=1)
test_auc = metrics.auc(x=fpr, y=tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.838504146267467
Test AUC:  0.7361526209534244


In [33]:
#record the extra-trees scores as a new row of the results table
#DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it instead
et_row = pd.DataFrame([{'Model':'Extra Trees','TrainAUC':train_auc,'TestAUC':test_auc}])
df_test_results = pd.concat([df_test_results, et_row], ignore_index=True)

## Hyperparameter Tuning

In [34]:
#rank the models by test AUC, best first; returns a sorted copy for display and leaves df_test_results unchanged
df_test_results.sort_values(by='TestAUC', ascending=False)

Unnamed: 0,Model,TrainAUC,TestAUC
4,Gradient Boosting,0.832533,0.768597
1,Logistic Regression,0.783839,0.75718
6,LightGBM,0.855351,0.754987
3,Random Forest,0.863112,0.751137
7,AdaBoost,0.807359,0.751102
5,XG Boost,0.853697,0.748274
2,SVM,0.784443,0.736874
8,Extra Trees,0.838504,0.736153
0,Dummy,0.505113,0.514372


In [35]:
#first tuning pass: search learning rate and tree depth together
grid_params = dict(
    learning_rate=[0.001, 0.01, 0.1, 1],
    max_depth=[3, 6, 8, 10],
)

#start from a fresh, seeded gradient boosting estimator
gb_model = GradientBoostingClassifier(random_state=random_seed)

#exhaustive search scored by ROC AUC with 15-fold cross-validation, using all cores
gs_gb_model = GridSearchCV(
    estimator=gb_model,
    param_grid=grid_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=15,
)
gs_gb_model.fit(X_train_us, np.ravel(y_train_us))

#from here on gb_model is the best estimator found by the search
gb_model = gs_gb_model.best_estimator_

#evaluate the tuned model on the full training set and the test set
y_train_pred, y_test_pred = (gb_model.predict(features) for features in (X_train, X_test))

In [36]:
#compute the area under the ROC curve for each split in turn (train first, then test)
auc_by_split = {}
for split_name, y_true, y_pred in (('train', y_train, y_train_pred), ('test', y_test, y_test_pred)):
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
    auc_by_split[split_name] = metrics.auc(fpr, tpr)

train_auc = auc_by_split['train']
test_auc = auc_by_split['test']

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.7992135030308798
Test AUC:  0.7739537997991296


In [37]:
#show the learning_rate / max_depth combination selected by the first grid search
gs_gb_model.best_params_

{'learning_rate': 0.01, 'max_depth': 3}

In [38]:
#second tuning pass: hold learning rate and depth at single values, search the sampling options
fixed_params = {
    'learning_rate': [0.01],
    'max_depth': [3],
}
searched_params = {
    'subsample': [0.25, 0.5, 0.75, 1],
    'max_features': ['log2', 1],
}
grid_params = {**fixed_params, **searched_params}

#gb_model here is whatever an earlier cell left bound to that name
gs_gb_model = GridSearchCV(
    estimator=gb_model,
    param_grid=grid_params,
    scoring='roc_auc',
    n_jobs=-1,
    cv=15,
)
gs_gb_model.fit(X_train_us, np.ravel(y_train_us))

#replace gb_model with the best estimator from this search
gb_model = gs_gb_model.best_estimator_

#evaluate on the full training set and the test set
y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

In [39]:
#show the parameter combination selected by the most recent grid search
gs_gb_model.best_params_

{'learning_rate': 0.01,
 'max_depth': 3,
 'max_features': 'log2',
 'subsample': 0.25}

In [40]:
#build both ROC curves first, then integrate each one
train_curve = metrics.roc_curve(y_train, y_train_pred, pos_label=1)
test_curve = metrics.roc_curve(y_test, y_test_pred, pos_label=1)

train_auc = metrics.auc(train_curve[0], train_curve[1])

#fpr/tpr/thresholds end up holding the test-set curve
fpr, tpr, thresholds = test_curve
test_auc = metrics.auc(fpr, tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.7915429825761586
Test AUC:  0.7820920960525916


## SHAP Values

In [41]:
#NOTE: the SHAP computation below sits inside a bare string literal, so none of it executes;
#the cell only evaluates to that string. Remove the triple quotes to actually run it.
'''
X_test_sample = shap.sample(X_test,400)
explainer = shap.KernelExplainer(gb_model.predict, X_test_sample)
shap_values = explainer.shap_values(X_test_sample)
'''

'\nX_test_sample = shap.sample(X_test,400)\nexplainer = shap.KernelExplainer(gb_model.predict, X_test_sample)\nshap_values = explainer.shap_values(X_test_sample)\n'

In [42]:
#disabled: shap_values and X_test_sample are only assigned by the KernelExplainer code, which is itself not executed
#shap.summary_plot(shap_values, X_test_sample)