In [1]:
#imports
import numpy as np

import pandas as pd
from sklearn import dummy, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
#random seed for repeatability
random_seed = 18

In [3]:
#read the data into pandas

filepath = '../data/X_train.csv'
X_train = pd.read_csv(filepath)

filepath = '../data/y_train.csv'
y_train = pd.read_csv(filepath)

filepath = '../data/X_test.csv'
X_test = pd.read_csv(filepath)

filepath = '../data/y_test.csv'
y_test = pd.read_csv(filepath)

In [4]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7354 entries, 0 to 7353
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PrecipTotal_lag23           7354 non-null   float64
 1   Tavg_lag7                   7354 non-null   float64
 2   Tavg_lag18                  7354 non-null   float64
 3   Wind_AvgSpeed_lag23         7354 non-null   float64
 4   RelHumidity_lag4            7354 non-null   float64
 5   Wind_AvgSpeed_lag18         7354 non-null   float64
 6   PrecipTotal_lag28_mean      7354 non-null   float64
 7   RelHumidity_lag3            7354 non-null   float64
 8   PrecipTotal_lag1            7354 non-null   float64
 9   RelHumidity_lag11           7354 non-null   float64
 10  DaylightMinutes             7354 non-null   float64
 11  Wind_AvgSpeed_lag5          7354 non-null   float64
 12  Wind_AvgSpeed_lag21         7354 non-null   float64
 13  PrecipTotal_lag27           7354 

## Undersample the Training Data

In [5]:
undersample_size = y_train.value_counts().min()

us_data = pd.concat([X_train,y_train],axis=1)

pos_us_data = us_data[us_data['WnvPresent']==1]
neg_us_data = us_data[us_data['WnvPresent']==0].sample(undersample_size, random_state=random_seed)

us_data = pd.concat([neg_us_data, pos_us_data], axis=0)

X_train_us = us_data.drop(columns='WnvPresent')
y_train_us = us_data[['WnvPresent']]


## Logistic Regression

In [6]:
lr_model = LogisticRegression(random_state = random_seed)
lr_model.fit(X_train_us,np.ravel(y_train_us))

y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

In [7]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.7926081777243713
Test AUC:  0.7564932891012569


## Support Vector Machine

In [8]:
svm_model = SVC(random_state = random_seed)
svm_model.fit(X_train_us,np.ravel(y_train_us))

y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

In [9]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.7874825999535999
Test AUC:  0.7441580180783395


## Random Forest 

In [10]:
rf_model = RandomForestClassifier(random_state = random_seed)
rf_model.fit(X_train_us,np.ravel(y_train_us))

y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

In [11]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.8593656121544528
Test AUC:  0.7615495429690275


## Gradient Boosting

In [12]:
gb_model = GradientBoostingClassifier(random_state = random_seed)
gb_model.fit(X_train_us,np.ravel(y_train_us))

y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

In [13]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.8283072729219586
Test AUC:  0.7741901776384535


## XG Boost

In [14]:
xgb_model = XGBClassifier(use_label_encoder=False, seed=random_seed)
xgb_model.fit(X_train_us,np.ravel(y_train_us))

y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)



In [15]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.855282921780099
Test AUC:  0.750299783912104


## LightGBM

In [16]:
lgb_model = LGBMClassifier(random_state = random_seed)
lgb_model.fit(X_train_us,np.ravel(y_train_us))

y_train_pred = lgb_model.predict(X_train)
y_test_pred = lgb_model.predict(X_test)

In [17]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.8563667067214744
Test AUC:  0.7492954317192683


## Model Tuning

In [18]:
grid_params = {
    'learning_rate':[0.01,0.05,0.1,0.2,0.5,0.75,1],
    'n_estimators':[10, 20, 50, 100, 200, 500],
    'max_features':['sqrt','log2',None,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
}

gb_model = GradientBoostingClassifier(random_state = random_seed)
rs_gb_model = GridSearchCV(gb_model,grid_params, scoring='roc_auc',n_jobs=-1)
rs_gb_model.fit(X_train_us,np.ravel(y_train_us))

gb_model = rs_gb_model.best_estimator_

y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

In [19]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.8034497450967562
Test AUC:  0.772250459059967
