In [1]:
#imports
import numpy as np

import pandas as pd
from sklearn import dummy, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
#read the data into pandas

filepath = '../data/X_train.csv'
X_train = pd.read_csv(filepath)

filepath = '../data/y_train.csv'
y_train = pd.read_csv(filepath)

filepath = '../data/X_test.csv'
X_test = pd.read_csv(filepath)

filepath = '../data/y_test.csv'
y_test = pd.read_csv(filepath)

## Undersample the Training Data

In [3]:
undersample_size = y_train.value_counts().min()

us_data = pd.concat([X_train,y_train],axis=1)

pos_us_data = us_data[us_data['WnvPresent']==1]
neg_us_data = us_data[us_data['WnvPresent']==0].sample(undersample_size)

us_data = pd.concat([neg_us_data, pos_us_data], axis=0)

X_train_us = us_data.drop(columns='WnvPresent')
y_train_us = us_data[['WnvPresent']]


## Logistic Regression

Logistic Regression Hyperparameters to tune:
* penalty
* C
* max_iter

In [4]:
grid_params = {
    'penalty':['none','l1','l2','elasticnet'],
    'C':[0.01, 0.1, 1, 10, 100, 1000],
    'max_iter':[50,100,500,1000]
}

lr_model = LogisticRegression()
rs_lr_model = RandomizedSearchCV(lr_model,grid_params)
rs_lr_model.fit(X_train_us,np.ravel(y_train_us))

lr_model = rs_lr_model.best_estimator_

y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

Traceback (most recent call last):
  File "F:\Anaconda\envs\SpringBoardMain\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "F:\Anaconda\envs\SpringBoardMain\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "F:\Anaconda\envs\SpringBoardMain\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "F:\Anaconda\envs\SpringBoardMain\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "F:\Anaconda\envs\SpringBoardMain\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    s

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [5]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.7826834589507623
Test AUC:  0.7786752696026215


## Support Vector Machine

SVM Hyperparameters to tune:
* C
* kernel
* gamma

In [6]:
grid_params = {
    'C':[0.01, 0.1, 1, 10, 100, 1000],
    'kernel':['rbf','poly'],
    'gamma':['scale','auto',1, 0.1, 0.01, 0.001,]
}

svm_model = SVC()
rs_svm_model = RandomizedSearchCV(svm_model,grid_params, cv=10)
rs_svm_model.fit(X_train_us,np.ravel(y_train_us))

svm_model = rs_svm_model.best_estimator_

y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)



In [7]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.7629838551364343
Test AUC:  0.782380213247304


## Random Forest 

Random Forest Hyperparameters to tune:
* n_estimators
* criterion
* max_features

In [8]:
grid_params = {
    'n_estimators':[10, 20, 50, 100, 200, 500],
    'criterion': ['gini','entropy'],
    'max_features':['sqrt','log2',None,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
}

rf_model = RandomForestClassifier()
rs_rf_model = RandomizedSearchCV(rf_model,grid_params, scoring='roc_auc')
rs_rf_model.fit(X_train_us,np.ravel(y_train_us))

rf_model = rs_rf_model.best_estimator_

y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

In [9]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.885816657049547
Test AUC:  0.7814519483418044


## Gradient Boosting

Gradient Boosting Hyperparameters to tune:
* learning_rate
* n_estimators
* max_features

In [10]:
grid_params = {
    'learning_rate':[0.01,0.05,0.1,0.2,0.5,0.75,1],
    'n_estimators':[10, 20, 50, 100, 200, 500],
    'max_features':['sqrt','log2',None,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
}

gb_model = GradientBoostingClassifier()
rs_gb_model = RandomizedSearchCV(gb_model,grid_params, scoring='roc_auc')
rs_gb_model.fit(X_train_us,np.ravel(y_train_us))

gb_model = rs_gb_model.best_estimator_

y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

In [11]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred, pos_label = 1)
train_auc = metrics.auc(fpr,tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label = 1)
test_auc = metrics.auc(fpr,tpr)

print('Training AUC: ', train_auc)
print('Test AUC: ', test_auc)

Training AUC:  0.8280436696549139
Test AUC:  0.7869931318541965
