In [1]:
import numpy as np, pandas as pd
import warnings
import psutil, os
warnings.filterwarnings('ignore')

X_train = pd.read_pickle('X_train_small.pkl')
y_train = pd.read_pickle('y_train_small.pkl')
X_test = pd.read_pickle('X_test_small.pkl')
y_test = pd.read_pickle('y_test_small.pkl')

In [2]:
# Get dummies
X_train = pd.get_dummies(X_train, prefix_sep='_', drop_first=True)
X_test = pd.get_dummies(X_test, prefix_sep='_', drop_first=True)

In [3]:
X_train.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_time,confidence,bright_t31,frp,type,FIRE_YEAR,MONTH,WEEK,DAY,satellite_Terra,daynight_N
9,19.3454,-155.046906,368.200012,1.0,1.0,849,100,309.299988,110.300003,2,2001,1,1,1,1,1
19,19.3552,-155.055206,366.200012,1.3,1.1,2100,100,314.799988,137.899994,2,2001,1,1,1,1,0
29,31.2537,-84.508698,310.399994,1.0,1.0,1643,70,278.5,13.6,0,2001,1,1,2,1,0
39,30.9461,-84.917099,304.399994,1.0,1.0,1643,59,280.899994,8.6,0,2001,1,1,2,1,0
49,26.872299,-81.117996,342.600006,1.6,1.3,1644,93,290.5,96.800003,0,2001,1,1,2,1,0


In [16]:
# imports
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from statistics import mean

# defaults
tscv = TimeSeriesSplit(n_splits=3)
scoring = ['accuracy', 'f1', 'roc_auc']

In [19]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)
scores = cross_validate(lr, X_test, y_test, scoring=scoring, cv=tscv)

lr_accuracy = scores["test_accuracy"].mean()
lr_f1 = scores["test_f1"].mean()
lr_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", lr_accuracy)
print("Average Test F1: ", lr_f1)
print("Average Test ROC AUC: ", lr_roc_auc)

Average Test Accuracy:  0.5297368830764441
Average Test F1:  0.15222430648298016
Average Test ROC AUC:  0.5586340848308216


In [20]:
# Naive Bayes Model
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
scores = cross_validate(nb, X_test, y_test, scoring=scoring, cv=tscv)

nb_accuracy = scores["test_accuracy"].mean()
nb_f1 = scores["test_f1"].mean()
nb_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", nb_accuracy)
print("Average Test F1: ", nb_f1)
print("Average Test ROC AUC: ", nb_roc_auc)

Average Test Accuracy:  0.5144792153199439
Average Test F1:  0.3232249711931816
Average Test ROC AUC:  0.5970993402364742


In [21]:
# SGD Model
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss="modified_huber")
sgd.fit(X_train, y_train)
scores = cross_validate(sgd, X_test, y_test, scoring=scoring, cv=tscv)

sgd_accuracy = scores["test_accuracy"].mean()
sgd_f1 = scores["test_f1"].mean()
sgd_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", sgd_accuracy)
print("Average Test F1: ", sgd_f1)
print("Average Test ROC AUC: ", sgd_roc_auc)

Average Test Accuracy:  0.47929316518760706
Average Test F1:  0.13447675863783246
Average Test ROC AUC:  0.5327763942500136


In [22]:
# KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
scores = cross_validate(knn, X_test, y_test, scoring=scoring, cv=tscv)

knn_accuracy = scores["test_accuracy"].mean()
knn_f1 = scores["test_f1"].mean()
knn_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", knn_accuracy)
print("Average Test F1: ", knn_f1)
print("Average Test ROC AUC: ", knn_roc_auc)

Average Test Accuracy:  0.6420675696714931
Average Test F1:  0.3514979807053045
Average Test ROC AUC:  0.5885315132962211


In [23]:
# Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
scores = cross_validate(dt, X_test, y_test, scoring=scoring, cv=tscv)

dt_accuracy = scores["test_accuracy"].mean()
dt_f1 = scores["test_f1"].mean()
dt_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", dt_accuracy)
print("Average Test F1: ", dt_f1)
print("Average Test ROC AUC: ", dt_roc_auc)

Average Test Accuracy:  0.6269655924023042
Average Test F1:  0.40366193621216695
Average Test ROC AUC:  0.5660844225882432


In [24]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
scores = cross_validate(rfc, X_test, y_test, scoring=scoring, cv=tscv)

rfc_accuracy = scores["test_accuracy"].mean()
rfc_f1 = scores["test_f1"].mean()
rfc_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", rfc_accuracy)
print("Average Test F1: ", rfc_f1)
print("Average Test ROC AUC: ", rfc_roc_auc)

Average Test Accuracy:  0.6700918573875136
Average Test F1:  0.33285664115339425
Average Test ROC AUC:  0.6525407086101808


In [25]:
# LDA Model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
scores = cross_validate(lda, X_test, y_test, scoring=scoring, cv=tscv)

lda_accuracy = scores["test_accuracy"].mean()
lda_f1 = scores["test_f1"].mean()
lda_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", lda_accuracy)
print("Average Test F1: ", lda_f1)
print("Average Test ROC AUC: ", lda_roc_auc)

Average Test Accuracy:  0.5336291452592247
Average Test F1:  0.15258063694071797
Average Test ROC AUC:  0.5741491849914188


In [26]:
# QDA Model
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
scores = cross_validate(qda, X_test, y_test, scoring=scoring, cv=tscv)

qda_accuracy = scores["test_accuracy"].mean()
qda_f1 = scores["test_f1"].mean()
qda_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", qda_accuracy)
print("Average Test F1: ", qda_f1)
print("Average Test ROC AUC: ", qda_roc_auc)

Average Test Accuracy:  0.44457418651720376
Average Test F1:  0.48703796900845053
Average Test ROC AUC:  0.48581603894093545


In [29]:
# XGB Model
from xgboost import XGBClassifier 

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
scores = cross_validate(xgb, X_test, y_test, scoring=scoring, cv=tscv)

xgb_accuracy = scores["test_accuracy"].mean()
xgb_f1 = scores["test_f1"].mean()
xgb_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", xgb_accuracy)
print("Average Test F1: ", xgb_f1)
print("Average Test ROC AUC: ", xgb_roc_auc)

Average Test Accuracy:  0.6619959520473299
Average Test F1:  0.3735380440799003
Average Test ROC AUC:  0.6214343299231487


In [33]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
scores = cross_validate(gb, X_test, y_test, scoring=scoring, cv=tscv)

gb_accuracy = scores["test_accuracy"].mean()
gb_f1 = scores["test_f1"].mean()
gb_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", gb_accuracy)
print("Average Test F1: ", gb_f1)
print("Average Test ROC AUC: ", gb_roc_auc)

Average Test Accuracy:  0.6628522497275416
Average Test F1:  0.3689051738507024
Average Test ROC AUC:  0.6244599017832141


In [34]:
from sklearn.ensemble import AdaBoostClassifier

ab = AdaBoostClassifier()
ab.fit(X_train, y_train)
scores = cross_validate(ab, X_test, y_test, scoring=scoring, cv=tscv)

ab_accuracy = scores["test_accuracy"].mean()
ab_f1 = scores["test_f1"].mean()
ab_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", ab_accuracy)
print("Average Test F1: ", ab_f1)
print("Average Test ROC AUC: ", ab_roc_auc)

Average Test Accuracy:  0.6227619492449011
Average Test F1:  0.21930469355629467
Average Test ROC AUC:  0.6388755391994043


In [36]:
from sklearn.ensemble import BaggingClassifier

bag = BaggingClassifier()
bag.fit(X_train, y_train)
scores = cross_validate(bag, X_test, y_test, scoring=scoring, cv=tscv)

bag_accuracy = scores["test_accuracy"].mean()
bag_f1 = scores["test_f1"].mean()
bag_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", bag_accuracy)
print("Average Test F1: ", bag_f1)
print("Average Test ROC AUC: ", bag_roc_auc)

Average Test Accuracy:  0.6606725829051845
Average Test F1:  0.4442132511287414
Average Test ROC AUC:  0.6456626700655507


In [38]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)
scores = cross_validate(etc, X_test, y_test, scoring=scoring, cv=tscv)

etc_accuracy = scores["test_accuracy"].mean()
etc_f1 = scores["test_f1"].mean()
etc_roc_auc = scores["test_roc_auc"].mean()

print("Average Test Accuracy: ", etc_accuracy)
print("Average Test F1: ", etc_f1)
print("Average Test ROC AUC: ", etc_roc_auc)

Average Test Accuracy:  0.6712595360423478
Average Test F1:  0.3692414360072885
Average Test ROC AUC:  0.6555576633448229


In [40]:
models_df = pd.DataFrame({
    'Model':["Logistic Regression", "Naive Bayes", "Stochastic Gradient Descent", "K-Nearest Neighbors", "Decision Tree", "Random Forest", "Linear Discriminant Analysis", "Quadratic Discriminant Analysis", "XGBoost Classifier", "Gradient Boosting Classifier", "Ada Boost Classifier", "Bagging Classifier", "Extra Trees Classifier"],
    'Accuracy':[lr_accuracy, nb_accuracy, sgd_accuracy, knn_accuracy, dt_accuracy, rfc_accuracy, lda_accuracy, qda_accuracy, xgb_accuracy, gb_accuracy, ab_accuracy, bag_accuracy, etc_accuracy],
    'F1':[lr_f1, nb_f1, sgd_f1, knn_f1, dt_f1, rfc_f1, lda_f1, qda_f1, xgb_f1, gb_f1, ab_f1, bag_f1, etc_f1],
    'ROC_AUC':[lr_roc_auc, nb_roc_auc, sgd_roc_auc, knn_roc_auc, dt_roc_auc, rfc_roc_auc, lda_roc_auc, qda_roc_auc, xgb_roc_auc, gb_roc_auc, ab_roc_auc, bag_roc_auc, etc_roc_auc],
}, columns = ["Model", "Accuracy", "F1", "ROC_AUC"])

models_df.sort_values(by='F1', ascending=False)

Unnamed: 0,Model,Accuracy,F1,ROC_AUC
7,Quadratic Discriminant Analysis,0.444574,0.487038,0.485816
11,Bagging Classifier,0.660673,0.444213,0.645663
4,Decision Tree,0.626966,0.403662,0.566084
8,XGBoost Classifier,0.661996,0.373538,0.621434
12,Extra Trees Classifier,0.67126,0.369241,0.655558
9,Gradient Boosting Classifier,0.662852,0.368905,0.62446
3,K-Nearest Neighbors,0.642068,0.351498,0.588532
5,Random Forest,0.670092,0.332857,0.652541
1,Naive Bayes,0.514479,0.323225,0.597099
10,Ada Boost Classifier,0.622762,0.219305,0.638876
