# Model Selection

In [5]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from yellowbrick.classifier import ConfusionMatrix, confusion_matrix
from yellowbrick.model_selection import ValidationCurve
from yellowbrick.features import RadViz
from imblearn.under_sampling import RandomUnderSampler

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [6]:
filepath = '../data/train_test_split/'

infile = open(filepath + 'X_train_ac.pickle','rb')
X_train = pickle.load(infile)
infile.close()

infile = open(filepath + 'X_test_ac.pickle','rb')
X_test = pickle.load(infile)
infile.close()

infile = open(filepath + 'y_train_ac.pickle','rb')
y_train = pickle.load(infile)
infile.close()

infile = open(filepath + 'y_test_ac.pickle','rb')
y_test = pickle.load(infile)
infile.close()

In [7]:
y_train = np.array(y_train).reshape(-1, ).astype('int')
y_test = np.array(y_test).reshape(-1, ).astype('int')

## Undersample

In [8]:
under = RandomUnderSampler(sampling_strategy='auto')
X_under, y_under = under.fit_resample(X_train, y_train)



## Create results dict

In [9]:
results = {}

## Dummy Model

In [10]:
from sklearn.dummy import DummyClassifier

In [11]:
dc = DummyClassifier(strategy = 'most_frequent')
dc.fit(X_under, y_under)
dc.score(X_under, y_under)

0.3333333333333333

In [12]:
results.update({'Dummy': [dc.score(X_train, y_train), 0]})

## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
lr = LogisticRegression(random_state = 31, max_iter = 1000)
lr.fit(X_under, y_under)
lr.score(X_under, y_under)

0.4245699897152179

In [15]:
results.update({'LogReg': [lr.score(X_under, y_under), 0]})

## Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
dtree = DecisionTreeClassifier(random_state = 31)
dtree.fit(X_under,y_under)
dtree.score(X_under, y_under)

0.9999858140936979

In [18]:
dtree_cv = cross_val_score(dtree, X_train, y_train, cv=5, scoring= 'accuracy')
np.average(dtree_cv)

0.4516660769673019

In [19]:
results.update({'DecisionTree': [dtree.score(X_under, y_under), np.average(dtree_cv)]})

## Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rf = RandomForestClassifier(n_jobs=-1, bootstrap=True, random_state=31)
rf.fit(X_under, y_under);
rf.score(X_under, y_under)

0.9999858140936979

In [22]:
rf_cv = cross_val_score(rf, X_train, y_train, cv=5, scoring= 'accuracy', n_jobs = 8, verbose = 1)
np.average(rf_cv)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed:  2.2min remaining:  3.3min
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:  2.2min finished


0.5429474888845093

In [23]:
results.update({'RandomForest': [rf.score(X_under, y_under), np.average(rf_cv)]})

## Gradient Boosting Classifier

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

In [25]:
gbc = GradientBoostingClassifier()
gbc.fit(X_under, y_under)
gbc.score(X_under, y_under)

0.46578714047593717

In [26]:
gbc_cv = cross_val_score(gbc, X_train, y_train, cv=5, scoring= 'accuracy', n_jobs = 8, verbose = 1)
np.average(gbc_cv)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed: 17.9min remaining: 26.8min
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed: 17.9min finished


0.541126421816495

In [27]:
results.update({'GradientBoosting': [gbc.score(X_under, y_under), np.average(gbc_cv)]})

## Ada Boost

In [28]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [29]:
dc = DecisionTreeClassifier(class_weight='balanced')
ada = AdaBoostClassifier(base_estimator = dc, random_state=31)
ada.fit(X_under, y_under)
ada.score(X_under, y_under)

0.9999858140936979

In [30]:
ada_cv = cross_val_score(ada, X_train, y_train, cv=5, scoring= 'accuracy', n_jobs = 8, verbose = 1)
np.average(ada_cv)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed: 13.4min remaining: 20.1min
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed: 14.0min finished


0.532371375995855

In [31]:
results.update({'Ada-DT': [ada.score(X_under, y_under), np.average(ada_cv)]})

## SVC

In [32]:
from sklearn.svm import SVC

In [33]:
svc = SVC(C=.1, kernel = 'sigmoid', max_iter=1000)
svc.fit(X_under, y_under)
svc.score(X_under, y_under)



0.3377167783806788

In [34]:
svc_cv = cross_val_score(svc, X_train, y_train, cv=5, n_jobs = 8, verbose = 1)
np.average(svc_cv)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed:  6.3min remaining:  9.5min
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed:  6.4min finished


0.3803084991162589

In [35]:
results.update({'SVC': [svc.score(X_under, y_under), np.average(svc_cv)]})

## MLP Classifier

In [36]:
from sklearn.neural_network import MLPClassifier

In [37]:
mlp = MLPClassifier(random_state=31, max_iter=10000, hidden_layer_sizes=(500,))
mlp.fit(X_under, y_under)
mlp.score(X_under, y_under)

0.6543391140901514

In [38]:
mlp_cv = cross_val_score(mlp, X_train, y_train, cv=5, n_jobs = 8, verbose = 1)
mlp_cv

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   5 | elapsed: 32.9min remaining: 49.3min
[Parallel(n_jobs=8)]: Done   5 out of   5 | elapsed: 39.0min finished


array([0.51323208, 0.51605125, 0.52306917, 0.51898461, 0.52451504])

In [44]:
(0.51323208 + 0.51605125 + 0.52306917 + 0.51898461 + 0.52451504) /5

0.51917043

In [39]:
results.update({'MLP': [mlp.score(X_under, y_under), np.average(mlp_cv)]})

## Results

In [40]:
pd.DataFrame(results, columns = results.keys()).T

Unnamed: 0,0,1
Dummy,0.357834,0.0
LogReg,0.42457,0.0
DecisionTree,0.999986,0.451666
RandomForest,0.999986,0.542947
GradientBoosting,0.465787,0.541126
Ada-DT,0.999986,0.532371
SVC,0.337717,0.380308
MLP,0.654339,0.51917
