# Modeling data

Lets select the best model among logistic regression, svm, random forest, gbm & neural network.

Will be using 5 fold cross validation in the model training process and Grid search for the model selection.

In [30]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score

from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

#### Load feature engineered train, test datasets

In [31]:
x_train = pd.read_csv('data/train_features.csv')
x_test = pd.read_csv('data/test_features.csv')

y_train = pd.read_csv('data/train_labels.csv')
y_test = pd.read_csv('data/test_labels.csv')

In [32]:
# Method to evaluate cross validation results
def print_results(results):

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('Acc: {} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))
        
    print('\nBest Params: {},'.format(results.best_params_), 'Accuracy: {}'.format(results.best_score_))
        
# Method to get f1 score of the best model
def best_model_f1_score(results):
    y_pred = results.best_estimator_.predict(x_test)
    # return f1 score
    return f1_score(y_test, y_pred)

### Logistic Regression

In [33]:
lr = LogisticRegression()
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

lr_cv = GridSearchCV(lr, parameters, cv=5)
lr_cv.fit(x_train, y_train.values.ravel())

# Find best best model
print_results(lr_cv)

Acc: 0.717 (+/-0.12) for {'C': 0.001}
Acc: 0.731 (+/-0.11) for {'C': 0.01}
Acc: 0.748 (+/-0.106) for {'C': 0.1}
Acc: 0.752 (+/-0.097) for {'C': 1}
Acc: 0.752 (+/-0.096) for {'C': 10}
Acc: 0.752 (+/-0.097) for {'C': 100}
Acc: 0.752 (+/-0.097) for {'C': 1000}

Best Params: {'C': 10}, Accuracy: 0.752422480620155


##### F1 score gives an idea about both precision and recall. it is a better metric when there are imbalanced classes.

In [34]:
print("lr:", best_model_f1_score(lr_cv))

lr: 0.45484581497797355


### Support Vector Machine

In [35]:
svc = SVC()
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10]
}

svm_cv = GridSearchCV(svc, parameters, cv=5)
svm_cv.fit(x_train, y_train.values.ravel())

print_results(svm_cv)

Acc: 0.749 (+/-0.103) for {'C': 0.1, 'kernel': 'linear'}
Acc: 0.77 (+/-0.124) for {'C': 0.1, 'kernel': 'rbf'}
Acc: 0.751 (+/-0.098) for {'C': 1, 'kernel': 'linear'}
Acc: 0.807 (+/-0.1) for {'C': 1, 'kernel': 'rbf'}
Acc: 0.752 (+/-0.098) for {'C': 10, 'kernel': 'linear'}
Acc: 0.822 (+/-0.086) for {'C': 10, 'kernel': 'rbf'}

Best Params: {'C': 10, 'kernel': 'rbf'}, Accuracy: 0.8220930232558139


In [36]:
print("svm:", best_model_f1_score(svm_cv))

svm: 0.5703794369645043


### Random Forest

In [37]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

rf_cv = GridSearchCV(rf, parameters, cv=5)
rf_cv.fit(x_train, y_train.values.ravel())

print_results(rf_cv)

Acc: 0.742 (+/-0.059) for {'max_depth': 2, 'n_estimators': 5}
Acc: 0.772 (+/-0.096) for {'max_depth': 2, 'n_estimators': 50}
Acc: 0.761 (+/-0.1) for {'max_depth': 2, 'n_estimators': 250}
Acc: 0.778 (+/-0.076) for {'max_depth': 4, 'n_estimators': 5}
Acc: 0.795 (+/-0.084) for {'max_depth': 4, 'n_estimators': 50}
Acc: 0.798 (+/-0.096) for {'max_depth': 4, 'n_estimators': 250}
Acc: 0.815 (+/-0.076) for {'max_depth': 8, 'n_estimators': 5}
Acc: 0.825 (+/-0.075) for {'max_depth': 8, 'n_estimators': 50}
Acc: 0.825 (+/-0.071) for {'max_depth': 8, 'n_estimators': 250}
Acc: 0.82 (+/-0.066) for {'max_depth': 16, 'n_estimators': 5}
Acc: 0.846 (+/-0.08) for {'max_depth': 16, 'n_estimators': 50}
Acc: 0.848 (+/-0.083) for {'max_depth': 16, 'n_estimators': 250}
Acc: 0.818 (+/-0.076) for {'max_depth': 32, 'n_estimators': 5}
Acc: 0.848 (+/-0.08) for {'max_depth': 32, 'n_estimators': 50}
Acc: 0.85 (+/-0.076) for {'max_depth': 32, 'n_estimators': 250}
Acc: 0.823 (+/-0.068) for {'max_depth': None, 'n_estima

In [38]:
print("rf:", best_model_f1_score(rf_cv))

rf: 0.5778069599474721


Note: The default value for max_depth is None, which means that each tree will expand until every leaf is pure. A pure leaf is one where all of the data on the leaf comes from the same class.

### Gradient Boosting Model

In [39]:
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1]
}

gbm_cv = GridSearchCV(gb, parameters, cv=5)
gbm_cv.fit(x_train, y_train.values.ravel())

print_results(gbm_cv)

Acc: 0.68 (+/-0.094) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5}
Acc: 0.705 (+/-0.04) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
Acc: 0.748 (+/-0.076) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 250}
Acc: 0.775 (+/-0.077) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500}
Acc: 0.748 (+/-0.087) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
Acc: 0.754 (+/-0.087) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Acc: 0.807 (+/-0.061) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
Acc: 0.819 (+/-0.064) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
Acc: 0.777 (+/-0.075) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}
Acc: 0.79 (+/-0.061) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
Acc: 0.821 (+/-0.069) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 250}
Acc: 0.831 (+/-0.071) for {'learning_rate': 0.01, 'max_

In [40]:
print("gbm:", best_model_f1_score(gbm_cv))

gbm: 0.5683060109289618


### Artificial Neural Network

In [41]:
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

ann_model = Sequential()

ann_model.add(Dense(1024, input_dim = 10, activation = 'relu'))
ann_model.add(Dropout(0.4))

ann_model.add(Dense(256, activation = 'relu'))
ann_model.add(Dropout(0.4))

ann_model.add(Dense(64, activation = 'relu'))
ann_model.add(Dropout(0.4))

ann_model.add(Dense(32, activation = 'relu'))

ann_model.add(Dense(1, activation = 'sigmoid'))

ann_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 1024)              11264     
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               262400    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 32)               

In [42]:
# Compile model
ann_model.compile(loss='mse', optimizer=Adam(), metrics=['accuracy'])

In [43]:
# train model
history = ann_model.fit(x_train, y_train, epochs=25, batch_size=32, validation_data=(x_test, y_test))

Train on 10320 samples, validate on 3500 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


#### Evaluate ANN model

In [44]:
# evaluate the model
_, train_acc = ann_model.evaluate(x_train, y_train, verbose=0)
_, test_acc = ann_model.evaluate(x_test, y_test, verbose=0)
print('Train acc: %.4f, Test acc: %.4f' % (train_acc, test_acc))

ann_y_preds = ann_model.predict_classes(x_test)

# Get score 
# f1_score(y_test, ann_y_preds)
print("ann:", f1_score(y_test, ann_y_preds))

Train acc: 0.8444, Test acc: 0.7797
ann: 0.5571510626076966
