In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report, roc_curve

from xgboost import XGBClassifier
from xgboost import DMatrix, cv
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA

from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

import seaborn as sns
import numpy
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib
import pandas as pd
import pickle
import joblib
%matplotlib inline
# import

In [None]:
# Folder prefixes used to build file paths by plain string concatenation
# (processed_data_folder + "<file>.csv" in the loading cell), which is why
# each value carries its own trailing separator.
raw_data_folder = "raw_data//"
processed_data_folder = "processed_data//"
models_folder = "models//"

In [2]:
# Load the pre-processed train/test tables for both categorical encodings.
# Fix: the *_test_* frames previously re-read the *_train_* CSVs, so any
# "test" evaluation would silently have been done on training data.
# NOTE(review): the test file names are inferred from the train file naming
# pattern -- confirm they match what the preprocessing step writes.
standardized_train_onehot = pd.read_csv(processed_data_folder + "standardized_train_onehot.csv")
standardized_test_onehot = pd.read_csv(processed_data_folder + "standardized_test_onehot.csv")

standardized_train_ordinal = pd.read_csv(processed_data_folder + "standardized_train_ordinal.csv")
standardized_test_ordinal = pd.read_csv(processed_data_folder + "standardized_test_ordinal.csv")

In [3]:
def plot_conf_mat(conf_mat):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    conf_mat : 2-D array-like
        Confusion matrix to draw. The axis labels assume the layout produced
        by sklearn.metrics.confusion_matrix(y_true, y_pred): rows are true
        labels, columns are predicted labels.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(conf_mat,
                annot=True,  # write the count inside every cell
                cbar=False,
                fmt='g',     # plain numbers instead of scientific notation
                ax=ax)

    # heatmap() puts matrix columns on the x axis and rows on the y axis, so
    # for an sklearn-style matrix x is "predicted" and y is "true" (the labels
    # were previously the other way round).
    ax.set_xlabel('predicted label')
    ax.set_ylabel('true label')

# Workflow
Since we have data that is mostly categorical, we can draw some conclusions:
1. A neural network/perceptron/logistic regression probably will not perform well
2. We don't have text data, so a weighted Bayes is not preferred

A good aspect of this dataset is that it is medium-sized, meaning:
1. We can brute-force (grid search) many algorithms to see which is best, then refine the search for some of them
2. We can try both encodings and see whether the best-performing algorithms prefer one or the other.
<br>
We will first use just the F1 score for quick-and-dirty filtering of algorithms. Given a single metric, we can quickly eliminate the low performers. 
<br>
After we eliminate the low performers, we can compare the metrics on the better ones and even retrain using finer Grid Search.

We will make functions out of each model training, so they can be adapted to different data preprocessings and be iterated upon.
<br>
All the models have a default grid, but can be fed different search grids as a function parameter.

In [4]:
def randomized_search_model(X, Y, model, grid, n_iter, n_jobs=6,
                            cv=5, scoring=None, random_state=None):
    """Perform randomized parameter search on the model according to the grid.

    This function requires the full (training) dataset because it makes use
    of cross-validation internally.

    Parameters
    ----------
    X, Y : features and labels passed to RandomizedSearchCV.fit.
    model : unfitted estimator to tune.
    grid : dict or list of dicts forwarded as ``param_distributions``.
    n_iter : number of parameter settings to sample.
    n_jobs : number of parallel workers (default 6).
    cv : cross-validation setting forwarded to the search (default 5,
        the value that used to be hard-coded).
    scoring : metric used to rank candidates; ``None`` keeps the estimator's
        default scorer, exactly as before.
    random_state : seed for the candidate sampling; ``None`` (the previous
        behaviour) leaves the search unseeded.

    Returns
    -------
    The fitted RandomizedSearchCV object.
    """
    rs_classif = RandomizedSearchCV(estimator=model,
                                    param_distributions=grid,
                                    n_iter=n_iter,
                                    cv=cv,
                                    scoring=scoring,
                                    random_state=random_state,
                                    verbose=1,
                                    n_jobs=n_jobs)
    rs_classif.fit(X, Y)
    return rs_classif

def grid_search_model(X, Y, model, grid, n_jobs=6, cv=5, scoring=None):
    """Perform an exhaustive parameter search on the model over the grid.

    Like randomized_search_model, this needs the full (training) dataset
    because candidates are compared with cross-validation.

    Parameters
    ----------
    X, Y : features and labels passed to GridSearchCV.fit.
    model : unfitted estimator to tune.
    grid : dict or list of dicts forwarded as ``param_grid``.
    n_jobs : number of parallel workers (default 6).
    cv : cross-validation setting forwarded to the search (default 5,
        the value that used to be hard-coded).
    scoring : metric used to rank candidates; ``None`` keeps the estimator's
        default scorer, exactly as before.

    Returns
    -------
    The fitted GridSearchCV object.
    """
    gs_classif = GridSearchCV(estimator=model,
                              param_grid=grid,
                              cv=cv,
                              scoring=scoring,
                              verbose=1,
                              n_jobs=n_jobs)
    gs_classif.fit(X, Y)
    return gs_classif

def eval_on_test(model,X,Y,method = f1_score):
    """Score a fitted model on (X, Y), print the score and return it.

    ``method`` is called as ``method(Y, predictions)`` and defaults to
    f1_score.
    """
    score = method(Y, model.predict(X))
    print(score)
    return score

def full_metrics(model,x,y):
    """
    Compare the model's predictions on ``x`` with the true labels ``y``.

    Prints accuracy (as a percentage), precision, recall and F1, and returns
    the same four values rounded to 4 decimals in a dict keyed by
    "accuracy", "precision", "recall" and "f1".
    """
    predictions = model.predict(x)

    # Ordered (name, scorer) pairs; the order fixes the key order of the result.
    scorers = (("accuracy", accuracy_score),
               ("precision", precision_score),
               ("recall", recall_score),
               ("f1", f1_score))
    raw_scores = {label: scorer(y, predictions) for label, scorer in scorers}

    print(f"Acc: {raw_scores['accuracy'] * 100:.4f}%")
    print(f"Precision: {raw_scores['precision']:.4f}")
    print(f"Recall: {raw_scores['recall']:.4f}")
    print(f"F1 score: {raw_scores['f1']:.4f}")

    return {label: round(value, 4) for label, value in raw_scores.items()}

Training a random forest classifier

In [6]:
def train_rf(X_train, X_test, Y_train, Y_test, grid = None):
    """Grid-search a RandomForestClassifier and score it on both splits.

    Parameters
    ----------
    X_train, Y_train : data the cross-validated grid search is fitted on.
    X_test, Y_test : held-out data used only for the final score.
    grid : optional parameter grid; when None the default grid below is used.

    Returns
    -------
    (search, (train_score, test_score)) where ``search`` is the fitted
    GridSearchCV object and the scores come from eval_on_test (F1 by default).
    """
    if grid is None:
        grid_rf = {
            # The first entry was missing in the source (``[,40,50,100]`` is a
            # syntax error). The recorded run reports 288 candidates, which
            # requires four values here.
            # NOTE(review): 10 is a stand-in -- restore the original value.
            "n_estimators": [10, 40, 50, 100],
            "max_depth": [None, 5, 10, 20],
            # "auto" was an alias of "sqrt" for forest classifiers (so the old
            # grid tested the same setting twice) and is rejected by newer
            # scikit-learn; "log2" keeps the grid size and adds a real option.
            "max_features": ["sqrt", "log2"],
            "min_samples_split": [2, 4, 6],
            "min_samples_leaf": [1, 2, 4],
        }
    else:
        grid_rf = grid

    rf = grid_search_model(X_train, Y_train, RandomForestClassifier(), grid_rf)
    f1 = (eval_on_test(rf, X_train, Y_train),
          eval_on_test(rf, X_test, Y_test))
    return rf, f1

Support Vector Classifier

In [7]:
def train_svc(X_train, X_test, Y_train, Y_test, grid = None):
    """Tune an SVC with a randomized search and score it on both splits.

    Uses ``grid`` when given, otherwise a default RBF/linear search space.
    Returns (search, (train_score, test_score)); the scores are produced by
    eval_on_test (F1 by default).
    """
    default_space = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [10, 100, 1000, 2000]},
        {'kernel': ['linear'], 'C': [10, 100, 1000, 2000]},
    ]
    search_space = default_space if grid is None else grid

    # Request 20 sampled settings from the search space.
    svc = randomized_search_model(X_train, Y_train, SVC(), search_space, 20)

    train_score = eval_on_test(svc, X_train, Y_train)
    test_score = eval_on_test(svc, X_test, Y_test)
    return svc, (train_score, test_score)

In [8]:
def train_knn(X_train, X_test, Y_train, Y_test, grid = None):
    """Grid-search a KNeighborsClassifier and score it on both splits.

    Uses ``grid`` when given, otherwise a default grid over neighbour count,
    weighting scheme and distance metric. Returns
    (search, (train_score, test_score)); the scores are produced by
    eval_on_test (F1 by default).
    """
    default_space = [{
        'n_neighbors': [3,5,7,11],
        'weights': ['uniform','distance'],
        'metric': ['euclidean','manhattan'],
    }]
    search_space = default_space if grid is None else grid

    knn = grid_search_model(X_train, Y_train, KNeighborsClassifier(), search_space)

    train_score = eval_on_test(knn, X_train, Y_train)
    test_score = eval_on_test(knn, X_test, Y_test)
    return knn, (train_score, test_score)

In [9]:
def train_adaboost(X_train, X_test, Y_train, Y_test, grid = None):
    """Grid-search an AdaBoostClassifier over depth-6 trees and score it.

    Parameters
    ----------
    X_train, Y_train : data the cross-validated grid search is fitted on.
    X_test, Y_test : held-out data used only for the final score.
    grid : optional parameter grid; when None a default grid over the tree's
        split criterion/splitter and the number of boosting rounds is used.
        A custom grid must use the nested-parameter prefix of the installed
        scikit-learn ("base_estimator__" or "estimator__").

    Returns
    -------
    (search, (train_score, test_score)) where ``search`` is the fitted
    GridSearchCV object and the scores come from eval_on_test (F1 by default).
    """
    # "sqrt" is what the old "auto" alias meant for tree classifiers; "auto"
    # itself is rejected by newer scikit-learn releases.
    DTC = DecisionTreeClassifier(max_features="sqrt", max_depth=6)

    # The weak learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` across scikit-learn versions; the
    # first positional slot is the weak learner under either name.
    booster = AdaBoostClassifier(DTC)
    if "estimator" in booster.get_params(deep=False):
        weak_learner_key = "estimator"
    else:
        weak_learner_key = "base_estimator"

    if grid is None:
        grid_adaboost = {weak_learner_key + "__criterion": ["gini", "entropy"],
                         weak_learner_key + "__splitter": ["best", "random"],
                         "n_estimators": [50,75,100,125]}
    else:
        grid_adaboost = grid

    adaboost = grid_search_model(X_train, Y_train, booster, grid_adaboost)

    f1 = (eval_on_test(adaboost, X_train, Y_train),
          eval_on_test(adaboost, X_test, Y_test))
    return adaboost, f1

In [10]:
# def train_gp(X_train, X_test, Y_train, Y_test):
#     grid_gp = {'kernel':[RBF(1.0), 0.5* RBF(1.0), 0.33* RBF(1.0), 0.1 * RBF(1.0), RBF(0.5)],
#                  'n_restarts_optimizer' : [0,1,2,3]}
#     gp = grid_search_model(X_train, Y_train, GaussianProcessClassifier(RBF(1.0)), grid_gp, 5)

#     eval_on_test(gp,X_train,Y_train)
#     eval_on_test(gp,X_test,Y_test)
#     return gp

In [11]:
def train_xgb(X_train, X_test, Y_train, Y_test, grid = None, early_stopping_rounds=50):
    """Train an XGBClassifier whose number of trees is chosen by xgboost.cv.

    Steps:
    1. Build a classifier with fixed starting hyper-parameters and a generous
       budget of 1000 boosting rounds.
    2. Run 5-fold xgboost.cv (AUC metric) with early stopping and set
       ``n_estimators`` to the number of rows of the returned result.
    3. If ``grid`` is None, fit that classifier on the training data (this is
       the original behaviour). If a ``grid`` is supplied, tune the classifier
       over it with grid_search_model instead. Previously the ``grid``
       argument (and a default grid built here) was silently ignored.

    Parameters
    ----------
    X_train, Y_train : training data (also used for the cross-validation).
    X_test, Y_test : held-out data used only for the final score.
    grid : optional parameter grid for a follow-up grid search; None skips it.
    early_stopping_rounds : forwarded to xgboost.cv.

    Returns
    -------
    (model, (train_score, test_score)); ``model`` is the fitted XGBClassifier,
    or the fitted GridSearchCV object when ``grid`` was given. Scores come
    from eval_on_test (F1 by default).
    """
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,   # upper bound; trimmed after cross-validation
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1)

    # Use the native cross-validation routine to decide how many boosting
    # rounds to keep.
    xgb_param = xgb.get_xgb_params()
    xgtrain = DMatrix(X_train, label=Y_train)
    cvresult = cv(xgb_param, xgtrain,
                  num_boost_round=xgb.get_params()['n_estimators'],
                  nfold=5,
                  metrics='auc',
                  early_stopping_rounds=early_stopping_rounds)
    xgb.set_params(n_estimators=cvresult.shape[0])

    if grid is None:
        # Fit the algorithm on the data
        xgb.fit(X_train, Y_train)
        fitted = xgb
    else:
        # Honour an explicitly requested search space.
        fitted = grid_search_model(X_train, Y_train, xgb, grid)

    f1 = (eval_on_test(fitted, X_train, Y_train),
          eval_on_test(fitted, X_test, Y_test))
    return fitted, f1

In [17]:

def train_all(X_train, X_test, Y_train, Y_test):
    """Train every candidate model on the same split and collect the results.

    Each trainer is called as ``trainer(X_train, X_test, Y_train, Y_test)``
    with its default search grid. A separator line and the model's title are
    printed before each one.

    Returns
    -------
    dict mapping 'rf', 'svc', 'knn', 'adaboost' and 'xgb' to the
    ``(model, scores)`` tuple returned by the corresponding trainer.
    """
    sep = '-'*80
    trainers = (
        ('rf', "Random Forest", train_rf),
        ('svc', "Support Vector Classifier", train_svc),
        ('knn', "K Nearest Neighbors", train_knn),
        ('adaboost', "Adaboost", train_adaboost),
        ('xgb', "XGBoost Classifier", train_xgb),
    )

    results = {}
    for key, title, trainer in trainers:
        print(sep)
        # Flush every header (the XGBoost one previously was not flushed) so
        # it is emitted before the trainer starts producing its own output.
        print(title, flush=True)
        results[key] = trainer(X_train, X_test, Y_train, Y_test)

    print(sep)
    print("")
    return results

In [15]:
# Train every model on the one-hot encoded data.
# NOTE(review): this project-local import sits mid-notebook; it would be
# easier to find in the import cell at the top.
from preprocessing import split_train
# presumably split_train returns (X_train, X_test, Y_train, Y_test) in that
# order, since the result is unpacked straight into train_all, and .2 is
# presumably the held-out fraction -- TODO confirm against preprocessing.py
onehot_split = split_train(standardized_train_onehot,.2)
models_onehot = train_all(*onehot_split)

--------------------
Random Forest
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    2.0s
[Parallel(n_jobs=6)]: Done 416 tasks      | elapsed:   19.6s
[Parallel(n_jobs=6)]: Done 666 tasks      | elapsed:   34.4s
[Parallel(n_jobs=6)]: Done 1016 tasks      | elapsed:   47.5s
[Parallel(n_jobs=6)]: Done 1440 out of 1440 | elapsed:  1.0min finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


0.829827915869981
0.7868852459016392
--------------------
Support Vector Classifier
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=6)]: Done  60 out of  60 | elapsed:   31.7s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


0.7741935483870968
0.8
--------------------
K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits
0.7933579335793357
0.8319999999999999
--------------------
Adaboost


[Parallel(n_jobs=6)]: Done  68 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed:    0.2s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=6)]: Done  64 tasks      | elapsed:    2.4s
[Parallel(n_jobs=6)]: Done  69 out of  80 | elapsed:    2.7s remaining:    0.3s
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed:    3.1s finished


0.9383177570093458
0.7559055118110236
--------------------
XGBoost Classifier
0.8073394495412843
0.7619047619047619
--------------------



In [16]:
# Same experiment on the ordinal-encoded data, for comparison with the
# one-hot run.
# presumably split_train returns (X_train, X_test, Y_train, Y_test) and .2 is
# the held-out fraction -- TODO confirm against preprocessing.py
ordinal_split = split_train(standardized_train_ordinal,.2)
models_ordinal = train_all(*ordinal_split)

--------------------
Random Forest
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  64 tasks      | elapsed:    1.5s
[Parallel(n_jobs=6)]: Done 364 tasks      | elapsed:    8.3s
[Parallel(n_jobs=6)]: Done 864 tasks      | elapsed:   19.2s
[Parallel(n_jobs=6)]: Done 1440 out of 1440 | elapsed:   31.6s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


0.8764478764478766
0.8029197080291972
--------------------
Support Vector Classifier
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=6)]: Done  60 out of  60 | elapsed:   36.8s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


0.7671232876712328
0.7887323943661971
--------------------
K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits
0.7976878612716762
0.7659574468085105
--------------------
Adaboost


[Parallel(n_jobs=6)]: Done  68 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed:    0.1s finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=6)]: Done  64 tasks      | elapsed:    2.1s
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed:    2.6s finished


0.9795158286778398
0.7153284671532847
--------------------
XGBoost Classifier
0.8863198458574182
0.7611940298507464
--------------------



# Iterative training approach
Now that we have a rough sense of what each algorithm does with each encoding, we can move on to iterating on the best ones.
<br>
We will choose two categories:
1. The ones that overfit the training set, because that gives us the choice to regularize and decrease variance to improve performance
2. The ones that do well out of the box, giving good performance on both the cross-validation tests and the validation set.

In [None]:
# joblib.dump(models,"..//models//onehot_models_it1.pkl")
# joblib.dump(models_ordinal,"..//models//ordinal_models_it1.pkl")

In [None]:
# onehot_models = joblib.load("..//models//onehot_models_it1.pkl")