In [None]:
import time
from contextlib import contextmanager

@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took to run.

    Parameters
    ----------
    title : str
        Label printed in front of the elapsed time.

    Notes
    -----
    The message is printed only when the wrapped block finishes without
    raising; an exception propagates to the caller and nothing is printed.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or turn negative) when the wall clock is adjusted while the block runs.
    started = time.perf_counter()
    yield
    print("{} - done in {:.0f}s".format(title, time.perf_counter() - started))

# Read the training data

In [None]:
# standard Python tools
import numpy as np
import pandas as pd

# Main table
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
# Work on a 20,000-row random subsample. The fixed seed keeps the subsample,
# and therefore every score computed from it, identical across kernel restarts.
train = pd.read_pickle('data/global_train_data.pkl').sample(20000, random_state=1)

# Split the data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the rows kept for training; the remainder is used for validation.
train_size = 0.75

y = train['TARGET'].values

# Stratify on the target so both parts keep the same class proportions.
# The target and the SK_ID_CURR column are excluded from the features.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns=['TARGET', 'SK_ID_CURR']),
    y,
    test_size=1 - train_size,
    stratify=y,
    random_state=1,
)

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_valid:', X_valid.shape)
print('Shape of y_valid:', y_valid.shape)

# Preprocess data

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Median-impute missing values, then standardise every feature.
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# The imputation and scaling statistics are learned from the training rows
# only and then re-applied unchanged to the validation rows.
X_train = pipeline.fit_transform(X_train)
X_valid = pipeline.transform(X_valid)

# Initialize classifiers

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier

# Logistic regression and its search grid
LR_clf = LogisticRegression(solver='lbfgs', max_iter=400)
LR_parameters = dict(C=[0.005], penalty=['l2'])

# Single decision tree and its search grid
DT_clf = DecisionTreeClassifier(random_state=1)
DT_parameters = dict(max_depth=[8], min_samples_leaf=[10])

# Random forest (100 trees) and its search grid
RF_clf = RandomForestClassifier(n_estimators=100, random_state=1)
RF_parameters = dict(max_depth=[39], min_samples_leaf=[37])

# Initializing the ExtraTreesClassifier
# The extra weight goes on class 1, in line with scale_pos_weight=11.5 used
# for the LGBMClassifier in this notebook; the earlier mapping
# {0: 11.5, 1: 1} up-weighted class 0 instead.
# NOTE(review): assumes TARGET == 1 is the minority class - confirm against
# the class balance of the training data.
XT_clf = ExtraTreesClassifier(random_state=1, class_weight={0: 1, 1: 11.5})
XT_params = {'n_estimators': [300], 'min_samples_leaf': [18],
             'min_samples_split': [8]}

# XGBoost classifier built with the library's default constructor arguments;
# the tuned values are supplied through the search grid below.
XGB_clf = XGBClassifier()
XGB_parameters = dict(
    max_depth=[6],
    n_estimators=[200],
    learning_rate=[0.05],
    gamma=[0],
)

# Initializing the LGBMClassifier
# Fixed (non-searched) settings; only the learning rate goes through the grid.
params = {'boosting_type': 'gbdt', 'objective': 'binary', 'max_depth': 18,
          'n_jobs': -1, 'num_leaves': 30, 'n_estimators': 1600,
          'max_bin': 512, 'subsample_for_bin': 200, 'subsample': 0.8,
          'subsample_freq': 1, 'colsample_bytree': 0.8,
          'reg_alpha': 80, 'reg_lambda': 20,
          'min_split_gain': 0.5, 'min_child_weight': 1,
          'min_child_samples': 10, 'scale_pos_weight': 11.5, 'num_class': 1,
          # LightGBM's name for ROC AUC is 'auc'; 'roc_auc' is the
          # scikit-learn scorer name and is not a LightGBM metric.
          'metric': 'auc'
          }
LGB_clf = LGBMClassifier(**params)
LGB_parameters = {'learning_rate': [0.02]}

# Multi-layer perceptron: fixed settings plus a grid over the layer layout
params = dict(
    activation='relu',
    solver='adam',
    random_state=1,
    learning_rate='constant',
    max_iter=1000,
    alpha=0.0001,
    learning_rate_init=0.0001,
)
MLP_clf = MLPClassifier(**params)
MLP_parameters = dict(hidden_layer_sizes=[(400, 50, 4)])

# AdaBoost and its search grid
AB_clf = AdaBoostClassifier(random_state=7)
AB_parameters = dict(n_estimators=[100])

In [None]:
# Registry of the models to compare: display name -> [estimator, parameter grid].
# Insertion order is the order in which the models are fitted later on.
classifiers = {
    label: [estimator, grid]
    for label, estimator, grid in (
        ('LogisticRegression', LR_clf, LR_parameters),
        ('DecisionTreeClassifier', DT_clf, DT_parameters),
        ('RandomForestClassifier', RF_clf, RF_parameters),
        ('ExtraTreesClassifier', XT_clf, XT_params),
        ('XGBClassifier', XGB_clf, XGB_parameters),
        ('LGBMClassifier', LGB_clf, LGB_parameters),
        ('MLPClassifier', MLP_clf, MLP_parameters),
        ('AdaBoostClassifier', AB_clf, AB_parameters),
    )
}

# Fit classifiers on data and get results

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score


def get_statistics(_y_valid, _y_pred):
    """Derive confusion-matrix counts and rates for binary 0/1 labels.

    Parameters
    ----------
    _y_valid : iterable
        True labels.
    _y_pred : iterable
        Predicted labels.

    Returns
    -------
    tuple
        ``(TN, FP, FN, TP, sensitivity, specificity, precision, accuracy)``.
        A rate whose denominator is zero is returned as 0.
    """
    matrix = confusion_matrix(list(_y_valid), list(_y_pred), labels=[0, 1])
    TN, FP, FN, TP = matrix.ravel()

    actual_positives = TP + FN
    actual_negatives = TN + FP
    predicted_positives = TP + FP
    total = TP + FP + FN + TN

    # Sensitivity (recall, true positive rate)
    sensitivity = 0 if actual_positives == 0 else TP / actual_positives
    # Specificity (true negative rate)
    specificity = 0 if actual_negatives == 0 else TN / actual_negatives
    # Precision (positive predictive value)
    precision = 0 if predicted_positives == 0 else TP / predicted_positives
    # Share of all samples that were classified correctly
    accuracy = 0 if total == 0 else (TP + TN) / total

    return TN, FP, FN, TP, sensitivity, specificity, precision, accuracy


def eval_error(_y_valid, _y_pred):
    """Return sensitivity multiplied by accuracy for binary 0/1 labels.

    Despite the name, this is a score where higher is better: it is wrapped
    with ``make_scorer(..., greater_is_better=True)`` for the grid search.

    Parameters
    ----------
    _y_valid : iterable
        True labels.
    _y_pred : iterable
        Predicted labels.

    Returns
    -------
    number
        ``sensitivity * accuracy``; 0 when there are no positive samples
        (sensitivity is then defined as 0).
    """
    # Reuse get_statistics so the confusion-matrix arithmetic lives in one
    # place and the two functions cannot drift apart.
    _, _, _, _, sensitivity, _, _, accuracy = get_statistics(_y_valid, _y_pred)
    return sensitivity * accuracy


def get_cross_validation_fitting_result(classifier_name, classifier, parameters):
    """Grid-search a classifier with 10-fold CV and score it on the validation set.

    The data are read from the notebook-level variables ``X_train``,
    ``y_train``, ``X_valid`` and ``y_valid``.

    Parameters
    ----------
    classifier_name : str
        Label stored in the 'Model Type' column.
    classifier : estimator
        Unfitted estimator; the fitted model must provide ``predict`` and
        ``predict_proba``.
    parameters : dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        A single row holding the validation confusion-matrix counts and
        rates, the custom score ('error'), the cross-validated and validation
        ROC AUC, and the selected hyper-parameters.
    """
    # Hyper-parameters are selected on the custom score (sensitivity * accuracy).
    # ROC AUC is tracked alongside it so that the 'AUC - 10xv' column holds an
    # actual cross-validated AUC rather than the custom score.
    scoring = {
        'custom': make_scorer(eval_error, greater_is_better=True),
        'roc_auc': 'roc_auc',
    }

    # With several scorers, `refit` must name the one used to pick the best
    # candidate and refit it on the full training set.
    clf_grid = GridSearchCV(classifier, parameters, cv=10,
                            refit='custom', n_jobs=-1, verbose=1, scoring=scoring)
    clf_grid.fit(X_train, y_train)
    clf_model = clf_grid.best_estimator_
    y_pred_proba = clf_model.predict_proba(X_valid)[:, 1]
    y_pred = clf_model.predict(X_valid)

    TN, FP, FN, TP, sensitivity, specifity, precision, accuracy = get_statistics(
        y_valid, y_pred)

    # Mean ROC AUC over the CV folds for the selected parameter combination
    cv_auc = clf_grid.cv_results_['mean_test_roc_auc'][clf_grid.best_index_]

    result = pd.DataFrame({'Model Type': classifier_name,
                           'TN': [TN], 'FP': [FP], 'FN': [FN], 'TP': [TP],
                           'sensitivity': [sensitivity], 'specifity': [specifity],
                           'precision': [precision], 'accuracy': [accuracy],
                           'error': [eval_error(y_valid, y_pred)],
                           'AUC - 10xv': [cv_auc],
                           'AUC - Valid': [roc_auc_score(y_valid, y_pred_proba, average=None)],
                           'Hyperparameters': [clf_grid.best_params_]})
    return result

In [None]:
result_columns = ['Model Type', 'TN', 'FP', 'FN', 'TP',
                  'sensitivity', 'specifity', 'precision', 'accuracy',
                  'error', 'AUC - 10xv', 'AUC - Valid', 'Hyperparameters']

# Collect the one-row frames and concatenate once at the end. Growing a
# DataFrame with concat inside the loop re-copies the table on every step,
# and starting from an empty frame degrades the column dtypes.
# If a model fails part-way, the rows gathered so far remain in result_frames.
result_frames = []
for name, (clf, grid) in classifiers.items():
    with timer(name):
        result_frames.append(
            get_cross_validation_fitting_result(name, clf, grid))

# Build the model scoreboard (an empty table if no classifier was registered)
if result_frames:
    results = pd.concat(result_frames, ignore_index=True).reindex(
        columns=result_columns)
else:
    results = pd.DataFrame(columns=result_columns)

results

In [None]:
# Show the model scoreboard assembled in the previous cell.
results