In [1]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import copy
import matplotlib as mpl
from random import sample
from joblib import dump, load
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score, fbeta_score, make_scorer, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, learning_curve, validation_curve
from sklearn.utils.class_weight import compute_class_weight

plt.style.use('bmh')
mpl.rcParams.update({
    "grid.linestyle" : "dashed",
    "axes.facecolor" : "white",
    "axes.spines.top" : False,
    "axes.spines.right" : False,
    "legend.frameon" : False,
    "figure.figsize" : (8, 5),
    "figure.dpi" : 300,
})
%matplotlib inline

# Suppress sklearn deprecated warnings
import warnings
def warn(*args, **kwargs): pass
warnings.warn = warn
np.set_printoptions(threshold=sys.maxsize)

np.random.seed(42)

In [2]:
suffix_old = ""
suffix = ""

n_features = 18
path_thyroid = "data/thyroid/"
path_without_thyroid = "data/no_thyroid/"
path = path_without_thyroid if n_features == 18 else path_thyroid
path_models = f"models/{n_features}features/"

In [3]:
# Read data
df_train = pd.read_csv(f"{path}train{suffix}.csv", index_col=0)
df_valid = pd.read_csv(f"{path}valid{suffix_old}.csv", index_col=0)
df_test = pd.read_csv(f"{path}test{suffix_old}.csv", index_col=0)

if n_features == 7:
    top_variables = [
        "Dyslipidemia\nHystory of dyslipidemia",
        "fe",
        "Previous CABG",
        "Diabetes\nHistory of diabetes",
        "Previous Myocardial Infarction",
        "Smoke\nHistory of smoke",
        "Documented resting \nor exertional ischemia",
        "Survive7Y"
    ]
    df_train = df_train.loc[:, top_variables]
    df_valid = df_valid.loc[:, top_variables]
    df_test = df_test.loc[:, top_variables]


train, valid, test = df_train.to_numpy(), df_valid.to_numpy(), df_test.to_numpy()

# y_**** contains the value of Survive7y as a list
# X_**** contains everything except for Survive7y as a list of list
X_train, y_train = train[:, :-1], train[:, -1]
X_valid, y_valid = valid[:, :-1], valid[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

feat_names = list(df_train.columns)

# Print how Survive7y are distribuited in each set
from collections import Counter
print(Counter(y_train))
print(Counter(y_valid))
print(Counter(y_test))

# All the numerical features that can be standardized
from utils import get_preprocess_std_num
preprocess_std = get_preprocess_std_num(feat_names)

preprocess_std_all = StandardScaler()

# Preprocessed ready-to-use train and valid set
process_tmp = preprocess_std.fit(X_train)
X_train_std = process_tmp.transform(X_train)
X_valid_std = process_tmp.transform(X_valid)
print(X_train_std)

Counter({1.0: 3494, 0.0: 505})
Counter({1.0: 1165, 0.0: 169})
Counter({1.0: 1165, 0.0: 169})
[[-4.48927617e-01  6.87087892e-01 -7.29396253e-01  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
   1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
   0.00000000e+00]
 [-3.26141156e+00  1.10781445e-01 -7.29396253e-01  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [-8.53265802e-01 -1.36207032e-01  1.95194796e+00  1.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00]
 [-7.93945421e-01  6.04758400e-01  1.05816656e+00  1.00000

### Training


In [4]:
from functools import partial
from train import report, evaluate, train_and_evaluate

train_partial = partial(
    train_and_evaluate, 
    preprocess_std, 
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
    scoring="f1_macro", 
    iter=5000, 
    save=True
)

In [7]:
from sklearn.linear_model import LogisticRegression

hyperparams = {
    'model__penalty': ['l1', 'l2', 'elasticnet'], #type of penalities added, 'elasticnet' means both
    'model__dual': [True, False], # use primal or dual form. Default is True.  
    'model__warm_start': [True, False], # default is False. When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. 
    'model__C': stats.randint(1, 10), #default is 1, if the value is larger, then it indicates stronger regularization
    'model__max_iter': stats.randint(50, 500), # Default is 100, Maximum number of iterations taken for the solvers to converge.
    'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default is lbfgs. 
}
#Default is None (thus weight = 1). Balanced uses the formula n_samples / (n_classes * np.bincount(y))
model = LogisticRegression(class_weight="balanced")
train_partial(model=model, hyperparams=hyperparams, savename="logistic_regression")
# metrics.plot_roc_curve(pipe, X_valid, y_valid)

# import math
# w = logreg.coef_[0]
# feature_importance = pd.DataFrame(df_feat.columns[:-1], columns=["features"])
# feature_importance["importance"] = pow(math.e, w)
# feature_importance = feature_importance.sort_values(by = ["importance"], ascending=False)
# feature_importance

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.346     0.675     0.457       505
         1.0      0.946     0.815     0.876      3494

    accuracy                          0.798      3999
   macro avg      0.646     0.745     0.667      3999
weighted avg      0.870     0.798     0.823      3999

auc macro 0.836
confusion matrix
[[ 341  164]
 [ 645 2849]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.355     0.651     0.459       169
         1.0      0.942     0.828     0.882      1165

    accuracy                          0.806      1334
   macro avg      0.649     0.740     0.670      1334
weighted avg      0.868     0.806     0.828      1334

auc macro 0.817
confusion matrix
[[110  59]
 [200 965]]
Model rank: 1
Mean validation score: 0.666 (std: 0.011)
Parameters: {'model__C': 9, 'model__dual': True, 'model_

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8])])),
                ('model',
                 LogisticRegression(C=9, class_weight='balanced', dual=True,
                                    max_iter=54, solver='liblinear',
                                    warm_start=True))])

In [5]:
from sklearn.svm import SVC

hyperparams = {
    'model__C': stats.randint(100, 600),
    'model__kernel': ['rbf', 'poly', 'sigmoid'],
    'model__degree': stats.randint(5, 200),
    'model__gamma': ['scale', 'auto'], #Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. scale = 1 / (n_features * X.var()) as value of gamma,
    'model__coef0': stats.uniform(0.0, 1), # It is only significant in ‘poly’ and ‘sigmoid’.
    'model__max_iter': [400, 800, 1200, 1600]
}

model = SVC(class_weight="balanced", probability=True)
train_partial(model=model, hyperparams=hyperparams, savename="svc")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits




Testing on training set:
              precision    recall  f1-score   support

         0.0      0.239     0.461     0.315       505
         1.0      0.910     0.788     0.844      3494

    accuracy                          0.746      3999
   macro avg      0.575     0.625     0.580      3999
weighted avg      0.825     0.746     0.778      3999

auc macro 0.324
confusion matrix
[[ 233  272]
 [ 742 2752]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.200     0.367     0.259       169
         1.0      0.896     0.787     0.838      1165

    accuracy                          0.734      1334
   macro avg      0.548     0.577     0.548      1334
weighted avg      0.807     0.734     0.764      1334

auc macro 0.394
confusion matrix
[[ 62 107]
 [248 917]]
Model rank: 1
Mean validation score: 0.648 (std: 0.009)
Parameters: {'model__C': 412, 'model__coef0': 0.26302583574278404, 'model__degree': 199, 'model__gamma': 'auto', 'model__k

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8])])),
                ('model',
                 SVC(C=412, class_weight='balanced', coef0=0.26302583574278404,
                     degree=199, gamma='auto', max_iter=1200,
                     probability=True))])

In [15]:
from sklearn.neighbors import KNeighborsClassifier

hyperparams = {
    'model__n_neighbors': stats.randint(2, 100),
    'model__weights': ('uniform', 'distance'), # ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.
    'model__algorithm': ('ball_tree', 'kd_tree'),
    'model__leaf_size': stats.randint(10, 60)
}

model = KNeighborsClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="knn")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.634     0.505     0.562       505
         1.0      0.930     0.958     0.944      3494

    accuracy                          0.901      3999
   macro avg      0.782     0.731     0.753      3999
weighted avg      0.893     0.901     0.896      3999

auc macro 0.928
confusion matrix
[[ 255  250]
 [ 147 3347]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.385     0.296     0.334       169
         1.0      0.901     0.931     0.916      1165

    accuracy                          0.851      1334
   macro avg      0.643     0.614     0.625      1334
weighted avg      0.836     0.851     0.842      1334

auc macro 0.686
confusion matrix
[[  50  119]
 [  80 1085]]
Model rank: 1
Mean validation score: 0.614 (std: 0.004)
Parameters: {'model__algorithm': 'ball_tree', 'model

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8])])),
                ('model',
                 KNeighborsClassifier(algorithm='ball_tree', leaf_size=22,
                                      n_neighbors=4))])

In [7]:
from sklearn.ensemble import RandomForestClassifier

hyperparams = {
    'model__n_estimators': stats.randint(10, 200),
    'model__criterion': ('gini', 'entropy'), # The function to measure the quality of a split. Tree-specific parameter
    'model__min_samples_split': stats.randint(1, 8), #The minimum number of samples required to split an internal node
    'model__min_samples_leaf': stats.randint(1, 5), # The minimum number of samples required to be at a leaf node
    'model__max_features': ('sqrt', 'log2', None), # If “sqrt”, then max_features=sqrt(n_features). If “log2”, then max_features=log2(n_features), If None, then max_features=n_features.
    'model__class_weight': ['balanced', 'balanced_subsample'], 
}

model = RandomForestClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="rf")

# feature importance use permutation importance
# importance = rf_rand.best_estimator_["model"].feature_importances_
# plt.bar(list(range(len(importance))), importance)

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.610     0.907     0.729       505
         1.0      0.986     0.916     0.950      3494

    accuracy                          0.915      3999
   macro avg      0.798     0.912     0.839      3999
weighted avg      0.938     0.915     0.922      3999

auc macro 0.972
confusion matrix
[[ 458   47]
 [ 293 3201]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.395     0.491     0.438       169
         1.0      0.923     0.891     0.907      1165

    accuracy                          0.840      1334
   macro avg      0.659     0.691     0.672      1334
weighted avg      0.857     0.840     0.848      1334

auc macro 0.819
confusion matrix
[[  83   86]
 [ 127 1038]]
Model rank: 1
Mean validation score: 0.680 (std: 0.002)
Parameters: {'model__class_weight': 'balanced_subsam

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8])])),
                ('model',
                 RandomForestClassifier(class_weight='balanced_subsample',
                                        criterion='entropy',
                                        max_features='log2', min_samples_leaf=4,
                                        min_samples_split=5,
                                        n_estimators=41))])

In [8]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

hyperparams = {
    'model__n_estimators': stats.randint(10, 100),
    'model__learning_rate': stats.uniform(0.2, 1)
}

model = AdaBoostClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="adaboost")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.637     0.216     0.322       505
         1.0      0.897     0.982     0.937      3494

    accuracy                          0.885      3999
   macro avg      0.767     0.599     0.630      3999
weighted avg      0.864     0.885     0.860      3999

auc macro 0.847
confusion matrix
[[ 109  396]
 [  62 3432]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.538     0.166     0.253       169
         1.0      0.890     0.979     0.933      1165

    accuracy                          0.876      1334
   macro avg      0.714     0.573     0.593      1334
weighted avg      0.845     0.876     0.847      1334

auc macro 0.813
confusion matrix
[[  28  141]
 [  24 1141]]
Model rank: 1
Mean validation score: 0.642 (std: 0.006)
Parameters: {'model__learning_rate': 1.1827301048579

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8])])),
                ('model',
                 AdaBoostClassifier(learning_rate=1.1827301048579324,
                                    n_estimators=39))])

In [9]:
from sklearn.neural_network import MLPClassifier
import random

hyperparams = {
    'model__hidden_layer_sizes': [[stats.randint.rvs(100, 300), stats.randint.rvs(50, 150)], [stats.randint.rvs(50, 300)]], 
    'model__solver': ['sgd', 'adam'], #sgd’ refers to stochastic gradient descent. ‘adam’ refers to a stochastic gradient-based optimizer
    'model__learning_rate_init': stats.uniform(0.0005, 0.005), 
    'model__learning_rate': ('constant', 'adaptive'), 
    'model__alpha': stats.uniform(0, 1), #Strength of the L2 regularization term
    'model__early_stopping': [True],
    'model__max_iter': stats.randint(300, 500),
}

model = MLPClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="nn")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.704     0.160     0.261       505
         1.0      0.891     0.990     0.938      3494

    accuracy                          0.885      3999
   macro avg      0.798     0.575     0.600      3999
weighted avg      0.867     0.885     0.852      3999

auc macro 0.839
confusion matrix
[[  81  424]
 [  34 3460]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.703     0.154     0.252       169
         1.0      0.890     0.991     0.937      1165

    accuracy                          0.885      1334
   macro avg      0.796     0.572     0.595      1334
weighted avg      0.866     0.885     0.851      1334

auc macro 0.816
confusion matrix
[[  26  143]
 [  11 1154]]
Model rank: 1
Mean validation score: 0.653 (std: 0.004)
Parameters: {'model__alpha': 0.4523746016562262, 'mo

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8])])),
                ('model',
                 MLPClassifier(alpha=0.4523746016562262, early_stopping=True,
                               hidden_layer_sizes=[269],
                               learning_rate='adaptive',
                               learning_rate_init=0.0037189203772429744,
                               max_iter=426))])

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

hyperparams = {
    'model__learning_rate': stats.uniform(0.03, 0.2),
    'model__n_estimators': stats.randint(10, 100),
    'model__max_depth': stats.randint(2, 6),
    'model__max_features': ('sqrt', 'log2', None),  # regularization
    'model__subsample': (0.25, 0.5, 0.75, 1),       # regularization
}

model = GradientBoostingClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="gb")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.649     0.319     0.428       505
         1.0      0.908     0.975     0.941      3494

    accuracy                          0.892      3999
   macro avg      0.779     0.647     0.684      3999
weighted avg      0.876     0.892     0.876      3999

auc macro 0.856
confusion matrix
[[ 161  344]
 [  87 3407]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.507     0.213     0.300       169
         1.0      0.895     0.970     0.931      1165

    accuracy                          0.874      1334
   macro avg      0.701     0.591     0.615      1334
weighted avg      0.846     0.874     0.851      1334

auc macro 0.807
confusion matrix
[[  36  133]
 [  35 1130]]
Model rank: 1
Mean validation score: 0.658 (std: 0.014)
Parameters: {'model__learning_rate': 0.2285717550369

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8])])),
                ('model',
                 GradientBoostingClassifier(learning_rate=0.22857175503692595,
                                            max_features='sqrt',
                                            n_estimators=51, subsample=0.25))])

In [None]:
import os
#os.environ['KMP_DUPLICATE_LIB_OK']='True'

import xgboost as xgb

hyperparams = {
    'model__booster': ['gbtree', 'gblinear', 'dart'],
    'model__eta': stats.uniform(0.05, 0.5),
    'model__gamma': stats.uniform(0, 0.2),
    'model__max_depth': [2, 3, 4, 6],
    'model__n_estimators': stats.randint(10, 100),
    'model__subsample': [0.25, 0.5, 0.75, 1],     # Stochastic regularization
    'model__lambda': stats.uniform(0.5, 1.5),     # L2 regularization
    'model__alpha': stats.uniform(0, 0.5),        # L1 regularization
    'model__scale_pos_weight': [0.2, 0.4, 0.8, 1, 2],
}

model = xgb.XGBClassifier(n_jobs=1)
train_partial(model=model, hyperparams=hyperparams, savename="xgb")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits


### Evaluate

In [None]:
models = [
    "logreg_random_svmsmote_logreg",
    "svc_random_bordersmote_svc",
    "knn2_random_svmsmote_knn2",
    "rf2_random_svmsmote_rf2",
    "adaboost2_random_svmsmote_adaboost2",
    "nn_random_svmsmote_nn",
    "gb_random_svmsmote_gb",
    #"xgb2_random_svmsmote_xgb2",
    #"xgb2",
    #"nn1_random_svmsmote_nn1",
]
name = f"{models[6]}.joblib"
model = load(path_models+name)
#model = load(path_models)
# model.fit(X_train, y_train)
# evaluate(model, X_train, y_train)
print(model)


evaluate(model, X_valid, y_valid)
evaluate(model, X_test, y_test)
X_train.shape

In [None]:
tree = [load(path_models+f"tree4.joblib")]
evaluate(tree[0], X_valid4, y_valid4)
evaluate(tree[0], X_test4, y_test4)