In [1]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import copy
import matplotlib as mpl
from random import sample
from joblib import dump, load
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score, fbeta_score, make_scorer, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, learning_curve, validation_curve
from sklearn.utils.class_weight import compute_class_weight

plt.style.use('bmh')
mpl.rcParams.update({
    "grid.linestyle" : "dashed",
    "axes.facecolor" : "white",
    "axes.spines.top" : False,
    "axes.spines.right" : False,
    "legend.frameon" : False,
    "figure.figsize" : (8, 5),
    "figure.dpi" : 300,
})
%matplotlib inline

# Suppress sklearn deprecated warnings
import warnings
def warn(*args, **kwargs): pass
warnings.warn = warn
np.set_printoptions(threshold=sys.maxsize)

np.random.seed(42)

In [2]:
suffix_old = ""
suffix = ""
# TODO make test for dataset with creatina column
# Dataset without thyroid = 18 features (including survive7y)
# Dataset with thyroid = 27 features (including survive7y)
# With columns that have missing values, 23 and 32
# Default 18
n_features = 27
path = f"data/{n_features}features/"
path_models = f"models/{n_features}features/"

In [3]:
# Read data
df_train = pd.read_csv(f"{path}train{suffix}.csv", index_col=0)
df_valid = pd.read_csv(f"{path}valid{suffix_old}.csv", index_col=0)
df_test = pd.read_csv(f"{path}test{suffix_old}.csv", index_col=0)

if n_features == 7:
    top_variables = [
        "Dyslipidemia\nHystory of dyslipidemia",
        "fe",
        "Previous CABG",
        "Diabetes\nHistory of diabetes",
        "Previous Myocardial Infarction",
        "Smoke\nHistory of smoke",
        "Documented resting \nor exertional ischemia",
        "Survive7Y"
    ]
    df_train = df_train.loc[:, top_variables]
    df_valid = df_valid.loc[:, top_variables]
    df_test = df_test.loc[:, top_variables]

print(df_train)
train, valid, test = df_train.to_numpy(), df_valid.to_numpy(), df_test.to_numpy()

# y_**** contains the value of Survive7y as a list
# X_**** contains everything except for Survive7y as a list of list
X_train, y_train = train[:, :-1], train[:, -1]
X_valid, y_valid = valid[:, :-1], valid[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

feat_names = list(df_train.columns)

# Print how Survive7y are distribuited in each set
from collections import Counter
print(Counter(y_train))
print(Counter(y_valid))
print(Counter(y_test))

# All the numerical features that can be standardized
from utils import get_preprocess_std_num
preprocess_std = get_preprocess_std_num(feat_names)

preprocess_std_all = StandardScaler()

# Preprocessed ready-to-use train and valid set
process_tmp = preprocess_std.fit(X_train)
X_train_std = process_tmp.transform(X_train)
X_valid_std = process_tmp.transform(X_valid)
print(X_train_std)

        Gender (Male = 1)        Age   TSH   fT3    fT4  Euthyroid  \
Number                                                               
6436                    1  59.179775  3.33  2.26   9.70          1   
2999                    1  65.367978  2.74  2.16  16.30          1   
4007                    0  73.994382  0.71  1.94  11.20          0   
1158                    1  55.154494  2.04  2.50   7.90          1   
5581                    0  71.471910  5.11  3.05  15.80          0   
...                   ...        ...   ...   ...    ...        ...   
5884                    0  71.932584  1.25  3.43  17.60          1   
2339                    1  55.334270  1.71  1.88  13.20          0   
1486                    0  75.323034  1.52  1.58  10.84          0   
3568                    0  82.609551  1.97  2.39  11.60          1   
7528                    1  71.786517  0.82  2.46  13.50          1   

        Subclinical primary hypothyroidism (SCH)  \
Number                               

### Training


In [4]:
from functools import partial
from train import report, evaluate, train_and_evaluate

train_partial = partial(
    train_and_evaluate, 
    preprocess_std, 
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
    scoring="f1_macro", 
    iter=5000, 
    save=True,
    path_models = path_models
)

In [5]:
from sklearn.linear_model import LogisticRegression

hyperparams = {
    'model__penalty': ['l1', 'l2', 'elasticnet'], #type of penalities added, 'elasticnet' means both
    'model__dual': [True, False], # use primal or dual form. Default is True.  
    'model__warm_start': [True, False], # default is False. When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. 
    'model__C': stats.randint(1, 10), #default is 1, if the value is larger, then it indicates stronger regularization
    'model__max_iter': stats.randint(50, 500), # Default is 100, Maximum number of iterations taken for the solvers to converge.
    'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default is lbfgs. 
}
#Default is None (thus weight = 1). Balanced uses the formula n_samples / (n_classes * np.bincount(y))
model = LogisticRegression(class_weight="balanced")
train_partial(model=model, hyperparams=hyperparams, savename="lr")
# metrics.plot_roc_curve(pipe, X_valid, y_valid)

# import math
# w = logreg.coef_[0]
# feature_importance = pd.DataFrame(df_feat.columns[:-1], columns=["features"])
# feature_importance["importance"] = pow(math.e, w)
# feature_importance = feature_importance.sort_values(by = ["importance"], ascending=False)
# feature_importance

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Testing on training set:
              precision    recall  f1-score   support

         0.0      0.520     0.414     0.461       505
         1.0      0.918     0.945     0.931      3494

    accuracy                          0.878      3999
   macro avg      0.719     0.679     0.696      3999
weighted avg      0.867     0.878     0.872      3999

auc macro 0.838
confusion matrix
[[ 209  296]
 [ 193 3301]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.507     0.438     0.470       169
         1.0      0.920     0.938     0.929      1165

    accuracy                          0.875      1334
   macro avg      0.713     0.688     0.699      1334
weighted avg      0.868     0.875     0.871      1334

auc macro 0.826
confusion matrix
[[  74   95]
 [  72 1093]]
Model rank: 1
Mean validation score: 0.703 (std: 0.000)
Parameters: {'model__C': 9, 'model__dual': True, 'model__max_iter': 157, 'model__penalty': 'l2', 'model__solver': 'lib

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 25, 17, 2])])),
                ('model',
                 LogisticRegression(C=9, class_weight='balanced', dual=True,
                                    max_iter=157, solver='liblinear',
                                    warm_start=True))])

In [6]:
from sklearn.svm import SVC

hyperparams = {
    'model__C': stats.randint(100, 600),
    'model__kernel': ['rbf', 'poly', 'sigmoid'],
    'model__degree': stats.randint(5, 200),
    'model__gamma': ['scale', 'auto'], #Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. scale = 1 / (n_features * X.var()) as value of gamma,
    'model__coef0': stats.uniform(0.0, 1), # It is only significant in ‘poly’ and ‘sigmoid’.
    'model__max_iter': [400, 800, 1200, 1600]
}

model = SVC(class_weight="balanced", probability=True)
train_partial(model=model, hyperparams=hyperparams, savename="svc")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Testing on training set:
              precision    recall  f1-score   support

         0.0      0.214     0.483     0.296       505
         1.0      0.909     0.743     0.818      3494

    accuracy                          0.710      3999
   macro avg      0.561     0.613     0.557      3999
weighted avg      0.821     0.710     0.752      3999

auc macro 0.701
confusion matrix
[[ 244  261]
 [ 898 2596]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.175     0.402     0.244       169
         1.0      0.893     0.725     0.801      1165

    accuracy                          0.684      1334
   macro avg      0.534     0.564     0.522      1334
weighted avg      0.802     0.684     0.730      1334

auc macro 0.642
confusion matrix
[[ 68 101]
 [320 845]]
Model rank: 1
Mean validation score: 0.665 (std: 0.011)
Parameters: {'model__C': 102, 'model__coef0': 0.7438620447180111, 'model__degree': 105, 'model__gamma': 'scale', 'model__k

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 25, 17, 2])])),
                ('model',
                 SVC(C=102, class_weight='balanced', coef0=0.7438620447180111,
                     degree=105, max_iter=1600, probability=True))])

In [7]:
from sklearn.neighbors import KNeighborsClassifier

hyperparams = {
    'model__n_neighbors': stats.randint(2, 100),
    'model__weights': ('uniform', 'distance'), # ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.
    'model__algorithm': ('ball_tree', 'kd_tree'),
    'model__leaf_size': stats.randint(10, 60)
}

model = KNeighborsClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="knn")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.673     0.566     0.615       505
         1.0      0.939     0.960     0.949      3494

    accuracy                          0.910      3999
   macro avg      0.806     0.763     0.782      3999
weighted avg      0.905     0.910     0.907      3999

auc macro 0.943
confusion matrix
[[ 286  219]
 [ 139 3355]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.378     0.331     0.353       169
         1.0      0.905     0.921     0.913      1165

    accuracy                          0.846      1334
   macro avg      0.642     0.626     0.633      1334
weighted avg      0.838     0.846     0.842      1334

auc macro 0.699
confusion matrix
[[  56  113]
 [  92 1073]]
Model rank: 1
Mean validation score: 0.644 (std: 0.016)
Parameters: {'model__algorithm': 'ball_tree', 'model

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 25, 17, 2])])),
                ('model',
                 KNeighborsClassifier(algorithm='ball_tree', leaf_size=11,
                                      n_neighbors=4))])

In [8]:
from sklearn.ensemble import RandomForestClassifier

hyperparams = {
    'model__n_estimators': stats.randint(10, 200),
    'model__criterion': ('gini', 'entropy'), # The function to measure the quality of a split. Tree-specific parameter
    'model__min_samples_split': stats.randint(1, 8), #The minimum number of samples required to split an internal node
    'model__min_samples_leaf': stats.randint(1, 5), # The minimum number of samples required to be at a leaf node
    'model__max_features': ('sqrt', 'log2', None), # If “sqrt”, then max_features=sqrt(n_features). If “log2”, then max_features=log2(n_features), If None, then max_features=n_features.
    'model__class_weight': ['balanced', 'balanced_subsample'], 
}

model = RandomForestClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="rf")

# feature importance use permutation importance
# importance = rf_rand.best_estimator_["model"].feature_importances_
# plt.bar(list(range(len(importance))), importance)

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.782     0.947     0.857       505
         1.0      0.992     0.962     0.977      3494

    accuracy                          0.960      3999
   macro avg      0.887     0.954     0.917      3999
weighted avg      0.966     0.960     0.962      3999

auc macro 0.990
confusion matrix
[[ 478   27]
 [ 133 3361]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.425     0.450     0.437       169
         1.0      0.919     0.912     0.916      1165

    accuracy                          0.853      1334
   macro avg      0.672     0.681     0.676      1334
weighted avg      0.857     0.853     0.855      1334

auc macro 0.821
confusion matrix
[[  76   93]
 [ 103 1062]]
Model rank: 1
Mean validation score: 0.699 (std: 0.004)
Parameters: {'model__class_weight': 'balanced', 'mod

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 25, 17, 2])])),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        max_features='log2', min_samples_leaf=4,
                                        min_samples_split=4,
                                        n_estimators=111))])

In [9]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

hyperparams = {
    'model__n_estimators': stats.randint(10, 100),
    'model__learning_rate': stats.uniform(0.2, 1)
}

model = AdaBoostClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="adaboost")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits
Testing on training set:
              precision    recall  f1-score   support

         0.0      0.639     0.295     0.404       505
         1.0      0.905     0.976     0.939      3494

    accuracy                          0.890      3999
   macro avg      0.772     0.636     0.672      3999
weighted avg      0.872     0.890     0.872      3999

auc macro 0.863
confusion matrix
[[ 149  356]
 [  84 3410]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.591     0.308     0.405       169
         1.0      0.906     0.969     0.937      1165

    accuracy                          0.885      1334
   macro avg      0.749     0.638     0.671      1334
weighted avg      0.866     0.885     0.869      1334

auc macro 0.823
confusion matrix
[[  52  117]
 [  36 1129]]
Model rank: 1
Mean validation score: 0.676 (std: 0.005)
Parameters: {'model__learning_rate': 0.8039516858638

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 25, 17, 2])])),
                ('model',
                 AdaBoostClassifier(learning_rate=0.8039516858638249,
                                    n_estimators=44))])

In [None]:
from sklearn.neural_network import MLPClassifier
import random

hyperparams = {
    'model__hidden_layer_sizes': [[stats.randint.rvs(100, 300), stats.randint.rvs(50, 150)], [stats.randint.rvs(50, 300)]], 
    'model__solver': ['sgd', 'adam'], #sgd’ refers to stochastic gradient descent. ‘adam’ refers to a stochastic gradient-based optimizer
    'model__learning_rate_init': stats.uniform(0.0005, 0.005), 
    'model__learning_rate': ('constant', 'adaptive'), 
    'model__alpha': stats.uniform(0, 1), #Strength of the L2 regularization term
    'model__early_stopping': [True],
    'model__max_iter': stats.randint(300, 500),
}

model = MLPClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="nn")

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

hyperparams = {
    'model__learning_rate': stats.uniform(0.03, 0.2),
    'model__n_estimators': stats.randint(10, 100),
    'model__max_depth': stats.randint(2, 6),
    'model__max_features': ('sqrt', 'log2', None),  # regularization
    'model__subsample': (0.25, 0.5, 0.75, 1),       # regularization
}

model = GradientBoostingClassifier()
train_partial(model=model, hyperparams=hyperparams, savename="gb")

In [None]:
#Don't run this in jupyter within vscode, run this with notebooks within browsers.
import os
#os.environ['KMP_DUPLICATE_LIB_OK']='True'

import xgboost as xgb

hyperparams = {
    'model__booster': ['gbtree', 'gblinear', 'dart'],
    'model__eta': stats.uniform(0.05, 0.5),
    'model__gamma': stats.uniform(0, 0.2),
    'model__max_depth': [2, 3, 4, 6],
    'model__n_estimators': stats.randint(10, 100),
    'model__subsample': [0.25, 0.5, 0.75, 1],     # Stochastic regularization
    'model__lambda': stats.uniform(0.5, 1.5),     # L2 regularization
    'model__alpha': stats.uniform(0, 0.5),        # L1 regularization
    'model__scale_pos_weight': [0.2, 0.4, 0.8, 1, 2],
}

model = xgb.XGBClassifier(n_jobs=1)
train_partial(model=model, hyperparams=hyperparams, savename="xgb")

### Evaluate

In [None]:
models = [
    "logreg_random_svmsmote_logreg",
    "svc_random_bordersmote_svc",
    "knn2_random_svmsmote_knn2",
    "rf2_random_svmsmote_rf2",
    "adaboost2_random_svmsmote_adaboost2",
    "nn_random_svmsmote_nn",
    "gb_random_svmsmote_gb",
    #"xgb2_random_svmsmote_xgb2",
    #"xgb2",
    #"nn1_random_svmsmote_nn1",
]
name = f"{models[6]}.joblib"
model = load(path_models+name)
#model = load(path_models)
# model.fit(X_train, y_train)
# evaluate(model, X_train, y_train)
print(model)


evaluate(model, X_valid, y_valid)
evaluate(model, X_test, y_test)
X_train.shape

In [None]:
tree = [load(path_models+f"tree4.joblib")]
evaluate(tree[0], X_valid4, y_valid4)
evaluate(tree[0], X_test4, y_test4)