In [2]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
from hyperparameters import hyperparameters
import matplotlib as mpl

from IPython.display import display
# Start from matplotlib's built-in "bmh" style sheet, then layer a few
# notebook-wide overrides on top of it.
plt.style.use('bmh')
_rc_overrides = {
    # Grid and background
    "grid.linestyle" : "dashed",
    "axes.facecolor" : "white",
    # Hide the top and right axis borders
    "axes.spines.top" : False,
    "axes.spines.right" : False,
    # Legend drawn without a surrounding box
    "legend.frameon" : False,
    # Default figure size (inches) and resolution
    "figure.figsize" : (8, 5),
    "figure.dpi" : 300,
}
mpl.rcParams.update(_rc_overrides)
%matplotlib inline

# Suppress sklearn deprecated warnings
import warnings
def warn(*args, **kwargs): pass
warnings.warn = warn
np.set_printoptions(threshold=sys.maxsize)

np.random.seed(42)

In [16]:
# TODO: add a test for the dataset that has the creatina column.
#
# Available feature counts (each count includes the survive7y column):
#   18 -> dataset without thyroid features (default)
#   27 -> dataset with thyroid features
#   23 / 32 -> variants that keep the columns containing missing values
n_features = 23

# Only the 23/32-feature variants live in an extra sub-directory named after
# how missing values were handled.
extra_path = n_features not in (18, 27)
dropped_na_key = "dropped_na/"
mean_key = "mean/"
key = mean_key

# Resolve the optional sub-directory once and reuse it for every location.
_subdir = key if extra_path else ''
path = f"data/{n_features}features/{_subdir}"
path_models = f"models/{n_features}features/{_subdir}"
output_models = f"models_output/{n_features}features/{_subdir}"

print(path_models, path, output_models, sep="\n")

models/23features/mean/
data/23features/mean/
models_output/23features/mean/


In [17]:
# Load the pre-built train / validation / test splits; the first CSV column
# is used as the DataFrame index.
df_train, df_valid, df_test = (
    pd.read_csv(f"{path}{split_name}.csv", index_col=0)
    for split_name in ("train", "valid", "test")
)
# Total number of rows across the three splits, then the number of columns.
print(sum(map(len, (df_train, df_valid, df_test))))
print(len(df_train.columns))


train = df_train.to_numpy()
valid = df_valid.to_numpy()
test = df_test.to_numpy()

# The last column holds the target (Survive7y); every other column is a feature.
#   X_* -> 2-D array with all columns except the last
#   y_* -> 1-D array with the last column
X_train = train[:, :-1]
y_train = train[:, -1]
X_valid = valid[:, :-1]
y_valid = valid[:, -1]
X_test = test[:, :-1]
y_test = test[:, -1]
feat_names = df_train.columns.tolist()

# Show how the Survive7y classes are distributed in each split
from collections import Counter
print(*map(Counter, (y_train, y_valid, y_test)), sep="\n")

# Preprocessor that standardizes the numerical features that can be standardized
from utils import get_preprocess_std_num
preprocess_std = get_preprocess_std_num(feat_names)

# Fit on the training split only, then apply the same transformation to the
# training and validation features.
process_tmp = preprocess_std.fit(X_train)
X_train_std, X_valid_std = (
    process_tmp.transform(features) for features in (X_train, X_valid)
)

# Note: the *_std arrays do not need to be passed to the train function; the
# pipeline it builds applies the transformer to the data itself.
# To inspect the transformed validation set as a DataFrame:
#df_scaled = pd.DataFrame(X_valid_std,columns = preprocess_std.get_feature_names_out())
#display(df_scaled)

6447
23
Counter({1.0: 3494, 0.0: 505})
Counter({1.0: 1072, 0.0: 149})
Counter({1.0: 1084, 0.0: 143})


### Training


In [18]:
from functools import partial
from train import report, evaluate, train_and_evaluate
# Arguments that are identical for every model trained below. Only the
# estimator, its hyperparameters and the save name change from cell to cell.
_shared_train_kwargs = dict(
    # Raw (untransformed) splits; the preprocessor is passed separately below
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
    # Settings forwarded to train_and_evaluate
    scoring="f1_macro",
    iter=5000,
    save=True,
    # Output locations derived from the chosen feature set
    path_models=path_models,
    output_models=output_models,
)

# preprocess_std is bound as the first positional argument of train_and_evaluate.
train_partial = partial(train_and_evaluate, preprocess_std, **_shared_train_kwargs)

In [19]:
from sklearn.linear_model import LogisticRegression

# Logistic regression.
# class_weight defaults to None (every class weighted 1); "balanced" weights
# classes with n_samples / (n_classes * np.bincount(y)).
model = LogisticRegression(class_weight="balanced")
hyperparams = hyperparameters["lr"]

# Train and evaluate with the shared settings; results are saved under "lr".
train_partial(
    model=model,
    hyperparams=hyperparams,
    savename="lr",
)



Testing on training set:
              precision    recall  f1-score   support

         0.0      0.376     0.616     0.467       505
         1.0      0.939     0.852     0.893      3494

    accuracy                          0.822      3999
   macro avg      0.657     0.734     0.680      3999
weighted avg      0.868     0.822     0.840      3999

auc macro 0.835
confusion matrix
[[ 311  194]
 [ 516 2978]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.389     0.638     0.483       149
         1.0      0.945     0.861     0.901      1072

    accuracy                          0.834      1221
   macro avg      0.667     0.749     0.692      1221
weighted avg      0.877     0.834     0.850      1221

auc macro 0.841
confusion matrix
[[ 95  54]
 [149 923]]
Model rank: 1
Mean validation score: 0.673 (std: 0.012)
Parameters: {'model__C': 7, 'model__dual': True, 'model__max_iter': 50, 'model__penalty': 'l2', 'model__solver': 'liblinea

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8, 17, 18, 19, 20,
                                                   21])],
                                   verbose_feature_names_out=False)),
                ('model',
                 LogisticRegression(C=7, class_weight='balanced', dual=True,
                                    max_iter=50, solver='liblinear'))])

In [20]:
from sklearn.svm import SVC
# Support vector classifier with balanced class weights.
# probability=True enables probability estimates (presumably needed for the
# AUC reported by the evaluation -- confirm against train_and_evaluate).
model = SVC(class_weight="balanced", probability=True)
hyperparams = hyperparameters["svc"]

# Train and evaluate with the shared settings; results are saved under "svc".
train_partial(
    model=model,
    hyperparams=hyperparams,
    savename="svc",
)



Testing on training set:
              precision    recall  f1-score   support

         0.0      0.229     0.450     0.304       505
         1.0      0.908     0.782     0.840      3494

    accuracy                          0.740      3999
   macro avg      0.568     0.616     0.572      3999
weighted avg      0.822     0.740     0.772      3999

auc macro 0.685
confusion matrix
[[ 227  278]
 [ 763 2731]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.132     0.268     0.177       149
         1.0      0.881     0.756     0.814      1072

    accuracy                          0.696      1221
   macro avg      0.507     0.512     0.496      1221
weighted avg      0.790     0.696     0.736      1221

auc macro 0.514
confusion matrix
[[ 40 109]
 [262 810]]
Model rank: 1
Mean validation score: 0.673 (std: 0.020)
Parameters: {'model__C': 148, 'model__coef0': 0.7937729590829007, 'model__degree': 167, 'model__gamma': 'auto', 'model__ke

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8, 17, 18, 19, 20,
                                                   21])],
                                   verbose_feature_names_out=False)),
                ('model',
                 SVC(C=148, class_weight='balanced', coef0=0.7937729590829007,
                     degree=167, gamma='auto', max_iter=1600,
                     probability=True))])

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier, created with library defaults.
model = KNeighborsClassifier()
hyperparams = hyperparameters["knn"]

# Train and evaluate with the shared settings; results are saved under "knn".
train_partial(
    model=model,
    hyperparams=hyperparams,
    savename="knn",
)

Testing on training set:
              precision    recall  f1-score   support

         0.0      0.688     0.537     0.603       505
         1.0      0.935     0.965     0.950      3494

    accuracy                          0.911      3999
   macro avg      0.811     0.751     0.776      3999
weighted avg      0.904     0.911     0.906      3999

auc macro 0.939
confusion matrix
[[ 271  234]
 [ 123 3371]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.420     0.315     0.360       149
         1.0      0.908     0.939     0.923      1072

    accuracy                          0.863      1221
   macro avg      0.664     0.627     0.642      1221
weighted avg      0.848     0.863     0.855      1221

auc macro 0.688
confusion matrix
[[  47  102]
 [  65 1007]]
Model rank: 1
Mean validation score: 0.620 (std: 0.012)
Parameters: {'model__algorithm': 'ball_tree', 'model__leaf_size': 47, 'model__n_neighbors': 4, 'model__weights': 'unif

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8, 17, 18, 19, 20,
                                                   21])],
                                   verbose_feature_names_out=False)),
                ('model',
                 KNeighborsClassifier(algorithm='ball_tree', leaf_size=47,
                                      n_neighbors=4))])

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier, created with library defaults.
model = RandomForestClassifier()
hyperparams = hyperparameters["rf"]

# Train and evaluate with the shared settings; results are saved under "rf".
train_partial(
    model=model,
    hyperparams=hyperparams,
    savename="rf",
)

Testing on training set:
              precision    recall  f1-score   support

         0.0      0.834     0.964     0.894       505
         1.0      0.995     0.972     0.983      3494

    accuracy                          0.971      3999
   macro avg      0.914     0.968     0.939      3999
weighted avg      0.974     0.971     0.972      3999

auc macro 0.995
confusion matrix
[[ 487   18]
 [  97 3397]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.496     0.389     0.436       149
         1.0      0.918     0.945     0.931      1072

    accuracy                          0.877      1221
   macro avg      0.707     0.667     0.684      1221
weighted avg      0.866     0.877     0.871      1221

auc macro 0.844
confusion matrix
[[  58   91]
 [  59 1013]]
Model rank: 1
Mean validation score: 0.694 (std: 0.022)
Parameters: {'model__class_weight': 'balanced_subsample', 'model__criterion': 'entropy', 'model__max_features': 'sqrt'

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8, 17, 18, 19, 20,
                                                   21])],
                                   verbose_feature_names_out=False)),
                ('model',
                 RandomForestClassifier(class_weight='balanced_subsample',
                                        criterion='entropy',
                                        max_features='sqrt', min_samples_leaf=4,
                                        min_samples_split=3,
                                        n_estimators=148))])

In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost classifier, created with library defaults.
model = AdaBoostClassifier()
hyperparams = hyperparameters["adaboost"]

# Train and evaluate with the shared settings; results are saved under "adaboost".
train_partial(
    model=model,
    hyperparams=hyperparams,
    savename="adaboost",
)

Testing on training set:
              precision    recall  f1-score   support

         0.0      0.634     0.309     0.415       505
         1.0      0.907     0.974     0.939      3494

    accuracy                          0.890      3999
   macro avg      0.771     0.642     0.677      3999
weighted avg      0.873     0.890     0.873      3999

auc macro 0.873
confusion matrix
[[ 156  349]
 [  90 3404]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.522     0.242     0.330       149
         1.0      0.902     0.969     0.934      1072

    accuracy                          0.880      1221
   macro avg      0.712     0.605     0.632      1221
weighted avg      0.856     0.880     0.861      1221

auc macro 0.806
confusion matrix
[[  36  113]
 [  33 1039]]
Model rank: 1
Mean validation score: 0.669 (std: 0.016)
Parameters: {'model__learning_rate': 1.1595261644623975, 'model__n_estimators': 67}

Model rank: 3
Mean validation sco

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8, 17, 18, 19, 20,
                                                   21])],
                                   verbose_feature_names_out=False)),
                ('model',
                 AdaBoostClassifier(learning_rate=1.1595261644623975,
                                    n_estimators=67))])

In [24]:
from sklearn.neural_network import MLPClassifier
import random

# Multi-layer perceptron classifier, created with library defaults.
model = MLPClassifier()
hyperparams = hyperparameters["nn"]

# Train and evaluate with the shared settings; results are saved under "nn".
train_partial(
    model=model,
    hyperparams=hyperparams,
    savename="nn",
)

Testing on training set:
              precision    recall  f1-score   support

         0.0      0.764     0.192     0.307       505
         1.0      0.895     0.991     0.941      3494

    accuracy                          0.890      3999
   macro avg      0.829     0.592     0.624      3999
weighted avg      0.878     0.890     0.861      3999

auc macro 0.847
confusion matrix
[[  97  408]
 [  30 3464]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.588     0.134     0.219       149
         1.0      0.891     0.987     0.937      1072

    accuracy                          0.883      1221
   macro avg      0.740     0.561     0.578      1221
weighted avg      0.854     0.883     0.849      1221

auc macro 0.835
confusion matrix
[[  20  129]
 [  14 1058]]
Model rank: 1
Mean validation score: 0.678 (std: 0.016)
Parameters: {'model__alpha': 0.5625455560730596, 'model__early_stopping': True, 'model__hidden_layer_sizes': [296, 52]

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8, 17, 18, 19, 20,
                                                   21])],
                                   verbose_feature_names_out=False)),
                ('model',
                 MLPClassifier(alpha=0.5625455560730596, early_stopping=True,
                               hidden_layer_sizes=[296, 52],
                               learning_rate='adaptive',
                               learning_rate_init=0.004942684674076121,
                               max_iter=343))])

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier, created with library defaults.
model = GradientBoostingClassifier()
hyperparams = hyperparameters["gb"]

# Train and evaluate with the shared settings; results are saved under "gb".
train_partial(
    model=model,
    hyperparams=hyperparams,
    savename="gb",
)

Testing on training set:
              precision    recall  f1-score   support

         0.0      0.976     0.644     0.776       505
         1.0      0.951     0.998     0.974      3494

    accuracy                          0.953      3999
   macro avg      0.963     0.821     0.875      3999
weighted avg      0.954     0.953     0.949      3999

auc macro 0.976
confusion matrix
[[ 325  180]
 [   8 3486]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.514     0.255     0.341       149
         1.0      0.903     0.966     0.934      1072

    accuracy                          0.880      1221
   macro avg      0.708     0.611     0.637      1221
weighted avg      0.856     0.880     0.861      1221

auc macro 0.831
confusion matrix
[[  38  111]
 [  36 1036]]
Model rank: 1
Mean validation score: 0.671 (std: 0.018)
Parameters: {'model__learning_rate': 0.09433621306092999, 'model__max_depth': 5, 'model__max_features': None, 'model__

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8, 17, 18, 19, 20,
                                                   21])],
                                   verbose_feature_names_out=False)),
                ('model',
                 GradientBoostingClassifier(learning_rate=0.09433621306092999,
                                            max_depth=5, n_estimators=86,
                                            subsample=0.75))])

In [26]:
# NOTE: do not run this cell from Jupyter inside VS Code; run it from a notebook opened in a browser instead.
import os
#os.environ['KMP_DUPLICATE_LIB_OK']='True'

import xgboost as xgb

# XGBoost classifier, restricted to a single worker (n_jobs=1).
model = xgb.XGBClassifier(n_jobs=1)
hyperparams = hyperparameters["xgb"]

# Train and evaluate with the shared settings; results are saved under "xgb".
train_partial(
    model=model,
    hyperparams=hyperparams,
    savename="xgb",
)

Testing on training set:
              precision    recall  f1-score   support

         0.0      0.535     0.519     0.527       505
         1.0      0.931     0.935     0.933      3494

    accuracy                          0.882      3999
   macro avg      0.733     0.727     0.730      3999
weighted avg      0.881     0.882     0.881      3999

auc macro 0.865
confusion matrix
[[ 262  243]
 [ 228 3266]]
Testing on validation set:
              precision    recall  f1-score   support

         0.0      0.482     0.450     0.465       149
         1.0      0.924     0.933     0.929      1072

    accuracy                          0.874      1221
   macro avg      0.703     0.691     0.697      1221
weighted avg      0.870     0.874     0.872      1221

auc macro 0.832
confusion matrix
[[  67   82]
 [  72 1000]]
Model rank: 1
Mean validation score: 0.707 (std: 0.022)
Parameters: {'model__alpha': 0.06894819970410404, 'model__booster': 'dart', 'model__eta': 0.09428791561963863, 'model_

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('stand', StandardScaler(),
                                                  [1, 16, 8, 17, 18, 19, 20,
                                                   21])],
                                   verbose_feature_names_out=False)),
                ('model',
                 XGBClassifier(alpha=0.06894819970410404, booster='dart',
                               eta=0.09428791561963863,
                               gamma=0.1663660901938827,
                               lambda=0.9468305193894822, max_depth=2,
                               n_estimators=94, scale_pos_weight=0.4,
                               subsample=0.25))])



In [3]:

# Leakage sanity check for the 27-feature dataset: count how many values in the
# first CSV column of the training split also appear in the first column of the
# validation / test split. (Presumably that column is the row identifier, since
# the loading cell reads it with index_col=0 -- confirm.)
#
# NOTE(review): this cell rebinds df_train / df_valid / df_test, which the
# loading cell also defines; re-run the loading cell before training if this
# cell was executed in between.
mean_path = "data/27features/"
df_train = pd.read_csv(f"{mean_path}train.csv")
df_valid = pd.read_csv(f"{mean_path}valid.csv")
df_test = pd.read_csv(f"{mean_path}test.csv")

# Extract each first column once, instead of rebuilding the arrays on every
# loop iteration.
train_ids = df_train.iloc[:, 0].to_numpy()
valid_ids = df_valid.iloc[:, 0].to_numpy()
test_ids = df_test.iloc[:, 0].to_numpy()

print(1 in valid_ids)

# One boolean per training row: True when that row's value also occurs in the
# other split. Duplicated training values are counted once per row, as before.
in_valid = np.isin(train_ids, valid_ids)
in_test = np.isin(train_ids, test_ids)

# List every training value that leaks into the validation split.
for val in train_ids[in_valid]:
    print(val)

sum_valid = int(in_valid.sum())
sum_test = int(in_test.sum())
print("#######################")
print(sum_valid)
print(sum_test)

False
#######################
0
0
