Libraries

In [51]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score, brier_score_loss

import pandas as pd
import numpy as np
import joblib

### Load training

In [52]:
X = joblib.load('../data/X_training.joblib')
y = joblib.load('../data/y_training.joblib')

print(X.shape, y.shape)

(2563, 10) (2563,)


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.18, random_state=42, stratify=y)

### Build Classification models

K nearest neighbors

In [56]:
from sklearn.neighbors import KNeighborsClassifier

param_dist = {
    'n_neighbors': np.arange(1, 25),            # Number of neighbors
    'weights': ['uniform', 'distance'],         # Weight options
    'metric': ['euclidean', 'manhattan']        # Distance metric options
}

random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train) # Train model

knn = random_search.best_estimator_ 
print("Best hyperparameters:", random_search.best_params_)

# Make predictions
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

# Training set performance
knn_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
knn_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
knn_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
knn_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
knn_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
knn_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
knn_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
knn_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % knn_train_accuracy)
print('- MCC: %s' % knn_train_mcc)
print('- F1 score: %s' % knn_train_f1)
print('- Brier score: %s' % knn_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % knn_test_accuracy)
print('- MCC: %s' % knn_test_mcc)
print('- F1 score: %s' % knn_test_f1)
print('- Brier score: %s' % knn_test_brier)

Best hyperparameters: {'weights': 'distance', 'n_neighbors': 13, 'metric': 'manhattan'}
Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
- Brier score: 0.0
----------------------------------
Model performance for Test set
- Accuracy: 0.8917748917748918
- MCC: 0.0
- F1 score: 0.8407580215360535
- Brier score: 0.10822510822510822


Support vector machine (Radial basis function kernel)


In [57]:
from sklearn.svm import SVC

param_dist = {
    'C': np.logspace(-3, 3, 14),            # Regularization parameter
    'gamma': np.logspace(-3, 3, 14),         # Kernel coefficient for 'rbf'
    'kernel': ['rbf']            # Kernel type  ['linear', 'rbf'] 
}

random_search = RandomizedSearchCV(SVC(), param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)

svm_rbf = random_search.best_estimator_ 
print("Best hyperparameters:", random_search.best_params_)

# Make predictions
y_train_pred = svm_rbf.predict(X_train)
y_test_pred = svm_rbf.predict(X_test)

# Training set performance
svm_rbf_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
svm_rbf_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
svm_rbf_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
svm_rbf_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
svm_rbf_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
svm_rbf_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
svm_rbf_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
svm_rbf_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % svm_rbf_train_accuracy)
print('- MCC: %s' % svm_rbf_train_mcc)
print('- F1 score: %s' % svm_rbf_train_f1)
print('- Brier score: %s' % svm_rbf_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % svm_rbf_test_accuracy)
print('- MCC: %s' % svm_rbf_test_mcc)
print('- F1 score: %s' % svm_rbf_test_f1)
print('- Brier score: %s' % svm_rbf_test_brier)

Best hyperparameters: {'kernel': 'rbf', 'gamma': 1000.0, 'C': 14.251026703029993}
Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
- Brier score: 0.0
----------------------------------
Model performance for Test set
- Accuracy: 0.8917748917748918
- MCC: 0.0
- F1 score: 0.8407580215360535
- Brier score: 0.10822510822510822


Decision tree

In [58]:
from sklearn.tree import DecisionTreeClassifier

param_dist = {
    'max_depth': np.arange(1, 25),            # Max depth of the tree
    'min_samples_split': np.arange(2, 14),    # Minimum number of samples required to split an internal node
    'min_samples_leaf': np.arange(1, 14),     # Minimum number of samples required to be in a leaf node
    'criterion': ['gini', 'entropy'],         # Division criterion ('gini' or 'entropy')
    'max_features': ['sqrt', 'log2', None],   # Maximum number of features considered to split a node ['auto', 'sqrt', 'log2', None]
    'splitter': ['best', 'random']
}

random_search = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train) # Train model

dt = random_search.best_estimator_ 
print("Best hyperparameters:", random_search.best_params_)

# Make predictions
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

# Training set performance
dt_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
dt_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
dt_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
dt_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
dt_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
dt_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
dt_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
dt_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % dt_train_accuracy)
print('- MCC: %s' % dt_train_mcc)
print('- F1 score: %s' % dt_train_f1)
print('- Brier score: %s' % dt_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % dt_test_accuracy)
print('- MCC: %s' % dt_test_mcc)
print('- F1 score: %s' % dt_test_f1)
print('- Brier score: %s' % dt_test_brier)

Best hyperparameters: {'splitter': 'random', 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': None, 'max_depth': 17, 'criterion': 'gini'}
Model performance for Training set
- Accuracy: 0.8929081389814374
- MCC: 0.16398705003135747
- F1 score: 0.8534281599817568
- Brier score: 0.10709186101856259
----------------------------------
Model performance for Test set
- Accuracy: 0.8939393939393939
- MCC: 0.13369437740973306
- F1 score: 0.8459652110349501
- Brier score: 0.10606060606060606


Random forest

In [59]:
from sklearn.ensemble import RandomForestClassifier

param_dist = {
    'n_estimators': np.arange(10, 220, 10),          # Number of trees in the forest
    'max_depth': np.arange(1, 21),                   # Maximum tree depth
    'min_samples_split': np.arange(2, 11),           # Minimum number of samples required to split a node
    'min_samples_leaf': np.arange(1, 11),            # Minimum number of samples required in a leaf node
    'max_features': ['sqrt', 'log2', None]           # Maximum number of features considered to split a node ['auto', 'sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train) # Train model

rf = random_search.best_estimator_ 
print("Best hyperparameters:", random_search.best_params_)

# Make predictions
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Training set performance
rf_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
rf_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
rf_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
rf_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
rf_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
rf_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
rf_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
rf_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- MCC: %s' % rf_train_mcc)
print('- F1 score: %s' % rf_train_f1)
print('- Brier score: %s' % rf_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- MCC: %s' % rf_test_mcc)
print('- F1 score: %s' % rf_test_f1)
print('- Brier score: %s' % rf_test_brier)

Best hyperparameters: {'n_estimators': 40, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': 'log2', 'max_depth': 7}
Model performance for Training set
- Accuracy: 0.8938600666349358
- MCC: 0.13998800623977403
- F1 score: 0.8460495402028504
- Brier score: 0.10613993336506425
----------------------------------
Model performance for Test set
- Accuracy: 0.8917748917748918
- MCC: 0.0
- F1 score: 0.8407580215360535
- Brier score: 0.10822510822510822


Neural network

In [60]:
from sklearn.neural_network import MLPClassifier

param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100,), (100, 50, 25)],  # Hidden layer sizes
    'activation': ['logistic', 'tanh', 'relu'],                          # Activation functions
    'alpha': np.logspace(-5, 3, 9),                                      # L2 regularization term
    'learning_rate': ['constant', 'invscaling', 'adaptive'],             # Learning rate
    'max_iter': np.arange(100, 1401, 100)                                # Maximum number of iterations
}

random_search = RandomizedSearchCV(MLPClassifier(), param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train) # Train model

mlp = random_search.best_estimator_ 
print("Best hyperparameters:", random_search.best_params_)

# Make predictions
y_train_pred = mlp.predict(X_train)
y_test_pred = mlp.predict(X_test)

# Training set performance
mlp_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
mlp_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
mlp_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
mlp_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
mlp_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
mlp_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
mlp_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
mlp_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % mlp_train_accuracy)
print('- MCC: %s' % mlp_train_mcc)
print('- F1 score: %s' % mlp_train_f1)
print('- Brier score: %s' % mlp_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % mlp_test_accuracy)
print('- MCC: %s' % mlp_test_mcc)
print('- F1 score: %s' % mlp_test_f1)
print('- Brier score: %s' % mlp_test_brier)



Best hyperparameters: {'max_iter': 700, 'learning_rate': 'invscaling', 'hidden_layer_sizes': (50,), 'alpha': 0.1, 'activation': 'logistic'}
Model performance for Training set
- Accuracy: 0.8919562113279391
- MCC: 0.06254488780716103
- F1 score: 0.84149270082697
- Brier score: 0.10804378867206092
----------------------------------
Model performance for Test set
- Accuracy: 0.8917748917748918
- MCC: 0.0
- F1 score: 0.8407580215360535
- Brier score: 0.10822510822510822


XGBClassifier

In [61]:
from xgboost import XGBClassifier

param_dist = {
    'n_estimators': np.arange(50, 200, 10),         # Number of trees
    'learning_rate': [0.01, 0.1, 0.2, 0.3],         # Learning rate
    'max_depth': np.arange(3, 10),                  # Maximum tree depth
    'subsample': [0.8, 0.9, 1.0],                   # Proportion of samples used to train each tree
    'colsample_bytree': [0.8, 0.9, 1.0],            # Proportion of features used to train each tree
    'gamma': [0, 1, 2]                              # Regularization parameter to control tree complexity
}

random_search = RandomizedSearchCV(XGBClassifier(), param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train) # Train model

xgbc = random_search.best_estimator_ 
print("Best hyperparameters:", random_search.best_params_)

# Make predictions
y_train_pred = xgbc.predict(X_train)
y_test_pred = xgbc.predict(X_test)

# Training set performance
xgbc_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
xgbc_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
xgbc_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
xgbc_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
xgbc_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
xgbc_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
xgbc_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
xgbc_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % xgbc_train_accuracy)
print('- MCC: %s' % xgbc_train_mcc)
print('- F1 score: %s' % xgbc_train_f1)
print('- Brier score: %s' % xgbc_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % xgbc_test_accuracy)
print('- MCC: %s' % xgbc_test_mcc)
print('- F1 score: %s' % xgbc_test_f1)
print('- Brier score: %s' % xgbc_test_brier)

Best hyperparameters: {'subsample': 1.0, 'n_estimators': 160, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.9}
Model performance for Training set
- Accuracy: 0.8971918134221799
- MCC: 0.21594032671081012
- F1 score: 0.8545131991456252
- Brier score: 0.10280818657782008
----------------------------------
Model performance for Test set
- Accuracy: 0.8939393939393939
- MCC: 0.13369437740973306
- F1 score: 0.8459652110349501
- Brier score: 0.10606060606060606


## Build Stacked model

In [62]:
# Define estimators
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimator_list = [
    ('knn',knn),
    ('svm_rbf',svm_rbf),
    ('dt',dt),
    ('rf',rf),
    ('mlp',mlp), 
    ('xgbc',xgbc) ]

# Build stack model
stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)

# Train stacked model
stack_model.fit(X_train, y_train)

# Make predictions
y_train_pred = stack_model.predict(X_train)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
stack_model_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
stack_model_test_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('- Brier score: %s' % stack_model_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)
print('- Brier score: %s' % stack_model_test_brier)

Model performance for Training set
- Accuracy: 0.8933841028081866
- MCC: 0.12517922130012696
- F1 score: 0.8449221629850129
- Brier score: 0.10661589719181343
----------------------------------
Model performance for Test set
- Accuracy: 0.8917748917748918
- MCC: 0.0
- F1 score: 0.8407580215360535
- Brier score: 0.10661589719181343


## Results

In [63]:
acc_train_list = {'knn':knn_train_accuracy,
'svm_rbf': svm_rbf_train_accuracy,
'dt': dt_train_accuracy,
'rf': rf_train_accuracy,
'mlp': mlp_train_accuracy,
'xgbc': xgbc_train_accuracy,
'stack': stack_model_train_accuracy}

mcc_train_list = {'knn':knn_train_mcc,
'svm_rbf': svm_rbf_train_mcc,
'dt': dt_train_mcc,
'rf': rf_train_mcc,
'mlp': mlp_train_mcc,
'xgbc': xgbc_train_mcc,
'stack': stack_model_train_mcc}

f1_train_list = {'knn':knn_train_f1,
'svm_rbf': svm_rbf_train_f1,
'dt': dt_train_f1,
'rf': rf_train_f1,
'mlp': mlp_train_f1,
'xgbc': xgbc_train_f1,
'stack': stack_model_train_f1}

brier_train_list = {'knn':knn_train_brier,
'svm_rbf': svm_rbf_train_brier,
'dt': dt_train_brier,
'rf': rf_train_brier,
'mlp': mlp_train_brier,
'xgbc': xgbc_train_brier,
'stack': stack_model_train_brier}

In [64]:
acc_df = pd.DataFrame.from_dict(acc_train_list, orient='index', columns=['Accuracy'])
mcc_df = pd.DataFrame.from_dict(mcc_train_list, orient='index', columns=['MCC'])
f1_df = pd.DataFrame.from_dict(f1_train_list, orient='index', columns=['F1'])
brier_df = pd.DataFrame.from_dict(brier_train_list, orient='index', columns=['Brier'])
df = pd.concat([acc_df, mcc_df, f1_df, brier_df], axis=1)
df

Unnamed: 0,Accuracy,MCC,F1,Brier
knn,1.0,1.0,1.0,0.0
svm_rbf,1.0,1.0,1.0,0.0
dt,0.892908,0.163987,0.853428,0.107092
rf,0.89386,0.139988,0.84605,0.10614
mlp,0.891956,0.062545,0.841493,0.108044
xgbc,0.897192,0.21594,0.854513,0.102808
stack,0.893384,0.125179,0.844922,0.106616


In [65]:
acc_test_list = {'knn':knn_test_accuracy,
'svm_rbf': svm_rbf_test_accuracy,
'dt': dt_test_accuracy,
'rf': rf_test_accuracy,
'mlp': mlp_test_accuracy,
'xgbc': xgbc_test_accuracy,
'stack': stack_model_test_accuracy}

mcc_test_list = {'knn':knn_test_mcc,
'svm_rbf': svm_rbf_test_mcc,
'dt': dt_test_mcc,
'rf': rf_test_mcc,
'mlp': mlp_test_mcc,
'xgbc': xgbc_test_mcc,
'stack': stack_model_test_mcc}

f1_test_list = {'knn':knn_test_f1,
'svm_rbf': svm_rbf_test_f1,
'dt': dt_test_f1,
'rf': rf_test_f1,
'mlp': mlp_test_f1,
'xgbc': xgbc_test_f1,
'stack': stack_model_test_f1}

brier_test_list = {'knn':knn_test_brier,
'svm_rbf': svm_rbf_test_brier,
'dt': dt_test_brier,
'rf': rf_test_brier,
'mlp': mlp_test_brier,
'xgbc': xgbc_test_brier,
'stack': stack_model_test_brier}

In [66]:
acc_df = pd.DataFrame.from_dict(acc_test_list, orient='index', columns=['Accuracy'])
mcc_df = pd.DataFrame.from_dict(mcc_test_list, orient='index', columns=['MCC'])
f1_df = pd.DataFrame.from_dict(f1_test_list, orient='index', columns=['F1'])
brier_df = pd.DataFrame.from_dict(brier_test_list, orient='index', columns=['Brier'])
df = pd.concat([acc_df, mcc_df, f1_df, brier_df], axis=1)
df

Unnamed: 0,Accuracy,MCC,F1,Brier
knn,0.891775,0.0,0.840758,0.108225
svm_rbf,0.891775,0.0,0.840758,0.108225
dt,0.893939,0.133694,0.845965,0.106061
rf,0.891775,0.0,0.840758,0.108225
mlp,0.891775,0.0,0.840758,0.108225
xgbc,0.893939,0.133694,0.845965,0.106061
stack,0.891775,0.0,0.840758,0.106616


## Save model

In [67]:
import joblib

# Guardar el modelo en un archivo
joblib.dump(svm_rbf, '../models/svm_model.joblib')
joblib.dump(xgbc, '../models/xgbc_model.joblib')
joblib.dump(stack_model, '../models/stack_model.joblib')

# Guardar listado de features para el modelo
# joblib.dump(selected_columns, '../models/list_of_features.joblib')

['../models/stack_model.joblib']