Libraries

In [1]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score, brier_score_loss

import pandas as pd

Load dataset

In [2]:
#import pandas as pd
df = pd.read_csv('../data/training.csv')

X = df.drop('Target', axis=1)
y = df['Target'].copy()

print('X:', X.shape, 'y:', y.shape)


X: (2563, 548) y: (2563,)


In [3]:
# Work with non-numeric features
X = X.select_dtypes(exclude=['object'])


In [4]:
from sklearn.impute import SimpleImputer

# Crear un objeto SimpleImputer
imputer = SimpleImputer(strategy='mean')  # Puedes cambiar 'mean' por 'median', 'most_frequent', o cualquier valor constante

# Aplicar la imputación a todas las columnas
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Imprimir el DataFrame resultante con valores imputados
# print(X)


In [5]:
# Remove low variance features
selection = VarianceThreshold(threshold=(0.1))
X_transformed = selection.fit_transform(X)

selected_columns = list(X.columns[selection.get_support()])
# print(selected_columns)

# X = selection.fit_transform(X)
X = X_transformed; del(X_transformed)
X.shape

(2563, 39)

In [6]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape

((2050, 39), (513, 39))

In [7]:
print(y_train.value_counts(), y_test.value_counts())

Target
0    1828
1     222
Name: count, dtype: int64 Target
0    457
1     56
Name: count, dtype: int64


### Build Classification models

K nearest neighbors

In [8]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(3) # Define classifier
knn.fit(X_train, y_train) # Train model

# Make predictions
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

# Training set performance
knn_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
knn_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
knn_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
knn_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
knn_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
knn_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
knn_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
knn_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % knn_train_accuracy)
print('- MCC: %s' % knn_train_mcc)
print('- F1 score: %s' % knn_train_f1)
print('- Brier score: %s' % knn_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % knn_test_accuracy)
print('- MCC: %s' % knn_test_mcc)
print('- F1 score: %s' % knn_test_f1)
print('- Brier score: %s' % knn_test_brier)

Model performance for Training set
- Accuracy: 0.9073170731707317
- MCC: 0.37531213176181616
- F1 score: 0.8850438839083913
- Brier score: 0.09268292682926829
----------------------------------
Model performance for Test set
- Accuracy: 0.8732943469785575
- MCC: 0.07488162711470178
- F1 score: 0.8420409294240728
- Brier score: 0.1267056530214425


In [9]:
knn.predict_proba(X_train)

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       ...,
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [1.        , 0.        ]])

Support vector machine (Radial basis function kernel)


In [10]:
from sklearn.svm import SVC

svm_rbf = SVC(gamma=2, C=1)
svm_rbf.fit(X_train, y_train)

# Make predictions
y_train_pred = svm_rbf.predict(X_train)
y_test_pred = svm_rbf.predict(X_test)

# Training set performance
svm_rbf_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
svm_rbf_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
svm_rbf_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
svm_rbf_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
svm_rbf_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
svm_rbf_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
svm_rbf_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
svm_rbf_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % svm_rbf_train_accuracy)
print('- MCC: %s' % svm_rbf_train_mcc)
print('- F1 score: %s' % svm_rbf_train_f1)
print('- Brier score: %s' % svm_rbf_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % svm_rbf_test_accuracy)
print('- MCC: %s' % svm_rbf_test_mcc)
print('- F1 score: %s' % svm_rbf_test_f1)
print('- Brier score: %s' % svm_rbf_test_brier)

Model performance for Training set
- Accuracy: 0.9980487804878049
- MCC: 0.9898676257556559
- F1 score: 0.9980409775483746
- Brier score: 0.001951219512195122
----------------------------------
Model performance for Test set
- Accuracy: 0.8908382066276803
- MCC: 0.0
- F1 score: 0.8394083720182473
- Brier score: 0.10916179337231968


Decision tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=5) # Define classifier
dt.fit(X_train, y_train) # Train model

# Make predictions
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

# Training set performance
dt_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
dt_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
dt_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
dt_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
dt_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
dt_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
dt_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
dt_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % dt_train_accuracy)
print('- MCC: %s' % dt_train_mcc)
print('- F1 score: %s' % dt_train_f1)
print('- Brier score: %s' % dt_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % dt_test_accuracy)
print('- MCC: %s' % dt_test_mcc)
print('- F1 score: %s' % dt_test_f1)
print('- Brier score: %s' % dt_test_brier)

Model performance for Training set
- Accuracy: 0.9
- MCC: 0.27538212309834736
- F1 score: 0.8679479671122976
- Brier score: 0.1
----------------------------------
Model performance for Test set
- Accuracy: 0.8771929824561403
- MCC: 0.0008353219102231928
- F1 score: 0.8357966075037109
- Brier score: 0.12280701754385964


Random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10) # Define classifier
rf.fit(X_train, y_train) # Train model

# Make predictions
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Training set performance
rf_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
rf_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
rf_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
rf_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
rf_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
rf_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
rf_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
rf_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- MCC: %s' % rf_train_mcc)
print('- F1 score: %s' % rf_train_f1)
print('- Brier score: %s' % rf_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- MCC: %s' % rf_test_mcc)
print('- F1 score: %s' % rf_test_f1)
print('- Brier score: %s' % rf_test_brier)

Model performance for Training set
- Accuracy: 0.9829268292682927
- MCC: 0.9091299899324812
- F1 score: 0.9822772630128498
- Brier score: 0.01707317073170732
----------------------------------
Model performance for Test set
- Accuracy: 0.8869395711500975
- MCC: -0.02189981576173119
- F1 score: 0.8374615372223028
- Brier score: 0.11306042884990253


Neural network

In [13]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(alpha=1, max_iter=1000)
mlp.fit(X_train, y_train)

# Make predictions
y_train_pred = mlp.predict(X_train)
y_test_pred = mlp.predict(X_test)

# Training set performance
mlp_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
mlp_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
mlp_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
mlp_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
mlp_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
mlp_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
mlp_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
mlp_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % mlp_train_accuracy)
print('- MCC: %s' % mlp_train_mcc)
print('- F1 score: %s' % mlp_train_f1)
print('- Brier score: %s' % mlp_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % mlp_test_accuracy)
print('- MCC: %s' % mlp_test_mcc)
print('- F1 score: %s' % mlp_test_f1)
print('- Brier score: %s' % mlp_test_brier)

Model performance for Training set
- Accuracy: 0.8917073170731707
- MCC: 0.0
- F1 score: 0.8406606372407202
- Brier score: 0.10829268292682927
----------------------------------
Model performance for Test set
- Accuracy: 0.8908382066276803
- MCC: 0.0
- F1 score: 0.8394083720182473
- Brier score: 0.10916179337231968


XGBClassifier

In [21]:
from xgboost import XGBClassifier

xgbc = XGBClassifier(n_estimators=100, random_state=42)
xgbc.fit(X_train, y_train)

# Make predictions
y_train_pred = xgbc.predict(X_train)
y_test_pred = xgbc.predict(X_test)

# Training set performance
xgbc_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
xgbc_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
xgbc_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
xgbc_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set performance
xgbc_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
xgbc_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
xgbc_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
xgbc_test_brier = brier_score_loss(y_test, y_test_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % xgbc_train_accuracy)
print('- MCC: %s' % xgbc_train_mcc)
print('- F1 score: %s' % xgbc_train_f1)
print('- Brier score: %s' % xgbc_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % xgbc_test_accuracy)
print('- MCC: %s' % xgbc_test_mcc)
print('- F1 score: %s' % xgbc_test_f1)
print('- Brier score: %s' % xgbc_test_brier)

Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
- Brier score: 0.0
----------------------------------
Model performance for Test set
- Accuracy: 0.8830409356725146
- MCC: 0.05684405386725444
- F1 score: 0.8420991842044472
- Brier score: 0.11695906432748537


## Build Stacked model

In [22]:
# Define estimators
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimator_list = [
    ('knn',knn),
    ('svm_rbf',svm_rbf),
    ('dt',dt),
    ('rf',rf),
    ('mlp',mlp), 
    ('xgbc',xgbc) ]

# Build stack model
stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)

# Train stacked model
stack_model.fit(X_train, y_train)

# Make predictions
y_train_pred = stack_model.predict(X_train)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
stack_model_train_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
stack_model_test_brier = brier_score_loss(y_train, y_train_pred) # Calculate Brier Score

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('- Brier score: %s' % stack_model_train_brier)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)
print('- Brier score: %s' % stack_model_test_brier)

Model performance for Training set
- Accuracy: 0.8917073170731707
- MCC: 0.0
- F1 score: 0.8406606372407202
- Brier score: 0.10829268292682927
----------------------------------
Model performance for Test set
- Accuracy: 0.8908382066276803
- MCC: 0.0
- F1 score: 0.8394083720182473
- Brier score: 0.10829268292682927


## Results

In [25]:
acc_train_list = {'knn':knn_train_accuracy,
'svm_rbf': svm_rbf_train_accuracy,
'dt': dt_train_accuracy,
'rf': rf_train_accuracy,
'mlp': mlp_train_accuracy,
'xgbc': xgbc_train_accuracy,
'stack': stack_model_train_accuracy}

mcc_train_list = {'knn':knn_train_mcc,
'svm_rbf': svm_rbf_train_mcc,
'dt': dt_train_mcc,
'rf': rf_train_mcc,
'mlp': mlp_train_mcc,
'xgbc': xgbc_train_mcc,
'stack': stack_model_train_mcc}

f1_train_list = {'knn':knn_train_f1,
'svm_rbf': svm_rbf_train_f1,
'dt': dt_train_f1,
'rf': rf_train_f1,
'mlp': mlp_train_f1,
'xgbc': xgbc_train_f1,
'stack': stack_model_train_f1}

brier_train_list = {'knn':knn_train_brier,
'svm_rbf': svm_rbf_train_brier,
'dt': dt_train_brier,
'rf': rf_train_brier,
'mlp': mlp_train_brier,
'xgbc': xgbc_train_brier,
'stack': stack_model_train_brier}

In [26]:
acc_df = pd.DataFrame.from_dict(acc_train_list, orient='index', columns=['Accuracy'])
mcc_df = pd.DataFrame.from_dict(mcc_train_list, orient='index', columns=['MCC'])
f1_df = pd.DataFrame.from_dict(f1_train_list, orient='index', columns=['F1'])
brier_df = pd.DataFrame.from_dict(brier_train_list, orient='index', columns=['Brier'])
df = pd.concat([acc_df, mcc_df, f1_df, brier_df], axis=1)
df

Unnamed: 0,Accuracy,MCC,F1,Brier
knn,0.907317,0.375312,0.885044,0.092683
svm_rbf,0.998049,0.989868,0.998041,0.001951
dt,0.9,0.275382,0.867948,0.1
rf,0.982927,0.90913,0.982277,0.017073
mlp,0.891707,0.0,0.840661,0.108293
xgbc,1.0,1.0,1.0,0.0
stack,0.891707,0.0,0.840661,0.108293


In [28]:
acc_test_list = {'knn':knn_test_accuracy,
'svm_rbf': svm_rbf_test_accuracy,
'dt': dt_test_accuracy,
'rf': rf_test_accuracy,
'mlp': mlp_test_accuracy,
'xgbc': xgbc_test_accuracy,
'stack': stack_model_test_accuracy}

mcc_test_list = {'knn':knn_test_mcc,
'svm_rbf': svm_rbf_test_mcc,
'dt': dt_test_mcc,
'rf': rf_test_mcc,
'mlp': mlp_test_mcc,
'xgbc': xgbc_test_mcc,
'stack': stack_model_test_mcc}

f1_test_list = {'knn':knn_test_f1,
'svm_rbf': svm_rbf_test_f1,
'dt': dt_test_f1,
'rf': rf_test_f1,
'mlp': mlp_test_f1,
'xgbc': xgbc_test_f1,
'stack': stack_model_test_f1}

brier_test_list = {'knn':knn_test_brier,
'svm_rbf': svm_rbf_test_brier,
'dt': dt_test_brier,
'rf': rf_test_brier,
'mlp': mlp_test_brier,
'xgbc': xgbc_test_brier,
'stack': stack_model_test_brier}

In [29]:
acc_df = pd.DataFrame.from_dict(acc_test_list, orient='index', columns=['Accuracy'])
mcc_df = pd.DataFrame.from_dict(mcc_test_list, orient='index', columns=['MCC'])
f1_df = pd.DataFrame.from_dict(f1_test_list, orient='index', columns=['F1'])
brier_df = pd.DataFrame.from_dict(brier_test_list, orient='index', columns=['Brier'])
df = pd.concat([acc_df, mcc_df, f1_df, brier_df], axis=1)
df

Unnamed: 0,Accuracy,MCC,F1,Brier
knn,0.873294,0.074882,0.842041,0.126706
svm_rbf,0.890838,0.0,0.839408,0.109162
dt,0.877193,0.000835,0.835797,0.122807
rf,0.88694,-0.0219,0.837462,0.11306
mlp,0.890838,0.0,0.839408,0.109162
xgbc,0.883041,0.056844,0.842099,0.116959
stack,0.890838,0.0,0.839408,0.108293


## Save model

In [30]:
import joblib

# Guardar el modelo en un archivo
joblib.dump(svm_rbf, '../models/svm_model.joblib')
joblib.dump(xgbc, '../models/xgbc_model.joblib')
joblib.dump(stack_model, '../models/stack_model.joblib')

# Guardar listado de features para el modelo
joblib.dump(selected_columns, '../models/list_of_features.joblib')

# Cargar el modelo desde el archivo
# loaded_model = joblib.load('stack_model.joblib')

['../models/list_of_features.joblib']