# INTRODUCCIÓN

In [None]:
# Se importan las librerías necesarias.
import time
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import balanced_accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV

# ANÁLISIS EXPLORATORIO DE DATOS (EDA)

In [None]:
# Load the pickled dataset and wrap it in a DataFrame.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('attrition_available_4.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

df = pd.DataFrame(data)

# Row and column counts of the frame (num_rows is reused by a later cell).
num_rows, num_columns = df.shape

print("El DataFrame tiene {} filas y {} columnas".format(num_rows, num_columns))

El DataFrame tiene 4410 filas y 31 columnas


Nota: las instancias corresponden a las filas y los atributos a las columnas del DataFrame.

In [None]:
# Show the whole DataFrame: raise pandas' display limits to the frame's own size.
# The limits are derived from df.shape instead of being hard-coded, so nothing is
# silently truncated if the dataset changes.
pd.options.display.max_columns = df.shape[1]
pd.options.display.max_rows = df.shape[0]

# Parked snippet, to be moved to the next section (do not enable yet):
#
#   # Build a new dataframe with the categorical gaps filled in
#   df_new = pd.concat([df[num_cols], df[cat_cols].fillna('VACIO')], axis='columns')

df

Unnamed: 0,hrs,absences,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
1,,13.0,2.0,4.0,3.0,2.0,,31.0,Yes,Travel_Frequently,Research & Development,10.0,,Life Sciences,1,,Female,1.0,Research Scientist,Single,41890.0,0.0,Y,23,8.0,1.0,6.0,,5.0,1.0,4.0
6,6.450877,17.0,3.0,4.0,,3.0,,28.0,Yes,Travel_Rarely,Research & Development,11.0,2.0,Medical,1,7.0,Male,2.0,Sales Executive,,58130.0,2.0,Y,20,8.0,1.0,5.0,,0.0,0.0,0.0
13,8.871421,14.0,2.0,,1.0,2.0,2.0,47.0,Yes,Non-Travel,Research & Development,,,,1,14.0,Male,1.0,Research Scientist,Married,57620.0,1.0,Y,11,,2.0,10.0,4.0,10.0,9.0,9.0
28,10.713066,6.0,,3.0,4.0,3.0,1.0,44.0,Yes,Travel_Frequently,,1.0,,Medical,1,,Male,2.0,Research Scientist,Divorced,103330.0,3.0,Y,14,8.0,,19.0,2.0,1.0,0.0,
30,9.662808,11.0,,3.0,1.0,2.0,3.0,26.0,Yes,Travel_Rarely,Research & Development,,3.0,Medical,1,,Male,3.0,Research Scientist,,68540.0,2.0,Y,11,,0.0,5.0,,3.0,0.0,2.0
35,9.570924,17.0,3.0,3.0,,4.0,,26.0,Yes,Travel_Rarely,Research & Development,8.0,3.0,Medical,1,36.0,Male,,Manager,Single,157870.0,,Y,12,8.0,,8.0,5.0,8.0,7.0,4.0
38,6.177288,22.0,2.0,3.0,,3.0,3.0,18.0,Yes,Travel_Rarely,Research & Development,,,,1,39.0,Male,1.0,Sales Executive,Single,,1.0,Y,14,8.0,2.0,0.0,3.0,0.0,0.0,0.0
59,7.588037,10.0,3.0,,3.0,2.0,2.0,52.0,Yes,,Research & Development,7.0,1.0,Life Sciences,1,,Female,2.0,Research Scientist,,47740.0,2.0,Y,18,8.0,1.0,11.0,,8.0,7.0,7.0
65,10.641073,2.0,2.0,3.0,1.0,4.0,2.0,28.0,Yes,Travel_Rarely,Research & Development,9.0,4.0,,1,66.0,Male,1.0,Sales Executive,Married,56730.0,,Y,14,8.0,1.0,5.0,4.0,3.0,2.0,2.0
69,9.851604,3.0,3.0,,4.0,3.0,,39.0,Yes,Travel_Rarely,Research & Development,1.0,1.0,Medical,1,70.0,Female,1.0,Laboratory Technician,Married,42840.0,4.0,Y,17,,1.0,12.0,0.0,1.0,0.0,0.0


In [None]:
# Split the column labels by dtype: int64/float64 columns are treated as numeric,
# object columns as categorical/ordinal. Both lists are reused by later cells.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

print("Columnas numéricas: {}\nColumnas categóricas/ordinales: {}".format(num_cols, cat_cols))
print()
# The per-column dtypes are printed explicitly here; a bare `df.dtypes` expression in the
# middle of a cell is not displayed, so none is used.
print("Tipo de las columnas numéricas: {}\nTipo de las columnas categóricas/ordinales: {}".format(dict(df[num_cols].dtypes), dict(df[cat_cols].dtypes)))

Columnas numéricas: Index(['hrs', 'absences', 'JobInvolvement', 'PerformanceRating',
       'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age',
       'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeID',
       'JobLevel', 'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
       'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
Columnas categóricas/ordinales: Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus', 'Over18'],
      dtype='object')

Tipo de las columnas numéricas: {'hrs': dtype('float64'), 'absences': dtype('float64'), 'JobInvolvement': dtype('float64'), 'PerformanceRating': dtype('float64'), 'EnvironmentSatisfaction': dtype('float64'), 'JobSatisfaction': dtype('float64'), 'WorkLifeBalance': dtype('float64'), 'Age': dtype('float64'), 'D

Hay un total de 23 atributos numéricos y 8 categóricos/ordinales, encontrándose dentro de este último grupo la clase "Attrition".

In [None]:
# Report the attributes that hold a single distinct value.
# nunique() does not count NaN, so a column can qualify while still having missing entries.
for col in df.columns:
    if df[col].nunique() == 1:
        # Read the value from the non-missing entries: the first row may be NaN,
        # which would otherwise be reported as the "constant".
        constant_value = df[col].dropna().iloc[0]
        print(f"La columna '{col}' tiene valores constantes: {constant_value}")

La columna 'EmployeeCount' tiene valores constantes: 1
La columna 'Over18' tiene valores constantes: Y
La columna 'StandardHours' tiene valores constantes: 8.0


Al haber columnas con valores constantes, estas pueden eliminarse: "EmployeeCount", "Over18" y "StandardHours".

In [None]:
# Fraction of missing entries per attribute (mean of the boolean null mask).
prop_missing = df.isnull().mean()
print(prop_missing)

hrs                        0.167574
absences                   0.165306
JobInvolvement             0.185034
PerformanceRating          0.166440
EnvironmentSatisfaction    0.219048
JobSatisfaction            0.004535
WorkLifeBalance            0.209070
Age                        0.163492
Attrition                  0.000000
BusinessTravel             0.173696
Department                 0.189342
DistanceFromHome           0.211791
Education                  0.213832
EducationField             0.184580
EmployeeCount              0.000000
EmployeeID                 0.173243
Gender                     0.178005
JobLevel                   0.184580
JobRole                    0.196145
MaritalStatus              0.168254
MonthlyIncome              0.167574
NumCompaniesWorked         0.183673
Over18                     0.194331
PercentSalaryHike          0.000000
StandardHours              0.184580
StockOptionLevel           0.190023
TotalWorkingYears          0.002041
TrainingTimesLastYear      0

Dado que se busca responder a la pregunta de si el empleado abandonaría o no la empresa, se trata de un problema de clasificación binaria.


A continuación se comprueba si el conjunto de datos está desbalanceado. Para ello, primero se revisa que el número total de entradas de la columna "Attrition" (la clase), repartidas entre los valores "No" y "Yes", coincida con el número total de filas. Si es así, la clase no contiene valores nulos, algo que ya se conocía por la ejecución del bloque de código anterior, aunque se prefiere volver a comprobarlo.

Finalmente, se calcula la proporción de los valores categóricos "Yes" y "No" de la clase con respecto al número total de instancias.

In [None]:
# Tally how many rows fall in each "Attrition" class.
counts = df["Attrition"].value_counts()

print(counts)
print()

# If the per-class tallies add up to the row count, no label is missing.
total_counts = counts.sum()
class_has_nulls = total_counts != num_rows
print("La clase 'Attrition' tiene valores nulos\n" if class_has_nulls
      else "La clase 'Attrition' NO tiene valores nulos\n")

# Share of rows in each class, relative to the total number of rows.
proportions = counts / len(df)

proportions

No     3699
Yes     711
Name: Attrition, dtype: int64

La clase 'Attrition' NO tiene valores nulos



No     0.838776
Yes    0.161224
Name: Attrition, dtype: float64

Como puede observarse, sí hay desbalance en la muestra de datos.

# PARTICIÓN DE LOS DATOS

ADVERTENCIA: SOLO EJECUTA UNA VEZ O HABRÁ QUE VOLVER A CARGAR EL DATAFRAME

In [None]:
# Remove the constant attributes from the numeric / categorical column lists.
# Dropping by label (instead of by position) keeps this cell idempotent: running it
# again is a no-op rather than removing further columns.
num_cols = num_cols.drop(['EmployeeCount', 'StandardHours'], errors='ignore')
print(num_cols)
cat_cols = cat_cols.drop(['Over18'], errors='ignore')
print(cat_cols)

Index(['hrs', 'absences', 'JobInvolvement', 'PerformanceRating',
       'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age',
       'DistanceFromHome', 'Education', 'EmployeeID', 'JobLevel',
       'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')
Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus'],
      dtype='object')


In [None]:
# Build the attribute matrix (without the target and the constant columns) and the target.
# The labels are mapped to 0/1 once, before splitting, so the F1 metric can be applied.
# Mapping into a new Series avoids in-place edits on `df` or on the split slices,
# which makes the cell safe to re-run.
X = df.drop(['Attrition', 'EmployeeCount', 'Over18', 'StandardHours'], axis=1)
y = df['Attrition'].map({'No': 0, 'Yes': 1})

# Stratified hold-out split: one third of the rows is reserved for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=4, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Numeric attributes: mean imputation followed by standardisation.
imputer_num = SimpleImputer(strategy='mean')
scaler = StandardScaler()
pipeline_num = Pipeline(
    steps=[
        ("imputer", imputer_num),
        ("scaler", scaler)
    ]
)


# Categorical attributes: most-frequent imputation followed by one-hot encoding.
imputer_cat = SimpleImputer(strategy='most_frequent')
encoder = OneHotEncoder()
pipeline_cat =  Pipeline(
    steps=[
        ('imputer', imputer_cat),
        ('encoder', encoder)
    ]
)

# Route each column to the pipeline matching its type.
# cat_cols[1:] skips the first entry, 'Attrition', which is the target and not part of X.
processor = ColumnTransformer(
    transformers=[
        ("num", pipeline_num, num_cols),
        ("cat", pipeline_cat, cat_cols[1:]),
    ]
)

# Fit the preprocessing on the training partition only, then apply it to the test partition.
X_train_processed = processor.fit_transform(X_train)
X_test_processed = processor.transform(X_test)

# Processed version of the full dataset, used later to refit the final models.
# Note that this re-fits `processor` on all rows.
X_processed = processor.fit_transform(X)

# Cross-validation splitter reused by every hyperparameter search.
stratified_cv = StratifiedKFold(n_splits=5)

# Univariate filter keeping the 8 best-scoring attributes.
# NOTE(review): this name shadows the built-in `filter`; it is kept because later cells use it.
filter = SelectKBest(f_classif, k=8)

(2940, 27) (2940,)
(1470, 27) (1470,)


# CONSTRUCCIÓN DE LOS MODELOS

## LOGISTIC REGRESSION

In [None]:
# Baseline model: class-weighted logistic regression, wrapped in a one-step pipeline.
logistic_regr = Pipeline(steps=[
    ('logistic_regression', LogisticRegression(class_weight='balanced', random_state=4)),
])

# Fit on the training partition and measure the wall-clock training time.
start_time = time.time()
logistic_regr.fit(X_train_processed, y_train)
end_time = time.time()
elapsed_time_logistic_regr = end_time - start_time

# Hold-out evaluation on the test partition.
logistic_regr_pred = logistic_regr.predict(X_test_processed)
balanced_accuracy_logistic_regr = balanced_accuracy_score(y_test, logistic_regr_pred)
f1_logistic_regr = f1_score(y_test, logistic_regr_pred)
confusion_matrix_logistic_regr = confusion_matrix(y_test, logistic_regr_pred)

print("Tiempo de ejecución para entrenar el modelo de Regresión Logística: ", elapsed_time_logistic_regr)

print("Balance Accuracy de la Regresión Logística:", balanced_accuracy_logistic_regr)
print("F1 de la Regresión Logística:", f1_logistic_regr)
print("Matriz de confusión de la Regresión Logística:", confusion_matrix_logistic_regr)

# Coefficients of the fitted model, passed back through the steps preceding the classifier
# (there are none in this pipeline).
print(logistic_regr[:-1].inverse_transform(logistic_regr[-1].coef_))

Tiempo de ejecución para entrenar el modelo de Regresión Logística:  0.05402541160583496
Balance Accuracy de la Regresión Logística: 0.7052008582545402
F1 de la Regresión Logística: 0.4346701164294955
Matriz de confusión de la Regresión Logística: [[865 368]
 [ 69 168]]
[[ 0.52593914 -0.05146453 -0.00702557  0.03822581 -0.30342705 -0.38294074
  -0.20352201 -0.20450767  0.02413791 -0.10751567 -0.02863041 -0.12811585
  -0.0406491   0.30565543  0.05508735  0.04012774 -0.57866429 -0.21849372
   0.08943416  0.3454858  -0.33940676 -0.71694924  0.64523934  0.07229105
   0.67202817 -0.34952945 -0.32191756  0.59734189  0.08178099 -0.15171747
   0.11856393 -0.13584808 -0.50954009  0.03307736 -0.0324962   0.07437925
  -0.13139002  0.03866674 -0.0679876  -0.67176494  0.51059289  0.25207027
   0.1337819  -0.13776731 -0.46192154 -0.16743817  0.62994087]]


In [None]:
# Final model: re-fit the preprocessing on the whole dataset, then the classifier on all rows.
X_processed = processor.fit_transform(X)
logistic_regr.fit(X=X_processed, y=y)

## BOOSTING

De entre los métodos de Boosting ofrecidos en scikit-learn se utilizará AdaBoostClassifier.

In [None]:
# AdaBoost (AdaBoost.M1) built on decision-tree base learners.
boosting_mandatory = AdaBoostClassifier(estimator=DecisionTreeClassifier())

# Search space, part 1: ensemble-level settings (the seed is pinned via a one-value list).
boosting_mandatory_params = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'random_state': [4]
}

# Search space, part 2: settings of the base decision tree.
tree_params = {
    'estimator__max_depth': range(2, 16, 2),
    'estimator__min_samples_split': range(2, 8, 2),
    'estimator__class_weight': ['balanced']
}

# Randomized search over the merged space, scored with F1 on the stratified folds.
boosting_mandatory_rs = RandomizedSearchCV(
    estimator=boosting_mandatory,
    param_distributions={**tree_params, **boosting_mandatory_params},
    n_iter=10,
    scoring='f1',
    cv=stratified_cv,
    n_jobs=-1,
    random_state=4,
)

# Run the search on the training partition; the timing covers the whole search.
start_time = time.time()
boosting_mandatory_rs.fit(X_train_processed, y_train)
boosting_mandatory_model = boosting_mandatory_rs.best_estimator_
end_time = time.time()
elapsed_time_boosting_mandatory = end_time - start_time

# Hold-out predictions of the best estimator found.
boosting_mandatory_pred = boosting_mandatory_model.predict(X_test_processed)

print(f'El mejor modelo encontrado es: {boosting_mandatory_rs.best_params_}')
print()
print("Tiempo de ejecución para entrenar el modelo Adaboost: ", elapsed_time_boosting_mandatory)

# Hold-out metrics.
balanced_accuracy_boosting_mandatory = balanced_accuracy_score(y_test, boosting_mandatory_pred)
f1_boosting_mandatory = f1_score(y_test, boosting_mandatory_pred)
confusion_matrix_boosting_mandatory = confusion_matrix(y_test, boosting_mandatory_pred)

print("Balance Accuracy del AdaBoost:", balanced_accuracy_boosting_mandatory)
print("F1 del AdaBoost:", f1_boosting_mandatory)
print("Matriz de confusión del AdaBoost:", confusion_matrix_boosting_mandatory)

El mejor modelo encontrado es: {'random_state': 4, 'n_estimators': 150, 'learning_rate': 0.1, 'estimator__min_samples_split': 4, 'estimator__max_depth': 6, 'estimator__class_weight': 'balanced'}

Tiempo de ejecución para entrenar el modelo Adaboost:  109.68558120727539
Balance Accuracy del AdaBoost: 0.7708583572022545
F1 del AdaBoost: 0.6751269035532995
Matriz de confusión del AdaBoost: [[1209   24]
 [ 104  133]]


In [None]:
# Refit the tuned AdaBoost model on the full processed dataset and time the fit.
start_time = time.time()
boosting_mandatory_model.fit(X_processed, y)
end_time = time.time()

elapsed_time_boosting_mandatory_model = end_time - start_time
print("Tiempo de ejecución para entrenar el modelo Adaboost: ", elapsed_time_boosting_mandatory_model)

Tiempo de ejecución para entrenar el modelo Adaboost:  5.595022678375244


## XGBOOST (MODELO ADICIONAL)

In [None]:
# Additional model: XGBoost classifier with a binary logistic objective.
xgboost = xgb.XGBClassifier(objective="binary:logistic", random_state=4)

# Hyperparameter search space.
xgboost_params = {
    "colsample_bytree": [0.3, 0.5, 0.7],
    "gamma": [0.1, 0.25, 0.45],
    "max_depth": [2, 3, 4, 5, 6],
    "n_estimators": [50, 100, 150],
    "subsample": [0.6, 0.5, 0.4],
}

# Randomized search scored with F1 on the stratified folds.
xgboost_rs = RandomizedSearchCV(
    estimator=xgboost,
    param_distributions=xgboost_params,
    n_iter=10,
    scoring='f1',
    cv=stratified_cv,
    n_jobs=-1,
    random_state=4,
)

# Run the search on the training partition; the timing covers the whole search.
start_time = time.time()
xgboost_rs.fit(X_train_processed, y_train)
xgboost_model = xgboost_rs.best_estimator_
end_time = time.time()
elapsed_time_xgboost = end_time - start_time

# Hold-out predictions of the best estimator found.
xgboost_pred = xgboost_model.predict(X_test_processed)

print(f'El mejor modelo encontrado es: {xgboost_rs.best_params_}')
print()
print("Tiempo de ejecución para entrenar el modelo xgb: ", elapsed_time_xgboost)

# Hold-out metrics.
balanced_accuracy_xgboost = balanced_accuracy_score(y_test, xgboost_pred)
f1_xgboost = f1_score(y_test, xgboost_pred)
confusion_matrix_xgboost = confusion_matrix(y_test, xgboost_pred)

print("Balance Accuracy del xgboost:", balanced_accuracy_xgboost)
print("F1 del xgboost:", f1_xgboost)
print("Matriz de confusión del xgboost:", confusion_matrix_xgboost)

El mejor modelo encontrado es: {'subsample': 0.5, 'n_estimators': 150, 'max_depth': 6, 'gamma': 0.45, 'colsample_bytree': 0.3}

Tiempo de ejecución para entrenar el modelo xgb:  22.08345603942871
Balance Accuracy del xgboost: 0.6944213454885172
F1 del xgboost: 0.5439093484419263
Matriz de confusión del xgboost: [[1213   20]
 [ 141   96]]


In [None]:
# Refit the tuned XGBoost model on the full processed dataset and time the fit.
start_time = time.time()
xgboost_model.fit(X_processed, y)
end_time = time.time()
elapsed_time_xgboost_model = end_time - start_time

# The message names the model actually timed here (it previously said "Adaboost").
print("Tiempo de ejecución para entrenar el modelo xgb: ", elapsed_time_xgboost_model)

Tiempo de ejecución para entrenar el modelo Adaboost:  0.8963310718536377


# ¿SE PUEDEN MEJORAR LOS DATOS?

In [None]:
# Summary table with the hold-out metrics and training/search times of the three models.
# The XGBoost entries use the names assigned in the XGBoost cell (`*_xgboost`), so the
# cell runs on a fresh kernel and reports that cell's values.
results = pd.DataFrame({'Model': ['Logistic Regression', 'Adaboost_M1', 'xgboost'],
                        'Balanced Accuracy': [balanced_accuracy_logistic_regr, balanced_accuracy_boosting_mandatory, balanced_accuracy_xgboost],
                        'F1': [f1_logistic_regr, f1_boosting_mandatory, f1_xgboost],
                        'Confusion Matrix': [confusion_matrix_logistic_regr, confusion_matrix_boosting_mandatory, confusion_matrix_xgboost],
                        'Time': [elapsed_time_logistic_regr, elapsed_time_boosting_mandatory, elapsed_time_xgboost]})

# Best estimator and best hyperparameters of each tuned model (those that went through HPO).
best_info_model = pd.DataFrame({'Model': ['Adaboost_M1', 'xgboost'],
                               'best-estimator': [boosting_mandatory_rs.best_estimator_, xgboost_rs.best_estimator_],
                               'best-params': [boosting_mandatory_rs.best_params_, xgboost_rs.best_params_]})

# Show the metrics table and the HPO details.
display(results)
display(best_info_model)

Unnamed: 0,Model,Balanced Accuracy,F1,Confusion Matrix,Time
0,Logistic Regression,0.705201,0.43467,"[[865, 368], [69, 168]]",0.054025
1,Adaboost_M1,0.770858,0.675127,"[[1209, 24], [104, 133]]",109.685581
2,xgboost,0.694421,0.543909,"[[1213, 20], [141, 96]]",14.231444


Unnamed: 0,Model,best-estimator,best-params
0,Adaboost_M1,(DecisionTreeClassifier(class_weight='balanced...,"{'random_state': 4, 'n_estimators': 150, 'lear..."
1,xgboost,"XGBClassifier(base_score=None, booster=None, c...","{'subsample': 0.5, 'n_estimators': 150, 'max_d..."


In [None]:
# Logistic regression preceded by the univariate feature filter.
logistic_regr_filter = Pipeline(steps=[
    ('filter', filter),
    ('logistic_regression', LogisticRegression(class_weight='balanced', random_state=4)),
])

# Fit on the training partition and measure the wall-clock training time.
start_time = time.time()
logistic_regr_filter.fit(X_train_processed, y_train)
end_time = time.time()
elapsed_time_logistic_regr_filter = end_time - start_time

# Hold-out evaluation on the test partition.
logistic_regr_filter_pred = logistic_regr_filter.predict(X_test_processed)
balanced_accuracy_logistic_regr_filter = balanced_accuracy_score(y_test, logistic_regr_filter_pred)
f1_logistic_regr_filter = f1_score(y_test, logistic_regr_filter_pred)
confusion_matrix_logistic_regr_filter = confusion_matrix(y_test, logistic_regr_filter_pred)

print("Tiempo de ejecución para entrenar el modelo de Regresión Logística (con filtro): ", elapsed_time_logistic_regr_filter)

print("Balance Accuracy de la Regresión Logística (con filtro):", balanced_accuracy_logistic_regr_filter)
print("F1 de la Regresión Logística (con filtro):", f1_logistic_regr_filter)
print("Matriz de confusión de la Regresión Logística (con filtro):", confusion_matrix_logistic_regr_filter)

# Coefficients mapped back through the filter step to the full processed feature space.
print(logistic_regr_filter[:-1].inverse_transform(logistic_regr_filter[-1].coef_))

Tiempo de ejecución para entrenar el modelo de Regresión Logística:  0.04040384292602539
Balance Accuracy de la Regresión Logística: 0.6699929163201823
F1 de la Regresión Logística: 0.39552238805970147
Matriz de confusión de la Regresión Logística: [[825 408]
 [ 78 159]]
[[ 0.50309663  0.          0.          0.          0.         -0.3326891
   0.         -0.13117681  0.          0.          0.          0.
   0.          0.          0.          0.         -0.37619784  0.
   0.10931322  0.         -0.28660403  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.27454903  1.01202003]]


In [None]:
# AdaBoost (AdaBoost.M1) preceded by the univariate feature filter.
boosting_mandatory_filter = Pipeline([('filter', filter),
                                      ('Adaboost', AdaBoostClassifier())])

# Search space, part 1: ensemble-level settings. The base estimator is supplied through
# the search space, and the seed is pinned via a one-value list.
boosting_mandatory_filter_params = {
    'Adaboost__estimator': [DecisionTreeClassifier()],
    'Adaboost__n_estimators': [50, 100, 150, 200],
    'Adaboost__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'Adaboost__random_state': [4]
}

# Search space, part 2: settings of the base decision tree.
tree_filter_params = {
    'Adaboost__estimator__max_depth': range(2, 16, 2),
    'Adaboost__estimator__min_samples_split': range(2, 8, 2),
    'Adaboost__estimator__class_weight': ['balanced']
}

# Randomized search over the merged space, scored with F1 on the stratified folds.
boosting_mandatory_filter_rs = RandomizedSearchCV(estimator=boosting_mandatory_filter,
                                           param_distributions={**tree_filter_params, **boosting_mandatory_filter_params},
                                           n_iter=10,
                                           cv=stratified_cv,
                                           n_jobs=-1,
                                           random_state=4,
                                           scoring='f1')

# Run the search on the training partition; the timing covers the whole search.
start_time = time.time()
boosting_mandatory_filter_rs.fit(X_train_processed, y_train)
boosting_mandatory_filter_model = boosting_mandatory_filter_rs.best_estimator_
end_time = time.time()

elapsed_time_boosting_mandatory_filter = end_time - start_time
boosting_mandatory_filter_pred = boosting_mandatory_filter_model.predict(X_test_processed)

# Evaluation
print(f'El mejor modelo (con filtro) encontrado es: {boosting_mandatory_filter_rs.best_params_}')
print()
print("Tiempo de ejecución para entrenar el modelo Adaboost (con filtro): ", elapsed_time_boosting_mandatory_filter)

# All three metrics are computed from the filtered model's predictions. The balanced
# accuracy previously used the unfiltered model's predictions by mistake.
balanced_accuracy_boosting_mandatory_filter = balanced_accuracy_score(y_test, boosting_mandatory_filter_pred)
f1_boosting_mandatory_filter = f1_score(y_test, boosting_mandatory_filter_pred)
confusion_matrix_boosting_mandatory_filter = confusion_matrix(y_test, boosting_mandatory_filter_pred)

print("Balance Accuracy del AdaBoost (con filtro):", balanced_accuracy_boosting_mandatory_filter)
print("F1 del AdaBoost (con filtro):", f1_boosting_mandatory_filter)
print("Matriz de confusión del AdaBoost (con filtro):", confusion_matrix_boosting_mandatory_filter)

El mejor modelo (con filtro) encontrado es: {'Adaboost__random_state': 4, 'Adaboost__n_estimators': 150, 'Adaboost__learning_rate': 0.1, 'Adaboost__estimator__min_samples_split': 4, 'Adaboost__estimator__max_depth': 6, 'Adaboost__estimator__class_weight': 'balanced', 'Adaboost__estimator': DecisionTreeClassifier(class_weight='balanced', max_depth=6,
                       min_samples_split=4)}

Tiempo de ejecución para entrenar el modelo Adaboost (con filtro):  43.548912525177
Balance Accuracy del AdaBoost (con filtro): 0.7708583572022545
F1 del AdaBoost (con filtro): 0.6320987654320988
Matriz de confusión del AdaBoost (con filtro): [[1193   40]
 [ 109  128]]


In [None]:
# XGBoost classifier preceded by the univariate feature filter.
xgboost_filter = Pipeline(steps=[
    ('filter', filter),
    ('xgboost', xgb.XGBClassifier(objective="binary:logistic", random_state=4)),
])

# Hyperparameter search space (keys are prefixed with the pipeline step name).
xgboost_filter_params = {
    "xgboost__colsample_bytree": [0.3, 0.5, 0.7],
    "xgboost__gamma": [0.1, 0.25, 0.45],
    "xgboost__max_depth": [2, 3, 4, 5, 6],
    "xgboost__n_estimators": [50, 100, 150],
    "xgboost__subsample": [0.6, 0.5, 0.4],
}

# Randomized search scored with F1 on the stratified folds.
xgboost_filter_rs = RandomizedSearchCV(
    estimator=xgboost_filter,
    param_distributions=xgboost_filter_params,
    n_iter=10,
    scoring='f1',
    cv=stratified_cv,
    n_jobs=-1,
    random_state=4,
)

# Run the search on the training partition; the timing covers the whole search.
start_time = time.time()
xgboost_filter_rs.fit(X_train_processed, y_train)
xgboost_filter_model = xgboost_filter_rs.best_estimator_
end_time = time.time()
elapsed_time_xgboost_filter = end_time - start_time

# Hold-out predictions of the best estimator found.
xgboost_filter_pred = xgboost_filter_model.predict(X_test_processed)

print(f'El mejor modelo (con filtro) encontrado es: {xgboost_filter_rs.best_params_}')
print()
print("Tiempo de ejecución para entrenar el modelo xgb (con filtro): ", elapsed_time_xgboost_filter)

# Hold-out metrics.
balanced_accuracy_xgboost_filter = balanced_accuracy_score(y_test, xgboost_filter_pred)
f1_xgboost_filter = f1_score(y_test, xgboost_filter_pred)
confusion_matrix_xgboost_filter = confusion_matrix(y_test, xgboost_filter_pred)

print("Balance Accuracy del xgboost (con filtro):", balanced_accuracy_xgboost_filter)
print("F1 del xgboost (con filtro):", f1_xgboost_filter)
print("Matriz de confusión del xgboost (con filtro):", confusion_matrix_xgboost_filter)

El mejor modelo (con filtro) encontrado es: {'xgboost__subsample': 0.5, 'xgboost__n_estimators': 150, 'xgboost__max_depth': 6, 'xgboost__gamma': 0.45, 'xgboost__colsample_bytree': 0.3}

Tiempo de ejecución para entrenar el modelo xgb (con filtro):  6.606186389923096
Balance Accuracy del xgboost (con filtro): 0.6454977568347244
F1 del xgboost (con filtro): 0.43093922651933697
Matriz de confusión del xgboost (con filtro): [[1186   47]
 [ 159   78]]


In [None]:
# Score every processed feature against the class labels with f_classif;
# the displayed result is the pair of arrays it returns.
feature_selection = f_classif(X=X_processed, y=y)
feature_selection

(array([1.37013704e+02, 6.43606053e+00, 1.01549810e+00, 2.87440460e+00,
        3.62044772e+01, 4.70280292e+01, 9.68441478e+00, 9.86599724e+01,
        4.56915835e-01, 2.53885631e+00, 4.36344193e-01, 4.99253185e+00,
        4.82347516e+00, 4.67023661e+00, 4.61903133e-01, 1.17433521e+01,
        6.82348877e+01, 6.29826081e+00, 7.78362450e+01, 1.73812586e+01,
        4.39206240e+01, 8.05175692e+00, 3.11332628e+01, 1.96551469e+00,
        8.52754871e-01, 2.32212143e+01, 2.87240150e-01, 3.24807922e-03,
        6.46017037e-02, 1.11029084e+00, 7.21132319e+00, 9.00966784e-01,
        9.00966784e-01, 1.17191709e-01, 7.23971543e-01, 5.63930537e-03,
        1.17003759e+00, 6.41244989e+00, 5.61895901e+00, 2.92962868e+00,
        1.79565467e-01, 3.52022058e-01, 2.67047822e+01, 2.50433088e+01,
        1.05021435e+02]),
 array([3.45103032e-31, 1.12168336e-02, 3.13644569e-01, 9.00688783e-02,
        1.92070306e-09, 7.96782878e-12, 1.87026772e-03, 5.22218087e-23,
        4.99104362e-01, 1.11146914e-01

# Nueva sección