# Modelo Gradient Boosting

## Balanceo de datos

In [6]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

# CARGAMOS EL DATASET DE ENTRENAMIENTO
df_train = pd.read_csv(r"mitbih_train.csv")

df_train.columns = [f'Feature {i + 1}' for i in range(len(df_train.columns) - 1)] + ['Target']

# SEPARAMOS LOS DATOS PREDICTORES (X) Y OBJETIVO (y)
X = df_train.iloc[:, :-1]  # Todas las columnas excepto la última (predictores)
y = df_train.iloc[:, -1]   # La última columna (objetivo)

# VALIDAMOS LOS VALORES FALTANTES
if df_train.isnull().sum().sum() > 0:
    print("Existen valores faltantes en el dataset.")
else:
    print("No se encontraron valores faltantes en el dataset.")

# CONTAMOS LOS VALORES DE CADA CLASE
class_counts = df_train['Target'].value_counts()
print("Distribución de clases antes del submuestreo:")
print(class_counts)

# APLICAMOS RANDOM UNDER SAMPLER PARA BALANCEAR LOS DATOS
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# VERIFICAMOS LA NUEVA DISTRIBUCIÓN DE CLASES
print("Distribución de clases después del submuestreo:")
print(pd.Series(y_resampled).value_counts())

No se encontraron valores faltantes en el dataset.
Distribución de clases antes del submuestreo:
Target
0.0    72470
4.0     6431
2.0     5788
1.0     2223
3.0      641
Name: count, dtype: int64
Distribución de clases después del submuestreo:
Target
0.0    641
1.0    641
2.0    641
3.0    641
4.0    641
Name: count, dtype: int64


In [4]:
import warnings as wr
wr.filterwarnings('ignore')

In [5]:
df_train.head(3)

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 179,Feature 180,Feature 181,Feature 182,Feature 183,Feature 184,Feature 185,Feature 186,Feature 187,Target
0,0.960114,0.863248,0.461538,0.196581,0.094017,0.125356,0.099715,0.088319,0.074074,0.082621,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.659459,0.186486,0.07027,0.07027,0.059459,0.056757,0.043243,0.054054,0.045946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.925414,0.665746,0.541436,0.276243,0.196133,0.077348,0.071823,0.060773,0.066298,0.058011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
print('train set classes: ', df_train.iloc[:, -1].unique())

train set classes:  [0. 1. 2. 3. 4.]


## Modelo

In [13]:
# Se conservaron los nombres del ejercicio 1 para poder reutilizar los códigos disminuyendo el número de modificcaciones
X_resampled = X.to_numpy()
y_resampled = y.to_numpy()

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

#------------------------------------------------------------------------------------------------------------------
# Parámetros del clasificador Gradient Boosting
#------------------------------------------------------------------------------------------------------------------

print("----- Gradient Boosting Classifier - Hyperparameter tuning -----")

n_estimators_range = np.arange(50, 201, 10)  # Rango para el número de estimadores
learning_rate_range = np.linspace(0.01, 0.2, 10)  # Rango para la tasa de aprendizaje
best_acc = 0
best_params = {}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Búsqueda de los mejores hiperparámetros
for n_estimators in n_estimators_range:
    for learning_rate in learning_rate_range:
        print(f'---- n_estimators = {n_estimators}, learning_rate = {learning_rate:.3f}')
        
        acc_cv = []
        
        for train_index, test_index in kf.split(X_resampled, y_resampled):
            # Fase de entrenamiento
            x_train = X_resampled[train_index, :]
            y_train = y_resampled[train_index]
            
            clf_cv = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=42)
            clf_cv.fit(x_train, y_train)
            
            # Fase de prueba
            x_test = X_resampled[test_index, :]
            y_test = y_resampled[test_index]
            y_pred = clf_cv.predict(x_test)
            
            acc_i = accuracy_score(y_test, y_pred)
            acc_cv.append(acc_i)
        
        # Promedio de las precisiones en las 5 particiones
        acc_hyp = np.mean(acc_cv)
        
        # Guardamos los mejores hiperparámetros
        if acc_hyp > best_acc:
            best_acc = acc_hyp
            best_params = {'n_estimators': n_estimators, 'learning_rate': learning_rate}
        
        print('ACC:', acc_hyp)

print("Mejores hiperparámetros:")
print("n_estimators:", best_params['n_estimators'])
print("learning_rate:", best_params['learning_rate'])
print("Mejor precisión:", best_acc)

# Entrenamos el modelo final con los mejores hiperparámetros
clf_gb = GradientBoostingClassifier(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], random_state=42)
clf_gb.fit(X_resampled, y_resampled)


----- Gradient Boosting Classifier - Hyperparameter tuning -----
---- n_estimators = 50, learning_rate = 0.010
ACC: 0.8807236479339853
---- n_estimators = 50, learning_rate = 0.031
ACC: 0.934005669146009
---- n_estimators = 50, learning_rate = 0.052
ACC: 0.9477002348299564
---- n_estimators = 50, learning_rate = 0.073
ACC: 0.9543590604597043
---- n_estimators = 50, learning_rate = 0.094
ACC: 0.9574314860780501
---- n_estimators = 50, learning_rate = 0.116


## función de carlos

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    
    pipeline = ImbPipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

    f1_scorer = make_scorer(f1_score, average='macro')
    # Validación cruzada en el conjunto de entrenamiento
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=f1_scorer)
    print(f'F1 Score: {scores.mean():.4f}')
    


    # Ajuste del modelo
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='viridis')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

In [None]:
print('-------- Evaluando GB --------')
evaluate_model(GradientBoostingClassifier(), X_train, y_train, X_test, y_test)