In [1]:
# Importar librerías
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Plot configuration: apply seaborn's "whitegrid" style as the theme for the notebook's figures.
sns.set_theme(style="whitegrid")

# Cargar Datos Procesados

In [3]:
# Load the preprocessed splits from disk: feature matrices first, then the
# matching label frames (each CSV is read into its own DataFrame).
X_train, X_test = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_test.csv'),
)
y_train, y_test = (
    pd.read_csv('../data/processed/y_train.csv'),
    pd.read_csv('../data/processed/y_test.csv'),
)

In [4]:
# Sanity check: print the shape of each features/labels pair so that a
# mismatch in row counts between X and y is visible immediately.
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print("Tamaño de los datos de entrenamiento:", *train_shapes)
print("Tamaño de los datos de prueba:", *test_shapes)

Tamaño de los datos de entrenamiento: (454902, 8) (454902, 1)
Tamaño de los datos de prueba: (56962, 8) (56962, 1)


# Entrenar y Evaluar Diferentes Modelos

## Definir los modelos

In [5]:
# Candidate models and their hyperparameter grids.
# Each entry maps a display name to:
#   "model":  an unfitted estimator created with a fixed random_state,
#   "params": the grid later handed to GridSearchCV for that estimator.
models_with_params = {
    "Logistic Regression": {
        "model": LogisticRegression(random_state=42, max_iter=1000),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "penalty": ["l1", "l2"],
            # liblinear is kept as the only solver because the grid mixes l1 and l2 penalties.
            "solver": ["liblinear"]
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 10]
        }
    },
    # NOTE(review): scikit-learn documents SVC training time as growing at least
    # quadratically with the number of samples, and probability=True adds an
    # internal cross-validation on top of that. With the ~455k training rows this
    # notebook loads, this candidate may be impractically slow -- consider
    # subsampling or a linear SVM if the training cell does not finish.
    "Support Vector Machine": {
        "model": SVC(random_state=42, probability=True),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"]
        }
    },
    "XGBoost": {
        # use_label_encoder was dropped: it is deprecated in XGBoost and only
        # triggers a warning in current releases.
        "model": XGBClassifier(random_state=42, eval_metric='logloss'),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [3, 5, 10],
            "learning_rate": [0.01, 0.1, 0.2]
        }
    }
}

## Entrenar y evaluar cada modelo

In [None]:
# Train every candidate with its base configuration, score it on the test
# split, and keep the one with the highest F1.
results = []
best_model = None
best_model_name = None
# Start below any attainable F1 so that a model is always selected, even when
# every candidate scores exactly 0 (starting at 0 with a strict ">" would leave
# best_model / best_model_name as None).
best_score = float("-inf")

# The label frames have a single column; flatten them once so that fitting and
# every metric call receive the same 1-D arrays.
y_train_values = y_train.values.ravel()
y_test_values = y_test.values.ravel()

# NOTE(review): candidates are ranked by their F1 on the test split, so any test
# metric reported afterwards for the winner is optimistically biased. Consider
# ranking on a validation split or with cross-validation instead.
for model_name, model_details in models_with_params.items():
    print(f"\nEntrenando y evaluando: {model_name}")
    model = model_details["model"]
    model.fit(X_train, y_train_values)
    y_pred = model.predict(X_test)
    # Positive-class probabilities are only available on estimators exposing predict_proba.
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Compute metrics (ROC-AUC is left as None when no probabilities are available)
    f1 = f1_score(y_test_values, y_pred)
    roc_auc = roc_auc_score(y_test_values, y_prob) if y_prob is not None else None

    # Store one summary row per model
    results.append({
        "Modelo": model_name,
        "F1-Score": f1,
        "ROC-AUC": roc_auc
    })

    # Track the best model seen so far; ties keep the earlier candidate
    if f1 > best_score:
        best_model = model
        best_model_name = model_name
        best_score = f1

# Show the comparison table, best F1 first
results_df = pd.DataFrame(results).sort_values(by="F1-Score", ascending=False)
print("\nResultados finales:")
display(results_df)

print(f"\nEl mejor modelo es: {best_model_name}")


Entrenando y evaluando: Logistic Regression

Entrenando y evaluando: Decision Tree


## Ajuste de Hiperparámetros para el Mejor Modelo

In [None]:
# Tune the winning model: exhaustive grid search over its parameter grid,
# scored by F1 with 3-fold cross-validation on the training data. The refit
# best estimator then replaces best_model. Nothing happens when
# best_model_name is not a key of models_with_params.
if best_model_name in models_with_params:
    print("\nAjuste de hiperparámetros para el mejor modelo...")
    best_entry = models_with_params[best_model_name]
    best_model_params = best_entry["params"]

    grid_search = GridSearchCV(
        estimator=best_entry["model"],
        param_grid=best_model_params,
        scoring="f1",
        cv=3,
        verbose=2,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train.values.ravel())
    best_model = grid_search.best_estimator_

    # Header and parameter dict are printed on separate lines
    print("\nMejores parámetros encontrados:", grid_search.best_params_, sep="\n")

    # Evaluate the tuned estimator on the test split
    y_pred = best_model.predict(X_test)
    tuned_report = classification_report(y_test, y_pred)
    print("\nReporte de clasificación con el mejor modelo ajustado:")
    print(tuned_report)

# Guardar el Mejor Modelo

In [None]:
# Persist the selected (and, if tuning ran, tuned) model to disk.
# Refuse to write a pickle that contains no model: best_model is still None
# when no candidate was selected by the training cell.
if best_model is None:
    raise RuntimeError("No hay ningún modelo entrenado para guardar.")

model_path = "../models/best_model.pkl"
# joblib.dump does not create missing directories, so make sure the target folder exists.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print(f"\nEl mejor modelo ha sido guardado en '{model_path}'.")