In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay


# Función para mostrar los resultados de un modelo
def evaluate_model(model, x_test, y_test, model_name=""):
    # Predicciones
    y_preds = model.predict(x_test)

    # Reporte de Clasificación
    print(f"Reporte de Clasificación - {model_name}")
    print(classification_report(y_test, y_preds))

    # Matriz de Confusión
    cm = confusion_matrix(y_test, y_preds)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(f"Matriz de Confusión - {model_name}")
    plt.show()


# Función para entrenar y evaluar un modelo con características balanceadas
def train_and_evaluate_model(x_train, x_test, y_train, y_test, top_features, scale):
    # Modelos
    models = {
        "XGBoost con Balanceo": XGBClassifier(
            random_state=1, learning_rate=0.01, scale_pos_weight=scale
        ),
        "XGBoost sin Balanceo": XGBClassifier(random_state=1, learning_rate=0.01),
        "Logistic Regression con Balanceo": LogisticRegression(
            class_weight={1: scale, 0: 1}
        ),
        "Logistic Regression sin Balanceo": LogisticRegression(),
    }

    # Entrenamiento y evaluación
    for model_name, model in models.items():
        print(f"\nEntrenando {model_name}...")
        model.fit(x_train[top_features], y_train)
        evaluate_model(model, x_test[top_features], y_test, model_name)


# Función principal para preparar los datos y evaluar los modelos
def main():
    # Cargar datos
    data = pd.read_csv("../data/data.csv")

    # Selección de características importantes (simuladas en este caso)
    top_10_features = [
        "OPERA_Latin American Wings",
        "MES_7",
        "MES_10",
        "OPERA_Grupo LATAM",
        "MES_12",
        "TIPOVUELO_I",
        "MES_4",
        "MES_11",
        "OPERA_Sky Airline",
        "OPERA_Copa Air",
    ]

    # Generar features (simulado aquí con one-hot encoding)
    features = pd.concat(
        [
            pd.get_dummies(data["OPERA"], prefix="OPERA"),
            pd.get_dummies(data["TIPOVUELO"], prefix="TIPOVUELO"),
            pd.get_dummies(data["MES"], prefix="MES"),
        ],
        axis=1,
    )

    # Target
    target = data["delay"]

    # Dividir los datos
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.33, random_state=42
    )

    # Balanceo de clases
    n_y0 = len(y_train[y_train == 0])
    n_y1 = len(y_train[y_train == 1])
    scale = n_y0 / n_y1

    # Entrenar y evaluar modelos
    train_and_evaluate_model(x_train, x_test, y_train, y_test, top_10_features, scale)


if __name__ == "__main__":
    main()

  main()


KeyError: 'delay'