# Librerías

In [54]:
# Librerías
import copy
import json
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# Configuración inicial

In [55]:
# Initial configuration: every location is derived from the kernel's current
# working directory; data and artifacts live one level above it.
project_path = os.path.abspath(os.getcwd())
_parent_dir = os.path.join(project_path, "..")  # intentionally left un-normalized
data_processed_path = os.path.join(_parent_dir, "data", "processed")
artifacts_path = os.path.join(_parent_dir, "artifacts")
# Files read from / written to the artifacts folder
pipeline_path = os.path.join(artifacts_path, "base_pipeline.pkl")
final_pipeline_path = os.path.join(artifacts_path, "best_model_pipeline.pkl")
results_file_path = os.path.join(artifacts_path, "model_results.json")

In [56]:
# Create the artifacts folder if it does not exist yet; exist_ok=True makes
# re-running this cell a no-op instead of an error.
os.makedirs(artifacts_path, exist_ok=True)

In [57]:
def load_data():
    """Load the processed train/test splits from ``data_processed_path``.

    Reads ``X_train.csv``, ``y_train.csv``, ``X_test.csv`` and ``y_test.csv``
    with ``pandas.read_csv``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. Each element is whatever
        ``pandas.read_csv`` returns for the file (a DataFrame); the targets are
        not flattened here.

    Raises:
        FileNotFoundError: If any of the four files is missing. The original
            error is attached as the direct cause of the raised exception.
    """
    try:
        print("\nCargando datos procesados...")
        X_train = pd.read_csv(os.path.join(data_processed_path, "X_train.csv"))
        y_train = pd.read_csv(os.path.join(data_processed_path, "y_train.csv"))
        X_test = pd.read_csv(os.path.join(data_processed_path, "X_test.csv"))
        y_test = pd.read_csv(os.path.join(data_processed_path, "y_test.csv"))
        return X_train, y_train, X_test, y_test
    except FileNotFoundError as e:
        # Chain explicitly so the traceback reports the original error as the
        # direct cause rather than as an unrelated error raised during handling.
        raise FileNotFoundError(f"Error al cargar los datos procesados: {e}") from e

In [58]:
# Validate the pipeline's column-dropping steps against the available columns
def validate_pipeline_columns(pipeline, X):
    """Return a new Pipeline whose ``features_to_drop`` steps only name columns of ``X``.

    Any step exposing a ``features_to_drop`` attribute is treated as a
    column-dropping step (the concrete transformer class is not visible here).
    For such a step the requested names are filtered down to those present in
    ``X.columns``; a step left with no valid name is removed and a message is
    printed. Steps without that attribute are passed through untouched.

    The steps of the incoming ``pipeline`` are never modified: a step whose
    list has to be narrowed is shallow-copied first, so the caller's pipeline
    keeps its original configuration.

    Args:
        pipeline: Object with a ``steps`` list of ``(name, step)`` pairs.
        X: Object with a ``columns`` attribute (e.g. a pandas DataFrame).

    Returns:
        Pipeline: A new pipeline built from the retained steps, in order.
    """
    updated_steps = []
    for name, step in pipeline.steps:
        if not hasattr(step, "features_to_drop"):
            updated_steps.append((name, step))
            continue

        requested = step.features_to_drop
        # A bare string is a single column name; iterating it directly would
        # test each character against the columns instead.
        if isinstance(requested, str):
            requested = [requested]

        valid_features = [col for col in requested if col in X.columns]
        if not valid_features:
            print(f"El paso '{name}' fue eliminado porque no tiene columnas válidas.")
            continue

        # Work on a shallow copy so the caller's step object is left as it was.
        adjusted_step = copy.copy(step)
        adjusted_step.features_to_drop = valid_features
        updated_steps.append((name, adjusted_step))
    return Pipeline(updated_steps)

# Modelos e hiperparámetros

In [59]:
def define_models():
    """Build the candidate classifiers, keyed by their display name.

    Returns:
        dict: Display name -> unfitted estimator, in insertion order
        (Logistic Regression, Decision Tree, Random Forest, SVC, XGBoost).
        Every estimator is created with ``random_state=42``.
    """
    seed = 42
    candidates = {}
    candidates["Logistic Regression"] = LogisticRegression(random_state=seed, max_iter=1000)
    candidates["Decision Tree"] = DecisionTreeClassifier(random_state=seed)
    candidates["Random Forest"] = RandomForestClassifier(random_state=seed)
    candidates["SVC"] = SVC(random_state=seed, probability=True)
    candidates["XGBoost"] = XGBClassifier(random_state=seed, eval_metric="logloss")
    return candidates

In [60]:
# Hyperparameter search space for each candidate model
def get_hyperparameter_grid(model_name):
    """Look up the hyperparameter grid registered for a model.

    Args:
        model_name (str): Display name of the model, e.g. ``"SVC"``.

    Returns:
        dict: Parameter name -> list of candidate values. An empty dict is
        returned when no grid is registered under ``model_name``.
    """
    grids_by_model = {
        "Logistic Regression": {"C": [0.01, 0.1, 1, 10]},
        "Decision Tree": {"max_depth": [5, 10, 20, None]},
        "Random Forest": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
        },
        "SVC": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
        },
        "XGBoost": {
            "n_estimators": [50, 100, 200],
            "max_depth": [3, 5, 7],
        },
    }
    try:
        return grids_by_model[model_name]
    except KeyError:
        # Unknown model: no tuning grid
        return {}

In [61]:

def train_model_with_hyperparameters(model_name, model, param_grid, X_train, y_train):
    """Fit ``model``, running a grid search first when a grid is supplied.

    Args:
        model_name (str): Display name, used only in the progress message.
        model: Estimator exposing ``fit``.
        param_grid (dict): Grid for ``GridSearchCV``; a falsy value skips tuning.
        X_train: Training features.
        y_train: Training target; must expose ``.values`` (it is flattened
            with ``.values.ravel()`` before fitting).

    Returns:
        tuple: ``(fitted_estimator, best_params)``. ``best_params`` is ``None``
        when no grid search was run.
    """
    # No grid: plain fit of the estimator as given
    if not param_grid:
        model.fit(X_train, y_train.values.ravel())
        return model, None

    print(f"Ajustando hiperparámetros para {model_name}...")
    # 3-fold search optimising F1, using all available cores
    search = GridSearchCV(model, param_grid, cv=3, scoring="f1", n_jobs=-1)
    search.fit(X_train, y_train.values.ravel())
    return search.best_estimator_, search.best_params_

In [62]:
def train_and_tune_models(X_train, y_train, X_test, y_test, models_with_params):
    """Train every candidate, score it on the test split and pickle it.

    For each ``(name, estimator)`` pair the grid from
    ``get_hyperparameter_grid`` is used for tuning, the F1 score of the
    resulting estimator is computed on ``X_test``/``y_test``, and the estimator
    is pickled into ``artifacts_path`` under a file named after the model
    (spaces replaced by underscores, lower-cased, ``.pkl`` suffix).

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Split used to compute the reported F1 score.
        models_with_params (dict): Display name -> estimator.

    Returns:
        dict: Display name -> ``{"f1_score", "hyperparameters", "model_path"}``.
    """
    results = {}
    print("\nEntrenando modelos y evaluando...")
    for model_name, estimator in models_with_params.items():
        print(f"\nEntrenando modelo: {model_name}")
        fitted_model, chosen_params = train_model_with_hyperparameters(
            model_name,
            estimator,
            get_hyperparameter_grid(model_name),
            X_train,
            y_train,
        )

        # Score the tuned estimator
        f1 = f1_score(y_test, fitted_model.predict(X_test))
        print(f"F1-Score para {model_name}: {f1}")

        # Persist the fitted estimator next to the other artifacts
        file_stem = model_name.replace(' ', '_').lower()
        model_file = os.path.join(artifacts_path, f"{file_stem}.pkl")
        with open(model_file, "wb") as out_file:
            pickle.dump(fitted_model, out_file)

        # Record what is needed to compare and reload the model later
        summary = {
            "f1_score": f1,
            "hyperparameters": chosen_params,
            "model_path": model_file,
        }
        results[model_name] = summary
    return results

In [63]:
def save_results(results):
    """Write the results dictionary to ``results_file_path`` as JSON.

    Args:
        results (dict): JSON-serializable mapping of model name to its metrics.

    Raises:
        TypeError / ValueError: Propagated from ``json.dumps`` when ``results``
            cannot be serialized; in that case the file is not touched.
    """
    # Serialize first: opening with "w" truncates the file immediately, so an
    # error raised while encoding inside the `with` block would leave an empty
    # or partially written JSON file behind.
    payload = json.dumps(results)
    with open(results_file_path, "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"\nResultados guardados en {results_file_path}")


In [64]:
def select_best_model(results):
    """Pick the entry with the highest ``"f1_score"``.

    Args:
        results (dict): Model name -> dict holding at least ``"f1_score"`` and
            ``"model_path"``.

    Returns:
        tuple: ``(best_model_name, best_model_path)``. On ties the first entry
        in iteration order wins.

    Raises:
        ValueError: If ``results`` is empty.
    """
    scores = {name: results[name]["f1_score"] for name in results}
    best_model_name = max(scores, key=scores.get)
    winner = results[best_model_name]
    best_model_path = winner["model_path"]
    best_score = winner["f1_score"]
    print(f"\nMejor modelo: {best_model_name} con F1-Score: {best_score}")
    return best_model_name, best_model_path

# Pipeline

In [65]:
def load_pipeline():
    """Unpickle and return the object stored at ``pipeline_path``.

    Returns:
        The unpickled base pipeline.
    """
    print("\nCargando pipeline base...")
    with open(pipeline_path, "rb") as source:
        # NOTE: unpickling runs arbitrary code; only load trusted artifacts.
        base_pipeline = pickle.load(source)
    return base_pipeline

In [66]:
def update_pipeline_with_model(pipeline, X_train, best_model):
    """Append the winning model to a column-validated copy of the pipeline.

    The pipeline's ``features_to_drop`` steps are first reconciled with the
    columns of ``X_train`` via ``validate_pipeline_columns``; the model is then
    added as a final step named ``"best_model"``.

    Args:
        pipeline: Base pipeline exposing a ``steps`` list.
        X_train: Training features; only its ``columns`` are inspected.
        best_model: Estimator to append as the last step.

    Returns:
        Pipeline: The new pipeline ending in the ``"best_model"`` step.
    """
    # Use the shared module-level helper rather than a nested duplicate of it,
    # so there is a single implementation of the column validation.
    pipeline = validate_pipeline_columns(pipeline, X_train)
    pipeline.steps.append(("best_model", best_model))
    print("Modelo ganador agregado al pipeline.")
    return pipeline

In [67]:
def save_pipeline(pipeline):
    """Pickle ``pipeline`` to ``final_pipeline_path``, overwriting any existing file.

    Args:
        pipeline: The object to persist.
    """
    destination = final_pipeline_path
    print(f"\nGuardando el pipeline ajustado en: {destination}")
    with open(destination, "wb") as out_file:
        pickle.dump(pipeline, out_file)

In [68]:
def evaluate_pipeline(pipeline, X_test, y_test):
    """Print the F1 score and the classification report of ``pipeline``.

    Args:
        pipeline: Fitted object exposing ``predict``.
        X_test: Features to predict on.
        y_test: True labels to compare the predictions against.

    Returns:
        None. The metrics are only printed.
    """
    print("\nEvaluando el pipeline ajustado...")
    predictions = pipeline.predict(X_test)

    score = f1_score(y_test, predictions)
    print(f"\nF1-Score del pipeline ajustado: {score}")

    print("\nReporte de clasificación:")
    report = classification_report(y_test, predictions)
    print(report)

# Función principal

In [69]:
# Main entry point
def main():
    """Run the whole workflow end to end.

    Steps: load the processed splits, train and tune every candidate model,
    save the scores, select the model with the best F1 score, append it to the
    base pipeline, refit that pipeline on the training data, save it and print
    its evaluation on the test split.
    """
    # Processed train/test splits
    X_train, y_train, X_test, y_test = load_data()

    # Train every candidate and persist the comparison
    results = train_and_tune_models(X_train, y_train, X_test, y_test, define_models())
    save_results(results)

    # Only the path of the winning model is needed from here on
    _, best_model_path = select_best_model(results)

    # Reload the winner from its pickle
    with open(best_model_path, "rb") as model_file:
        best_model = pickle.load(model_file)

    # Base pipeline + winning model as the final step
    final_pipeline = update_pipeline_with_model(load_pipeline(), X_train, best_model)

    # Refit everything together
    print("\nAjustando el pipeline completo...")
    final_pipeline.fit(X_train, y_train.values.ravel())

    # Persist and report
    save_pipeline(final_pipeline)
    evaluate_pipeline(final_pipeline, X_test, y_test)

In [70]:

# Run the full workflow when this cell (or the exported script) is executed
# directly; importing the module does not trigger training.
if __name__ == "__main__":
    main()


Cargando datos procesados...

Entrenando modelos y evaluando...

Entrenando modelo: Logistic Regression
Ajustando hiperparámetros para Logistic Regression...
F1-Score para Logistic Regression: 0.9175257731958762

Entrenando modelo: Decision Tree
Ajustando hiperparámetros para Decision Tree...
F1-Score para Decision Tree: 0.9183673469387755

Entrenando modelo: Random Forest
Ajustando hiperparámetros para Random Forest...
F1-Score para Random Forest: 0.9166666666666666

Entrenando modelo: SVC
Ajustando hiperparámetros para SVC...
F1-Score para SVC: 0.93

Entrenando modelo: XGBoost
Ajustando hiperparámetros para XGBoost...
F1-Score para XGBoost: 0.9270833333333334

Resultados guardados en c:\Users\hp i7\Documents\Mini_Proyecto_2\notebooks\..\artifacts\model_results.json

Mejor modelo: SVC con F1-Score: 0.93

Cargando pipeline base...
El paso 'delete_features' fue eliminado porque no tiene columnas válidas.
Modelo ganador agregado al pipeline.

Ajustando el pipeline completo...

Guardando