# Librerías

In [17]:
# Librerías
import os
import pandas as pd
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline


# Configuracion inicial

In [18]:
# Configuración inicial
project_path = os.path.abspath(os.path.join(os.getcwd())) 
data_processed_path = os.path.join(project_path, "..", "data", "processed")
artifacts_path = os.path.join(project_path, "..", "artifacts")
pipeline_path = os.path.join(artifacts_path, "base_pipeline.pkl")
model_path = os.path.join(artifacts_path, "best_model_pipeline.pkl")

In [19]:
# Cargar Datos Procesados
try:
    X_train = pd.read_csv(os.path.join(data_processed_path, "X_train.csv"))
    y_train = pd.read_csv(os.path.join(data_processed_path, "y_train.csv"))
    X_test = pd.read_csv(os.path.join(data_processed_path, "X_test.csv"))
    y_test = pd.read_csv(os.path.join(data_processed_path, "y_test.csv"))

    print(f"Datos cargados: X_train {X_train.shape}, y_train {y_train.shape}")
except FileNotFoundError as e:
    raise FileNotFoundError(f"Error al cargar los datos procesados: {e}")


Datos cargados: X_train (1180, 19), y_train (1180, 1)


# Entrenar modelo y seleccionar el mejor

In [20]:
# Entrenar Modelos y Seleccionar el Mejor
models_with_params = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVC": SVC(random_state=42, probability=True),
    "XGBoost": XGBClassifier(random_state=42, eval_metric="logloss")
}

In [21]:
best_model = None
best_model_name = None
best_f1_score = 0

In [22]:
print("\nEntrenando modelos y evaluando...")
for model_name, model in models_with_params.items():
    print(f"\nEntrenando modelo: {model_name}")
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    
    print(f"F1-Score para {model_name}: {f1}")

    if f1 > best_f1_score:
        best_model = model
        best_model_name = model_name
        best_f1_score = f1

print(f"\nMejor modelo: {best_model_name} con F1-Score: {best_f1_score}")


Entrenando modelos y evaluando...

Entrenando modelo: Logistic Regression
F1-Score para Logistic Regression: 0.9175257731958762

Entrenando modelo: Decision Tree
F1-Score para Decision Tree: 0.8855721393034826

Entrenando modelo: Random Forest
F1-Score para Random Forest: 0.9319371727748691

Entrenando modelo: SVC
F1-Score para SVC: 0.9214659685863874

Entrenando modelo: XGBoost
F1-Score para XGBoost: 0.9230769230769231

Mejor modelo: Random Forest con F1-Score: 0.9319371727748691


# Ajustar hiperparámetros

In [23]:
# Ajustar Hiperparámetros
if best_model_name == "Random Forest":
    print("\nAjustando hiperparámetros para Random Forest...")
    param_grid = {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}
    grid_search = GridSearchCV(best_model, param_grid, cv=3, scoring="f1", n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    best_model = grid_search.best_estimator_

    print("Mejores parámetros encontrados:")
    print(grid_search.best_params_)


Ajustando hiperparámetros para Random Forest...
Mejores parámetros encontrados:
{'max_depth': 10, 'n_estimators': 50}


# Actualizar pipeline

In [24]:
# Validar columnas del pipeline antes de ajustar
def validate_pipeline_columns(pipeline, X):
    updated_steps = []
    for name, step in pipeline.steps:
        if hasattr(step, 'features_to_drop'):
            valid_features = [col for col in step.features_to_drop if col in X.columns]
            if valid_features:
                step.features_to_drop = valid_features
                updated_steps.append((name, step))
            else:
                print(f"El paso '{name}' fue eliminado porque no tiene columnas válidas.")
        else:
            updated_steps.append((name, step))
    return Pipeline(updated_steps)

In [25]:
# Cargar Pipeline Base y Validar
print("\nCargando pipeline base...")
with open(pipeline_path, "rb") as f:
    pipeline = pickle.load(f)

pipeline = validate_pipeline_columns(pipeline, X_train)
pipeline.steps.append(("best_model", best_model))
print("Modelo ganador agregado al pipeline.")


Cargando pipeline base...
El paso 'delete_features' fue eliminado porque no tiene columnas válidas.
Modelo ganador agregado al pipeline.


In [26]:
# Ajustar el pipeline
try:
    print("\nAjustando el pipeline completo...")
    pipeline.fit(X_train, y_train.values.ravel())
except Exception as e:
    raise ValueError(f"Error al ajustar el pipeline completo: {e}")




Ajustando el pipeline completo...


# Guardar pipeline actualizado

In [27]:
# Guardar Pipeline Ajustado
print("\nGuardando el pipeline ajustado...")
with open(model_path, "wb") as f:
    pickle.dump(pipeline, f)


Guardando el pipeline ajustado...


# Evaluar pipeline

In [28]:
# Evaluar el Pipeline Ajustado
print("\nEvaluando el pipeline ajustado...")
y_pred = pipeline.predict(X_test)
f1 = f1_score(y_test, y_pred)
print(f"\nF1-Score del pipeline ajustado: {f1}")
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))


Evaluando el pipeline ajustado...

F1-Score del pipeline ajustado: 0.9166666666666666

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       197
           1       0.95      0.89      0.92        99

    accuracy                           0.95       296
   macro avg       0.95      0.93      0.94       296
weighted avg       0.95      0.95      0.95       296

