In [None]:
# Celda 1: Configuración de MLflow para Codespaces
import os
import socket
from contextlib import closing
import mlflow
import subprocess
import time

def find_free_port():
    """Return a TCP port number that the operating system reports as free.

    A temporary IPv4 stream socket is bound to port 0 so the OS assigns a
    port; the assigned number is read back and the socket is closed before
    returning. The port is not reserved once the socket is closed.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with closing(probe):
        # Port 0 asks the OS to choose the port for us.
        probe.bind(('', 0))
        probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        _host, port = probe.getsockname()
    return port

# Configure MLflow: pick a free port and make sure the local "mlruns"
# directory exists under the current working directory.
mlflow_port = find_free_port()
mlflow_dir = os.path.join(os.getcwd(), "mlruns")
os.makedirs(mlflow_dir, exist_ok=True)

# Point the tracking client at the local store and start the MLflow UI in the
# background. The command is passed as an argument list (no shell involved)
# and the process handle is kept so the server can be inspected or stopped.
mlflow.set_tracking_uri(f"file:{mlflow_dir}")
mlflow_ui_process = subprocess.Popen(
    [
        "mlflow", "ui",
        "--port", str(mlflow_port),
        "--host", "0.0.0.0",
        "--serve-artifacts",
    ]
)

# Wait until the server accepts connections instead of sleeping a fixed
# amount of time. The wait is bounded so the notebook can never hang here,
# and an early exit of the server process is reported right away.
startup_deadline = time.monotonic() + 30
while time.monotonic() < startup_deadline:
    if mlflow_ui_process.poll() is not None:
        print(f"Advertencia: 'mlflow ui' terminó con código {mlflow_ui_process.returncode}")
        break
    try:
        with socket.create_connection(("127.0.0.1", mlflow_port), timeout=1):
            break
    except OSError:
        # Not listening yet; retry shortly.
        time.sleep(0.5)
else:
    print("Advertencia: 'mlflow ui' no respondió dentro del tiempo de espera")

# Show the access URL. The Codespaces forwarding URL is only meaningful when
# CODESPACE_NAME is set; otherwise fall back to a plain local address.
codespace_name = os.getenv('CODESPACE_NAME', 'localhost')
forwarding_domain = os.getenv('GITHUB_CODESPACES_PORT_FORWARDING_DOMAIN', 'preview.app.github.dev')
if os.getenv('CODESPACE_NAME'):
    mlflow_ui_url = f"https://{codespace_name}-{mlflow_port}.{forwarding_domain}"
else:
    mlflow_ui_url = f"http://localhost:{mlflow_port}"
print(f"\nMLflow UI disponible en: {mlflow_ui_url}\n")

# 2. Importar librerías
import pandas as pd
import numpy as np
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, mean_absolute_error, precision_score,
                           recall_score, f1_score, mean_squared_error, roc_curve,
                           confusion_matrix, ConfusionMatrixDisplay)
import joblib
import matplotlib.pyplot as plt

# 3. Load the data
# Make sure the path is correct for your project layout.
data_path = os.path.join('data', 'diabetes_cleaned.csv')
try:
    diabetes_cleaned = pd.read_csv(data_path)
except FileNotFoundError as err:
    # Keep the specific exception type (still an Exception for any broad
    # handler) and chain the original error so the missing path stays
    # visible in the traceback.
    raise FileNotFoundError(
        "Error: Ejecuta primero el notebook de Análisis Exploratorio o verifica la ruta del archivo"
    ) from err
else:
    print("Datos cargados correctamente")

# 4. Split the data
# Select the four feature columns and the 'diabetes' target column, then hold
# out 20% of the rows for testing with a fixed seed.
feature_columns = ['hbA1c_level', 'blood_glucose_level', 'age', 'bmi']
X = diabetes_cleaned[feature_columns]
y = diabetes_cleaned['diabetes']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Point MLflow at the "mlruns" directory under the current working directory,
# creating that directory first if it does not exist yet.
mlflow_dir = os.path.join(os.getcwd(), "mlruns")
os.makedirs(mlflow_dir, exist_ok=True)
mlflow.set_tracking_uri(f"file:{mlflow_dir}")

# Create the experiment only when the lookup by name finds nothing, then make
# it the active experiment.
experiment_name = "Diabetes_Prediction"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if not existing_experiment:
    mlflow.create_experiment(experiment_name, artifact_location=mlflow_dir)
mlflow.set_experiment(experiment_name)

# 5. Start a run and log everything
with mlflow.start_run():
    # Train the classifier and predict labels for the held-out split.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Log the parameter and all metrics (metrics go in a single batch call).
    mlflow.log_param("random_state", 42)
    mlflow.log_metrics({
        "accuracy": acc,
        "mae": mae,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "mse": mse,
    })

    # Print the metrics to the terminal
    print("\n Métricas del modelo:")
    print(f"  - Accuracy: {acc:.4f}")
    print(f"  - MAE: {mae:.4f}")
    print(f"  - Precision: {precision:.4f}")
    print(f"  - Recall: {recall:.4f}")
    print(f"  - F1-score: {f1:.4f}")
    print(f"  - MSE: {mse:.4f}")

    # Save the model with joblib and attach the file to the run
    model_path = "modelo_rf.pkl"
    joblib.dump(model, model_path)
    mlflow.log_artifact(model_path)

    # Save a copy of the dataset and attach it to the run
    diabetes_cleaned.to_csv("diabetes_logged.csv", index=False)
    mlflow.log_artifact("diabetes_logged.csv")

    # ROC curve, built from the predicted probabilities in column 1 of
    # predict_proba
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.figure()
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("Curva ROC")
    plt.legend()
    plt.savefig("roc_curve.png")
    mlflow.log_artifact("roc_curve.png")
    plt.close()

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cm)
    disp.plot()
    plt.title("Matriz de Confusión")
    plt.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    plt.close()

    # Announce the plot files only now that both have actually been written.
    print("\n Gráficas guardadas en:")
    print("  - roc_curve.png (Curva ROC)")
    print("  - confusion_matrix.png (Matriz de confusión)")

    # Save the test labels next to the predictions and attach the file
    pred_df = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
    pred_df.to_csv("predicciones.csv", index=False)
    mlflow.log_artifact("predicciones.csv")

    # Log the model itself, with the first two feature rows as input example
    mlflow.sklearn.log_model(model, "modelo_random_forest", input_example=X.iloc[:2])