In [11]:
import os
import warnings
import pandas as pd
from sqlalchemy import create_engine
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Silence every UserWarning for the rest of the session.
# NOTE(review): this does not cover urllib3's InsecureRequestWarning, which is
# not a UserWarning subclass — add a dedicated filter if that one is the target.
warnings.filterwarnings("ignore", category=UserWarning)

# MySQL CleanData connection settings.
# Each value can be overridden through an environment variable so credentials
# do not have to live in the notebook; the defaults keep the original behavior.
from urllib.parse import quote_plus

db_user = os.environ.get("CLEAN_DB_USER", "model_user")
db_pass = os.environ.get("CLEAN_DB_PASSWORD", "model_password")
db_host = os.environ.get("CLEAN_DB_HOST", "mysql-service")
db_port = int(os.environ.get("CLEAN_DB_PORT", 3306))
clean_db = os.environ.get("CLEAN_DB_NAME", "CleanData")
# quote_plus keeps the URI valid when the user or password contains characters
# such as '@', ':' or '/'.
CLEAN_URI = (
    f"mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_pass)}"
    f"@{db_host}:{db_port}/{clean_db}"
)

# MLflow / MinIO configuration.
# setdefault only fills in values that the environment does not already
# provide, so externally injected credentials take precedence.
os.environ.setdefault("MLFLOW_S3_ENDPOINT_URL", "http://10.43.101.196:30001")
os.environ.setdefault("AWS_ACCESS_KEY_ID", "admin")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "supersecret")
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://10.43.101.196:30003"))
mlflow.set_experiment("diabetes_readmission_experiments")

<Experiment: artifact_location='s3://mlflows3/artifacts/1', creation_time=1746120440749, experiment_id='1', last_update_time=1746120440749, lifecycle_stage='active', name='diabetes_readmission_experiments', tags={}>

In [12]:
# Open a SQLAlchemy engine on the CleanData database and read each processed
# split (train / validation / test) into its own DataFrame.
en_engine = create_engine(CLEAN_URI)
split_queries = (
    "SELECT * FROM diabetes_train_processed",
    "SELECT * FROM diabetes_validation_processed",
    "SELECT * FROM diabetes_test_processed",
)
df_train, df_val, df_test = (
    pd.read_sql(query, con=en_engine) for query in split_queries
)

# Quick sanity check: (rows, columns) of each split, in train/val/test order.
print("Shapes:", *(frame.shape for frame in (df_train, df_val, df_test)))

Shapes: (30000, 51) (20353, 51) (20354, 51)


In [13]:
# Definición de modelos y función de entrenamiento
def run_experiment(model, name):
    """
    Train ``model`` on the scaled training split and record the run in MLflow.

    Features are every column except ``early_readmit``. A ``StandardScaler``
    is fitted on the training split only and then applied to the validation
    and test splits. Accuracy and ROC AUC are computed on validation and test,
    logged as metrics together with ``model.get_params()``, and the fitted
    estimator is logged under the artifact path ``model`` with a signature and
    a three-row input example.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit``, ``predict``, ``predict_proba``
        and ``get_params``. It is fitted in place.
    name : str
        Run name passed to ``mlflow.start_run``.

    Returns
    -------
    None
        Results are logged to MLflow and summarized with ``print``.

    Notes
    -----
    Reads the module-level ``df_train``, ``df_val`` and ``df_test`` frames.
    Only the estimator is logged; the fitted scaler is not persisted, so
    anything loading the logged model has to feed it features scaled the same
    way. NOTE(review): consider logging the scaler alongside the model.
    """
    target = "early_readmit"

    # Split each frame into features (X) and label (y)
    X_train_df, y_train = df_train.drop(target, axis=1), df_train[target]
    X_val_df, y_val = df_val.drop(target, axis=1), df_val[target]
    X_test_df, y_test = df_test.drop(target, axis=1), df_test[target]

    # Fit the scaler on train only so no validation/test statistics leak in
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_df)
    X_val_scaled = scaler.transform(X_val_df)
    X_test_scaled = scaler.transform(X_test_df)

    # Rebuild DataFrames so the column names survive scaling (they end up in
    # the logged signature and input example)
    X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_df.columns)
    X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val_df.columns)
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test_df.columns)

    def _score(features, labels):
        """Return (accuracy, ROC AUC) of the fitted model on one split."""
        predicted = model.predict(features)
        # Column 1 of predict_proba is used as the positive-class score
        positive_proba = model.predict_proba(features)[:, 1]
        return accuracy_score(labels, predicted), roc_auc_score(labels, positive_proba)

    with mlflow.start_run(run_name=name):
        # Training
        model.fit(X_train_scaled_df, y_train)

        # Evaluation on validation and test
        val_acc, val_roc = _score(X_val_scaled_df, y_val)
        test_acc, test_roc = _score(X_test_scaled_df, y_test)

        # Log hyperparameters and all four metrics in a single call
        mlflow.log_params(model.get_params())
        mlflow.log_metrics({
            "val_accuracy": val_acc,
            "val_roc_auc": val_roc,
            "test_accuracy": test_acc,
            "test_roc_auc": test_roc,
        })

        # The sklearn flavor serves `predict` through pyfunc unless told
        # otherwise, so the signature's output schema must come from `predict`
        # (class labels), not from `predict_proba`.
        input_example = X_train_scaled_df.head(3)
        signature = infer_signature(X_train_scaled_df, model.predict(X_train_scaled_df))

        # Log the fitted model with its signature and input example
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            signature=signature,
            input_example=input_example
        )

        print(f"Run {name}: val_acc={val_acc:.4f}, val_roc={val_roc:.4f}, test_acc={test_acc:.4f}, test_roc={test_roc:.4f}")

# Baseline candidate models, keyed by a display name. The logistic regression
# gets a raised iteration cap (max_iter=1000).
# NOTE(review): the visible cells below build their own `experiments` list and
# do not reference this dict.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000, solver='lbfgs'),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
)

In [15]:
# Hyperparameter grids, keyed by the MLflow run name each setting is logged under.
lr_grid = {
    "LR_C_0.01": 0.01,
    "LR_C_0.1": 0.1,
    "LR_C_1": 1.0,
}
rf_grid = {
    "RF_100": dict(n_estimators=100),
    "RF_200": dict(n_estimators=200),
    "RF_depth10": dict(n_estimators=100, max_depth=10),
}

# (run name, unfitted estimator) pairs: logistic regressions first, then forests
experiments = [
    *(
        (run_name, LogisticRegression(C=c_value, max_iter=1000, solver='lbfgs'))
        for run_name, c_value in lr_grid.items()
    ),
    *(
        (run_name, RandomForestClassifier(random_state=42, **rf_kwargs))
        for run_name, rf_kwargs in rf_grid.items()
    ),
]

# Train, evaluate and log every configuration in order
for name, model in experiments:
    run_experiment(model, name)

Run LR_C_0.01: val_acc=0.8890, val_roc=0.6378, test_acc=0.8880, test_roc=0.6389
🏃 View run LR_C_0.01 at: http://10.43.101.196:30003/#/experiments/1/runs/0296729dc7ce451aae1d6fc5b7fb2f9d
🧪 View experiment at: http://10.43.101.196:30003/#/experiments/1
Run LR_C_0.1: val_acc=0.8890, val_roc=0.6375, test_acc=0.8878, test_roc=0.6380
🏃 View run LR_C_0.1 at: http://10.43.101.196:30003/#/experiments/1/runs/8abd032dabcb49a68ccde29817c9acf0
🧪 View experiment at: http://10.43.101.196:30003/#/experiments/1
Run LR_C_1: val_acc=0.8890, val_roc=0.6374, test_acc=0.8878, test_roc=0.6379
🏃 View run LR_C_1 at: http://10.43.101.196:30003/#/experiments/1/runs/b9e2e9038c354319a2bbbd4f78eb69bf
🧪 View experiment at: http://10.43.101.196:30003/#/experiments/1
Run RF_100: val_acc=0.8885, val_roc=0.6053, test_acc=0.8870, test_roc=0.6067
🏃 View run RF_100 at: http://10.43.101.196:30003/#/experiments/1/runs/c23821f9b0a3411ba10e6320b4df7989
🧪 View experiment at: http://10.43.101.196:30003/#/experiments/1
Run RF_200

In [16]:
from mlflow.exceptions import MlflowException
from mlflow.tracking import MlflowClient

# Registry client. Reuse the tracking URI already configured for this session
# instead of repeating the server address here.
client = MlflowClient(tracking_uri=mlflow.get_tracking_uri())

# Look up the experiment; get_experiment_by_name returns None when it is missing
experiment_name = "diabetes_readmission_experiments"
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    raise RuntimeError(f"MLflow experiment '{experiment_name}' was not found.")

# Best run by validation ROC AUC. Only FINISHED runs that actually logged the
# metric are eligible: a failed or interrupted run may carry metrics without
# having a model artifact to register.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="attributes.status = 'FINISHED' AND metrics.val_roc_auc >= 0",
    order_by=["metrics.val_roc_auc DESC"],
    max_results=1,
)
if not runs:
    raise RuntimeError(
        f"No finished run with a 'val_roc_auc' metric in experiment '{experiment_name}'."
    )

# Select the best run
best_run = runs[0]
best_run_id = best_run.info.run_id
model_uri = f"runs:/{best_run_id}/model"
model_name = "best_diabetes_readmission_model"

# Create the registered model only when it does not exist yet; any other
# registry error (auth, connectivity, ...) is re-raised instead of being hidden.
try:
    client.get_registered_model(model_name)
except MlflowException as exc:
    if exc.error_code != "RESOURCE_DOES_NOT_EXIST":
        raise
    client.create_registered_model(model_name)

# Register a new version from the best run and promote it to Production,
# archiving whatever version currently holds that stage.
mv = client.create_model_version(
    name=model_name,
    source=model_uri,
    run_id=best_run_id
)
# NOTE(review): recent MLflow releases deprecate stage transitions in favor of
# model version aliases; the call is kept so the "Production" stage stays
# populated — confirm how downstream consumers load the model before migrating.
client.transition_model_version_stage(
    name=model_name,
    version=mv.version,
    stage="Production",
    archive_existing_versions=True
)

print(f"Modelo '{model_name}' versión {mv.version} promocionado a Production (run_id={best_run_id}).")

2025/05/01 18:10:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best_diabetes_readmission_model, version 1


Modelo 'best_diabetes_readmission_model' versión 1 promocionado a Production (run_id=d4dab0df295c46fc84bf055aadc2d9c0).


  client.transition_model_version_stage(
