# 🐧 Experimentos MLflow - Palmer Penguins

Este notebook implementa:
1. Carga de datos en MySQL (penguins_raw)
2. Preprocesamiento y generación de datos limpios (penguins_clean)
3. ≥20 experimentos con diferentes modelos e hiperparámetros
4. Registro del mejor modelo en MLflow Model Registry

In [1]:
# Compatibility shim for SQLAlchemy 2.0: lets Connection.execute() accept a
# plain SQL string by wrapping it in text() before delegating to the real
# method.
import functools

import sqlalchemy
from sqlalchemy import text as sql_text

# Install the wrapper only once. Without this guard, every re-run of the cell
# would capture the previous wrapper as "original" and stack another layer on
# top of it.
if not getattr(sqlalchemy.engine.Connection.execute, "_accepts_plain_str", False):
    original_execute = sqlalchemy.engine.Connection.execute

    @functools.wraps(original_execute)
    def patched_execute(self, statement, *args, **kwargs):
        """Execute `statement`, converting it to a text() construct if it is a str.

        All other positional and keyword arguments are forwarded unchanged to
        the original Connection.execute, whose result is returned as-is.
        """
        if isinstance(statement, str):
            statement = sql_text(statement)
        return original_execute(self, statement, *args, **kwargs)

    # Marker checked by the guard above on subsequent executions of this cell.
    patched_execute._accepts_plain_str = True
    sqlalchemy.engine.Connection.execute = patched_execute

print("✅ Patch aplicado para compatibilidad con SQLAlchemy 2.0")

✅ Patch aplicado para compatibilidad con SQLAlchemy 2.0


In [2]:
# Importaciones necesarias
import os
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
import xgboost as xgb
import lightgbm as lgb
import pymysql
from sqlalchemy import create_engine, text
import warnings
warnings.filterwarnings('ignore')

# Point the MLflow client at the tracking server. The MLFLOW_TRACKING_URI
# environment variable takes precedence; otherwise http://mlflow:5000 is used.
tracking_uri = os.getenv('MLFLOW_TRACKING_URI', 'http://mlflow:5000')
mlflow.set_tracking_uri(tracking_uri)
# Echo what MLflow actually resolved, not just what was requested.
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")

MLflow tracking URI: http://mlflow:5000


In [3]:
# MySQL connection settings. Each value is read from the environment, with the
# literal on the right used only as a fallback.
# NOTE(review): the fallback password is presumably a local development
# default — set MYSQL_PASSWORD explicitly anywhere else.
from sqlalchemy.engine import URL

MYSQL_CONFIG = {
    'host': os.getenv('MYSQL_HOST', 'mysql'),
    'port': int(os.getenv('MYSQL_PORT', 3306)),
    'user': os.getenv('MYSQL_USER', 'penguins'),
    'password': os.getenv('MYSQL_PASSWORD', 'penguins123'),
    'database': os.getenv('MYSQL_DATABASE', 'penguins_db')
}

# Build the connection URL with URL.create() instead of string formatting so
# that credentials containing URL-reserved characters ('@', ':', '/', ...) are
# escaped correctly rather than corrupting the host/port portion.
engine = create_engine(
    URL.create(
        drivername="mysql+pymysql",
        username=MYSQL_CONFIG['user'],
        password=MYSQL_CONFIG['password'],
        host=MYSQL_CONFIG['host'],
        port=MYSQL_CONFIG['port'],
        database=MYSQL_CONFIG['database'],
    )
)

## 1. Cargar datos crudos en MySQL

In [4]:
# Load the penguins dataset, preferring the palmerpenguins package and falling
# back to seaborn's 'penguins' dataset when the first source cannot be used.
try:
    from palmerpenguins import load_penguins
    df_raw = load_penguins()
    print("Dataset cargado desde palmerpenguins")
except Exception as exc:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
    # still propagate, and report why the primary source was skipped instead
    # of silently switching datasets.
    print(f"palmerpenguins no disponible ({exc!r}); usando seaborn")
    import seaborn as sns
    df_raw = sns.load_dataset('penguins')
    print("Dataset cargado desde seaborn")

print(f"Shape: {df_raw.shape}")
print(f"Columnas: {df_raw.columns.tolist()}")
df_raw.head()

Dataset cargado desde palmerpenguins
Shape: (344, 8)
Columnas: ['species', 'island', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex', 'year']


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [5]:
# Empty penguins_raw first, so the append below leaves exactly one copy of the
# dataset in the table.
truncate_raw = text("TRUNCATE TABLE penguins_raw")
with engine.begin() as db_conn:  # begin() commits when the block exits cleanly
    db_conn.execute(truncate_raw)
print("Tabla penguins_raw limpiada")

# Append the raw frame to the existing table; the DataFrame index is not written.
n_raw_rows = len(df_raw)
df_raw.to_sql('penguins_raw', con=engine, if_exists='append', index=False)
print(f"✓ {n_raw_rows} registros insertados en penguins_raw")

Tabla penguins_raw limpiada
✓ 344 registros insertados en penguins_raw


## 2. Preprocesar datos y generar penguins_clean

In [6]:
# Read the raw rows back from MySQL so preprocessing starts from what is stored.
df_from_db = pd.read_sql("SELECT * FROM penguins_raw", engine)
print(f"Datos leídos: {df_from_db.shape}")

# Drop rows that are missing any measurement or the label.
# The explicit .copy() makes df_clean an independent frame, so the column
# assignments below are never applied to (or warned about as) a slice of
# df_from_db. The notebook silences all warnings globally, which would
# otherwise hide a SettingWithCopyWarning here.
critical_features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'species']
df_clean = df_from_db.dropna(subset=critical_features).copy()
print(f"Después de eliminar nulos: {df_clean.shape}")

# Encode the target as integer class codes and keep the code -> species lookup
# for decoding predictions later.
label_encoder = LabelEncoder()
df_clean['species_encoded'] = label_encoder.fit_transform(df_clean['species'])
species_mapping = dict(enumerate(label_encoder.classes_))
print(f"Mapeo de especies: {species_mapping}")

# Fill the optional columns: missing sex becomes 'Unknown', missing year
# becomes the median year of the cleaned rows.
df_clean['sex'] = df_clean['sex'].fillna('Unknown')
df_clean['year'] = df_clean['year'].fillna(df_clean['year'].median())

# Stamp every row with the time this cell ran.
df_clean['processed_at'] = pd.Timestamp.now()

Datos leídos: (344, 10)
Después de eliminar nulos: (342, 10)
Mapeo de especies: {0: 'Adelie', 1: 'Chinstrap', 2: 'Gentoo'}


In [7]:
# Empty penguins_clean, then append the preprocessed frame to it.
truncate_clean = text("TRUNCATE TABLE penguins_clean")
with engine.begin() as db_conn:  # begin() commits when the block exits cleanly
    db_conn.execute(truncate_clean)

n_clean_rows = len(df_clean)
df_clean.to_sql('penguins_clean', con=engine, if_exists='append', index=False)
print(f"✓ {n_clean_rows} registros insertados en penguins_clean")

✓ 342 registros insertados en penguins_clean


## 3. Preparar datos para Machine Learning

In [8]:
# Feature matrix (the four numeric measurements) and encoded target vector.
feature_cols = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
X = df_clean[feature_cols].to_numpy()
y = df_clean['species_encoded'].to_numpy()

# 80/20 split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for split_label, split_features in (("Train set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {split_features.shape}")
print(f"Distribución de clases en train: {np.bincount(y_train)}")

Train set: (273, 4)
Test set: (69, 4)
Distribución de clases en train: [121  54  98]


## 4. Experimentos con MLflow (≥20 runs)

In [9]:
# Select the MLflow experiment that all following runs are logged under.
experiment_name = "penguins-classification"
mlflow.set_experiment(experiment_name)

# Look the experiment up by name to obtain its ID for later run searches.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id
print(f"Experimento: {experiment_name} (ID: {experiment_id})")

Experimento: penguins-classification (ID: 1)


### Experimentos 1-5: Random Forest con diferentes hiperparámetros

In [10]:
# Random Forest hyperparameter sets, one MLflow run each (runs 1-5).
rf_configs = [
    dict(n_estimators=50, max_depth=5, min_samples_split=5),
    dict(n_estimators=100, max_depth=10, min_samples_split=2),
    dict(n_estimators=200, max_depth=None, min_samples_split=2),
    dict(n_estimators=300, max_depth=15, min_samples_split=4),
    dict(n_estimators=500, max_depth=20, min_samples_split=3),
]

for run_idx, rf_params in enumerate(rf_configs, start=1):
    with mlflow.start_run(run_name=f"rf_experiment_{run_idx}"):
        # Record the hyperparameters plus a tag identifying the model family.
        mlflow.log_params(rf_params)
        mlflow.log_param("model_type", "RandomForest")

        # Fit on the training split and score on the held-out split.
        rf = RandomForestClassifier(random_state=42, **rf_params).fit(X_train, y_train)
        y_pred = rf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        for metric_name, metric_value in (("accuracy", accuracy), ("f1_score", f1)):
            mlflow.log_metric(metric_name, metric_value)

        # Upload the fitted model with an input example and inferred signature.
        # NOTE(review): this call writes to the tracking server's artifact
        # store, so that store (e.g. the S3 bucket) must exist beforehand.
        mlflow.sklearn.log_model(
            rf,
            "model",
            input_example=X_train[:1],
            signature=mlflow.models.infer_signature(X_train, y_train),
        )

        print(f"RF Exp {run_idx}: Accuracy={accuracy:.4f}, F1={f1:.4f}")



🏃 View run rf_experiment_1 at: http://mlflow:5000/#/experiments/1/runs/135e82d187664aa090c12fcc2b2256b2
🧪 View experiment at: http://mlflow:5000/#/experiments/1


S3UploadFailedError: Failed to upload /tmp/tmpwsokro_r/model/conda.yaml to mlflows3/artifacts/1/models/m-d5b5e804895a4785b6da31775e6e98e2/artifacts/conda.yaml: An error occurred (NoSuchBucket) when calling the PutObject operation: The specified bucket does not exist

### Experimentos 6-10: KNN con diferentes valores de k

In [None]:
# KNN runs (6-10): one run per k, using only the first weighting option.
k_values = [3, 5, 7, 10, 15]
weights_options = ['uniform', 'distance']

# Restricting to weights_options[:1] ('uniform') keeps this at five runs.
knn_grid = [
    (n_neighbors, weighting)
    for n_neighbors in k_values
    for weighting in weights_options[:1]
]

exp_num = 6
for k, weights in knn_grid:
    with mlflow.start_run(run_name=f"knn_experiment_{exp_num}"):
        mlflow.log_param("model_type", "KNN")
        mlflow.log_param("n_neighbors", k)
        mlflow.log_param("weights", weights)

        # Standardise the features before the classifier, inside one pipeline
        # so the scaler is fitted on the training split only.
        knn_pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('knn', KNeighborsClassifier(n_neighbors=k, weights=weights)),
        ]).fit(X_train, y_train)

        y_pred = knn_pipeline.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        for metric_name, metric_value in (("accuracy", accuracy), ("f1_score", f1)):
            mlflow.log_metric(metric_name, metric_value)

        # Upload the fitted pipeline with an input example and inferred signature.
        mlflow.sklearn.log_model(
            knn_pipeline,
            "model",
            input_example=X_train[:1],
            signature=mlflow.models.infer_signature(X_train, y_train),
        )

        print(f"KNN Exp {exp_num}: k={k}, Accuracy={accuracy:.4f}, F1={f1:.4f}")
        exp_num += 1

### Experimentos 11-15: SVM con diferentes kernels

In [None]:
# SVM runs (11-15): linear, RBF and polynomial kernels with varying C.
svm_configs = [
    dict(kernel='linear', C=0.1),
    dict(kernel='linear', C=1.0),
    dict(kernel='rbf', C=1.0, gamma='scale'),
    dict(kernel='rbf', C=10.0, gamma='auto'),
    dict(kernel='poly', C=1.0, degree=3),
]

for run_idx, svm_params in enumerate(svm_configs, start=11):
    with mlflow.start_run(run_name=f"svm_experiment_{run_idx}"):
        mlflow.log_params(svm_params)
        mlflow.log_param("model_type", "SVM")

        # Standardise the features before the SVC, inside one pipeline so the
        # scaler is fitted on the training split only.
        svm_pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('svm', SVC(probability=True, random_state=42, **svm_params)),
        ]).fit(X_train, y_train)

        y_pred = svm_pipeline.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        for metric_name, metric_value in (("accuracy", accuracy), ("f1_score", f1)):
            mlflow.log_metric(metric_name, metric_value)

        # Upload the fitted pipeline with an input example and inferred signature.
        mlflow.sklearn.log_model(
            svm_pipeline,
            "model",
            input_example=X_train[:1],
            signature=mlflow.models.infer_signature(X_train, y_train),
        )

        print(f"SVM Exp {run_idx}: kernel={svm_params['kernel']}, Accuracy={accuracy:.4f}, F1={f1:.4f}")

### Experimentos 16-20: XGBoost con diferentes configuraciones

In [None]:
# XGBoost runs (16-20): varying tree count, depth and learning rate.
xgb_configs = [
    dict(n_estimators=50, max_depth=3, learning_rate=0.1),
    dict(n_estimators=100, max_depth=5, learning_rate=0.05),
    dict(n_estimators=200, max_depth=7, learning_rate=0.01),
    dict(n_estimators=300, max_depth=4, learning_rate=0.1),
    dict(n_estimators=150, max_depth=6, learning_rate=0.3),
]

for run_idx, xgb_params in enumerate(xgb_configs, start=16):
    with mlflow.start_run(run_name=f"xgboost_experiment_{run_idx}"):
        mlflow.log_params(xgb_params)
        mlflow.log_param("model_type", "XGBoost")

        # Multi-class classifier, seeded for repeatability.
        xgb_model = xgb.XGBClassifier(
            objective='multi:softprob',
            random_state=42,
            **xgb_params,
        ).fit(X_train, y_train)

        y_pred = xgb_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        for metric_name, metric_value in (("accuracy", accuracy), ("f1_score", f1)):
            mlflow.log_metric(metric_name, metric_value)

        # Logged through the sklearn flavour, like the other model families.
        mlflow.sklearn.log_model(
            xgb_model,
            "model",
            input_example=X_train[:1],
            signature=mlflow.models.infer_signature(X_train, y_train),
        )

        print(f"XGB Exp {run_idx}: Accuracy={accuracy:.4f}, F1={f1:.4f}")

### Experimentos adicionales 21-25: LightGBM

In [None]:
# LightGBM runs (21-25): extra runs beyond the 20 required.
lgb_configs = [
    dict(n_estimators=100, num_leaves=31, learning_rate=0.1),
    dict(n_estimators=200, num_leaves=50, learning_rate=0.05),
    dict(n_estimators=150, num_leaves=20, learning_rate=0.15),
    dict(n_estimators=300, num_leaves=40, learning_rate=0.01),
    dict(n_estimators=250, num_leaves=60, learning_rate=0.08),
]

for run_idx, lgb_params in enumerate(lgb_configs, start=21):
    with mlflow.start_run(run_name=f"lightgbm_experiment_{run_idx}"):
        mlflow.log_params(lgb_params)
        mlflow.log_param("model_type", "LightGBM")

        # Multi-class classifier, seeded; verbose=-1 is passed to the estimator.
        lgb_model = lgb.LGBMClassifier(
            objective='multiclass',
            random_state=42,
            verbose=-1,
            **lgb_params,
        ).fit(X_train, y_train)

        y_pred = lgb_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        for metric_name, metric_value in (("accuracy", accuracy), ("f1_score", f1)):
            mlflow.log_metric(metric_name, metric_value)

        # Logged through the sklearn flavour, like the other model families.
        mlflow.sklearn.log_model(
            lgb_model,
            "model",
            input_example=X_train[:1],
            signature=mlflow.models.infer_signature(X_train, y_train),
        )

        print(f"LGB Exp {run_idx}: Accuracy={accuracy:.4f}, F1={f1:.4f}")

## 5. Seleccionar mejor modelo y registrar en Model Registry

In [None]:
# Pick the best run of the experiment, ranked by macro F1.
from mlflow.entities import ViewType

runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    # Only completed runs are eligible. The training cells log metrics before
    # log_model(), so a run that crashed during the model upload still has an
    # F1 score but no "model" artifact to register.
    filter_string="attributes.status = 'FINISHED'",
    run_view_type=ViewType.ACTIVE_ONLY,  # skip deleted runs (the default)
    order_by=["metrics.f1_score DESC"],
    max_results=1
)

# Fail with a clear message instead of an IndexError from .iloc[0].
if runs.empty:
    raise RuntimeError(
        "No hay runs finalizados en el experimento; "
        "ejecute las celdas de entrenamiento antes de seleccionar el mejor modelo."
    )

best_run = runs.iloc[0]
print(f"Mejor run ID: {best_run['run_id']}")
print(f"Modelo: {best_run['params.model_type']}")
print(f"F1 Score: {best_run['metrics.f1_score']:.4f}")
print(f"Accuracy: {best_run['metrics.accuracy']:.4f}")

In [None]:
# Register the best run's model in the MLflow Model Registry.
model_name = "penguins-classifier"
model_uri = f"runs:/{best_run['run_id']}/model"

# Registry client, also used by the following cells.
client = mlflow.tracking.MlflowClient()

try:
    # register_model() returns the ModelVersion it created; using it directly
    # guarantees that the version promoted later is the one registered here,
    # rather than whatever the registry reports as the latest unstaged version.
    model_version = mlflow.register_model(model_uri, model_name)
    print(f"✓ Modelo registrado: {model_name}")
except Exception as e:
    print(f"Modelo ya existe o error: {e}")
    # Best-effort fallback: reuse the newest version that has no stage yet.
    # An empty result is reported explicitly instead of failing with an
    # IndexError.
    unstaged_versions = client.get_latest_versions(model_name, stages=["None"])
    if not unstaged_versions:
        raise RuntimeError(
            f"No se pudo registrar el modelo y no existe ninguna versión previa de {model_name}"
        ) from e
    model_version = unstaged_versions[0]

print(f"Versión del modelo: {model_version.version}")

In [None]:
# Move the registered version to the Production stage; versions currently in
# that stage are archived by the same call.
promoted_version = model_version.version
client.transition_model_version_stage(
    name=model_name,
    version=promoted_version,
    stage="Production",
    archive_existing_versions=True,
)

# Attach a description that records the winning run's F1 score.
best_f1 = best_run['metrics.f1_score']
version_description = f"Mejor modelo para clasificación de pingüinos. F1={best_f1:.4f}"
client.update_model_version(
    name=model_name,
    version=promoted_version,
    description=version_description,
)

print(f"✓ Modelo {model_name} v{promoted_version} promovido a Production")

## 6. Verificar modelo en Production

In [None]:
# Smoke test: load whatever version is in the Production stage and run one
# prediction through the generic pyfunc interface.
model_uri = f"models:/{model_name}/Production"
loaded_model = mlflow.pyfunc.load_model(model_uri)

# A single hand-written sample with the four training features.
sample_measurements = {
    'bill_length_mm': [44.5],
    'bill_depth_mm': [17.1],
    'flipper_length_mm': [200],
    'body_mass_g': [4200],
}
test_input = pd.DataFrame(sample_measurements)

prediction = loaded_model.predict(test_input)
predicted_code = prediction[0]
# Translate the integer class code back to the species name.
predicted_species = species_mapping[predicted_code]

print("Predicción de prueba:")
print(f"Input: {test_input.values[0]}")
print(f"Predicción: {predicted_species} (código: {predicted_code})")

In [None]:
# Final summary: run count for the experiment plus the selected model's details.
total_runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

summary_lines = [
    "\n📊 RESUMEN FINAL:",
    f"- Total de experimentos realizados: {len(total_runs)}",
    f"- Mejor modelo: {best_run['params.model_type']}",
    f"- F1 Score: {best_run['metrics.f1_score']:.4f}",
    f"- Modelo registrado: {model_name}",
    f"- Versión en Production: {model_version.version}",
    "\n✅ Pipeline MLflow completado exitosamente!",
]
# One print per line, matching the original output exactly.
for summary_line in summary_lines:
    print(summary_line)