# 03 - Entrenamiento de Modelos con Azure ML y MLflow

Este notebook demuestra el entrenamiento de múltiples modelos con tracking automático.

**Objetivos:**
- Entrenar modelos de clasificación
- Optimización de hiperparámetros con Optuna
- Tracking con MLflow
- Registro en Azure ML

In [None]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import optuna
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

print("✅ Imports completados")

## 1. Configuración de MLflow y Azure ML

In [None]:
# MLflow setup: select the experiment and enable scikit-learn autologging,
# including logged models and input examples.
mlflow.set_experiment("clasificacion-avanzada")
mlflow.sklearn.autolog(log_input_examples=True, log_models=True)

# The Azure ML connection is optional: if anything in the block below raises,
# the error is reported and ml_client is set to None so the notebook can go on.
try:
    azure_settings = dict(
        credential=DefaultAzureCredential(),
        subscription_id="<YOUR_SUBSCRIPTION_ID>",
        resource_group_name="<YOUR_RESOURCE_GROUP>",
        workspace_name="<YOUR_WORKSPACE>",
    )
    ml_client = MLClient(**azure_settings)
    print(f"✅ Conectado a Azure ML: {ml_client.workspace_name}")
except Exception as err:
    print(f"⚠️  Azure ML no disponible: {err}")
    ml_client = None

## 2. Carga de Datos Preparados

In [None]:
# Load the prepared splits: the two feature frames first, then the 'target'
# column of each label file.
X_train, X_test = (
    pd.read_parquet(features_path)
    for features_path in ('./data/X_train.parquet', './data/X_test.parquet')
)
y_train, y_test = (
    pd.read_parquet(labels_path)['target']
    for labels_path in ('./data/y_train.parquet', './data/y_test.parquet')
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")

## 3. Baseline: Logistic Regression

💡 **Copilot tip:** Siempre comienza con un modelo simple como baseline.

In [None]:
with mlflow.start_run(run_name="logistic_regression_baseline"):
    # Fit the baseline classifier on the training split.
    lr = LogisticRegression(random_state=42, max_iter=1000)
    lr.fit(X_train, y_train)

    # Class predictions feed accuracy/F1; the second probability column
    # feeds the AUC.
    y_pred = lr.predict(X_test)
    y_proba = lr.predict_proba(X_test)[:, 1]

    # Compute the test metrics once and log them explicitly under test_* keys.
    test_metrics = {
        "test_accuracy": accuracy_score(y_test, y_pred),
        "test_f1": f1_score(y_test, y_pred),
        "test_auc": roc_auc_score(y_test, y_proba),
    }
    mlflow.log_metrics(test_metrics)

    # Keep the individual metric names bound at notebook scope.
    accuracy = test_metrics["test_accuracy"]
    f1 = test_metrics["test_f1"]
    auc = test_metrics["test_auc"]

    print(
        f"📊 Baseline Logistic Regression:",
        f"   Accuracy: {accuracy:.4f}",
        f"   F1 Score: {f1:.4f}",
        f"   AUC: {auc:.4f}",
        sep="\n",
    )

## 4. Random Forest con Optimización de Hiperparámetros

In [None]:
def objective(trial):
    """Optuna objective: validation F1 of a Random Forest for one trial.

    The candidate model is fitted on one part of the training data and scored
    on a validation split held out from that same training data. The test set
    is deliberately not touched here: tuning against it would leak test
    information into model selection and make the final test metrics
    optimistically biased.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        F1 score of the fitted model on the validation split.
    """
    # Local import so this cell does not depend on changes to the import cell.
    from sklearn.model_selection import train_test_split

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'random_state': 42
    }

    # Fixed seed: every trial is compared on the same stratified split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        stratify=y_train,
        random_state=42,
    )

    # Train with the proposed parameters
    rf = RandomForestClassifier(**params)
    rf.fit(X_fit, y_fit)

    # Score on the validation split (never on the test set)
    y_val_pred = rf.predict(X_val)
    return f1_score(y_val, y_val_pred)

# Run the hyperparameter search.
print("🔍 Iniciando optimización con Optuna...")
# Seed the sampler so the search is reproducible across notebook runs, in line
# with the fixed random_state=42 used by the estimators. TPE is Optuna's
# default single-objective sampler, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20, show_progress_bar=True)

# Report the best parameter set and the objective value it reached.
print(f"\n✅ Mejores hiperparámetros:")
for key, value in study.best_params.items():
    print(f"   {key}: {value}")
print(f"   Best F1: {study.best_value:.4f}")

In [None]:
# Train the final model with the best parameters found by the study.
with mlflow.start_run(run_name="random_forest_optimized"):
    # study.best_params contains only the values Optuna suggested. The fixed
    # random_state used inside objective() is not part of it, so it is passed
    # again here; otherwise the final forest would be unseeded and would not
    # match the configuration that was actually scored.
    best_rf = RandomForestClassifier(**study.best_params, random_state=42)
    best_rf.fit(X_train, y_train)

    # Evaluate on the test split
    y_pred = best_rf.predict(X_test)
    y_proba = best_rf.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    mlflow.log_params(study.best_params)
    mlflow.log_metrics({
        "test_accuracy": accuracy,
        "test_f1": f1,
        "test_auc": auc
    })

    print(f"📊 Random Forest Optimizado:")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   F1 Score: {f1:.4f}")
    print(f"   AUC: {auc:.4f}")

## 5. Gradient Boosting

In [None]:
with mlflow.start_run(run_name="gradient_boosting"):
    # Fixed (untuned) configuration for the boosting model.
    gb_settings = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 5,
        "random_state": 42,
    }
    gb = GradientBoostingClassifier(**gb_settings)
    gb.fit(X_train, y_train)

    # Class predictions feed accuracy/F1; the second probability column
    # feeds the AUC.
    y_pred = gb.predict(X_test)
    y_proba = gb.predict_proba(X_test)[:, 1]

    # Compute the test metrics once and log them explicitly under test_* keys.
    test_metrics = {
        "test_accuracy": accuracy_score(y_test, y_pred),
        "test_f1": f1_score(y_test, y_pred),
        "test_auc": roc_auc_score(y_test, y_proba),
    }
    mlflow.log_metrics(test_metrics)

    # Keep the individual metric names bound at notebook scope.
    accuracy = test_metrics["test_accuracy"]
    f1 = test_metrics["test_f1"]
    auc = test_metrics["test_auc"]

    print(
        f"📊 Gradient Boosting:",
        f"   Accuracy: {accuracy:.4f}",
        f"   F1 Score: {f1:.4f}",
        f"   AUC: {auc:.4f}",
        sep="\n",
    )

## 6. Comparación de Modelos

In [None]:
# Fetch every run recorded under the experiment.
experiment = mlflow.get_experiment_by_name("clasificacion-avanzada")
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Build the comparison table, best F1 first.
# search_runs returns all runs of the experiment, including ones that never
# logged the test_* metrics (presumably the runs autologging opens for fits
# made outside an explicit start_run block, such as the Optuna trials).
# Those rows would be all-NaN here, so they are dropped before sorting.
metric_columns = ['metrics.test_accuracy', 'metrics.test_f1', 'metrics.test_auc']
comparison = (
    runs[['tags.mlflow.runName', *metric_columns]]
    .dropna(subset=['metrics.test_f1'])
    .sort_values('metrics.test_f1', ascending=False)
)
comparison.columns = ['Modelo', 'Accuracy', 'F1 Score', 'AUC']

print("\n📊 Comparación de Modelos:")
display(comparison)

## 7. Guardar Mejor Modelo

In [None]:
from pathlib import Path

import joblib

# Persist the best model (optimized Random Forest)
model_path = './models/best_model.joblib'
# Make sure the target directory exists; dumping into a missing directory
# fails with FileNotFoundError.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, model_path)

# The notebook-level `f1` was last assigned in the Gradient Boosting cell, so
# it no longer describes best_rf. Recompute the score for the model being saved.
best_rf_f1 = f1_score(y_test, best_rf.predict(X_test))

print(f"✅ Mejor modelo guardado en: {model_path}")
print(f"   Tipo: RandomForestClassifier")
print(f"   F1 Score: {best_rf_f1:.4f}")

## 8. Resumen

**Modelos entrenados:**
1. ✅ Logistic Regression (baseline)
2. ✅ Random Forest (optimizado con Optuna)
3. ✅ Gradient Boosting

**Herramientas usadas:**
- MLflow para tracking automático
- Optuna para optimización
- Azure ML para gestión central

**Próximo paso:** Evaluación detallada (notebook 04)