# Entrenamiento de Modelo Deep Learning con Nested Cross Validation

Este notebook implementa el entrenamiento de un modelo de Deep Learning para predicción de churn usando Nested Cross Validation y registra todo en MLflow.


In [None]:
# Import required libraries
import sys
import os

# The project modules below are imported as `src.<package>`, so the directory
# that CONTAINS `src/` must be importable, not `src/` itself.
# Assumes the notebook is executed from a folder directly under the project
# root (the other cells use '../data' and '../models') -- TODO confirm.
# '../src' is still registered for backward compatibility; the membership
# check keeps the cell idempotent when it is re-run in the same kernel.
for _extra_path in ('../src', '..'):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

import pandas as pd
import numpy as np
import mlflow
import mlflow.keras
from pathlib import Path

from src.domain.services.preprocessing_service import PreprocessingService
from src.domain.models.deep_learning_model import DeepLearningModel
from src.application.use_cases.train_model_use_case import TrainModelUseCase
from src.infrastructure.mlflow.mlflow_tracking import MLflowTracking
from src.config.settings import settings

# Echo the tracking configuration read from the project settings so the
# reader can see where this session will log.
print("Librer√≠as importadas correctamente")
print(f"MLflow Tracking URI: {settings.MLFLOW_TRACKING_URI}")
print(f"MLflow Experiment: {settings.MLFLOW_EXPERIMENT_NAME}")


## 1. Carga y Preparación de Datos


In [None]:
# Load the raw churn dataset from disk
df = pd.read_csv('../data/churn_data.csv')

# Report the frame's shape, then leave a preview of the first rows as the
# cell's displayed output.
print(
    f"Dimensiones del dataset: {df.shape}",
    "\nPrimeras filas:",
    sep="\n",
)
df.head()


In [None]:
# Create the preprocessing service used by the rest of the notebook
preprocessing_service = PreprocessingService()

# Run the preprocessing pipeline on the full frame to obtain features/target.
# NOTE(review): fit=True is applied to the whole dataset here. If this same
# fitted instance is reused inside the nested CV without being refit per fold,
# fold statistics could leak -- confirm against TrainModelUseCase.
X, y = preprocessing_service.preprocess_pipeline(df, fit=True)

print(f"Dimensiones de X: {X.shape}")
print(f"Dimensiones de y: {y.shape}")

# Class balance: absolute count and percentage for every distinct target value
print(f"\nDistribuci√≥n de clases:")
class_labels, class_counts = np.unique(y, return_counts=True)
for position in range(len(class_labels)):
    label, count = class_labels[position], class_counts[position]
    print(f"  Clase {label}: {count} ({count/len(y)*100:.2f}%)")


## 2. Configuración de MLflow


In [None]:
# Instantiate the project's MLflow tracking helper (used later to register
# the trained model).
mlflow_tracking = MLflowTracking()

# Select the experiment named in the project settings; MLflow creates it when
# it does not exist yet.
experiment_name = settings.MLFLOW_EXPERIMENT_NAME
mlflow.set_experiment(experiment_name)

print(
    f"Experimento configurado: {experiment_name}",
    f"Tracking URI: {mlflow.get_tracking_uri()}",
    sep="\n",
)


## 3. Definición de Hiperparámetros para Búsqueda


In [None]:
# Hyperparameter search grid.
# Only four settings vary between candidates, so they are listed as compact
# rows and expanded into full configuration dicts below.
_SEARCH_ROWS = (
    # (units_per_layer, dropout_rate, learning_rate, batch_size)
    ([64, 32], 0.2, 0.001, 32),
    ([128, 64], 0.3, 0.001, 32),
    ([128, 64, 32], 0.3, 0.0005, 64),
    ([256, 128], 0.4, 0.0005, 64),
)

# Key order is kept stable because it drives the order of the printout below.
hyperparameter_grid = [
    {
        'hidden_layers': len(layer_units),  # one entry per hidden layer
        'units_per_layer': list(layer_units),
        'dropout_rate': dropout,
        'learning_rate': step_size,
        'batch_size': batch,
        'epochs': 50,
        'activation': 'relu',
        'optimizer': 'adam',
    }
    for layer_units, dropout, step_size, batch in _SEARCH_ROWS
]

# Echo every candidate configuration so the run is self-documenting
print(f"Total de combinaciones de hiperpar√°metros: {len(hyperparameter_grid)}")
for number, config in enumerate(hyperparameter_grid, start=1):
    print(f"\nConfiguraci√≥n {number}:")
    for setting, chosen in config.items():
        print(f"  {setting}: {chosen}")


## 4. Entrenamiento con Nested Cross Validation


In [None]:
# Build the training use case around the preprocessing service created earlier
train_use_case = TrainModelUseCase(preprocessing_service)

# Nested CV layout: the outer folds give the final evaluation, the inner
# folds are used for hyperparameter selection.
outer_k, inner_k = 5, 3

print(
    "Configuraci√≥n de Nested Cross Validation:",
    f"  - Folds externos (evaluaci√≥n final): {outer_k}",
    f"  - Folds internos (selecci√≥n de hiperpar√°metros): {inner_k}",
    "\nIniciando entrenamiento...",
    sep="\n",
)


In [None]:
# Open the parent MLflow run that groups everything logged by this training.
# `parent_run` stays available to later cells that need to attach artifacts
# to the same run.
with mlflow.start_run(run_name="nested_cv_training") as parent_run:
    # General parameters describing the experiment setup
    mlflow.log_params({
        "outer_cv_folds": outer_k,
        "inner_cv_folds": inner_k,
        "dataset_size": len(df),
        "features_count": X.shape[1],
        "hyperparameter_combinations": len(hyperparameter_grid)
    })

    # Run the nested cross validation on the raw frame.
    # The returned dict is read below through the keys 'average_metrics',
    # 'best_hyperparameters', 'best_model' and (in later cells)
    # 'nested_cv_results'.
    results = train_use_case.nested_cross_validation(
        df=df,
        outer_k=outer_k,
        inner_k=inner_k,
        hyperparameter_grid=hyperparameter_grid,
        random_state=42
    )

    # Metrics averaged across folds
    mlflow.log_metrics(results['average_metrics'])

    # Winning hyperparameters, prefixed so they do not collide with the
    # general parameters logged above.
    mlflow.log_params({
        f"best_{name}": value
        for name, value in results['best_hyperparameters'].items()
    })

    # Persist and register the best model, when one was produced
    if results['best_model'] is not None:
        # Local copy of the model; make sure the target folder exists first so
        # a fresh checkout does not fail on a missing '../models' directory.
        model_path = "../models/best_churn_model"
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        results['best_model'].save_model(model_path)

        # Log the model as a run artifact only. `registered_model_name` is
        # intentionally NOT passed: it would already create a registry
        # version, and the explicit registration below would then add a
        # second one for the same artifact.
        # NOTE(review): assumes MLflowTracking.register_model_version creates
        # the version itself (it receives a model URI) -- confirm.
        mlflow.keras.log_model(
            results['best_model'].model,
            artifact_path="model"
        )

        # Register the logged artifact in the Model Registry under the
        # Production stage.
        model_uri = f"runs:/{mlflow.active_run().info.run_id}/model"
        mlflow_tracking.register_model_version(
            model_uri=model_uri,
            registered_model_name=settings.MODEL_NAME,
            stage="Production"
        )

        print(f"\n‚úÖ Modelo registrado en MLflow Model Registry:")
        print(f"   Nombre: {settings.MODEL_NAME}")
        print(f"   Stage: Production")
        print(f"   URI: {model_uri}")

    print("\n‚úÖ Entrenamiento completado exitosamente")


## 5. Resultados del Entrenamiento


In [None]:
# Print a detailed summary of the nested cross-validation results
banner = "="*60
print(banner)
print("RESUMEN DE RESULTADOS - NESTED CROSS VALIDATION")
print(banner)

# Metrics averaged across the outer folds
print("\nüìä M√âTRICAS PROMEDIO (en todos los folds externos):")
average_metrics = results['average_metrics']
for metric_name in average_metrics:
    print(f"   {metric_name}: {average_metrics[metric_name]:.4f}")

# Hyperparameters reported as best by the training use case
print("\nüîß MEJORES HIPERPAR√ÅMETROS:")
best_hyperparameters = results['best_hyperparameters']
for param_name in best_hyperparameters:
    print(f"   {param_name}: {best_hyperparameters[param_name]}")

# Test metrics for every individual outer fold
print("\nüìà RESULTADOS POR FOLD EXTERNO:")
for outer_fold in results['nested_cv_results']:
    print(f"\n   Fold {outer_fold['fold']}:")
    fold_metrics = outer_fold['test_metrics']
    for metric_name in fold_metrics:
        print(f"      {metric_name}: {fold_metrics[metric_name]:.4f}")


In [None]:
# Plot the per-fold test metrics next to their cross-fold average
import matplotlib.pyplot as plt

outer_folds = results['nested_cv_results']
fold_numbers = [entry['fold'] for entry in outer_folds]
f1_scores = [entry['test_metrics']['f1_score'] for entry in outer_folds]
accuracies = [entry['test_metrics']['accuracy'] for entry in outer_folds]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One row per panel: (axis, series, key into average_metrics, label, title,
# extra line styling). The first panel keeps matplotlib's default line color.
panels = [
    (axes[0], f1_scores, 'f1_score', 'F1-Score', 'F1-Score por Fold Externo', {}),
    (axes[1], accuracies, 'accuracy', 'Accuracy', 'Accuracy por Fold Externo', {'color': 'green'}),
]

for ax, series, metric_key, label, title, line_style in panels:
    # Per-fold values
    ax.plot(fold_numbers, series, marker='o', linewidth=2, markersize=8, label=label, **line_style)
    # Dashed reference line at the average reported for this metric
    ax.axhline(y=results['average_metrics'][metric_key], color='r', linestyle='--', label='Promedio')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Fold')
    ax.set_ylabel(label)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 6. Guardar Preprocessing Service

Es importante guardar el preprocessing service para poder usarlo en producción.


In [None]:
# Persist the preprocessing service with joblib and attach it to the
# training run in MLflow.
import pickle
import joblib

preprocessing_path = "../models/preprocessing_service.pkl"

# Make sure the output folder exists so joblib.dump does not fail on a fresh
# checkout.
os.makedirs(os.path.dirname(preprocessing_path), exist_ok=True)
joblib.dump(preprocessing_service, preprocessing_path)

print(f"‚úÖ Preprocessing service guardado en: {preprocessing_path}")

# Attach the file to the SAME run that holds the model. The training cell's
# `with` block has already closed that run, and calling mlflow.log_artifact
# with no active run would implicitly start a brand-new run (left open and
# unrelated to the model). Re-opening the parent run by id avoids that, and
# the context manager closes it again afterwards.
with mlflow.start_run(run_id=parent_run.info.run_id):
    mlflow.log_artifact(preprocessing_path, "preprocessing")
print("‚úÖ Preprocessing service registrado en MLflow")


## 7. Verificación del Modelo en MLflow

Verificar que el modelo se haya registrado correctamente en MLflow Model Registry.


In [None]:
# Verify that the model is present in the MLflow Model Registry
from mlflow.tracking import MlflowClient

client = MlflowClient(settings.MLFLOW_TRACKING_URI)

try:
    # The lookup returns a list that is empty when no version is in the
    # requested stage; handle that explicitly instead of letting `[0]` raise
    # an opaque "list index out of range".
    # NOTE(review): registry stages / get_latest_versions are deprecated in
    # recent MLflow releases -- confirm against the pinned MLflow version.
    production_versions = client.get_latest_versions(settings.MODEL_NAME, stages=["Production"])

    if not production_versions:
        print(f"‚ö†Ô∏è Error al verificar modelo: no hay versiones de '{settings.MODEL_NAME}' en stage Production")
    else:
        latest_version = production_versions[0]
        print(f"‚úÖ Modelo encontrado en Model Registry:")
        print(f"   Nombre: {latest_version.name}")
        print(f"   Versi√≥n: {latest_version.version}")
        print(f"   Stage: {latest_version.current_stage}")
        print(f"   Run ID: {latest_version.run_id}")

        # Show the metrics logged on the run that produced this version
        run = client.get_run(latest_version.run_id)
        print(f"\nüìä M√©tricas del modelo:")
        for key, value in run.data.metrics.items():
            print(f"   {key}: {value:.4f}")

except Exception as e:
    # Best-effort check: report the problem (e.g. unknown model name or an
    # unreachable tracking server) without aborting the notebook.
    print(f"‚ö†Ô∏è Error al verificar modelo: {e}")


## 8. Conclusión

El modelo ha sido entrenado exitosamente usando Nested Cross Validation y registrado en MLflow. 

**Próximos pasos:**
1. El modelo está disponible en MLflow Model Registry
2. La API puede cargar el modelo desde MLflow
3. El frontend puede consumir la API para realizar predicciones

**Para usar el modelo:**
- Accede a MLflow UI: `http://localhost:5000`
- El modelo está registrado como: `churn_deep_learning_model`
- Stage: `Production`
