In [1]:
import pred_lgbm as pred
import funciones_lgbm as f_lgbm
import pandas as pd
import numpy as np
import matplotlib as plt

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def train_model_ridge_optuna(data, test_size=0.2, random_state=42, n_trials=100, timeout=300):
    """
    Args:
        data: DataFrame completo con todas las columnas (incluyendo Salary)
        test_size: Proporción del conjunto de prueba
        random_state: Semilla aleatoria
        n_trials: Número máximo de pruebas de Optuna
        timeout: Tiempo límite en segundos para la optimización
    """
    print(f"\n🚀 Entrenando modelo Ridge Regression con Optuna + Features Estadísticos")
    print(f"   Trials: {n_trials}, Timeout: {timeout}s")
    
    # Importaciones necesarias
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    from sklearn.linear_model import Ridge
    from sklearn.preprocessing import StandardScaler
    import optuna
    import numpy as np
    import warnings
    
    # Silenciar warnings
    warnings.filterwarnings('ignore')
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    
    # ============= PASO 1: PREPARAR GROUPING INFO =============
    print("📊 Paso 1: Preparando información de agrupación...")
    data_with_groups, grouping_info = f_lgbm.create_and_save_grouping_info(data)
    all_job_cats, all_seniority_cats = f_lgbm.get_all_categories(data_with_groups)
    
    print(f"   ✅ Grupos creados: Age_group, Exp_group")
    print(f"   ✅ Job categories: {len(all_job_cats)}")
    print(f"   ✅ Seniority levels: {len(all_seniority_cats)}")
    
    # ============= PASO 2: SEPARAR TARGET Y FEATURES =============
    
    print("🔄 Paso 2: Separando target y features...")
    
    X_data = data_with_groups.drop('Salary', axis=1)  # Variables disponibles en producción
    y = data_with_groups['Salary']  # Target
    
    print(f"   📊 Datos originales: {X_data.shape}")
    print(f"   🎯 Target: {len(y)} registros")
    
    # ============= PASO 3: SPLIT PRINCIPAL TRAIN/TEST =============
    print("✂️  Paso 3: Split principal train/test...")
    X_train_base, X_test_base, y_train, y_test = train_test_split(
        X_data, y, test_size=test_size, random_state=random_state
    )
    
    print(f"   📈 Train: {X_train_base.shape[0]} registros")
    print(f"   📉 Test:  {X_test_base.shape[0]} registros")
    
    # ============= PASO 4: CREAR FEATURES CON ESTADÍSTICAS =============
    print("🔧 Paso 4: Creando features con estadísticas...")
    
    # Crear features en TRAIN (calcula estadísticas)
    X_train, feature_names, stats_dict = f_lgbm.create_features_with_stats(
        X_train_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=None,
        is_training=True
    )
    
    # Aplicar features a TEST (usa estadísticas de train)
    X_test, _ = f_lgbm.create_features_with_stats(
        X_test_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=stats_dict,
        is_training=False
    )
    
    print(f"   ✅ Features totales: {X_train.shape[1]}")
    print(f"   ✅ Train: {X_train.shape}")
    print(f"   ✅ Test:  {X_test.shape}")
    
    # ============= PASO 5: NORMALIZACIÓN DE FEATURES =============
    print("🔄 Paso 5: Normalizando features (importante para Ridge)...")
    
    # Inicializar scaler
    scaler = StandardScaler()
    
    # Ajustar scaler en train y transformar
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    print(f"   ✅ Features normalizadas")
    print(f"   ✅ Train scaled: {X_train_scaled.shape}")
    print(f"   ✅ Test scaled: {X_test_scaled.shape}")
    
    # ============= PASO 6: SPLIT PARA VALIDACIÓN DE OPTUNA =============
    print("🔄 Paso 6: Split para validación de Optuna...")
    X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
        X_train_scaled, y_train, test_size=0.2, random_state=random_state
    )
    
    print(f"   🎯 Train opt: {X_train_opt.shape}")
    print(f"   🔍 Validation: {X_val_opt.shape}")
    
    # ============= PASO 7: FUNCIÓN OBJETIVO PARA OPTUNA =============
    def objective(trial):
        """Función objetivo para Optuna con Ridge Regression"""
        
        # Hiperparámetros a optimizar para Ridge
        params = {
            'alpha': trial.suggest_float('alpha', 0.001, 1000.0, log=True),
            'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
            'solver': trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']),
            'max_iter': trial.suggest_int('max_iter', 1000, 10000),
            'tol': trial.suggest_float('tol', 1e-6, 1e-2, log=True),
            'random_state': random_state
        }
        
        # Crear y entrenar modelo
        model = Ridge(**params)
        
        try:
            # Entrenar modelo
            model.fit(X_train_opt, y_train_opt)
            
            # Predecir en conjunto de validación
            y_pred = model.predict(X_val_opt)
            rmse = np.sqrt(mean_squared_error(y_val_opt, y_pred))
            
            return rmse
            
        except Exception as e:
            # Si hay error, devolver un valor alto
            return float('inf')
    
    # ============= PASO 8: OPTIMIZACIÓN CON OPTUNA =============
    print("🎯 Paso 8: Optimizando hiperparámetros con Optuna...")
    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=random_state))
    
    # Optimizar
    study.optimize(objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)
    
    print(f"   ✅ Optimización completada: {len(study.trials)} trials realizados")
    print(f"   🏆 Mejor RMSE de validación: ${study.best_value:,.2f}")
    
    # ============= PASO 9: MODELO FINAL =============
    print("🏆 Paso 9: Entrenando modelo final...")
    
    # Obtener mejores parámetros
    best_params = study.best_params.copy()
    best_params['random_state'] = random_state
    
    print("   📋 Mejores hiperparámetros encontrados:")
    for param, value in best_params.items():
        if param != 'random_state':
            print(f"      {param}: {value}")
    
    # Entrenar modelo final con mejores parámetros
    final_model = Ridge()
    
    try:
        # Entrenar en todo el conjunto de entrenamiento escalado
        final_model.fit(X_train_scaled, y_train)
        
        # ============= PASO 10: EVALUACIÓN FINAL =============
        print("📊 Paso 10: Evaluación final...")
        
        # Predicciones finales
        y_pred = final_model.predict(X_test_scaled)
        
        # Métricas en conjunto de prueba
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        # Cross-validation con el modelo optimizado
        print("   🔄 Realizando validación cruzada final...")
        cv_model = Ridge(**best_params)
        cv_scores = cross_val_score(
            cv_model, X_train_scaled, y_train, cv=5, 
            scoring='neg_mean_squared_error', n_jobs=-1
        )
        cv_rmse = np.sqrt(-cv_scores.mean())
        cv_std = np.sqrt(cv_scores.std())
        
        # ============= PASO 11: PREPARAR RESULTADOS =============
        print("📦 Paso 11: Preparando resultados finales...")
        
        # Calcular importancia de features usando coeficientes
        feature_importance = np.abs(final_model.coef_)
        
        # Resultados del modelo
        model_metrics = {
            'model': final_model,
            'scaler': scaler,  # Importante guardar el scaler
            'rmse': rmse,
            'r2': r2,
            'mae': mae,
            'cv_rmse': cv_rmse,
            'cv_std': cv_std,
            'predictions': y_pred,
            'feature_importances': feature_importance,
            'coefficients': final_model.coef_,
            'intercept': final_model.intercept_,
            'best_params': best_params,
            'optuna_study': study
        }
        
        # Resultado completo para compatibilidad
        final_results = {
            'model_results': {'Ridge_Optuna': model_metrics},
            'best_model_name': 'Ridge_Optuna',
            'best_model': final_model,
            'scaler': scaler,  # Importante para predictions futuras
            'feature_names': feature_names,
            'job_categories': all_job_cats,
            'seniority_categories': all_seniority_cats,
            'stats_dict': stats_dict,
            'grouping_info': grouping_info,
            'X_test': X_test_scaled,  # Datos escalados
            'y_test': y_test,
            'X_train': X_train_scaled,  # Datos escalados
            'y_train': y_train,
            'optimization_study': study
        }
        
        # ============= MOSTRAR RESULTADOS =============
        print(f"\n🎉 RESULTADOS FINALES:")
        print(f"   RMSE: ${rmse:,.2f}")
        print(f"   R²: {r2:.3f}")
        print(f"   MAE: ${mae:,.2f}")
        print(f"   CV RMSE: ${cv_rmse:,.2f} (±{cv_std:,.2f})")
        print(f"   Features totales: {len(feature_names)}")
        print(f"   Alpha óptimo: {best_params['alpha']:.6f}")
        print(f"   Mejora vs RMSE de validación: {((study.best_value - rmse) / study.best_value * 100):+.2f}%")
        print(f"   Coeficientes no-cero: {np.sum(np.abs(final_model.coef_) > 1e-10)}")
        
        return final_results
        
    except Exception as e:
        print(f"❌ Error entrenando modelo final: {str(e)}")
        import traceback
        print(traceback.format_exc())
        return None

In [7]:
def train_model_ridge_optuna_cv(data, test_size=0.2, random_state=42, n_trials=100, timeout=300):
    """
    Entrena Ridge Regression con y sin Optuna, usando validación cruzada interna para Optuna.
    Compara ambos en el mismo test set y devuelve resultados completos.
    """

    print(f"\n🚀 Entrenando Ridge Regression (default vs Optuna)")
    print(f"   Trials: {n_trials}, Timeout: {timeout}s")

    # Importaciones
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    from sklearn.linear_model import Ridge
    from sklearn.preprocessing import StandardScaler
    import optuna
    import numpy as np
    import warnings

    warnings.filterwarnings('ignore')
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # ===================== Preprocesamiento =====================
    print("🔄 Preprocesando datos...")

    data_with_groups, grouping_info = f_lgbm.create_and_save_grouping_info(data)
    all_job_cats, all_seniority_cats = f_lgbm.get_all_categories(data_with_groups)

    X_data = data_with_groups.drop('Salary', axis=1)
    y = data_with_groups['Salary']

    X_train_base, X_test_base, y_train, y_test = train_test_split(
        X_data, y, test_size=test_size, random_state=random_state
    )

    X_train, feature_names, stats_dict = f_lgbm.create_features_with_stats(
        X_train_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=None,
        is_training=True
    )

    X_test, _ = f_lgbm.create_features_with_stats(
        X_test_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=stats_dict,
        is_training=False
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # ===================== Ridge default =====================
    print("\n✅ Entrenando Ridge (default)...")
    ridge_default = Ridge()
    ridge_default.fit(X_train_scaled, y_train)

    y_pred_default = ridge_default.predict(X_test_scaled)
    rmse_default = np.sqrt(mean_squared_error(y_test, y_pred_default))
    r2_default = r2_score(y_test, y_pred_default)
    mae_default = mean_absolute_error(y_test, y_pred_default)

    # ===================== Optuna objective con CV =====================
    print("\n🎯 Optimizando hiperparámetros con Optuna (CV interno)...")

    def objective(trial):
        params = {
            'alpha': trial.suggest_float('alpha', 0.001, 1000.0, log=True),
            'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
            'solver': trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']),
            'max_iter': trial.suggest_int('max_iter', 1000, 10000),
            'tol': trial.suggest_float('tol', 1e-6, 1e-2, log=True),
            'random_state': random_state
        }

        model = Ridge(**params)
        scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        rmse = np.sqrt(-scores.mean())
        return rmse

    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=random_state))
    study.optimize(objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)

    best_params = study.best_params.copy()
    best_params['random_state'] = random_state

    print("   📋 Mejores hiperparámetros encontrados:")
    for param, value in best_params.items():
        if param != 'random_state':
            print(f"      {param}: {value}")

    # ===================== Ridge con best_params =====================
    print("\n✅ Entrenando Ridge (Optuna)...")
    ridge_optuna = Ridge(**best_params)
    ridge_optuna.fit(X_train_scaled, y_train)

    y_pred_optuna = ridge_optuna.predict(X_test_scaled)
    rmse_optuna = np.sqrt(mean_squared_error(y_test, y_pred_optuna))
    r2_optuna = r2_score(y_test, y_pred_optuna)
    mae_optuna = mean_absolute_error(y_test, y_pred_optuna)

    # ===================== Comparación final =====================
    print("\n🏁 Comparación final en Test set:")
    print(f"   ➡ Ridge Default:  RMSE = ${rmse_default:,.2f} | R² = {r2_default:.3f} | MAE = ${mae_default:,.2f}")
    print(f"   ➡ Ridge Optuna:   RMSE = ${rmse_optuna:,.2f} | R² = {r2_optuna:.3f} | MAE = ${mae_optuna:,.2f}")

    if rmse_optuna < rmse_default:
        print("✅ 🏆 Optuna mejora el modelo final.")
        best_model = ridge_optuna
        best_model_name = "Ridge_Optuna"
    else:
        print("⚠️ 🔎 El modelo default es mejor. Se recomienda mantenerlo.")
        best_model = ridge_default
        best_model_name = "Ridge_Default"

    # ===================== Resultados finales =====================
    final_results = {
        'ridge_default': {
            'model': ridge_default,
            'rmse': rmse_default,
            'r2': r2_default,
            'mae': mae_default,
            'predictions': y_pred_default,
        },
        'ridge_optuna': {
            'model': ridge_optuna,
            'rmse': rmse_optuna,
            'r2': r2_optuna,
            'mae': mae_optuna,
            'predictions': y_pred_optuna,
            'best_params': best_params,
            'optuna_study': study,
        },
        'best_model_name': best_model_name,
        'best_model': best_model,
        'scaler': scaler,
        'feature_names': feature_names,
        'job_categories': all_job_cats,
        'seniority_categories': all_seniority_cats,
        'stats_dict': stats_dict,
        'grouping_info': grouping_info,
        'X_test': X_test_scaled,
        'y_test': y_test,
        'X_train': X_train_scaled,
        'y_train': y_train,
    }

    return final_results


In [2]:
def train_model_random_forest_optuna_cv(data, test_size=0.2, random_state=42, n_trials=100, timeout=300):
    """
    Entrena Random Forest con y sin Optuna, usando validación cruzada interna para Optuna.
    Compara ambos en el mismo test set y devuelve resultados completos.
    """

    print(f"\n🚀 Entrenando Random Forest (default vs Optuna)")
    print(f"   Trials: {n_trials}, Timeout: {timeout}s")

    # Importaciones
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    from sklearn.ensemble import RandomForestRegressor
    import optuna
    import numpy as np
    import warnings

    warnings.filterwarnings('ignore')
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # ===================== Preprocesamiento =====================
    print("🔄 Preprocesando datos...")

    data_with_groups, grouping_info = f_lgbm.create_and_save_grouping_info(data)
    all_job_cats, all_seniority_cats = f_lgbm.get_all_categories(data_with_groups)

    X_data = data_with_groups.drop('Salary', axis=1)
    y = data_with_groups['Salary']

    X_train_base, X_test_base, y_train, y_test = train_test_split(
        X_data, y, test_size=test_size, random_state=random_state
    )

    X_train, feature_names, stats_dict = f_lgbm.create_features_with_stats(
        X_train_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=None,
        is_training=True
    )

    X_test, _ = f_lgbm.create_features_with_stats(
        X_test_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=stats_dict,
        is_training=False
    )

    # Random Forest no necesita escalado, pero lo mantenemos para consistencia
    print(f"✅ Features preparadas: {X_train.shape}")

    # ===================== Random Forest default =====================
    print("\n✅ Entrenando Random Forest (default)...")
    rf_default = RandomForestRegressor(random_state=random_state)
    rf_default.fit(X_train, y_train)

    y_pred_default = rf_default.predict(X_test)
    rmse_default = np.sqrt(mean_squared_error(y_test, y_pred_default))
    r2_default = r2_score(y_test, y_pred_default)
    mae_default = mean_absolute_error(y_test, y_pred_default)

    # ===================== Optuna objective con CV =====================
    print("\n🎯 Optimizando hiperparámetros con Optuna (CV interno)...")

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 30),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None]),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
            'max_samples': trial.suggest_float('max_samples', 0.5, 1.0) if trial.suggest_categorical('bootstrap_check', [True, False]) else None,
            'random_state': random_state,
            'n_jobs': -1
        }
        
        # Limpiar parámetros None o incompatibles
        if not params['bootstrap']:
            params.pop('max_samples', None)
        if params['max_features'] == 'auto':
            params['max_features'] = 'sqrt'  # 'auto' está deprecated
        
        model = RandomForestRegressor(**params)
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        rmse = np.sqrt(-scores.mean())
        return rmse

    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=random_state))
    study.optimize(objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)

    best_params = study.best_params.copy()
    best_params['random_state'] = random_state
    best_params['n_jobs'] = -1
    
    # Limpiar parámetros de Optuna que no son del modelo
    best_params.pop('bootstrap_check', None)
    
    # Limpiar parámetros incompatibles
    if not best_params.get('bootstrap', True):
        best_params.pop('max_samples', None)
    if best_params.get('max_features') == 'auto':
        best_params['max_features'] = 'sqrt'

    print("   📋 Mejores hiperparámetros encontrados:")
    for param, value in best_params.items():
        if param not in ['random_state', 'n_jobs']:
            print(f"      {param}: {value}")

    # ===================== Random Forest con best_params =====================
    print("\n✅ Entrenando Random Forest (Optuna)...")
    rf_optuna = RandomForestRegressor(**best_params)
    rf_optuna.fit(X_train, y_train)

    y_pred_optuna = rf_optuna.predict(X_test)
    rmse_optuna = np.sqrt(mean_squared_error(y_test, y_pred_optuna))
    r2_optuna = r2_score(y_test, y_pred_optuna)
    mae_optuna = mean_absolute_error(y_test, y_pred_optuna)

    # ===================== Comparación final =====================
    print("\n🏁 Comparación final en Test set:")
    print(f"   ➡ Random Forest Default:  RMSE = ${rmse_default:,.2f} | R² = {r2_default:.3f} | MAE = ${mae_default:,.2f}")
    print(f"   ➡ Random Forest Optuna:   RMSE = ${rmse_optuna:,.2f} | R² = {r2_optuna:.3f} | MAE = ${mae_optuna:,.2f}")

    if rmse_optuna < rmse_default:
        print("✅ 🏆 Optuna mejora el modelo final.")
        best_model = rf_optuna
        best_model_name = "RandomForest_Optuna"
    else:
        print("⚠️ 🔎 El modelo default es mejor. Se recomienda mantenerlo.")
        best_model = rf_default
        best_model_name = "RandomForest_Default"

    # ===================== Feature Importance =====================
    print("\n🔍 Top 10 Features más importantes:")
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False).head(10)
    
    for idx, row in feature_importance.iterrows():
        print(f"   {row['feature']}: {row['importance']:.4f}")

    # ===================== Resultados finales =====================
    final_results = {
        'rf_default': {
            'model': rf_default,
            'rmse': rmse_default,
            'r2': r2_default,
            'mae': mae_default,
            'predictions': y_pred_default,
        },
        'rf_optuna': {
            'model': rf_optuna,
            'rmse': rmse_optuna,
            'r2': r2_optuna,
            'mae': mae_optuna,
            'predictions': y_pred_optuna,
            'best_params': best_params,
            'optuna_study': study,
        },
        'best_model_name': best_model_name,
        'best_model': best_model,
        'scaler': None,  # Random Forest no necesita scaler
        'feature_names': feature_names,
        'feature_importance': feature_importance,
        'job_categories': all_job_cats,
        'seniority_categories': all_seniority_cats,
        'stats_dict': stats_dict,
        'grouping_info': grouping_info,
        'X_test': X_test,
        'y_test': y_test,
        'X_train': X_train,
        'y_train': y_train,
    }

    return final_results

# Función adicional para guardar el modelo Random Forest
def save_random_forest_model(results, filename="../../../modelos/random_forest_optuna.pkl"):
    """
    Guarda el mejor modelo Random Forest con formato compatible
    """
    print("💾 Guardando modelo Random Forest...")
    
    best_model = results['best_model']
    best_name = results['best_model_name']
    
    # Calcular métricas del mejor modelo
    if 'optuna' in best_name.lower():
        metrics = {
            'rmse': results['rf_optuna']['rmse'],
            'r2': results['rf_optuna']['r2'],
            'mae': results['rf_optuna']['mae']
        }
    else:
        metrics = {
            'rmse': results['rf_default']['rmse'],
            'r2': results['rf_default']['r2'],
            'mae': results['rf_default']['mae']
        }
    
    # Crear paquete compatible
    model_package = {
        'model': best_model,
        'model_name': best_name,
        'feature_names': results['feature_names'],
        'job_categories': results['job_categories'],
        'seniority_categories': results['seniority_categories'],
        'stats_dict': results['stats_dict'],
        'grouping_info': results['grouping_info'],
        'total_features': len(results['feature_names']),
        'training_data_shape': results['X_train'].shape,
        'has_statistical_features': True,
        'is_ensemble': False,
        'feature_importance': results['feature_importance'],
        'metrics': metrics
    }
    
    try:
        import joblib
        joblib.dump(model_package, filename)
        print(f"✅ Modelo guardado en: {filename}")
        print(f"   🎯 Modelo: {best_name}")
        print(f"   📊 RMSE: ${metrics['rmse']:,.2f}")
        print(f"   📈 R²: {metrics['r2']:.3f}")
        print(f"   📉 MAE: ${metrics['mae']:,.2f}")
        return True
    except Exception as e:
        print(f"❌ Error guardando modelo: {e}")
        return False

# Función de ejemplo de uso
def example_usage():
    """
    Ejemplo de cómo usar la función
    """
    import pandas as pd
    
    # Cargar datos
    data = pd.read_csv('../../../dataC/imputado.csv').dropna()
    
    # Entrenar modelo
    results = train_model_random_forest_optuna_cv(
        data=data,
        test_size=0.2,
        random_state=42,
        n_trials=50,  # Reducir para pruebas rápidas
        timeout=300
    )
    
    # Guardar modelo
    save_random_forest_model(results)
    
    return results

In [6]:
def pipeline_fe():
    print("🚀 INICIANDO PIPELINE")

    # 1 . Cargar datos
    data = pd.read_csv('../../../dataC/imputado.csv')
    data["Description"] = data["Description"].fillna("")
    
    data = data.dropna()
    # Se mejora levemente los errores realizando esta imputación y dejando los otros nulos de los otros features.
    
    # Entrenar modelo
    model_results =train_model_random_forest_optuna_cv(data)

    # Analizar optimización
    #best_params, best_value = f_lgbm.analyze_optuna_optimization(model_results['optimization_study'])

    # Visualizar proceso
    #plot_optuna_optimization(model_results['optimization_study'])

    # 5. Analizar importancia
    #feature_importance = fun.analyze_feature_importance(X,feature_names,model)

    # 6. Analizar predicciones
    #predictions_analysis = f.analyze_predictions(model_results)

    # 7. Comparar modelos
    #fun.create_comparison_chart(model_results)

    print("\n🎉 ANÁLISIS COMPLETADO!")
    print("="*50)

    # Normalizar nombre
    best_name = model_results['best_model_name']
    """
    # Asegurar consistencia
    if best_name.lower() == 'ridge_optuna':
        best_name_key = 'ridge_optuna'
    elif best_name.lower() == 'ridge_default':
        best_name_key = 'ridge_default'
    else:
        raise ValueError(f"Nombre de modelo inesperado: {best_name}")

    """
    # Obtener el dict
    #best_result = model_results[best_name_key]

    #print(f"\n🏆 RESUMEN FINAL:")
    #print(f"   Mejor modelo: {best_name}")
    #print(f"   RMSE: ${best_result['rmse']:,.2f}")
    #print(f"   R²: {best_result['r2']:.3f}")
    #print(f"   MAE: ${best_result['mae']:,.2f}")
        
    

     
    return model_results,data
    # ,model,X,feature_importance
    
results,data=pipeline_fe()   

🚀 INICIANDO PIPELINE

🚀 Entrenando Random Forest (default vs Optuna)
   Trials: 100, Timeout: 300s
🔄 Preprocesando datos...
📊 Creando grupos y guardando información de rangos...
🔧 Creando características completas para producción (originales + estadísticos)...
🔧 Creando todas las características ...
✅ Creadas 67 características en total
   - Variables numéricas básicas: 3
   - Variables de educación: 3
   - Variables de job category: 12
   - Variables de seniority: 6
   - Variables de texto: 4
   - Ratios y scores: 5
📊 Creando features estadísticos para producción (TRAIN)...
   🔄 Calculando estadísticas en TRAIN (solo variables de producción)...
   ✅ Estadísticas calculadas para 7 grupos
   ✅ Creadas 32 features estadísticos para producción
✅ Features totales para producción: 99
   - Originales: 67
   - Estadísticos: 32
🔧 Creando características completas para producción (originales + estadísticos)...
🔧 Creando todas las características ...
✅ Creadas 67 características en total
   - Va

Best trial: 75. Best value: 11928.3: 100%|██████████| 100/100 [00:43<00:00,  2.29it/s, 43.59/300 seconds]


   📋 Mejores hiperparámetros encontrados:
      n_estimators: 436
      max_depth: 30
      min_samples_split: 3
      min_samples_leaf: 1
      max_features: log2
      bootstrap: False

✅ Entrenando Random Forest (Optuna)...

🏁 Comparación final en Test set:
   ➡ Random Forest Default:  RMSE = $17,587.88 | R² = 0.877 | MAE = $9,698.65
   ➡ Random Forest Optuna:   RMSE = $15,361.44 | R² = 0.906 | MAE = $8,881.49
✅ 🏆 Optuna mejora el modelo final.

🔍 Top 10 Features más importantes:
   age_vs_gender_mean: 0.0553
   age_exp_interaction: 0.0473
   age_rank_in_gender: 0.0437
   age: 0.0431
   start_year: 0.0429
   exp_percentile_global: 0.0407
   age_zscore_global: 0.0395
   experience_age_ratio: 0.0394
   age_edu: 0.0354
   age_zscore_vs_gender: 0.0353

🎉 ANÁLISIS COMPLETADO!


In [10]:

def predict_random_forest_results(new_data, rf_results):
    """
    Función especial para predecir con los resultados de Random Forest
    Compatible con la estructura devuelta por train_model_random_forest_optuna_cv()
    """
    print("🌲 Predicción con Random Forest (función especial)...")
    
    # Extraer el mejor modelo y componentes necesarios
    best_model = rf_results.get('best_model')
    best_model_name = rf_results.get('best_model_name', 'Unknown')
    feature_names = rf_results.get('feature_names', [])
    job_categories = rf_results.get('job_categories', [])
    seniority_categories = rf_results.get('seniority_categories', [])
    stats_dict = rf_results.get('stats_dict', {})
    grouping_info = rf_results.get('grouping_info', {})
    scaler = rf_results.get('scaler')  # None para Random Forest
    
    print(f"   🤖 Modelo: {best_model_name}")
    print(f"   🔢 Features disponibles: {len(feature_names)}")
    
    # Verificar que tenemos todo lo necesario
    if best_model is None:
        print("❌ Error: No hay modelo en rf_results")
        return None
    
    if not hasattr(best_model, 'predict'):
        print("❌ Error: El modelo no tiene método predict")
        return None
    
    # Crear grupos si no existen
    input_data_copy = new_data.copy()
    if 'Exp_group' not in input_data_copy.columns or 'Age_group' not in input_data_copy.columns:
        for idx, row in input_data_copy.iterrows():
            # Usar la función calculate_groups de tu librería
            exp_group, age_group = pred.calculate_groups(
                age=row['Age'], 
                years_of_experience=row['Years_of_Experience'], 
                grouping_info=grouping_info
            )
            input_data_copy.at[idx, 'Exp_group'] = exp_group
            input_data_copy.at[idx, 'Age_group'] = age_group
    
    # Crear features usando tu función de la librería
    try:
        X_features, created_feature_names = f_lgbm.create_features_with_stats(
            input_data_copy,
            all_job_categories=job_categories,
            all_seniority_levels=seniority_categories,
            stats_dict=stats_dict,
            is_training=False
        )
        
        print(f"   ✅ Features creadas: {X_features.shape}")
        
    except Exception as e:
        print(f"❌ Error creando features: {e}")
        return None
    
    # Verificar dimensiones
    if len(created_feature_names) != len(feature_names):
        print(f"   ⚠️ Ajustando dimensiones: {len(created_feature_names)} → {len(feature_names)}")
        
        # Alinear con features del modelo entrenado
        X_aligned = pd.DataFrame(0, index=X_features.index, columns=feature_names)
        
        # Llenar con los valores disponibles
        for col in X_features.columns:
            if col in X_aligned.columns:
                X_aligned[col] = X_features[col]
        
        X_features = X_aligned
        print(f"   ✅ Dimensiones alineadas: {X_features.shape}")
    
    # Hacer predicción
    try:
        # Random Forest no necesita escalado
        prediction = best_model.predict(X_features)[0]
        
        print(f"   💰 Predicción Random Forest: ${prediction:,.2f}")
        print(f"   ✅ Predicción exitosa")
        
        # Mostrar métricas del modelo si están disponibles
        if 'optuna' in best_model_name.lower():
            metrics = rf_results.get('rf_optuna', {})
        else:
            metrics = rf_results.get('rf_default', {})
        
        if metrics:
            print(f"   📊 Métricas del modelo:")
            print(f"      RMSE: ${metrics.get('rmse', 0):,.2f}")
            print(f"      R²: {metrics.get('r2', 0):.3f}")
            print(f"      MAE: ${metrics.get('mae', 0):,.2f}")
        
        return prediction
        
    except Exception as e:
        print(f"❌ Error en predicción: {e}")
        return None

def test_random_forest_prediction(rf_results):
    """
    Test específico para Random Forest
    """
    print("🧪 Testing predicción Random Forest...")
    
    # Crear registro de prueba
    test_record = pd.DataFrame({
        'Age': [60],
        'Gender': ['Male'],
        'Education_Level': ["PhD"],
        'Job_Title': ['CEO'],
        'Years_of_Experience': [24],
        'Description': ['I work with machine learning models and data analysis']
    })
    
    print(f"📋 Registro de prueba:")
    print(f"   👤 Edad: {test_record['Age'][0]} años")
    print(f"   👨 Género: {test_record['Gender'][0]}")
    print(f"   🎓 Educación: {test_record['Education_Level'][0]}")
    print(f"   💼 Puesto: {test_record['Job_Title'][0]}")
    print(f"   📈 Experiencia: {test_record['Years_of_Experience'][0]} años")
    
    try:
        prediction = predict_random_forest_results(test_record, rf_results)
        
        if prediction is not None:
            print(f"✅ Test exitoso: Predicción = ${prediction:,.2f}")
            return True
        else:
            print("❌ Test falló: La predicción devolvió None")
            return False
            
    except Exception as e:
        print(f"❌ Test falló: {e}")
        return False

In [12]:
# Después de entrenar tu modelo Random Forest
rf_results = results
# Para hacer predicciones individuales
test_record = pd.DataFrame({
    'Age': [60],
    'Gender': ['Male'],
    'Education_Level': ["PhD"],
    'Job_Title': ['CEO'],
    'Years_of_Experience': [24],
    'Description': ['I work with machine learning models']
})

prediction = predict_random_forest_results(test_record, rf_results)

# Para hacer tests
test_random_forest_prediction(rf_results)

🌲 Predicción con Random Forest (función especial)...
   🤖 Modelo: RandomForest_Optuna
   🔢 Features disponibles: 99
🔧 Creando características completas para producción (originales + estadísticos)...
🔧 Creando todas las características ...
✅ Creadas 67 características en total
   - Variables numéricas básicas: 3
   - Variables de educación: 3
   - Variables de job category: 12
   - Variables de seniority: 6
   - Variables de texto: 4
   - Ratios y scores: 5
📊 Creando features estadísticos para producción (PREDICT)...
   📥 Usando estadísticas pre-calculadas de TRAIN...
   ✅ Creadas 26 features estadísticos para producción
✅ Features totales para producción: 93
   - Originales: 67
   - Estadísticos: 26
   ✅ Features creadas: (1, 93)
   ⚠️ Ajustando dimensiones: 93 → 99
   ✅ Dimensiones alineadas: (1, 99)
   💰 Predicción Random Forest: $189,403.67
   ✅ Predicción exitosa
   📊 Métricas del modelo:
      RMSE: $15,361.44
      R²: 0.906
      MAE: $8,881.49
🧪 Testing predicción Random Forest

True

In [23]:
def create_multiple_random_forest_ensemble(data, n_forests=10, test_size=0.2, random_state=42):
    """
    Crea un ensemble de múltiples Random Forest con diferentes configuraciones
    """
    print(f"🌲 Creando ENSEMBLE de {n_forests} Random Forest...")
    
    # Importaciones necesarias
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    import numpy as np
    
    # Preparar datos igual que antes
    data_with_groups, grouping_info = f_lgbm.create_and_save_grouping_info(data)
    all_job_cats, all_seniority_cats = f_lgbm.get_all_categories(data_with_groups)
    
    X_data = data_with_groups.drop('Salary', axis=1)
    y = data_with_groups['Salary']
    
    X_train_base, X_test_base, y_train, y_test = train_test_split(
        X_data, y, test_size=test_size, random_state=random_state
    )
    
    X_train, feature_names, stats_dict = f_lgbm.create_features_with_stats(
        X_train_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=None,
        is_training=True
    )
    
    X_test, _ = f_lgbm.create_features_with_stats(
        X_test_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=stats_dict,
        is_training=False
    )
    
    print(f"✅ Datos preparados: Train {X_train.shape}, Test {X_test.shape}")
    
    # ============================================================================
    # CONFIGURACIONES DIFERENTES PARA CADA RANDOM FOREST
    # ============================================================================
    
    rf_configs = [
        {
            'name': 'RF_Deep',
            'params': {
                'n_estimators': 200,
                'max_depth': 25,
                'min_samples_split': 2,
                'min_samples_leaf': 1,
                'max_features': 'sqrt',
                'bootstrap': True,
                'random_state': random_state + 1
            }
        },
        {
            'name': 'RF_Wide',
            'params': {
                'n_estimators': 150,
                'max_depth': 15,
                'min_samples_split': 5,
                'min_samples_leaf': 2,
                'max_features': 'log2',
                'bootstrap': True,
                'random_state': random_state + 2
            }
        },
        {
            'name': 'RF_Conservative',
            'params': {
                'n_estimators': 100,
                'max_depth': 10,
                'min_samples_split': 10,
                'min_samples_leaf': 4,
                'max_features': 0.8,
                'bootstrap': True,
                'random_state': random_state + 3
            }
        },
        {
            'name': 'RF_Aggressive',
            'params': {
                'n_estimators': 300,
                'max_depth': 30,
                'min_samples_split': 2,
                'min_samples_leaf': 1,
                'max_features': None,
                'bootstrap': True,
                'random_state': random_state + 4
            }
        },
        {
            'name': 'RF_Balanced',
            'params': {
                'n_estimators': 150,
                'max_depth': 20,
                'min_samples_split': 4,
                'min_samples_leaf': 2,
                'max_features': 'sqrt',
                'bootstrap': True,
                'random_state': random_state + 5
            }
        }
    ]
    
    # Usar solo el número de bosques solicitado
    rf_configs = rf_configs[:n_forests]
    
    # ============================================================================
    # ENTRENAR MÚLTIPLES RANDOM FOREST
    # ============================================================================
    
    rf_models = {}
    all_predictions = []
    
    for i, config in enumerate(rf_configs):
        name = config['name']
        params = config['params']
        
        print(f"🚀 Entrenando {name} ({i+1}/{n_forests})...")
        print(f"   📊 Parámetros: n_estimators={params['n_estimators']}, max_depth={params['max_depth']}")
        
        try:
            # Crear y entrenar Random Forest
            rf_model = RandomForestRegressor(**params, n_jobs=-1)
            rf_model.fit(X_train, y_train)
            
            # Hacer predicciones
            y_pred = rf_model.predict(X_test)
            
            # Calcular métricas
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            
            rf_models[name] = {
                'model': rf_model,
                'rmse': rmse,
                'r2': r2,
                'mae': mae,
                'predictions': y_pred,
                'config': config
            }
            
            all_predictions.append(y_pred)
            
            print(f"   ✅ RMSE: ${rmse:,.2f}, R²: {r2:.3f}, MAE: ${mae:,.2f}")
            
        except Exception as e:
            print(f"   ❌ Error entrenando {name}: {e}")
    
    # ============================================================================
    # CREAR ENSEMBLE DE RANDOM FORESTS
    # ============================================================================
    
    print(f"\n🎯 Creando ensemble de {len(rf_models)} Random Forest...")
    
    if len(all_predictions) == 0:
        print("❌ No se pudo entrenar ningún Random Forest")
        return None
    
    # Método 1: Promedio simple
    ensemble_simple = np.mean(all_predictions, axis=0)
    simple_rmse = np.sqrt(mean_squared_error(y_test, ensemble_simple))
    simple_r2 = r2_score(y_test, ensemble_simple)
    simple_mae = mean_absolute_error(y_test, ensemble_simple)
    
    # Método 2: Promedio ponderado por R²
    valid_models = {name: res for name, res in rf_models.items() if res['r2'] > 0}
    r2_scores = [res['r2'] for res in valid_models.values()]
    weights = np.array(r2_scores)
    weights = weights / weights.sum()
    
    valid_predictions = [res['predictions'] for res in valid_models.values()]
    ensemble_weighted = np.average(valid_predictions, axis=0, weights=weights)
    weighted_rmse = np.sqrt(mean_squared_error(y_test, ensemble_weighted))
    weighted_r2 = r2_score(y_test, ensemble_weighted)
    weighted_mae = mean_absolute_error(y_test, ensemble_weighted)
    
    # Método 3: Mediana (más robusto a outliers)
    ensemble_median = np.median(all_predictions, axis=0)
    median_rmse = np.sqrt(mean_squared_error(y_test, ensemble_median))
    median_r2 = r2_score(y_test, ensemble_median)
    median_mae = mean_absolute_error(y_test, ensemble_median)
    
    # ============================================================================
    # COMPARAR MÉTODOS DE ENSEMBLE
    # ============================================================================
    
    print(f"\n📊 Comparación de métodos de ensemble:")
    print(f"   🔹 Promedio Simple:    RMSE=${simple_rmse:,.2f}, R²={simple_r2:.3f}, MAE=${simple_mae:,.2f}")
    print(f"   🔹 Promedio Ponderado: RMSE=${weighted_rmse:,.2f}, R²={weighted_r2:.3f}, MAE=${weighted_mae:,.2f}")
    print(f"   🔹 Mediana:           RMSE=${median_rmse:,.2f}, R²={median_r2:.3f}, MAE=${median_mae:,.2f}")
    
    # Determinar el mejor método
    ensemble_methods = {
        'simple': {'rmse': simple_rmse, 'r2': simple_r2, 'mae': simple_mae, 'predictions': ensemble_simple},
        'weighted': {'rmse': weighted_rmse, 'r2': weighted_r2, 'mae': weighted_mae, 'predictions': ensemble_weighted},
        'median': {'rmse': median_rmse, 'r2': median_r2, 'mae': median_mae, 'predictions': ensemble_median}
    }
    
    best_method = min(ensemble_methods.keys(), key=lambda x: ensemble_methods[x]['rmse'])
    best_metrics = ensemble_methods[best_method]
    
    print(f"\n🏆 Mejor método: {best_method.upper()}")
    print(f"   📊 RMSE: ${best_metrics['rmse']:,.2f}")
    print(f"   📈 R²: {best_metrics['r2']:.3f}")
    print(f"   📉 MAE: ${best_metrics['mae']:,.2f}")
    
    # ============================================================================
    # FEATURE IMPORTANCE COMBINADA
    # ============================================================================
    
    print(f"\n🔍 Calculando feature importance combinada...")
    
    # Combinar feature importance de todos los modelos
    all_importances = []
    for name, result in rf_models.items():
        importance = result['model'].feature_importances_
        all_importances.append(importance)
    
    # Promedio de importancias
    avg_importance = np.mean(all_importances, axis=0)
    
    # Crear DataFrame de feature importance
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': avg_importance
    }).sort_values('importance', ascending=False)
    
    print(f"🔍 Top 10 Features más importantes (promedio):")
    for i, row in feature_importance_df.head(10).iterrows():
        print(f"   {row['feature']}: {row['importance']:.4f}")
    
    # ============================================================================
    # RESULTADOS FINALES
    # ============================================================================
    
    ensemble_results = {
        'individual_models': rf_models,
        'ensemble_methods': ensemble_methods,
        'best_method': best_method,
        'best_predictions': best_metrics['predictions'],
        'best_metrics': best_metrics,
        'feature_importance': feature_importance_df,
        'weights': weights if best_method == 'weighted' else None,
        'feature_names': feature_names,
        'job_categories': all_job_cats,
        'seniority_categories': all_seniority_cats,
        'stats_dict': stats_dict,
        'grouping_info': grouping_info,
        'X_test': X_test,
        'y_test': y_test,
        'X_train': X_train,
        'y_train': y_train,
        'ensemble_type': 'multiple_random_forest'
    }
    
    print(f"\n✅ Ensemble de Random Forest creado exitosamente!")
    print(f"   🌲 Bosques entrenados: {len(rf_models)}")
    print(f"   🎯 Mejor método: {best_method}")
    print(f"   📊 Mejora promedio: Evalúa vs modelos individuales")
    
    return ensemble_results

def predict_multiple_rf_ensemble(new_data, ensemble_results):
    """
    Función de predicción para ensemble de múltiples Random Forest
    """
    print("🌲 Predicción con ENSEMBLE de múltiples Random Forest...")
    
    # Obtener componentes necesarios
    individual_models = ensemble_results['individual_models']
    best_method = ensemble_results['best_method']
    weights = ensemble_results.get('weights')
    expected_feature_names = ensemble_results['feature_names']
    
    print(f"   🔢 Features esperadas por los modelos: {len(expected_feature_names)}")
    
    # Crear grupos si no existen
    input_data_copy = new_data.copy()
    if 'Exp_group' not in input_data_copy.columns or 'Age_group' not in input_data_copy.columns:
        for idx, row in input_data_copy.iterrows():
            exp_group, age_group = pred.calculate_groups(
                age=row['Age'], 
                years_of_experience=row['Years_of_Experience'], 
                grouping_info=ensemble_results['grouping_info']
            )
            input_data_copy.at[idx, 'Exp_group'] = exp_group
            input_data_copy.at[idx, 'Age_group'] = age_group
    
    # Crear features usando los mismos parámetros del entrenamiento
    X_features, created_feature_names = f_lgbm.create_features_with_stats(
        input_data_copy,
        all_job_categories=ensemble_results['job_categories'],
        all_seniority_levels=ensemble_results['seniority_categories'],
        stats_dict=ensemble_results['stats_dict'],
        is_training=False
    )
    
    print(f"   🔢 Features creadas: {len(created_feature_names)}")
    print(f"   🔍 Primeras 5 features creadas: {created_feature_names[:5]}")
    print(f"   🔍 Primeras 5 features esperadas: {expected_feature_names[:5]}")
    
    # ALINEAR FEATURES con las que esperan los modelos
    if len(created_feature_names) != len(expected_feature_names) or created_feature_names != expected_feature_names:
        print(f"   ⚠️ Ajustando features: {len(created_feature_names)} → {len(expected_feature_names)}")
        
        # Crear DataFrame con todas las features esperadas, inicializadas en 0
        X_aligned = pd.DataFrame(0, index=X_features.index, columns=expected_feature_names)
        
        # Llenar con los valores disponibles
        for col in X_features.columns:
            if col in X_aligned.columns:
                X_aligned[col] = X_features[col]
            else:
                print(f"   ⚠️ Feature '{col}' no está en el modelo entrenado")
        
        # Verificar qué features faltan
        missing_features = set(expected_feature_names) - set(created_feature_names)
        if missing_features:
            print(f"   🔍 Features faltantes (se rellenan con 0): {len(missing_features)}")
            print(f"      Ejemplos: {list(missing_features)[:5]}")
        
        X_features = X_aligned
        print(f"   ✅ Features alineadas: {X_features.shape}")
    
    # Hacer predicciones con cada Random Forest
    predictions = []
    model_names = []
    
    for name, result in individual_models.items():
        try:
            model = result['model']
            
            # Verificar que las columnas coincidan exactamente
            if hasattr(model, 'feature_names_in_'):
                model_features = list(model.feature_names_in_)
                current_features = list(X_features.columns)
                
                if model_features != current_features:
                    print(f"   🔧 Reordenando features para {name}...")
                    # Reordenar las columnas para que coincidan exactamente
                    X_model = X_features.reindex(columns=model_features, fill_value=0)
                else:
                    X_model = X_features
            else:
                X_model = X_features
            
            model_prediction = model.predict(X_model)
            
            # Extraer el valor de predicción correctamente
            if len(model_prediction) == 1:
                pred_value = model_prediction[0]
            else:
                pred_value = model_prediction[0] if hasattr(model_prediction, '__len__') else model_prediction
            
            predictions.append(pred_value)
            model_names.append(name)
            print(f"   ✅ {name}: ${pred_value:,.2f}")
            
        except Exception as e:
            print(f"   ❌ Error con {name}: {e}")
            # Solo mostrar traceback si es necesario para debug
            continue
    
    if not predictions:
        print("❌ No se pudieron hacer predicciones")
        return None
    
    # Aplicar el mejor método de ensemble
    if best_method == 'simple':
        final_prediction = np.mean(predictions)
    elif best_method == 'weighted' and weights is not None and len(weights) == len(predictions):
        final_prediction = np.average(predictions, weights=weights[:len(predictions)])
    elif best_method == 'median':
        final_prediction = np.median(predictions)
    else:
        final_prediction = np.mean(predictions)  # Fallback
    
    print(f"\n   🎯 Método usado: {best_method}")
    print(f"   💰 Predicción final: ${final_prediction:,.2f}")
    print(f"   🤖 Bosques usados: {len(predictions)}")
    
    return final_prediction

def test_multiple_rf_ensemble(ensemble_results):
    """
    Test para ensemble de múltiples Random Forest
    """
    print("🧪 Testing ensemble de múltiples Random Forest...")
    
    # Crear registro de prueba
    test_record = pd.DataFrame({
        'Age': [60],
        'Gender': ['Male'],
        'Education_Level': ["PhD"],
        'Job_Title': ['CEO'],
        'Years_of_Experience': [24],
        'Description': ['I work with machine learning models and data analysis']
    })
    
    print(f"📋 Registro de prueba:")
    print(f"   👤 Edad: {test_record['Age'][0]} años")
    print(f"   👨 Género: {test_record['Gender'][0]}")
    print(f"   🎓 Educación: {test_record['Education_Level'][0]}")
    print(f"   💼 Puesto: {test_record['Job_Title'][0]}")
    print(f"   📈 Experiencia: {test_record['Years_of_Experience'][0]} años")
    
    try:
        prediction = predict_multiple_rf_ensemble(test_record, ensemble_results)
        
        if prediction is not None:
            print(f"✅ Test exitoso: Predicción = ${prediction:,.2f}")
            
            # Mostrar métricas del ensemble
            best_metrics = ensemble_results['best_metrics']
            print(f"   📊 Métricas del ensemble:")
            print(f"      RMSE: ${best_metrics['rmse']:,.2f}")
            print(f"      R²: {best_metrics['r2']:.3f}")
            print(f"      MAE: ${best_metrics['mae']:,.2f}")
            
            return True
        else:
            print("❌ Test falló: La predicción devolvió None")
            return False
            
    except Exception as e:
        print(f"❌ Test falló: {e}")
        return False

# Ejemplo de uso completo
def example_multiple_rf_usage():
    """
    Ejemplo completo de ensemble de múltiples Random Forest
    """
    print("🌲 Ejemplo: Ensemble de Múltiples Random Forest")
    
    # Cargar datos
    data = pd.read_csv('../../../dataC/imputado.csv').dropna()
    
    # Crear ensemble de Random Forest
    ensemble_results = create_multiple_random_forest_ensemble(
        data=data,
        n_forests=5,  # Número de bosques diferentes
        test_size=0.2,
        random_state=42
    )
    
    # Test de predicción
    test_multiple_rf_ensemble(ensemble_results)
    
    return ensemble_results

In [24]:
print("🌲 Ejemplo: Ensemble de Múltiples Random Forest")

# Cargar datos
data = pd.read_csv('../../../dataC/imputado.csv').dropna()

# Crear ensemble de Random Forest
ensemble_results = create_multiple_random_forest_ensemble(
    data=data,
    n_forests=5,  # Número de bosques diferentes
    test_size=0.2,
    random_state=42
)

# Test de predicción
#test_multiple_rf_ensemble(ensemble_results)

🌲 Ejemplo: Ensemble de Múltiples Random Forest
🌲 Creando ENSEMBLE de 5 Random Forest...
📊 Creando grupos y guardando información de rangos...
🔧 Creando características completas para producción (originales + estadísticos)...
🔧 Creando todas las características ...
✅ Creadas 67 características en total
   - Variables numéricas básicas: 3
   - Variables de educación: 3
   - Variables de job category: 12
   - Variables de seniority: 6
   - Variables de texto: 4
   - Ratios y scores: 5
📊 Creando features estadísticos para producción (TRAIN)...
   🔄 Calculando estadísticas en TRAIN (solo variables de producción)...
   ✅ Estadísticas calculadas para 7 grupos
   ✅ Creadas 32 features estadísticos para producción
✅ Features totales para producción: 99
   - Originales: 67
   - Estadísticos: 32
🔧 Creando características completas para producción (originales + estadísticos)...
🔧 Creando todas las características ...
✅ Creadas 67 características en total
   - Variables numéricas básicas: 3
   - Va

In [25]:
test_multiple_rf_ensemble(ensemble_results)

🧪 Testing ensemble de múltiples Random Forest...
📋 Registro de prueba:
   👤 Edad: 60 años
   👨 Género: Male
   🎓 Educación: PhD
   💼 Puesto: CEO
   📈 Experiencia: 24 años
🌲 Predicción con ENSEMBLE de múltiples Random Forest...
   🔢 Features esperadas por los modelos: 99
🔧 Creando características completas para producción (originales + estadísticos)...
🔧 Creando todas las características ...
✅ Creadas 67 características en total
   - Variables numéricas básicas: 3
   - Variables de educación: 3
   - Variables de job category: 12
   - Variables de seniority: 6
   - Variables de texto: 4
   - Ratios y scores: 5
📊 Creando features estadísticos para producción (PREDICT)...
   📥 Usando estadísticas pre-calculadas de TRAIN...
   ✅ Creadas 26 features estadísticos para producción
✅ Features totales para producción: 93
   - Originales: 67
   - Estadísticos: 26
   🔢 Features creadas: 93
   🔍 Primeras 5 features creadas: ['age', 'years_experience', 'age_experience_ratio', 'experience_squared', 'a

True