In [1]:
# Importar módulos personalizados
import sys
import os

# Agregar la ruta de los módulos LGBM
sys.path.append('../lgbm')

import pred_lgbm as pred
import funciones_lgbm as f_lgbm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ============================================================================
# ENSEMBLE MODEL PARA PREDICCIÓN DE SALARIOS 
# ============================================================================

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')


# Importar módulos personalizados 
try:
    import pred_lgbm as pred
    import funciones_lgbm as f_lgbm
    print("✅ Módulos personalizados cargados correctamente")
except ImportError as e:
    print(f"⚠️ No se pudieron cargar algunos módulos: {e}")

# ============================================================================
# FUNCIONES DE ENSEMBLE 
# ============================================================================

def ensemble_predict(X, individual_models, weights=None, scaler=None):
    """
    Combine the predictions of several fitted models into a single prediction.

    Args:
        X: Feature matrix handed to every model's ``predict`` method.
        individual_models: Mapping ``name -> fitted model``. Iteration order
            defines the order the weights refer to.
        weights: Optional sequence of per-model weights. When it has one entry
            per model in ``individual_models``, the entries of models that
            fail to predict are dropped so the remaining weights stay aligned
            with the remaining predictions. A sequence whose length equals the
            number of successful predictions is used as-is. Any other length
            falls back to a plain mean.
        scaler: Optional fitted transformer. When given, ``X`` is passed
            through ``scaler.transform`` for the models named ``'Ridge'`` and
            ``'ElasticNet'`` only.

    Returns:
        The weighted average (or plain mean) of the individual predictions,
        reduced along axis 0.

    Raises:
        ValueError: If no model produced a prediction.
    """
    predictions = []
    succeeded = []  # positions (in individual_models order) of models that predicted

    for position, (name, model) in enumerate(individual_models.items()):
        try:
            if name in ['Ridge', 'ElasticNet'] and scaler is not None:
                model_pred = model.predict(scaler.transform(X))
            else:
                model_pred = model.predict(X)
        except Exception as e:
            # Best effort: a failing model is reported and left out of the ensemble.
            print(f"⚠️ Error prediciendo con {name}: {e}")
            continue

        predictions.append(model_pred)
        succeeded.append(position)

    if not predictions:
        raise ValueError("No se pudieron hacer predicciones")

    # Work out which weights belong to the predictions we actually have.
    active_weights = None
    if weights is not None:
        if len(weights) == len(individual_models):
            # One weight per configured model: keep only the survivors' weights
            # instead of silently discarding the whole weighting scheme.
            active_weights = [weights[i] for i in succeeded]
        elif len(weights) == len(predictions):
            active_weights = list(weights)

    # np.average cannot normalise weights that sum to zero, so use the mean then.
    if active_weights is not None and float(np.sum(active_weights)) != 0.0:
        return np.average(predictions, axis=0, weights=active_weights)

    return np.mean(predictions, axis=0)

def save_ensemble_compatible(ensemble_results, model_results, model_package, X_train, y_train, scaler,
                             ensemble_filename="../../../modelos/ensemble.pkl"):
    """
    Persist the best ensemble in the same package layout used by save_with_stats.

    The best ensemble is the entry of ``ensemble_results`` with the lowest
    ``'rmse'``. Only the models named in its ``'models_used'`` list are stored
    (in that order), so the stored ``'weights'`` line up with the stored
    models. When ``'models_used'`` is absent, or names a model that is not in
    ``model_results``, every model in ``model_results`` is stored.

    Args:
        ensemble_results: Mapping ``ensemble name -> dict`` with at least the
            keys ``'method'``, ``'rmse'``, ``'r2'`` and ``'mae'``; ``'weights'``
            and ``'models_used'`` are optional.
        model_results: Mapping ``model name -> dict`` holding the fitted model
            under ``'model'``.
        model_package: Base model package; ``'feature_names'``,
            ``'job_categories'``, ``'seniority_categories'``, ``'stats_dict'``
            and ``'grouping_info'`` are copied into the saved package.
        X_train: Training features; only ``X_train.shape`` is recorded.
        y_train: Unused; kept for interface compatibility.
        scaler: Scaler stored alongside the models (may be None).
        ensemble_filename: Destination path of the joblib file.

    Returns:
        The saved package dict, or None if writing the file failed.

    Raises:
        ValueError: If ``ensemble_results`` is empty.
    """
    print("💾 Guardando ensemble compatible...")

    if not ensemble_results:
        raise ValueError("ensemble_results está vacío: no hay ensemble que guardar")

    # Pick the ensemble with the lowest RMSE.
    best_ensemble_name = min(ensemble_results.keys(), key=lambda x: ensemble_results[x]['rmse'])
    best_ensemble = ensemble_results[best_ensemble_name]

    print(f"   🏆 Mejor ensemble: {best_ensemble['method']}")
    print(f"   📊 RMSE: ${best_ensemble['rmse']:,.2f}")
    print(f"   📈 R²: {best_ensemble['r2']:.3f}")

    all_model_names = list(model_results.keys())
    models_used = best_ensemble.get('models_used', all_model_names)

    # Store exactly the models the ensemble was built from, in the order its
    # weights refer to; otherwise len(weights) != len(models) at predict time.
    if models_used is not None and all(name in model_results for name in models_used):
        selected_names = list(models_used)
    else:
        selected_names = all_model_names

    ensemble_components = {
        'individual_models': {name: model_results[name]['model'] for name in selected_names},
        'weights': best_ensemble.get('weights', None),
        'models_used': models_used,
        'scaler': scaler,
        'ensemble_type': best_ensemble['method'],
        'ensemble_metrics': {
            'rmse': best_ensemble['rmse'],
            'r2': best_ensemble['r2'],
            'mae': best_ensemble['mae']
        }
    }

    # Same layout as the package written by save_with_stats.
    complete_package = {
        'model': ensemble_components,
        'model_name': f'Ensemble_{best_ensemble_name}_WithStats',
        'feature_names': model_package['feature_names'],
        'job_categories': model_package['job_categories'],
        'seniority_categories': model_package['seniority_categories'],
        'stats_dict': model_package['stats_dict'],
        'grouping_info': model_package['grouping_info'],
        'total_features': len(model_package['feature_names']),
        'training_data_shape': X_train.shape,
        'has_statistical_features': True,
        'is_ensemble': True,  # flag that marks the package as an ensemble
        'metrics': ensemble_components['ensemble_metrics']
    }

    try:
        joblib.dump(complete_package, ensemble_filename)
        print(f"   ✅ Ensemble guardado en: {ensemble_filename}")
        return complete_package
    except Exception as e:
        # Best effort: report the failure and signal it with None.
        print(f"   ❌ Error guardando ensemble: {e}")
        return None


# ============================================================================
# FUNCIONES PRINCIPALES
# ============================================================================

def load_model_and_data():
    """
    Load the base model package and the tabular data from disk.

    Returns:
        A ``(model_package, data)`` tuple. ``(None, None)`` when the model file
        cannot be loaded; ``(model_package, None)`` when the model loads but
        the CSV cannot be read. Rows containing missing values are dropped
        from the data.
    """
    print("🔄 Cargando modelo y datos...")

    # NOTE(review): joblib.load unpickles the file, so it should only be
    # pointed at trusted artifacts.
    try:
        model_package = joblib.load("../../../modelos/salary_with_stats.pkl")
        print("✅ Modelo cargado")
    except Exception as load_error:
        print(f"❌ Error cargando modelo: {load_error}")
        return None, None

    # The model is available from here on; a data failure only blanks `data`.
    try:
        raw_frame = pd.read_csv('../../../dataC/imputado.csv')
        data = raw_frame.dropna()
        print(f"✅ Datos cargados: {data.shape}")
    except Exception as read_error:
        print(f"❌ Error cargando datos: {read_error}")
        data = None

    return model_package, data

def prepare_data(data, model_package):
    """
    Build the feature matrix and target vector used to train the ensemble.

    Steps:
      1. Add ``Exp_group`` / ``Age_group`` columns computed row by row with
         ``pred.calculate_groups``.
      2. Split off the ``Salary`` column as the target.
      3. Generate features with ``f_lgbm.create_features_with_stats`` using the
         pre-computed statistics from ``model_package`` (``is_training=False``).
      4. Align the generated columns to ``model_package['feature_names']``.

    Step 4 exists because the packages saved by this notebook declare
    ``model_package['feature_names']`` as their input schema, and the recorded
    run failed with "The number of features in data (99) is not the same as it
    was in training data" when the unaligned frame reached the base model.
    NOTE(review): this assumes ``feature_names`` lists the columns the base
    model was fitted on — confirm against the training notebook.

    Args:
        data: DataFrame with at least ``Age``, ``Years_of_Experience`` and
            ``Salary`` columns. It is not modified.
        model_package: Dict with ``grouping_info``, ``job_categories``,
            ``seniority_categories``, ``stats_dict`` and (optionally)
            ``feature_names``.

    Returns:
        ``(X_features, y)``: the aligned feature frame and the salary target.
    """
    print("🔧 Preparando datos...")

    data_copy = data.copy()

    # Derive the experience/age group of every row.
    for idx, row in data_copy.iterrows():
        exp_group, age_group = pred.calculate_groups(
            age=row['Age'],
            years_of_experience=row['Years_of_Experience'],
            grouping_info=model_package['grouping_info']
        )
        data_copy.at[idx, 'Exp_group'] = exp_group
        data_copy.at[idx, 'Age_group'] = age_group

    # Separate predictors and target.
    X_data = data_copy.drop('Salary', axis=1)
    y = data_copy['Salary']

    # Generate the model features from the pre-computed statistics.
    X_features, _ = f_lgbm.create_features_with_stats(
        X_data,
        all_job_categories=model_package['job_categories'],
        all_seniority_levels=model_package['seniority_categories'],
        stats_dict=model_package['stats_dict'],
        is_training=False
    )

    # Align columns (set and order) with the schema the package declares.
    expected = model_package.get('feature_names')
    if expected is not None and hasattr(X_features, 'reindex') and X_features.columns.is_unique:
        expected = list(expected)
        current = list(X_features.columns)
        if current != expected:
            missing = [col for col in expected if col not in current]
            extra = [col for col in current if col not in expected]
            if missing:
                # Missing columns are created filled with 0; surface it loudly.
                print(f"   ⚠️ Features faltantes (rellenadas con 0): {missing}")
            if extra:
                print(f"   ⚠️ Features descartadas (no están en feature_names): {extra}")
            X_features = X_features.reindex(columns=expected, fill_value=0)

    print(f"✅ Features preparadas: {X_features.shape}")
    return X_features, y

def create_models(base_model):
    """
    Assemble the estimators that take part in the ensemble.

    Args:
        base_model: Already available model, registered under ``'LightGBM'``.

    Returns:
        Dict ``name -> estimator`` with the base model plus new XGBoost,
        RandomForest and CatBoost regressors.
    """
    print(" Creando modelos...")

    # Hyper-parameters shared by the two sklearn-style tree ensembles.
    tree_params = dict(n_estimators=100, random_state=42)
    catboost_params = dict(iterations=100, random_seed=42, verbose=False)

    models = {}
    models['LightGBM'] = base_model
    models['XGBoost'] = xgb.XGBRegressor(**tree_params)
    models['RandomForest'] = RandomForestRegressor(**tree_params)
    models['CatBoost'] = CatBoostRegressor(**catboost_params)

    print(f"✅ {len(models)} modelos creados")
    return models

def train_models(models, X_train, y_train, X_test, y_test):
    """
    Fit every model (except the one named 'LightGBM') and score it on the test split.

    The model registered as ``'LightGBM'`` is not refitted; it is only
    evaluated. NOTE(review): if that model was fitted on rows that are now in
    ``X_test``, its test metrics are optimistic — confirm how it was trained.

    Args:
        models: Mapping ``name -> estimator`` exposing ``fit`` and ``predict``.
        X_train, y_train: Training split used by ``fit``.
        X_test, y_test: Hold-out split used for the metrics.

    Returns:
        Dict ``name -> {'model', 'rmse', 'r2', 'mae', 'predictions'}``. Models
        that raise during fitting or prediction are reported and left out.
    """
    print("🚀 Entrenando modelos...")

    results = {}

    for name, model in models.items():
        print(f"Entrenando {name}...")

        try:
            if name != 'LightGBM':
                model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)
            # MAE completes the rmse/r2/mae metric set used for ensemble metrics.
            mae = mean_absolute_error(y_test, y_pred)

            results[name] = {
                'model': model,
                'rmse': rmse,
                'r2': r2,
                'mae': mae,
                'predictions': y_pred
            }

            print(f"   RMSE: ${rmse:,.0f} | R²: {r2:.3f}")

        except Exception as e:
            # Best effort: one failing model must not abort the whole run.
            print(f"   ❌ Error: {e}")

    return results

def create_ensemble(results, y_test):
    """
    Build a weighted ensemble description from the per-model results.

    Only models with a strictly positive R² take part. Each weight is the
    model's R² divided by the sum of the R² values of the participating
    models, so the weights add up to 1.

    Args:
        results: Mapping ``name -> dict`` with at least ``'model'`` and ``'r2'``.
        y_test: Unused; kept for interface compatibility.

    Returns:
        A plain dict (not a class instance) with the keys
        ``'individual_models'``, ``'weights'``, ``'model_names'`` and
        ``'ensemble_type'``.

    Raises:
        ValueError: If no model has R² > 0, because an ensemble without models
            cannot predict anything and must not be saved as a success.
    """
    print("🎯 Creando ensemble...")

    # Keep only the models that beat the constant-mean baseline (R² > 0).
    valid_models = {name: res for name, res in results.items() if res['r2'] > 0}

    if not valid_models:
        raise ValueError("No hay modelos con R² > 0 para crear el ensemble")

    # Weights proportional to R², normalised to sum to 1.
    weights = np.array([res['r2'] for res in valid_models.values()])
    weights = weights / weights.sum()

    ensemble_dict = {
        'individual_models': {name: res['model'] for name, res in valid_models.items()},
        'weights': weights.tolist(),
        'model_names': list(valid_models.keys()),
        'ensemble_type': 'weighted'
    }

    print(f"✅ Ensemble creado con {len(valid_models)} modelos")
    return ensemble_dict

def save_compatible_model(ensemble, model_package, filename):
    """
    Write the ensemble to ``filename`` wrapped in a base-model-style package.

    Args:
        ensemble: Ensemble description (a plain dict, not a class instance),
            stored under ``'model'``.
        model_package: Base package whose feature/category/statistics entries
            are copied over. ``'total_features'``, ``'training_data_shape'``
            and ``'metrics'`` are taken from it when present.
        filename: Destination path handed to ``joblib.dump``.

    Returns:
        True if the file was written, False if ``joblib.dump`` raised.
    """
    print("💾 Guardando modelo compatible...")

    # Entries copied verbatim from the base package.
    copied_keys = (
        'feature_names',
        'job_categories',
        'seniority_categories',
        'stats_dict',
        'grouping_info',
    )

    compatible_package = {'model': ensemble}
    for key in copied_keys:
        compatible_package[key] = model_package[key]

    compatible_package['has_statistical_features'] = True
    compatible_package['is_ensemble'] = True  # marks the package as an ensemble

    # Optional metadata: fall back to values derived from the package itself.
    feature_count = len(model_package['feature_names'])
    compatible_package['total_features'] = model_package.get('total_features', feature_count)
    compatible_package['training_data_shape'] = model_package.get('training_data_shape', None)
    compatible_package['model_name'] = 'Ensemble_Model'
    compatible_package['metrics'] = model_package.get('metrics', {})

    try:
        joblib.dump(compatible_package, filename)
        print(f"✅ Guardado en: {filename}")
    except Exception as save_error:
        print(f"❌ Error guardando: {save_error}")
        return False

    return True

def main():
    """
    Run the whole ensemble pipeline: load, featurise, split, train, combine, save.

    Returns None in every case; progress and the final status are printed.
    """
    print("🚀 INICIANDO ENSEMBLE")
    print("=" * 40)

    # Step 1: base model package and raw data.
    model_package, data = load_model_and_data()
    if any(item is None for item in (model_package, data)):
        print("❌ No se pudieron cargar los datos")
        return

    # Step 2: feature matrix and target.
    features, target = prepare_data(data, model_package)

    # Step 3: 80/20 hold-out split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")

    # Steps 4-5: build the candidate models, then fit and score them.
    candidates = create_models(model_package['model'])
    results = train_models(candidates, X_train, y_train, X_test, y_test)

    # Step 6: combine the scored models.
    ensemble = create_ensemble(results, y_test)

    # Step 7: persist the ensemble in its own file.
    output_path = "../../../modelos/ensemble_voting.pkl"
    saved = save_compatible_model(ensemble, model_package, output_path)

    print("\n COMPLETADO EXITOSAMENTE!" if saved else "\n❌ ERROR AL GUARDAR")

# ============================================================================
# EJECUTAR
# ============================================================================

# Entry point: run the full ensemble pipeline when this code is executed as
# the main module.
if __name__ == "__main__":
    main()

✅ Módulos personalizados cargados correctamente
🚀 INICIANDO ENSEMBLE
🔄 Cargando modelo y datos...
✅ Modelo cargado
✅ Datos cargados: (366, 9)
🔧 Preparando datos...
🔧 Creando características completas para producción (originales + estadísticos)...
🔧 Creando todas las características ...
✅ Creadas 67 características en total
   - Variables numéricas básicas: 3
   - Variables de educación: 3
   - Variables de job category: 12
   - Variables de seniority: 6
   - Variables de texto: 4
   - Ratios y scores: 5
📊 Creando features estadísticos para producción (PREDICT)...
   📥 Usando estadísticas pre-calculadas de TRAIN...
   ✅ Creadas 32 features estadísticos para producción
✅ Features totales para producción: 99
   - Originales: 67
   - Estadísticos: 32
✅ Features preparadas: (366, 99)
Train: (292, 99), Test: (74, 99)
 Creando modelos...
✅ 4 modelos creados
🚀 Entrenando modelos...
Entrenando LightGBM...
   ❌ Error: The number of features in data (99) is not the same as it was in training data