In [1]:
# Importar módulos personalizados
import sys
import os

# Agregar la ruta de los módulos LGBM
sys.path.append('../lgbm')

import pred_lgbm as pred
import funciones_lgbm as f_lgbm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ============================================================================
# ENSEMBLE MODEL PARA PREDICCIÓN DE SALARIOS 
# ============================================================================

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict  # ← AGREGADO cross_val_predict
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, ElasticNet, LinearRegression  # ← AGREGADO LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# Importar módulos personalizados
try:
    import pred_lgbm as pred
    import funciones_lgbm as f_lgbm
    print("✅ Módulos personalizados cargados correctamente")
except ImportError as e:
    print(f"⚠️ No se pudieron cargar algunos módulos: {e}")

# ============================================================================
# STACKING 
# ============================================================================

def create_stacking_ensemble(results, X_train, y_train, X_test, y_test, scaler):
    """
    Build a two-level stacking ensemble from previously evaluated base models.

    Level 1 turns every base model whose R² is positive into one meta-feature
    column: out-of-fold predictions on the training split (5-fold
    ``cross_val_predict``) and direct predictions on the test split.
    Level 2 fits Ridge, ElasticNet and LinearRegression on those columns and
    keeps the candidate with the lowest RMSE on the test split.

    Args:
        results: Mapping ``name -> {'model': estimator, 'r2': float, ...}``.
        X_train: Training features.
        y_train: Training target.
        X_test: Test features.
        y_test: Test target.
        scaler: Fitted scaler applied to the inputs of base models named
            'Ridge' or 'ElasticNet'; it is also stored in the returned dict.

    Returns:
        A dict with the base models, the selected meta-model, its test
        predictions and its rmse / r2 / mae, or ``None`` when no base model
        qualifies or no meta-model could be fitted.
    """
    print(" Creando ensemble STACKING ...")

    # Only base models with a positive R² take part in the stack.
    valid_models = {name: res for name, res in results.items() if res['r2'] > 0}
    print(f" Modelos para stacking: {list(valid_models.keys())}")

    # Without base models there is nothing to stack; np.column_stack would
    # raise on an empty list, so fail the same way the function already does
    # when no meta-model can be trained.
    if not valid_models:
        print("   ❌ No hay modelos válidos para stacking")
        return None

    # LEVEL 1: one column of predictions per base model.
    print("   📊 Generando predicciones ...")
    meta_features_train = []
    meta_features_test = []

    # Scaled copies are computed lazily and only once, however many linear
    # base models need them.
    scaled_inputs = None

    for name, result in valid_models.items():
        model = result['model']

        # Linear models receive scaled inputs; the others the raw features.
        if name in ['Ridge', 'ElasticNet']:
            if scaled_inputs is None:
                scaled_inputs = (scaler.transform(X_train), scaler.transform(X_test))
            X_tr, X_te = scaled_inputs
        else:
            X_tr, X_te = X_train, X_test

        # TODO: compare this hand-rolled stack with sklearn.ensemble.StackingRegressor.

        # Out-of-fold predictions for the training rows. cross_val_predict
        # fits clones of the estimator on each fold, so this applies equally
        # to freshly trained and pre-trained models.
        oof_predictions = cross_val_predict(model, X_tr, y_train, cv=5, method='predict')

        # Test-split predictions come from the already fitted estimator.
        test_predictions = model.predict(X_te)

        meta_features_train.append(oof_predictions)
        meta_features_test.append(test_predictions)

        print(f"      ✅ {name}: OOF shape {oof_predictions.shape}, Test shape {test_predictions.shape}")

    # Assemble the meta-model design matrices (one column per base model).
    meta_X_train = np.column_stack(meta_features_train)
    meta_X_test = np.column_stack(meta_features_test)

    print(f"   📊 Meta-features shape - Train: {meta_X_train.shape}, Test: {meta_X_test.shape}")

    # LEVEL 2: fit the candidate meta-models.
    print("  Entrenando meta-modelo...")

    meta_models = {
        'Ridge': Ridge(alpha=1.0),
        'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5),
        'LinearRegression': LinearRegression()
    }

    best_meta_model = None
    best_meta_score = float('inf')
    best_meta_name = None
    best_predictions = None

    # NOTE(review): the winner is chosen by RMSE on the same test split that
    # the returned metrics are computed on, so those metrics are optimistic.
    for meta_name, meta_model in meta_models.items():
        try:
            meta_model.fit(meta_X_train, y_train)
            meta_predictions = meta_model.predict(meta_X_test)

            meta_rmse = np.sqrt(mean_squared_error(y_test, meta_predictions))
            meta_r2 = r2_score(y_test, meta_predictions)

            print(f"      {meta_name}: RMSE=${meta_rmse:,.2f}, R²={meta_r2:.3f}")

            if meta_rmse < best_meta_score:
                best_meta_score = meta_rmse
                best_meta_model = meta_model
                best_meta_name = meta_name
                best_predictions = meta_predictions

        except Exception as e:
            # One failing candidate must not prevent trying the others.
            print(f"      ❌ Error con {meta_name}: {e}")

    if best_meta_model is None:
        print("   ❌ No se pudo entrenar ningún meta-modelo")
        return None

    print(f"   🏆 Mejor meta-modelo: {best_meta_name} (RMSE: ${best_meta_score:,.2f})")

    # Everything needed to reuse or persist the stack, as a plain dict.
    stacking_dict = {
        'individual_models': {name: res['model'] for name, res in valid_models.items()},
        'meta_model': best_meta_model,
        'meta_model_name': best_meta_name,
        'model_names': list(valid_models.keys()),
        'ensemble_type': 'stacking',
        'scaler': scaler,
        'meta_features_columns': len(valid_models),  # kept for sanity checks
        'level1_models': list(valid_models.keys()),
        'predictions': best_predictions,
        'rmse': best_meta_score,
        'r2': r2_score(y_test, best_predictions),
        'mae': mean_absolute_error(y_test, best_predictions)
    }

    print(f"   ✅ Stacking ensemble creado exitosamente")
    print(f"      🎯 RMSE: ${stacking_dict['rmse']:,.2f}")
    print(f"      📈 R²: {stacking_dict['r2']:.3f}")

    return stacking_dict

def save_stacking_model(stacking_ensemble, model_package, filename="../../../modelos/ensemble_stacking.pkl"):
    """
    Persist a stacking ensemble wrapped in the package layout used by the
    base model file.

    Args:
        stacking_ensemble: Dict produced by the stacking builder, or None.
        model_package: Package of the base model; its feature, category,
            statistics and grouping entries are copied into the new package.
        filename: Destination path of the joblib file.

    Returns:
        True when the file was written, False when there was nothing to save
        or the dump failed.
    """
    print("💾 Guardando ensemble STACKING...")

    # Guard clause: nothing to persist.
    if stacking_ensemble is None:
        print("❌ No hay ensemble stacking para guardar")
        return False

    # Start with the ensemble itself, then copy the shared entries verbatim.
    compatible_package = {'model': stacking_ensemble}
    for shared_key in ('feature_names', 'job_categories', 'seniority_categories',
                       'stats_dict', 'grouping_info'):
        compatible_package[shared_key] = model_package[shared_key]

    # Flags identifying the payload as a stacking ensemble.
    compatible_package['has_statistical_features'] = True
    compatible_package['is_ensemble'] = True
    compatible_package['ensemble_type'] = 'stacking'

    # Additional entries, with fallbacks when the base package lacks them.
    compatible_package['total_features'] = model_package.get(
        'total_features', len(model_package['feature_names'])
    )
    compatible_package['training_data_shape'] = model_package.get('training_data_shape')
    compatible_package['model_name'] = 'Stacking_Ensemble'
    compatible_package['metrics'] = {
        metric: stacking_ensemble[metric] for metric in ('rmse', 'r2', 'mae')
    }

    try:
        joblib.dump(compatible_package, filename)
        print(f"✅ Stacking ensemble guardado en: {filename}")
        print(f"   🎯 Meta-modelo: {stacking_ensemble['meta_model_name']}")
        print(f"   🤖 Modelos nivel 1: {len(stacking_ensemble['level1_models'])}")
    except Exception as exc:
        print(f"❌ Error guardando stacking: {exc}")
        return False
    return True

def predict_stacking_ensemble(input_data, ensemble_package):
    """
    Predict one value with a saved stacking ensemble.

    The input is enriched with the 'Exp_group' / 'Age_group' columns when
    they are missing, converted to features with the same helper that
    ``prepare_data`` uses for training, run through every level-1 model and
    finally through the meta-model.

    Args:
        input_data: DataFrame with the raw record; 'Age' and
            'Years_of_Experience' are read when the group columns are absent.
        ensemble_package: Package whose 'model' entry holds the stacking
            dict ('individual_models', 'meta_model', optional 'scaler') and
            which also carries 'grouping_info', 'job_categories',
            'seniority_categories' and 'stats_dict'.

    Returns:
        The meta-model prediction for the first row, or ``None`` on any
        failure (the error is printed, not raised).
    """
    try:
        print("🎯 Predicción con STACKING ensemble...")

        # Unpack the stacking components.
        stacking_components = ensemble_package['model']
        individual_models = stacking_components['individual_models']
        meta_model = stacking_components['meta_model']
        scaler = stacking_components.get('scaler')

        # Derive the group columns when the caller did not provide them.
        input_data_copy = input_data.copy()
        if 'Exp_group' not in input_data_copy.columns or 'Age_group' not in input_data_copy.columns:
            for idx, row in input_data_copy.iterrows():
                # Qualified through the `pred` module, as prepare_data does;
                # the bare name is not defined in this notebook.
                exp_group, age_group = pred.calculate_groups(
                    age=row['Age'],
                    years_of_experience=row['Years_of_Experience'],
                    grouping_info=ensemble_package['grouping_info']
                )
                input_data_copy.at[idx, 'Exp_group'] = exp_group
                input_data_copy.at[idx, 'Age_group'] = age_group

        # Build features through the same call prepare_data uses, so the
        # columns match what the base models were fitted on.
        X_features, _ = f_lgbm.create_features_with_stats(
            input_data_copy,
            all_job_categories=ensemble_package['job_categories'],
            all_seniority_levels=ensemble_package['seniority_categories'],
            stats_dict=ensemble_package['stats_dict'],
            is_training=False
        )

        # LEVEL 1: one prediction per base model, in stored order.
        level1_predictions = []

        for name, model in individual_models.items():
            try:
                if name in ['Ridge', 'ElasticNet'] and scaler is not None:
                    base_prediction = model.predict(scaler.transform(X_features))[0]
                else:
                    base_prediction = model.predict(X_features)[0]
            except Exception as e:
                # The meta-model expects one column per base model; skipping
                # a failed one would shift the remaining columns, so abort.
                print(f"⚠️ Error con modelo {name}: {e}")
                print("❌ No se pudieron generar predicciones nivel 1")
                return None

            level1_predictions.append(base_prediction)

        if not level1_predictions:
            print("❌ No se pudieron generar predicciones nivel 1")
            return None

        # LEVEL 2: the meta-model consumes the level-1 predictions as a row.
        meta_features = np.array(level1_predictions).reshape(1, -1)
        final_prediction = meta_model.predict(meta_features)[0]

        print(f"   🤖 Predicciones nivel 1: {level1_predictions}")
        print(f"   🧠 Meta-predicción: ${final_prediction:,.2f}")

        return final_prediction

    except Exception as e:
        print(f"❌ Error en predicción stacking: {e}")
        return None

def ensemble_predict(X, individual_models, weights=None, scaler=None):
    """
    Combine the predictions of several models into one ensemble prediction.

    Args:
        X: Feature matrix passed to every model's ``predict``.
        individual_models: Mapping ``name -> fitted model``.
        weights: Optional sequence with one weight per entry of
            ``individual_models`` (same order). When its length does not
            match, the plain mean is used.
        scaler: Optional fitted scaler; when given, models named 'Ridge' or
            'ElasticNet' receive ``scaler.transform(X)`` instead of ``X``.

    Returns:
        The weighted average (or mean) of the per-model predictions along
        the model axis.

    Raises:
        ValueError: If no model produced a prediction.
    """
    predictions = []
    surviving_positions = []  # positions of the models that predicted OK
    X_scaled = None           # computed at most once, on first use

    for position, (name, model) in enumerate(individual_models.items()):
        try:
            if name in ['Ridge', 'ElasticNet'] and scaler is not None:
                if X_scaled is None:
                    X_scaled = scaler.transform(X)
                model_prediction = model.predict(X_scaled)
            else:
                model_prediction = model.predict(X)
        except Exception as e:
            # Best effort: a failing model is reported and left out.
            print(f"⚠️ Error prediciendo con {name}: {e}")
            continue

        predictions.append(model_prediction)
        surviving_positions.append(position)

    if not predictions:
        raise ValueError("No se pudieron hacer predicciones")

    # Keep each weight attached to its own model: when some model failed,
    # only the weights of the survivors are used (np.average renormalises
    # them) instead of silently dropping the weighting altogether.
    if weights is not None and len(weights) == len(individual_models):
        active_weights = [weights[position] for position in surviving_positions]
        result = np.average(predictions, axis=0, weights=active_weights)
    else:
        result = np.mean(predictions, axis=0)

    return result

def save_ensemble_compatible(ensemble_results, model_results, model_package, X_train, y_train, scaler):
    """
    Persist the lowest-RMSE ensemble as plain dictionaries (no custom class),
    in the package layout of the base model file.

    Args:
        ensemble_results: Mapping ``name -> {'method', 'rmse', 'r2', 'mae', ...}``.
        model_results: Mapping ``name -> {'model': estimator, ...}``.
        model_package: Base package whose metadata entries are copied.
        X_train: Training features; only ``.shape`` is recorded.
        y_train: Not used by this function.
        scaler: Scaler stored alongside the models.

    Returns:
        The saved package dict, or None when writing the file failed.
    """
    print("💾 Guardando ensemble compatible...")

    # The winner is the entry with the smallest RMSE.
    winner_key = min(ensemble_results, key=lambda key: ensemble_results[key]['rmse'])
    winner = ensemble_results[winner_key]

    print(f"   🏆 Mejor ensemble: {winner['method']}")
    print(f"   📊 RMSE: ${winner['rmse']:,.2f}")
    print(f"   📈 R²: {winner['r2']:.3f}")

    # The ensemble is described by its components rather than by an object.
    fitted_models = {}
    for model_name, outcome in model_results.items():
        fitted_models[model_name] = outcome['model']

    ensemble_components = {
        'individual_models': fitted_models,
        'weights': winner.get('weights'),
        'models_used': winner.get('models_used', list(model_results.keys())),
        'scaler': scaler,
        'ensemble_type': winner['method'],
        'ensemble_metrics': {
            'rmse': winner['rmse'],
            'r2': winner['r2'],
            'mae': winner['mae']
        }
    }

    # Same key layout as the package written for the base model.
    feature_names = model_package['feature_names']
    complete_package = {
        'model': ensemble_components,
        'model_name': f'Ensemble_{winner_key}_WithStats',
        'feature_names': feature_names,
        'job_categories': model_package['job_categories'],
        'seniority_categories': model_package['seniority_categories'],
        'stats_dict': model_package['stats_dict'],
        'grouping_info': model_package['grouping_info'],
        'total_features': len(feature_names),
        'training_data_shape': X_train.shape,
        'has_statistical_features': True,
        'is_ensemble': True,
        'metrics': ensemble_components['ensemble_metrics']
    }

    # Written to its own file so the original model file is left alone.
    ensemble_filename = "../../../modelos/ensemble.pkl"

    try:
        joblib.dump(complete_package, ensemble_filename)
        print(f"   ✅ Ensemble guardado en: {ensemble_filename}")
        print("   📱 Archivo original NO modificado")
        print("   🔄 FORMATO: Compatible con tu función save_with_stats")
    except Exception as exc:
        print(f"   ❌ Error guardando ensemble: {exc}")
        return None
    return complete_package



# ============================================================================
# FUNCIONES PRINCIPALES
# ============================================================================

def load_model_and_data():
    """
    Load the base model package and the dataset from disk.

    Returns:
        ``(model_package, data)``. ``(None, None)`` when the model file
        cannot be loaded; ``(model_package, None)`` when only the CSV fails.
        Rows containing missing values are dropped from ``data``.
    """
    print("🔄 Cargando modelo y datos...")

    # Step 1: the model package. Without it there is no point reading data.
    try:
        model_package = joblib.load("../../../modelos/salary_with_stats.pkl")
        print("✅ Modelo cargado")
    except Exception as exc:
        print(f"❌ Error cargando modelo: {exc}")
        return None, None

    # Step 2: the dataset, keeping only complete rows.
    try:
        data = pd.read_csv('../../../dataC/imputado.csv').dropna()
        print(f"✅ Datos cargados: {data.shape}")
    except Exception as exc:
        print(f"❌ Error cargando datos: {exc}")
        return model_package, None
    return model_package, data

def prepare_data(data, model_package):
    """
    Turn the raw dataset into the feature matrix and target used for training.

    Args:
        data: DataFrame with at least 'Age', 'Years_of_Experience' and
            'Salary' columns.
        model_package: Package providing 'grouping_info', 'job_categories',
            'seniority_categories' and 'stats_dict'.

    Returns:
        ``(X_features, y)``: the engineered features and the 'Salary' column.
    """
    print("🔧 Preparando datos...")

    frame = data.copy()

    # Attach the experience / age group of every row.
    for row_label, record in frame.iterrows():
        groups = pred.calculate_groups(
            age=record['Age'],
            years_of_experience=record['Years_of_Experience'],
            grouping_info=model_package['grouping_info'],
        )
        frame.at[row_label, 'Exp_group'], frame.at[row_label, 'Age_group'] = groups

    # Split target from predictors.
    target = frame['Salary']
    predictors = frame.drop(columns='Salary')

    # Feature engineering, called with is_training=False.
    feature_matrix, _ = f_lgbm.create_features_with_stats(
        predictors,
        all_job_categories=model_package['job_categories'],
        all_seniority_levels=model_package['seniority_categories'],
        stats_dict=model_package['stats_dict'],
        is_training=False,
    )

    print(f"✅ Features preparadas: {feature_matrix.shape}")
    return feature_matrix, target

def create_models(base_model):
    """
    Assemble the base learners of the ensemble.

    Args:
        base_model: Existing model registered under the name 'LightGBM'.

    Returns:
        Dict ``name -> estimator`` with the given model plus new XGBoost,
        RandomForest and CatBoost regressors.
    """
    print(" Creando modelos...")

    seed = 42
    rounds = 100

    # The supplied model goes first; the remaining learners are created here.
    models = {'LightGBM': base_model}
    models['XGBoost'] = xgb.XGBRegressor(n_estimators=rounds, random_state=seed)
    models['RandomForest'] = RandomForestRegressor(n_estimators=rounds, random_state=seed)
    models['CatBoost'] = CatBoostRegressor(iterations=rounds, random_seed=seed, verbose=False)

    print(f"✅ {len(models)} modelos creados")
    return models

def train_models(models, X_train, y_train, X_test, y_test):
    """
    Fit every model (except the one named 'LightGBM') and score it on the
    test split.

    Args:
        models: Mapping ``name -> estimator``.
        X_train, y_train: Training split used for fitting.
        X_test, y_test: Split used to compute RMSE and R².

    Returns:
        Dict ``name -> {'model', 'rmse', 'r2', 'predictions'}`` holding only
        the models that trained and predicted without raising.
    """
    print("🚀 Entrenando modelos...")

    results = {}

    for name, model in models.items():
        print(f"Entrenando {name}...")

        # The entry named 'LightGBM' is scored as-is, without fitting.
        needs_fit = name != 'LightGBM'

        try:
            if needs_fit:
                model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)

            results[name] = dict(model=model, rmse=rmse, r2=r2, predictions=y_pred)

            print(f"   RMSE: ${rmse:,.0f} | R²: {r2:.3f}")
        except Exception as exc:
            # A failing model is reported and skipped; the rest still run.
            print(f"   ❌ Error: {exc}")

    return results

def create_ensemble(results, y_test):
    """
    Describe an R²-weighted voting ensemble as a plain dictionary.

    Args:
        results: Mapping ``name -> {'model': estimator, 'r2': float, ...}``.
        y_test: Not used by this function.

    Returns:
        Dict with the selected models, their normalised weights (each R²
        divided by the sum of the selected R² values), their names and the
        ensemble type 'weighted'. Only models with R² > 0 are selected.
    """
    print(" Creando ensemble...")

    # Keep the models that beat the zero-R² baseline.
    selected = {}
    for model_name, outcome in results.items():
        if outcome['r2'] > 0:
            selected[model_name] = outcome

    # Weights are the R² scores rescaled to sum to one.
    raw_scores = np.array([outcome['r2'] for outcome in selected.values()])
    weights = raw_scores / raw_scores.sum()

    ensemble_dict = {
        'individual_models': {model_name: outcome['model'] for model_name, outcome in selected.items()},
        'weights': weights.tolist(),
        'model_names': list(selected),
        'ensemble_type': 'weighted'
    }

    print(f"✅ Ensemble creado con {len(selected)} modelos")
    return ensemble_dict

def save_compatible_model(ensemble, model_package, filename):
    """
    Write an ensemble dictionary to disk inside the base model's package
    layout.

    Args:
        ensemble: Ensemble description (a dict, not a class instance).
        model_package: Base package whose metadata entries are copied.
        filename: Destination path of the joblib file.

    Returns:
        True when the file was written, False when the dump raised.
    """
    print("💾 Guardando modelo compatible...")

    # Plain dictionaries only; the ensemble goes in under 'model'.
    compatible_package = {'model': ensemble}

    # Metadata copied unchanged from the base package.
    for shared_key in ('feature_names', 'job_categories', 'seniority_categories',
                       'stats_dict', 'grouping_info'):
        compatible_package[shared_key] = model_package[shared_key]

    compatible_package['has_statistical_features'] = True
    compatible_package['is_ensemble'] = True  # marks the payload as an ensemble

    # Extra entries, with fallbacks when the base package lacks them.
    compatible_package['total_features'] = model_package.get(
        'total_features', len(model_package['feature_names'])
    )
    compatible_package['training_data_shape'] = model_package.get('training_data_shape')
    compatible_package['model_name'] = 'Ensemble_Model'
    compatible_package['metrics'] = model_package.get('metrics', {})

    try:
        joblib.dump(compatible_package, filename)
        print(f"✅ Guardado en: {filename}")
    except Exception as exc:
        print(f"❌ Error guardando: {exc}")
        return False
    return True

def main():
    """
    Run the whole pipeline and build both ensembles (voting and stacking).

    Steps: load the model package and data, build features, split 80/20,
    fit the scaler, create and train the base models, build and save the
    voting ensemble, build and save the stacking ensemble, then compare both
    on the test split. Returns None; progress is reported through prints.
    """
    print("🚀 INICIANDO PIPELINE DE ENSEMBLE DUAL")
    print("="*50)

    # 1. Load model package and data
    model_package, data = load_model_and_data()
    if model_package is None or data is None:
        print("❌ No se pudieron cargar los datos")
        return

    # 2. Feature engineering
    X, y = prepare_data(data, model_package)

    # 3. Train / test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")

    # 4. Fit the scaler on the training split only. The helpers below apply
    #    it themselves to the models that need scaled inputs, so no scaled
    #    copies are kept here.
    scaler = StandardScaler()
    scaler.fit(X_train)

    # 5. Base models
    models = create_models(model_package['model'])

    # 6. Train and score them
    results = train_models(models, X_train, y_train, X_test, y_test)

    print("\n" + "="*60)
    print(" CREANDO ENSEMBLE VOTING")
    print("="*60)

    # 7. Voting ensemble
    voting_ensemble = create_ensemble(results, y_test)

    # 8. Persist it
    voting_saved = save_compatible_model(voting_ensemble, model_package, "../../../modelos/ensemble_voting2.pkl")

    print("\n" + "="*60)
    print(" CREANDO ENSEMBLE STACKING")
    print("="*60)

    # 9. Stacking ensemble
    stacking_ensemble = create_stacking_ensemble(results, X_train, y_train, X_test, y_test, scaler)

    # 10. Persist it
    stacking_saved = save_stacking_model(stacking_ensemble, model_package)

    print("\n" + "="*60)
    print("📊 COMPARACIÓN DE ENSEMBLES")
    print("="*60)

    # 11. Compare both ensembles on the test split
    if voting_ensemble and stacking_ensemble:
        print(" VOTING ENSEMBLE:")
        print(f"   Método: Promedio ponderado por R²")
        print(f"   Modelos: {len(voting_ensemble['individual_models'])}")

        # Reuse the shared helper instead of repeating the averaging logic;
        # an empty weight list means "plain mean".
        voting_final = ensemble_predict(
            X_test,
            voting_ensemble['individual_models'],
            weights=voting_ensemble.get('weights') or None,
            scaler=scaler,
        )

        voting_rmse = np.sqrt(mean_squared_error(y_test, voting_final))
        voting_r2 = r2_score(y_test, voting_final)

        print(f"   RMSE: ${voting_rmse:,.2f}")
        print(f"   R²: {voting_r2:.3f}")

        print("\n STACKING ENSEMBLE:")
        print(f"   Método: Meta-modelo ({stacking_ensemble['meta_model_name']})")
        print(f"   Modelos nivel 1: {len(stacking_ensemble['level1_models'])}")
        print(f"   RMSE: ${stacking_ensemble['rmse']:,.2f}")
        print(f"   R²: {stacking_ensemble['r2']:.3f}")

        # Lower RMSE wins
        if stacking_ensemble['rmse'] < voting_rmse:
            print(f"\n🏆 GANADOR: STACKING (mejora de ${voting_rmse - stacking_ensemble['rmse']:,.2f} en RMSE)")
        else:
            print(f"\n🏆 GANADOR: VOTING (mejora de ${stacking_ensemble['rmse'] - voting_rmse:,.2f} en RMSE)")

    print("\n PIPELINE DUAL COMPLETADO!")

    # Report only the files that were actually written.
    saved_files = []
    if voting_saved:
        saved_files.append("   • ensemble_voting2.pkl (Promedio ponderado)")
    if stacking_saved:
        saved_files.append("   • ensemble_stacking.pkl (Meta-modelo)")

    if len(saved_files) == 2:
        print("✅ Dos ensembles creados:")
    elif saved_files:
        print("⚠️ Solo se guardó un ensemble:")
    else:
        print("❌ No se guardó ningún ensemble")
    for line in saved_files:
        print(line)


if __name__ == "__main__":
    main()

✅ Módulos personalizados cargados correctamente
🚀 INICIANDO PIPELINE DE ENSEMBLE DUAL
🔄 Cargando modelo y datos...
✅ Modelo cargado
✅ Datos cargados: (366, 8)
🔧 Preparando datos...
🔧 Creando características completas para producción (originales + estadísticos)...
🔧 Creando todas las características ...
✅ Creadas 61 características en total
   - Variables numéricas básicas: 3
   - Variables de educación: 3
   - Variables de job category: 12
   - Variables de seniority: 6
   - Variables de texto: 4
   - Ratios y scores: 5
📊 Creando features estadísticos para producción (PREDICT)...
   📥 Usando estadísticas pre-calculadas de TRAIN...
   ✅ Creadas 32 features estadísticos para producción
✅ Features totales para producción: 93
   - Originales: 61
   - Estadísticos: 32
✅ Features preparadas: (366, 93)
Train: (292, 93), Test: (74, 93)
 Creando modelos...
✅ 4 modelos creados
🚀 Entrenando modelos...
Entrenando LightGBM...
   RMSE: $15,755 | R²: 0.906
Entrenando XGBoost...
   RMSE: $18,281 | R²:

In [None]:
#pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
