Librerias

In [1]:
import pred_lgbm as pred
import funciones_lgbm as f_lgbm
import pandas as pd
import numpy as np
import matplotlib as plt

  from .autonotebook import tqdm as notebook_tqdm


Entrenamos con una optimizaci√≥n de hiperpar√°metros utilizando optuna

In [2]:
def train_model_lgbm_optuna(data, test_size=0.2, random_state=42, n_trials=100, timeout=300):
    """
    Args:
        data: DataFrame completo con todas las columnas (incluyendo Salary)
        test_size: Proporci√≥n del conjunto de prueba
        random_state: Semilla aleatoria
        n_trials: N√∫mero m√°ximo de pruebas de Optuna
        timeout: Tiempo l√≠mite en segundos para la optimizaci√≥n
    """
    print(f"\nüöÄ Entrenando modelo LightGBM con Optuna + Features Estad√≠sticos")
    print(f"   Trials: {n_trials}, Timeout: {timeout}s")
    
    # Importaciones necesarias
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
    import lightgbm as lgb
    import optuna
    import numpy as np
    import warnings
    
    # Silenciar warnings
    warnings.filterwarnings('ignore')
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    
    # ============= PASO 1: PREPARAR GROUPING INFO =============
    print("üìä Paso 1: Preparando informaci√≥n de agrupaci√≥n...")
    data_with_groups, grouping_info = f_lgbm.create_and_save_grouping_info(data)
    all_job_cats, all_seniority_cats = f_lgbm.get_all_categories(data_with_groups)
    
    print(f"   ‚úÖ Grupos creados: Age_group, Exp_group")
    print(f"   ‚úÖ Job categories: {len(all_job_cats)}")
    print(f"   ‚úÖ Seniority levels: {len(all_seniority_cats)}")
    
    # ============= PASO 2: SEPARAR TARGET Y FEATURES =============
    
    print("üîÑ Paso 2: Separando target y features...")
    
    X_data = data_with_groups.drop('Salary', axis=1)  # Variables disponibles en producci√≥n
    y = data_with_groups['Salary']  # Target
    
    print(f"   üìä Datos originales: {X_data.shape}")
    print(f"   üéØ Target: {len(y)} registros")
    
    # ============= PASO 3: SPLIT PRINCIPAL TRAIN/TEST =============
    print("‚úÇÔ∏è  Paso 3: Split principal train/test...")
    X_train_base, X_test_base, y_train, y_test = train_test_split(
        X_data, y, test_size=test_size, random_state=random_state
    )
    
    print(f"   üìà Train: {X_train_base.shape[0]} registros")
    print(f"   üìâ Test:  {X_test_base.shape[0]} registros")
    
    # ============= PASO 4: CREAR FEATURES CON ESTAD√çSTICAS =============
    print("üîß Paso 4: Creando features con estad√≠sticas...")
    
    # Crear features en TRAIN (calcula estad√≠sticas)
    X_train, feature_names, stats_dict = f_lgbm.create_features_with_stats(
        X_train_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=None,
        is_training=True
    )
    
    # Aplicar features a TEST (usa estad√≠sticas de train)
    X_test, _ = f_lgbm.create_features_with_stats(
        X_test_base,
        all_job_categories=all_job_cats,
        all_seniority_levels=all_seniority_cats,
        stats_dict=stats_dict,
        is_training=False
    )
    
    print(f"   ‚úÖ Features totales: {X_train.shape[1]}")
    print(f"   ‚úÖ Train: {X_train.shape}")
    print(f"   ‚úÖ Test:  {X_test.shape}")
    
    # ============= PASO 5: SPLIT PARA VALIDACI√ìN DE OPTUNA =============
    print("üîÑ Paso 5: Split para validaci√≥n de Optuna...")
    X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
        X_train, y_train, test_size=0.2, random_state=random_state
    )
    
    print(f"   üéØ Train opt: {X_train_opt.shape}")
    print(f"   üîç Validation: {X_val_opt.shape}")
    
    # ============= PASO 6: f_lgbmCI√ìN OBJETIVO PARA OPTUNA =============
    def objective(trial):
        """f_lgbmci√≥n objetivo para Optuna"""
        
        # Hiperpar√°metros a optimizar
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'verbosity': -1,
            'random_state': random_state,
            'n_jobs': -1,
            
            # Par√°metros a optimizar
            'num_leaves': trial.suggest_int('num_leaves', 10, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000)
        }
        
        # Crear y entrenar modelo
        model = lgb.LGBMRegressor(**params)
        
        try:
            # Entrenar con early stopping
            model.fit(
                X_train_opt, y_train_opt,
                eval_set=[(X_val_opt, y_val_opt)],
                callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
            )
            
            # Predecir en conjunto de validaci√≥n
            y_pred = model.predict(X_val_opt)
            rmse = np.sqrt(mean_squared_error(y_val_opt, y_pred))
            
            return rmse
            
        except Exception as e:
            # Si hay error, devolver un valor alto
            return float('inf')
    
    # ============= PASO 7: OPTIMIZACI√ìN CON OPTUNA =============
    print("üéØ Paso 7: Optimizando hiperpar√°metros con Optuna...")
    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=random_state))
    
    # Optimizar
    study.optimize(objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)
    
    print(f"   ‚úÖ Optimizaci√≥n completada: {len(study.trials)} trials realizados")
    print(f"   üèÜ Mejor RMSE de validaci√≥n: ${study.best_value:,.2f}")
    
    # ============= PASO 8: MODELO FINAL =============
    print("üèÜ Paso 8: Entrenando modelo final...")
    
    # Obtener mejores par√°metros
    best_params = study.best_params.copy()
    best_params.update({
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'random_state': random_state,
        'n_jobs': -1
    })
    
    print("   üìã Mejores hiperpar√°metros encontrados:")
    for param, value in best_params.items():
        if param not in ['objective', 'metric', 'boosting_type', 'verbosity', 'random_state', 'n_jobs']:
            print(f"      {param}: {value}")
    
    # Entrenar modelo final con mejores par√°metros
    final_model = lgb.LGBMRegressor(**best_params)
    
    try:
        # Entrenar en todo el conjunto de entrenamiento
        final_model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
        )
        
        # ============= PASO 9: EVALUACI√ìN FINAL =============
        print("üìä Paso 9: Evaluaci√≥n final...")
        
        # Predicciones finales
        y_pred = final_model.predict(X_test)
        
        # M√©tricas en conjunto de prueba
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        # Cross-validation con el modelo optimizado
        print("   üîÑ Realizando validaci√≥n cruzada final...")
        cv_model = lgb.LGBMRegressor(**best_params)
        cv_scores = cross_val_score(
            cv_model, X_train, y_train, cv=5, 
            scoring='neg_mean_squared_error', n_jobs=-1
        )
        cv_rmse = np.sqrt(-cv_scores.mean())
        cv_std = np.sqrt(cv_scores.std())
        
        # ============= PASO 10: PREPARAR RESULTADOS =============
        print("üì¶ Paso 10: Preparando resultados finales...")
        
        # Resultados del modelo
        model_metrics = {
            'model': final_model,
            'rmse': rmse,
            'r2': r2,
            'mae': mae,
            'cv_rmse': cv_rmse,
            'cv_std': cv_std,
            'predictions': y_pred,
            'feature_importances': final_model.feature_importances_,
            'n_estimators_used': final_model.best_iteration_ if hasattr(final_model, 'best_iteration_') else final_model.n_estimators,
            'best_params': best_params,
            'optuna_study': study
        }
        
        # Resultado completo para compatibilidad
        final_results = {
            'model_results': {'LightGBM_Optuna': model_metrics},
            'best_model_name': 'LightGBM_Optuna',
            'best_model': final_model,
            'feature_names': feature_names,
            'job_categories': all_job_cats,
            'seniority_categories': all_seniority_cats,
            'stats_dict': stats_dict,
            'grouping_info': grouping_info,
            'X_test': X_test,
            'y_test': y_test,
            'X_train': X_train,
            'y_train': y_train,
            'optimization_study': study
        }
        
        # ============= MOSTRAR RESULTADOS =============
        print(f"\nüéâ RESULTADOS FINALES:")
        print(f"   RMSE: ${rmse:,.2f}")
        print(f"   R¬≤: {r2:.3f}")
        print(f"   MAE: ${mae:,.2f}")
        print(f"   CV RMSE: ${cv_rmse:,.2f} (¬±{cv_std:,.2f})")
        print(f"   Features totales: {len(feature_names)}")
        print(f"   Estimadores utilizados: {model_metrics['n_estimators_used']}")
        print(f"   Mejora vs RMSE de validaci√≥n: {((study.best_value - rmse) / study.best_value * 100):+.2f}%")
        
        return final_results
        
    except Exception as e:
        print(f"‚ùå Error entrenando modelo final: {str(e)}")
        import traceback
        print(traceback.format_exc())
        return None

PIPELINE - LGBM

In [3]:
def pipeline_fe():
    print("üöÄ INICIANDO PIPELINE")

    # 1 . Cargar datos
    data = pd.read_csv('../../../dataC/imputado.csv')
    data["Description"] = data["Description"].fillna("")
    
    data = data.dropna()
    # Se mejora levemente los errores realizando esta imputaci√≥n y dejando los otros nulos de los otros features.
    
    # Entrenar modelo
    model_results =train_model_lgbm_optuna(data)

    # Analizar optimizaci√≥n
    best_params, best_value = f_lgbm.analyze_optuna_optimization(model_results['optimization_study'])

    # Visualizar proceso
    #plot_optuna_optimization(model_results['optimization_study'])

    # 5. Analizar importancia
    #feature_importance = fun.analyze_feature_importance(X,feature_names,model)

    # 6. Analizar predicciones
    #predictions_analysis = f.analyze_predictions(model_results)

    # 7. Comparar modelos
    #fun.create_comparison_chart(model_results)

    print("\nüéâ AN√ÅLISIS COMPLETADO!")
    print("="*50)

    # Resumen final
    best_name = model_results['best_model_name']
    best_result = model_results['model_results'][best_name]

    print(f"\nüèÜ RESUMEN FINAL:")
    print(f"   Mejor modelo: {best_name}")
    print(f"   RMSE: ${best_result['rmse']:,.2f}")
    print(f"   R¬≤: {best_result['r2']:.3f}")
    print(f"   CV RMSE: ${best_result['cv_rmse']:,.2f}")
    #print(f"   Caracter√≠sticas utilizadas: {len(feature_names)}")
    
    

     
    return model_results,data
    # ,model,X,feature_importance
    
results,data=pipeline_fe()   

üöÄ INICIANDO PIPELINE

üöÄ Entrenando modelo LightGBM con Optuna + Features Estad√≠sticos
   Trials: 100, Timeout: 300s
üìä Paso 1: Preparando informaci√≥n de agrupaci√≥n...
üìä Creando grupos y guardando informaci√≥n de rangos...
   ‚úÖ Grupos creados: Age_group, Exp_group
   ‚úÖ Job categories: 12
   ‚úÖ Seniority levels: 5
üîÑ Paso 2: Separando target y features...
   üìä Datos originales: (369, 9)
   üéØ Target: 369 registros
‚úÇÔ∏è  Paso 3: Split principal train/test...
   üìà Train: 295 registros
   üìâ Test:  74 registros
üîß Paso 4: Creando features con estad√≠sticas...
üîß Creando caracter√≠sticas completas para producci√≥n (originales + estad√≠sticos)...
üîß Creando todas las caracter√≠sticas mejoradas...
‚úÖ Creadas 61 caracter√≠sticas en total
   - Variables num√©ricas b√°sicas: 3
   - Variables de educaci√≥n: 3
   - Variables de job category: 12
   - Variables de seniority: 6
   - Variables de texto: 4
   - Ratios y scores: 5
üìä Creando features estad√≠stico

Best trial: 88. Best value: 13311.5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:10<00:00,  9.35it/s, 10.69/300 seconds]


   ‚úÖ Optimizaci√≥n completada: 100 trials realizados
   üèÜ Mejor RMSE de validaci√≥n: $13,311.46
üèÜ Paso 8: Entrenando modelo final...
   üìã Mejores hiperpar√°metros encontrados:
      num_leaves: 21
      learning_rate: 0.1103186092201265
      feature_fraction: 0.906524091469967
      bagging_fraction: 0.6815973162203444
      bagging_freq: 1
      min_child_samples: 23
      min_child_weight: 0.7585553500711019
      reg_alpha: 9.248611683528368
      reg_lambda: 1.5875177712326718
      max_depth: 13
      n_estimators: 803
üìä Paso 9: Evaluaci√≥n final...
   üîÑ Realizando validaci√≥n cruzada final...
üì¶ Paso 10: Preparando resultados finales...

üéâ RESULTADOS FINALES:
   RMSE: $16,963.52
   R¬≤: 0.886
   MAE: $9,915.00
   CV RMSE: $11,777.75 (¬±6,188.65)
   Features totales: 93
   Estimadores utilizados: 134
   Mejora vs RMSE de validaci√≥n: -27.44%
üî¨ An√°lisis de optimizaci√≥n Optuna:
   N√∫mero total de trials: 100
   Mejor valor: $13,311.46
   Trials completad

In [4]:
complete_package = f_lgbm.save_with_stats(
        results,
        filename="../../../modelos/salary_with_stats.pkl"
    )

üíæ Guardando modelo completo con features estad√≠sticos...
‚úÖ Modelo completo guardado en ../../../modelos/salary_with_stats.pkl
üì¶ Incluye:
   ü§ñ Modelo LightGBM optimizado
   üî¢ 93 caracter√≠sticas
   üìä Features estad√≠sticos (stats_dict)
   üè∑Ô∏è  12 categor√≠as de trabajo
   üëî 5 niveles de seniority
   üìà Informaci√≥n de agrupaci√≥n
   üìâ M√©tricas del modelo


Probamos con  una nueva predicci√≥n

In [6]:
import joblib

In [7]:
model_package = joblib.load("../../../modelos/salary_with_stats.pkl")
def test_prediction(model_package):
    """
    Test para verificar que la predicci√≥n funciona con un solo registro
    """
    print("üß™ Testing predicci√≥n con un solo registro...")
    
    # Crear registro de prueba
    exp_group, age_group = pred.calculate_groups(
    age=60, 
    years_of_experience=24, 
    grouping_info=model_package.get('grouping_info')
    )

    test_record = pd.DataFrame({
    'Age': [60],
    'Gender': ['Male'],
    'Education_Level': ["PhD"],
    'Job_Title': ['CEO'],
    'Years_of_Experience': [24],
    'Description': ['I work with machine learning models and data analysis'],
    'Exp_group': [exp_group],      # ‚Üê Calculado autom√°ticamente
    'Age_group': [age_group]       # ‚Üê Calculado autom√°ticamente
    })
    
    try:
        prediction = pred.predict(test_record, model_package)
        print(f"‚úÖ Test exitoso: Predicci√≥n = ${prediction:,.2f}")
        return True
    except Exception as e:
        print(f"‚ùå Test fall√≥: {e}")
        return False

In [None]:
import joblib
model_package = joblib.load("../../../modelos/salary_with_stats.pkl")

In [None]:
test_prediction(model_package)

üß™ Testing predicci√≥n con un solo registro...
üéØ Predicci√≥n con modelo completo (un solo registro)...
   üî¢ Features esperadas: 93
üîß Creando caracter√≠sticas completas para un solo registro...
üîß Creando todas las caracter√≠sticas mejoradas...
‚úÖ Creadas 61 caracter√≠sticas en total
   - Variables num√©ricas b√°sicas: 3
   - Variables de educaci√≥n: 3
   - Variables de job category: 12
   - Variables de seniority: 6
   - Variables de texto: 4
   - Ratios y scores: 5
üìä Creando features estad√≠sticos para un solo registro...
   ‚úÖ Creadas 32 features estad√≠sticos para un solo registro
‚úÖ Features totales para un solo registro: 93
   - Originales: 61
   - Estad√≠sticos: 32
   üî¢ Features generadas: 93
   üí∞ Predicci√≥n: $172,229.38
   ‚úÖ Predicci√≥n exitosa con 93 features
‚úÖ Test exitoso: Predicci√≥n = $172,229.38


True

An√°lisis de la optimizaci√≥n

In [None]:


# 1. AN√ÅLISIS DE IMPORTANCIA DE HIPERPAR√ÅMETROS
print("üîç AN√ÅLISIS DE IMPORTANCIA DE HIPERPAR√ÅMETROS")
print("="*50)

# Suponiendo que tienes tu study guardado en 'results' o 'study'
best_params, best_value = f_lgbm.analyze_optuna_optimization(results['optimization_study'])

# 2. AN√ÅLISIS DETALLADO DE M√âTRICAS FINALES
print("\nüìä M√âTRICAS FINALES DEL MODELO OPTIMIZADO")
print("="*50)

final_model = results['best_model']
model_results = results['model_results']['LightGBM_Optuna']

print(f"üéØ RMSE en Test: ${model_results['rmse']:,.2f}")
print(f"üìà R¬≤ Score: {model_results['r2']:.4f} ({model_results['r2']*100:.2f}%)")
print(f"üìâ MAE: ${model_results['mae']:,.2f}")
print(f"üîÑ CV RMSE: ${model_results['cv_rmse']:,.2f} (¬±{model_results['cv_std']:,.2f})")
print(f"üå≥ Estimadores usados: {model_results['n_estimators_used']}")

# Calcular mejora respecto a baseline
rmse_improvement = ((12600 - model_results['rmse']) / 12600) * 100
print(f"‚ö° Mejora total: {rmse_improvement:.2f}% respecto al inicio")

# 3. AN√ÅLISIS DE IMPORTANCIA DE CARACTER√çSTICAS
print("\nüîç IMPORTANCIA DE CARACTER√çSTICAS")
print("="*50)

feature_importance_df = f_lgbm.get_feature_importance(
    final_model, 
    feature_names=results.get('feature_names'), 
    top_n=20
)

# 4. AN√ÅLISIS DE CALIDAD DE PREDICCIONES
print("\nüéØ AN√ÅLISIS DE CALIDAD DE PREDICCIONES")
print("="*50)

y_test = results['y_test']
y_pred = model_results['predictions']
residuos = y_test - y_pred

# Estad√≠sticas de residuos
print(f"Media de residuos: ${np.mean(residuos):,.2f}")
print(f"Std de residuos: ${np.std(residuos):,.2f}")
print(f"Mediana abs residuos: ${np.median(np.abs(residuos)):,.2f}")

# Percentiles de error absoluto
abs_errors = np.abs(residuos)
print(f"\nPercentiles de error absoluto:")
print(f"  25%: ${np.percentile(abs_errors, 25):,.2f}")
print(f"  50%: ${np.percentile(abs_errors, 50):,.2f}")
print(f"  75%: ${np.percentile(abs_errors, 75):,.2f}")
print(f"  95%: ${np.percentile(abs_errors, 95):,.2f}")

# 5. AN√ÅLISIS DE OUTLIERS EN PREDICCIONES
print(f"\nüîç AN√ÅLISIS DE OUTLIERS")
print("="*50)

# Definir outliers como errores > 2 std
threshold = 2 * np.std(abs_errors)
outliers = abs_errors > threshold
n_outliers = np.sum(outliers)
outlier_pct = (n_outliers / len(abs_errors)) * 100

print(f"Outliers detectados: {n_outliers} ({outlier_pct:.2f}%)")
print(f"Threshold usado: ${threshold:,.2f}")

if n_outliers > 0:
    print(f"Error promedio en outliers: ${np.mean(abs_errors[outliers]):,.2f}")
    print(f"Valor real promedio outliers: ${np.mean(y_test[outliers]):,.2f}")

# 6. AN√ÅLISIS DE HIPERPAR√ÅMETROS √ìPTIMOS
print(f"\n‚öôÔ∏è HIPERPAR√ÅMETROS √ìPTIMOS ENCONTRADOS")
print("="*50)

important_params = [
    'num_leaves', 'learning_rate', 'max_depth', 'n_estimators',
    'feature_fraction', 'bagging_fraction', 'reg_alpha', 'reg_lambda'
]

for param in important_params:
    if param in best_params:
        print(f"{param:20s}: {best_params[param]}")

# 7. COMPARACI√ìN CON CONFIGURACI√ìN BASE
print(f"\nüìà COMPARACI√ìN CON CONFIGURACI√ìN BASE")
print("="*50)

# Configuraci√≥n base t√≠pica
base_config = {
    'num_leaves': 31,  # Default
    'learning_rate': 0.1,  # Default
    'max_depth': -1,  # Default (sin l√≠mite)
    'n_estimators': 100,  # Default
    'feature_fraction': 1.0,  # Default
    'bagging_fraction': 1.0,  # Default
}

print("Par√°metro             Base      Optimizado    Cambio")
print("-" * 55)
for param in important_params[:6]:  # Top 6 m√°s importantes
    if param in best_params and param in base_config:
        base_val = base_config[param]
        opt_val = best_params[param]
        if isinstance(opt_val, float):
            change = f"{((opt_val - base_val) / base_val * 100):+.1f}%"
            print(f"{param:20s} {base_val:8.3f} {opt_val:11.3f} {change:>10s}")
        else:
            change = f"{opt_val - base_val:+d}"
            print(f"{param:20s} {base_val:8d} {opt_val:11d} {change:>10s}")

# 8. RECOMENDACIONES FINALES
print(f"\nüí° RECOMENDACIONES Y CONCLUSIONES")
print("="*50)

print("‚úÖ Aspectos positivos:")
print(f"  ‚Ä¢ Convergencia estable alcanzada en ~45 trials")
print(f"  ‚Ä¢ Mejora significativa: {rmse_improvement:.1f}% en RMSE")
print(f"  ‚Ä¢ R¬≤ de {model_results['r2']:.3f} indica buen ajuste")
print(f"  ‚Ä¢ CV estable con baja varianza")

if model_results['r2'] > 0.8:
    print(f"  ‚Ä¢ Excelente capacidad predictiva (R¬≤ > 0.8)")
elif model_results['r2'] > 0.6:
    print(f"  ‚Ä¢ Buena capacidad predictiva (R¬≤ > 0.6)")

print(f"\n‚ö†Ô∏è  √Åreas de atenci√≥n:")
if outlier_pct > 5:
    print(f"  ‚Ä¢ {outlier_pct:.1f}% de outliers - considerar an√°lisis adicional")
if model_results['cv_std'] > model_results['cv_rmse'] * 0.1:
    print(f"  ‚Ä¢ Varianza en CV relativamente alta - validar estabilidad")



üîç AN√ÅLISIS DE IMPORTANCIA DE HIPERPAR√ÅMETROS
üî¨ An√°lisis de optimizaci√≥n Optuna:
   N√∫mero total de trials: 100
   Mejor valor: $13,311.46
   Trials completados: 100
   Trials fallidos: 0
   üîç Importancia de hiperpar√°metros (Top 10):
       1. min_child_samples    - 0.8167
       2. learning_rate        - 0.1355
       3. num_leaves           - 0.0151
       4. feature_fraction     - 0.0094
       5. reg_lambda           - 0.0074
       6. bagging_fraction     - 0.0059
       7. reg_alpha            - 0.0034
       8. max_depth            - 0.0031
       9. n_estimators         - 0.0027
      10. min_child_weight     - 0.0004

üìä M√âTRICAS FINALES DEL MODELO OPTIMIZADO
üéØ RMSE en Test: $16,963.52
üìà R¬≤ Score: 0.8859 (88.59%)
üìâ MAE: $9,915.00
üîÑ CV RMSE: $11,777.75 (¬±6,188.65)
üå≥ Estimadores usados: 134
‚ö° Mejora total: -34.63% respecto al inicio

üîç IMPORTANCIA DE CARACTER√çSTICAS

üîç Top 20 caracter√≠sticas m√°s importantes:
    1. avg_word_length    

El 25% de los casos se predicen con un error menor a $1250 ok.

El 50% (mediana) est√°n dentro de $6000 ok.

Pero un 25% tiene errores mayores a $12.800, y un 5% supera los $24.500

Salarios de aproximadamente $200 000 un error 12%

Construir nuevas variables que puedan captar este comportamiento - work

El modelo muestra un excelente rendimiento general, con m√©tricas s√≥lidas y comportamiento estable. Revisar casos extremos (outliers) y si hay segmentos que afectan la varianza de la validaci√≥n cruzada.

Posible forma en la que se generan los grupos


Revisemos los outliers de predicci√≥n

In [None]:
# AN√ÅLISIS DE OUTLIERS USANDO SOLO MODEL_RESULTS
print("üîç AN√ÅLISIS DE OUTLIERS CON MODEL_RESULTS")
print("="*60)

# 1. EXTRAER DATOS DESDE MODEL_RESULTS
model_results = results['model_results']['LightGBM_Optuna']
y_test = results['y_test']
y_pred = model_results['predictions']

print(f"üìä Datos disponibles:")
print(f"   ‚Ä¢ Predicciones: {len(y_pred)} casos")
print(f"   ‚Ä¢ Valores reales: {len(y_test)} casos")

# 2. CALCULAR ERRORES Y IDENTIFICAR OUTLIERS
residuos = y_test - y_pred
abs_errors = np.abs(residuos)
rel_errors = (abs_errors / y_test) * 100

# Convertir a arrays numpy para evitar problemas de indexing
y_test_array = y_test.values if hasattr(y_test, 'values') else np.array(y_test)
y_pred_array = np.array(y_pred)
abs_errors_array = np.abs(y_test_array - y_pred_array)
rel_errors_array = (abs_errors_array / y_test_array) * 100

# Threshold para outliers (2 std)
threshold = 2 * np.std(abs_errors_array)
outliers_mask = abs_errors_array > threshold
outliers_indices = np.where(outliers_mask)[0]

print(f"\nüéØ DETECCI√ìN DE OUTLIERS:")
print(f"   ‚Ä¢ Threshold: ${threshold:,.2f}")
print(f"   ‚Ä¢ Outliers encontrados: {len(outliers_indices)} casos")
print(f"   ‚Ä¢ Porcentaje: {(len(outliers_indices)/len(y_test))*100:.2f}%")

# 3. AN√ÅLISIS DETALLADO DE OUTLIERS
print(f"\nüî¥ TOP 15 OUTLIERS M√ÅS PROBLEM√ÅTICOS")
print("="*80)

# Crear DataFrame con la informaci√≥n de outliers
outliers_df = pd.DataFrame({
    'Index': outliers_indices,
    'Salary_Real': y_test_array[outliers_indices],
    'Salary_Pred': y_pred_array[outliers_indices],
    'Error_Abs': abs_errors_array[outliers_indices],
    'Error_Rel': rel_errors_array[outliers_indices]
})

# Ordenar por error absoluto (descendente)
outliers_df = outliers_df.sort_values('Error_Abs', ascending=False)

print("Rank  Index   Salario Real   Predicci√≥n    Error Abs    Error %")
print("-" * 75)

for i, (_, row) in enumerate(outliers_df.head(15).iterrows(), 1):
    print(f"{i:2d}    {row['Index']:4.0f}   ${row['Salary_Real']:9.0f}   ${row['Salary_Pred']:9.0f}   ${row['Error_Abs']:8.0f}   {row['Error_Rel']:6.1f}%")

# 4. ESTAD√çSTICAS DE OUTLIERS vs NORMALES
print(f"\nüìà COMPARACI√ìN: OUTLIERS vs CASOS NORMALES")
print("="*60)

normal_mask = ~outliers_mask

# Salarios reales
outlier_salaries = y_test_array[outliers_mask]
normal_salaries = y_test_array[normal_mask]

print("M√©trica                 Outliers      Normales    Diferencia")
print("-" * 65)
print(f"{'Media Salario':20s}   ${outlier_salaries.mean():9.0f} ${normal_salaries.mean():9.0f}   ${outlier_salaries.mean() - normal_salaries.mean():+9.0f}")
#print(f"{'Mediana Salario':20s}   ${outlier_salaries.median():9.0f} ${normal_salaries.median():9.0f}   ${outlier_salaries.median() - normal_salaries.median():+9.0f}")
print(f"{'Std Salario':20s}   ${outlier_salaries.std():9.0f} ${normal_salaries.std():9.0f}   ${outlier_salaries.std() - normal_salaries.std():+9.0f}")

# Errores
outlier_errors = abs_errors_array[outliers_mask]
normal_errors = abs_errors_array[normal_mask]

print(f"{'Error Promedio':20s}   ${outlier_errors.mean():9.0f} ${normal_errors.mean():9.0f}   ${outlier_errors.mean() - normal_errors.mean():+9.0f}")
print(f"{'Error Mediano':20s}   ${np.median(outlier_errors):9.0f} ${np.median(normal_errors):9.0f}   ${np.median(outlier_errors) - np.median(normal_errors):+9.0f}")

# 5. AN√ÅLISIS DE DISTRIBUCI√ìN DE OUTLIERS
print(f"\nüìä DISTRIBUCI√ìN DE OUTLIERS POR RANGOS SALARIALES")
print("="*60)

# Definir rangos salariales
salary_ranges = [
    (0, 50000, "Bajo (<$50K)"),
    (50000, 100000, "Medio ($50K-$100K)"),
    (100000, 150000, "Alto ($100K-$150K)"),
    (150000, 200000, "Muy Alto ($150K-$200K)"),
    (200000, float('inf'), "Premium (>$200K)")
]

print("Rango Salarial        Total    Outliers   Tasa Outlier")
print("-" * 60)

for min_sal, max_sal, label in salary_ranges:
    # Casos en este rango
    in_range = (y_test_array >= min_sal) & (y_test_array < max_sal)
    total_in_range = in_range.sum()
    
    if total_in_range > 0:
        # Outliers en este rango
        outliers_in_range = (in_range & outliers_mask).sum()
        outlier_rate = (outliers_in_range / total_in_range) * 100
        
        print(f"{label:20s}   {total_in_range:5d}    {outliers_in_range:8d}   {outlier_rate:8.2f}%")

# 6. AN√ÅLISIS DE PATRONES EN PREDICCIONES
print(f"\nüéØ PATRONES EN LAS PREDICCIONES ERR√ìNEAS")
print("="*60)

# Casos donde el modelo subestima mucho
underestimated = outliers_df[outliers_df['Salary_Real'] > outliers_df['Salary_Pred']]
underestimated_severe = underestimated[underestimated['Error_Rel'] > 30]

# Casos donde el modelo sobreestima mucho
overestimated = outliers_df[outliers_df['Salary_Real'] < outliers_df['Salary_Pred']]
overestimated_severe = overestimated[overestimated['Error_Rel'] > 30]

print(f"üìâ Subestimaciones severas (>30%): {len(underestimated_severe)} casos")
if len(underestimated_severe) > 0:
    print(f"   ‚Ä¢ Error promedio: ${underestimated_severe['Error_Abs'].mean():,.0f}")
    print(f"   ‚Ä¢ Salario real promedio: ${underestimated_severe['Salary_Real'].mean():,.0f}")
    print(f"   ‚Ä¢ Predicci√≥n promedio: ${underestimated_severe['Salary_Pred'].mean():,.0f}")

print(f"\nüìà Sobreestimaciones severas (>30%): {len(overestimated_severe)} casos")
if len(overestimated_severe) > 0:
    print(f"   ‚Ä¢ Error promedio: ${overestimated_severe['Error_Abs'].mean():,.0f}")
    print(f"   ‚Ä¢ Salario real promedio: ${overestimated_severe['Salary_Real'].mean():,.0f}")
    print(f"   ‚Ä¢ Predicci√≥n promedio: ${overestimated_severe['Salary_Pred'].mean():,.0f}")

# 7. CASOS EXTREMOS PARA INVESTIGACI√ìN MANUAL
print(f"\nüîç CASOS EXTREMOS PARA INVESTIGACI√ìN")
print("="*60)

# Casos con errores relativos extremos
extreme_cases = outliers_df[outliers_df['Error_Rel'] > 50]
print(f"‚ö†Ô∏è  Casos con error relativo >50%: {len(extreme_cases)}")

if len(extreme_cases) > 0:
    print("\nCasos que requieren investigaci√≥n manual:")
    for _, row in extreme_cases.head(5).iterrows():
        error_type = "Subestim√≥" if row['Salary_Real'] > row['Salary_Pred'] else "Sobreestim√≥"
        print(f"  ‚Ä¢ √çndice {row['Index']:.0f}: {error_type}")
        print(f"    Real: ${row['Salary_Real']:,.0f} | Pred: ${row['Salary_Pred']:,.0f} | Error: {row['Error_Rel']:.1f}%")

# 8. M√âTRICAS DE OUTLIERS
print(f"\nüìä M√âTRICAS ESPEC√çFICAS DE OUTLIERS")
print("="*60)

print(f"üéØ Estad√≠sticas de errores en outliers:")
print(f"   ‚Ä¢ Error absoluto promedio: ${outliers_df['Error_Abs'].mean():,.2f}")
print(f"   ‚Ä¢ Error absoluto mediano: ${outliers_df['Error_Abs'].median():,.2f}")
print(f"   ‚Ä¢ Error relativo promedio: {outliers_df['Error_Rel'].mean():.2f}%")
print(f"   ‚Ä¢ Error relativo mediano: {outliers_df['Error_Rel'].median():.2f}%")

# Percentiles de errores en outliers
print(f"\nüìà Percentiles de error absoluto en outliers:")
print(f"   ‚Ä¢ P25: ${np.percentile(outliers_df['Error_Abs'], 25):,.0f}")
print(f"   ‚Ä¢ P50: ${np.percentile(outliers_df['Error_Abs'], 50):,.0f}")
print(f"   ‚Ä¢ P75: ${np.percentile(outliers_df['Error_Abs'], 75):,.0f}")
print(f"   ‚Ä¢ P90: ${np.percentile(outliers_df['Error_Abs'], 90):,.0f}")

# 9. RECOMENDACIONES BASADAS EN EL AN√ÅLISIS
print(f"\nüí° RECOMENDACIONES PARA MEJORAR EL MODELO")
print("="*60)

if len(extreme_cases) > 0:
    print("üî¥ PRIORIDAD ALTA:")
    print(f"   ‚Ä¢ Investigar {len(extreme_cases)} casos con error >50%")
    print("   ‚Ä¢ Verificar posibles errores en los datos")
    print("   ‚Ä¢ Considerar exclusi√≥n temporal para validar")

if len(underestimated_severe) > len(overestimated_severe):
    print(f"\nüìâ El modelo tiende a SUBESTIMAR salarios altos:")
    print("   ‚Ä¢ Revisar feature engineering para capturar mejor salarios premium")
    print("   ‚Ä¢ Considerar transformaci√≥n logar√≠tmica")
    print("   ‚Ä¢ Ajustar regularizaci√≥n para permitir predicciones m√°s altas")
elif len(overestimated_severe) > len(underestimated_severe):
    print(f"\nüìà El modelo tiende a SOBREESTIMAR:")
    print("   ‚Ä¢ Aumentar regularizaci√≥n")
    print("   ‚Ä¢ Revisar outliers en datos de entrenamiento")

outlier_rate = (len(outliers_indices)/len(y_test))*100
if outlier_rate > 10:
    print(f"\n‚ö†Ô∏è  Tasa de outliers alta ({outlier_rate:.2f}%):")
    print("   ‚Ä¢ Revisar calidad de los datos")
    print("   ‚Ä¢ Considerar ensemble de modelos")
    print("   ‚Ä¢ Implementar detecci√≥n de anomal√≠as en preprocessing")

print(f"\n‚úÖ PR√ìXIMOS PASOS:")
print("1. Exportar √≠ndices de outliers para an√°lisis manual")
print("2. Verificar datos originales de casos extremos")
print("3. Implementar mejoras espec√≠ficas identificadas")
print("4. Re-evaluar modelo sin outliers extremos")

# 10. EXPORTAR OUTLIERS PARA AN√ÅLISIS
print(f"\nüíæ DATOS DE OUTLIERS PARA EXPORTAR:")
print("="*60)
print("# C√≥digo para exportar outliers:")
print("outliers_to_export = outliers_df.copy()")
print("# outliers_to_export.to_csv('outliers_analysis.csv', index=False)")
print(f"# Total registros a exportar: {len(outliers_df)}")

üîç AN√ÅLISIS DE OUTLIERS CON MODEL_RESULTS
üìä Datos disponibles:
   ‚Ä¢ Predicciones: 74 casos
   ‚Ä¢ Valores reales: 74 casos

üéØ DETECCI√ìN DE OUTLIERS:
   ‚Ä¢ Threshold: $27,528.46
   ‚Ä¢ Outliers encontrados: 3 casos
   ‚Ä¢ Porcentaje: 4.05%

üî¥ TOP 15 OUTLIERS M√ÅS PROBLEM√ÅTICOS
Rank  Index   Salario Real   Predicci√≥n    Error Abs    Error %
---------------------------------------------------------------------------
 1      42   $   250000   $   141914   $  108086     43.2%
 2      16   $    90000   $    54140   $   35860     39.8%
 3       6   $   160000   $   130736   $   29264     18.3%

üìà COMPARACI√ìN: OUTLIERS vs CASOS NORMALES
M√©trica                 Outliers      Normales    Diferencia
-----------------------------------------------------------------
Media Salario          $   166667 $    95282   $   +71385
Std Salario            $    65490 $    47329   $   +18160
Error Promedio         $    57737 $     7894   $   +49842
Error Mediano          $    35860 $    

****************************************************************************

In [None]:
with open("../../../modelos/salary_with_stats.pkl", "rb") as f:
    print(f.read(50))

b'\x80\x04\x95\xf3\x01\x00\x00\x00\x00\x00\x00}\x94(\x8c\x05model\x94\x8c\x10lightgbm.sklearn\x94\x8c\rLGBMReg'


Pr√≥ximos pasos:
    Probar un ensamble de modelos
    Implementar monitoreo de drift
    

In [None]:
! pip freeze > requirements.txt

In [None]:
dataf=data[(data['Salary'] < 90000) & (data['Salary'] > 35000)]

In [None]:
dataf

Unnamed: 0,id,Age,Gender,Education_Level,Job_Title,Years_of_Experience,Salary,Description,Exp_group,Age_group
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,I am a 28-year-old data analyst with a Master'...,Junior,Joven
3,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,I am a 36-year-old female Sales Associate with...,Medio,Medio
5,5,29.0,Male,Bachelor's,Marketing Analyst,2.0,55000.0,I am a 29-year-old Marketing Analyst with a Ba...,Junior,Joven
7,7,31.0,Male,Bachelor's,Sales Manager,4.0,80000.0,I am a 31-year-old Sales Manager with a Bachel...,Junior,Medio
8,8,26.0,Female,Bachelor's,Marketing Coordinator,1.0,45000.0,I am a 26-year-old female Marketing Coordinato...,Junior,Joven
...,...,...,...,...,...,...,...,...,...,...
361,363,33.0,Male,Bachelor's,Junior Marketing Specialist,5.0,70000.0,I am a 33-year-old male with a Bachelor's degr...,Junior,Medio
364,366,31.0,Female,Bachelor's,Junior Financial Analyst,3.0,50000.0,I am a 31-year-old female working as a Junior ...,Junior,Medio
367,369,33.0,Male,Bachelor's,Junior Business Analyst,4.0,60000.0,I am a 33-year-old male working as a Junior Bu...,Junior,Medio
368,370,35.0,Female,Bachelor's,Senior Marketing Analyst,8.0,85000.0,As a 35-year-old Senior Marketing Analyst with...,Medio,Medio
