In [4]:
# ==========================================
# AN√ÅLISIS DE CONTRIBUCI√ìN AL AUC POR FEATURE
# ==========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("="*60)
print("üìä AN√ÅLISIS DE IMPORTANCIA DE FEATURES PARA AUC")
print("="*60)

# ==========================================
# 1. CARGAR DATOS
# ==========================================
print("\nüìÇ Cargando datos...")

df = pd.read_parquet("../data/interim/train_final_advanced_features.parquet")

# Limpiar
df = df.loc[:, ~df.columns.duplicated()]
cols_to_drop = [c for c in df.columns if c.endswith('_x') or c.endswith('_y')]
df = df.drop(columns=cols_to_drop, errors='ignore')

X = df.drop(['TARGET', 'SK_ID_CURR'], axis=1, errors='ignore')
y = df['TARGET']

# Encoding
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
for col in cat_cols:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

print(f"Features: {X.shape[1]}")
print(f"Samples: {X.shape[0]}")

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==========================================
# 2. ENTRENAR MODELO BASE
# ==========================================
print("\nüîß Entrenando modelo base...")

model = XGBClassifier(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.02,
    min_child_weight=50,
    subsample=0.8,
    colsample_bytree=0.5,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    device='cuda',
    tree_method='hist',
    random_state=42,
    eval_metric='auc',
    early_stopping_rounds=100,
    verbosity=0
)

model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

baseline_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"AUC Baseline: {baseline_auc:.4f}")

# ==========================================
# 3. FEATURE IMPORTANCE DEL MODELO
# ==========================================
print("\nüìä Calculando Feature Importance del modelo...")

fi_gain = pd.DataFrame({
    'feature': X_train.columns,
    'importance_gain': model.feature_importances_
}).sort_values('importance_gain', ascending=False)

# Normalizar a porcentaje
fi_gain['pct_importance'] = fi_gain['importance_gain'] / fi_gain['importance_gain'].sum() * 100

print("\nüèÜ TOP 30 Features por Importancia (Gain):")
print(fi_gain.head(30).to_string(index=False))

# ==========================================
# 4. CORRELACI√ìN CON TARGET
# ==========================================
print("\nüìä Calculando correlaci√≥n con TARGET...")

correlations = []
for col in X.columns:
    corr = X[col].corr(y)
    correlations.append({
        'feature': col,
        'correlation': corr,
        'abs_correlation': abs(corr)
    })

corr_df = pd.DataFrame(correlations).sort_values('abs_correlation', ascending=False)

print("\nüèÜ TOP 30 Features por Correlaci√≥n con TARGET:")
print(corr_df.head(30).to_string(index=False))

# ==========================================
# 5. PERMUTATION IMPORTANCE (Impacto real en AUC)
# ==========================================
print("\nüìä Calculando Permutation Importance (esto tarda unos minutos)...")
print("   Esto mide cu√°nto BAJA el AUC cuando se permuta cada feature...")

# Usar subset para velocidad
sample_idx = np.random.choice(len(X_test), min(10000, len(X_test)), replace=False)
X_test_sample = X_test.iloc[sample_idx]
y_test_sample = y_test.iloc[sample_idx]

perm_importance = permutation_importance(
    model, X_test_sample, y_test_sample,
    n_repeats=5,
    random_state=42,
    scoring='roc_auc',
    n_jobs=-1
)

perm_df = pd.DataFrame({
    'feature': X_train.columns,
    'perm_importance_mean': perm_importance.importances_mean,
    'perm_importance_std': perm_importance.importances_std
}).sort_values('perm_importance_mean', ascending=False)

# Convertir a "p√©rdida de AUC"
perm_df['auc_drop'] = perm_df['perm_importance_mean']
perm_df['auc_drop_pct'] = perm_df['auc_drop'] / baseline_auc * 100

print("\nüèÜ TOP 30 Features por Permutation Importance:")
print("   (Cu√°nto BAJA el AUC si se elimina la informaci√≥n de esa feature)")
print(perm_df[['feature', 'auc_drop', 'auc_drop_pct']].head(30).to_string(index=False))

# ==========================================
# 6. DROP COLUMN IMPORTANCE (Top 50 features)
# ==========================================
print("\nüìä Calculando Drop Column Importance (Top 50 features)...")
print("   Esto entrena el modelo SIN cada feature y mide la ca√≠da en AUC...")

top_features = fi_gain.head(50)['feature'].tolist()
drop_importance = []

for i, feat in enumerate(top_features):
    print(f"   Procesando {i+1}/50: {feat}...", end='\r')

    # Entrenar sin esta feature
    X_train_drop = X_train.drop(columns=[feat])
    X_test_drop = X_test.drop(columns=[feat])

    model_drop = XGBClassifier(
        n_estimators=500,
        max_depth=5,
        learning_rate=0.03,
        min_child_weight=50,
        subsample=0.8,
        colsample_bytree=0.5,
        scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
        device='cuda',
        tree_method='hist',
        random_state=42,
        verbosity=0
    )
    model_drop.fit(X_train_drop, y_train)

    auc_without = roc_auc_score(y_test, model_drop.predict_proba(X_test_drop)[:, 1])
    auc_drop = baseline_auc - auc_without

    drop_importance.append({
        'feature': feat,
        'auc_with': baseline_auc,
        'auc_without': auc_without,
        'auc_drop': auc_drop,
        'auc_drop_pct': auc_drop / baseline_auc * 100
    })

print("\n")
drop_df = pd.DataFrame(drop_importance).sort_values('auc_drop', ascending=False)

print("\nüèÜ TOP 30 Features por Drop Column Importance:")
print("   (Cu√°nto BAJA el AUC si se ELIMINA completamente esa feature)")
print(drop_df.head(30).to_string(index=False))

# ==========================================
# 7. AN√ÅLISIS COMBINADO
# ==========================================
print("\nüìä Creando an√°lisis combinado...")

# Merge all importance metrics
combined = fi_gain[['feature', 'importance_gain', 'pct_importance']].copy()
combined = combined.merge(corr_df[['feature', 'correlation', 'abs_correlation']], on='feature', how='left')
combined = combined.merge(perm_df[['feature', 'auc_drop', 'auc_drop_pct']], on='feature', how='left')
combined = combined.merge(drop_df[['feature', 'auc_drop']].rename(columns={'auc_drop': 'drop_auc_impact'}),
                          on='feature', how='left')

# Calcular score combinado
combined['combined_score'] = (
    combined['pct_importance'].fillna(0) * 0.3 +
    combined['abs_correlation'].fillna(0) * 100 * 0.2 +
    combined['auc_drop_pct'].fillna(0) * 0.3 +
    combined['drop_auc_impact'].fillna(0) * 1000 * 0.2
)

combined = combined.sort_values('combined_score', ascending=False)

print("\n" + "="*80)
print("üèÜ RANKING FINAL DE FEATURES (Score Combinado)")
print("="*80)
print(combined.head(40).to_string(index=False))

# ==========================================
# 8. AN√ÅLISIS POR CATEGOR√çA
# ==========================================
print("\nüìä An√°lisis por categor√≠a de features...")

def get_category(feature_name):
    if 'EXT' in feature_name.upper():
        return 'EXT_SOURCE'
    elif 'BUREAU' in feature_name.upper():
        return 'BUREAU'
    elif 'PREV' in feature_name.upper():
        return 'PREVIOUS'
    elif 'INST' in feature_name.upper():
        return 'INSTALLMENTS'
    elif 'POS' in feature_name.upper():
        return 'POS_CASH'
    elif 'CC_' in feature_name.upper():
        return 'CREDIT_CARD'
    elif 'AMT_' in feature_name.upper():
        return 'FINANCIAL'
    elif 'DAYS_' in feature_name.upper():
        return 'TEMPORAL'
    elif 'RISK' in feature_name.upper():
        return 'RISK_SCORE'
    elif any(x in feature_name.upper() for x in ['AGE', 'EMPLOY', 'WORK']):
        return 'DEMOGRAPHIC'
    else:
        return 'OTHER'

combined['category'] = combined['feature'].apply(get_category)

category_summary = combined.groupby('category').agg({
    'feature': 'count',
    'pct_importance': 'sum',
    'auc_drop_pct': 'sum',
    'combined_score': 'sum'
}).rename(columns={'feature': 'num_features'})

category_summary = category_summary.sort_values('pct_importance', ascending=False)

print("\n" + "="*60)
print("üìä IMPORTANCIA POR CATEGOR√çA")
print("="*60)
print(category_summary.to_string())

# ==========================================
# 9. VISUALIZACIONES
# ==========================================
print("\nüìä Generando visualizaciones...")

fig, axes = plt.subplots(2, 3, figsize=(20, 14))

# 9.1 Top 20 por Gain Importance
top20_gain = fi_gain.head(20)
axes[0, 0].barh(range(len(top20_gain)), top20_gain['pct_importance'], color='steelblue')
axes[0, 0].set_yticks(range(len(top20_gain)))
axes[0, 0].set_yticklabels(top20_gain['feature'], fontsize=8)
axes[0, 0].invert_yaxis()
axes[0, 0].set_xlabel('% Importancia')
axes[0, 0].set_title('Top 20 Features - Gain Importance', fontweight='bold')

# 9.2 Top 20 por Correlaci√≥n
top20_corr = corr_df.head(20)
colors = ['green' if c < 0 else 'red' for c in top20_corr['correlation']]
axes[0, 1].barh(range(len(top20_corr)), top20_corr['correlation'], color=colors)
axes[0, 1].set_yticks(range(len(top20_corr)))
axes[0, 1].set_yticklabels(top20_corr['feature'], fontsize=8)
axes[0, 1].invert_yaxis()
axes[0, 1].set_xlabel('Correlaci√≥n con TARGET')
axes[0, 1].set_title('Top 20 Features - Correlaci√≥n', fontweight='bold')
axes[0, 1].axvline(x=0, color='black', linestyle='-', linewidth=0.5)

# 9.3 Top 20 por Permutation Importance
top20_perm = perm_df.head(20)
axes[0, 2].barh(range(len(top20_perm)), top20_perm['auc_drop_pct'], color='coral')
axes[0, 2].set_yticks(range(len(top20_perm)))
axes[0, 2].set_yticklabels(top20_perm['feature'], fontsize=8)
axes[0, 2].invert_yaxis()
axes[0, 2].set_xlabel('% Ca√≠da de AUC al permutar')
axes[0, 2].set_title('Top 20 Features - Permutation Importance', fontweight='bold')

# 9.4 Top 20 por Drop Column
top20_drop = drop_df.head(20)
axes[1, 0].barh(range(len(top20_drop)), top20_drop['auc_drop'] * 1000, color='purple')
axes[1, 0].set_yticks(range(len(top20_drop)))
axes[1, 0].set_yticklabels(top20_drop['feature'], fontsize=8)
axes[1, 0].invert_yaxis()
axes[1, 0].set_xlabel('Ca√≠da de AUC x1000')
axes[1, 0].set_title('Top 20 Features - Drop Column Impact', fontweight='bold')

# 9.5 Importancia por Categor√≠a
cat_colors = plt.cm.Set3(range(len(category_summary)))
axes[1, 1].pie(category_summary['pct_importance'], labels=category_summary.index,
               autopct='%1.1f%%', colors=cat_colors)
axes[1, 1].set_title('Distribuci√≥n de Importancia por Categor√≠a', fontweight='bold')

# 9.6 Top 20 Combined Score
top20_combined = combined.head(20)
axes[1, 2].barh(range(len(top20_combined)), top20_combined['combined_score'], color='teal')
axes[1, 2].set_yticks(range(len(top20_combined)))
axes[1, 2].set_yticklabels(top20_combined['feature'], fontsize=8)
axes[1, 2].invert_yaxis()
axes[1, 2].set_xlabel('Score Combinado')
axes[1, 2].set_title('Top 20 Features - Score Combinado', fontweight='bold')

plt.tight_layout()
plt.savefig('../reports/figures/feature_importance_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úÖ Visualizaciones guardadas en '../reports/figures/feature_importance_analysis.png'")

# ==========================================
# 10. RECOMENDACIONES
# ==========================================
print("\n" + "="*80)
print("üí° RECOMENDACIONES PARA MEJORAR EL AUC")
print("="*80)

# Top features por cada categor√≠a
print("\nüéØ TOP 5 FEATURES M√ÅS IMPORTANTES:")
for i, row in combined.head(5).iterrows():
    print(f"   {row['feature']}: {row['pct_importance']:.2f}% importancia, "
          f"corr={row['correlation']:.4f}, drop_impact={row.get('drop_auc_impact', 0):.4f}")

# Features con alta correlaci√≥n pero baja importancia (oportunidad)
high_corr_low_imp = combined[
    (combined['abs_correlation'] > 0.05) &
    (combined['pct_importance'] < 0.5)
].head(10)

print("\nüîç OPORTUNIDADES (alta correlaci√≥n, baja importancia actual):")
print("   Estas features tienen informaci√≥n √∫til pero el modelo no las usa bien:")
for _, row in high_corr_low_imp.iterrows():
    print(f"   - {row['feature']}: corr={row['correlation']:.4f}, imp={row['pct_importance']:.2f}%")

# Categor√≠as subexplotadas
print("\nüìä CATEGOR√çAS CON POTENCIAL DE MEJORA:")
for cat, data in category_summary.iterrows():
    if data['pct_importance'] < 10 and data['num_features'] > 5:
        print(f"   - {cat}: {data['num_features']} features pero solo {data['pct_importance']:.1f}% de importancia")

# Guardar an√°lisis
combined.to_csv('../reports/feature_importance_analysis.csv', index=False)
category_summary.to_csv('../reports/category_importance.csv')

print("\nüíæ An√°lisis guardado en '../reports/'")

# ==========================================
# 11. RESUMEN EJECUTIVO
# ==========================================
print("\n" + "="*80)
print("üìã RESUMEN EJECUTIVO")
print("="*80)

print(f"""
üìä DISTRIBUCI√ìN DE IMPORTANCIA:

   EXT_SOURCE features: {category_summary.loc['EXT_SOURCE', 'pct_importance']:.1f}% del total
   ‚îú‚îÄ‚îÄ Estas son las M√ÅS IMPORTANTES
   ‚îú‚îÄ‚îÄ EXT_SOURCE_2 tiene la mayor correlaci√≥n negativa con default
   ‚îî‚îÄ‚îÄ Mejorar las combinaciones de EXT_SOURCE podr√≠a dar +0.01 AUC

   BUREAU features: {category_summary.loc['BUREAU', 'pct_importance'] if 'BUREAU' in category_summary.index else 0:.1f}%
   ‚îú‚îÄ‚îÄ Historial crediticio en otros bancos
   ‚îî‚îÄ‚îÄ bureau_dpd_count, bureau_credit_active son clave

   INSTALLMENTS features: {category_summary.loc['INSTALLMENTS', 'pct_importance'] if 'INSTALLMENTS' in category_summary.index else 0:.1f}%
   ‚îú‚îÄ‚îÄ Comportamiento de pago de cuotas
   ‚îî‚îÄ‚îÄ inst_late_ratio es muy predictiva

üéØ PARA SUBIR EL AUC, ENF√ìCATE EN:

   1. EXT_SOURCE: Ya las estamos usando bien, pero crear m√°s interacciones
   2. BUREAU: A√±adir m√°s features de bureau_balance a nivel mensual
   3. INSTALLMENTS: Crear features de tendencia m√°s sofisticadas
   4. RISK_SCORE: El score compuesto de riesgo es muy √∫til

üìà ESTIMACI√ìN DE POTENCIAL:
   - Features actuales bien optimizadas: ~0.79 AUC (donde estamos)
   - Con features adicionales de bureau_balance: +0.005-0.01
   - Con mejor feature engineering de EXT_SOURCE: +0.002-0.005
   - Techo estimado con estos datos: ~0.80-0.81 AUC
""")

üìä AN√ÅLISIS DE IMPORTANCIA DE FEATURES PARA AUC

üìÇ Cargando datos...
Features: 215
Samples: 307511

üîß Entrenando modelo base...
AUC Baseline: 0.7850

üìä Calculando Feature Importance del modelo...

üèÜ TOP 30 Features por Importancia (Gain):
                      feature  importance_gain  pct_importance
                 EXT_SOURCE_2         0.039387        3.938742
                 EXT_SOURCE_3         0.031698        3.169776
               FLAG_EMP_PHONE         0.019440        1.944029
       inst_last_3_late_ratio         0.018097        1.809686
deterioration_composite_score         0.016123        1.612298
          NAME_EDUCATION_TYPE         0.015464        1.546442
                  CODE_GENDER         0.014937        1.493745
  prev_weighted_refused_ratio         0.013994        1.399392
               cc_util_recent         0.013962        1.396237
                 EXT_SOURCE_1         0.013959        1.395889
         pos_recent_count_dpd         0.013674       

Exception in thread ExecutorManagerThread:
Traceback (most recent call last):
  File "C:\Users\usuario\Desktop\4geeks\Final_Proyect_Credit_Default_Risk-main\.venv\Lib\site-packages\psutil\_pswindows.py", line 692, in wrapper
    return fun(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\usuario\Desktop\4geeks\Final_Proyect_Credit_Default_Risk-main\.venv\Lib\site-packages\psutil\_pswindows.py", line 870, in kill
    return cext.proc_kill(self.pid)
           ^^^^^^^^^^^^^^^^^^^^^^^^
PermissionError: [WinError 5] Acceso denegado: '(originated from OpenProcess)'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\usuario\anaconda3\Lib\threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "C:\Users\usuario\Desktop\4geeks\Final_Proyect_Credit_Default_Risk-main\.venv\Lib\site-packages\joblib\externals\loky\process_executor.py", line 635, in run
    self.flag_executor_shutting_do

XGBoostError: parallel_for: failed to synchronize: cudaErrorLaunchFailure: unspecified launch failure