In [None]:
# ==========================================
# XGBOOST + OPTUNA + FEATURE SELECTION
# ==========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.metrics import (roc_auc_score, classification_report, confusion_matrix,
                             precision_recall_curve, f1_score, average_precision_score)
from xgboost import XGBClassifier
import optuna
from optuna.samplers import TPESampler
import warnings

# Quiet console: only WARNING-and-above records from Optuna, and no Python
# warnings at all (note that this blanket filter also hides deprecation notices).
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore')

# Script banner framed by two 60-character rules.
for banner_line in ("=" * 60, "üöÄ XGBOOST + OPTUNA + FEATURE SELECTION", "=" * 60):
    print(banner_line)

# ==========================================
# 1. DATA LOADING
# ==========================================
print("\nüìÇ Cargando datos...")
df = pd.read_parquet("../data/interim/train_final_advanced_features.parquet")

# Keep only the first occurrence of each repeated column name, then discard
# every column whose name ends in '_x' or '_y' (presumably leftover pandas
# merge suffixes -- confirm against the upstream feature pipeline).
first_occurrence = ~df.columns.duplicated()
df = df.loc[:, first_occurrence]
cols_to_drop = [name for name in df.columns if name.endswith(('_x', '_y'))]
df = df.drop(columns=cols_to_drop, errors='ignore')

print(f"Dataset: {df.shape}")

# ==========================================
# 2. FEATURE PREPARATION
# ==========================================
# The label is TARGET; every other column except the SK_ID_CURR identifier
# becomes a model input.
y = df['TARGET']
X = df.drop(columns=['TARGET', 'SK_ID_CURR'], errors='ignore')

# Encoding: object/category columns are cast to str and replaced by the
# integer codes of a LabelEncoder fitted on that column alone.
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)
for col in cat_cols:
    as_text = X[col].astype(str)
    X[col] = LabelEncoder().fit_transform(as_text)

# ==========================================
# 3. FEATURE ENGINEERING
# ==========================================
print("\nüîß Feature Engineering...")


def _column_or_constant(frame, name, constant):
    """Return ``frame[name]``, or a constant Series on ``frame.index`` if absent."""
    if name in frame.columns:
        return frame[name]
    # The fallback must live on the frame's own index: a default RangeIndex
    # would misalign with any other index and turn every derived value into
    # NaN (or silently skew row statistics) once assigned back to the frame.
    return pd.Series(constant, index=frame.index)


def complete_features(data):
    """Return a copy of ``data`` extended with engineered features.

    Added columns:
      * ``EXT_*``: row statistics, products, a weighted blend, and harmonic /
        geometric means of EXT_SOURCE_1..3.
      * ``age``, ``age_sq``: derived from DAYS_BIRTH (only if present).
      * ``emp_years``, ``is_unemployed``: derived from DAYS_EMPLOYED (only if
        present); the value 365243 is treated as a sentinel, giving NaN years
        and ``is_unemployed == 1``.
      * ``emp_ratio``: employment years over age (only if both exist).
      * ``cr_inc``, ``an_inc``, ``cr_an``: clipped credit/income/annuity ratios.
      * ``EXT2_age`` (only if ``age`` exists) and ``EXT2_d_crInc``.

    Missing or zero EXT_SOURCE values are replaced by 0.5, and missing or zero
    amounts by 1, before any ratio is taken. If one of those source columns is
    absent altogether, a constant column with the same neutral value is used.

    Args:
        data: pandas DataFrame with any index; it is not modified.

    Returns:
        A new DataFrame with the same rows and index as ``data`` plus the
        columns listed above.
    """
    feats = data.copy()

    # External scores: NaN and exact zeros both collapse to the neutral 0.5.
    ext1 = _column_or_constant(feats, 'EXT_SOURCE_1', 0.5).fillna(0.5).replace(0, 0.5)
    ext2 = _column_or_constant(feats, 'EXT_SOURCE_2', 0.5).fillna(0.5).replace(0, 0.5)
    ext3 = _column_or_constant(feats, 'EXT_SOURCE_3', 0.5).fillna(0.5).replace(0, 0.5)

    ext_df = pd.concat([ext1, ext2, ext3], axis=1)
    feats['EXT_mean'] = ext_df.mean(axis=1)
    feats['EXT_std'] = ext_df.std(axis=1)
    feats['EXT_min'] = ext_df.min(axis=1)
    feats['EXT_max'] = ext_df.max(axis=1)
    feats['EXT_sum'] = ext1 + ext2 + ext3
    feats['EXT_prod'] = ext1 * ext2 * ext3

    # Weighted blend (0.1 / 0.6 / 0.3) and pairwise interactions.
    feats['EXT_w_01_06_03'] = ext1 * 0.1 + ext2 * 0.6 + ext3 * 0.3
    feats['EXT_1x2'] = ext1 * ext2
    feats['EXT_2x3'] = ext2 * ext3
    feats['EXT_1x3'] = ext1 * ext3

    feats['EXT_2_pow2'] = ext2 ** 2
    feats['EXT_2_log'] = np.log1p(ext2)
    # The +0.01 offsets keep the reciprocals finite for scores near zero.
    feats['EXT_harmonic'] = 3 / (1 / (ext1 + 0.01) + 1 / (ext2 + 0.01) + 1 / (ext3 + 0.01))
    feats['EXT_geometric'] = (ext1 * ext2 * ext3) ** (1 / 3)

    if 'DAYS_BIRTH' in feats.columns:
        # DAYS_BIRTH is negated before converting to years.
        feats['age'] = -feats['DAYS_BIRTH'] / 365.25
        feats['age_sq'] = feats['age'] ** 2

    if 'DAYS_EMPLOYED' in feats.columns:
        # 365243 is handled as a sentinel rather than a real day count.
        days_emp = feats['DAYS_EMPLOYED'].replace(365243, np.nan)
        feats['emp_years'] = (-days_emp / 365.25).clip(lower=0)
        feats['is_unemployed'] = (feats['DAYS_EMPLOYED'] == 365243).astype(int)

    if 'age' in feats.columns and 'emp_years' in feats.columns:
        feats['emp_ratio'] = feats['emp_years'].fillna(0) / (feats['age'] + 0.01)

    # Amounts: zeros and NaN become 1 so that the ratios below stay finite.
    income = _column_or_constant(feats, 'AMT_INCOME_TOTAL', 1).replace(0, np.nan).fillna(1)
    credit = _column_or_constant(feats, 'AMT_CREDIT', 1).replace(0, np.nan).fillna(1)
    annuity = _column_or_constant(feats, 'AMT_ANNUITY', 1).replace(0, np.nan).fillna(1)

    feats['cr_inc'] = (credit / income).clip(upper=50)
    feats['an_inc'] = (annuity / income).clip(upper=5)
    feats['cr_an'] = (credit / annuity).clip(upper=100)

    if 'age' in feats.columns:
        feats['EXT2_age'] = ext2 * feats['age']

    feats['EXT2_d_crInc'] = ext2 / (feats['cr_inc'] + 0.01)

    return feats


X = complete_features(X)

# Drop repeated column names (first occurrence wins) and turn +/-inf into NaN
# so that the imputation below treats them like any other gap.
X = X.loc[:, ~X.columns.duplicated()].replace([np.inf, -np.inf], np.nan)

# Median-impute every column that still contains NaN; a column whose median is
# itself undefined is filled with 0 instead.
for col in X.columns[X.isna().any().to_numpy()]:
    median_val = X[col].median()
    X[col] = X[col].fillna(0 if pd.isna(median_val) else median_val)

print(f"Features despu√©s de engineering: {X.shape[1]}")

# ==========================================
# 3.5 FEATURE SELECTION
# ==========================================
print("\nüéØ Feature Selection...")

selector_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    device='cuda',
    tree_method='hist',
    random_state=42,
    verbosity=0
)

# Rank features on TRAINING rows only, so the hold-out set never influences
# which features are kept. The training partition is re-derived here with the
# same test_size / random_state / stratify arguments as the train/test split
# performed after selection; a stratified split with a fixed seed depends only
# on the labels, so the same rows are held out in both places.
# NOTE: keep these arguments in sync with that later split.
y_fit = train_test_split(y, test_size=0.2, random_state=42, stratify=y)[0]

# Balanced subsample for a quick selection fit: take 30% of the training rows,
# then undersample class 0 down to the number of class-1 rows.
sample_idx = y_fit.sample(frac=0.3, random_state=42).index
X_sample = X.loc[sample_idx]
y_sample = y.loc[sample_idx]

sample_df = pd.concat([X_sample, y_sample], axis=1)
c0 = sample_df[sample_df['TARGET'] == 0]
c1 = sample_df[sample_df['TARGET'] == 1]
c0_sub = resample(c0, replace=False, n_samples=len(c1), random_state=42)
sample_balanced = pd.concat([c0_sub, c1])

X_sel = sample_balanced.drop('TARGET', axis=1)
y_sel = sample_balanced['TARGET']

selector_model.fit(X_sel, y_sel)

# Importances are paired with the columns the selector was actually fitted on.
fi_selector = pd.DataFrame({
    'feature': X_sel.columns,
    'importance': selector_model.feature_importances_
}).sort_values('importance', ascending=False)

# Tunable: how many top-ranked features to keep (e.g. 50, 100, 150).
N_FEATURES = 100

top_features = fi_selector.head(N_FEATURES)['feature'].tolist()

print(f"Features originales: {X.shape[1]}")
# Report the number actually kept, which is smaller than N_FEATURES when
# fewer columns are available.
print(f"Features seleccionadas: {len(top_features)}")
print(f"\nTop 10 features:")
for _, row in fi_selector.head(10).iterrows():
    print(f"  {row['feature']:<40} {row['importance']:.6f}")

# Apply the selection
X = X[top_features]

# ==========================================
# 4. SPLIT (BEFORE BALANCING)
# ==========================================
print("\nüìä Split estratificado...")

# Stratified 80/20 hold-out; the test part keeps the original class ratio.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print(f"Train: {X_train_raw.shape}, Test: {X_test.shape}")

# ==========================================
# 5. BALANCING: 1:1 UNDERSAMPLING
# ==========================================
print("\n‚öñÔ∏è Balanceando...")

# Only the training partition is rebalanced: class 0 is sampled without
# replacement down to the size of class 1, then the rows are shuffled.
train_df = pd.concat([X_train_raw, y_train_raw], axis=1)
target_col = train_df['TARGET']
class_0 = train_df[target_col == 0]
class_1 = train_df[target_col == 1]

class_0_sub = resample(class_0, n_samples=len(class_1), replace=False, random_state=42)
train_balanced = pd.concat([class_0_sub, class_1]).sample(frac=1, random_state=42)

y_train_full = train_balanced['TARGET']
X_train_full = train_balanced.drop(columns='TARGET')

# Stratified 80/20 split of the balanced data: fit set and validation set used
# during the hyper-parameter search.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full, test_size=0.2, random_state=42
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}")

# ==========================================
# 6. OPTUNA
# ==========================================
print("\nüîç Optimizando con Optuna...")


def objective(trial):
    """Optuna objective: fit one XGBoost candidate and score it on validation.

    The model is trained on the module-level ``X_train`` / ``y_train`` with
    ``(X_val, y_val)`` as the evaluation set (metric ``auc``, early stopping
    after 150 rounds). The ROC-AUC on the module-level ``X_test`` / ``y_test``
    is stored on the trial as the user attribute ``auc_test``; it is recorded
    only and is not part of the value being optimised.

    Args:
        trial: the Optuna trial used to draw hyper-parameters.

    Returns:
        ROC-AUC of the fitted model on the validation set.
    """
    # The suggest_* calls are kept in this exact order so that a seeded sampler
    # draws the same sequence of values.
    tuned = {
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 20, 150),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.6),
        'gamma': trial.suggest_float('gamma', 0.5, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 5),
    }
    # Settings shared by every trial.
    fixed = {
        'n_estimators': 3000,
        'device': 'cuda',
        'tree_method': 'hist',
        'random_state': 42,
        'eval_metric': 'auc',
        'early_stopping_rounds': 150,
        'verbosity': 0,
    }

    candidate = XGBClassifier(**fixed, **tuned)
    candidate.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

    # Column 1 of predict_proba is the probability of the positive class.
    val_scores = candidate.predict_proba(X_val)[:, 1]
    test_scores = candidate.predict_proba(X_test)[:, 1]

    trial.set_user_attr('auc_test', roc_auc_score(y_test, test_scores))

    return roc_auc_score(y_val, val_scores)


# Seeded TPE search over 200 trials, maximising the validation AUC returned by
# objective().
tpe_sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(objective, n_trials=200, show_progress_bar=True)

# Validation AUC of the best trial, and the test AUC recorded for that same trial.
best_trial = study.best_trial
print(f"\n‚úÖ Mejor AUC Val: {best_trial.value:.4f}")
print(f"‚úÖ Mejor AUC Test: {best_trial.user_attrs['auc_test']:.4f}")

# ==========================================
# 7. FINAL MODEL
# ==========================================
print("\nüéØ Entrenando modelo final...")

# Best hyper-parameters from the study plus the fixed settings, with a larger
# tree budget (5000) and longer early-stopping patience (300) than in the search.
best_params = {
    **study.best_params,
    'n_estimators': 5000,
    'device': 'cuda',
    'tree_method': 'hist',
    'random_state': 42,
    'eval_metric': 'auc',
    'early_stopping_rounds': 300,
    'verbosity': 0,
}

# Fresh stratified 85/15 split of the balanced training data; the 15% part is
# the evaluation set for early stopping.
X_tr, X_vl, y_tr, y_vl = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full, test_size=0.15, random_state=42
)

final_model = XGBClassifier(**best_params)
final_model.fit(X_tr, y_tr, eval_set=[(X_vl, y_vl)], verbose=True)

# Positive-class probabilities on the untouched test set.
test_proba = final_model.predict_proba(X_test)
pred_test = test_proba[:, 1]

# ==========================================
# 8. FINAL METRICS
# ==========================================
rule = "=" * 60
print("\n" + rule)
print("üìä RESULTADOS FINALES")
print(rule)

# Threshold-free ranking metrics on the test set.
auc = roc_auc_score(y_test, pred_test)
ap = average_precision_score(y_test, pred_test)

print(f"\nüìà ROC-AUC: {auc:.4f}")
print(f"üìà Average Precision: {ap:.4f}")

# Optimal threshold: the point of the precision-recall curve with the highest
# F1 (the 1e-10 term guards against 0/0).
# NOTE(review): the threshold is chosen on the same test data it is then
# evaluated on, so the figures below are optimistic.
prec, rec, thresh = precision_recall_curve(y_test, pred_test)
f1_scores = 2 * prec * rec / (prec + rec + 1e-10)
best_idx = np.argmax(f1_scores)
opt_thresh = thresh[best_idx]

y_pred = (pred_test >= opt_thresh).astype(int)
cm = confusion_matrix(y_test, y_pred)

# Confusion-matrix cells: rows are true classes, columns are predicted classes.
false_pos, false_neg, true_pos = cm[0, 1], cm[1, 0], cm[1, 1]
actual_pos = false_neg + true_pos

print(f"\nüìä Con threshold {opt_thresh:.3f}:")
print(f"  Recall: {true_pos / actual_pos:.2%}")
print(f"  Precision: {true_pos / (false_pos + true_pos):.2%}")
print(f"  Defaults detectados: {true_pos:,} / {actual_pos:,}")

print("\n" + rule)

🚀 XGBOOST + OPTUNA + FEATURE SELECTION

📂 Cargando datos...
Dataset: (307511, 218)

🔧 Feature Engineering...
