# Setup e Carregamento

In [1]:
import warnings
from pathlib import Path

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
from catboost.utils import get_gpu_device_count
from optuna.visualization import plot_optimization_history, plot_param_importances
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
warnings.filterwarnings('ignore')

# Global configuration shared by every cell below
plt.style.use('seaborn-v0_8-darkgrid')
RANDOM_STATE = 42
N_FOLDS = 5

# Load the preprocessed train/test tables
train = pd.read_csv('../data/processed/train_processed.csv')
test = pd.read_csv('../data/processed/test_processed.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Split the target from the features; 'id' is dropped from both tables
y = train['y']
X = train.drop(['y', 'id'], axis=1, errors='ignore')
X_test = test.drop(['id'], axis=1, errors='ignore')

# Fail fast when train and test do not expose the same features, then put the
# test columns in training order. Without this, a test file whose columns are
# merely ordered differently could be scored column-by-position later on,
# which would not necessarily raise an error.
missing_in_test = X.columns.difference(X_test.columns)
extra_in_test = X_test.columns.difference(X.columns)
if len(missing_in_test) or len(extra_in_test):
    raise ValueError(
        "Train/test feature mismatch: "
        f"missing in test={list(missing_in_test)}, "
        f"extra in test={list(extra_in_test)}"
    )
X_test = X_test[X.columns]

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X_test shape: {X_test.shape}")

# Inspect the class imbalance (absolute counts, then proportions)
print(f"\nDistribui√ß√£o do target:")
print(y.value_counts())
print(y.value_counts(normalize=True))

# Negative/positive ratio, passed as scale_pos_weight to the boosted-tree models
scale_pos_weight = (y == 0).sum() / (y == 1).sum()
print(f"\nScale pos weight: {scale_pos_weight:.2f}")

Train shape: (750000, 69)
Test shape: (250000, 68)

X shape: (750000, 67)
y shape: (750000,)
X_test shape: (250000, 67)

Distribui√ß√£o do target:
y
0    659512
1     90488
Name: count, dtype: int64
y
0    0.879349
1    0.120651
Name: proportion, dtype: float64

Scale pos weight: 7.29


# Análises de Modelos

## Configuração da Validação Cruzada

In [2]:
# Stratified folds keep the class proportions of y in every split
skf = StratifiedKFold(
    n_splits=N_FOLDS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

def evaluate_model_cv(model, X, y, cv=skf):
    """
    Cross-validate a classifier and report its ROC AUC per fold.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        It is fitted in place once per fold.
    X : feature table indexed positionally through ``.iloc``.
    y : target aligned with ``X``, also indexed through ``.iloc``.
    cv : splitter exposing ``split(X, y)``; defaults to the module-level ``skf``.

    Returns
    -------
    (mean_auc, std_auc, scores) : mean and standard deviation (``np.std``)
        of the fold AUCs, followed by the list of per-fold AUCs.
    """
    fold_aucs = []
    fold_no = 0

    for fit_rows, holdout_rows in cv.split(X, y):
        fold_no += 1

        # Fit on the training part of the fold
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        # Positive-class probability on the held-out part
        holdout_proba = model.predict_proba(X.iloc[holdout_rows])[:, 1]

        fold_auc = roc_auc_score(y.iloc[holdout_rows], holdout_proba)
        fold_aucs.append(fold_auc)

        print(f"Fold {fold_no}: AUC = {fold_auc:.5f}")

    mean_auc, std_auc = np.mean(fold_aucs), np.std(fold_aucs)
    rule = '=' * 50

    print(f"\n{rule}")
    print(f"Mean AUC: {mean_auc:.5f} (+/- {std_auc:.5f})")
    print(f"{rule}\n")

    return mean_auc, std_auc, fold_aucs

print("‚úÖ Valida√ß√£o cruzada configurada")


‚úÖ Valida√ß√£o cruzada configurada


## Baseline - Logistic Regression

In [3]:
print(f"\n{'=' * 70}")
print("MODELO BASELINE: LOGISTIC REGRESSION")
print(f"{'=' * 70}\n")

# Baseline classifier; class_weight='balanced' compensates for the class imbalance
lr_model = LogisticRegression(
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
    random_state=RANDOM_STATE,
)

# Cross-validated ROC AUC of the baseline
lr_mean_auc, lr_std_auc, lr_scores = evaluate_model_cv(lr_model, X, y)

# Summary record consumed by the model-comparison cell
baseline_results = dict(
    model='Logistic Regression',
    mean_auc=lr_mean_auc,
    std_auc=lr_std_auc,
    scores=lr_scores,
)



MODELO BASELINE: LOGISTIC REGRESSION

Fold 1: AUC = 0.93547
Fold 2: AUC = 0.93557
Fold 3: AUC = 0.93407
Fold 4: AUC = 0.93658


KeyboardInterrupt: 

## LightGBM - Modelo Principal

In [None]:
print(f"\n{'=' * 70}")
print("MODELO PRINCIPAL: LIGHTGBM")
print(f"{'=' * 70}\n")

# Starting (untuned) LightGBM configuration
lgb_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    max_depth=-1,
    min_child_samples=20,
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_STATE,
    verbose=-1,
)

# Manual CV loop: keeps one booster per fold for later importance/prediction use
lgb_scores, lgb_models = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Training Fold {fold}/{N_FOLDS}")

    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

    # LightGBM datasets; the validation set is built against the training set
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Up to 1000 rounds, early stopping with patience 50, a log line every 100 rounds
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=100),
        ],
    )

    # Hold-out AUC using the best iteration found by early stopping
    y_pred_proba = model.predict(X_val_fold, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val_fold, y_pred_proba)

    lgb_scores.append(auc)
    lgb_models.append(model)

    print(f"Fold {fold} AUC: {auc:.5f}\n")

lgb_mean_auc, lgb_std_auc = np.mean(lgb_scores), np.std(lgb_scores)

print(f"\n{'=' * 50}")
print(f"LightGBM Mean AUC: {lgb_mean_auc:.5f} (+/- {lgb_std_auc:.5f})")
print(f"{'=' * 50}\n")

# Summary record consumed by the model-comparison cell
lgb_results = dict(
    model='LightGBM',
    mean_auc=lgb_mean_auc,
    std_auc=lgb_std_auc,
    scores=lgb_scores,
    models=lgb_models,
)


## Feature Importance

In [None]:
# Gain importance of every feature, one array per fold booster
importances = []
for model in lgb_models:
    importances.append(model.feature_importance(importance_type='gain'))

# Average across folds and rank from most to least important
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': np.mean(importances, axis=0),
}).sort_values('importance', ascending=False)

# Horizontal bar chart of the top 20 (reversed so the largest bar is on top)
top20_reversed = feature_importance.head(20).iloc[::-1]
plt.figure(figsize=(10, 8))
plt.barh(top20_reversed['feature'], top20_reversed['importance'])
plt.xlabel('Importance (Gain)', fontweight='bold')
plt.title('Top 20 Features - LightGBM', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nTop 20 Features Mais Importantes:")
print(feature_importance.head(20))

# Persist the full ranking
feature_importance.to_csv('../data/processed/feature_importance.csv', index=False)


## XGBoost

In [6]:
print("\n" + "="*70)
print("MODELO: XGBOOST")
print("="*70 + "\n")

# Untuned XGBoost configuration (histogram tree method, AUC as evaluation metric)
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'random_state': RANDOM_STATE,
    'tree_method': 'hist'
}

# One AUC and one booster per fold
xgb_scores = []
xgb_models = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Training Fold {fold}/{N_FOLDS}")

    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # XGBoost's native training API consumes DMatrix objects
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)

    # Up to 1000 rounds; early stopping watches the last entry of `evals` ('valid')
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, 'train'), (dval, 'valid')],
        early_stopping_rounds=50,
        verbose_eval=100
    )

    # Score the hold-out fold with every round up to AND including the best one.
    # iteration_range is half-open [begin, end) and best_iteration is a 0-based
    # round index, so the end bound must be best_iteration + 1; using
    # best_iteration alone silently drops the best round.
    y_pred_proba = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    auc = roc_auc_score(y_val_fold, y_pred_proba)

    xgb_scores.append(auc)
    xgb_models.append(model)

    print(f"Fold {fold} AUC: {auc:.5f}\n")

xgb_mean_auc = np.mean(xgb_scores)
xgb_std_auc = np.std(xgb_scores)

print(f"\n{'='*50}")
print(f"XGBoost Mean AUC: {xgb_mean_auc:.5f} (+/- {xgb_std_auc:.5f})")
print(f"{'='*50}\n")

# Summary record consumed by the model-comparison cell
xgb_results = {
    'model': 'XGBoost',
    'mean_auc': xgb_mean_auc,
    'std_auc': xgb_std_auc,
    'scores': xgb_scores,
    'models': xgb_models
}



MODELO: XGBOOST

Training Fold 1/5
[0]	train-auc:0.94272	valid-auc:0.94247
[100]	train-auc:0.96068	valid-auc:0.96023
[200]	train-auc:0.96557	valid-auc:0.96421
[300]	train-auc:0.96798	valid-auc:0.96572
[400]	train-auc:0.96956	valid-auc:0.96645
[500]	train-auc:0.97075	valid-auc:0.96687
[600]	train-auc:0.97183	valid-auc:0.96723
[700]	train-auc:0.97274	valid-auc:0.96745
[800]	train-auc:0.97361	valid-auc:0.96763
[900]	train-auc:0.97445	valid-auc:0.96781
[999]	train-auc:0.97518	valid-auc:0.96792
Fold 1 AUC: 0.96792

Training Fold 2/5
[0]	train-auc:0.94194	valid-auc:0.94122
[100]	train-auc:0.96087	valid-auc:0.95890
[200]	train-auc:0.96579	valid-auc:0.96297
[300]	train-auc:0.96825	valid-auc:0.96457
[400]	train-auc:0.96980	valid-auc:0.96532
[500]	train-auc:0.97108	valid-auc:0.96582
[600]	train-auc:0.97210	valid-auc:0.96610
[700]	train-auc:0.97304	valid-auc:0.96634
[800]	train-auc:0.97392	valid-auc:0.96652
[900]	train-auc:0.97470	valid-auc:0.96663
[999]	train-auc:0.97543	valid-auc:0.96672
Fold 

KeyboardInterrupt: 

## CatBoost

In [5]:
import catboost

# Detect a usable GPU by asking CatBoost how many devices it can see.
# Merely constructing CatBoostClassifier(task_type='GPU') only stores the
# parameters (the device is first needed when fitting), so it cannot serve
# as an availability check.
try:
    gpu_count = get_gpu_device_count()
    gpu_problem = "nenhuma GPU detectada"
except Exception as e:
    gpu_count = 0
    gpu_problem = str(e)

use_gpu = gpu_count > 0
cat_device = 'GPU' if use_gpu else 'CPU'

if use_gpu:
    print("‚úì GPU dispon√≠vel e ser√° utilizada para treinamento")
else:
    print(f"‚ö† GPU n√£o dispon√≠vel ou erro: {gpu_problem}")
    print("O treinamento continuar√° em CPU")


print("\n" + "="*70)
print(f"MODELO: CATBOOST ({cat_device})")
print("="*70 + "\n")

# Untuned CatBoost configuration; the device follows the detection above so
# the announced CPU fallback really happens.
cat_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 3,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'task_type': cat_device,
    'random_seed': RANDOM_STATE,
    'verbose': 100,
    'early_stopping_rounds': 50,
    'scale_pos_weight': scale_pos_weight
}
if use_gpu:
    cat_params['devices'] = '0'      # first GPU; only meaningful for GPU training

# One AUC and one fitted model per fold
cat_scores = []
cat_models = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\nTraining Fold {fold}/{N_FOLDS}")

    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**cat_params)

    # Fit with the hold-out part as eval_set (used by early stopping)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        verbose=False
    )

    # Hold-out AUC from the positive-class probability
    y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
    auc = roc_auc_score(y_val_fold, y_pred_proba)

    cat_scores.append(auc)
    cat_models.append(model)

    print(f"Fold {fold} AUC: {auc:.5f}")

cat_mean_auc = np.mean(cat_scores)
cat_std_auc = np.std(cat_scores)

print(f"\n{'='*50}")
print(f"CatBoost ({cat_device}) Mean AUC: {cat_mean_auc:.5f} (+/- {cat_std_auc:.5f})")
print(f"{'='*50}\n")

# Summary record consumed by the model-comparison cell
cat_results = {
    'model': f'CatBoost ({cat_device})',
    'mean_auc': cat_mean_auc,
    'std_auc': cat_std_auc,
    'scores': cat_scores,
    'models': cat_models
}


‚úì GPU dispon√≠vel e ser√° utilizada para treinamento

MODELO: CATBOOST (GPU)


Training Fold 1/5


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 1 AUC: 0.96596

Training Fold 2/5


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 2 AUC: 0.96486

Training Fold 3/5


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 3 AUC: 0.96489

Training Fold 4/5


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 4 AUC: 0.96574

Training Fold 5/5


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 5 AUC: 0.96525

CatBoost (GPU) Mean AUC: 0.96534 (+/- 0.00045)



## Comparação de Modelos

In [None]:
# One row per model, keeping only the columns needed for the comparison
comparison_columns = ['model', 'mean_auc', 'std_auc']
results_comparison = pd.DataFrame(
    [baseline_results, lgb_results, xgb_results, cat_results]
)[comparison_columns]

print(f"\n{'=' * 70}")
print("COMPARA√á√ÉO DE MODELOS")
print('=' * 70)
print(results_comparison.to_string(index=False))
print(f"{'=' * 70}\n")

# Horizontal bars of the mean AUC with the fold standard deviation as error bar
plt.figure(figsize=(10, 6))
plt.barh(
    results_comparison['model'],
    results_comparison['mean_auc'],
    xerr=results_comparison['std_auc'],
    capsize=5,
    alpha=0.7,
)
plt.xlabel('Mean AUC-ROC', fontweight='bold')
plt.title('Compara√ß√£o de Performance dos Modelos', fontsize=14, fontweight='bold')
plt.xlim(0.85, results_comparison['mean_auc'].max() + 0.02)
# Annotate each bar with its value; the row index is used as the bar position
for idx, row in results_comparison.iterrows():
    plt.text(
        row['mean_auc'],
        idx,
        f" {row['mean_auc']:.5f}",
        va='center',
        fontweight='bold',
    )
plt.tight_layout()
plt.show()

# Model with the highest mean AUC
best_row_label = results_comparison['mean_auc'].idxmax()
best_model_name = results_comparison.loc[best_row_label, 'model']
print(f"üèÜ Melhor Modelo: {best_model_name}")


# Gerar Predições para Submissão

In [None]:
def generate_predictions(models, X_test, model_name='ensemble'):
    """Average the test-set predictions of several fold models.

    Parameters
    ----------
    models : non-empty sequence of fitted models, all of the same family.
    X_test : feature table to score.
    model_name : selects the prediction API:
        'LightGBM' -> ``model.predict(X_test, num_iteration=best_iteration)``;
        'XGBoost'  -> ``model.predict(DMatrix, iteration_range=...)``;
        anything else (including the default) -> ``predict_proba(X_test)[:, 1]``,
        which is how the CatBoost models are scored.

    Returns
    -------
    Element-wise mean of the per-model predictions (``np.mean`` over axis 0).

    Raises
    ------
    ValueError
        If ``models`` is empty (the mean would otherwise be NaN).
    """
    if len(models) == 0:
        raise ValueError("generate_predictions requires at least one model")

    # The XGBoost input matrix does not depend on the fold model: build it once
    dtest = xgb.DMatrix(X_test) if model_name == 'XGBoost' else None

    predictions = []

    for model in models:
        if model_name == 'LightGBM':
            pred = model.predict(X_test, num_iteration=model.best_iteration)
        elif model_name == 'XGBoost':
            # iteration_range is half-open [begin, end) and best_iteration is a
            # 0-based round index, hence the +1 to include the best round.
            pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
        else:  # CatBoost-style classifier API
            pred = model.predict_proba(X_test)[:, 1]

        predictions.append(pred)

    # Fold ensemble: plain average of the per-model predictions
    final_predictions = np.mean(predictions, axis=0)

    return final_predictions

# Fold-averaged test predictions for each model family
print("\nGerando predi√ß√µes...")

lgb_predictions = generate_predictions(lgb_models, X_test, 'LightGBM')
xgb_predictions = generate_predictions(xgb_models, X_test, 'XGBoost')
cat_predictions = generate_predictions(cat_models, X_test, 'CatBoost')

# Final ensemble: weighted average, each weight proportional to the model's CV AUC
total_score = lgb_mean_auc + xgb_mean_auc + cat_mean_auc
lgb_weight = lgb_mean_auc / total_score
xgb_weight = xgb_mean_auc / total_score
cat_weight = cat_mean_auc / total_score

ensemble_predictions = (lgb_weight * lgb_predictions +
                        xgb_weight * xgb_predictions +
                        cat_weight * cat_predictions)

print(f"\nPesos do ensemble:")
print(f"  LightGBM: {lgb_weight:.3f}")
print(f"  XGBoost: {xgb_weight:.3f}")
print(f"  CatBoost: {cat_weight:.3f}")

# Row identifiers for the submission files (positional fallback when 'id' is absent)
test_ids = test['id'].values if 'id' in test.columns else range(len(test))

submissions = {
    'lgb': lgb_predictions,
    'xgb': xgb_predictions,
    'cat': cat_predictions,
    'ensemble': ensemble_predictions
}

# Create the output directory up front: to_csv does not create missing
# directories, and failing here would waste the whole training run.
submission_dir = Path('../submissions')
submission_dir.mkdir(parents=True, exist_ok=True)

for name, preds in submissions.items():
    submission = pd.DataFrame({
        'id': test_ids,
        'y': preds
    })
    submission.to_csv(submission_dir / f'submission_{name}.csv', index=False)
    print(f"‚úÖ Arquivo salvo: submission_{name}.csv")

print("\nüéØ Submiss√µes prontas! Recomendo testar o ensemble primeiro.")


## Ensemble

In [None]:
# Side-by-side predictions of the three models and the ensemble (first 10 test rows)
comparison_df = pd.DataFrame({
    'id': test_ids[:10],
    'LightGBM': lgb_predictions[:10],
    'XGBoost': xgb_predictions[:10],
    'CatBoost': cat_predictions[:10],
    'Ensemble': ensemble_predictions[:10]
})

print(f"\n{'=' * 70}")
print("COMPARA√á√ÉO DE PREDI√á√ïES - Primeiros 10 Clientes")
print('=' * 70)
print(comparison_df.to_string(index=False))

# Pairwise absolute differences, computed on those same 10 rows only
model_pairs = [
    ('diff_lgb_xgb', 'LightGBM', 'XGBoost'),
    ('diff_lgb_cat', 'LightGBM', 'CatBoost'),
    ('diff_xgb_cat', 'XGBoost', 'CatBoost'),
]
for diff_col, left_model, right_model in model_pairs:
    comparison_df[diff_col] = (comparison_df[left_model] - comparison_df[right_model]).abs()

print(f"\nM√©dia das diferen√ßas entre modelos:")
for diff_col, left_model, right_model in model_pairs:
    print(f"  {left_model} vs {right_model}: {comparison_df[diff_col].mean():.5f}")

# Histogram of the predicted probabilities, one panel per prediction vector
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

hist_panels = [
    (lgb_predictions, 'LightGBM', 'Distribui√ß√£o - LightGBM', 'blue'),
    (xgb_predictions, 'XGBoost', 'Distribui√ß√£o - XGBoost', 'green'),
    (cat_predictions, 'CatBoost', 'Distribui√ß√£o - CatBoost', 'orange'),
    (ensemble_predictions, 'Ensemble', 'Distribui√ß√£o - Ensemble (Final)', 'purple'),
]
# axes.flat walks the 2x2 grid row by row: [0,0], [0,1], [1,0], [1,1]
for panel_ax, (panel_values, panel_label, panel_title, panel_color) in zip(axes.flat, hist_panels):
    panel_ax.hist(panel_values, bins=50, alpha=0.7, label=panel_label, color=panel_color)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_xlabel('Probabilidade Predita')
    # Dashed red line at the mean prediction
    panel_ax.axvline(panel_values.mean(), color='red', linestyle='--',
                     label=f'M√©dia: {panel_values.mean():.3f}')
    panel_ax.legend()

plt.tight_layout()
plt.show()

# Summary statistics of each prediction vector
print(f"\n{'=' * 70}")
print("ESTAT√çSTICAS DAS PREDI√á√ïES")
print('=' * 70)

stat_sections = [
    ("\nLightGBM:", lgb_predictions),
    ("\nXGBoost:", xgb_predictions),
    ("\nCatBoost:", cat_predictions),
    ("\nüéØ ENSEMBLE (FINAL):", ensemble_predictions),
]
for section_header, section_values in stat_sections:
    print(section_header)
    print(f"  M√©dia: {section_values.mean():.5f}")
    print(f"  Min: {section_values.min():.5f}")
    print(f"  Max: {section_values.max():.5f}")
    print(f"  Std: {section_values.std():.5f}")

# Correlation between the three models' predictions over the whole test set
corr_matrix = pd.DataFrame({
    'LightGBM': lgb_predictions,
    'XGBoost': xgb_predictions,
    'CatBoost': cat_predictions
}).corr()

print(f"\n{'=' * 70}")
print("CORRELA√á√ÉO ENTRE MODELOS")
print('=' * 70)
print(corr_matrix)
print("\nüí° Correla√ß√£o alta (>0.95) = modelos muito similares")
print("üí° Correla√ß√£o moderada (0.85-0.95) = ideal para ensemble!")


# Outros testes

## Teste sem duration

In [None]:
print(f"\n{'=' * 70}")
print("TESTE: MODELOS SEM DURATION")
print(f"{'=' * 70}\n")

# Drop the 'duration' column (when present) from both feature tables
X_no_duration = X.drop(columns='duration', errors='ignore')
X_test_no_duration = X_test.drop(columns='duration', errors='ignore')

print(f"Features sem duration: {X_no_duration.shape[1]}")

# Same LightGBM configuration and folds as the main run, minus 'duration'
lgb_scores_no_dur = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_no_duration, y), 1):
    X_train_fold, y_train_fold = X_no_duration.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X_no_duration.iloc[val_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Only the validation set is monitored here; early stopping patience is 50
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )

    y_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val_fold, y_pred)
    lgb_scores_no_dur.append(auc)
    print(f"Fold {fold} AUC (sem duration): {auc:.5f}")

# Compare against the mean AUC of the main LightGBM run
lgb_mean_auc_no_dur = np.mean(lgb_scores_no_dur)

print(f"\nMean AUC sem duration: {lgb_mean_auc_no_dur:.5f}")
print(f"Compara√ß√£o:")
print(f"  COM duration: {lgb_mean_auc:.5f}")
print(f"  SEM duration: {lgb_mean_auc_no_dur:.5f}")
print(f"  Diferen√ßa: {lgb_mean_auc - lgb_mean_auc_no_dur:.5f}")


## Hyperparameter Tuning com Optuna

### LGB Tuning

In [None]:
def objective(trial):
    """Optuna objective for LightGBM.

    Samples one hyperparameter configuration from ``trial``, trains one
    booster per fold of the module-level ``skf`` split of ``X``/``y``
    (at most 500 rounds, early stopping patience 30) and returns the mean
    hold-out ROC AUC across folds.
    """
    # Searched hyperparameters (sampled in this order)
    searched = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
    }
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        **searched,
        'scale_pos_weight': scale_pos_weight,
        'random_state': RANDOM_STATE,
        'verbose': -1,
    }

    fold_aucs = []
    for fit_rows, holdout_rows in skf.split(X, y):
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        fit_set = lgb.Dataset(X.iloc[fit_rows], label=y.iloc[fit_rows])
        holdout_set = lgb.Dataset(X_holdout, label=y_holdout, reference=fit_set)

        booster = lgb.train(
            params,
            fit_set,
            num_boost_round=500,
            valid_sets=[holdout_set],
            callbacks=[lgb.early_stopping(30, verbose=False)],
        )

        holdout_pred = booster.predict(X_holdout, num_iteration=booster.best_iteration)
        fold_aucs.append(roc_auc_score(y_holdout, holdout_pred))

    return np.mean(fold_aucs)

# Run the hyperparameter search
print("\nüîß Iniciando Hyperparameter Tuning...")
print("Isso pode levar alguns minutos...\n")

# Seed the sampler so the sequence of sampled configurations is repeatable,
# in line with RANDOM_STATE being fixed for the folds and the models.
study = optuna.create_study(
    direction='maximize',
    study_name='lgb_optimization',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\n" + "="*70)
print("MELHORES HIPERPAR√ÇMETROS")
print("="*70)
print(f"\nMelhor AUC: {study.best_value:.5f}")
# Signed format: a hard-coded '+' would print '+-0.00100' when tuning is worse
print(f"Melhoria sobre baseline: {study.best_value - lgb_mean_auc:+.5f}")
print("\nMelhores par√¢metros:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")


### XGB Tuning

In [None]:
# ============================================
# HYPERPARAMETER TUNING - XGBOOST (FIXED)
# ============================================

print('=' * 70)
print("HYPERPARAMETER TUNING - XGBOOST")
print('=' * 70)

def objective_xgb(trial):
    """Optuna objective for XGBoost.

    Samples one hyperparameter configuration from ``trial``, fits one
    ``XGBClassifier`` per fold of the module-level ``skf`` split of ``X``/``y``
    (early stopping patience 30 on the hold-out part) and returns the mean
    hold-out ROC AUC across folds.
    """
    # Searched hyperparameters (sampled in this order)
    searched = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1.0),
    }
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',
        **searched,
        'scale_pos_weight': scale_pos_weight,
        'random_state': RANDOM_STATE,
        # early stopping is configured on the estimator, not passed to fit()
        'early_stopping_rounds': 30,
        'n_jobs': -1,
    }

    fold_aucs = []
    for fit_rows, holdout_rows in skf.split(X, y):
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        clf = xgb.XGBClassifier(**params)
        clf.fit(
            X.iloc[fit_rows], y.iloc[fit_rows],
            eval_set=[(X_holdout, y_holdout)],
            verbose=False,
        )

        holdout_proba = clf.predict_proba(X_holdout)[:, 1]
        fold_aucs.append(roc_auc_score(y_holdout, holdout_proba))

    return np.mean(fold_aucs)

# Create the study and run the search. The sampler is seeded so the sequence
# of sampled configurations is repeatable, in line with RANDOM_STATE being
# fixed for the folds and the models.
study_xgb = optuna.create_study(
    direction='maximize',
    study_name='xgb_optimization',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

print("="*70)
print("MELHORES HIPERPAR√ÇMETROS - XGBOOST")
print("="*70)
print(f"Melhor AUC: {study_xgb.best_value:.5f}")
print(f"Melhoria sobre baseline: {study_xgb.best_value - xgb_mean_auc:.5f}")
print("\nMelhores par√¢metros:")
for key, value in study_xgb.best_params.items():
    print(f"  {key}: {value}")

### Cat Tuning

In [None]:
print('=' * 70)
print("HYPERPARAMETER TUNING - CATBOOST")
print('=' * 70)

def objective_cat(trial):
    """Optuna objective for CatBoost on CPU.

    Samples one hyperparameter configuration from ``trial``, fits one
    ``CatBoostClassifier`` per fold of the module-level ``skf`` split of
    ``X``/``y`` (early stopping patience 30 on the hold-out part) and returns
    the mean hold-out ROC AUC across folds.
    """
    # Searched hyperparameters (sampled in this order)
    searched = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0, 10),
    }
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'task_type': 'CPU',
        **searched,
        'scale_pos_weight': scale_pos_weight,
        'random_state': RANDOM_STATE,
        'verbose': False,
    }

    fold_aucs = []
    for fit_rows, holdout_rows in skf.split(X, y):
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        clf = CatBoostClassifier(**params)
        clf.fit(
            X.iloc[fit_rows], y.iloc[fit_rows],
            eval_set=(X_holdout, y_holdout),
            early_stopping_rounds=30,
            verbose=False,
        )

        holdout_proba = clf.predict_proba(X_holdout)[:, 1]
        fold_aucs.append(roc_auc_score(y_holdout, holdout_proba))

    return np.mean(fold_aucs)

# Create the study and run the search. The sampler is seeded so the sequence
# of sampled configurations is repeatable, in line with RANDOM_STATE being
# fixed for the folds and the models.
study_cat = optuna.create_study(
    direction='maximize',
    study_name='cat_optimization',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_cat.optimize(objective_cat, n_trials=50, show_progress_bar=True)

print("="*70)
print("MELHORES HIPERPAR√ÇMETROS - CATBOOST")
print("="*70)
print(f"Melhor AUC: {study_cat.best_value:.5f}")
print(f"Melhoria sobre baseline: {study_cat.best_value - cat_mean_auc:.5f}")
print("\nMelhores par√¢metros:")
for key, value in study_cat.best_params.items():
    print(f"  {key}: {value}")

### Cat Tuning with GPU

In [None]:
print("="*70)
print("HYPERPARAMETER TUNING - CATBOOST (GPU)")
print("="*70)

def objective_cat(trial):
    """Optuna objective: mean stratified-CV ROC AUC of a GPU-trained CatBoost model.

    Draws one hyperparameter configuration from ``trial``, fits a fresh
    ``CatBoostClassifier`` on every split produced by the notebook-level
    ``skf`` splitter over ``X``/``y``, and scores each split's held-out rows.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        The mean of the per-fold ROC AUC values.

    Note:
        The held-out rows of each fold are used both as the early-stopping
        ``eval_set`` and as the scoring set. This function shares its name
        with the CPU objective used earlier in the notebook, so running this
        cell rebinds ``objective_cat``.
    """
    # Sampled values; the suggest calls run in the same order as the keys below.
    sampled = {}
    sampled['iterations'] = trial.suggest_int('iterations', 100, 1000)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1)
    sampled['depth'] = trial.suggest_int('depth', 4, 10)
    sampled['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 1, 10)
    sampled['border_count'] = trial.suggest_int('border_count', 32, 255)
    sampled['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 1)
    sampled['random_strength'] = trial.suggest_float('random_strength', 0, 10)

    params = {
        # Fixed training setup: log-loss objective, AUC monitoring, first GPU.
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'task_type': 'GPU',
        'devices': '0',
        **sampled,
        # Class-imbalance weight, seed and logging stay constant across trials.
        'scale_pos_weight': scale_pos_weight,
        'random_state': RANDOM_STATE,
        'verbose': False,
    }

    def _score_fold(fit_rows, holdout_rows):
        """Fit on ``fit_rows`` and return the ROC AUC on ``holdout_rows``."""
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        clf = CatBoostClassifier(**params)
        clf.fit(
            X_fit, y_fit,
            eval_set=(X_holdout, y_holdout),
            early_stopping_rounds=30,
            verbose=False
        )

        positive_proba = clf.predict_proba(X_holdout)[:, 1]
        return roc_auc_score(y_holdout, positive_proba)

    fold_aucs = [_score_fold(fit_rows, holdout_rows)
                 for fit_rows, holdout_rows in skf.split(X, y)]
    return np.mean(fold_aucs)

# Create a new maximizing Optuna study for the GPU objective and run 50 trials.
# This rebinds `study_cat`, replacing the study object created by the CPU cell.
study_cat = optuna.create_study(direction='maximize', study_name='cat_gpu_optimization')
study_cat.optimize(objective_cat, n_trials=50, show_progress_bar=True)

# Report the best score, its gap to `cat_mean_auc` (defined outside this cell),
# and the winning parameters.
print("="*70)
print("MELHORES HIPERPAR√ÇMETROS - CATBOOST (GPU)")
print("="*70)
print(f"Melhor AUC: {study_cat.best_value:.5f}")
print(f"Melhoria sobre baseline: {study_cat.best_value - cat_mean_auc:.5f}")
print("\nMelhores par√¢metros:")
for key, value in study_cat.best_params.items():
    print(f"  {key}: {value}")

## Feature Selection

In [None]:
# Load the saved feature-importance table.
feature_importance = pd.read_csv('../data/processed/feature_importance.csv')

# Evaluate a model restricted to the first 30 features listed in the file.
# NOTE(review): `head(top_n)` takes the first rows as stored, so this assumes
# the CSV is already sorted by descending importance — confirm.
top_n = 30
top_features = feature_importance.head(top_n)['feature'].tolist()

print(f"\nTestando com top {top_n} features:")
print(top_features)

X_selected = X[top_features]
# Built for symmetry with the train matrix; not used further in this cell.
X_test_selected = X_test[top_features]

# Cross-validate LightGBM on the selected features.
# `lgb_params` is defined outside this cell.
lgb_scores_selected = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_selected, y), 1):
    X_train_fold = X_selected.iloc[train_idx]
    X_val_fold = X_selected.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]

    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Up to 1000 rounds; stop after 50 rounds without improvement on the
    # validation fold.
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )

    # Score the same validation fold at the early-stopped iteration.
    y_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val_fold, y_pred)
    lgb_scores_selected.append(auc)
    print(f"Fold {fold} AUC (top {top_n}): {auc:.5f}")

# Compare against `lgb_mean_auc` (defined outside this cell; the label printed
# below describes it as the all-features score).
print(f"\nMean AUC (top {top_n}): {np.mean(lgb_scores_selected):.5f}")
print(f"Compara√ß√£o com todas features: {lgb_mean_auc:.5f}")


# Rodadas finais

## Hiperparâmetros otimizados

In [None]:
# Train the tuned LightGBM configuration with stratified CV, keeping every
# fold's model and AUC for the summary and test-set prediction steps.
print("\n" + "="*70)
print("TREINANDO MODELO FINAL COM HIPERPAR√ÇMETROS OTIMIZADOS")
print("="*70 + "\n")

# Hyperparameters hard-coded here; per the original comment they are the best
# values found by the Optuna search.
best_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 88,
    'learning_rate': 0.0679235909913875,
    'feature_fraction': 0.6008994544769531,
    'bagging_fraction': 0.9617438469454634,
    'bagging_freq': 1,
    'min_child_samples': 48,
    'max_depth': 12,
    'scale_pos_weight': scale_pos_weight,
    'random_state': RANDOM_STATE,
    'verbose': -1
}

# Cross-validated training: one model and one AUC per fold.
tuned_models = []
tuned_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Training Fold {fold}/{N_FOLDS}")

    X_train_fold = X.iloc[train_idx]
    X_val_fold = X.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]

    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Early stopping monitors the validation fold (patience 100); the metric
    # is logged every 200 rounds.
    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=2000,  # more boosting rounds
        valid_sets=[val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=200)
        ]
    )

    # Score the validation fold at the early-stopped iteration.
    y_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val_fold, y_pred)

    tuned_scores.append(auc)
    tuned_models.append(model)

    print(f"Fold {fold} AUC: {auc:.5f}")
    print(f"Best iteration: {model.best_iteration}\n")

# Aggregate the per-fold AUCs of the tuned model.
tuned_mean_auc = np.mean(tuned_scores)
tuned_std_auc = np.std(tuned_scores)

# Summary versus `lgb_mean_auc` (defined outside this cell).
print(f"\n{'='*70}")
print(f"TUNED MODEL - Mean AUC: {tuned_mean_auc:.5f} (+/- {tuned_std_auc:.5f})")
print(f"Baseline Model - Mean AUC: {lgb_mean_auc:.5f}")
# The `+` format flag prints the real sign of the difference; a literal "+"
# prefix would render a regression as "+-0.00123".
print(f"Improvement: {tuned_mean_auc - lgb_mean_auc:+.5f}")
print(f"{'='*70}\n")

# Predict the test set with every fold model at its early-stopped iteration.
tuned_predictions = []
for model in tuned_models:
    pred = model.predict(X_test, num_iteration=model.best_iteration)
    tuned_predictions.append(pred)

# Average the fold models' predictions row by row.
tuned_final_pred = np.mean(tuned_predictions, axis=0)

# Save the submission; fall back to positional ids when `test` has no `id` column.
submission_tuned = pd.DataFrame({
    'id': test['id'].values if 'id' in test.columns else range(len(test)),
    'y': tuned_final_pred
})
submission_tuned.to_csv('../submissions/submission_lgb_tuned.csv', index=False)
print("‚úÖ Submiss√£o salva: submission_lgb_tuned.csv")


## Ensemble Otimizado (LightGBM Tuned + XGBoost + CatBoost)

In [None]:
# Blend the tuned LightGBM, XGBoost and CatBoost test predictions with weights
# proportional to each model's mean CV AUC.
print("\n" + "="*70)
print("CRIANDO ENSEMBLE FINAL")
print("="*70 + "\n")

# Mean CV AUC per model; `xgb_mean_auc` and `cat_mean_auc` are defined outside
# this cell.
models_performance = {
    'tuned_lgb': tuned_mean_auc,
    'xgb': xgb_mean_auc,
    'cat': cat_mean_auc
}

print("Performance dos modelos:")
for name, score in models_performance.items():
    print(f"  {name}: {score:.5f}")

# Each weight is the model's score divided by the sum of all scores, so the
# weights add up to 1.
total_score = sum(models_performance.values())
weights = {k: v/total_score for k, v in models_performance.items()}

print("\nPesos do ensemble:")
for name, weight in weights.items():
    print(f"  {name}: {weight:.4f}")

# Weighted blend of the three prediction vectors.
# NOTE(review): `xgb_predictions` and `cat_predictions` come from earlier cells;
# this assumes each is a per-row test prediction array aligned with
# `tuned_final_pred` — confirm.
ensemble_final = (
    weights['tuned_lgb'] * tuned_final_pred + 
    weights['xgb'] * xgb_predictions + 
    weights['cat'] * cat_predictions
)

# Save the submission; fall back to positional ids when `test` has no `id` column.
submission_ensemble = pd.DataFrame({
    'id': test['id'].values if 'id' in test.columns else range(len(test)),
    'y': ensemble_final
})
submission_ensemble.to_csv('../submissions/submission_ensemble_optimized.csv', index=False)
print("\n‚úÖ Ensemble otimizado salvo: submission_ensemble_optimized.csv")

# Compare the prediction distributions of the tuned model and the ensemble.
print("\n" + "="*70)
print("COMPARA√á√ÉO DAS PREDI√á√ïES")
print("="*70)
print(f"\nLightGBM Tuned:")
print(f"  M√©dia: {tuned_final_pred.mean():.5f}")
print(f"  Min: {tuned_final_pred.min():.5f}")
print(f"  Max: {tuned_final_pred.max():.5f}")

print(f"\nEnsemble Otimizado:")
print(f"  M√©dia: {ensemble_final.mean():.5f}")
print(f"  Min: {ensemble_final.min():.5f}")
print(f"  Max: {ensemble_final.max():.5f}")


## Top 30 Features com Tuning

In [None]:
# Re-train the tuned LightGBM configuration on the first 30 features of the
# importance file and, if the CV AUC is close enough to the all-features
# model, write a submission from it.
print("\n" + "="*70)
print("MODELO SIMPLIFICADO: TOP 30 FEATURES + TUNING")
print("="*70 + "\n")

# First 30 rows of the importance file.
# NOTE(review): assumes the CSV is sorted by descending importance — confirm.
feature_importance = pd.read_csv('../data/processed/feature_importance.csv')
top_30_features = feature_importance.head(30)['feature'].tolist()

X_top30 = X[top_30_features]
X_test_top30 = X_test[top_30_features]

# Cross-validated training with `best_params` (defined in an earlier cell).
top30_models = []
top30_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_top30, y), 1):
    X_train_fold = X_top30.iloc[train_idx]
    X_val_fold = X_top30.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]

    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Up to 2000 rounds with early stopping (patience 100) on the validation fold.
    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=2000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )

    y_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val_fold, y_pred)

    top30_scores.append(auc)
    top30_models.append(model)

    print(f"Fold {fold} AUC: {auc:.5f}")

top30_mean_auc = np.mean(top30_scores)
print(f"\nTop 30 Features + Tuning: {top30_mean_auc:.5f}")
print(f"All Features + Tuning: {tuned_mean_auc:.5f}")
print(f"Diferen√ßa: {tuned_mean_auc - top30_mean_auc:.5f}")

# Only when the absolute AUC gap is below 0.0005 is the simplified model
# submitted; otherwise no `submission_lgb_top30.csv` is written by this cell.
if abs(tuned_mean_auc - top30_mean_auc) < 0.0005:
    print("\nüí° Modelo simplificado tem performance similar! Gerando submiss√£o...")

    # Average the fold models' test predictions.
    top30_predictions = []
    for model in top30_models:
        pred = model.predict(X_test_top30, num_iteration=model.best_iteration)
        top30_predictions.append(pred)

    top30_final = np.mean(top30_predictions, axis=0)

    submission_top30 = pd.DataFrame({
        'id': test['id'].values if 'id' in test.columns else range(len(test)),
        'y': top30_final
    })
    submission_top30.to_csv('../submissions/submission_lgb_top30.csv', index=False)
    print("‚úÖ Submiss√£o top30 salva!")


# Checklist

## Validações

In [None]:
import pandas as pd
import os

# Validate each submission file against the sample submission: required
# columns, row count, missing values, probability range and id alignment.
print("="*70)
print("VALIDA√á√ÉO DOS ARQUIVOS DE SUBMISS√ÉO")
print("="*70 + "\n")

# Submission files to validate.
submission_files = [
    '../submissions/submission_lgb_top30.csv',
    '../submissions/submission_lgb_tuned.csv',
    '../submissions/submission_ensemble_optimized.csv'
]

# Sample submission used as the reference for row count and ids.
sample_sub = pd.read_csv('../data/raw/analise-preditiva-de-comportamento-bancario/sample_submission.csv')

for file_path in submission_files:
    if os.path.exists(file_path):
        print(f"\nüìÑ Validando: {os.path.basename(file_path)}")
        print("-" * 60)

        sub = pd.read_csv(file_path)

        # Structural facts are computed first so that the dependent checks can
        # fail cleanly instead of raising KeyError (missing column) or
        # ValueError (comparing Series of different lengths).
        has_id = 'id' in sub.columns
        has_y = 'y' in sub.columns
        same_length = len(sub) == len(sample_sub)

        if 'id' not in sample_sub.columns:
            # No reference ids to compare against: the check passes trivially.
            ids_match = True
        else:
            ids_match = bool(
                has_id and same_length and (sub['id'] == sample_sub['id']).all()
            )

        checks = {
            'Tem coluna id': has_id,
            'Tem coluna y': has_y,
            'N√∫mero correto de linhas': same_length,
            'Sem valores faltantes': sub.isnull().sum().sum() == 0,
            'y entre 0 e 1': bool(has_y and (sub['y'].min() >= 0) and (sub['y'].max() <= 1)),
            'IDs corretos': ids_match
        }

        # Print every check and remember whether any of them failed.
        all_passed = True
        for check, result in checks.items():
            status = "‚úÖ" if result else "‚ùå"
            print(f"{status} {check}: {result}")
            if not result:
                all_passed = False

        # Summary statistics only make sense when the prediction column exists.
        if has_y:
            print(f"\nüìä Estat√≠sticas da coluna 'y':")
            print(f"   M√©dia: {sub['y'].mean():.5f}")
            print(f"   Mediana: {sub['y'].median():.5f}")
            print(f"   Min: {sub['y'].min():.5f}")
            print(f"   Max: {sub['y'].max():.5f}")
            print(f"   Desvio padr√£o: {sub['y'].std():.5f}")

        if all_passed:
            print(f"\n‚úÖ {os.path.basename(file_path)} est√° PRONTO para submiss√£o!")
        else:
            print(f"\n‚ùå {os.path.basename(file_path)} tem problemas! Corrija antes de submeter.")
    else:
        print(f"\n‚ùå Arquivo n√£o encontrado: {file_path}")

print("\n" + "="*70)
print("VALIDA√á√ÉO CONCLU√çDA")
print("="*70)

# Compare the prediction distributions of the submission files that exist.
print("\nüìä COMPARA√á√ÉO ENTRE SUBMISS√ïES\n")

# Column label -> submission path. The top-30 file is only written when the
# simplified model is close enough to the full one, so it may be absent;
# missing files are reported and skipped rather than raising FileNotFoundError.
comparison_sources = {
    'Top30': '../submissions/submission_lgb_top30.csv',
    'Tuned': '../submissions/submission_lgb_tuned.csv',
    'Ensemble': '../submissions/submission_ensemble_optimized.csv'
}

loaded_subs = {}
for label, path in comparison_sources.items():
    if os.path.exists(path):
        loaded_subs[label] = pd.read_csv(path)
    else:
        print(f"\n‚ùå Arquivo n√£o encontrado: {path}")

# Keep the original per-file names available (None when the file is missing).
sub1 = loaded_subs.get('Top30')
sub2 = loaded_subs.get('Tuned')
sub3 = loaded_subs.get('Ensemble')

# One column per available submission: mean, std, min and max of `y`.
comparison = pd.DataFrame({
    label: [frame['y'].mean(), frame['y'].std(), frame['y'].min(), frame['y'].max()]
    for label, frame in loaded_subs.items()
}, index=['M√©dia', 'Desvio', 'Min', 'Max'])

print(comparison.to_string())

# Pairwise correlation between the available prediction columns.
print("\nüîó CORRELA√á√ÉO ENTRE PREDI√á√ïES:\n")
corr = pd.DataFrame({
    label: frame['y'] for label, frame in loaded_subs.items()
}).corr()
print(corr.to_string())


## Conclusões e insights

In [None]:
# Final report: CV scores of every model followed by fixed narrative text.
# All `*_mean_auc` / `*_std_auc` values are computed in earlier cells.
print("\n" + "="*70)
print("CONCLUS√ïES DO PROJETO")
print("="*70 + "\n")

print("üìä PERFORMANCE DOS MODELOS:")
print(f"   ‚Ä¢ Logistic Regression (baseline): {lr_mean_auc:.5f}")
print(f"   ‚Ä¢ LightGBM (baseline):           {lgb_mean_auc:.5f}")
print(f"   ‚Ä¢ LightGBM (tuned):              {tuned_mean_auc:.5f}")
print(f"   ‚Ä¢ XGBoost:                       {xgb_mean_auc:.5f}")
print(f"   ‚Ä¢ CatBoost:                      {cat_mean_auc:.5f}")

print("\nüéØ MELHOR MODELO: LightGBM com Hyperparameter Tuning")
print(f"   ‚Ä¢ AUC-ROC: {tuned_mean_auc:.5f} (¬±{tuned_std_auc:.5f})")
# The `+` format flag prints the real sign; a literal "+" prefix would render
# a negative difference as "+-0.00123".
print(f"   ‚Ä¢ Melhoria sobre baseline: {tuned_mean_auc - lgb_mean_auc:+.5f}")

# The figures quoted in the lines below are hard-coded text, not computed here.
print("\nüí° PRINCIPAIS INSIGHTS:")
print("   1. Duration √© a feature mais importante (correla√ß√£o 0.52 com target)")
print("   2. Hist√≥rico de campanha (poutcome='success') tem 76% de convers√£o")
print("   3. Meses mar, sep, oct, dec t√™m convers√£o >50% (alta sazonalidade)")
print("   4. Top 30 features capturam 99.99% da performance")
print("   5. Modelos tree-based superam modelos lineares significativamente")

print("\n‚öôÔ∏è T√âCNICAS APLICADAS:")
print("   ‚Ä¢ Feature Engineering: 51 novas features criadas")
print("   ‚Ä¢ Target Encoding para vari√°veis de alta cardinalidade")
print("   ‚Ä¢ Stratified K-Fold (5 folds) para valida√ß√£o")
print("   ‚Ä¢ Scale pos weight para lidar com desbalanceamento")
print("   ‚Ä¢ Hyperparameter Tuning com Optuna (50 trials)")
print("   ‚Ä¢ Ensemble de m√∫ltiplos modelos")

print("\nüìÅ ARQUIVOS DE SUBMISS√ÉO GERADOS:")
print("   1. submission_lgb_top30.csv (recomendado - menos overfitting)")
print("   2. submission_lgb_tuned.csv (melhor CV score)")
print("   3. submission_ensemble_optimized.csv (mais robusto)")

print("\n" + "="*70)
print("PROJETO CONCLU√çDO COM SUCESSO!")
print("="*70)


## Última Submissão

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

# Setup for the calibrated submissions: reload the processed data, redefine
# the tuned LightGBM parameters and the CV splitter, and create the containers
# filled by the fold loop. This rebinds `train`, `test`, `X`, `y`, `X_test`,
# `best_params` and `skf`.
print("="*70)
print("GERANDO SUBMISS√ïES CALIBRADAS")
print("="*70 + "\n")

# Load the processed data.
train = pd.read_csv('../data/processed/train_processed.csv')
test = pd.read_csv('../data/processed/test_processed.csv')

# Target, training features and test features (`id` dropped when present).
y = train['y']
X = train.drop(['y', 'id'], axis=1, errors='ignore')
X_test = test.drop(['id'], axis=1, errors='ignore')

# Tuned LightGBM parameters; `scale_pos_weight` is the negative/positive
# count ratio recomputed from `y`.
best_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 88,
    'learning_rate': 0.0679235909913875,
    'feature_fraction': 0.6008994544769531,
    'bagging_fraction': 0.9617438469454634,
    'bagging_freq': 1,
    'min_child_samples': 48,
    'max_depth': 12,
    'scale_pos_weight': (y == 0).sum() / (y == 1).sum(),
    'random_state': 42,
    'verbose': -1
}

# 5-fold stratified, shuffled splitter with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold calibrators and raw test predictions, plus one out-of-fold
# prediction slot per training row.
iso_calibrators = []
platt_calibrators = []
test_predictions_raw = []
oof_predictions = np.zeros(len(X))  # Out-of-fold predictions

print("üîÑ Treinando modelos e calibradores...\n")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Processando Fold {fold}/5...")
    
    X_train_fold = X.iloc[train_idx]
    X_val_fold = X.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]
    
    # Treinar modelo
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)
    
    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=2000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )
    
    # Predi√ß√µes no validation set (para treinar calibrador)
    y_val_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = y_val_pred
    
    # Treinar calibradores
    # Isotonic Regression
    iso_cal = IsotonicRegression(out_of_bounds='clip')
    iso_cal.fit(y_val_pred, y_val_fold)
    iso_calibrators.append(iso_cal)
    
    # Platt Scaling
    platt_cal = LogisticRegression()
    platt_cal.fit(y_val_pred.reshape(-1, 1), y_val_fold)
    platt_calibrators.append(platt_cal)
    
    # Predi√ß√µes no test set
    y_test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    test_predictions_raw.append(y_test_pred)
    
    print(f"  ‚úì Fold {fold} completo\n")

# Mean of the fold models' raw test predictions (no calibration).
test_pred_raw_mean = np.mean(test_predictions_raw, axis=0)

print("="*70)
print("APLICANDO CALIBRA√á√ÉO NAS PREDI√á√ïES DO TEST SET")
print("="*70 + "\n")

# Apply each fold's calibrators to that same fold's raw test predictions,
# then average the calibrated outputs across folds.
test_predictions_iso = []
test_predictions_platt = []

for i, (iso_cal, platt_cal) in enumerate(zip(iso_calibrators, platt_calibrators)):
    # Isotonic
    pred_iso = iso_cal.predict(test_predictions_raw[i])
    test_predictions_iso.append(pred_iso)

    # Platt: probability of the positive class
    pred_platt = platt_cal.predict_proba(test_predictions_raw[i].reshape(-1, 1))[:, 1]
    test_predictions_platt.append(pred_platt)

# Mean of the calibrated predictions across folds.
test_pred_iso_mean = np.mean(test_predictions_iso, axis=0)
test_pred_platt_mean = np.mean(test_predictions_platt, axis=0)

# Side-by-side statistics of raw vs calibrated test predictions.
print("üìä ESTAT√çSTICAS DAS PREDI√á√ïES:\n")
print(f"{'M√©todo':<25} {'M√©dia':<12} {'Mediana':<12} {'Min':<12} {'Max':<12}")
print("-" * 73)
print(f"{'Sem Calibra√ß√£o':<25} {test_pred_raw_mean.mean():<12.5f} {np.median(test_pred_raw_mean):<12.5f} {test_pred_raw_mean.min():<12.5f} {test_pred_raw_mean.max():<12.5f}")
print(f"{'Isotonic Regression':<25} {test_pred_iso_mean.mean():<12.5f} {np.median(test_pred_iso_mean):<12.5f} {test_pred_iso_mean.min():<12.5f} {test_pred_iso_mean.max():<12.5f}")
print(f"{'Platt Scaling':<25} {test_pred_platt_mean.mean():<12.5f} {np.median(test_pred_platt_mean):<12.5f} {test_pred_platt_mean.min():<12.5f} {test_pred_platt_mean.max():<12.5f}")

# Compare the training positive rate with the mean isotonic-calibrated test
# prediction.
print(f"\nüí° Taxa de convers√£o esperada no train: {y.mean():.5f} ({y.mean()*100:.2f}%)")
print(f"   Predi√ß√£o m√©dia calibrada (Isotonic): {test_pred_iso_mean.mean():.5f} ({test_pred_iso_mean.mean()*100:.2f}%)")
print(f"   Diferen√ßa: {abs(y.mean() - test_pred_iso_mean.mean()):.5f}\n")

# Save the calibrated submissions; ids fall back to positional values when
# `test` has no `id` column.
test_ids = test['id'].values if 'id' in test.columns else range(len(test))

# Submission 1: isotonic-calibrated predictions (marked as recommended by the
# original author).
submission_iso = pd.DataFrame({
    'id': test_ids,
    'y': test_pred_iso_mean
})
submission_iso.to_csv('../submissions/submission_lgb_calibrated_isotonic.csv', index=False)
print("‚úÖ Salvo: submission_lgb_calibrated_isotonic.csv")

# Submission 2: Platt-scaled predictions.
submission_platt = pd.DataFrame({
    'id': test_ids,
    'y': test_pred_platt_mean
})
submission_platt.to_csv('../submissions/submission_lgb_calibrated_platt.csv', index=False)
print("‚úÖ Salvo: submission_lgb_calibrated_platt.csv")

# Submission 3: isotonic-calibrated model trained on the first 30 features of
# the importance file.
print("\nüîÑ Gerando vers√£o calibrada do Top 30 features...")

# First 30 rows of the importance file.
# NOTE(review): assumes the CSV is sorted by descending importance — confirm.
feature_importance = pd.read_csv('../data/processed/feature_importance.csv')
top_30_features = feature_importance.head(30)['feature'].tolist()

X_top30 = X[top_30_features]
X_test_top30 = X_test[top_30_features]

# Same train / calibrate / predict procedure as for the full feature set,
# with the isotonic calibrator only.
iso_calibrators_top30 = []
test_predictions_top30 = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_top30, y), 1):
    X_train_fold = X_top30.iloc[train_idx]
    X_val_fold = X_top30.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]

    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Up to 2000 rounds with early stopping (patience 100) on the validation fold.
    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=2000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )

    # Validation predictions feed this fold's isotonic calibrator.
    y_val_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)

    iso_cal = IsotonicRegression(out_of_bounds='clip')
    iso_cal.fit(y_val_pred, y_val_fold)
    iso_calibrators_top30.append(iso_cal)

    # Raw test predictions of this fold's model.
    y_test_pred = model.predict(X_test_top30, num_iteration=model.best_iteration)
    test_predictions_top30.append(y_test_pred)

# Calibrate each fold's test predictions with that fold's calibrator.
test_predictions_top30_calibrated = []
for i, iso_cal in enumerate(iso_calibrators_top30):
    pred_cal = iso_cal.predict(test_predictions_top30[i])
    test_predictions_top30_calibrated.append(pred_cal)

# Average across folds and save.
test_pred_top30_cal_mean = np.mean(test_predictions_top30_calibrated, axis=0)

submission_top30_cal = pd.DataFrame({
    'id': test_ids,
    'y': test_pred_top30_cal_mean
})
submission_top30_cal.to_csv('../submissions/submission_top30_calibrated.csv', index=False)
print("‚úÖ Salvo: submission_top30_calibrated.csv")

# Static summary of the submission files and the author's suggested order.
# The text is fixed: it is printed whether or not each listed file was written.
# NOTE(review): isotonic calibration is a non-strictly monotone mapping and can
# introduce ties; if the leaderboard metric is a ranking metric such as AUC,
# the "best option" recommendation below should be checked against CV — confirm.
print("\n" + "="*70)
print("RESUMO DAS SUBMISS√ïES DISPON√çVEIS")
print("="*70 + "\n")

print("üìÅ SUBMISS√ïES SEM CALIBRA√á√ÉO (geradas anteriormente):")
print("   1. submission_lgb_top30.csv")
print("   2. submission_lgb_tuned.csv")
print("   3. submission_ensemble_optimized.csv")

print("\nüìÅ SUBMISS√ïES CALIBRADAS (NOVAS - RECOMENDADAS):")
print("   4. submission_lgb_calibrated_isotonic.csv    ‚≠ê MELHOR OP√á√ÉO")
print("   5. submission_lgb_calibrated_platt.csv")
print("   6. submission_top30_calibrated.csv           ‚≠ê ALTERNATIVA ROBUSTA")

print("\nüí° RECOMENDA√á√ÉO FINAL:")
print("   Submeta PRIMEIRO: submission_lgb_calibrated_isotonic.csv")
print("   Segunda op√ß√£o:    submission_top30_calibrated.csv")
print("   Terceira op√ß√£o:   submission_ensemble_optimized.csv (j√° gerado)")


# tmp

## Calibração de Probabilidades

In [None]:
# ==============================================================================
# AN√ÅLISE DE CALIBRA√á√ÉO DO MODELO (CORRIGIDO)
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss, roc_auc_score

# Reload the processed training data and split target from features.
train = pd.read_csv('../data/processed/train_processed.csv')
y = train['y']
X = train.drop(['y', 'id'], axis=1, errors='ignore')

# Use the first fold of a seeded 5-fold stratified split for the analysis.
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, val_idx = list(skf.split(X, y))[0]

X_train_fold = X.iloc[train_idx]
X_val = X.iloc[val_idx]
y_train_fold = y.iloc[train_idx]
y_val = y.iloc[val_idx]

# Retrain one quick model for the analysis.
train_data = lgb.Dataset(X_train_fold, label=y_train_fold)

# Same configuration as the tuned model, with the float values rounded.
best_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 88,
    'learning_rate': 0.068,
    'feature_fraction': 0.60,
    'bagging_fraction': 0.96,
    'bagging_freq': 1,
    'min_child_samples': 48,
    'max_depth': 12,
    'scale_pos_weight': (y == 0).sum() / (y == 1).sum(),
    'random_state': 42,
    'verbose': -1
}

# Fixed 800 boosting rounds: no validation set and no early stopping here.
model = lgb.train(best_params, train_data, num_boost_round=800)

# Predictions for the held-out fold; later cells read `y_pred` and `y_val`.
y_pred = model.predict(X_val)

# Text report on how well the predicted probabilities match observed rates on
# the validation fold.
print("="*70)
print("AN√ÅLISE DE CALIBRA√á√ÉO DO MODELO")
print("="*70)

# 1. Observed positive rate vs mean predicted probability.
print(f"\nüìä COMPARA√á√ÉO DE TAXAS:")
print(f"   Taxa real (y_val):        {y_val.mean():.4f} ({y_val.mean()*100:.2f}%)")
print(f"   Predi√ß√£o m√©dia (modelo):  {y_pred.mean():.4f} ({y_pred.mean()*100:.2f}%)")
print(f"   Diferen√ßa absoluta:       {abs(y_val.mean() - y_pred.mean()):.4f}")
print(f"   Diferen√ßa relativa:       {((y_pred.mean() / y_val.mean()) - 1)*100:.1f}%")

# 2. Observed rate and mean prediction inside each 0.1-wide probability bin.
print(f"\nüìà AN√ÅLISE POR BINS DE PROBABILIDADE:")
print("-" * 70)
print(f"{'Bin de Predi√ß√£o':<20} {'Taxa Real':<15} {'Pred M√©dia':<15} {'Quantidade':<15}")
print("-" * 70)

# `pd.cut` defaults to right-closed intervals without the lowest edge, so a
# prediction of exactly 0.0 would fall outside every bin.
bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
y_val_binned = pd.cut(y_pred, bins=bins)

# Iterate over the bin intervals via `.categories`; empty bins are skipped.
for bin_range in y_val_binned.categories:
    mask = y_val_binned == bin_range
    count = mask.sum()
    if count > 0:
        real_rate = y_val[mask].mean()
        pred_mean = y_pred[mask].mean()
        print(f"{str(bin_range):<20} {real_rate:<15.3f} {pred_mean:<15.3f} {count:<15,}")

# 3. Brier score (mean squared error of the predicted probabilities).
brier = brier_score_loss(y_val, y_pred)
print(f"\nüìâ BRIER SCORE: {brier:.5f}")
print("   (Quanto menor, melhor. 0 = perfeito, 0.25 = aleat√≥rio)")

# Reference value p * (1 - p) with p the validation positive rate. This is the
# Brier score of always predicting p; the printed label calls it a random model.
brier_random = y_val.mean() * (1 - y_val.mean())
print(f"   Brier Score (modelo aleat√≥rio): {brier_random:.5f}")
print(f"   Melhoria sobre aleat√≥rio: {((brier_random - brier) / brier_random * 100):.1f}%")

# 4. AUC-ROC on the validation fold.
auc = roc_auc_score(y_val, y_pred)
print(f"\nüìà AUC-ROC: {auc:.5f}")

# 5. For several probability thresholds: how many validation rows are predicted
#    strictly above the threshold, and the observed positive rate among them.
print(f"\nüéØ PREDI√á√ïES COM ALTA CONFIAN√áA:")

for threshold in [0.3, 0.5, 0.7, 0.9]:
    # Build the mask once and reuse it for both the count and the rate.
    above_threshold = y_pred > threshold
    high_prob = above_threshold.sum()
    high_prob_pct = high_prob / len(y_pred) * 100
    if high_prob > 0:
        real_rate = y_val[above_threshold].mean()
        # The label uses ">" to match the strict comparison above (it used to
        # print ">=" while filtering with ">").
        print(f"   >{threshold:.1f}: {high_prob:,} casos ({high_prob_pct:.2f}%) | Taxa real: {real_rate:.3f}")

# 6. Calibration curve data: 10 equal-width probability bins.
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_val, y_pred, n_bins=10, strategy='uniform'
)

# 2x2 figure with four views of the validation predictions.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: calibration curve against the perfect-calibration diagonal.
axes[0, 0].plot([0, 1], [0, 1], 'k--', label='Perfeitamente calibrado', linewidth=2)
axes[0, 0].plot(mean_predicted_value, fraction_of_positives, 's-', 
                label='LightGBM Tuned', linewidth=2, markersize=8, color='red')
axes[0, 0].set_xlabel('Probabilidade Predita M√©dia', fontweight='bold')
axes[0, 0].set_ylabel('Fra√ß√£o de Positivos Real', fontweight='bold')
axes[0, 0].set_title('Curva de Calibra√ß√£o', fontweight='bold', fontsize=14)
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)
axes[0, 0].set_xlim([0, 1])
axes[0, 0].set_ylim([0, 1])

# Plot 2: predicted-probability densities per true class, with vertical lines
# at the observed positive rate and at the mean prediction.
axes[0, 1].hist(y_pred[y_val == 0], bins=50, alpha=0.5, label='y=0 (n√£o aderiu)', 
                density=True, color='red')
axes[0, 1].hist(y_pred[y_val == 1], bins=50, alpha=0.5, label='y=1 (aderiu)', 
                density=True, color='green')
axes[0, 1].axvline(y_val.mean(), color='blue', linestyle='--', linewidth=2,
                   label=f'Taxa real: {y_val.mean():.3f}')
axes[0, 1].axvline(y_pred.mean(), color='orange', linestyle='--', linewidth=2,
                   label=f'Pred m√©dia: {y_pred.mean():.3f}')
axes[0, 1].set_xlabel('Probabilidade', fontweight='bold')
axes[0, 1].set_ylabel('Densidade', fontweight='bold')
axes[0, 1].set_title('Distribui√ß√£o das Predi√ß√µes', fontweight='bold', fontsize=14)
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

# Plot 3: per-bin calibration error (observed fraction minus mean prediction).
axes[1, 0].bar(mean_predicted_value, fraction_of_positives - mean_predicted_value,
               width=0.08, alpha=0.7, color='coral')
axes[1, 0].axhline(0, color='black', linestyle='-', linewidth=1)
axes[1, 0].set_xlabel('Probabilidade Predita M√©dia', fontweight='bold')
axes[1, 0].set_ylabel('Diferen√ßa (Real - Predita)', fontweight='bold')
axes[1, 0].set_title('Reliability Diagram (Erro de Calibra√ß√£o)', fontweight='bold', fontsize=14)
axes[1, 0].grid(axis='y', alpha=0.3)

# Plot 4: empirical cumulative distribution of the predictions.
sorted_pred = np.sort(y_pred)
cumsum = np.arange(1, len(sorted_pred) + 1) / len(sorted_pred)
axes[1, 1].plot(sorted_pred, cumsum, linewidth=2, color='purple')
axes[1, 1].axhline(0.5, color='red', linestyle='--', label='Mediana')
axes[1, 1].axvline(y_val.mean(), color='blue', linestyle='--', label=f'Taxa real: {y_val.mean():.3f}')
axes[1, 1].set_xlabel('Probabilidade Predita', fontweight='bold')
axes[1, 1].set_ylabel('Propor√ß√£o Acumulada', fontweight='bold')
axes[1, 1].set_title('Distribui√ß√£o Acumulada das Predi√ß√µes', fontweight='bold', fontsize=14)
axes[1, 1].legend()
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()

## Se a Calibração Estiver Ruim

In [None]:
# ==============================================================================
# CALIBRA√á√ÉO DE PROBABILIDADES (SE NECESS√ÅRIO)
# ==============================================================================

from sklearn.calibration import CalibratedClassifierCV
from sklearn.isotonic import IsotonicRegression

# Fit isotonic and Platt calibrators on the validation-fold predictions and
# report their effect on the mean prediction, Brier score and AUC.
print("\n" + "="*70)
print("APLICANDO CALIBRA√á√ÉO NAS PROBABILIDADES")
print("="*70 + "\n")

# Split the validation fold into a calibration half (used to fit the
# calibrators) and an evaluation half (used for every metric printed below).
# Scoring a calibrator on the rows it was fitted on is optimistically biased;
# isotonic regression in particular adapts closely to its training points.
y_val_arr = np.asarray(y_val)
shuffled_rows = np.random.RandomState(42).permutation(len(y_pred))
half = len(shuffled_rows) // 2
cal_rows, eval_rows = shuffled_rows[:half], shuffled_rows[half:]

y_eval = y_val_arr[eval_rows]
y_pred_eval = y_pred[eval_rows]

# Method 1: isotonic regression (more flexible); inputs outside the fitted
# range are clipped.
iso_reg = IsotonicRegression(out_of_bounds='clip')
iso_reg.fit(y_pred[cal_rows], y_val_arr[cal_rows])
# Calibrated values for every validation row, so the variable keeps its
# original length; the metrics below use the evaluation half only.
y_pred_calibrated_iso = iso_reg.predict(y_pred)
iso_eval = y_pred_calibrated_iso[eval_rows]

print(f"üìä RESULTADOS DA CALIBRA√á√ÉO (Isotonic):")
print(f"   Antes: {y_pred_eval.mean():.4f}")
print(f"   Depois: {iso_eval.mean():.4f}")
print(f"   Taxa real: {y_eval.mean():.4f}")
print(f"   Brier Score antes: {brier_score_loss(y_eval, y_pred_eval):.5f}")
print(f"   Brier Score depois: {brier_score_loss(y_eval, iso_eval):.5f}")

# Check the AUC impact on the evaluation half. Isotonic regression is monotone
# but can map distinct scores to the same value, so a small drop is possible.
from sklearn.metrics import roc_auc_score
auc_before = roc_auc_score(y_eval, y_pred_eval)
auc_after = roc_auc_score(y_eval, iso_eval)

print(f"\nüìà IMPACTO NO AUC:")
print(f"   AUC antes: {auc_before:.5f}")
print(f"   AUC depois: {auc_after:.5f}")
print(f"   Diferen√ßa: {auc_after - auc_before:.5f}")

# Method 2: Platt scaling (logistic regression on the predicted probability),
# fitted on the calibration half and scored on the evaluation half.
from sklearn.linear_model import LogisticRegression

platt = LogisticRegression()
platt.fit(y_pred[cal_rows].reshape(-1, 1), y_val_arr[cal_rows])
y_pred_calibrated_platt = platt.predict_proba(y_pred.reshape(-1, 1))[:, 1]
platt_eval = y_pred_calibrated_platt[eval_rows]

print(f"\nüìä RESULTADOS DA CALIBRA√á√ÉO (Platt Scaling):")
print(f"   Depois (Platt): {platt_eval.mean():.4f}")
print(f"   Brier Score (Platt): {brier_score_loss(y_eval, platt_eval):.5f}")
