# Início NB

## 0. Configurações iniciais

In [None]:
# ==============================================================================
# MARATONA DE DATA SCIENCE MACKENZIE 2025
# Análise Preditiva de Comportamento Bancário
# ==============================================================================
# 
# Autor: Eduardo M Sanchez
# Data: Dezembro 2025
# Objetivo: Prever probabilidade de adesão a depósito bancário
# Métrica: ROC AUC
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.isotonic import IsotonicRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# Constants shared by the later cells: the random seed and the number of
# cross-validation folds.
RANDOM_STATE = 42
N_FOLDS = 5

# Display and plotting defaults. The matplotlib style sheet is applied before
# the seaborn palette, in the same relative order as before, because both
# touch the colour cycle.
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Bibliotecas importadas")
# Optional library-version report, left disabled:
#print(f"   LightGBM: {lgb.__version__}")
#print(f"   XGBoost: {xgb.__version__}")

## 1. Carregando Dados

In [None]:
# Input CSV locations (paths are relative to the working directory).
TRAIN_DIR = "../data/raw/analise-preditiva-de-comportamento-bancario/train.csv"
TEST_DIR = "../data/raw/analise-preditiva-de-comportamento-bancario/test.csv"
SAMPLE_DIR = "../data/raw/analise-preditiva-de-comportamento-bancario/sample_submission.csv"

# Read the three files in the order train -> test -> sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DIR, TEST_DIR, SAMPLE_DIR)
)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print("\nANÁLISE INICIAL DO DATASET\n")

# Target distribution: boolean masks for each class of the 'y' column.
is_negative = train['y'] == 0
is_positive = train['y'] == 1
conversion_rate = train['y'].mean()

print(f"\nTaxa de conversão: {conversion_rate:.4f} ({conversion_rate*100:.2f}%)")
print(f"   Não aderiu (0): {is_negative.sum():,} ({is_negative.mean()*100:.1f}%)")
print(f"   Aderiu (1): {is_positive.sum():,} ({is_positive.mean()*100:.1f}%)")

# Negative-to-positive count ratio, passed later as `scale_pos_weight`
# to the three boosting libraries.
scale_pos_weight = is_negative.sum() / is_positive.sum()
print(f"\nScale pos weight: {scale_pos_weight:.2f}")

## 2. Modelando features

In [None]:
print("Iniciando Feature Engineering...\n")


def _smoothed_target_encoding(df, df_train, col, global_mean, smoothing=10):
    """Return the smoothed target mean of ``col`` for every row of ``df``.

    For each category seen in ``df_train`` the encoding is
    ``(mean * count + global_mean * smoothing) / (count + smoothing)``;
    categories absent from ``df_train`` (and missing values) get ``global_mean``.
    """
    grouped = df_train.groupby(col)['target']
    means = grouped.mean()
    counts = grouped.size()
    smoothed = (means * counts + global_mean * smoothing) / (counts + smoothing)
    return df[col].map(smoothed).fillna(global_mean)


def create_all_features(df, target=None, train_size=None):
    """
    Build the full engineered feature set from the raw columns.

    Args:
        df: DataFrame with the original columns. When target encoding is
            requested, the first ``train_size`` rows must be the training rows.
        target: Target values for the training rows (Series or array-like).
            They are matched to the first ``train_size`` rows BY POSITION,
            so the index of a Series is irrelevant.
        train_size: Number of leading rows of ``df`` that belong to train.

    Returns:
        A copy of ``df`` with the engineered columns appended and the original
        categorical columns (plus the helper ``poutcome_numeric``) removed.

    Raises:
        ValueError: if ``target`` does not have one value per training row.

    Notes:
        * When ``target`` or ``train_size`` is missing, ``job`` and
          ``education`` are label-encoded instead of target-encoded.
        * Statistics such as ``max``/``median``/``min`` and the frequency
          encodings are computed on whatever rows ``df`` contains.
        * NOTE(review): the target encoding uses the labels of ALL training
          rows, so a cross-validation run on the result sees encodings derived
          from its own validation labels; consider an out-of-fold encoding.
    """

    df = df.copy()

    print("Criando features binárias...")

    # ===== BINARY FEATURES =====

    # Previous-campaign history
    df['foi_contatado_antes'] = (df['pdays'] != -1).astype(int)
    df['historico_sucesso'] = (df['poutcome'] == 'success').astype(int)
    df['historico_failure'] = (df['poutcome'] == 'failure').astype(int)
    df['historico_unknown'] = (df['poutcome'] == 'unknown').astype(int)
    df['historico_other'] = (df['poutcome'] == 'other').astype(int)

    # Job profile
    high_conversion_jobs = ['student', 'retired', 'unemployed', 'management']
    low_conversion_jobs = ['blue-collar', 'services', 'housemaid']
    df['job_high_conversion'] = df['job'].isin(high_conversion_jobs).astype(int)
    df['job_low_conversion'] = df['job'].isin(low_conversion_jobs).astype(int)
    df['job_student'] = (df['job'] == 'student').astype(int)
    df['job_retired'] = (df['job'] == 'retired').astype(int)
    df['job_management'] = (df['job'] == 'management').astype(int)

    # Contact channel
    df['has_effective_contact'] = (df['contact'] != 'unknown').astype(int)
    df['contact_cellular'] = (df['contact'] == 'cellular').astype(int)
    df['contact_telephone'] = (df['contact'] == 'telephone').astype(int)
    df['contact_unknown'] = (df['contact'] == 'unknown').astype(int)

    # Financial situation
    df['has_default'] = (df['default'] == 'yes').astype(int)
    df['has_housing'] = (df['housing'] == 'yes').astype(int)
    df['has_loan'] = (df['loan'] == 'yes').astype(int)
    df['no_debt'] = ((df['default'] == 'no') & (df['loan'] == 'no')).astype(int)
    df['full_debt'] = ((df['has_housing'] == 1) & (df['has_loan'] == 1)).astype(int)

    # Marital status
    df['is_single'] = (df['marital'] == 'single').astype(int)
    df['is_married'] = (df['marital'] == 'married').astype(int)
    df['is_divorced'] = (df['marital'] == 'divorced').astype(int)

    # Education ('unknown' is deliberately grouped with 'tertiary' here)
    df['high_education'] = df['education'].isin(['tertiary', 'unknown']).astype(int)
    df['education_tertiary'] = (df['education'] == 'tertiary').astype(int)
    df['education_secondary'] = (df['education'] == 'secondary').astype(int)

    print("Criando features temporais...")

    # ===== TEMPORAL FEATURES =====

    # Seasonality
    high_season_months = ['mar', 'sep', 'oct', 'dec']
    low_season_months = ['may', 'jun', 'jul']
    df['is_high_season'] = df['month'].isin(high_season_months).astype(int)
    df['is_low_season'] = df['month'].isin(low_season_months).astype(int)
    df['is_may'] = (df['month'] == 'may').astype(int)
    df['is_mar'] = (df['month'] == 'mar').astype(int)
    df['is_dec'] = (df['month'] == 'dec').astype(int)

    # Quarter
    month_to_quarter = {
        'jan': 1, 'feb': 1, 'mar': 1,
        'apr': 2, 'may': 2, 'jun': 2,
        'jul': 3, 'aug': 3, 'sep': 3,
        'oct': 4, 'nov': 4, 'dec': 4
    }
    df['quarter'] = df['month'].map(month_to_quarter)
    df['quarter_1'] = (df['quarter'] == 1).astype(int)
    df['quarter_4'] = (df['quarter'] == 4).astype(int)

    # Period of the month
    df['day_period'] = pd.cut(df['day'], bins=[0, 10, 20, 31], labels=[0, 1, 2]).astype(int)
    df['inicio_mes'] = (df['day'] <= 5).astype(int)
    df['meio_mes'] = ((df['day'] > 10) & (df['day'] <= 20)).astype(int)
    df['fim_mes'] = (df['day'] >= 25).astype(int)
    df['day_squared'] = df['day'] ** 2

    print("Criando features de campanha...")

    # ===== CAMPAIGN FEATURES =====

    df['total_contacts'] = df['campaign'] + df['previous']
    df['contact_frequency'] = df['campaign'] / (df['campaign'].max() + 1)
    df['has_previous'] = (df['previous'] > 0).astype(int)
    df['multiple_previous'] = (df['previous'] > 1).astype(int)

    # Client profiles
    df['cliente_frio'] = ((df['campaign'] > 3) & (df['poutcome'] != 'success')).astype(int)
    df['cliente_quente'] = ((df['campaign'] <= 2) & (df['poutcome'] == 'success')).astype(int)
    df['cliente_novo'] = ((df['previous'] == 0) & (df['poutcome'] == 'unknown')).astype(int)
    df['cliente_insistente'] = (df['campaign'] > 5).astype(int)

    # pdays categories. The last bin is open-ended: with a finite upper edge a
    # larger value would fall outside every bin, become NaN and make
    # ``.astype(int)`` raise.
    df['pdays_category'] = pd.cut(df['pdays'],
                                  bins=[-2, -1, 30, 90, 180, np.inf],
                                  labels=[0, 1, 2, 3, 4]).astype(int)
    df['never_contacted'] = (df['pdays'] == -1).astype(int)
    df['contato_recente'] = ((df['pdays'] > 0) & (df['pdays'] <= 30)).astype(int)
    df['contato_medio'] = ((df['pdays'] > 30) & (df['pdays'] <= 180)).astype(int)
    df['contato_antigo'] = (df['pdays'] > 180).astype(int)

    # Intensity
    df['high_campaign_intensity'] = (df['campaign'] > df['campaign'].median()).astype(int)
    df['low_campaign_intensity'] = (df['campaign'] == 1).astype(int)

    # Campaign buckets (open-ended last bin, same reason as pdays_category)
    df['campaign_bucket'] = pd.cut(df['campaign'],
                                   bins=[0, 1, 2, 3, 5, np.inf],
                                   labels=[0, 1, 2, 3, 4]).astype(int)

    print("Criando features financeiras...")

    # ===== FINANCIAL FEATURES =====

    # Balance categories
    df['balance_category'] = pd.cut(df['balance'],
                                    bins=[-np.inf, 0, 500, 1000, 3000, 5000, np.inf],
                                    labels=[0, 1, 2, 3, 4, 5]).astype(int)

    df['balance_negative'] = (df['balance'] < 0).astype(int)
    df['balance_very_low'] = (df['balance'] < 500).astype(int)
    df['balance_low'] = ((df['balance'] >= 500) & (df['balance'] < 1000)).astype(int)
    df['balance_medium'] = ((df['balance'] >= 1000) & (df['balance'] < 5000)).astype(int)
    df['balance_high'] = (df['balance'] >= 5000).astype(int)
    df['balance_very_high'] = (df['balance'] > 10000).astype(int)
    df['balance_zero'] = (df['balance'] == 0).astype(int)

    # Transformations; the balance is shifted by |min| so the argument of the
    # log / square root is never negative.
    df['balance_per_age'] = df['balance'] / (df['age'] + 1)
    df['balance_log'] = np.log1p(df['balance'] + abs(df['balance'].min()) + 1)
    df['balance_sqrt'] = np.sqrt(df['balance'] + abs(df['balance'].min()))
    df['balance_abs'] = np.abs(df['balance'])

    # Debts
    df['total_loans'] = df['has_housing'] + df['has_loan'] + df['has_default']
    df['sem_dividas'] = (df['total_loans'] == 0).astype(int)
    df['uma_divida'] = (df['total_loans'] == 1).astype(int)
    df['multiplas_dividas'] = (df['total_loans'] >= 2).astype(int)

    # Profiles
    df['perfil_premium'] = ((df['balance'] > df['balance'].median()) &
                            (df['sem_dividas'] == 1)).astype(int)
    df['perfil_risco'] = ((df['balance'] < 0) | (df['has_default'] == 1)).astype(int)

    print("Criando features demográficas...")

    # ===== DEMOGRAPHIC FEATURES =====

    # Age bands (open-ended last bin, same reason as pdays_category)
    df['age_group'] = pd.cut(df['age'],
                             bins=[0, 25, 35, 45, 55, 65, np.inf],
                             labels=[0, 1, 2, 3, 4, 5]).astype(int)

    df['is_very_young'] = (df['age'] < 25).astype(int)
    df['is_young'] = ((df['age'] >= 25) & (df['age'] < 35)).astype(int)
    df['is_middle_age'] = ((df['age'] >= 35) & (df['age'] < 55)).astype(int)
    df['is_senior'] = (df['age'] >= 60).astype(int)
    df['is_elderly'] = (df['age'] >= 70).astype(int)
    df['working_age'] = ((df['age'] >= 25) & (df['age'] < 65)).astype(int)

    # Age transformations
    df['age_squared'] = df['age'] ** 2
    df['age_log'] = np.log1p(df['age'])

    # Combined profiles
    df['estudante_jovem'] = ((df['job'] == 'student') & (df['age'] < 30)).astype(int)
    df['aposentado_senior'] = ((df['job'] == 'retired') & (df['age'] >= 60)).astype(int)
    df['jovem_solteiro'] = ((df['age'] < 35) & (df['marital'] == 'single')).astype(int)
    df['adulto_casado'] = ((df['age'] >= 35) & (df['marital'] == 'married')).astype(int)

    # Socio-economic status
    high_status_jobs = ['management', 'self-employed', 'entrepreneur']
    df['high_status'] = ((df['job'].isin(high_status_jobs)) &
                         (df['education'] == 'tertiary')).astype(int)
    df['poupador_potencial'] = ((df['balance'] > df['balance'].median()) &
                                (df['working_age'] == 1) &
                                (df['sem_dividas'] == 1)).astype(int)

    print("Criando features de interação...")

    # ===== INTERACTION FEATURES =====

    # Numeric interactions. NOTE: despite its name, 'age_balance_ratio' is a
    # scaled product (age * balance / 1000), i.e. 'age_balance_product' x 10.
    # Both columns are kept so the feature set is unchanged.
    df['age_balance_ratio'] = (df['age'] * df['balance']) / 1000
    df['age_balance_product'] = df['age'] * df['balance'] / 10000
    df['age_campaign'] = df['age'] * df['campaign']
    df['age_duration'] = df['age'] * df['duration'] / 100
    df['balance_duration'] = df['balance'] * df['duration'] / 1000
    df['balance_campaign'] = df['balance'] * df['campaign']

    # Interactions with campaign history
    df['campaign_success_interaction'] = df['campaign'] * df['historico_sucesso']
    df['previous_success_interaction'] = df['previous'] * df['historico_sucesso']
    df['duration_success_interaction'] = df['duration'] * df['historico_sucesso']

    # Categorical interactions
    df['education_job_match'] = ((df['education'] == 'tertiary') &
                                 (df['job'].isin(['management', 'technician', 'services']))).astype(int)
    df['contact_timing'] = df['contact_cellular'] * df['is_high_season']
    df['contact_profile'] = df['contact_cellular'] * df['job_high_conversion']

    # Ratios (+1 in the denominator avoids division by zero)
    df['balance_loan_ratio'] = df['balance'] / (df['total_loans'] + 1)
    df['previous_campaign_ratio'] = df['previous'] / (df['campaign'] + 1)
    df['duration_campaign_ratio'] = df['duration'] / (df['campaign'] + 1)

    # History weight
    poutcome_weight = {'success': 3, 'failure': -1, 'other': 0, 'unknown': 0}
    df['poutcome_numeric'] = df['poutcome'].map(poutcome_weight)
    df['weighted_history'] = df['previous'] * df['poutcome_numeric']
    df['weighted_contacts'] = df['total_contacts'] * df['poutcome_numeric']

    # Composite score
    df['conversion_score'] = (
        df['historico_sucesso'] * 3 +
        df['job_high_conversion'] * 2 +
        df['is_high_season'] * 2 +
        df['contact_cellular'] * 1 +
        df['high_education'] * 1 -
        df['cliente_frio'] * 2
    )

    print("Aplicando encoding...")

    # ===== CATEGORICAL ENCODING =====

    # Month (ordinal + cyclical)
    month_order = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                   'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    df['month_encoded'] = df['month'].map(month_order)
    df['month_sin'] = np.sin(2 * np.pi * df['month_encoded'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month_encoded'] / 12)

    # Target encoding for job and education
    if target is not None and train_size is not None:
        df_train = df.iloc[:train_size].copy()

        # Attach the target by position. Assigning a Series directly aligns on
        # the index, which silently yields NaN when the indexes differ.
        target_values = np.asarray(target)
        if len(target_values) != len(df_train):
            raise ValueError(
                f"target has {len(target_values)} values but there are "
                f"{len(df_train)} training rows"
            )
        df_train['target'] = target_values
        global_mean = df_train['target'].mean()

        # Smoothing pulls small categories towards the global mean
        smoothing = 10
        df['job_encoded'] = _smoothed_target_encoding(
            df, df_train, 'job', global_mean, smoothing)
        df['education_encoded'] = _smoothed_target_encoding(
            df, df_train, 'education', global_mean, smoothing)
    else:
        le_job = LabelEncoder()
        le_education = LabelEncoder()
        df['job_encoded'] = le_job.fit_transform(df['job'])
        df['education_encoded'] = le_education.fit_transform(df['education'])

    # Frequency encoding (relative frequency within ``df``)
    for col in ['job', 'education', 'marital']:
        freq = df[col].value_counts(normalize=True)
        df[f'{col}_freq'] = df[col].map(freq)

    print("Finalizando...")

    # ===== DROP ORIGINAL CATEGORICAL COLUMNS =====

    features_to_drop = ['job', 'marital', 'education', 'default', 'housing',
                        'loan', 'contact', 'month', 'poutcome', 'poutcome_numeric']
    df = df.drop(columns=features_to_drop, errors='ignore')

    return df

# Run the feature engineering once over train + test stacked together, so
# both parts get the same columns; train rows come first.
print("Processando train e test...")

n_train = len(train)
df_combined = pd.concat([train.drop(columns='y'), test], axis=0, ignore_index=True)
df_processed = create_all_features(df_combined, target=train['y'], train_size=n_train)

# Split back by position and drop the identifier column if it is present.
X_train = df_processed.iloc[:n_train].copy().drop(columns='id', errors='ignore')
X_test = df_processed.iloc[n_train:].copy().drop(columns='id', errors='ignore')
y_train = train['y']

# The summary prints below are disabled by being wrapped in a bare string
# literal; the literal is kept unchanged.
'''
print(f"\nFeature Engineering concluído")
print(f"   Train shape: {X_train.shape}")
print(f"   Test shape: {X_test.shape}")
print(f"   Total de features criadas: {X_train.shape[1]}")
'''

## 3. ENSEMBLE DE MODELOS - LIGHTGBM + XGBOOST + CATBOOST

In [None]:
print("TREINAMENTO DO MODELO - LIGHTGBM OTIMIZADO")

# Stratified K-fold cross-validation shared by the three models.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# ===== HYPERPARAMETERS =====

# LightGBM
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 88,
    'learning_rate': 0.07161197523282926,
    'feature_fraction': 0.8900436534571514,
    'bagging_fraction': 0.7870692246364229,
    'bagging_freq': 1,
    'min_child_samples': 43,
    'max_depth': 12,
    'scale_pos_weight': scale_pos_weight,
    'random_state': RANDOM_STATE,
    'verbose': -1
}

# XGBoost
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 8,
    'learning_rate': 0.05,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'min_child_weight': 10,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'scale_pos_weight': scale_pos_weight,
    'random_state': RANDOM_STATE,
    'tree_method': 'hist'
}

# CatBoost
cat_params = {
    'iterations': 1500,
    'learning_rate': 0.05,
    'depth': 8,
    'l2_leaf_reg': 3,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': RANDOM_STATE,
    'verbose': 0,
    'early_stopping_rounds': 100,
    'scale_pos_weight': scale_pos_weight
}

# Per-fold storage: fitted models, isotonic calibrators, test predictions
# and validation AUCs for each library.
lgb_models = []
xgb_models = []
cat_models = []
calibrators_lgb = []
calibrators_xgb = []
calibrators_cat = []

lgb_test_preds = []
xgb_test_preds = []
cat_test_preds = []

lgb_cv_scores = []
xgb_cv_scores = []
cat_cv_scores = []

# Out-of-fold predictions: every training row is predicted exactly once, by
# the model of the fold in which it was held out.
oof_lgb = np.zeros(len(X_train))
oof_xgb = np.zeros(len(X_train))
oof_cat = np.zeros(len(X_train))

# The test matrix does not change between folds, so build it once.
dtest = xgb.DMatrix(X_test)

print("Iniciando treinamento do ensemble...\n")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"\n{'='*70}")
    print(f"FOLD {fold}/{N_FOLDS}")
    print(f"{'='*70}\n")

    # Split
    X_train_fold = X_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_train_fold = y_train.iloc[train_idx]
    y_val_fold = y_train.iloc[val_idx]

    # ===== LIGHTGBM =====
    print("Treinando LightGBM...")
    train_data_lgb = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data_lgb = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data_lgb)

    model_lgb = lgb.train(
        lgb_params,
        train_data_lgb,
        num_boost_round=2000,
        valid_sets=[val_data_lgb],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(100, verbose=False),
            lgb.log_evaluation(200)
        ]
    )

    y_val_pred_lgb = model_lgb.predict(X_val_fold, num_iteration=model_lgb.best_iteration)
    oof_lgb[val_idx] = y_val_pred_lgb
    auc_lgb = roc_auc_score(y_val_fold, y_val_pred_lgb)
    lgb_cv_scores.append(auc_lgb)

    # Fit an isotonic calibrator on this fold's validation predictions
    cal_lgb = IsotonicRegression(out_of_bounds='clip')
    cal_lgb.fit(y_val_pred_lgb, y_val_fold)
    calibrators_lgb.append(cal_lgb)

    # Test prediction
    y_test_pred_lgb = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)
    lgb_test_preds.append(y_test_pred_lgb)
    lgb_models.append(model_lgb)

    print(f"   LightGBM AUC: {auc_lgb:.5f} | Best iteration: {model_lgb.best_iteration}")

    # ===== XGBOOST =====
    print("\nTreinando XGBoost...")
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)

    model_xgb = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=2000,
        evals=[(dval, 'valid')],
        early_stopping_rounds=100,
        verbose_eval=200
    )

    # `best_iteration` is a 0-based index and `iteration_range` is half-open
    # [begin, end), so the end must be best_iteration + 1 to include the best
    # tree. With (0, best_iteration) the best tree is dropped, and (0, 0)
    # would mean "use every tree".
    xgb_rounds = (0, model_xgb.best_iteration + 1)

    y_val_pred_xgb = model_xgb.predict(dval, iteration_range=xgb_rounds)
    oof_xgb[val_idx] = y_val_pred_xgb
    auc_xgb = roc_auc_score(y_val_fold, y_val_pred_xgb)
    xgb_cv_scores.append(auc_xgb)

    # Fit an isotonic calibrator on this fold's validation predictions
    cal_xgb = IsotonicRegression(out_of_bounds='clip')
    cal_xgb.fit(y_val_pred_xgb, y_val_fold)
    calibrators_xgb.append(cal_xgb)

    # Test prediction
    y_test_pred_xgb = model_xgb.predict(dtest, iteration_range=xgb_rounds)
    xgb_test_preds.append(y_test_pred_xgb)
    xgb_models.append(model_xgb)

    print(f"   XGBoost AUC: {auc_xgb:.5f} | Best iteration: {model_xgb.best_iteration}")

    # ===== CATBOOST =====
    print("\nTreinando CatBoost...")
    model_cat = CatBoostClassifier(**cat_params)

    model_cat.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        verbose=False
    )

    y_val_pred_cat = model_cat.predict_proba(X_val_fold)[:, 1]
    oof_cat[val_idx] = y_val_pred_cat
    auc_cat = roc_auc_score(y_val_fold, y_val_pred_cat)
    cat_cv_scores.append(auc_cat)

    # Fit an isotonic calibrator on this fold's validation predictions
    cal_cat = IsotonicRegression(out_of_bounds='clip')
    cal_cat.fit(y_val_pred_cat, y_val_fold)
    calibrators_cat.append(cal_cat)

    # Test prediction
    y_test_pred_cat = model_cat.predict_proba(X_test)[:, 1]
    cat_test_preds.append(y_test_pred_cat)
    cat_models.append(model_cat)

    print(f"   CatBoost AUC: {auc_cat:.5f}")

    # Fold summary
    print(f"\nResumo do Fold {fold}:")
    print(f"   LightGBM: {auc_lgb:.5f}")
    print(f"   XGBoost:  {auc_xgb:.5f}")
    print(f"   CatBoost: {auc_cat:.5f}")

# ===== FINAL CV RESULTS =====

print(f"\n{'='*70}")
print("RESULTADOS FINAIS DO ENSEMBLE")
print(f"{'='*70}\n")

# Out-of-fold AUCs (computed on the raw, uncalibrated predictions)
oof_auc_lgb = roc_auc_score(y_train, oof_lgb)
oof_auc_xgb = roc_auc_score(y_train, oof_xgb)
oof_auc_cat = roc_auc_score(y_train, oof_cat)

print("AUC-ROC por Modelo:\n")
print(f"{'Modelo':<15} {'Mean AUC':<12} {'Std AUC':<12} {'OOF AUC':<12}")
print("-" * 51)
print(f"{'LightGBM':<15} {np.mean(lgb_cv_scores):<12.5f} {np.std(lgb_cv_scores):<12.5f} {oof_auc_lgb:<12.5f}")
print(f"{'XGBoost':<15} {np.mean(xgb_cv_scores):<12.5f} {np.std(xgb_cv_scores):<12.5f} {oof_auc_xgb:<12.5f}")
print(f"{'CatBoost':<15} {np.mean(cat_cv_scores):<12.5f} {np.std(cat_cv_scores):<12.5f} {oof_auc_cat:<12.5f}")

# Simple average of the three out-of-fold prediction vectors
oof_ensemble = (oof_lgb + oof_xgb + oof_cat) / 3
oof_auc_ensemble = roc_auc_score(y_train, oof_ensemble)
print(f"\nENSEMBLE OOF AUC: {oof_auc_ensemble:.5f}")


## 4. Calibrando e gerando submissões

In [None]:
print("CALIBRAÇÃO E PREDIÇÕES FINAIS")

# Fold-averaged raw (uncalibrated) test predictions, one vector per model.
lgb_test_mean = np.mean(lgb_test_preds, axis=0)
xgb_test_mean = np.mean(xgb_test_preds, axis=0)
cat_test_mean = np.mean(cat_test_preds, axis=0)

# Pass each fold's test predictions through the calibrator fitted in that fold.
lgb_test_calibrated = [
    calibrators_lgb[k].predict(lgb_test_preds[k]) for k in range(N_FOLDS)
]
xgb_test_calibrated = [
    calibrators_xgb[k].predict(xgb_test_preds[k]) for k in range(N_FOLDS)
]
cat_test_calibrated = [
    calibrators_cat[k].predict(cat_test_preds[k]) for k in range(N_FOLDS)
]

# Average the calibrated predictions across folds.
lgb_test_cal_mean = np.mean(lgb_test_calibrated, axis=0)
xgb_test_cal_mean = np.mean(xgb_test_calibrated, axis=0)
cat_test_cal_mean = np.mean(cat_test_calibrated, axis=0)

# ===== ENSEMBLE STRATEGIES =====

print("Criando diferentes estratégias de ensemble...\n")

# 1. Plain mean of the calibrated predictions
ensemble_mean = (lgb_test_cal_mean + xgb_test_cal_mean + cat_test_cal_mean) / 3

# 2. Mean weighted by each model's share of the summed OOF AUCs
total_score = oof_auc_lgb + oof_auc_xgb + oof_auc_cat
w_lgb, w_xgb, w_cat = (
    auc / total_score for auc in (oof_auc_lgb, oof_auc_xgb, oof_auc_cat)
)

ensemble_weighted = (w_lgb * lgb_test_cal_mean +
                     w_xgb * xgb_test_cal_mean +
                     w_cat * cat_test_cal_mean)

# 3. Rank averaging: each model's predictions are replaced by their rank
#    divided by the number of predictions, then averaged.
from scipy.stats import rankdata
lgb_ranks = rankdata(lgb_test_cal_mean) / len(lgb_test_cal_mean)
xgb_ranks = rankdata(xgb_test_cal_mean) / len(xgb_test_cal_mean)
cat_ranks = rankdata(cat_test_cal_mean) / len(cat_test_cal_mean)
ensemble_rank = (lgb_ranks + xgb_ranks + cat_ranks) / 3

# 4. Best single model by OOF AUC (ties resolved LightGBM > XGBoost > CatBoost)
if oof_auc_lgb >= max(oof_auc_xgb, oof_auc_cat):
    best_model_pred = lgb_test_cal_mean
elif oof_auc_xgb >= oof_auc_cat:
    best_model_pred = xgb_test_cal_mean
else:
    best_model_pred = cat_test_cal_mean

# Summary statistics of every prediction vector
print("ESTATÍSTICAS DAS PREDIÇÕES:\n")
print(f"{'Estratégia':<30} {'Média':<12} {'Mediana':<12} {'Min':<12} {'Max':<12}")
print("-" * 78)

strategies = {
    'LightGBM (calibrado)': lgb_test_cal_mean,
    'XGBoost (calibrado)': xgb_test_cal_mean,
    'CatBoost (calibrado)': cat_test_cal_mean,
    'Ensemble Média Simples': ensemble_mean,
    'Ensemble Média Ponderada': ensemble_weighted,
    'Ensemble Rank Averaging': ensemble_rank
}

for label, values in strategies.items():
    print(f"{label:<30} {values.mean():<12.5f} {np.median(values):<12.5f} {values.min():<12.5f} {values.max():<12.5f}")

train_rate = y_train.mean()
print(f"\nTaxa de conversão esperada no train: {train_rate:.5f} ({train_rate*100:.2f}%)")
print("\nPesos do ensemble ponderado:")
print(f"   LightGBM: {w_lgb:.4f} ({w_lgb*100:.2f}%)")
print(f"   XGBoost:  {w_xgb:.4f} ({w_xgb*100:.2f}%)")
print(f"   CatBoost: {w_cat:.4f} ({w_cat*100:.2f}%)")


## 5. Feature importance

In [None]:
print("ANÁLISE DE FEATURE IMPORTANCE")

# LightGBM: gain importance, averaged over the fold models
lgb_importances = []
for model in lgb_models:
    lgb_importances.append(model.feature_importance(importance_type='gain'))

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance_lgb': np.mean(lgb_importances, axis=0)
})

# XGBoost: gain importance, averaged over the fold models.
# The boosters were trained on DMatrix objects built from DataFrames, so
# `get_score()` is keyed by the real column names; features never used in a
# split are absent from the dict. Looking the scores up as 'f0', 'f1', ...
# returned 0 for every feature. The 'f{i}' key is kept only as a fallback for
# boosters trained without feature names.
xgb_importances = []
for model in xgb_models:
    importance_dict = model.get_score(importance_type='gain')
    importance_array = np.zeros(len(X_train.columns))
    for i, col in enumerate(X_train.columns):
        importance_array[i] = importance_dict.get(
            col, importance_dict.get(f'f{i}', 0)
        )
    xgb_importances.append(importance_array)

feature_importance['importance_xgb'] = np.mean(xgb_importances, axis=0)

# CatBoost: default feature importance, averaged over the fold models
cat_importances = []
for model in cat_models:
    cat_importances.append(model.get_feature_importance())

feature_importance['importance_cat'] = np.mean(cat_importances, axis=0)

# Each model's importances are rescaled to sum to 1 before averaging, so the
# three different scales contribute equally to the mean.
feature_importance['importance_mean'] = (
    feature_importance[['importance_lgb', 'importance_xgb', 'importance_cat']]
    .apply(lambda x: x / x.sum(), axis=0)
    .mean(axis=1)
)

feature_importance = feature_importance.sort_values('importance_mean', ascending=False)

# Top 20
print("Top 20 Features Mais Importantes (média dos 3 modelos):\n")
for idx, row in feature_importance.head(20).iterrows():
    print(f"   {row['feature']:<35} {row['importance_mean']:>8.5f}")

# One horizontal bar chart per model with its own top-20 features
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for idx, (col, title) in enumerate([
    ('importance_lgb', 'LightGBM'),
    ('importance_xgb', 'XGBoost'),
    ('importance_cat', 'CatBoost')
]):
    top20 = feature_importance.nlargest(20, col)
    # Reversed so the most important feature is drawn at the top
    axes[idx].barh(top20['feature'][::-1], top20[col][::-1], color='steelblue')
    axes[idx].set_xlabel('Importance', fontweight='bold')
    axes[idx].set_title(f'Top 20 Features - {title}', fontweight='bold')
    axes[idx].grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()


## 6. Criando Submissão final

In [None]:
print("GERANDO ARQUIVOS DE SUBMISSÃO")

# Output file name -> prediction vector. 'submission.csv' is the main file and
# carries the same predictions as the weighted ensemble.
submissions = {
    'submission_lgb_calibrated.csv': lgb_test_cal_mean,
    'submission_xgb_calibrated.csv': xgb_test_cal_mean,
    'submission_cat_calibrated.csv': cat_test_cal_mean,
    'submission_ensemble_mean.csv': ensemble_mean,
    'submission_ensemble_weighted.csv': ensemble_weighted,
    'submission_ensemble_rank.csv': ensemble_rank,
    'submission.csv': ensemble_weighted  # main submission
}

# Ids are taken from the sample submission file.
submission_ids = sample_submission['id']

for filename, predictions in submissions.items():
    submission = pd.DataFrame({'id': submission_ids, 'y': predictions})
    submission.to_csv(f"../submissions/{filename}", index=False)
    print(f"OK {filename}")

main_mean = ensemble_weighted.mean()
print("\nSUBMISSÃO PRINCIPAL: submission.csv (Ensemble Ponderado)")
print(f"   Média: {main_mean:.5f} ({main_mean*100:.2f}%)")

# Preview of the main submission
print("\nPrimeiras linhas da submissão principal:")
main_preview = pd.DataFrame({'id': submission_ids, 'y': ensemble_weighted})
display(main_preview.head(10))
