## Setup

In [None]:
import os
from dotenv import load_dotenv
from src.py_src import util

import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, recall_score
import optuna
import joblib

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail early with an explicit message: os.path.join(None, ...) would otherwise
# raise an opaque TypeError when the variable is not defined.
slided_dfs_dir = os.getenv("SLIDED_DFS_CSV_PATH")
if slided_dfs_dir is None:
    raise RuntimeError(
        "Environment variable SLIDED_DFS_CSV_PATH is not set; "
        "define it in the environment or in the .env file."
    )

slided_df_path = os.path.join(slided_dfs_dir, "data_slided_V2.parquet")
target_column = "target_class_in_24h"

# NOTE(review): the meaning of the "xl_", '10min' and 'last' arguments is defined by
# util.create_df_model_input_opt, which is not visible here — confirm against that helper.
df_model_input = util.create_df_model_input_opt(slided_df_path, target_column, "xl_", '10min', 'last')

In [None]:
# Display the assembled model-input frame for a quick visual check.
df_model_input

In [None]:
# Keep only rows whose target class is strictly positive; .copy() detaches the
# pool from df_model_input so later edits cannot write back into it.
great_filter_training_pool = df_model_input.loc[df_model_input[target_column].gt(0)].copy()

In [None]:
# 70% of the rows go to training; the remaining 30% is halved into two equal shares.
train_pct = 0.7
val_pct = test_pct = (1 - train_pct) / 2

# Binary relabelling: target classes >= 3 become 1, everything else becomes 0.
# NOTE(review): only train_pct and val_pct are passed on; presumably util.prepare_data
# uses the remainder as the test split — confirm against the helper.
great_filter_data = util.prepare_data(
    great_filter_training_pool,
    target_column,
    lambda lb: 1 if lb >= 3 else 0,
    train_pct,
    val_pct,
)

## Feature Importance

In [None]:
# Load the previously saved v7 model; its feature importances are read in the next cells.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
great_filter_initial_model = joblib.load("../../models/great_filter/great_filter_model_v7.joblib")

In [None]:
feature_names = great_filter_data['x']['train'].columns

# Rank features from most to least important and attach the running total of importance.
# NOTE(review): assumes the loaded model was fitted on exactly these columns, in this
# order, so that feature_importances_ lines up with feature_names — confirm.
importance_df = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': great_filter_initial_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .assign(cumulative_importance=lambda ranked: ranked['importance'].cumsum())
)

In [None]:
# Show the 25 most important features together with their cumulative importance.
print("--- Tabela de Importância Acumulada ---")
print(importance_df.head(25))

In [None]:
# Share of cumulative importance used as the cut-off.
THRESHOLD = 0.9

# importance_df is sorted by descending importance, so this keeps the leading features
# whose running total does not exceed THRESHOLD. The feature that crosses the
# threshold is itself excluded by the <= comparison.
within_threshold = importance_df['cumulative_importance'].le(THRESHOLD)
features_to_keep = importance_df.loc[within_threshold, 'feature'].tolist()

# Safety net: if the cut leaves fewer than 5 features, fall back to the top 10.
if len(features_to_keep) < 5:
    features_to_keep = importance_df['feature'].head(10).tolist()

print(f"\nNúmero original de features: {len(feature_names)}")
print(f"Número de features após corte de {THRESHOLD*100}%: {len(features_to_keep)}")
print("\nFeatures selecionadas:")
print(features_to_keep)

## Training and Parameter Tuning

In [None]:
# Same splits as great_filter_data, restricted to the selected feature columns.
# The feature frames are copied; the label container under 'y' is shared, not copied.
great_filter_lean_data = {
    'x': {
        split_name: great_filter_data['x'][split_name][features_to_keep].copy()
        for split_name in ('train', 'val', 'test')
    },
    'y': great_filter_data['y'],
}

In [None]:
def objective(trial):
    """Optuna objective: fit an XGBoost classifier on the lean features and score it.

    The model is trained on the train split of ``great_filter_lean_data`` with early
    stopping monitored on the validation split, then scored on that same validation
    split.

    Args:
        trial: Optuna trial used to sample the tuned hyperparameters.

    Returns:
        float: ``6.0 * recall(class 0) + 1.0 * recall(class 1)`` on the validation
        split (class 0 is labelled "AB" and class 1 "CMX" in this notebook's reports).
        Higher is better; the maximum is 7.0.
    """
    x_train = great_filter_lean_data['x']['train']
    y_train = great_filter_lean_data['y']['train']
    x_val = great_filter_lean_data['x']['val']
    # Read the labels from the same dict used for fit/predict so features and
    # labels can never come from two different containers.
    y_val = great_filter_lean_data['y']['val']

    # Settings held constant across all trials.
    fixed_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': 1000,
        'random_state': 1502,
        'n_jobs': -1,
        'early_stopping_rounds': 50,
        'device': 'cuda',
    }

    # Hyperparameters explored by the study (suggest order kept stable on purpose).
    tuned_params = {
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", 0.3, 3.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model_ = xgb.XGBClassifier(**fixed_params, **tuned_params)
    model_.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

    y_pred_val = model_.predict(x_val)

    recall_cmx = recall_score(y_val, y_pred_val, pos_label=1)
    recall_ab = recall_score(y_val, y_pred_val, pos_label=0)

    # Recall on class 0 (AB) is weighted six times more than recall on class 1 (CMX).
    w_ab = 6.0
    w_cmx = 1.0
    return (w_ab * recall_ab) + (w_cmx * recall_cmx)

In [None]:
# Seed the sampler so the hyperparameter search is repeatable across
# Restart & Run All. TPESampler is Optuna's default sampler, so only the seed changes.
# NOTE(review): GPU training inside the objective may still introduce small
# run-to-run differences — confirm if bit-exact reproducibility is required.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1502),
)
print("\nIniciando o tuning para o 'Great Filter' (Modelo 2)...")
study.optimize(objective, n_trials=2000)

print("\nBusca concluída!")
print(f"Melhor valor: {study.best_value:.4f}")
print("Melhores parâmetros encontrados:")
print(study.best_params)

In [None]:
# Best hyperparameters found by the study, combined with the fixed training settings.
# NOTE(review): the tuning objective also sets 'device': 'cuda'; it is not set here,
# so the final model uses XGBoost's default device — confirm that this is intended.
great_filter_params = {
    **study.best_params,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'n_estimators': 1000,
    'early_stopping_rounds': 50,
    'random_state': 1502,
    'n_jobs': -1,
}

In [None]:
great_filter_model = xgb.XGBClassifier(**great_filter_params)

# The validation split is the evaluation set that early stopping monitors.
great_filter_eval_set = [
    (great_filter_lean_data['x']['val'], great_filter_lean_data['y']['val']),
]

print("\nIniciando o treinamento do 'Great Filter' (Modelo 2)...")
great_filter_model.fit(
    great_filter_lean_data['x']['train'],
    great_filter_lean_data['y']['train'],
    eval_set=great_filter_eval_set,
    verbose=100,  # log the evaluation metric every 100 boosting rounds
)
print("Treinamento concluído.")

## Results

In [None]:
# Evaluate the tuned model on the held-out test split.
great_filter_y_test = great_filter_lean_data['y']['test']
great_filter_class_labels = ['AB (0)', 'CMX (1)']

great_filter_y_pred = great_filter_model.predict(great_filter_lean_data['x']['test'])

print("--- Relatório de Classificação (Conjunto de Teste) ---\n")
print(
    classification_report(
        great_filter_y_test,
        great_filter_y_pred,
        target_names=great_filter_class_labels,
    )
)

print("\n--- Matriz de Confusão ---")
cm = confusion_matrix(great_filter_y_test, great_filter_y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=great_filter_class_labels)
disp.plot(cmap='Blues')
plt.show()

## Evaluating the Trade-off Cost

In [None]:
# List the split names available in the lean feature dict.
great_filter_lean_data['x'].keys()

In [None]:
# Original multiclass labels for the rows of the binary test split.
# The splits were produced from great_filter_training_pool (rows with target > 0), so a
# positional slice of the unfiltered df_model_input does not line up with the test
# split and can leave test rows without a label. Look the labels up by index instead.
# NOTE(review): assumes util.prepare_data keeps df_model_input's index labels on the
# splits it returns — confirm against the helper.
great_filter_y_multiclass = df_model_input[target_column]

great_filter_test_index = great_filter_lean_data['y']['test'].index
great_filter_y_multi_truth = great_filter_y_multiclass.reindex(great_filter_test_index)

# Every test row must resolve to a label; a NaN here means the indexes do not match.
assert great_filter_y_multi_truth.notna().all(), (
    "Some test rows have no multiclass label in df_model_input; "
    "the test split index is not aligned with df_model_input."
)

In [None]:
# One row per test sample: binary label, binary prediction and original multiclass label.
# The two Series are aligned on the test index; the prediction array is placed positionally.
df_analysis = pd.DataFrame(index=great_filter_lean_data['y']['test'].index).assign(
    binary_truth=great_filter_lean_data['y']['test'],
    binary_pred=great_filter_y_pred,
    multiclass_truth=great_filter_y_multi_truth,
)

# False negative: the true binary label is 1 but the model predicted 0.
is_false_negative = df_analysis['binary_truth'].eq(1) & df_analysis['binary_pred'].eq(0)
df_false_negatives = df_analysis.loc[is_false_negative]
print(f"Número total de Falsos Negativos encontrados: {len(df_false_negatives)}")

In [None]:
# Inspect the false negatives together with their original multiclass label.
df_false_negatives

## Exporting the Model

In [None]:
# Persist the tuned model as version 9 (the model loaded earlier for feature importance was v7).
model_save_path = r'../../models/great_filter/great_filter_model_v9.joblib'
joblib.dump(great_filter_model, model_save_path)
print(f"Modelo salvo em {model_save_path}")