## Setup

In [None]:
import os
from dotenv import load_dotenv
from src import util

import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score
import optuna
import joblib

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with an actionable message when the data directory is not configured;
# otherwise os.path.join(None, ...) raises an opaque TypeError.
slided_dfs_dir = os.getenv("SLIDED_DFS_CSV_PATH")
if slided_dfs_dir is None:
    raise RuntimeError(
        "Environment variable SLIDED_DFS_CSV_PATH is not set; "
        "define it in the environment or in the .env file."
    )

slided_df_path = os.path.join(slided_dfs_dir, "data_slided_V2.parquet")
target_column = "target_class_in_24h"

# NOTE(review): the meaning of the "xl_", '10min' and 'last' arguments is defined by
# util.create_df_model_input, which is not visible here — confirm against src/util.
df_model_input = util.create_df_model_input(slided_df_path, target_column, "xl_", '10min', 'last')

In [None]:
# Rich-display the model input frame as a quick sanity check of what was loaded.
df_model_input

## Model 1 - 'Gatekeeper'

##### O objetivo é separar 'No Flare' de 'Flare'.
Ou seja, o Gatekeeper filtra os 26% dos dados iniciais que são "calmaria total" e passa os outros 74% adiante.

### Preparing Data

In [None]:
# Split fractions: train takes 70%; validation and test share the remainder equally.
train_pct = 0.7
val_pct = test_pct = (1 - train_pct) / 2


def _to_binary_flare_label(lb):
    """Map a target value to the gatekeeper label: 1 when it is > 0, otherwise 0."""
    return 1 if lb > 0 else 0


# NOTE(review): the structure of the returned object is defined by util.prepare_data;
# the rest of this notebook reads it as gatekeeper_data['x'|'y'][split].
gatekeeper_data = util.prepare_data(
    df_model_input,
    target_column,
    _to_binary_flare_label,
    train_pct,
    val_pct,
)

### Training

In [None]:
def objective(trial):
    """Optuna objective: fit one gatekeeper candidate and score it on validation.

    Args:
        trial: Optuna trial used to sample the tunable XGBoost hyperparameters.

    Returns:
        ``2 * recall(Flare) + recall(No Flare)`` computed from the predictions
        on the validation split, so missing a flare costs twice as much as
        raising a false alarm.
    """
    # Settings that are identical for every trial.
    fixed_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': 1000,
        'random_state': 1502,
        'n_jobs': -1,
        'early_stopping_rounds': 50,
        'device': 'cuda',
    }

    # Search space; the sampling order is kept stable across trials.
    tuned_params = {
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 10.0, 400.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    candidate = xgb.XGBClassifier(**fixed_params, **tuned_params)

    x_splits = gatekeeper_data['x']
    y_splits = gatekeeper_data['y']

    # The validation split is passed as eval_set alongside 'early_stopping_rounds'.
    candidate.fit(
        x_splits['train'],
        y_splits['train'],
        eval_set=[(x_splits['val'], y_splits['val'])],
        verbose=False,
    )

    val_predictions = candidate.predict(x_splits['val'])

    # Per-class recall: label 1 is 'Flare', label 0 is 'No Flare'.
    recall_by_label = {
        label: recall_score(y_splits['val'], val_predictions, pos_label=label)
        for label in (1, 0)
    }

    flare_weight = 2
    return flare_weight * recall_by_label[1] + recall_by_label[0]

In [None]:
# Seed the sampler so the hyperparameter search is repeatable. TPESampler is Optuna's
# default algorithm, so only the source of randomness changes; the seed matches the
# 'random_state' used for the XGBoost models in this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1502),
)

print("Iniciando a busca por hiperparâmetros (Tuning)...")
study.optimize(objective, n_trials=500)

print("\nBusca concluída!")
# The objective returns 2 * recall(Flare) + recall(No Flare), not a macro F1-score,
# so the label of the reported value must say so.
print(f"Melhor valor (2*recall Flare + recall No Flare): {study.best_value:.4f}")
print("Melhores parâmetros encontrados:")
print(study.best_params)

In [None]:
# Pinned hyperparameters (presumably the result of an earlier run of the study), so the
# notebook does not have to repeat the search. To use a fresh search instead, replace
# this dict with `study.best_params`.
gatekeeper_tuned_params = {
    'scale_pos_weight': 22.56819655530684,
    'max_depth': 10,
    'learning_rate': 0.27008985053260204,
    'subsample': 0.9745902403647663,
    'colsample_bytree': 0.7646778022990397,
    'gamma': 1.0884507524773834,
    'min_child_weight': 7,
}

# Fixed training settings; they always take precedence over the tuned values.
gatekeeper_fixed_params = {
    'n_estimators': 1000,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 1502,
    'n_jobs': -1,
    'early_stopping_rounds': 50,
}

gatekeeper_params = {**gatekeeper_tuned_params, **gatekeeper_fixed_params}

In [None]:
# Show the final parameter set that is passed to the gatekeeper classifier.
print(gatekeeper_params)

In [None]:
# Final gatekeeper model, built from the selected parameter set.
gatekeeper = xgb.XGBClassifier(**gatekeeper_params)

print("Iniciando o treinamento do XGBoost...")

# The validation split is the evaluation set; progress is logged every 100 rounds.
gatekeeper_val_pair = (gatekeeper_data['x']['val'], gatekeeper_data['y']['val'])
gatekeeper.fit(
    gatekeeper_data['x']['train'],
    gatekeeper_data['y']['train'],
    eval_set=[gatekeeper_val_pair],
    verbose=100,
)

print("Treinamento concluído.")

In [None]:
# Score the trained gatekeeper on the test split.
gatekeeper_y_pred = gatekeeper.predict(gatekeeper_data['x']['test'])
gatekeeper_y_test = gatekeeper_data['y']['test']
gatekeeper_class_names = ['No Flare (0)', 'Flare (1)']

print("--- Relatório de Classificação (Conjunto de Teste) ---\n")
test_report = classification_report(
    gatekeeper_y_test,
    gatekeeper_y_pred,
    target_names=gatekeeper_class_names,
)
print(test_report)

print("\n--- Matriz de Confusão ---")
cm = confusion_matrix(gatekeeper_y_test, gatekeeper_y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=gatekeeper_class_names)
disp.plot(cmap='Blues')
plt.show()

### Evaluating Tradeoff cost

In [None]:
# Original (non-binarized) target values for every row of the model input.
gatekeeper_y_multiclass = df_model_input[target_column]

# Positional boundary after the train + validation rows; everything from there on is
# taken as the test portion.
# NOTE(review): this recomputes the split boundary instead of reading it from
# util.prepare_data — confirm both use the same rounding, otherwise the slice can be
# off by one row relative to gatekeeper_data['y']['test'].
total_rows = len(gatekeeper_data['x']['all'])
val_end = int(total_rows * (val_pct + train_pct))
gatekeeper_y_multi_truth = gatekeeper_y_multiclass.iloc[val_end:]

In [None]:
# Per-row comparison table for the test split, indexed like the binary test labels.
# Series columns are aligned on that index; the prediction array is assigned by position.
df_analysis = pd.DataFrame(index=gatekeeper_data['y']['test'].index).assign(
    binary_truth=gatekeeper_data['y']['test'],
    binary_pred=gatekeeper_y_pred,
    multiclass_truth=gatekeeper_y_multi_truth,
)

# A false negative is a real flare (truth 1) that the gatekeeper labelled as no flare (0).
is_false_negative = df_analysis['binary_truth'].eq(1) & df_analysis['binary_pred'].eq(0)
df_false_negatives = df_analysis.loc[is_false_negative]
print(f"Número total de Falsos Negativos encontrados: {len(df_false_negatives)}")

In [None]:
# Inspect the missed flares together with their original multiclass label.
df_false_negatives

### Exporting Model

In [None]:
# Export is disabled on purpose: uncomment the three lines below to persist the trained
# gatekeeper with joblib.
# NOTE(review): confirm the target directory and bump the version suffix before enabling.
# model_save_path = r'../models/gatekeeper_model_v7.joblib'
# joblib.dump(gatekeeper, model_save_path)
# print(f"Modelo salvo em {model_save_path}")

### Feature Importance

In [None]:
print("--- Importância das Features ---")

# Pair every training column with the importance reported by the fitted gatekeeper,
# ordered from most to least important.
feature_scores = pd.DataFrame({
    'feature': gatekeeper_data['x']['train'].columns,
    'importance': gatekeeper.feature_importances_,
})
importance = feature_scores.sort_values(by='importance', ascending=False)

top_features = importance.head(20)
print(top_features)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Importância das Features (XGBoost)")
ax.barh(top_features['feature'], top_features['importance'])
ax.invert_yaxis()  # put the most important feature at the top of the chart
plt.show()

In [None]:
# Load a previously exported gatekeeper model (v6) from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts from a trusted source.
# NOTE(review): `model_` is not used again in the visible notebook, and this path
# ("../../models/gatekeeper/") differs from the commented-out export path ("../models/")
# — confirm which location is intended.
model_ = joblib.load("../../models/gatekeeper/gatekeeper_model_v6.joblib")