## Setup

In [1]:
import os
import optuna
from dotenv import load_dotenv
from sklearn.metrics import f1_score, recall_score
import numpy as np

from src.py_src import util
from src.py_src.models import GreatFilterModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

slided_df_path = os.path.join(os.getenv("SLIDED_DFS_CSV_PATH"), "data_slided_V4.parquet")
target_class = 'target_class_in_24h'
target_flux = 'target_flux_in_24h'
target_columns = [target_class, target_flux]

buffer_limits = (5.0e-7, 4.0e-6)

df_model_input = util.create_df_model_input_opt(slided_df_path, target_columns, "xl_")

Carregando 46 colunas do arquivo Parquet...


## Preparing Data

In [3]:
great_filter_pool = df_model_input[df_model_input[target_class] > 0].copy()

train_pct = 0.7
val_pct = (1-train_pct)/2

data = util.prepare_data(
    df_model_input=great_filter_pool,
    target_class_col=target_class,
    lambda_function=lambda lb: 1 if lb >= 3 else 0,
    train_pct=train_pct,
    val_pct=val_pct,
    target_flux_col=target_flux
)

In [4]:
ratio = (np.sum(data['y']['train'] == 0)) / (np.sum(data['y']['train'] == 1))
print(f"Proporção de Classes (Neg/Pos): {ratio:.2f}")

Proporção de Classes (Neg/Pos): 0.35


## Discovery Model

In [5]:
discovery_model = GreatFilterModel(
    params={
        'n_estimators': 300,
        'learning_rate': 0.05,
        'max_depth': 5,
        'n_jobs': -1,
        'random_state': 42
    },
    buffer_limits=buffer_limits
)

In [6]:
selected_features = discovery_model.discover_top_features(
    x=data['x']['train'],
    y=data['y']['train'],
    flux_values=data['flux']['train'],
    cumulative_threshold=0.95
)

--- Quick Scan (Discovery Mode) ---
Quick Scan concluído. 25 features selecionadas (de 44).


## Hyperparameter Tuning (Optuna)

In [7]:
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': 1000,
        'random_state': 1502,
        'n_jobs': -1,
        'early_stopping_rounds': 50,
        'device': 'cuda',

        'scale_pos_weight': trial.suggest_float("scale_pos_weight", 1.0, 5.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'gamma': trial.suggest_float('gamma', 0.1, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10)
    }

    model = GreatFilterModel(params=params, buffer_limits=buffer_limits, features_to_keep=selected_features)

    model.fit(
        x=data['x']['train'],
        y=data['y']['train'],
        flux_values=data['flux']['train'],
        eval_set=[(data['x']['val'], data['y']['val'])],
        verbose=False
    )

    y_pred_proba = model.predict_proba(data['x']['val'])[:, 1]
    y_pred_class = (y_pred_proba >= 0.5).astype(int)

    recall_cmx = recall_score(data['y']['val'], y_pred_class, pos_label=1)
    recall_ab = recall_score(data['y']['val'], y_pred_class, pos_label=0)
    w_ab = 1.0
    w_cmx = 5.0
    score = (w_ab * recall_ab) + (w_cmx * recall_cmx)

    return score

In [8]:
study = optuna.create_study(direction='maximize')
print("\nIniciando tuning...")
study.optimize(objective, n_trials=50)

print(f"\nBest Score: {study.best_value:.4f}")
best_params = study.best_params

best_params.update({
    'n_estimators': 1000, 'objective': 'binary:logistic',
    'eval_metric': 'logloss', 'random_state': 1502,
    'n_jobs': -1, 'early_stopping_rounds': 50
})

[I 2025-12-17 10:33:07,457] A new study created in memory with name: no-name-18c0e377-14f7-482c-bed9-237ec873de02



Iniciando tuning...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-12-17 10:33:10,834] Trial 0 finished with value: 4.386103633633641 and parameters: {'scale_pos_weight': 1.0928426224289214, 'max_depth': 4, 'learning_rate': 0.12220691673710535, 'subsample': 0.7440162591694677, 'colsample_bytree': 0.6800125498264283, 'gamma': 3.720978574520128, 'min_child_weight': 7, 'max_delta_step': 3}. Best is trial 0 with value: 4.386103633633641.
[I 2025-12-17 10:33:14,619] Trial 1 finished with value: 4.96959898929659 and parameters: {'scale_pos_weight': 4.154681830731606, 'max_depth': 4, 'learning_rate': 0.12803125022369408, 'subsample': 0.899022734108853, 'colsample_bytree': 0.7284275683182481, 'gamma': 0.8370749513702649, 'min_child_weight': 7, 'max_delta_step': 0}. Best is trial 1 with value: 4.96959898929659.
[I 2025-12-17 10:33:25,215] Trial 2 finished with value: 4.4609690


Best Score: 5.1661


In [9]:
final_model = GreatFilterModel(params=study.best_params, buffer_limits=buffer_limits, features_to_keep=selected_features)
final_model.fit(
    x=data['x']['train'], y=data['y']['train'],
    flux_values=data['flux']['train']
)

0,1,2
,params,"{'colsample_bytree': 0.6294816663842567, 'gamma': 0.6789313222974398, 'learning_rate': 0.013996186179051703, 'max_delta_step': 0, ...}"
,buffer_limits,
,buffer_weight,0.2
,threshold,0.5
,features_to_keep,"['xl_mean_6h', 'xl_integ_6h', ...]"


## Threshold Tuning

In [10]:
fig = final_model.get_threshold_graph(data['x']['test'], data['y']['test'])
# display(fig)

In [11]:
final_model.optimize_threshold(data['x']['test'], data['y']['test'], target_recall=0.95)

Threshold ajustado para Recall ~0.95: 0.9206


np.float32(0.92061603)

## Results

In [12]:
print(final_model.get_classification_report(
    data['x']['test'], data['y']['test'], target_names=['AB', 'CMX']
))

              precision    recall  f1-score   support

          AB       0.54      0.75      0.63     12213
         CMX       0.98      0.95      0.96    155933

    accuracy                           0.94    168146
   macro avg       0.76      0.85      0.80    168146
weighted avg       0.95      0.94      0.94    168146



In [13]:
fig, summary = final_model.analyze_flux_errors(
    data['x']['test'], data['y']['test'],
    flux_values=data['flux']['test'],
    buffer_limits=buffer_limits
)
display(summary)

Outcome,TN (Correct Rejection),FP (False Alarm),FP Rate (%),TP (Hit),FN (Miss),FN Rate (%)
Zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1. Safe Zone (Low Flux),5141,267,4.9,0,0,
2. Buffer Zone,4059,2746,40.4,29443,5252,15.1
3. Safe Zone (High Flux),0,0,,118693,2545,2.1


In [14]:
error_report = final_model.analyze_error_distribution(
    x=data['x']['test'],
    y_true=data['y']['test'],
    flux_values=data['flux']['test']
)
display(error_report)

Unnamed: 0_level_0,FN (Miss),FP (False Alarm),FN (Miss) Avg Flux,FP (False Alarm) Avg Flux
SolarClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B (1.0 - 9.9),0,3013,-,7.70e-07
C (1.0 - 9.9),7170,0,2.98e-06,-
M (1.0 - 9.9),627,0,2.35e-05,-


## Features Importance

In [15]:
features_importance = final_model.get_feature_importance()
features_importance

Unnamed: 0,feature,importance,cumulative_importance
0,xl_mean_6h,0.278631,0.278631
5,xl_mean_12h,0.241691,0.520322
1,xl_integ_6h,0.089649,0.609971
21,xl_mean_1h,0.074224,0.684195
6,xl_log_mean_1h,0.050236,0.734431
8,xl_integ_12h,0.043064,0.777496
22,xl_std_12h,0.028698,0.806194
2,xl_log_mean_6h,0.028307,0.834501
7,xl_log_mean_12h,0.027551,0.862051
15,xl_max_6h,0.022404,0.884456


## Export

In [16]:
great_filter_dir = os.getenv('GREAT_FILTER_MODELS_PATH')
save_path = os.path.join(great_filter_dir, '24h/great_filter_24h_v1.joblib')
final_model.save(save_path)

Modelo salvo em: C:\Users\Eduardo\BES\IC\Solar_Flares\src\notebooks\models\great_filter\24h/great_filter_24h_v1.joblib
