## Setup

In [1]:
import os
import optuna
from dotenv import load_dotenv
from sklearn.metrics import f1_score, recall_score
import numpy as np

from src.py_src import util
from src.py_src.models import GreatFilterModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

True

In [3]:
slided_df_path = os.path.join(os.getenv("SLIDED_DFS_CSV_PATH"), "data_slided_V4.parquet")
target_class = 'target_class_in_12h'
target_flux = 'target_flux_in_12h'
target_columns = [target_class, target_flux]

buffer_limits = (5.0e-7, 4.0e-6)

df_model_input = util.create_df_model_input_opt(slided_df_path, target_columns, "xl_")

Carregando 56 colunas do arquivo Parquet...


## Preparing Data

In [4]:
great_filter_pool = df_model_input[df_model_input[target_class] > 0].copy()

train_pct = 0.7
val_pct = (1-train_pct)/2

data = util.prepare_data(
    df_model_input=great_filter_pool,
    target_class_col=target_class,
    lambda_function=lambda lb: 1 if lb >= 3 else 0,
    train_pct=train_pct,
    val_pct=val_pct,
    target_flux_col=target_flux
)

In [5]:
ratio = (np.sum(data['y']['train'] == 0)) / (np.sum(data['y']['train'] == 1))
print(f"Proporção de Classes (Neg/Pos): {ratio:.2f}")

Proporção de Classes (Neg/Pos): 0.43


## Discovery Model

In [6]:
discovery_model = GreatFilterModel(
    params={
        'n_estimators': 300,
        'learning_rate': 0.05,
        'max_depth': 5,
        'n_jobs': -1,
        'random_state': 42
    },
    buffer_limits=buffer_limits
)

In [7]:
selected_features = discovery_model.discover_top_features(
    x=data['x']['train'],
    y=data['y']['train'],
    flux_values=data['flux']['train'],
    cumulative_threshold=0.95
)

--- Quick Scan (Discovery Mode) ---
Quick Scan concluído. 37 features selecionadas (de 54).


## Hyperparameter Tuning (Optuna)

In [8]:
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': 1000,
        'random_state': 1502,
        'n_jobs': -1,
        'early_stopping_rounds': 50,
        'device': 'cuda',

        'scale_pos_weight': trial.suggest_float("scale_pos_weight", 1.0, 5.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'gamma': trial.suggest_float('gamma', 0.1, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10)
    }

    model = GreatFilterModel(params=params, buffer_limits=buffer_limits, features_to_keep=selected_features)

    model.fit(
        x=data['x']['train'],
        y=data['y']['train'],
        flux_values=data['flux']['train'],
        eval_set=[(data['x']['val'], data['y']['val'])],
        verbose=False
    )

    y_pred_proba = model.predict_proba(data['x']['val'])[:, 1]
    y_pred_class = (y_pred_proba >= 0.5).astype(int)

    recall_cmx = recall_score(data['y']['val'], y_pred_class, pos_label=1)
    recall_ab = recall_score(data['y']['val'], y_pred_class, pos_label=0)
    w_ab = 1.0
    w_cmx = 5.0
    score = (w_ab * recall_ab) + (w_cmx * recall_cmx)

    return score

In [9]:
study = optuna.create_study(direction='maximize')
print("\nIniciando tuning...")
study.optimize(objective, n_trials=10)

print(f"\nBest Score: {study.best_value:.4f}")
best_params = study.best_params

best_params.update({
    'n_estimators': 1000, 'objective': 'binary:logistic',
    'eval_metric': 'logloss', 'random_state': 1502,
    'n_jobs': -1, 'early_stopping_rounds': 50
})

[I 2025-12-15 17:32:03,686] A new study created in memory with name: no-name-d5cc6038-d041-44cd-901c-177fd69fdf50



Iniciando tuning...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-12-15 17:32:07,625] Trial 0 finished with value: 4.964356078170314 and parameters: {'scale_pos_weight': 4.061107872307917, 'max_depth': 4, 'learning_rate': 0.11668204044909797, 'subsample': 0.637758836147841, 'colsample_bytree': 0.8493345725088663, 'gamma': 3.5248782267807766, 'min_child_weight': 2, 'max_delta_step': 3}. Best is trial 0 with value: 4.964356078170314.
[I 2025-12-15 17:32:11,167] Trial 1 finished with value: 4.866295236641834 and parameters: {'scale_pos_weight': 2.89433652954847, 'max_depth': 6, 'learning_rate': 0.10588964388262889, 'subsample': 0.7104462978899664, 'colsample_bytree': 0.6128514113449676, 'gamma': 2.6345063258296024, 'min_child_weight': 3, 'max_delta_step': 9}. Best is trial 0 with value: 4.964356078170314.
[I 2025-12-15 17:32:22,787] Trial 2 finished with value: 5.018032


Best Score: 5.0180


In [10]:
final_model = GreatFilterModel(params=study.best_params, buffer_limits=buffer_limits, features_to_keep=selected_features)
final_model.fit(
    x=data['x']['train'], y=data['y']['train'],
    flux_values=data['flux']['train']
)

0,1,2
,params,"{'colsample_bytree': 0.6034569601162351, 'gamma': 4.515165859393959, 'learning_rate': 0.010781472120527099, 'max_delta_step': 0, ...}"
,buffer_limits,
,buffer_weight,0.2
,threshold,0.5
,features_to_keep,"['xl_mean_6h', 'xl_log_mean_6h', ...]"


## Threshold Tuning

In [11]:
fig = final_model.get_threshold_graph(data['x']['test'], data['y']['test'])
# display(fig)

In [12]:
final_model.optimize_threshold(data['x']['test'], data['y']['test'], target_recall=0.95)

Threshold ajustado para Recall ~0.95: 0.9216


np.float32(0.92164266)

## Results

In [13]:
print(final_model.get_classification_report(
    data['x']['test'], data['y']['test'], target_names=['AB', 'CMX']
))

              precision    recall  f1-score   support

          AB       0.48      0.73      0.58      9038
         CMX       0.98      0.95      0.97    140699

    accuracy                           0.94    149737
   macro avg       0.73      0.84      0.77    149737
weighted avg       0.95      0.94      0.94    149737



In [14]:
fig, summary = final_model.analyze_flux_errors(
    data['x']['test'], data['y']['test'],
    flux_values=data['flux']['test'],
    buffer_limits=buffer_limits
)
display(summary)

Outcome,TN (Correct Rejection),FP (False Alarm),FP Rate (%),TP (Hit),FN (Miss),FN Rate (%)
Zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1. Safe Zone (Low Flux),2215,53,2.3,0,0,
2. Buffer Zone,4383,2387,35.3,37507,5181,12.1
3. Safe Zone (High Flux),0,0,,96168,1843,1.9


In [15]:
error_report = final_model.analyze_error_distribution(
    x=data['x']['test'],
    y_true=data['y']['test'],
    flux_values=data['flux']['test']
)
display(error_report)

Unnamed: 0_level_0,FN (Miss),FP (False Alarm),FN (Miss) Avg Flux,FP (False Alarm) Avg Flux
SolarClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B (1.0 - 9.9),0,2440,-,8.46e-07
C (1.0 - 9.9),6611,0,2.75e-06,-
M (1.0 - 9.9),408,0,2.36e-05,-
X (> M10),5,0,1.84e-04,-


## Features Importance

In [16]:
features_importance = final_model.get_feature_importance()
features_importance

Unnamed: 0,feature,importance,cumulative_importance
0,xl_mean_6h,0.309893,0.309893
1,xl_log_mean_6h,0.227926,0.537819
7,xl_mean_1h,0.075671,0.61349
4,xl_log_mean_1h,0.06671,0.6802
2,xl_log_mean_12h,0.0624,0.7426
34,xl_integ_12h,0.04222,0.784819
9,xl_std_12h,0.029442,0.814261
10,xl_integ_6h,0.02926,0.843521
17,xl_max_6h,0.023086,0.866607
3,xl_max_1h,0.015174,0.881781


## Export

In [22]:
great_filter_dir = os.getenv('GREAT_FILTER_MODELS_PATH')
save_path = os.path.join(great_filter_dir, '12h/great_filter_12h_v1.joblib')
final_model.save(save_path)

Modelo salvo em: C:\Users\Eduardo\BES\IC\Solar_Flares\src\notebooks\models\great_filter\12h/great_filter_12h_v1.joblib
