## Setup

In [None]:
import os
from dotenv import load_dotenv
from src.py_src import util

import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, f1_score
import optuna

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message: without this check a missing variable
# makes os.getenv() return None and os.path.join() raise an opaque TypeError.
slided_dfs_dir = os.getenv("SLIDED_DFS_CSV_PATH")
if slided_dfs_dir is None:
    raise RuntimeError(
        "Environment variable SLIDED_DFS_CSV_PATH is not set "
        "(define it in the shell or in the .env file)."
    )

slided_df_path = os.path.join(slided_dfs_dir, "data_slided_V3.parquet")

# Names of the two target columns: the class label and the flux value.
target_class = 'target_class_in_24h'
target_flux = 'target_flux_in_24h'
target_columns = [target_class, target_flux]

# NOTE(review): the meaning of the "xl_" argument is defined by
# util.create_df_model_input_opt, which is not visible here — confirm.
df_model_input = util.create_df_model_input_opt(slided_df_path, target_columns, "xl_")

## Model 3 - 'Specialist 910'

### Preparing Data

In [None]:
# Keep only rows whose target class is above 2; copy so the pool is
# independent of df_model_input.
is_specialist_row = df_model_input[target_class] > 2
specialist_910_pool = df_model_input.loc[is_specialist_row].copy()

In [None]:
# Split fractions: 70% for training, the remainder halved between
# validation and test.
train_pct = 0.7
val_pct = (1 - train_pct) / 2
test_pct = (1 - train_pct) / 2  # not passed to prepare_data in this cell


def _to_binary_label(lb):
    """Map a class label to 1 when it is >= 4, otherwise to 0."""
    return 1 if lb >= 4 else 0


specialist_910_data = util.prepare_data(
    df_model_input=specialist_910_pool,
    target_class_col=target_class,
    lambda_function=_to_binary_label,
    train_pct=train_pct,
    val_pct=val_pct,
    target_flux_col=target_flux,
)

### Buffer Zone

In [None]:
# Flux bounds of the buffer zone (labelled C8.0 / M2.0 in the histogram legend).
buffer_lower_limit = 8.0e-6
buffer_upper_limit = 2.0e-5

# True for training samples strictly inside the buffer zone.
flux_train = specialist_910_data['flux']['train']
above_lower = flux_train > buffer_lower_limit
below_upper = flux_train < buffer_upper_limit
buffer_mask = above_lower & below_upper

# Snapshot of all training fluxes, plus the subset outside the buffer zone.
flux_original = flux_train.copy()
flux_filtered = flux_original[~buffer_mask]

In [None]:
# HARD BUFFER (disabled alternative to the soft buffer): drops buffer-zone samples from the training split
# # specialist_910_data['x']['train'] = specialist_910_data['x']['train'][~buffer_mask]
# # specialist_910_data['y']['train'] = specialist_910_data['y']['train'][~buffer_mask]
# # specialist_910_data['flux']['train'] = specialist_910_data['flux']['train'][~buffer_mask]

In [None]:
# SOFT BUFFER
# Every training sample starts with weight 1.0; samples inside the buffer
# zone are down-weighted to 0.2 instead of being removed.
n_train = specialist_910_data['y']['train'].shape[0]
weights_train = np.full(n_train, 1.0)
weights_train[buffer_mask] = 0.2


In [None]:
# Overlay the flux histogram of the full training split with that of the
# samples left after removing the buffer zone, on shared log-spaced bins.
flux_bins = np.logspace(-6.5, -4.5, 50)

fig, ax = plt.subplots(figsize=(12, 6))

ax.hist(flux_original, bins=flux_bins, alpha=0.6, color='skyblue',
        edgecolor='black', label='Inside Buffer Zone')
ax.hist(flux_filtered, bins=flux_bins, alpha=0.8, color='coral',
        edgecolor='black', label='Outside Buffer Zone')

# Vertical markers at the two buffer-zone bounds.
ax.axvline(buffer_lower_limit, color='green', linestyle='--', linewidth=2,
           label=f' Limite Inferior (C8.0): {buffer_lower_limit:.1e}')
ax.axvline(buffer_upper_limit, color='red', linestyle='--', linewidth=2,
           label=f'Limite Superior (M2.0): {buffer_upper_limit:.1e}')

ax.set_xscale('log')
ax.set_xlabel('X-Ray Flux (Watts/m²) - Logarithmic Scale')
ax.set_ylabel('Frequência (Número de Amostras)')
ax.set_title('Efeito da Buffer Zone Training na Distribuição de Fluxos (Conjunto de Treino)')
ax.legend()
ax.grid(True, which="both", ls="--", alpha=0.5)
plt.show()

### Training and Parameter Tuning

In [None]:
# Class balance of the binary target in the training split.
counts = specialist_910_data['y']['train'].value_counts()
n_negative, n_positive = counts[0], counts[1]

# Ratio of negative to positive samples.
scale_pos_weight = n_negative / n_positive

print("\nDistribuição do Alvo(no treino):")
print(f"Classe 0: {n_negative} amostras")
print(f"Classe 1:  {n_positive} amostras")
print(f"Scale_pos_weight: {scale_pos_weight:.2f}")

In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost candidate and score it on validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Macro-averaged F1 score of the candidate on the validation split.
    """
    # Hyperparameters sampled for this trial.
    sampled_params = {
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", 1.0, 5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'gamma': trial.suggest_float('gamma', 0.1, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    # Settings shared by every trial.
    fixed_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': 1000,
        'random_state': 1502,
        'n_jobs': -1,
        'early_stopping_rounds': 50,
        'device': 'cuda',
    }

    x_train = specialist_910_data['x']['train']
    y_train = specialist_910_data['y']['train']
    x_val = specialist_910_data['x']['val']
    y_val = specialist_910_data['y']['val']

    candidate = xgb.XGBClassifier(**fixed_params, **sampled_params)

    # Soft-buffer weights are applied to training; the validation split is
    # the eval set used for early stopping.
    candidate.fit(
        x_train,
        y_train,
        sample_weight=weights_train,
        eval_set=[(x_val, y_val)],
        verbose=False,
    )

    val_predictions = candidate.predict(x_val)

    # Disabled alternative score: weighted sum of per-class recalls
    # (would require importing recall_score):
    #   recall_c = recall_score(y_val, val_predictions, pos_label=0)
    #   recall_mx = recall_score(y_val, val_predictions, pos_label=1)
    #   score = (2.0 * recall_c) + (1.0 * recall_mx)
    return f1_score(y_val, val_predictions, average='macro')

In [None]:
# Seed the sampler so the hyperparameter search gives the same sequence of
# trials on every Restart & Run All. TPE is Optuna's default sampler, so only
# the seed changes. The value matches the random_state used for the models.
# NOTE(review): GPU training may still add small run-to-run differences.
sampler = optuna.samplers.TPESampler(seed=1502)
study = optuna.create_study(direction='maximize', sampler=sampler)

print("\nIniciando o tuning...")
study.optimize(objective, n_trials=100)

print("\nBusca concluída!")
print(f"Melhor valor: {study.best_value:.4f}")
print("Melhores parâmetros encontrados:")
print(study.best_params)

In [None]:
# Final configuration: the best sampled hyperparameters plus the fixed
# training settings.
# NOTE(review): 'device' is not set here, although the tuning objective uses
# 'cuda' — confirm whether that is intended.
great_filter_params = {
    **study.best_params,
    'n_estimators': 1000,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 1502,
    'n_jobs': -1,
    'early_stopping_rounds': 50,
}

In [None]:
# Train the final classifier with the tuned hyperparameters.
specialist_910_model = xgb.XGBClassifier(**great_filter_params)

print("\nIniciando o treinamento...")
specialist_910_model.fit(
    specialist_910_data['x']['train'],
    specialist_910_data['y']['train'],
    # Use the same soft-buffer weights as the tuning objective; without them
    # the final model ignores the buffer zone that the hyperparameters were
    # selected for.
    sample_weight=weights_train,
    # The validation split drives early stopping.
    eval_set=[(specialist_910_data['x']['val'], specialist_910_data['y']['val'])],
    verbose=100
)
print("Treinamento concluído.")

### Threshold Tuning

In [None]:
# --- Threshold selection (validation split) ---------------------------------
# The threshold is chosen on validation data so the test split stays unseen
# until the final evaluation; choosing it on the test split would leak test
# labels into the reported metrics.
# NOTE(review): the validation split is also used for early stopping and
# hyperparameter tuning, so the estimates may still be slightly optimistic.
y_val_true = specialist_910_data['y']['val']
y_val_proba = specialist_910_model.predict_proba(specialist_910_data['x']['val'])[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_val_true, y_val_proba)

# precision_recall_curve returns one more precision/recall value than
# thresholds; the last pair has no threshold and is dropped.
df_thresholds = pd.DataFrame({
    'Threshold': thresholds,
    'Precision (MX)': precisions[:-1],
    'Recall (MX)': recalls[:-1]
})

plt.figure(figsize=(10, 6))
plt.plot(df_thresholds['Threshold'], df_thresholds['Recall (MX)'], label='Recall MX (Segurança)')
plt.plot(df_thresholds['Threshold'], df_thresholds['Precision (MX)'], label='Precision MX')
plt.xlabel('Limiar de Decisão (Threshold)')
plt.ylabel('Score')
plt.title('Trade-off: Escolhendo o Limiar Ideal para Specialist 910')
plt.legend()
plt.grid(True)
plt.show()

# Operating point whose recall is closest to the target recall.
target_recall = 0.90
closest_idx = (df_thresholds['Recall (MX)'] - target_recall).abs().idxmin()
optimal_row = df_thresholds.loc[[closest_idx]]
print("--- Ponto de Operação Sugerido ---")
print(optimal_row)

optimal_thresh = optimal_row['Threshold'].values[0]

# "Balanced" threshold: the point where precision and recall are closest.
idx_equilibrio = np.abs(precisions[:-1] - recalls[:-1]).argmin()
balanced_thresh = thresholds[idx_equilibrio]

# --- Evaluation of the chosen threshold (test split) ------------------------
y_true = specialist_910_data['y']['test']
y_proba = specialist_910_model.predict_proba(specialist_910_data['x']['test'])[:, 1]
y_pred_balanced = (y_proba >= balanced_thresh).astype(int)

print("\n--- Relatório com Limiar 'Balanced' ---")
print(classification_report(y_true, y_pred_balanced, target_names=['C (0)', 'MX (1)']))

### Final Model

In [None]:
# Wrap the trained classifier together with the balanced threshold.
# NOTE(review): this rebinds `specialist_910_model` from the raw XGBClassifier
# to the wrapper, so the cell is not idempotent — re-running it (or the
# threshold-tuning cell) without re-training would operate on an already
# wrapped model. Presumably the wrapper applies the threshold in predict();
# confirm against util.ThresholdXGBClassifier.
specialist_910_model = util.ThresholdXGBClassifier(specialist_910_model, balanced_thresh)

### Results

In [None]:
# Final evaluation of the thresholded model on the test split.
y_test = specialist_910_data['y']['test']
class_labels = ['C (0)', 'MX (1)']

specialist_910_y_pred = specialist_910_model.predict(specialist_910_data['x']['test'])

print("--- Relatório de Classificação (Conjunto de Teste) ---\n")
print(classification_report(y_test, specialist_910_y_pred, target_names=class_labels))

print("\n--- Matriz de Confusão ---")
cm = confusion_matrix(y_test, specialist_910_y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')
plt.show()

In [None]:
# Error analysis of the final model on the test split, relative to the
# buffer-zone flux bounds.
y_true = specialist_910_data['y']['test']
flux_test = specialist_910_data['flux']['test']

# Predictions from the final (thresholded) model. They are computed in this
# cell so the analysis does not depend on a variable left over from the
# threshold-tuning cell.
y_pred = specialist_910_model.predict(specialist_910_data['x']['test'])

fig, summary_table = util.analyze_flux_errors(y_true=y_true,
                                              y_pred=y_pred,
                                              flux_values=flux_test,
                                              buffer_lower_limit=buffer_lower_limit,
                                              buffer_upper_limit=buffer_upper_limit)

display(summary_table)
plt.show()

In [None]:
# With no path argument, to_csv() returns the CSV text as a string, which the
# notebook shows as the cell output; nothing is written to disk.
# NOTE(review): pass a file path if the intent is to save the table.
summary_table.to_csv()

### Exporting Model

In [None]:
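# Disabled export of the final model (requires `import joblib`, which the imports cell does not include).
# model_save_path = r'../../models/specialist_910/specialist_910_model_v1.joblib'
# joblib.dump(specialist_910_model, model_save_path)
# print(f"Modelo salvo em {model_save_path}")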