## Setup

In [None]:
import os
from dotenv import load_dotenv
from src import util
import gc

import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score
import optuna
import joblib

In [None]:
# Read key/value pairs from a .env file (python-dotenv) into the process
# environment so the os.getenv lookup below can see them.
load_dotenv()

# Fail early with an explicit message when the data directory is not
# configured; otherwise os.path.join(None, ...) raises an opaque TypeError.
slided_dfs_dir = os.getenv("SLIDED_DFS_CSV_PATH")
if slided_dfs_dir is None:
    raise RuntimeError(
        "Environment variable SLIDED_DFS_CSV_PATH is not set; "
        "define it in the environment or in the .env file."
    )

# Parquet file holding the windowed ("slided") dataset, and the label column.
slided_df_path = os.path.join(slided_dfs_dir, "data_slided_V2.parquet")
target_column = "target_class_in_24h"

# NOTE(review): the meaning of the "xl_", '10min' and 'last' arguments is
# defined by util.create_df_model_input — confirm against that helper.
df_model_input = util.create_df_model_input(slided_df_path, target_column, "xl_", '10min', 'last')

In [4]:
# Restrict the specialist's training pool to rows whose target class is
# strictly greater than 3; copy so later edits do not touch df_model_input.
above_class_3 = df_model_input[target_column] > 3
specialist_training_pool = df_model_input[above_class_3].copy()

In [None]:
# Split fractions: 70% for training; the remainder is divided equally
# between validation and test.
train_pct = 0.7
holdout_pct = 1 - train_pct
val_pct = holdout_pct / 2
test_pct = holdout_pct / 2  # not used in this cell


def _to_binary_label(lb):
    """Map an original class label to the binary target: 1 for class 5, else 0."""
    if lb == 5:
        return 1
    return 0


# NOTE(review): specialist_data is later indexed as ['x']['train']; the full
# structure returned by util.prepare_data should be confirmed in that helper.
specialist_data = util.prepare_data(
    specialist_training_pool,
    target_column,
    _to_binary_label,
    train_pct,
    val_pct,
)

## Feature Importance

In [None]:
# Load the previously trained specialist model from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
specialist_model_path = "../models/specialist_model_v1.joblib"
specialist_initial_model = joblib.load(specialist_model_path)

In [None]:
# Column labels of the training feature matrix.
# NOTE(review): assumes these are in the same order the loaded model was
# trained with — confirm, since the pairing below is positional.
feature_names = specialist_data['x']['train'].columns

# One row per feature, ranked from most to least important, with a running
# total of importance down the ranked list.
importance_df = (
    pd.DataFrame(
        {
            'feature': feature_names,
            'importance': specialist_initial_model.feature_importances_,
        }
    )
    .sort_values('importance', ascending=False)
    .assign(cumulative_importance=lambda ranked: ranked['importance'].cumsum())
)

In [None]:
# Show the 25 highest-ranked features with their cumulative importance.
top_ranked_rows = importance_df.head(25)
print("--- Tabela de Importância Acumulada ---")
print(top_ranked_rows)

In [None]:
# Cumulative-importance cutoff used to select features.
THRESHOLD = 0.95

# Keep the features whose running importance total is at or below THRESHOLD.
# NOTE(review): the feature whose addition crosses THRESHOLD is excluded, so
# the kept set's cumulative importance never exceeds THRESHOLD — confirm that
# this (rather than "reach at least THRESHOLD") is the intended cut.
within_threshold = importance_df['cumulative_importance'] <= THRESHOLD
features_to_keep = importance_df.loc[within_threshold, 'feature'].tolist()

# Fallback: when the cut leaves fewer than 5 features, take the first 10 rows
# of the ranked table instead.
too_few_selected = len(features_to_keep) < 5
if too_few_selected:
    features_to_keep = importance_df['feature'].head(10).tolist()

# Report how many features survive the cut and list them.
print(f"\nNúmero original de features: {len(feature_names)}")
print(f"Número de features após corte de {THRESHOLD*100}%: {len(features_to_keep)}")
print("\nFeatures selecionadas:")
print(features_to_keep)