In [1]:
import pandas as pd
import joblib
import time

from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV  # Bayesian optimization: utilizado para optimizar hiperparámetros

import xgboost  as xgb

from Funcoes_Comuns import avaliar_modelo, registrar_modelo

### 1. Recuperar base já pré-processada

In [2]:
# Obter dados
df_enem = pd.read_pickle('Bases\\Finais\\enem_microdados_2023.pkl')

In [None]:
variaveis_alvo = ['NUM_NOTA_MT', 'NUM_NOTA_LC', 'NUM_NOTA_CN', 'NUM_NOTA_CH', 'NUM_NOTA_REDACAO']

# separar em treino e teste
X = df_enem.drop(columns=variaveis_alvo)
y = df_enem[variaveis_alvo]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Criar Eval Set para validação cruzada (15% do conjunto de treino)
X_train_final, X_eval, y_train_final, y_eval = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=42
)

# Ajuste de tipo para MLflow
# Converter colunas inteiras para float
X_train_final = X_train_final.astype({col: 'float' for col in X_train_final.select_dtypes('int').columns})
X_eval = X_eval.astype({col: 'float' for col in X_eval.select_dtypes('int').columns})
X_test = X_test.astype({col: 'float' for col in X_test.select_dtypes('int').columns})

### 2. Modelo base XGBM

In [4]:
# Treinar modelo XGBoost
modelo_xgb = xgb.XGBRegressor(n_estimators=200,
                              max_depth=3,
                              learning_rate=0.5,
                              colsample_bytree=0.5,
                              objective='reg:absoluteerror',
                              enable_categorical=True,
                              reg_alpha=0.3,
                              reg_lambda=0.7,
                              random_state=42,
                              n_jobs=-1)

start_time = time.time()

modelo_xgb.fit(X_train_final, 
               y_train_final['NUM_NOTA_CH'], 
               eval_set=[(X_eval, y_eval['NUM_NOTA_CH'])])

tempo_treino = time.time() - start_time

[0]	validation_0-mae:61.62117
[1]	validation_0-mae:59.37553
[2]	validation_0-mae:57.74793
[3]	validation_0-mae:57.09967
[4]	validation_0-mae:56.75975
[5]	validation_0-mae:56.53526
[6]	validation_0-mae:56.38367
[7]	validation_0-mae:56.28743
[8]	validation_0-mae:56.26233
[9]	validation_0-mae:56.18396
[10]	validation_0-mae:56.14632
[11]	validation_0-mae:56.08447
[12]	validation_0-mae:56.03774
[13]	validation_0-mae:56.00409
[14]	validation_0-mae:55.96261
[15]	validation_0-mae:55.96183
[16]	validation_0-mae:55.96306
[17]	validation_0-mae:55.95310
[18]	validation_0-mae:55.94153
[19]	validation_0-mae:55.94381
[20]	validation_0-mae:55.93691
[21]	validation_0-mae:55.90792
[22]	validation_0-mae:55.87570
[23]	validation_0-mae:55.86789
[24]	validation_0-mae:55.86623
[25]	validation_0-mae:55.86305
[26]	validation_0-mae:55.85595
[27]	validation_0-mae:55.85706
[28]	validation_0-mae:55.85409
[29]	validation_0-mae:55.82854
[30]	validation_0-mae:55.82967
[31]	validation_0-mae:55.83147
[32]	validation_0-

In [5]:
# Previsões
y_pred = modelo_xgb.predict(X_test)

In [6]:
nome_experimento = 'Notas CH ENEM 2023'

registrar_modelo(experimento=nome_experimento,
                 parametros={**modelo_xgb.get_params(), "amostra": X_train_final.shape[0], "tempo": tempo_treino},
                 X_train=X_train_final,
                 y_train=y_train_final,
                 y_test=y_test,
                 y_pred=y_pred,
                 variavel_alvo='NUM_NOTA_CH',
                 modelo=modelo_xgb,
                 nome_modelo='modelo_xgbm_base',
                 descricao_modelo='Modelo XGBRegressor base')

Registered model 'modelo_xgbm_base' already exists. Creating a new version of this model...
2025/08/09 14:58:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_xgbm_base, version 14


🏃 View run whimsical-sow-932 at: http://127.0.0.1:9080/#/experiments/957135083854196683/runs/499646d3e12b46edaf35fbb85ec16daf
🧪 View experiment at: http://127.0.0.1:9080/#/experiments/957135083854196683
Modelo registrado com sucesso no MLflow: modelo_xgbm_base
Rastreamento do MLflow finalizado.


Created version '14' of model 'modelo_xgbm_base'.


In [7]:
# Avaliação grupo treino
avaliar_modelo(y_train_final['NUM_NOTA_CH'], modelo_xgb.predict(X_train_final), "treino")

# Avaliação grupo teste
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred, "teste")

MAE (treino): 54.3460
RMSE (treino): 69.9377
R2 (treino): 0.3171
MAE (teste): 55.1268
RMSE (teste): 70.5477
R2 (teste): 0.3019


In [8]:
# Salvar o modelo base como um arquivo pickle
joblib.dump(modelo_xgb, 'Modelos\\modelo_xgb_base.pkl')

['Modelos\\modelo_xgb_base.pkl']

### 3. Bayes Search

In [9]:
modelo_xgb_bayes = xgb.XGBRegressor(enable_categorical=True, 
                                    eval_metric=['rmse', 'mae'],
                                    objective='reg:absoluteerror',
                                    early_stopping_rounds=200)

In [10]:
param_grid = {
    'n_estimators': [300, 350, 600],
    'max_depth': [2, 3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'colsample_bytree': [0.3, 0.5, 0.7],
    'reg_alpha': [0.05, 0.1, 0.3, 0.5],
    'reg_lambda': [0.3, 0.5, 0.7]
}

In [11]:
bayes_search = BayesSearchCV(
    estimator=modelo_xgb_bayes,        # Modelo a ser otimizado
    search_spaces=param_grid,          # Espaço de busca definido acima
    scoring='neg_mean_absolute_error', # Critério de seleção
    n_iter=20,                         # Número de avaliações do modelo
    cv=3,                              # Validação cruzada
    random_state=42,                   # Semente para reprodutibilidade
    n_jobs=-1,                         # Paralelização total dos cálculos
    verbose=1                          # 0 = sem mensagens, 1 = mensagens de progresso, 2 = mensagens detalhadas
)

In [12]:
fit_params = {
    'eval_set': [(X_eval, y_eval['NUM_NOTA_CH'])],   # Conjunto de validação
    'verbose': 1,                                    # Mensagens durante o ajuste
}

In [13]:
# Executar a busca Bayesiana

start_time = time.time()
bayes_search.fit(X_train_final, y_train_final['NUM_NOTA_CH'], **fit_params) 

# Parar o cronômetro
end_time = time.time()
elapsed_time = end_time - start_time

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [14]:
# Melhores parâmetros encontrados
try:
    melhores_parametros = bayes_search.best_params_
    print(f"Melhores parâmetros: {melhores_parametros}")
    print(f"Tempo total de execução: {elapsed_time:.2f} segundos")
except:
    melhores_parametros = {'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 350, 'reg_alpha': 0.5, 'reg_lambda': 0.3}
    print(f"Erro ao obter melhores parâmetros, usando valores calculados anteriormente:\n {melhores_parametros}")


Melhores parâmetros: OrderedDict([('colsample_bytree', 0.3), ('learning_rate', 0.1), ('max_depth', 7), ('n_estimators', 600), ('reg_alpha', 0.05), ('reg_lambda', 0.7)])
Tempo total de execução: 2494.35 segundos


In [15]:
# Treinar o modelo com os melhores parâmetros encontrados
modelo_xgb_bayes.set_params(**melhores_parametros)

start_time = time.time()

# Treinamento do modelo com os melhores parâmetros encontrados
modelo_xgb_bayes.fit(X_train_final, 
                     y_train_final['NUM_NOTA_CH'], 
                     eval_set=[(X_eval, y_eval['NUM_NOTA_CH'])])

tempo_treino = time.time() - start_time

[0]	validation_0-rmse:83.12582	validation_0-mae:66.41204
[1]	validation_0-rmse:81.37205	validation_0-mae:64.89109
[2]	validation_0-rmse:80.10267	validation_0-mae:63.79457
[3]	validation_0-rmse:78.96611	validation_0-mae:62.80784
[4]	validation_0-rmse:78.10506	validation_0-mae:62.04709
[5]	validation_0-rmse:77.26226	validation_0-mae:61.30892
[6]	validation_0-rmse:76.37569	validation_0-mae:60.51822
[7]	validation_0-rmse:75.80481	validation_0-mae:60.00834
[8]	validation_0-rmse:75.17585	validation_0-mae:59.43680
[9]	validation_0-rmse:74.71488	validation_0-mae:59.01680
[10]	validation_0-rmse:74.20684	validation_0-mae:58.56585
[11]	validation_0-rmse:73.71619	validation_0-mae:58.12102
[12]	validation_0-rmse:73.33655	validation_0-mae:57.77630
[13]	validation_0-rmse:73.12055	validation_0-mae:57.57123
[14]	validation_0-rmse:72.91148	validation_0-mae:57.37869
[15]	validation_0-rmse:72.79338	validation_0-mae:57.27015
[16]	validation_0-rmse:72.67924	validation_0-mae:57.16456
[17]	validation_0-rmse:7

In [16]:
# Previsões
y_pred_bayes = modelo_xgb_bayes.predict(X_test)

In [17]:
nome_experimento = 'Notas CH ENEM 2023'

registrar_modelo(experimento=nome_experimento,
                    modelo=modelo_xgb_bayes,
                    parametros={**modelo_xgb_bayes.get_params(), "amostra": X_train_final.shape[0], "tempo": tempo_treino},
                    X_train=X_train_final,
                    y_train=y_train_final,
                    y_test=y_test,
                    y_pred=y_pred_bayes,
                    variavel_alvo='NUM_NOTA_CH',
                    nome_modelo='modelo_xgb_bayes',
                    descricao_modelo='Modelo XGBMRegressor otimizado com BayesSearchCV')

Registered model 'modelo_xgb_bayes' already exists. Creating a new version of this model...
2025/08/09 15:44:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_xgb_bayes, version 8


🏃 View run smiling-steed-289 at: http://127.0.0.1:9080/#/experiments/957135083854196683/runs/271780d13f0143269ef17045f7c03a5f
🧪 View experiment at: http://127.0.0.1:9080/#/experiments/957135083854196683
Modelo registrado com sucesso no MLflow: modelo_xgb_bayes
Rastreamento do MLflow finalizado.


Created version '8' of model 'modelo_xgb_bayes'.


In [18]:
# Avaliação grupo treino
avaliar_modelo(y_train_final['NUM_NOTA_CH'], modelo_xgb_bayes.predict(X_train_final), "treino")

# Avaliação grupo teste
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred_bayes, "teste")

MAE (treino): 52.5638
RMSE (treino): 68.3149
R2 (treino): 0.3484
MAE (teste): 54.7669
RMSE (teste): 70.1671
R2 (teste): 0.3094


In [19]:
# Salvar o modelo otimizado como um arquivo pickle
joblib.dump(modelo_xgb_bayes, 'Modelos\\modelo_xgb_bayes.pkl')

['Modelos\\modelo_xgb_bayes.pkl']