In [1]:
import pandas as pd
import joblib
import time

from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV  # Bayesian optimization: utilizado para optimizar hiperparámetros

import xgboost  as xgb

from Funcoes_Comuns import avaliar_modelo, registrar_modelo

Recuperar base já pré-processada

In [2]:
# Load the pre-processed ENEM 2023 dataset from a pickle file.
# Forward slashes are used on purpose: the previous backslash-separated literal
# contained the invalid escape sequences `\F` and `\e` (a warning on recent Python
# versions) and only resolved as a path on Windows. Forward slashes work on every OS.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df_enem = pd.read_pickle('Bases/Finais/enem_2023_full.pkl')

In [3]:
# Target columns: the five score variables that are removed from the feature matrix.
variaveis_alvo = [
    'NUM_NOTA_MT',
    'NUM_NOTA_LC',
    'NUM_NOTA_CN',
    'NUM_NOTA_CH',
    'NUM_NOTA_REDACAO',
]

# Targets first, then the features (every remaining column).
y = df_enem[variaveis_alvo]
X = df_enem.drop(columns=variaveis_alvo)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Type adjustment for MLflow: cast the integer feature columns to float.
# Each frame gets its own column lookup, followed by a single astype call.
colunas_int_treino = X_train.select_dtypes('int').columns
X_train = X_train.astype(dict.fromkeys(colunas_int_treino, 'float'))

colunas_int_teste = X_test.select_dtypes('int').columns
X_test = X_test.astype(dict.fromkeys(colunas_int_teste, 'float'))

Modelo base

In [5]:
# Base XGBoost regressor: hyperparameters are collected in a dict and unpacked
# into the constructor.
parametros_modelo_base = {
    'n_estimators': 200,
    'max_depth': 3,
    'learning_rate': 0.5,
    'colsample_bytree': 0.5,
    'objective': 'reg:absoluteerror',
    'enable_categorical': True,
    'reg_alpha': 0.3,
    'reg_lambda': 0.7,
    'random_state': 42,
    'n_jobs': -1,
}
modelo_xgb = xgb.XGBRegressor(**parametros_modelo_base)

# Time the fit on the NUM_NOTA_CH target. The test split is passed as eval_set,
# so its metric is reported for every boosting round.
start_time = time.time()
modelo_xgb.fit(
    X_train,
    y_train['NUM_NOTA_CH'],
    eval_set=[(X_test, y_test['NUM_NOTA_CH'])],
)
tempo_treino = time.time() - start_time

[0]	validation_0-mae:61.28037
[1]	validation_0-mae:59.06719
[2]	validation_0-mae:57.40444
[3]	validation_0-mae:56.75171
[4]	validation_0-mae:56.45477
[5]	validation_0-mae:56.26625
[6]	validation_0-mae:56.11019
[7]	validation_0-mae:56.03142
[8]	validation_0-mae:55.97569
[9]	validation_0-mae:55.90188
[10]	validation_0-mae:55.86109
[11]	validation_0-mae:55.80388
[12]	validation_0-mae:55.75941
[13]	validation_0-mae:55.71299
[14]	validation_0-mae:55.65487
[15]	validation_0-mae:55.65476
[16]	validation_0-mae:55.64548
[17]	validation_0-mae:55.63196
[18]	validation_0-mae:55.63118
[19]	validation_0-mae:55.61707
[20]	validation_0-mae:55.61282
[21]	validation_0-mae:55.59031
[22]	validation_0-mae:55.55889
[23]	validation_0-mae:55.55493
[24]	validation_0-mae:55.54882
[25]	validation_0-mae:55.54850
[26]	validation_0-mae:55.53795
[27]	validation_0-mae:55.53260
[28]	validation_0-mae:55.53277
[29]	validation_0-mae:55.53002
[30]	validation_0-mae:55.52446
[31]	validation_0-mae:55.52130
[32]	validation_0-

In [6]:
# Predictions of the base model for the held-out test features; reused by the
# registration and evaluation cells below.
y_pred = modelo_xgb.predict(X_test)

In [7]:
# Experiment name under which the run is recorded.
nome_experimento = 'Notas CH ENEM 2023'

# Logged parameters: the estimator's own settings plus the number of training
# rows ("amostra") and the fit duration in seconds ("tempo").
parametros_registro_base = {
    **modelo_xgb.get_params(),
    "amostra": X_train.shape[0],
    "tempo": tempo_treino,
}

# Hand everything to the project helper that registers the model.
# NOTE(review): the full multi-target y_train / y_test frames are passed along with
# variavel_alvo; presumably the helper selects that column itself (confirm in Funcoes_Comuns).
registrar_modelo(
    experimento=nome_experimento,
    parametros=parametros_registro_base,
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
    variavel_alvo='NUM_NOTA_CH',
    modelo=modelo_xgb,
    nome_modelo='modelo_xgbm_base',
    descricao_modelo='Modelo XGBRegressor base',
)

Registered model 'modelo_xgbm_base' already exists. Creating a new version of this model...
2025/06/07 21:38:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_xgbm_base, version 13


🏃 View run legendary-flea-839 at: http://127.0.0.1:9080/#/experiments/957135083854196683/runs/b3774e8616c042dd996fdf0c8b5a1c4a
🧪 View experiment at: http://127.0.0.1:9080/#/experiments/957135083854196683
Modelo registrado com sucesso no MLflow: modelo_xgbm_base
Rastreamento do MLflow finalizado.


Created version '13' of model 'modelo_xgbm_base'.


In [8]:
# Evaluate the base model on the NUM_NOTA_CH target: training split first,
# then the test split (reusing the predictions computed earlier).
y_pred_treino_base = modelo_xgb.predict(X_train)
avaliar_modelo(y_train['NUM_NOTA_CH'], y_pred_treino_base, "treino")
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred, "teste")

MAE (treino): 54.2976
RMSE (treino): 69.8943
R2 (treino): 0.3184
MAE (teste): 54.9792
RMSE (teste): 70.4107
R2 (teste): 0.3046


In [9]:
# Persist the fitted base model to disk with joblib.
# A forward slash replaces the backslash: `\m` is an invalid escape sequence
# (a warning on recent Python versions) and made the path Windows-only.
joblib.dump(modelo_xgb, 'Modelos/modelo_xgb_base.pkl')

['Modelos\\modelo_xgb_base.pkl']

Bayes Search

In [10]:
# Estimator handed to the Bayesian search. Only the fixed settings are given here;
# the tuned hyperparameters are supplied by the search space.
config_modelo_bayes = dict(
    enable_categorical=True,
    eval_metric=['rmse', 'mae'],
    objective='reg:absoluteerror',
    early_stopping_rounds=200,
)
modelo_xgb_bayes = xgb.XGBRegressor(**config_modelo_bayes)

In [11]:
# Search space: the candidate values explored for each hyperparameter.
param_grid = dict(
    n_estimators=[300, 350, 600],
    max_depth=[2, 3, 5, 7],
    learning_rate=[0.01, 0.1, 0.5, 1],
    colsample_bytree=[0.3, 0.5, 0.7],
    reg_alpha=[0.05, 0.1, 0.3, 0.5],
    reg_lambda=[0.3, 0.5, 0.7],
)

In [12]:
# Bayesian hyperparameter search; the settings are gathered in a dict and unpacked.
config_busca_bayes = {
    'estimator': modelo_xgb_bayes,          # model being tuned
    'search_spaces': param_grid,            # search space defined above
    'scoring': 'neg_mean_absolute_error',   # selection criterion
    'n_iter': 20,                           # number of model evaluations
    'cv': 3,                                # cross-validation folds
    'random_state': 42,                     # seed for reproducibility
    'n_jobs': -1,                           # use all available cores
    'verbose': 1,                           # 0 = silent, 1 = progress, 2 = detailed
}
bayes_search = BayesSearchCV(**config_busca_bayes)

In [13]:
# Carve an evaluation set (15% of the training rows) out of the training split,
# using only the NUM_NOTA_CH target; the remaining 85% feeds the search.
alvo_ch_treino = y_train['NUM_NOTA_CH']
X_train_bayes, X_eval, y_train_bayes, y_eval = train_test_split(
    X_train, alvo_ch_treino, test_size=0.15, random_state=42
)

In [14]:
# Extra keyword arguments forwarded to fit() during the search.
fit_params = dict(
    eval_set=[(X_eval, y_eval)],  # validation set
    verbose=1,                    # print messages while fitting
)

In [189]:
# Run the Bayesian search, bracketing it with wall-clock timestamps.
start_time = time.time()
bayes_search.fit(
    X_train_bayes,
    y_train_bayes,
    **fit_params,
)
end_time = time.time()

# Total search duration in seconds.
elapsed_time = end_time - start_time

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [15]:
# Best hyperparameters found by the Bayesian search, with a fallback to the values
# obtained in an earlier run when the search is not available in this kernel.
#
# Only the "search not available" failures are handled:
#   - NameError: `bayes_search` was never created in this session;
#   - AttributeError: the object exists but has no `best_params_` (not fitted).
# The previous bare `except:` also swallowed KeyboardInterrupt and any unrelated bug,
# silently replacing the result with the hard-coded values.
try:
    melhores_parametros = bayes_search.best_params_
except (NameError, AttributeError) as erro:
    # Parameters recorded from a previous execution of the search.
    melhores_parametros = {'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 350, 'reg_alpha': 0.5, 'reg_lambda': 0.3}
    print(f"Erro ao obter melhores parâmetros, usando valores calculados anteriormente:\n {melhores_parametros}")
    # Show why the fallback was taken so the reader is not left guessing.
    print(f"Motivo: {type(erro).__name__}: {erro}")
else:
    # Reported only on success, so a freshly found result is never overwritten by the fallback.
    print(f"Melhores parâmetros: {melhores_parametros}")
    print(f"Tempo total de execução: {elapsed_time:.2f} segundos")


Erro ao obter melhores parâmetros, usando valores calculados anteriormente:
 {'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 350, 'reg_alpha': 0.5, 'reg_lambda': 0.3}


In [17]:
# Apply the selected hyperparameters to the estimator used in the search.
modelo_xgb_bayes.set_params(**melhores_parametros)

# Refit on the search training subset and time it; the evaluation split is passed
# as eval_set, so its metrics are reported for every boosting round.
start_time = time.time()
modelo_xgb_bayes.fit(
    X_train_bayes,
    y_train_bayes,
    eval_set=[(X_eval, y_eval)],
)
tempo_treino = time.time() - start_time

[0]	validation_0-rmse:83.12615	validation_0-mae:66.41244
[1]	validation_0-rmse:81.37187	validation_0-mae:64.89107
[2]	validation_0-rmse:80.10153	validation_0-mae:63.79431
[3]	validation_0-rmse:78.96108	validation_0-mae:62.80427
[4]	validation_0-rmse:78.09907	validation_0-mae:62.04182
[5]	validation_0-rmse:77.25631	validation_0-mae:61.30331
[6]	validation_0-rmse:76.37107	validation_0-mae:60.51376
[7]	validation_0-rmse:75.79908	validation_0-mae:60.00320
[8]	validation_0-rmse:75.16990	validation_0-mae:59.43169
[9]	validation_0-rmse:74.70676	validation_0-mae:59.00978
[10]	validation_0-rmse:74.19943	validation_0-mae:58.55931
[11]	validation_0-rmse:73.70841	validation_0-mae:58.11448
[12]	validation_0-rmse:73.32712	validation_0-mae:57.76887
[13]	validation_0-rmse:73.10950	validation_0-mae:57.56196
[14]	validation_0-rmse:72.89951	validation_0-mae:57.36842
[15]	validation_0-rmse:72.78025	validation_0-mae:57.25811
[16]	validation_0-rmse:72.66462	validation_0-mae:57.15184
[17]	validation_0-rmse:7

In [18]:
# Predictions of the tuned model for the held-out test features; reused by the
# registration and evaluation cells below.
y_pred_bayes = modelo_xgb_bayes.predict(X_test)

In [19]:
# Experiment name under which the tuned-model run is recorded.
nome_experimento = 'Notas CH ENEM 2023'

# Register the tuned model through the project helper.
# Two fixes compared with the previous version:
#   - "amostra" now reports X_train_bayes.shape[0]: the tuned model was fitted on
#     X_train_bayes (X_train minus the evaluation split), not on the full X_train;
#   - the description said "XGBMRegressor"; the estimator is xgboost's XGBRegressor.
# NOTE(review): X_train / y_train are still passed in full because the helper receives
# the multi-target frame together with variavel_alvo; confirm in Funcoes_Comuns whether
# it should instead receive only the rows the model was actually fitted on.
registrar_modelo(experimento=nome_experimento,
                 modelo=modelo_xgb_bayes,
                 parametros={**modelo_xgb_bayes.get_params(),
                             "amostra": X_train_bayes.shape[0],
                             "tempo": tempo_treino},
                 X_train=X_train,
                 y_train=y_train,
                 y_test=y_test,
                 y_pred=y_pred_bayes,
                 variavel_alvo='NUM_NOTA_CH',
                 nome_modelo='modelo_xgb_bayes',
                 descricao_modelo='Modelo XGBRegressor otimizado com BayesSearchCV')

Registered model 'modelo_xgb_bayes' already exists. Creating a new version of this model...
2025/06/07 21:49:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_xgb_bayes, version 7


🏃 View run rare-whale-765 at: http://127.0.0.1:9080/#/experiments/957135083854196683/runs/58ef78b2db43469295f535b7215e5ce3
🧪 View experiment at: http://127.0.0.1:9080/#/experiments/957135083854196683
Modelo registrado com sucesso no MLflow: modelo_xgb_bayes
Rastreamento do MLflow finalizado.


Created version '7' of model 'modelo_xgb_bayes'.


In [20]:
# Evaluate the tuned model on the NUM_NOTA_CH target: training split first,
# then the test split (reusing the predictions computed earlier).
y_pred_treino_bayes = modelo_xgb_bayes.predict(X_train)
avaliar_modelo(y_train['NUM_NOTA_CH'], y_pred_treino_bayes, "treino")
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred_bayes, "teste")

MAE (treino): 53.1003
RMSE (treino): 68.7682
R2 (treino): 0.3402
MAE (teste): 54.7667
RMSE (teste): 70.1497
R2 (teste): 0.3097


In [21]:
# Persist the tuned model to disk with joblib.
# A forward slash replaces the backslash: `\m` is an invalid escape sequence
# (a warning on recent Python versions) and made the path Windows-only.
joblib.dump(modelo_xgb_bayes, 'Modelos/modelo_xgb_bayes.pkl')

['Modelos\\modelo_xgb_bayes.pkl']