In [1]:
import pandas as pd
import joblib
import time

from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV  # Bayesian optimization: utilizado para optimizar hiperpar√°metros

import lightgbm as lgbm
from lightgbm import early_stopping  # Early stopping: utilizado para evitar sobreajuste

from Funcoes_Comuns import avaliar_modelo, registrar_modelo

In [None]:
# Obter dados socioecon√¥micos e notas do ENEM 2023
df_enem = pd.read_pickle('Bases\Finais\dados_socioeconomicos_2023.pkl')

# Remover linhas com valores n√£o explicativos
df_enem = df_enem[
    (df_enem['CAT_NACIONALIDADE'] != 0) &
    (df_enem['CAT_COR_RACA'] != 0) &
    (df_enem['CAT_ESTADO_CIVIL'] != 0)
]

#Variaveis alvo
variaveis_alvo = ['NUM_NOTA_MT', 'NUM_NOTA_LC', 'NUM_NOTA_CN', 'NUM_NOTA_CH', 'NUM_NOTA_REDACAO']

# separar em treino e teste
X = df_enem.drop(columns=variaveis_alvo)
y = df_enem[variaveis_alvo]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ajuste de tipo para MLflow -> Converter colunas inteiras para float
X_train = X_train.astype({col: 'float' for col in X_train.select_dtypes('int').columns})
X_test = X_test.astype({col: 'float' for col in X_test.select_dtypes('int').columns})

# Obter colunas categ√≥ricas
categorical_features = X_train.select_dtypes(include=['category']).columns.tolist()

# Criar Eval Set para valida√ß√£o cruzada (15% do conjunto de treino)
# Apenas utilizado nos modelos fianais, BayesSearchCV n√£o utiliza Eval Set j√° possui validacao cruzada interna
X_train_final, X_eval, y_train_final, y_eval = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=42
)

(716493, 38)


Modelo Base

In [8]:
# Treinar modelo LGBMRegressor Base
modelo_lgbm = lgbm.LGBMRegressor(n_estimators=1000, 
                                 learning_rate=0.01, 
                                 random_state=42,
                                 max_bin=4095,
                                 force_row_wise=True)

start_time = time.time()

modelo_lgbm.fit(X_train_final, 
                y_train_final['NUM_NOTA_CH'], 
                eval_set=[(X_test, y_test['NUM_NOTA_CH'])], 
                eval_metric=['r2', 'rmse', 'mae'],
                categorical_feature=categorical_features)

tempo_treino = time.time() - start_time

[LightGBM] [Info] Total Bins 188
[LightGBM] [Info] Number of data points in the train set: 465039, number of used features: 33
[LightGBM] [Info] Start training from score 528.371987


In [9]:
# Previs√µes
y_pred = modelo_lgbm.predict(X_test)

In [10]:
nome_experimento = 'Socioeconomicos CH 2023'

registrar_modelo(experimento=nome_experimento,
                 parametros={**modelo_lgbm.get_params(), "amostra": X_train.shape[0], "tempo": tempo_treino},
                 X_train=X_train,
                 y_train=y_train,
                 y_test=y_test,
                 y_pred=y_pred,
                 variavel_alvo='NUM_NOTA_CH',
                 modelo=modelo_lgbm,
                 nome_modelo='modelo_lgbm_base_socioeconomicos_2023',
                 descricao_modelo='Modelo LGBMRegressor base para dados socioeconomicos CH',)

2025/06/11 13:03:41 INFO mlflow.tracking.fluent: Experiment with name 'Socioeconomicos CH 2023' does not exist. Creating a new experiment.
Successfully registered model 'modelo_lgbm_base_socioeconomicos_2023'.
2025/06/11 13:04:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_lgbm_base_socioeconomicos_2023, version 1


üèÉ View run dashing-asp-648 at: http://127.0.0.1:9080/#/experiments/419068002121547902/runs/01e5198fa23f4b29a5f751c4849d2158
üß™ View experiment at: http://127.0.0.1:9080/#/experiments/419068002121547902
Modelo registrado com sucesso no MLflow: modelo_lgbm_base_socioeconomicos_2023
Rastreamento do MLflow finalizado.


Created version '1' of model 'modelo_lgbm_base_socioeconomicos_2023'.


In [11]:
# Avalia√ß√£o grupo treino
avaliar_modelo(y_train['NUM_NOTA_CH'], modelo_lgbm.predict(X_train), "treino")

# Avalia√ß√£o grupo teste
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred, "teste")

MAE (treino): 56.6110
RMSE (treino): 71.6664
R2 (treino): 0.2835
MAE (teste): 56.5271
RMSE (teste): 71.4176
R2 (teste): 0.2776


Bayes Search

In [12]:
modelo_lgbm_bayes = lgbm.LGBMRegressor(random_state=42,
                                       max_bin=4095, 
                                       force_row_wise=True)

In [13]:
# Defini√ß√£o do espa√ßo de busca para otimiza√ß√£o bayesiana
param_grid = {
    'num_leaves': (5, 60),                         # N√∫mero de folhas na √°rvore de decis√£o
    'max_depth': (60, 120),                        # Profundidade m√°xima da √°rvore
    'learning_rate': (0.001, 0.01, 'log-uniform'), # Taxa de aprendizado
    'n_estimators': (5000, 8000),                  # N√∫mero de √°rvores
    'subsample': (0.1, 0.9),                       # Propor√ß√£o de amostras usadas em cada √°rvore
    'colsample_bytree': (0.1, 0.9),                # Fra√ß√£o de colunas a serem usadas por √°rvore
    'reg_alpha': (1e-3, 1.0, 'log-uniform'),       # Regulariza√ß√£o L1
    'reg_lambda': (1e-7, 1e-2, 'log-uniform'),     # Regulariza√ß√£o L2
}

In [14]:
# Configurar a busca Bayesiana usando BayesSearchCV

# Criando o otimizador Bayesiano
bayes_search = BayesSearchCV(
    estimator=modelo_lgbm_bayes,    # Modelo a ser otimizado
    search_spaces=param_grid,       # Espa√ßo de busca definido acima
    scoring='r2',                   # Crit√©rio de sele√ß√£o
    n_iter=5,                       # N√∫mero de avalia√ß√µes do modelo
    cv=5,                           # Valida√ß√£o cruzada
    random_state=42,                # Semente para reprodutibilidade
    n_jobs=-1,                      # Paraleliza√ß√£o total dos c√°lculos
    verbose=1                       # 0 = sem mensagens, 1 = mensagens de progresso, 2 = mensagens detalhadas
)

In [15]:
fit_params = {
    'eval_metric': ['r2', 'rmse', 'mae'],              # M√©tricas a serem avaliadas
    'categorical_feature': categorical_features,       # Colunas categ√≥ricas
}

In [17]:
# Executar a busca Bayesiana

start_time = time.time()
bayes_search.fit(X_train, y_train['NUM_NOTA_CH'], **fit_params)

# Parar o cron√¥metro
end_time = time.time()
elapsed_time = end_time - start_time

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] Total Bins 188
[LightGBM] [Info] Number of data points in the train set: 547105, number of used features: 33
[LightGBM] [Info] Start training from score 528.385000


In [None]:
# Melhores par√¢metros encontrados
try:
    melhores_parametros = bayes_search.best_params_
    print(f"Melhores par√¢metros: {melhores_parametros}")
    print("R2: ", bayes_search.best_score_)
    print(f"Tempo total de execu√ß√£o: {elapsed_time:.2f} segundos")
except:
    melhores_parametros = {'colsample_bytree': 0.42808316708265115, 'learning_rate': 0.0053422688874711095, 'max_depth': 116, 'n_estimators': 5947, 'num_leaves': 42, 'reg_alpha': 0.017472534129202036, 'reg_lambda': 5.684034097210144e-06, 'subsample': 0.6916033873523364}
    print(f"Erro ao obter melhores par√¢metros, usando valores calculados anteriormente:\n {melhores_parametros}")

Melhores par√¢metros: OrderedDict([('colsample_bytree', 0.42808316708265115), ('learning_rate', 0.0053422688874711095), ('max_depth', 116), ('n_estimators', 5947), ('num_leaves', 42), ('reg_alpha', 0.017472534129202036), ('reg_lambda', 5.684034097210144e-06), ('subsample', 0.6916033873523364)])
R2:  0.28127374549224193
Tempo total de execu√ß√£o: 22909.22 segundos


In [19]:
# Treinar o modelo com os melhores par√¢metros encontrados
modelo_lgbm_bayes.set_params(**melhores_parametros)

start_time = time.time()

# Treinamento do modelo com os melhores par√¢metros encontrados
modelo_lgbm_bayes.fit(X_train_final, 
                      y_train_final['NUM_NOTA_CH'], 
                      eval_set=[(X_eval, y_eval['NUM_NOTA_CH'])], 
                      eval_metric=['r2', 'rmse', 'mae'],
                      categorical_feature=categorical_features,
                      callbacks=[early_stopping(stopping_rounds=200)])

tempo_treino = time.time() - start_time

[LightGBM] [Info] Total Bins 188
[LightGBM] [Info] Number of data points in the train set: 465039, number of used features: 33
[LightGBM] [Info] Start training from score 528.371987
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[5945]	valid_0's rmse: 71.7137	valid_0's l1: 56.6025	valid_0's l2: 5142.85


In [20]:
# Previs√µes
y_pred_bayes = modelo_lgbm_bayes.predict(X_test)

In [21]:
nome_experimento = 'Socioeconomicos CH 2023'

registrar_modelo(experimento=nome_experimento,
                    modelo=modelo_lgbm_bayes,
                    parametros={**modelo_lgbm_bayes.get_params(), "amostra": X_train.shape[0], "tempo": tempo_treino},
                    X_train=X_train,
                    y_train=y_train,
                    y_test=y_test,
                    y_pred=y_pred_bayes,
                    variavel_alvo='NUM_NOTA_CH',
                    nome_modelo='modelo_lgbm_bayes_socioeconomicos_2023',
                    descricao_modelo='Modelo LGBMRegressor otimizado com BayesSearchCV para dados socioeconomicos CH',)

Successfully registered model 'modelo_lgbm_bayes_socioeconomicos_2023'.
2025/06/11 19:48:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_lgbm_bayes_socioeconomicos_2023, version 1


üèÉ View run intelligent-sponge-3 at: http://127.0.0.1:9080/#/experiments/419068002121547902/runs/1b23ada2f66a42d0b39bc5abbb5f1290
üß™ View experiment at: http://127.0.0.1:9080/#/experiments/419068002121547902
Modelo registrado com sucesso no MLflow: modelo_lgbm_bayes_socioeconomicos_2023
Rastreamento do MLflow finalizado.


Created version '1' of model 'modelo_lgbm_bayes_socioeconomicos_2023'.


In [22]:
# Avalia√ß√£o grupo treino
avaliar_modelo(y_train['NUM_NOTA_CH'], modelo_lgbm_bayes.predict(X_train), "treino")

# Avalia√ß√£o grupo teste
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred_bayes, "teste")

MAE (treino): 56.1135
RMSE (treino): 71.0638
R2 (treino): 0.2955
MAE (teste): 56.3995
RMSE (teste): 71.2942
R2 (teste): 0.2801


In [23]:
# Salvar modelo como Pickle
joblib.dump(modelo_lgbm_bayes, 'modelos/modelo_lgbm_bayes_socioeconomico.pkl')

['modelos/modelo_lgbm_bayes_socioeconomico.pkl']