In [2]:
import pandas as pd
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

import lightgbm as lgbm

In [3]:
# Iniciar o servidor de rastreamento do MLflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:9080")

Recuperar base já pré-processada

In [4]:
# Obter dados
df_enem = pd.read_pickle('Bases\MICRODADOS_ENEM_2023_tratados.pkl')

In [5]:
variaveis_alvo = ['NUM_NOTA_MT', 'NUM_NOTA_LC', 'NUM_NOTA_CN', 'NUM_NOTA_CH', 'NUM_NOTA_REDACAO']

# separar em treino e teste
X = df_enem.drop(columns=variaveis_alvo)
y = df_enem[variaveis_alvo]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Ajuste de tipo para MLflow
# Converter colunas inteiras para float
X_train = X_train.astype({col: 'float' for col in X_train.select_dtypes('int').columns})
X_test = X_test.astype({col: 'float' for col in X_test.select_dtypes('int').columns})

In [7]:
categorical_features = X_train.select_dtypes(include=['category']).columns.tolist()

categorical_features

['CAT_COR_RACA',
 'CAT_CO_MUNICIPIO_ESC',
 'CAT_CO_UF_ESC',
 'CAT_DEPENDENCIA_ADM_ESC',
 'CAT_ENSINO',
 'CAT_ESCOLA',
 'CAT_ESTADO_CIVIL',
 'CAT_FAIXA_ETARIA',
 'CAT_LINGUA',
 'CAT_LOCALIZACAO_ESC',
 'CAT_NACIONALIDADE',
 'CAT_Q003',
 'CAT_Q004',
 'CAT_SEXO',
 'CAT_SIT_FUNC_ESC']

In [8]:
# Treinar modelo LGBMRegressor Base
modelo_lgbm = lgbm.LGBMRegressor(n_estimators=1000, 
                                 learning_rate=0.01, 
                                 random_state=42,
                                 max_bin=4095,
                                 force_row_wise=True)

modelo_lgbm.fit(X_train, 
                y_train['NUM_NOTA_CH'], 
                eval_set=[(X_test, y_test['NUM_NOTA_CH'])], 
                eval_metric=['r2', 'rmse', 'mae'],
                categorical_feature=categorical_features)


[LightGBM] [Info] Total Bins 4244
[LightGBM] [Info] Number of data points in the train set: 573256, number of used features: 40
[LightGBM] [Info] Start training from score 527.936960


In [9]:
# Criar experimento no MLflow
mlflow.set_experiment('Notas CH ENEM 2023')

<Experiment: artifact_location='mlflow-artifacts:/299918284299748162', creation_time=1746134789450, experiment_id='299918284299748162', last_update_time=1746134789450, lifecycle_stage='active', name='Notas CH ENEM 2023', tags={}>

In [10]:
# Previsões
y_pred = modelo_lgbm.predict(X_test)

In [11]:
# Iniciar o rastreamento do MLflow
with mlflow.start_run() as run:

    # Registrar os parâmetros
    mlflow.log_param("n_estimators", 1000)
    mlflow.log_param("learning_rate", 0.01)
    mlflow.log_param("max_bin", 4095)
    mlflow.log_param("force_row_wise", True)
    mlflow.log_param("random_state", 42)
    
    # Registrar as métricas
    r2 = r2_score(y_test['NUM_NOTA_CH'], y_pred)
    mae = mean_absolute_error(y_test['NUM_NOTA_CH'], y_pred)
    rmse = root_mean_squared_error(y_test['NUM_NOTA_CH'], y_pred)

    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("rmse", rmse)

    # Definir uma TAG para o modelo
    mlflow.set_tag("model_type", "LGBMRegressor")

    # Inferir assinatura do modelo
    signature = mlflow.models.infer_signature(X_train, y_train['NUM_NOTA_CH'])

    # Registrar modelo
    mlflow.sklearn.log_model(sk_model=modelo_lgbm, 
                             artifact_path="modelo_lgbm_base", 
                             signature=signature,
                             registered_model_name="modelo_lgbm_base")


Registered model 'modelo_lgbm_base' already exists. Creating a new version of this model...
2025/05/01 18:30:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_lgbm_base, version 6
Created version '6' of model 'modelo_lgbm_base'.


🏃 View run salty-perch-318 at: http://127.0.0.1:9080/#/experiments/299918284299748162/runs/594818e9857b4bffaf77704261a47bb8
🧪 View experiment at: http://127.0.0.1:9080/#/experiments/299918284299748162


In [12]:
modelo_lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.01,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'max_bin': 4095,
 'force_row_wise': True}

In [20]:
# Função para calcular as métricas
def avaliar_modelo(y_true, y_pred, grupo):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"MAE ({grupo}): {mae:.2f}")
    print(f"RMSE ({grupo}): {rmse:.2f}")
    print(f"R2 ({grupo}): {r2:.2f}")

In [14]:
# Avaliação grupo treino
avaliar_modelo(y_train['NUM_NOTA_CH'], modelo_lgbm.predict(X_train), "treino")

# Avaliação grupo teste
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred, "teste")

MAE (treino): 53.68
RMSE (treino): 68.10
R2 (treino): 0.35
MAE (teste): 55.34
RMSE (teste): 70.14
R2 (teste): 0.31


In [None]:
# Salvar o modelo otimizado como um arquivo pickle
import joblib
joblib.dump(modelo_lgbm, 'Projeto\Modelos\modelo_lgbm_base.pkl')

['Modelos\\modelo_lgbm_base.pkl']

Bayes Search

In [1]:
from skopt import BayesSearchCV
import time
from lightgbm import early_stopping

In [31]:
modelo_lgbm_bayes = lgbm.LGBMRegressor(random_state=42, 
                                       max_bin=4095, 
                                       force_row_wise=True)

In [33]:
# Definição do espaço de busca para otimização bayesiana
param_grid = {
    'num_leaves': (5, 60),                         # Número de folhas na árvore de decisão
    'max_depth': (40, 100),                        # Profundidade máxima da árvore
    'learning_rate': (0.005, 0.1, 'log-uniform'),  # Taxa de aprendizado
    'n_estimators': (2000, 5000),                  # Número de árvores
    'subsample': (0.3, 1.0),                       # Proporção de amostras usadas em cada árvore
    'colsample_bytree': (0.2, 1.0),                # Fração de colunas a serem usadas por árvore
    'reg_alpha': (1e-3, 1.0, 'log-uniform'),       # Regularização L1
    'reg_lambda': (1e-5, 1.0, 'log-uniform'),      # Regularização L2
}

In [35]:
# Configurar a busca Bayesiana usando BayesSearchCV

# Criando o otimizador Bayesiano
bayes_search = BayesSearchCV(
    estimator=modelo_lgbm_bayes,    # Modelo a ser otimizado
    search_spaces=param_grid,       # Espaço de busca definido acima
    scoring='r2',                   # Critério de seleção
    n_iter=30,                      # Número de avaliações do modelo
    cv=5,                           # Validação cruzada
    random_state=42,                # Semente para reprodutibilidade
    n_jobs=-1,                      # Paralelização total dos cálculos
    verbose=1                       # 0 = sem mensagens, 1 = mensagens de progresso, 2 = mensagens detalhadas
)

In [36]:
# Criar Eval Set para validação cruzada (15% do conjunto de treino)
X_train_bayes, X_eval, y_train_bayes, y_eval = train_test_split(
    X_train,
    y_train['NUM_NOTA_CH'],
    test_size=0.15,
    random_state=42
)

In [37]:
fit_params = {
    'eval_set': [(X_eval, y_eval)],        # Conjunto de validação
    'eval_metric': ['r2', 'rmse', 'mae'],  # Métricas a serem avaliadas
    'early_stopping_rounds': 200,          # Parar se não houver melhoria por n iterações
    'verbose': 1,                          # Não exibir mensagens durante o ajuste
}

In [38]:
# Executar a busca Bayesiana

start_time = time.time()
bayes_search.fit(X_train_bayes, y_train_bayes)

# Parar o cronômetro
end_time = time.time()
elapsed_time = end_time - start_time

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [39]:
# Resultados da busca Bayesiana

print("Melhores parâmetros encontrados:")
print(bayes_search.best_params_)
print("R2: ", bayes_search.best_score_)
print(f"Tempo total de execução: {elapsed_time:.2f} segundos")

Melhores parâmetros encontrados:
OrderedDict([('colsample_bytree', 0.319934457252814), ('learning_rate', 0.005), ('max_depth', 54), ('n_estimators', 5000), ('num_leaves', 49), ('reg_alpha', 0.15096028361393093), ('reg_lambda', 0.020875891601585508), ('subsample', 0.31695966954573485)])
R2:  0.319378834529559
Tempo total de execução: 17137.04 segundos


In [40]:
# Treinar o modelo com os melhores parâmetros encontrados
modelo_lgbm_bayes.set_params(**bayes_search.best_params_)

# Treinamento do modelo com os melhores parâmetros encontrados
modelo_lgbm_bayes.fit(X_train_bayes, 
                      y_train_bayes, 
                      eval_set=[(X_eval, y_eval)], 
                      eval_metric=['r2', 'rmse', 'mae'],
                      categorical_feature=categorical_features,
                      callbacks=[early_stopping(stopping_rounds=200)])

[LightGBM] [Info] Total Bins 4289
[LightGBM] [Info] Number of data points in the train set: 487267, number of used features: 40
[LightGBM] [Info] Start training from score 527.961360
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[4999]	valid_0's rmse: 69.8985	valid_0's l1: 55.1119	valid_0's l2: 4885.8


In [41]:
# Previsões
y_pred_bayes = modelo_lgbm_bayes.predict(X_test)

In [42]:
# Iniciar o rastreamento do MLflow
with mlflow.start_run() as run:

    # Registrar os parâmetros
    for param, value in bayes_search.best_params_.items():
        mlflow.log_param(param, value)

    # Registrar as métricas
    r2_bayes = r2_score(y_test['NUM_NOTA_CH'], y_pred_bayes)
    mae_bayes = mean_absolute_error(y_test['NUM_NOTA_CH'], y_pred_bayes)
    rmse_bayes = root_mean_squared_error(y_test['NUM_NOTA_CH'], y_pred_bayes)

    mlflow.log_metric("r2", r2_bayes)
    mlflow.log_metric("mae", mae_bayes)
    mlflow.log_metric("rmse", rmse_bayes)

    # Definir uma TAG para o modelo
    mlflow.set_tag("model_type", "LGBMRegressor - BayesSearchCV")

    # Inferir assinatura do modelo
    signature = mlflow.models.infer_signature(X_train, y_train['NUM_NOTA_CH'])

    # Registrar modelo
    mlflow.sklearn.log_model(sk_model=modelo_lgbm_bayes, 
                             artifact_path="modelo_lgbm_bayes", 
                             signature=signature,
                             registered_model_name="modelo_lgbm_bayes")

Registered model 'modelo_lgbm_bayes' already exists. Creating a new version of this model...
2025/05/02 18:05:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_lgbm_bayes, version 6


🏃 View run magnificent-bear-137 at: http://127.0.0.1:9080/#/experiments/0/runs/f49e8847e1924e2a862e44ae548127f2
🧪 View experiment at: http://127.0.0.1:9080/#/experiments/0


Created version '6' of model 'modelo_lgbm_bayes'.


In [43]:
# Avaliação grupo treino
avaliar_modelo(y_train['NUM_NOTA_CH'], modelo_lgbm_bayes.predict(X_train), "treino")

# Avaliação grupo teste
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred_bayes, "teste")

MAE (treino): 53.50
RMSE (treino): 67.91
R2 (treino): 0.36
MAE (teste): 55.02
RMSE (teste): 69.80
R2 (teste): 0.32


In [None]:
# Salvar o modelo otimizado como um arquivo pickle
import joblib
joblib.dump(modelo_lgbm_bayes, 'Projeto\Modelos\modelo_lgbm_bayes.pkl')

['Modelos\\modelo_lgbm_bayes.pkl']