In [1]:
import pandas as pd
import joblib
import time

from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

from Funcoes_Comuns import avaliar_modelo, registrar_modelo

#### 1. Recuperar base já pré-processada

In [2]:
# Load the already pre-processed ENEM 2023 dataset.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the previous backslash-separated literal only resolved on Windows.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df_enem = pd.read_pickle('Bases/Finais/enem_microdados_2023.pkl')

#### 2. Label Encoding
- O Label Encoding apenas converte variáveis categóricas em códigos numéricos; os valores resultantes não têm significado numérico (ordem ou magnitude).
- Processamento necessário para o modelo aplicado: Random Forest, baseado em árvores de decisão.

In [3]:
# Replace every categorical column of df_enem with the codes produced by a LabelEncoder.
# One encoder per column is kept in `label_encoders` (keyed by column name) for later use.
categorical_columns = df_enem.select_dtypes(include=['category']).columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}

for col, encoder in label_encoders.items():
    # Fit on the column and overwrite it with the encoded values.
    df_enem[col] = encoder.fit_transform(df_enem[col])

In [4]:
# Preview the first five rows after encoding.
df_enem.head(n=5)

Unnamed: 0,NUM_NOTA_CH,NUM_NOTA_CN,NUM_NOTA_LC,NUM_NOTA_MT,NUM_NOTA_REDACAO,BIN_Q001_DUMMY_H,BIN_Q002_DUMMY_H,BIN_Q018,BIN_Q020,BIN_Q021,...,NUM_Q011,NUM_Q012,NUM_Q013,NUM_Q014,NUM_Q015,NUM_Q016,NUM_Q017,NUM_Q019,NUM_Q022,NUM_Q024
0,508.5,459.0,507.2,466.7,880.0,False,False,False,False,False,...,0,1,0,0,0,0,0,1,3,0
1,379.2,402.5,446.9,338.3,560.0,False,False,False,False,False,...,0,1,0,0,0,0,0,1,1,0
2,667.6,608.2,607.9,691.9,780.0,False,False,False,False,False,...,0,1,0,1,0,0,0,1,1,1
3,553.1,515.7,544.4,437.0,880.0,False,False,False,False,False,...,0,1,0,0,0,0,0,1,3,0
4,576.3,523.8,596.5,628.1,600.0,False,False,False,False,False,...,0,1,0,1,0,0,0,1,1,0


In [5]:
# Target columns: the five NUM_NOTA_* columns.
variaveis_alvo = ['NUM_NOTA_MT', 'NUM_NOTA_LC', 'NUM_NOTA_CN', 'NUM_NOTA_CH', 'NUM_NOTA_REDACAO']

# Features are every column except the targets; 20% of the rows are held out for testing.
y = df_enem[variaveis_alvo]
X = df_enem.drop(columns=variaveis_alvo)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Type adjustment for MLflow: cast the integer feature columns of each split to float.
colunas_int_treino = X_train.select_dtypes('int').columns
X_train = X_train.astype(dict.fromkeys(colunas_int_treino, 'float'))

colunas_int_teste = X_test.select_dtypes('int').columns
X_test = X_test.astype(dict.fromkeys(colunas_int_teste, 'float'))

#### 3. Modelo Base Random Forest

In [6]:
# Number of leading training rows used to fit the baseline model.
# Author's note: up to 50k rows training is fast (about 8 minutes with 200 trees);
# the recorded run with 100k rows and 200 trees took roughly 48 minutes.
amostra = 100000

# Keep the size of the full training split as the LAST expression of the cell so the
# notebook actually displays it (a bare expression earlier in a cell is discarded).
len(X_train)

    Utilizar amostras menores para testes rápidos.
    Definir max_features como None deixa o treinamento mais lento.

In [7]:
# Hyperparameters of the baseline random forest; the dict is expanded into
# RandomForestRegressor(**parametros) when the model is built.
parametros = dict(
    n_estimators=200,
    max_depth=50,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    criterion='absolute_error',
    random_state=42,
    verbose=2,
    n_jobs=-1,
)

In [8]:
# Fit the baseline forest on the first `amostra` training rows (target: NUM_NOTA_CH)
# and keep the elapsed wall-clock time, in seconds, in `tempo_treino`.
modelo_random_forest = RandomForestRegressor(**parametros)

start_time = time.time()
modelo_random_forest.fit(
    X_train.iloc[:amostra],
    y_train['NUM_NOTA_CH'].iloc[:amostra],
)
tempo_treino = time.time() - start_time

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 200
building tree 2 of 200
building tree 3 of 200
building tree 4 of 200
building tree 5 of 200
building tree 6 of 200
building tree 7 of 200
building tree 8 of 200
building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200
building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  6.9min


building tree 34 of 200
building tree 35 of 200
building tree 36 of 200
building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
building tree 43 of 200
building tree 44 of 200
building tree 45 of 200
building tree 46 of 200
building tree 47 of 200
building tree 48 of 200
building tree 49 of 200
building tree 50 of 200
building tree 51 of 200
building tree 52 of 200
building tree 53 of 200
building tree 54 of 200
building tree 55 of 200
building tree 56 of 200
building tree 57 of 200
building tree 58 of 200
building tree 59 of 200
building tree 60 of 200
building tree 61 of 200
building tree 62 of 200
building tree 63 of 200
building tree 64 of 200
building tree 65 of 200
building tree 66 of 200
building tree 67 of 200
building tree 68 of 200
building tree 69 of 200
building tree 70 of 200
building tree 71 of 200
building tree 72 of 200
building tree 73 of 200
building tree 74 of 200
building tree 75

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 35.5min


building tree 155 of 200
building tree 156 of 200
building tree 157 of 200
building tree 158 of 200
building tree 159 of 200
building tree 160 of 200
building tree 161 of 200
building tree 162 of 200
building tree 163 of 200
building tree 164 of 200
building tree 165 of 200
building tree 166 of 200
building tree 167 of 200
building tree 168 of 200
building tree 169 of 200
building tree 170 of 200
building tree 171 of 200
building tree 172 of 200
building tree 173 of 200
building tree 174 of 200
building tree 175 of 200
building tree 176 of 200
building tree 177 of 200
building tree 178 of 200
building tree 179 of 200
building tree 180 of 200
building tree 181 of 200
building tree 182 of 200
building tree 183 of 200
building tree 184 of 200
building tree 185 of 200
building tree 186 of 200
building tree 187 of 200
building tree 188 of 200
building tree 189 of 200
building tree 190 of 200
building tree 191 of 200
building tree 192 of 200
building tree 193 of 200
building tree 194 of 200


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 47.6min finished


In [9]:
# Predict with the baseline model on the full held-out test features.
y_pred_rf = modelo_random_forest.predict(X=X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    3.9s finished


In [10]:
# Register the baseline run through the project's registrar_modelo helper
# (per the recorded output it reports to an MLflow tracking server).
nome_experimento = 'Notas CH ENEM 2023'

# Model hyperparameters plus the sample size and the measured training time.
parametros_registro = dict(parametros, amostra=amostra, tempo=tempo_treino)

registrar_modelo(
    experimento=nome_experimento,
    parametros=parametros_registro,
    X_train=X_train.iloc[:amostra],
    y_train=y_train.iloc[:amostra],
    y_test=y_test,
    y_pred=y_pred_rf,
    variavel_alvo='NUM_NOTA_CH',
    modelo=modelo_random_forest,
    nome_modelo='modelo_random_forest',
    descricao_modelo='Modelo Random Forest',
)

Erro ao registrar o modelo no MLflow: API request to http://127.0.0.1:9080/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=9080): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Notas+CH+ENEM+2023 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000251D9023890>: Failed to establish a new connection: [WinError 10061] Nenhuma conexão pôde ser feita porque a máquina de destino as recusou ativamente'))
Rastreamento do MLflow finalizado.


In [11]:
# Training-set metrics: score the baseline on the same rows it was fitted on.
y_treino_real = y_train['NUM_NOTA_CH'].iloc[:amostra]
y_treino_pred = modelo_random_forest.predict(X_train.iloc[:amostra])
avaliar_modelo(y_treino_real, y_treino_pred, "treino")

# Test-set metrics, reusing the predictions already stored in y_pred_rf.
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred_rf, "teste")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    2.0s


MAE (treino): 37.5994
RMSE (treino): 49.3668
R2 (treino): 0.6579
MAE (teste): 55.8744
RMSE (teste): 70.7671
R2 (teste): 0.2975


[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    2.6s finished


In [12]:
# Persist the baseline model to disk with joblib.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the previous backslash-separated literal only pointed into Modelos/ on Windows.
joblib.dump(modelo_random_forest, 'Modelos/modelo_random_forest_base.pkl')

['Modelos\\modelo_random_forest_base.pkl']

#### 4. Aplicar Otimização com Random Search

In [13]:
# Number of leading training rows used for the random search and the tuned model.
# Author's note: up to 10k rows the processing is fast (approximately 20 minutes).
amostra = 10000

# Keep the size of the full training split as the LAST expression of the cell so the
# notebook actually displays it (a bare expression earlier in a cell is discarded).
len(X_train)

In [14]:
# Candidate values sampled by the randomized hyperparameter search.
param_distributions = dict(
    n_estimators=[100, 300, 500],         # number of trees
    max_depth=range(50, 100, 10),         # maximum tree depth: 50, 60, 70, 80, 90
    min_samples_split=[2, 5, 10],         # minimum samples required to split a node
    min_samples_leaf=[1, 2, 5],           # minimum samples required in a leaf
    max_features=['sqrt', 'log2'],        # features considered at each split
    bootstrap=[True],                     # bootstrap sampling (only True is searched)
    criterion=['absolute_error'],         # split quality criterion
    ccp_alpha=[0.2, 0.3436, 0.4],         # cost-complexity pruning parameter
)

In [15]:
# Configure RandomizedSearchCV, which evaluates random hyperparameter combinations
# drawn from param_distributions.

n_iter = 5  # number of combinations to evaluate
kf = KFold(n_splits=4, shuffle=True, random_state=42)  # shuffled 4-fold cross-validation

estimador_base = RandomForestRegressor(random_state=42)

random_search = RandomizedSearchCV(
    estimator=estimador_base,
    param_distributions=param_distributions,
    n_iter=n_iter,
    scoring='neg_mean_absolute_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [16]:
# Run the random search on the first `amostra` training rows (target: NUM_NOTA_CH)
# and report how long it took, in seconds.
start_time_search = time.time()
random_search.fit(
    X_train.iloc[:amostra],
    y_train['NUM_NOTA_CH'].iloc[:amostra],
)
tempo_search = time.time() - start_time_search

print(f"Tempo de execução do Random Search: {tempo_search:.2f} segundos")

Fitting 4 folds for each of 5 candidates, totalling 20 fits
Tempo de execução do Random Search: 637.11 segundos


In [17]:
# Use the best hyperparameters found by the random search.
# Fallback: when the search object does not exist in this session (NameError) or has
# not been fitted yet (no `best_params_` attribute -> AttributeError), reuse the values
# obtained in a previous run.
# Only these two situations are handled on purpose: the former bare `except:` also
# swallowed KeyboardInterrupt/SystemExit and any unrelated error.
try:
    best_params = random_search.best_params_
except (NameError, AttributeError):
    best_params = {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 60, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'bootstrap': True}
    print(f"Erro ao obter melhores parâmetros, usando valores calculados anteriormente:\n {best_params}")
else:
    print("Melhores parâmetros encontrados: ", best_params)

Melhores parâmetros encontrados:  {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 60, 'criterion': 'absolute_error', 'ccp_alpha': 0.2, 'bootstrap': True}


In [18]:
# Fit a new forest with the selected hyperparameters on the first `amostra` training
# rows (target: NUM_NOTA_CH); the elapsed time in seconds is stored in `tempo_treino`.
modelo_random_forest_random_search = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
    **best_params,
)

start_time = time.time()
modelo_random_forest_random_search.fit(
    X_train.iloc[:amostra],
    y_train['NUM_NOTA_CH'].iloc[:amostra],
)
tempo_treino = time.time() - start_time

In [19]:
# Predict with the tuned model on the full held-out test features.
y_pred_rf_rs = modelo_random_forest_random_search.predict(X=X_test)

In [20]:
# Register the tuned run through the project's registrar_modelo helper
# (per the recorded output it reports to an MLflow tracking server).
nome_experimento = 'Notas CH ENEM 2023'

# Selected hyperparameters plus the sample size and the measured training time.
parametros_registro_rs = dict(best_params, amostra=amostra, tempo=tempo_treino)

registrar_modelo(
    experimento=nome_experimento,
    parametros=parametros_registro_rs,
    X_train=X_train.iloc[:amostra],
    y_train=y_train.iloc[:amostra],
    y_test=y_test,
    y_pred=y_pred_rf_rs,
    variavel_alvo='NUM_NOTA_CH',
    modelo=modelo_random_forest_random_search,
    nome_modelo='modelo_random_forest_random_search',
    descricao_modelo='Modelo Random Forest com RandomizedSearchCV',
)

Erro ao registrar o modelo no MLflow: API request to http://127.0.0.1:9080/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=9080): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Notas+CH+ENEM+2023 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000251D9018F50>: Failed to establish a new connection: [WinError 10061] Nenhuma conexão pôde ser feita porque a máquina de destino as recusou ativamente'))
Rastreamento do MLflow finalizado.


In [21]:
# Training-set metrics: score the tuned model on the same rows it was fitted on.
y_treino_real_rs = y_train['NUM_NOTA_CH'].iloc[:amostra]
y_treino_pred_rs = modelo_random_forest_random_search.predict(X_train.iloc[:amostra])
avaliar_modelo(y_treino_real_rs, y_treino_pred_rs, "treino")

# Test-set metrics, reusing the predictions already stored in y_pred_rf_rs.
avaliar_modelo(y_test['NUM_NOTA_CH'], y_pred_rf_rs, "teste")

MAE (treino): 58.1375
RMSE (treino): 73.7055
R2 (treino): 0.2503
MAE (teste): 57.8502
RMSE (teste): 73.3496
R2 (teste): 0.2453


In [22]:
# Persist the tuned model to disk with joblib.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the previous backslash-separated literal only pointed into Modelos/ on Windows.
joblib.dump(modelo_random_forest_random_search, 'Modelos/modelo_random_forest_random_search.pkl')

['Modelos\\modelo_random_forest_random_search.pkl']