### 1. Importações

In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

### 2. Leitura do DataFrame

In [20]:
# Load the pre-processed Manaus dataset; the path is resolved relative to the
# notebook's working directory.
caminho_dados = '../../../../data/pre-processado/por-municipio/completo/dados-manaus-preprocessado.csv'
df = pd.read_csv(caminho_dados)

In [21]:
# Keep only rows with 'ano' >= 2000 and discard the 'municipio' and 'uf' columns.
df = df.loc[df['ano'] >= 2000].drop(columns=['municipio', 'uf'])

# 'vazao' is the regression target; every remaining column is a feature.
y = df['vazao']
X = df.drop(columns='vazao')

### 3. Treinamento e Avaliação (Random Forest)

In [22]:
# --- Experiment configuration -------------------------------------------------
scaler = StandardScaler()
cv_folds = 3

# Random-forest hyper-parameter sets to evaluate.
parametros = [
    {"n_estimators": 50},
    {"n_estimators": 100},
]

# Feature-selection strategies under comparison; each one keeps 5 features.
metodos_selecao = {
    "SelectKBest": SelectKBest(score_func=f_regression, k=5),
    "RFE": RFE(estimator=LinearRegression(), n_features_to_select=5)
}

resultados = []

# The hold-out split depends neither on the hyper-parameters nor on the
# selection method, and random_state is fixed, so it is computed once here
# instead of being recomputed identically on every loop iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Grid: every hyper-parameter set x every feature-selection method ---------
for params in parametros:
    for nome_selecao, metodo_selecao in metodos_selecao.items():
        # Scaling and feature selection live inside the pipeline, so they are
        # fitted only on the data passed to fit() (training data / CV folds).
        pipeline = Pipeline([
            ("scale", scaler),
            ("selecao", metodo_selecao),
            ("modelo", RandomForestRegressor(**params, random_state=42))
        ])

        pipeline.fit(X_train, y_train)

        y_pred_test = pipeline.predict(X_test)
        y_pred_train = pipeline.predict(X_train)

        # Hold-out (test) metrics.
        mse_test = mean_squared_error(y_test, y_pred_test)
        mae_test = mean_absolute_error(y_test, y_pred_test)
        r2_test = r2_score(y_test, y_pred_test)

        # Training metrics, recorded so the train/test gap can be inspected.
        mse_train = mean_squared_error(y_train, y_pred_train)
        mae_train = mean_absolute_error(y_train, y_pred_train)
        r2_train = r2_score(y_train, y_pred_train)

        # Mean R² over `cv_folds` cross-validation folds, using the training
        # portion only (the test set is never seen here).
        r2_cv = cross_val_score(pipeline, X_train, y_train, cv=cv_folds, scoring="r2").mean()

        # One record per (hyper-parameter set, selection method) combination.
        resultados.append({
            "algoritmo": "RandomForestRegressor",
            "n_estimators": params["n_estimators"],
            "cv_folds": cv_folds,
            "variaveis": nome_selecao,
            "normalizacao": scaler.__class__.__name__,
            "mse_treino": mse_train,
            "mae_treino": mae_train,
            "r2_treino": r2_train,
            "mse_teste": mse_test,
            "mae_teste": mae_test,
            "r2_teste": r2_test,
            "r2_cv_medio": r2_cv
        })

# Build the results table in one go and persist it next to the notebook.
df_resultados = pd.DataFrame(resultados)
df_resultados.to_csv("resultados_randomforest.csv", index=False)

### 4. Exibição dos Resultados

In [23]:
# Show the collected metrics: one row per (n_estimators, feature-selection method) run.
df_resultados

Unnamed: 0,algoritmo,n_estimators,cv_folds,variaveis,normalizacao,mse_treino,mae_treino,r2_treino,mse_teste,mae_teste,r2_teste,r2_cv_medio
0,RandomForestRegressor,50,3,SelectKBest,StandardScaler,47588610.0,5202.413577,0.971561,300630600.0,12734.740425,0.815285,0.819869
1,RandomForestRegressor,50,3,RFE,StandardScaler,354205500.0,13405.210484,0.78833,1202997000.0,27894.873346,0.260848,0.288901
2,RandomForestRegressor,100,3,SelectKBest,StandardScaler,45982920.0,5162.425246,0.972521,300715600.0,12743.637951,0.815233,0.820688
3,RandomForestRegressor,100,3,RFE,StandardScaler,348601000.0,13321.098523,0.791679,1192108000.0,27777.242695,0.267538,0.29524
