In [4]:
import os, gc, joblib, shap, optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, root_mean_squared_error

# ------------------------- CONFIGS GERAIS ------------------------------- #
DATA_DIR   = '../data/RESULTADOS/'
MODELS_DIR = '../models/'
os.makedirs(MODELS_DIR, exist_ok=True)


# ------------------------- FUNÇÕES AUXILIARES --------------------------- #
def carregar_dados(fp:str) -> pd.DataFrame:
    df = pd.read_csv(fp).sort_values(['Município','Ano']).reset_index(drop=True)
    df['Município'] = df['Município'].astype('category')
    return df


def criar_features(df:pd.DataFrame, cols_lag, n_lags=3, janelas=(2,3)):
    df = df.copy()
    for col in cols_lag:
        for lag in range(1, n_lags+1):
            # ADICIONE observed=True AQUI
            df[f'{col}_lag{lag}'] = df.groupby('Município', observed=True)[col].shift(lag)
            
        # ADICIONE observed=True AQUI
        df[f'{col}_growth'] = df.groupby('Município', observed=True)[col].pct_change()

        for w in janelas:
            # ADICIONE observed=True AQUI
            df[f'{col}_roll_mean_{w}'] = (
                df.groupby('Município', observed=True)[col].shift(1).rolling(w).mean()
            )
            
    # tendência linear simples (regressão nos últimos 3 valores)
    for col in cols_lag:
        # ADICIONE observed=True AQUI
        df[f'{col}_trend'] = (
            df.groupby('Município', observed=True)[col]
            .apply(lambda s: s.shift(1).rolling(3).apply(
                lambda x: np.polyfit(range(len(x)), x, 1)[0] if x.count()==3 else np.nan))
            .reset_index(level=0, drop=True)
        )
        
    df.replace([np.inf, -np.inf], 0, inplace=True)
    return df


def split_temporal(df, ano_train_fim=2020, ano_val=2021):
    mask_train = df['Ano'] <= ano_train_fim
    mask_val   = df['Ano'] == ano_val
    mask_test  = df['Ano'] >  ano_val          # 2022 em diante

    return (df[mask_train], df[mask_val], df[mask_test])


def treinar_modelo(df, alvo, feat_list, nome_modelo):
    df_f = criar_features(df, feat_list, n_lags=3)
    # target t+1
    df_f['target'] = df_f.groupby('Município')[alvo].shift(-1)
    df_f = df_f.dropna(subset=['target'])

    train, val, test = split_temporal(df_f)

    cols_to_drop = ['target', alvo]

    y_train, X_train = train['target'], train.drop(columns=cols_to_drop, errors='ignore')
    y_val,   X_val   = val  ['target'], val  .drop(columns=cols_to_drop, errors='ignore')
    y_test,  X_test  = test ['target'], test .drop(columns=cols_to_drop, errors='ignore')
    

    cat_cols = ['Município']
    for c in cat_cols:
        if c in X_train.columns:
            X_train[c] = X_train[c].cat.codes
            X_val[c]   = X_val[c].cat.codes
            X_test[c]  = X_test[c].cat.codes

    # --------- Hyper-parameter tuning rápido via Optuna ---------- #
    def objective(trial):
        params = {
            'objective':        'rmse',
            'metric':           'rmse',
            'boosting_type':    'gbdt',
            'learning_rate':    trial.suggest_float('lr', 0.01, 0.15, log=True),
            'num_leaves':       trial.suggest_int('leaves', 31, 256),
            'feature_fraction': trial.suggest_float('ff', 0.6, 1.0),
            'bagging_fraction': trial.suggest_float('bf', 0.6, 1.0),
            'bagging_freq':     trial.suggest_int('b_freq', 1, 7),
            'min_data_in_leaf': trial.suggest_int('min_leaf', 10, 100),
            'lambda_l1':        trial.suggest_float('l1', 0.0, 0.8),
            'lambda_l2':        trial.suggest_float('l2', 0.0, 0.8),
            'seed':             42,
            'verbose':          -1,
            'n_estimators':     5000,
        }
        mdl = lgb.LGBMRegressor(**params)
        mdl.fit(X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric='rmse',
                callbacks=[lgb.early_stopping(300, verbose=False)])
        pred = mdl.predict(X_val)
        return root_mean_squared_error(y_val, pred)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=5, show_progress_bar=False)

    print(f"Melhor RMSE-Val: {study.best_value:,.2f}")
    best_params = study.best_params | {
        'objective':'rmse', 'metric':'rmse', 'verbose':-1, 'n_estimators':5000, 'seed':42
    }
    model = lgb.LGBMRegressor(**best_params)
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(300, verbose=False)])

    # --------- Avaliação final no TESTE (2022) -------- #
    pred_test = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, pred_test)
    r2   = r2_score(y_test, pred_test)
    print(f'--> TEST RMSE: {rmse:,.2f} | R²: {r2:.3f}')

    # --------- Salvar artefatos -------- #
    m_path = os.path.join(MODELS_DIR, f'{nome_modelo}.joblib')
    c_path = os.path.join(MODELS_DIR, f'{nome_modelo}_cols.joblib')
    joblib.dump(model, m_path)
    joblib.dump(X_train.columns.tolist(), c_path)

    # --------- SHAP (debug) -------- #
    """ explainer = shap.TreeExplainer(model)
    shap_values = explainer(X_val.iloc[:1000])   # amostra p/ velocidade
    shap.summary_plot(shap_values, X_val.iloc[:1000], show=False, plot_type='bar')
    plt.title(f'Importância média das features – {nome_modelo}')
    plt.tight_layout(); plt.show() """

    return model


# ------------------------- EXECUÇÃO ------------------------------------- #
if __name__ == '__main__':
    df = carregar_dados(os.path.join(DATA_DIR, 'df_final2.csv'))

    # Definindo as listas de features para cada modelo
    FEATURES_PIB = [
        'VAB Agropecuária (R$ 1.000)', 'VAB Indústria (R$ 1.000)', 'VAB Serviços (R$ 1.000)',
        'Área plantada soja (ha)', 'Área plantada milho (ha)', 'Total Rebanho (Bovino)',
        'Desmatamento (km²)', 'Focos de Queimada', 'Total de Benefícios Básicos (Bolsa Família)'
    ]
    FEATURES_AGRO = [
        'Total Rebanho (Bovino)', 'Área plantada soja (ha)', 'Área plantada milho (ha)',
        'Desmatamento (km²)', 'Focos de Queimada', 'PIB per capita (R$)',
        'VAB Indústria (R$ 1.000)', 'VAB Serviços (R$ 1.000)', 'População'
    ]
    FEATURES_BENEFICIOS = [
        'PIB per capita (R$)', 'VAB Agropecuária (R$ 1.000)', 'VAB Indústria (R$ 1.000)',
        'VAB Serviços (R$ 1.000)', 'População', 'Focos de Queimada'
    ]
    FEATURES_RESP = [
        'Focos de Queimada', 'Desmatamento (km²)', 'VAB Indústria (R$ 1.000)',
        'População', 'PIB per capita (R$)'
    ]

    mapas = [
        # O segundo elemento da tupla agora é a lista de features que definimos
        ('PIB per capita (R$)',                        FEATURES_PIB, 'modelo_pib_pred'),
        ('VAB Agropecuária (R$ 1.000)',               FEATURES_AGRO, 'modelo_vab_pred'),
        ('Total de Benefícios Básicos (Bolsa Família)',FEATURES_BENEFICIOS,  'modelo_beneficios_pred'),
        ('Internações por Doenças Respiratórias',     FEATURES_RESP, 'modelo_respiratorio_pred')
    ]

    for alvo, feats, nome in mapas:
        print('\n' + '='*90)
        print(f'Treinando {nome.upper()}')
        # A função criar_features usará a lista 'feats' para criar lags/trends apenas dessas colunas
        # E também criará lags/trends para a coluna 'alvo', o que é o comportamento correto.
        # Para isso, precisamos adicionar o alvo na lista que é passada para criar_features
        features_completas_para_engenharia = feats + [alvo]
        
        # Chamada da função de treinamento
        treinar_modelo(df, alvo, features_completas_para_engenharia, nome)
        gc.collect()

    print('\nTODOS OS MODELOS AJUSTADOS E SALVOS EM:', MODELS_DIR)


Treinando MODELO_PIB_PRED


  df_f['target'] = df_f.groupby('Município')[alvo].shift(-1)
[I 2025-06-30 23:38:09,279] A new study created in memory with name: no-name-5794e74c-1fde-4ccc-a465-2a77cccd8693
[I 2025-06-30 23:38:10,828] Trial 0 finished with value: 23540.75855727015 and parameters: {'lr': 0.1488733133698014, 'leaves': 74, 'ff': 0.8732524942736, 'bf': 0.6482282193766254, 'b_freq': 3, 'min_leaf': 48, 'l1': 0.07663394651455341, 'l2': 0.3126859280402262}. Best is trial 0 with value: 23540.75855727015.
[I 2025-06-30 23:38:14,536] Trial 1 finished with value: 28649.963150427957 and parameters: {'lr': 0.022416688037230348, 'leaves': 64, 'ff': 0.8461492666728297, 'bf': 0.8620496769494523, 'b_freq': 7, 'min_leaf': 19, 'l1': 0.08757017439197759, 'l2': 0.21716517327586393}. Best is trial 0 with value: 23540.75855727015.
[I 2025-06-30 23:38:15,854] Trial 2 finished with value: 21586.826467396135 and parameters: {'lr': 0.10298964484914422, 'leaves': 243, 'ff': 0.9356622642369856, 'bf': 0.9664265383745303, 'b_freq':

Melhor RMSE-Val: 18,521.56
--> TEST RMSE: 40,581.25 | R²: 0.499

Treinando MODELO_VAB_PRED


  df_f['target'] = df_f.groupby('Município')[alvo].shift(-1)
[I 2025-06-30 23:38:27,443] A new study created in memory with name: no-name-1119b29e-a95d-4945-aef9-43f03893ccf8
[I 2025-06-30 23:38:28,517] Trial 0 finished with value: 55624.72796637206 and parameters: {'lr': 0.10930873997913104, 'leaves': 241, 'ff': 0.934310680262873, 'bf': 0.7531649959538647, 'b_freq': 5, 'min_leaf': 91, 'l1': 0.13119452940154516, 'l2': 0.4626919619196593}. Best is trial 0 with value: 55624.72796637206.
[I 2025-06-30 23:38:33,280] Trial 1 finished with value: 56416.75075227584 and parameters: {'lr': 0.011324418709833103, 'leaves': 193, 'ff': 0.8928850030156215, 'bf': 0.7647479164568101, 'b_freq': 2, 'min_leaf': 75, 'l1': 0.21752957484469607, 'l2': 0.3562280756998238}. Best is trial 0 with value: 55624.72796637206.
[I 2025-06-30 23:38:35,490] Trial 2 finished with value: 56081.47497866519 and parameters: {'lr': 0.06964640947973544, 'leaves': 245, 'ff': 0.6293166386145957, 'bf': 0.9552892317303077, 'b_freq

Melhor RMSE-Val: 33,205.77
--> TEST RMSE: 48,468.05 | R²: 0.929

Treinando MODELO_BENEFICIOS_PRED


  df_f['target'] = df_f.groupby('Município')[alvo].shift(-1)
[I 2025-06-30 23:39:13,208] A new study created in memory with name: no-name-2b557394-cb93-4f36-a59d-e7b307be80cd
[I 2025-06-30 23:39:16,221] Trial 0 finished with value: 2280.212350737156 and parameters: {'lr': 0.03206917423685568, 'leaves': 115, 'ff': 0.8148239446142356, 'bf': 0.6083612514682605, 'b_freq': 3, 'min_leaf': 76, 'l1': 0.19233585701216427, 'l2': 0.007460844677773082}. Best is trial 0 with value: 2280.212350737156.
[I 2025-06-30 23:39:27,105] Trial 1 finished with value: 2260.1567586241918 and parameters: {'lr': 0.011295098875007136, 'leaves': 180, 'ff': 0.6204434499001781, 'bf': 0.7020611397282419, 'b_freq': 3, 'min_leaf': 18, 'l1': 0.4830309948150272, 'l2': 0.5460147330964772}. Best is trial 1 with value: 2260.1567586241918.
[I 2025-06-30 23:39:30,990] Trial 2 finished with value: 2315.637738092719 and parameters: {'lr': 0.036152723615381306, 'leaves': 122, 'ff': 0.9834762446235632, 'bf': 0.6073516903942624, 'b

Melhor RMSE-Val: 2,260.16
--> TEST RMSE: 2,721.36 | R²: 0.944

Treinando MODELO_RESPIRATORIO_PRED


  df[f'{col}_growth'] = df.groupby('Município', observed=True)[col].pct_change()
  df_f['target'] = df_f.groupby('Município')[alvo].shift(-1)
[I 2025-06-30 23:39:50,972] A new study created in memory with name: no-name-2fdc7715-7950-4ba4-a154-fe6e4893917c
[I 2025-06-30 23:39:52,576] Trial 0 finished with value: 2690.4802580947876 and parameters: {'lr': 0.12304415220683701, 'leaves': 162, 'ff': 0.9021817844615538, 'bf': 0.9219971997557543, 'b_freq': 5, 'min_leaf': 13, 'l1': 0.4314492502563083, 'l2': 0.3388962493152651}. Best is trial 0 with value: 2690.4802580947876.
[I 2025-06-30 23:39:53,516] Trial 1 finished with value: 2826.5994985554344 and parameters: {'lr': 0.06245450924962415, 'leaves': 72, 'ff': 0.8483342378810226, 'bf': 0.621169262132399, 'b_freq': 2, 'min_leaf': 76, 'l1': 0.5714715548271926, 'l2': 0.4508659639619119}. Best is trial 0 with value: 2690.4802580947876.
[I 2025-06-30 23:40:02,463] Trial 2 finished with value: 3011.5525344605053 and parameters: {'lr': 0.01414490373

Melhor RMSE-Val: 2,690.48
--> TEST RMSE: 4,735.53 | R²: 0.858

TODOS OS MODELOS AJUSTADOS E SALVOS EM: ../models/
