# Inicializando


In [None]:
# Notebook shell escape: install the third-party packages listed below with pip.
# NOTE(review): tensorflow and scikit-learn are imported later in this notebook
# but are not in this list — presumably preinstalled in the runtime; confirm.
!pip install pytorch-forecasting lightning numpy pandas pyarrow matplotlib tqdm_joblib

In [None]:
# Mount Google Drive under /content/drive so the project paths used below
# (which live under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
import json
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback

# --- Path and constant configuration ---
BASE_DIR = "/content/drive/MyDrive/lstm_projeto"
DATA_PATH = os.path.join(BASE_DIR, "data", "final_training_data.parquet")

# Checkpoint artifacts: the model file plus a sidecar text file whose name is
# derived from the model file name.
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, "model_checkpoint.keras")
EPOCH_TRACK_PATH = CHECKPOINT_PATH.replace(".keras", "_epoch.txt")

SCALER_DIR = os.path.join(BASE_DIR, "scalers")

# Create the output directories on Drive when they do not exist yet.
for _required_dir in (CHECKPOINT_DIR, SCALER_DIR):
    os.makedirs(_required_dir, exist_ok=True)

print("Paths configurados com sucesso.")

# Treinando IA

In [None]:
def create_sequences_multi_step(data: np.ndarray, seq_len: int, horizon: int):
    """Build sliding-window samples for multi-step forecasting.

    Sample ``i`` pairs rows ``i .. i + seq_len - 1`` of ``data`` (all columns)
    with the following ``horizon`` values of column 0, which is the target
    ('numero_casos').

    Args:
        data: 2-D array of shape ``(timesteps, features)``.
        seq_len: number of rows in each input window.
        horizon: number of future target values per sample.

    Returns:
        ``(X, y)`` as float32 arrays of shape
        ``(num_samples, seq_len, features)`` and ``(num_samples, horizon)``.
        When the series is too short for even one sample, ``num_samples`` is 0
        but the trailing dimensions are kept, so callers can rely on the rank
        of the result in every case.
    """
    data = np.asarray(data)
    # Clamp so a too-short series yields correctly shaped empty arrays instead
    # of the 1-D ``np.array([])`` the previous implementation returned.
    num_samples = max(data.shape[0] - seq_len - horizon + 1, 0)

    # Column vector of window start offsets; broadcasting against a row of
    # in-window offsets yields every row index of every window at once.
    starts = np.arange(num_samples)[:, None]
    X = data[starts + np.arange(seq_len)].astype(np.float32)
    y = data[starts + seq_len + np.arange(horizon), 0].astype(np.float32)
    return X, y
    
    
class PersistentEarlyStoppingCityVal(Callback):
    """Early stopping on ``city_val_loss`` whose counters survive restarts.

    The patience counter (``wait``) and the best loss seen (``best``) are
    written to ``state_file`` as JSON after every epoch and read back in
    ``__init__``, so an interrupted run resumes with the same counters.
    The best weights themselves are kept in memory only and are therefore not
    available right after a restart.

    Args:
        city_val_callback: callback invoked to compute ``city_val_loss`` when
            the epoch logs do not contain it yet. It is called directly, so it
            presumably must also be registered with the model (its ``model``
            attribute is not set here) — confirm at the call site.
        patience: number of consecutive non-improving epochs before stopping.
        restore_best_weights: when true, restore the in-memory best weights
            (if any) at the moment training is stopped.
        state_file: path of the JSON file holding ``wait`` and ``best``.
    """

    def __init__(self, city_val_callback, patience=5, restore_best_weights=True, state_file="earlystop_city_state.json"):
        super().__init__()
        self.city_val_callback = city_val_callback
        self.patience = patience
        self.restore_best_weights = restore_best_weights
        self.state_file = state_file
        self.wait = 0
        self.best_weights = None
        self.best = np.inf
        self._load_state()

    def _load_state(self):
        """Restore ``wait``/``best`` from ``state_file``; keep defaults if unusable."""
        if not os.path.exists(self.state_file):
            return
        try:
            with open(self.state_file, "r") as f:
                state = json.load(f)
            wait = int(state.get("wait", 0))
            best = float(state.get("best", np.inf))
        except (OSError, ValueError, TypeError, AttributeError) as err:
            # Corrupt or unreadable state: start fresh, but say so instead of
            # silently discarding the persisted counters.
            print(f"\nWarning: ignoring unreadable early-stopping state '{self.state_file}': {err}")
            self.wait = 0
            self.best = np.inf
            return
        # Assign only after both values parsed, so a half-valid file cannot
        # leave the counters in a mixed state.
        self.wait = wait
        self.best = best

    def on_epoch_end(self, epoch, logs=None):
        """Update the counters, persist them, and stop training when patience runs out."""
        # ``logs`` defaults to None in the signature; ``.get`` below needs a dict.
        if logs is None:
            logs = {}

        current = logs.get("city_val_loss")
        if current is None:
            # The metric has not been computed for this epoch yet: ask the
            # city-validation callback to compute it into ``logs``.
            self.city_val_callback.on_epoch_end(epoch, logs)
            current = logs.get("city_val_loss")
            if current is None:
                return

        if current < self.best:
            self.best = float(current)
            self.wait = 0
            if self.restore_best_weights:
                self.best_weights = self.model.get_weights()
        else:
            self.wait += 1

        with open(self.state_file, "w") as f:
            json.dump({"wait": int(self.wait), "best": float(self.best)}, f)

        if self.wait >= self.patience:
            if self.restore_best_weights and self.best_weights is not None:
                self.model.set_weights(self.best_weights)
            print(f"\nEarly stopping triggered at epoch {epoch + 1} (city_val_loss: {self.best:.4f})")
            self.model.stop_training = True


class EpochSaver(Callback):
    """Record the index of the most recently completed epoch in a text file.

    The file at ``epoch_file_path`` is overwritten at the end of every epoch
    with the epoch value passed to ``on_epoch_end``.
    """

    def __init__(self, epoch_file_path):
        super().__init__()
        self.epoch_file_path = epoch_file_path

    def on_epoch_end(self, epoch, logs=None):
        # Opening with "w" truncates, so the file only ever holds the latest value.
        epoch_text = str(epoch)
        with open(self.epoch_file_path, "w") as handle:
            handle.write(epoch_text)


def process_municipio(municipio_id):
    """
    Build the scaled train/test arrays for one municipality.

    Rows of the module-level ``df`` whose ``codigo_ibge`` equals
    ``municipio_id`` are windowed with ``create_sequences_multi_step``, split
    80/20 in window order (first 80% train, remainder test) and transformed
    with the pre-fitted global scalers loaded from ``SCALER_DIR``.

    Returns ``None`` when the municipality has fewer than
    ``SEQUENCE_LENGTH + HORIZON`` rows, yields no windows, or ends up with an
    empty train or test part. Otherwise returns the 7-tuple
    ``(X_train, y_train, X_test, y_test, static_train, static_test,
    y_train_raw)``; every element except the last is scaled.
    """
    city_rows = df[df["codigo_ibge"] == municipio_id].copy()
    if len(city_rows) < SEQUENCE_LENGTH + HORIZON:
        return None

    dyn_values = city_rows[dynamic_features].values
    # Static features are taken from the first row only and kept 2-D (1, n_static).
    static_row = city_rows[static_features].iloc[0].values.reshape(1, -1)

    windows, targets = create_sequences_multi_step(dyn_values, SEQUENCE_LENGTH, HORIZON)
    n_windows = windows.shape[0]
    if n_windows == 0:
        return None

    # One copy of the static row per window.
    static_rep = np.repeat(static_row, n_windows, axis=0)

    cut = int(n_windows * 0.8)
    # Either side of the split being empty makes the municipality unusable.
    if cut == 0 or cut == n_windows:
        return None

    X_train_raw, X_test_raw = windows[:cut], windows[cut:]
    y_train_raw, y_test_raw = targets[:cut], targets[cut:]
    static_train, static_test = static_rep[:cut], static_rep[cut:]

    # Load the three global scalers from the single scaler directory.
    scaler_dyn, scaler_static, scaler_target = [
        joblib.load(os.path.join(SCALER_DIR, scaler_file))
        for scaler_file in (
            "scaler_dyn_global.pkl",
            "scaler_static_global.pkl",
            "scaler_target_global.pkl",
        )
    ]

    def _scale_windows(block):
        # The scaler is applied to 2-D input: flatten to (rows, features),
        # transform, then restore the original 3-D shape.
        flat = block.reshape(-1, block.shape[-1])
        return scaler_dyn.transform(flat).reshape(block.shape)

    X_train_scaled = _scale_windows(X_train_raw)
    X_test_scaled = _scale_windows(X_test_raw)
    static_train_scaled = scaler_static.transform(static_train)
    static_test_scaled = scaler_static.transform(static_test)
    y_train_scaled = scaler_target.transform(y_train_raw)
    y_test_scaled = scaler_target.transform(y_test_raw)

    # The unscaled training targets are returned as well; the caller uses them
    # to compute the per-city training peak.
    return (
        X_train_scaled, y_train_scaled,
        X_test_scaled, y_test_scaled,
        static_train_scaled, static_test_scaled,
        y_train_raw,
    )


class CityValLossWeighted(Callback):
    """Compute a per-city weighted validation MSE and save the best model.

    At the end of each epoch the model predicts on the validation arrays. The
    predictions are cut into consecutive per-city slices of lengths
    ``city_sizes``, an MSE is computed for each slice, and the weighted
    average (weights from ``city_weights``) is written to
    ``logs["city_val_loss"]``. Whenever that value improves on the best seen
    by this instance, the model is saved to ``save_path``.

    Args:
        X_val: validation dynamic inputs, ordered city by city.
        static_val: validation static inputs, aligned with ``X_val``.
        y_val: validation targets, aligned with ``X_val``.
        city_sizes: number of validation samples of each city, in the same
            order in which the cities were concatenated.
        city_weights: pre-computed weight of each city, indexed like
            ``city_sizes``.
        save_path: file path where the best model is saved.
    """

    def __init__(self, X_val, static_val, y_val, city_sizes, city_weights, save_path):
        super().__init__()
        self.X_val = X_val
        self.static_val = static_val
        self.y_val = y_val
        self.city_sizes = city_sizes
        self.city_weights = city_weights  # pre-computed weights
        self.save_path = save_path
        self.best = np.inf

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the weighted city loss, publish it in ``logs`` and checkpoint on improvement."""
        # ``logs`` defaults to None in the signature; the assignment below needs a dict.
        if logs is None:
            logs = {}

        y_pred = self.model.predict([self.X_val, self.static_val], batch_size=1024, verbose=0)
        start = 0
        weighted_losses = []
        total_weight = 0.0

        for i, n in enumerate(self.city_sizes):
            stop = start + n
            y_true_city = self.y_val[start:stop]
            y_pred_city = y_pred[start:stop]
            start = stop

            # An empty slice has no defined mean; skipping it keeps a single
            # empty city from turning the whole metric into NaN.
            if n == 0:
                continue

            mse_city = np.mean((y_true_city - y_pred_city) ** 2)

            # Use the pre-computed weight for this city.
            weight = self.city_weights[i]

            weighted_losses.append(mse_city * weight)
            total_weight += weight

        # Floor the denominator so all-zero weights do not divide by zero.
        city_val_loss = np.sum(weighted_losses) / max(total_weight, 1e-8)
        logs["city_val_loss"] = city_val_loss

        if city_val_loss < self.best:
            self.best = city_val_loss
            self.model.save(self.save_path)
            print(f"\nEpoch {epoch+1}: Melhor modelo salvo com city_val_loss ponderado: {city_val_loss:.6f}")

In [None]:
TARGET_COLUMN = "numero_casos"
SEQUENCE_LENGTH = 12
HORIZON = 8
EPOCHS = 20
BATCH_SIZE = 64

# --- 4. DATA LOADING AND PRE-PROCESSING ---
print("\n--- Etapa 4: Carregando e processando os dados ---")
df = pd.read_parquet(DATA_PATH)
df = df.sort_values(by=["codigo_ibge", "ano", "semana"])

# Cyclical encoding of the week number plus a min-max normalized year.
df["week_sin"] = np.sin(2 * np.pi * df["semana"] / 52)
df["week_cos"] = np.cos(2 * np.pi * df["semana"] / 52)
year_min = df["ano"].min()
year_span = df["ano"].max() - year_min
# A single-year dataset has a zero span; use 0.0 instead of dividing by zero.
df["year_norm"] = (df["ano"] - year_min) / year_span if year_span else 0.0

# Column 0 of the dynamic features must be the target: the windowing helper
# reads the forecast targets from column 0.
dynamic_features = [
    "numero_casos", "T2M", "T2M_MAX", "T2M_MIN",
    "PRECTOTCORR", "RH2M", "ALLSKY_SFC_SW_DWN",
    "week_sin", "week_cos", "year_norm"
]
static_features = ["latitude", "longitude"]
municipios = df["codigo_ibge"].unique()

# --- Step 4.5: create or reuse the global scalers ---

scaler_dyn_path = os.path.join(SCALER_DIR, "scaler_dyn_global.pkl")
scaler_static_path = os.path.join(SCALER_DIR, "scaler_static_global.pkl")
scaler_target_path = os.path.join(SCALER_DIR, "scaler_target_global.pkl")

# All three files are loaded later, so all three must exist to skip creation;
# checking only one would let a partially written set slip through.
if all(os.path.exists(p) for p in (scaler_dyn_path, scaler_static_path, scaler_target_path)):
    print("Scalers globais já existem. Pulando a etapa de criação.")
else:
    print("Scalers globais não encontrados. Iniciando processo de criação (pode demorar)...")
    all_dyn_train, all_static_train, all_target_train = [], [], []
    # groupby walks the frame once instead of re-filtering it for every city.
    for _, df_mun in tqdm(df.groupby("codigo_ibge", sort=False), desc="Coletando dados de treino para scalers"):
        if len(df_mun) < SEQUENCE_LENGTH + HORIZON:
            continue

        dyn_data = df_mun[dynamic_features].values
        static_data = df_mun[static_features].iloc[0].values.reshape(1, -1)
        X_raw, y_raw = create_sequences_multi_step(dyn_data, SEQUENCE_LENGTH, HORIZON)
        if X_raw.shape[0] == 0:
            continue

        # Fit on the training part only (same 80% cut used per municipality).
        split_idx = int(len(X_raw) * 0.8)
        if split_idx == 0:
            continue

        all_dyn_train.append(X_raw[:split_idx].reshape(-1, X_raw.shape[-1]))
        all_static_train.append(np.repeat(static_data, split_idx, axis=0))
        all_target_train.append(y_raw[:split_idx])

    if not all_dyn_train:
        raise ValueError("Nenhum município possui dados suficientes para ajustar os scalers.")

    scaler_dyn_global = MinMaxScaler().fit(np.concatenate(all_dyn_train))
    scaler_static_global = MinMaxScaler().fit(np.concatenate(all_static_train))
    scaler_target_global = MinMaxScaler().fit(np.concatenate(all_target_train))

    # Persist the scalers in the single scaler directory.
    joblib.dump(scaler_dyn_global, scaler_dyn_path)
    joblib.dump(scaler_static_global, scaler_static_path)
    joblib.dump(scaler_target_global, scaler_target_path)
    print("Scalers globais criados e salvos.")

# --- Per-municipality processing ---
print("Iniciando processamento por município com scalers globais")
with tqdm_joblib(tqdm(desc="Processando municípios", total=len(municipios))):
    results = Parallel(n_jobs=6, backend='threading')(
        delayed(process_municipio)(mid) for mid in municipios
    )

results = [r for r in results if r is not None]
if not results:
    # Fail with a clear message instead of an opaque unpacking error below.
    raise ValueError("Nenhum município produziu sequências de treino/teste válidas.")
X_train_list, y_train_list, X_test_list, y_test_list, static_train_list, static_test_list, y_train_raw_list = zip(*results)
print("Processamento por município concluído.")

# --- 5. FINAL DATA AGGREGATION ---
print("\n--- Etapa 5: Agregando dados para o treinamento ---")
X_train = np.concatenate(X_train_list, axis=0)
y_train = np.concatenate(y_train_list, axis=0)
X_test = np.concatenate(X_test_list, axis=0)
y_test = np.concatenate(y_test_list, axis=0)
static_train = np.concatenate(static_train_list, axis=0)
static_test = np.concatenate(static_test_list, axis=0)
print(f"Total de sequências de Treino: {X_train.shape[0]}, Teste: {X_test.shape[0]}")

# Per-city test sizes (to slice the concatenated test set back into cities)
# and per-city peak of the raw training targets (used as the city weights).
city_sizes = [x.shape[0] for x in X_test_list]
city_train_peaks = [np.max(y_raw) if y_raw.size > 0 else 1.0 for y_raw in y_train_raw_list]

# --- 6. MODEL CREATION OR LOADING ---
print("\n--- Etapa 6: Carregando ou criando o modelo ---")
initial_epoch = 0
if os.path.exists(CHECKPOINT_PATH):
    print(f"Carregando modelo existente de: {CHECKPOINT_PATH}")
    model = load_model(CHECKPOINT_PATH)
    if os.path.exists(EPOCH_TRACK_PATH):
        with open(EPOCH_TRACK_PATH, "r") as f:
            # The file stores the last completed epoch index; resume at the next.
            initial_epoch = int(f.read().strip()) + 1
    print(f"Continuando o treinamento a partir da época: {initial_epoch}")
else:
    print("Nenhum checkpoint encontrado. Criando um novo modelo.")
    input_dyn = Input(shape=(SEQUENCE_LENGTH, len(dynamic_features)))
    lstm_out = LSTM(50, return_sequences=True)(input_dyn)
    lstm_out = LSTM(50)(lstm_out)
    input_static = Input(shape=(len(static_features),))
    concat = Concatenate()([lstm_out, input_static])
    output = Dense(HORIZON)(concat)
    model = Model(inputs=[input_dyn, input_static], outputs=output)
    model.compile(optimizer="adam", loss="mean_squared_error")
    print("Novo modelo criado e compilado.")

# --- 7. MODEL TRAINING ---
print("\n--- Etapa 7: Iniciando o treinamento do modelo ---")
epoch_saver = EpochSaver(EPOCH_TRACK_PATH)
checkpoint_last = ModelCheckpoint(filepath=CHECKPOINT_PATH, save_best_only=False, save_freq="epoch", verbose=1)

best_city_path = CHECKPOINT_PATH.replace(".keras", "_best_city.keras")
city_val_loss_cb = CityValLossWeighted(
    X_val=X_test,
    static_val=static_test,
    y_val=y_test,
    city_sizes=city_sizes,
    city_weights=city_train_peaks,
    save_path=best_city_path
)

early_stopping_city = PersistentEarlyStoppingCityVal(
    city_val_callback=city_val_loss_cb,
    patience=5,
    restore_best_weights=True,
    state_file=os.path.join(BASE_DIR, "earlystop_city_state.json")
)

# On resume, seed the best-model tracker with the persisted best loss.
# Otherwise it restarts at infinity and the first resumed epoch overwrites
# the saved best model even when it is worse.
if os.path.exists(best_city_path):
    city_val_loss_cb.best = early_stopping_city.best

# Callback order matters:
#   1. city_val_loss_cb computes city_val_loss once per epoch; listing it
#      after early stopping made the validation prediction run twice.
#   2. early_stopping_city consumes that value (and may restore best weights).
#   3. checkpoint_last saves the model.
#   4. epoch_saver advances the epoch marker only after the checkpoint exists.
history = model.fit(
    [X_train, static_train], y_train,
    validation_data=([X_test, static_test], y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[city_val_loss_cb, early_stopping_city, checkpoint_last, epoch_saver],
    initial_epoch=initial_epoch,
    verbose=1,
)

print("\n--- TREINAMENTO FINALIZADO! ---")

# Usando IA

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from tensorflow.keras.models import load_model

# --- 1. CONFIGURATION ---
# Paths of the saved artifacts.
BASE_DIR = "/content/drive/MyDrive/lstm_projeto"
MODEL_PATH = os.path.join(BASE_DIR, "checkpoints", "model_checkpoint_best_city.keras")
SCALER_DIR = os.path.join(BASE_DIR, "scalers")
DATA_PATH = os.path.join(BASE_DIR, "data", "inference_data.parquet")

# Model constants. These must match the training cell: the model's output
# layer and the target scaler are both HORIZON wide, and training uses
# SEQUENCE_LENGTH = 12 and HORIZON = 8.
SEQUENCE_LENGTH = 12
HORIZON = 8  # the model forecasts 8 weeks

# Features used in training (the order MUST be the same).
DYN_FEATS = [
    "numero_casos", "T2M", "T2M_MAX", "T2M_MIN",
    "PRECTOTCORR", "RH2M", "ALLSKY_SFC_SW_DWN",
    "week_sin", "week_cos", "year_norm"
]
STATIC_FEATS = ["latitude", "longitude"]

# Year min/max used to normalize the year.
# NOTE(review): training derives these from its dataset at run time; confirm
# these hard-coded values match the data the model was trained on.
YEAR_MIN_TRAIN = 2014
YEAR_MAX_TRAIN = 2025

# --- 2. FUNÇÃO PRINCIPAL ---

def _prompt_int(prompt):
    """Read an integer from stdin; print an error and return None if it is not one."""
    raw = input(prompt).strip()
    try:
        return int(raw)
    except ValueError:
        print(f"Erro: '{raw}' não é um número inteiro válido.")
        return None


def main():
    """Interactively forecast weekly cases for one municipality and plot them.

    Steps: load the model and the three global scalers, read the inference
    parquet, ask for an IBGE code and a start year/week, feed the
    ``SEQUENCE_LENGTH`` weeks that precede the start week to the model, and
    plot the known history, the real future values (when present in the data)
    and the forecast. Every failure path prints a message and returns; nothing
    is returned on success either.
    """
    # --- 3. LOAD ARTIFACTS ---
    print("Carregando modelo e scalers...")
    try:
        model = load_model(MODEL_PATH)
        scaler_dyn = joblib.load(os.path.join(SCALER_DIR, "scaler_dyn_global.pkl"))
        scaler_static = joblib.load(os.path.join(SCALER_DIR, "scaler_static_global.pkl"))
        scaler_target = joblib.load(os.path.join(SCALER_DIR, "scaler_target_global.pkl"))
    except (OSError, ValueError) as e:
        # Keras reports a missing model file as OSError in some versions and
        # as ValueError in others; FileNotFoundError is a subclass of OSError.
        print(f"Erro: Arquivo não encontrado. Verifique os paths. Detalhe: {e}")
        return
    print("Modelo e scalers carregados.")

    # --- 4. DATA PREPARATION ---
    df = pd.read_parquet(DATA_PATH)
    df['codigo_ibge'] = df['codigo_ibge'].astype(int)
    df['ano'] = df['ano'].astype(int)
    df['semana'] = df['semana'].astype(int)
    df = df.sort_values(by=["codigo_ibge", "ano", "semana"]).reset_index(drop=True)

    # Date column (Monday of the given week) used for display. The week is
    # zero-padded so the digit string cannot be split ambiguously.
    df['date'] = pd.to_datetime(
        df['ano'].astype(str) + df['semana'].astype(str).str.zfill(2) + '1',
        format='%Y%W%w',
    )

    # Municipality selection.
    ibge = _prompt_int("Digite o código IBGE do município: ")
    if ibge is None:
        return
    df_mun = df[df['codigo_ibge'] == ibge].copy().reset_index(drop=True)

    if df_mun.empty:
        print(f"Município com IBGE {ibge} não encontrado nos dados.")
        return

    mun_name = df_mun['municipio'].iloc[0]
    print(f"Preparando previsão para: {mun_name} ({ibge})")

    # Feature engineering (same formulas as in training).
    df_mun["week_sin"] = np.sin(2 * np.pi * df_mun["semana"] / 52)
    df_mun["week_cos"] = np.cos(2 * np.pi * df_mun["semana"] / 52)
    df_mun["year_norm"] = (df_mun["ano"] - YEAR_MIN_TRAIN) / (YEAR_MAX_TRAIN - YEAR_MIN_TRAIN)

    # --- 5. FORECAST START POINT ---
    start_year = _prompt_int("Digite o ANO de início da previsão (ex: 2025): ")
    if start_year is None:
        return
    start_week = _prompt_int("Digite a SEMANA de início da previsão (ex: 24): ")
    if start_week is None:
        return

    # Locate the first forecast week; the row just before it is the last
    # known observation.
    prediction_point_idx = df_mun.index[
        (df_mun['ano'] == start_year) & (df_mun['semana'] == start_week)
    ].tolist()

    if not prediction_point_idx:
        print(f"Erro: A semana {start_week} do ano {start_year} não foi encontrada nos dados.")
        return

    last_known_idx = prediction_point_idx[0] - 1

    if last_known_idx < SEQUENCE_LENGTH - 1:
        print(f"Erro: Histórico insuficiente. São necessárias pelo menos {SEQUENCE_LENGTH} semanas de dados antes de {start_year}/{start_week}.")
        return

    # The input sequence is the SEQUENCE_LENGTH rows ending at the last known row.
    # NOTE(review): rows are assumed to be consecutive weeks; gaps are not checked.
    start_idx = last_known_idx - SEQUENCE_LENGTH + 1
    input_sequence_df = df_mun.iloc[start_idx : last_known_idx + 1]

    last_known_date = input_sequence_df['date'].iloc[-1]
    print(f"Usando dados até {last_known_date.strftime('%Y-%m-%d')} para prever a partir de {start_year}/{start_week}.")

    static_raw = input_sequence_df[STATIC_FEATS].iloc[0].values.reshape(1, -1)
    dyn_hist_raw = input_sequence_df[DYN_FEATS].values

    input_dyn_scaled = scaler_dyn.transform(dyn_hist_raw).reshape(1, SEQUENCE_LENGTH, len(DYN_FEATS))
    input_static_scaled = scaler_static.transform(static_raw)

    # --- 6. RUN THE FORECAST ---
    print(f"Executando o modelo para prever as próximas {HORIZON} semanas...")
    prediction_scaled = model.predict([input_dyn_scaled, input_static_scaled])

    prediction_cases = scaler_target.inverse_transform(prediction_scaled).flatten()

    # Trust the model's real output length over the constant, so the "real
    # future" series and the labels always line up with the forecast.
    horizon = len(prediction_cases)
    if horizon != HORIZON:
        print(f"Aviso: o modelo retornou {horizon} semanas, mas HORIZON={HORIZON}. Usando {horizon}.")

    # --- 7. VISUALIZATION ---
    print("Previsão concluída. Gerando gráfico...")

    # Show up to 25 weeks of history ending at the last known row.
    display_hist_start_idx = max(0, last_known_idx - 24)
    hist_to_display = df_mun.iloc[display_hist_start_idx : last_known_idx + 1]

    # Real values for the forecast weeks; may be shorter than `horizon` when
    # the data ends earlier.
    future_start_idx = last_known_idx + 1
    future_to_display = df_mun.iloc[future_start_idx : future_start_idx + horizon]

    plt.figure(figsize=(15, 7))

    # Prepend the last known value so the forecast line connects to the history.
    last_known_case = hist_to_display['numero_casos'].iloc[-1]
    connected_prediction = np.insert(prediction_cases, 0, last_known_case)

    n_hist = len(hist_to_display)
    x_hist = np.arange(n_hist)
    x_future_real = np.arange(n_hist, n_hist + len(future_to_display))
    x_pred = np.arange(n_hist - 1, n_hist - 1 + len(connected_prediction))

    plt.plot(x_hist, hist_to_display['numero_casos'], 'o-', label="Histórico de Casos Conhecido")
    plt.plot(x_future_real, future_to_display['numero_casos'], 'o-', color='green', label="Futuro (Real)")
    plt.plot(x_pred, connected_prediction, 'o--', color='red', label=f"Previsão de Casos (Próximas {horizon} semanas)")

    all_dates_df = pd.concat([hist_to_display, future_to_display])
    tick_labels = [
        f"{ano}/{semana:02d}"
        for ano, semana in zip(all_dates_df['ano'], all_dates_df['semana'])
    ]
    tick_positions = np.arange(len(all_dates_df))

    # Thin positions and labels together so at most 30 ticks are drawn.
    # Swapping in a locator after fixing the labels would leave them attached
    # to the wrong positions.
    step = max(1, -(-len(tick_positions) // 30))
    plt.xticks(ticks=tick_positions[::step], labels=tick_labels[::step], rotation=45, ha="right")

    plt.axvline(x=n_hist - 1, color="black", linestyle=":", linewidth=2, label="Ponto da Previsão")
    plt.title(f"Previsão de Casos de Dengue - {mun_name} ({ibge})")
    plt.xlabel("Ano/Semana Epidemiológica")
    plt.ylabel("Número de Casos Semanais")
    plt.legend()
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.show()

# Entry point: run the interactive forecast when executed as the main module.
if __name__ == "__main__":
    main()
