Modelo LSTM
3 colunas
PyTorch Lightning
optuna

In [1]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import optuna
from optuna.integration import PyTorchLightningPruningCallback
import os
from datetime import datetime
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: 
Could not find `optuna-integration` for `pytorch_lightning`.
Please run `pip install optuna-integration[pytorch_lightning]`.

In [None]:
# Create today's log and plot folders (folder name is day-month).

momento_atual = datetime.now()
data_hoje = momento_atual.strftime('%d-%m')
pasta_logs = f'../logs/{data_hoje}'
pasta_plots = f'../plots/{data_hoje}'
os.makedirs(pasta_logs, exist_ok=True)
os.makedirs(pasta_plots, exist_ok=True)

In [None]:
import logging

# Wall-clock start of the run.
inicio_execucao = pd.Timestamp.now()

# Send INFO-level messages to today's log file, each line prefixed with "- ".
logging.basicConfig(
    filename=f'../logs/{data_hoje}/lstm.log',
    level=logging.INFO,
    format='- %(message)s',
)

# A 50-dash rule visually separates this run from earlier ones in the file.
linha_separadora = '-' * 50
logging.info(linha_separadora)
logging.info(f'{inicio_execucao} - Iniciando o processo de treinamento do modelo LSTM')

In [None]:
# Load only the timestamp column and the three measurement columns from the CSV.
caminho_csv = '../dados_tratados/combinado/Piratininga/Piratininga_tratado_combinado.csv'
colunas_csv = ['PM2.5', 'Data e Hora', 'PM10', 'Monóxido de Carbono']
df_original = pd.read_csv(caminho_csv, usecols=colunas_csv, low_memory=False)

In [None]:
colunas_selecionadas = ['PM2.5', 'PM10', 'Monóxido de Carbono']

# Parse the timestamps once, then use them both as a column and as the index.
carimbos = pd.to_datetime(df_original['Data e Hora'], format='%Y-%m-%d %H:%M:%S')
df_original['Data e Hora'] = carimbos
df_original.index = carimbos
df_original.sort_index(inplace=True)

# Keep the selected columns, restrict to the 2019-01-01 .. 2022-01-01 label
# range, and coerce non-numeric entries to NaN.
df = (
    df_original[colunas_selecionadas]
    .loc['2019-01-01':'2022-01-01']
    .apply(pd.to_numeric, errors='coerce')
)

logging.info(f'Colunas Selecionadas: {colunas_selecionadas}')
df.head(10)

In [None]:
def impute_missing_values(df):
    """Return a copy of ``df`` with NaNs filled by linear interpolation.

    Interpolation runs in both directions, so gaps at the start and end of a
    column are filled as well. The input frame is not modified.
    """
    opcoes_interpolacao = {'method': 'linear', 'limit_direction': 'both'}
    return df.interpolate(**opcoes_interpolacao)


df_imputed = impute_missing_values(df)

# Log the per-column NaN counts before and after imputation.
ausentes_antes = df.isna().sum()
ausentes_depois = df_imputed.isna().sum()
logging.info(f"Dados ausentes antes da imputação: {ausentes_antes}")
logging.info(f"Dados ausentes após a imputação: {ausentes_depois}")

In [None]:
from copy import deepcopy as dc
from sklearn.preprocessing import MinMaxScaler


# Preparing the data for the LSTM
def prepare_dataframe_for_lstm(df, n_steps):
    """Append lagged copies of every column and drop incomplete rows.

    For each column ``c`` of ``df`` and each lag ``i`` in ``1..n_steps``, a
    column named ``c(t-i)`` holding ``df[c].shift(i)`` is added. The lag
    columns follow the original columns, grouped by source column.

    The columns to lag are taken from ``df`` itself rather than from a
    module-level list, so the function works on any frame passed in.

    Args:
        df: DataFrame whose columns are the series to lag. It is not modified.
        n_steps: Number of lags to create per column.

    Returns:
        A new DataFrame with the original and lag columns, without the rows
        that contain any NaN (including the first ``n_steps`` rows, whose lags
        are undefined).
    """
    lagged = {
        f'{col}(t-{i})': df[col].shift(i)
        for col in df.columns
        for i in range(1, n_steps + 1)
    }
    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(**lagged).dropna()


lookback = 8  # 8-step lookback (8 hours, assuming hourly samples)
shifted_df = prepare_dataframe_for_lstm(df_imputed, lookback)

# Chronological split points: first 70% train, next 15% validation, rest test.
train_split = int(len(shifted_df) * 0.7)
val_split = int(len(shifted_df) * 0.85)

# Normalise the data to [0, 1]. The scaler is fitted on the training rows only
# so that validation/test statistics do not leak into training. Later rows may
# therefore fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(shifted_df.iloc[:train_split])
shifted_df_as_np = scaler.transform(shifted_df)

# The first len(colunas_selecionadas) columns hold the current-time values;
# every column after them is a lag feature.
X = shifted_df_as_np[:, len(colunas_selecionadas):]
y = shifted_df_as_np[:, 0]  # PM2.5 (first column) stays as the target variable

# Reverse the feature column order; np.flip returns a view, so copy it.
X = dc(np.flip(X, axis=1))

# Split into training, validation and test sets (no shuffling).
X_train, X_val, X_test = X[:train_split], X[train_split:val_split], X[val_split:]
y_train, y_val, y_test = y[:train_split], y[train_split:val_split], y[val_split:]

In [None]:
class TimeSeriesDataset(torch.utils.data.Dataset):
    """In-memory dataset pairing each feature row with its target value.

    Both inputs are converted to float32 tensors that the dataset owns (the
    data is always copied), so later changes to the source arrays do not
    affect it.

    Args:
        X: Array-like of features, indexed by sample along the first axis.
        y: 1-D array-like of targets, one per sample.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        # torch.as_tensor + .to(copy=True) replaces the legacy
        # torch.FloatTensor constructor and accepts any numeric dtype.
        self.X = torch.as_tensor(X).to(dtype=torch.float32, copy=True)
        # (n,) -> (n, 1): add an extra dimension to match the model output.
        self.y = torch.as_tensor(y).to(dtype=torch.float32, copy=True).unsqueeze(1)
        if len(self.X) != len(self.y):
            raise ValueError(
                f'X and y must have the same number of samples, '
                f'got {len(self.X)} and {len(self.y)}'
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]


In [None]:
class LSTMModel(pl.LightningModule):
    """LSTM followed by a linear head, trained with MSE loss and Adam.

    Args:
        input_size: Number of features per input row.
        hidden_size: Hidden state size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced by the linear head.
        learning_rate: Learning rate handed to the Adam optimizer.

    NOTE(review): ``forward`` inserts a sequence axis of length 1, so the LSTM
    sees every sample as a single time step holding all features at once.
    Confirm this is intended rather than a (batch, steps, features) layout.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, learning_rate):
        super().__init__()
        self.learning_rate = learning_rate
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, output_size)``."""
        # Add a sequence-length dimension: (batch, features) -> (batch, 1, features).
        sequence = x.unsqueeze(1)
        hidden_states, _ = self.lstm(sequence)
        # Only the output of the last time step feeds the linear head.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

    def _loss_and_log(self, batch, metric_name):
        """Compute the MSE for ``batch``, log it as ``metric_name``, return it."""
        features, target = batch
        loss = nn.functional.mse_loss(self(features), target)
        self.log(metric_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch (logged as ``train_loss``)."""
        return self._loss_and_log(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch (logged as ``val_loss``)."""
        return self._loss_and_log(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Return the test loss for one batch (logged as ``test_loss``)."""
        return self._loss_and_log(batch, 'test_loss')

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [None]:
def objective(trial):
    """Optuna objective: train one LSTM configuration and score it.

    Samples ``hidden_size``, ``num_layers``, ``learning_rate`` and
    ``batch_size`` from the trial, trains on the module-level training split
    with early stopping on ``val_loss``, and reports the validation loss.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The best ``val_loss`` seen during training, as a float. If no
        checkpoint score was recorded, the last logged ``val_loss`` is used.
    """
    hidden_size = trial.suggest_int('hidden_size', 32, 256)
    num_layers = trial.suggest_int('num_layers', 1, 3)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])

    model = LSTMModel(
        input_size=X.shape[1],
        hidden_size=hidden_size,
        num_layers=num_layers,
        output_size=1,
        learning_rate=learning_rate
    )

    train_dataset = TimeSeriesDataset(X_train, y_train)
    val_dataset = TimeSeriesDataset(X_val, y_val)
    # Only the training loader is shuffled; validation keeps its order.
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    early_stop_callback = EarlyStopping(monitor='val_loss', patience=10, mode='min')
    checkpoint_callback = ModelCheckpoint(dirpath=f'../models/{data_hoje}', filename='best_model', save_top_k=1, monitor='val_loss', mode='min')
    logger = TensorBoardLogger(f'../logs/{data_hoje}', name='lstm_optuna')

    trainer = pl.Trainer(
        max_epochs=100,
        callbacks=[early_stop_callback, checkpoint_callback],
        logger=logger,
        log_every_n_steps=10,
    )

    trainer.fit(model, train_loader, val_loader)

    # callback_metrics holds the value from the last validation run, which
    # after early stopping can be worse than the best epoch. Score the trial
    # by the best monitored value tracked by the checkpoint callback.
    best_score = checkpoint_callback.best_model_score
    if best_score is None:
        best_score = trainer.callback_metrics['val_loss']
    return best_score.item()

In [None]:
import optuna
from optuna.integration import PyTorchLightningPruningCallback

# Run the Optuna search, minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

best_params = study.best_trial.params
logging.info(f'Melhor trial: {best_params}')

# Build the final model with the best hyperparameters found.
final_model = LSTMModel(
    learning_rate=best_params['learning_rate'],
    num_layers=best_params['num_layers'],
    hidden_size=best_params['hidden_size'],
    input_size=X.shape[1],
    output_size=1,
)

In [None]:
# All three loaders use the batch size chosen by the hyperparameter search.
tamanho_lote = best_params['batch_size']

train_dataset = TimeSeriesDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=tamanho_lote, shuffle=True)

# Validation and test data are iterated in their original order.
val_dataset = TimeSeriesDataset(X_val, y_val)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=tamanho_lote)

test_dataset = TimeSeriesDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=tamanho_lote)

# Stop after 10 validation checks without improvement; keep only the
# checkpoint with the lowest val_loss.
early_stop_callback = EarlyStopping(monitor='val_loss', patience=10, mode='min')
checkpoint_callback = ModelCheckpoint(
    dirpath=f'../models/{data_hoje}',
    filename='final_model',
    save_top_k=1,
    monitor='val_loss',
    mode='min',
)
logger = TensorBoardLogger(f'../logs/{data_hoje}', name='lstm_final')

In [None]:
trainer = pl.Trainer(
    max_epochs=200,
    callbacks=[early_stop_callback, checkpoint_callback],
    logger=logger,
    log_every_n_steps=10,
)

trainer.fit(final_model, train_loader, val_loader)

# Evaluate the checkpoint with the best val_loss instead of the last-epoch
# weights, which after early stopping can be past the best epoch.
# NOTE(review): Lightning is expected to restore those weights into
# final_model, so the state_dict saved below should match the tested model;
# confirm for the installed version.
test_result = trainer.test(final_model, test_loader, ckpt_path='best')

logging.info(f'Resultado do teste: {test_result}')

# Save the final model. Create the folder first so torch.save does not rely on
# the checkpoint callback having created it.
os.makedirs(f'../models/{data_hoje}', exist_ok=True)
torch.save(final_model.state_dict(), f'../models/{data_hoje}/final_model.pth')

fim_execucao = pd.Timestamp.now()
logging.info(f'{fim_execucao} - Processo de treinamento do modelo LSTM concluído')
logging.info(f'Tempo total de execução: {fim_execucao - inicio_execucao}')