In [1]:
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer, KNNImputer
from copy import deepcopy as dc
import logging
from datetime import datetime
import optuna
import torch.nn.functional as F
import os

In [2]:
# Initial configuration
# data_hoje is a day-month tag reused in the log, plot and model paths below.
# NOTE(review): the tag carries no year, so runs on the same calendar day of
# different years share the same directories/files.
data_hoje = datetime.now().strftime('%d-%m')
# Start timestamp; subtracted from the end timestamp later to log the run time.
inicio_execucao = pd.Timestamp.now()

# Create the per-day directories for logs and plots
os.makedirs(f'../logs/{data_hoje}', exist_ok=True)
os.makedirs(f'../plots/{data_hoje}', exist_ok=True)

# Logging configuration: everything goes to a file inside today's log directory
logging.basicConfig(filename=f'../logs/{data_hoje}/lstm_optuna.log', level=logging.INFO, format='- %(message)s')
logging.info('-' * 50)
logging.info(f'{inicio_execucao} - Iniciando o processo de otimização e treinamento do modelo LSTM')

# Load and prepare the data (only the timestamp and the three pollutant columns are read)
df_original = pd.read_csv('../dados_tratados/combinado/Piratininga/Piratininga_tratado_combinado.csv',
                          usecols=['PM2.5', 'Data e Hora', 'PM10', 'Monóxido de Carbono'], low_memory=False)

# Parse the timestamp column, use it as the index and sort chronologically
df_original['Data e Hora'] = pd.to_datetime(df_original['Data e Hora'], format='%Y-%m-%d %H:%M:%S')
df_original.index = df_original['Data e Hora']
df_original.sort_index(inplace=True)

# Feature columns; PM2.5 comes first because later cells take column 0 as the target
colunas_selecionadas = ['PM2.5', 'PM10', 'Monóxido de Carbono']
df = df_original[colunas_selecionadas]
# Keep the rows from 2019-01-01 through 2022-01-01 (label-based slice on the datetime index)
df = df.loc['2019-01-01':'2022-01-01']

# Force numeric dtypes; values that cannot be parsed become NaN (errors='coerce')
df = df.apply(pd.to_numeric, errors='coerce')

In [3]:

def treat_outliers(df):
    """Clip every column of ``df`` to its IQR fences and return the result.

    For each column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``,
    where ``Q1``/``Q3`` are the 25th/75th percentiles of that column. Values
    outside the fences are replaced by the nearest fence; NaN values are left
    as NaN.

    Args:
        df: DataFrame whose columns all support ``quantile`` (numeric data).

    Returns:
        A new DataFrame with the same index and columns. The input frame is
        not modified, so calling this on a slice of another frame neither
        alters that frame nor triggers chained-assignment warnings.
    """
    treated = df.copy()
    for column in treated.columns:
        q1 = treated[column].quantile(0.25)
        q3 = treated[column].quantile(0.75)
        iqr = q3 - q1
        # clip() applies both fences in one pass and keeps NaN untouched.
        treated[column] = treated[column].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    return treated




# Missing-data imputation
def impute_missing_data(df):
    """Fill missing values with time interpolation followed by KNN imputation.

    Steps, in order:
      1. ``interpolate(method='time')`` fills gaps that lie after a valid
         reading, weighting by the elapsed time between index entries.
      2. ``KNNImputer`` (5 neighbours) fills whatever interpolation could not
         reach, e.g. NaNs that precede the first valid reading of a column.

    Interpolation has to run before any global fill: imputing with the column
    mean first would leave no NaN for the later steps to work on.

    Args:
        df: Numeric DataFrame indexed by a DatetimeIndex (required by
            ``method='time'``).

    Returns:
        A new DataFrame with the same index and columns as ``df`` holding the
        imputed values.
    """
    # Gaps inside the series: interpolate along the time axis
    df_interpolated = df.interpolate(method='time')

    # Remaining NaNs: estimate from the most similar rows
    knn_imputer = KNNImputer(n_neighbors=5)
    df_knn_imputed = pd.DataFrame(
        knn_imputer.fit_transform(df_interpolated),
        columns=df.columns,
        index=df.index,
    )

    return df_knn_imputed


# Outlier treatment is currently disabled; the raw values go straight to imputation.
# df = treat_outliers(df)
df_imputed = impute_missing_data(df)

# Log the per-column NaN counts before and after imputation
logging.info(f"Dados ausentes antes da imputação: {df.isna().sum()}")
logging.info(f"Dados ausentes após a imputação: {df_imputed.isna().sum()}")

In [4]:
# Preparing the data for the LSTM
def prepare_dataframe_for_lstm(df, n_steps, columns=None):
    """Append lagged copies of the selected columns and drop incomplete rows.

    For every column ``c`` in ``columns`` and every lag ``i`` in
    ``1..n_steps`` a new column named ``'c(t-i)'`` is appended holding
    ``df[c].shift(i)``. New columns are appended column by column, lag 1
    first, so the lag block is laid out feature-major.

    Args:
        df: Source DataFrame; it is copied, never modified.
        n_steps: Number of lags to create per column.
        columns: Columns to lag. Defaults to the notebook-level
            ``colunas_selecionadas`` list when omitted.

    Returns:
        A new DataFrame with the original columns followed by the lag
        columns. Rows containing any NaN (which includes the first
        ``n_steps`` rows, whose lags are undefined) are removed.
    """
    if columns is None:
        columns = colunas_selecionadas

    lagged = df.copy()
    for col in columns:
        for i in range(1, n_steps + 1):
            lagged[f'{col}(t-{i})'] = lagged[col].shift(i)
    return lagged.dropna()


# Number of past rows fed to the model for each prediction.
# NOTE(review): this was annotated as "24 hours of lookback"; with 8 rows that
# only holds if consecutive rows are 3 hours apart — confirm the sampling interval.
lookback = 8
shifted_df = prepare_dataframe_for_lstm(df_imputed, lookback)
n_features = len(colunas_selecionadas)

# Chronological split points: 70% train, 15% validation, 15% test
train_split = int(len(shifted_df) * 0.7)
val_split = int(len(shifted_df) * 0.85)

# Normalise to [0, 1]. The scaler is fitted on the training rows only, so no
# statistics from the validation/test periods leak into training; later rows
# may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(shifted_df.iloc[:train_split])
shifted_df_as_np = scaler.transform(shifted_df)

X = shifted_df_as_np[:, n_features:]
y = shifted_df_as_np[:, 0]  # PM2.5 (column 0) stays as the target variable

# The lag columns are laid out feature-major:
#   [f0(t-1)..f0(t-L), f1(t-1)..f1(t-L), ...]
# so they must first be viewed as (samples, n_features, lookback). Reshaping the
# flat row directly to (lookback, n_features) would put consecutive lags of a
# single feature into one "time step".
X = X.reshape((-1, n_features, lookback))
X = np.flip(X, axis=2)  # order each feature's lags oldest -> newest
# (samples, lookback, n_features); made contiguous because torch.tensor cannot
# consume the negative strides left by np.flip.
X = np.ascontiguousarray(X.transpose(0, 2, 1))

# Train / validation / test split (chronological, no shuffling)
X_train, X_val, X_test = X[:train_split], X[train_split:val_split], X[val_split:]
y_train, y_val, y_test = y[:train_split], y[train_split:val_split], y[val_split:]

# Targets as column vectors so they match the model's (batch, 1) output
y_train = y_train.reshape((-1, 1))
y_val = y_val.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

# Convert to float32 PyTorch tensors
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).float()
X_val = torch.tensor(X_val).float()
y_val = torch.tensor(y_val).float()
X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).float()


# Dataset wrapper consumed by the DataLoaders
class TimeSeriesDataset(Dataset):
    """Pairs a feature container ``X`` with a target container ``y`` by position."""

    def __init__(self, X, y):
        """Store the two containers as given (no copy, no validation)."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, i):
        """Return the ``(features, target)`` pair stored at position ``i``."""
        features = self.X[i]
        target = self.y[i]
        return features, target


# Compute device: the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [5]:
class LSTM(nn.Module):
    """Stack of single-layer LSTMs with per-layer widths and a linear head.

    Each entry of ``hidden_sizes`` creates one ``nn.LSTM`` layer. Between
    consecutive layers the chosen activation and then dropout are applied to
    the full output sequence; the last layer's output at the final time step
    is mapped to a single value by ``fc``.

    Dropout is implemented with an explicit ``nn.Dropout`` module because the
    ``dropout`` argument of ``nn.LSTM`` only acts between the internal layers
    of one module and is therefore a no-op (plus a warning) when
    ``num_layers=1``. ``nn.Dropout`` has no parameters, so ``state_dict`` keys
    are limited to ``lstm_layers.*`` and ``fc.*``.

    Args:
        input_size: Number of features per time step.
        hidden_sizes: Non-empty sequence with the hidden width of each layer.
        activation: One of ``'relu'``, ``'tanh'``, ``'sigmoid'``,
            ``'leaky_relu'``, ``'elu'``. Any other value means no activation
            is applied between layers.
        dropout: Dropout probability applied between layers (training mode
            only); it has no effect when there is a single layer.

    Raises:
        ValueError: If ``hidden_sizes`` is empty.
    """

    # Name -> callable lookup for the inter-layer activation.
    _ACTIVATIONS = {
        'relu': F.relu,
        'tanh': torch.tanh,        # F.tanh is deprecated
        'sigmoid': torch.sigmoid,  # F.sigmoid is deprecated
        'leaky_relu': F.leaky_relu,
        'elu': F.elu,
    }

    def __init__(self, input_size, hidden_sizes, activation, dropout):
        super().__init__()
        hidden_sizes = list(hidden_sizes)
        if not hidden_sizes:
            raise ValueError('hidden_sizes must contain at least one layer size')

        self.hidden_sizes = hidden_sizes
        self.num_layers = len(hidden_sizes)
        self.activation = activation

        # Layer i consumes the previous layer's hidden width (or the raw input).
        layer_inputs = [input_size] + hidden_sizes[:-1]
        self.lstm_layers = nn.ModuleList([
            nn.LSTM(in_size, out_size, num_layers=1, batch_first=True)
            for in_size, out_size in zip(layer_inputs, hidden_sizes)
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_sizes[-1], 1)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to ``(batch, 1)``."""
        activation_fn = self._ACTIVATIONS.get(self.activation)
        last_index = len(self.lstm_layers) - 1

        for i, lstm in enumerate(self.lstm_layers):
            # Omitting (h0, c0) makes nn.LSTM start from zero states on the
            # same device/dtype as ``x``, so no global device is needed.
            x, _ = lstm(x)

            if i < last_index:  # nothing is applied after the last LSTM layer
                if activation_fn is not None:
                    x = activation_fn(x)
                x = self.dropout(x)

        # Only the final time step feeds the regression head.
        return self.fc(x[:, -1, :])

In [6]:
# Objective function for Optuna
def objective(trial):
    """Train one LSTM configuration and return its best validation loss.

    The trial samples the number of layers, each layer's hidden size, the
    batch size, the learning rate, the inter-layer activation and the dropout
    rate. The model is trained with Adam on MSE for at most 400 epochs and
    stops early after 50 consecutive epochs without a new best validation loss.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The lowest per-epoch validation loss seen, where an epoch's loss is
        the mean of the per-batch MSE values over the validation loader.
    """
    # Search space (the suggestion order is part of the behaviour; keep it stable)
    num_layers = trial.suggest_int('num_layers', 1, 5)
    hidden_sizes = []
    for layer_idx in range(num_layers):
        hidden_sizes.append(trial.suggest_int(f'hidden_size_{layer_idx}', 1, 512))
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'sigmoid', 'leaky_relu', 'elu'])
    dropout = trial.suggest_float('dropout', 0.0, 0.5)

    # Data loaders: training batches are shuffled, validation batches are not
    train_loader = DataLoader(TimeSeriesDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TimeSeriesDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

    # Model, optimiser and loss
    model = LSTM(len(colunas_selecionadas), hidden_sizes, activation, dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    def run_training_epoch():
        """One optimisation pass over the training loader."""
        model.train()
        for batch in train_loader:
            inputs, targets = batch[0].to(device), batch[1].to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()

    def mean_validation_loss():
        """Mean of the per-batch validation losses, computed without gradients."""
        model.eval()
        running_total = 0.0
        with torch.no_grad():
            for batch in val_loader:
                inputs, targets = batch[0].to(device), batch[1].to(device)
                running_total += criterion(model(inputs), targets).item()
        return running_total / len(val_loader)

    max_epochs = 400
    patience = 50
    best_val_loss = float('inf')
    stale_epochs = 0

    for _ in range(max_epochs):
        run_training_epoch()
        current_loss = mean_validation_loss()

        if current_loss < best_val_loss:
            best_val_loss = current_loss
            stale_epochs = 0
            continue

        # No improvement this epoch: stop once the patience budget is used up
        stale_epochs += 1
        if stale_epochs >= patience:
            break

    return best_val_loss

In [7]:
# Run the hyperparameter search
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100)

best_params = study.best_params
logging.info(f"Melhores hiperparâmetros: {best_params}")

# Unpack the best configuration (one hidden size per sampled layer)
best_hidden_sizes = [best_params[f'hidden_size_{i}'] for i in range(best_params['num_layers'])]
best_batch_size = best_params['batch_size']
best_learning_rate = best_params['learning_rate']
best_activation = best_params['activation']
best_dropout = best_params['dropout']

train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)
test_dataset = TimeSeriesDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=best_batch_size, shuffle=False)

final_model = LSTM(len(colunas_selecionadas), best_hidden_sizes, best_activation, best_dropout).to(device)
optimizer = torch.optim.Adam(final_model.parameters(), lr=best_learning_rate)
criterion = nn.MSELoss()

# Checkpoint location. The directory is created up front: unlike the log and
# plot directories it is not created anywhere else, and torch.save would fail
# on the first improving epoch if it were missing.
best_model_path = f'../models/best_model_optuna_{data_hoje}.pth'
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

# Final training with early stopping on the validation loss
num_epochs = 400
best_val_loss = float('inf')
early_stopping_patience = 50
epochs_without_improvement = 0

for epoch in range(num_epochs):
    final_model.train()
    for batch in train_loader:
        x_batch, y_batch = batch[0].to(device), batch[1].to(device)
        optimizer.zero_grad()
        output = final_model(x_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

    final_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            x_batch, y_batch = batch[0].to(device), batch[1].to(device)
            output = final_model(x_batch)
            val_loss += criterion(output, y_batch).item()
    val_loss /= len(val_loader)  # mean of the per-batch losses

    logging.info(f"Época {epoch + 1}/{num_epochs}, Perda de validação: {val_loss:.4f}")

    if val_loss < best_val_loss:
        # New best epoch: checkpoint the weights and reset the patience counter
        best_val_loss = val_loss
        torch.save(final_model.state_dict(), best_model_path)
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= early_stopping_patience:
        logging.info(f"Early stopping ativado na época {epoch + 1}")
        break

# Final evaluation uses the best checkpoint, not the last epoch's weights
final_model.load_state_dict(torch.load(best_model_path))
final_model.eval()


def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect predictions and targets.

    Gradients are disabled while iterating. The model's train/eval mode is
    not changed here; the caller is responsible for setting it.

    Args:
        model: Callable module applied to each input batch.
        dataloader: Iterable of batches whose first two items are the inputs
            and the targets.

    Returns:
        ``(predictions, actual)``: two flat numpy arrays in the order the
        loader yielded the samples.
    """
    predicted_chunks = []
    actual_chunks = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch[0].to(device)
            targets = batch[1].to(device)
            predicted_chunks.append(model(inputs).cpu().numpy().ravel())
            actual_chunks.append(targets.cpu().numpy().ravel())

    # An empty loader yields empty arrays, as building from an empty list would.
    if not predicted_chunks:
        return np.array([]), np.array([])
    return np.concatenate(predicted_chunks), np.concatenate(actual_chunks)


# The training loader shuffles its samples, so predictions collected from it
# come back in random order and cannot be lined up with the chronological
# dates used by the plots. Evaluate the training set through an unshuffled
# loader instead.
train_eval_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=False)

train_predictions, train_actual = evaluate(final_model, train_eval_loader)
val_predictions, val_actual = evaluate(final_model, val_loader)
test_predictions, test_actual = evaluate(final_model, test_loader)

[I 2024-08-12 11:41:51,138] A new study created in memory with name: no-name-e205c754-457e-45ce-97da-c92ca6d2a338
[W 2024-08-12 11:42:42,096] Trial 0 failed with parameters: {'num_layers': 3, 'hidden_size_0': 254, 'hidden_size_1': 420, 'hidden_size_2': 373, 'batch_size': 32, 'learning_rate': 1.999259459007879e-05, 'activation': 'sigmoid', 'dropout': 0.20921894478864428} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\dev\scripts\pm25-plots\venv\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\portes\AppData\Local\Temp\ipykernel_3068\579255110.py", line 32, in objective
    output = model(x_batch)
             ^^^^^^^^^^^^^^
  File "C:\dev\scripts\pm25-plots\venv\Lib\site-packages\torch\nn\modules\module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fi

KeyboardInterrupt: 

In [None]:
# De-normalisation
def inverse_transform_data(data):
    """Map scaled values of column 0 back to their original units.

    The fitted ``scaler`` expects as many columns as ``shifted_df_as_np``
    has, so ``data`` is placed in column 0 of a zero-filled matrix of that
    width, the matrix is inverse-transformed, and column 0 is returned.

    Args:
        data: 1-D array of scaled values (one per row).

    Returns:
        1-D array with the de-normalised values of column 0.
    """
    n_rows = data.shape[0]
    n_cols = shifted_df_as_np.shape[1]
    padded = np.zeros((n_rows, n_cols))
    padded[:, 0] = data
    restored = scaler.inverse_transform(padded)
    return restored[:, 0]


# Bring predictions and targets of every split back to the original scale
# (each name is rebound to its de-normalised version).
train_predictions = inverse_transform_data(train_predictions)
val_predictions = inverse_transform_data(val_predictions)
test_predictions = inverse_transform_data(test_predictions)
train_actual = inverse_transform_data(train_actual)
val_actual = inverse_transform_data(val_actual)
test_actual = inverse_transform_data(test_actual)


# Metric computation
def calculate_metrics(y_true, y_pred):
    """Compute regression metrics for a pair of target/prediction arrays.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        Tuple ``(rmse, mse, mae, r2)``, where ``rmse`` is the square root of
        ``mse``.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mse, mae, r2


# Metrics per split, computed on the de-normalised values
train_rmse, train_mse, train_mae, train_r2 = calculate_metrics(train_actual, train_predictions)
val_rmse, val_mse, val_mae, val_r2 = calculate_metrics(val_actual, val_predictions)
test_rmse, test_mse, test_mae, test_r2 = calculate_metrics(test_actual, test_predictions)

In [None]:

# Log the final metrics for each split
logging.info("\nMétricas finais:")
logging.info("Treinamento - RMSE: {:.4f}, MSE: {:.4f}, MAE: {:.4f}, R²: {:.4f}".format(train_rmse, train_mse, train_mae, train_r2))
logging.info("Validação - RMSE: {:.4f}, MSE: {:.4f}, MAE: {:.4f}, R²: {:.4f}".format(val_rmse, val_mse, val_mae, val_r2))
logging.info("Teste - RMSE: {:.4f}, MSE: {:.4f}, MAE: {:.4f}, R²: {:.4f}".format(test_rmse, test_mse, test_mae, test_r2))

# Plot the training-set results against the sample position (no dates on the x axis)
# NOTE(review): the file written here has the same path that the later
# plot_results(..., 'lstm_optuna_train') call writes, so this figure ends up overwritten.
plt.figure(figsize=(12, 6))
plt.plot(train_actual, label='Actual PM2.5')
plt.plot(train_predictions, label='Predicted PM2.5')
plt.title('Treinamento: PM2.5 Real vs Previsto')
plt.xlabel('Hora')
plt.ylabel('PM2.5')
plt.legend()
plt.savefig(f'../plots/{data_hoje}/lstm_optuna_train_{data_hoje}.png')
plt.close()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Timestamps for each split, taken by position from the lagged frame's index:
# the first len(train_actual) entries, the following len(val_actual) entries,
# and the last len(test_actual) entries.
# These slices only line up with the values if the *_actual arrays are in the
# same chronological order as shifted_df (i.e. they were collected without shuffling).
# NOTE(review): an empty test set would make the last slice `[-0:]`, which selects the whole index.
train_dates = shifted_df.index[:len(train_actual)]
val_dates = shifted_df.index[len(train_actual):len(train_actual) + len(val_actual)]
test_dates = shifted_df.index[-len(test_actual):]

def plot_results(actual, predicted, dates, title, filename):
    """Save a real-vs-predicted PM2.5 line plot for one data split.

    The figure is written to
    ``../plots/{data_hoje}/{filename}_{data_hoje}.png`` and then closed.

    Args:
        actual: Observed values.
        predicted: Model predictions, aligned with ``actual``.
        dates: X-axis timestamps, aligned with the values.
        title: Figure title.
        filename: File-name stem (the date tag and extension are appended).
    """
    fig, ax = plt.subplots(figsize=(20, 12))
    ax.plot(dates, actual, label='Real', color='blue')
    ax.plot(dates, predicted, label='Previsto', color='red')
    ax.set_title(title)
    ax.set_xlabel('Data')
    ax.set_ylabel('PM2.5')
    ax.legend()

    # One major tick per month, labelled as year-month
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))

    fig.autofmt_xdate()  # rotate and align the date labels
    fig.tight_layout()
    fig.savefig(f'../plots/{data_hoje}/{filename}_{data_hoje}.png')
    plt.close(fig)


# One full-period figure per split
plot_results(train_actual, train_predictions, train_dates, 'Treinamento: PM2.5 Real vs Previsto', 'lstm_optuna_train')
plot_results(val_actual, val_predictions, val_dates, 'Validação: PM2.5 Real vs Previsto', 'lstm_optuna_val')
plot_results(test_actual, test_predictions, test_dates, 'Teste: PM2.5 Real vs Previsto', 'lstm_optuna_test')

# Log the end time and the elapsed time since inicio_execucao
fim_execucao = pd.Timestamp.now()
tempo_execucao = fim_execucao - inicio_execucao
logging.info(f"\nExecução finalizada em {fim_execucao}")
logging.info(f"Tempo total de execução: {tempo_execucao}")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd

def plot_results_by_month(actual, predicted, dates, title_prefix, filename_prefix):
    """Save one real-vs-predicted PM2.5 plot per calendar month.

    Rows are grouped by the monthly period of their timestamp. Grouping by
    ``Period`` avoids the ``'M'`` offset alias of ``pd.Grouper``, which newer
    pandas versions deprecate in favour of ``'ME'``, while producing the same
    month labels. Only months that actually contain rows yield a group, so no
    empty-group check is needed.

    Each figure is written to
    ``../plots/{data_hoje}/{filename_prefix}_{YYYY_MM}_{data_hoje}.png``.

    Args:
        actual: Observed values.
        predicted: Model predictions, aligned with ``actual``.
        dates: Timestamps aligned with the values.
        title_prefix: Text placed before the month name in each title.
        filename_prefix: File-name stem for the saved figures.
    """
    frame = pd.DataFrame({'date': dates, 'actual': actual, 'predicted': predicted})
    frame = frame.set_index('date')

    monthly_groups = frame.groupby(frame.index.to_period('M'))

    for period, group in monthly_groups:
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(group.index, group['actual'], label='Real', color='blue')
        ax.plot(group.index, group['predicted'], label='Previsto', color='red')

        month_year = period.strftime('%B %Y')
        ax.set_title(f'{title_prefix} - {month_year}')
        ax.set_xlabel('Data')
        ax.set_ylabel('PM2.5')
        ax.legend()

        # One major tick every 5 days, labelled as day-month
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m'))
        ax.xaxis.set_major_locator(mdates.DayLocator(interval=5))

        fig.autofmt_xdate()  # rotate and align the date labels
        fig.tight_layout()

        month_filename = f'{filename_prefix}_{period.strftime("%Y_%m")}_{data_hoje}.png'
        fig.savefig(f'../plots/{data_hoje}/{month_filename}')
        plt.close(fig)  # free the figure before drawing the next month

# Monthly figures for each split (one file per month that has data)
plot_results_by_month(train_actual, train_predictions, train_dates, 'Treinamento: PM2.5 Real vs Previsto', 'lstm_optuna_train')
plot_results_by_month(val_actual, val_predictions, val_dates, 'Validação: PM2.5 Real vs Previsto', 'lstm_optuna_val')
plot_results_by_month(test_actual, test_predictions, test_dates, 'Teste: PM2.5 Real vs Previsto', 'lstm_optuna_test')