### Melhor modelo LSTM padrao ate o momento R2 de .71

In [4]:
import pandas as pd
import numpy as np
from copy import deepcopy as dc
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.amp import GradScaler, autocast

# Data preparation
df_original = pd.read_csv('../dados_tratados/combinado/Piratininga/Piratininga_tratado_combinado.csv',
                          usecols=['PM2.5', 'Data e Hora', 'PM10', 'Monóxido de Carbono', 'Dióxido de Enxofre',
                                   'Dióxido de Nitrogênio', 'Temperatura', 'Velocidade do Vento', 'Umidade Relativa',
                                   'Direção do Vento'], low_memory=False)

df_original['Data e Hora'] = pd.to_datetime(df_original['Data e Hora'])
df_original.set_index('Data e Hora', inplace=True)
df_original.sort_index(inplace=True)

colunas_selecionadas = ['PM2.5', 'PM10', 'Monóxido de Carbono', 'Dióxido de Enxofre', 'Dióxido de Nitrogênio', 'Temperatura', 'Velocidade do Vento', 'Umidade Relativa', 'Direção do Vento']
df = df_original[colunas_selecionadas]
df = df.loc['2019-01-01':'2022-01-01']

df = df.apply(pd.to_numeric, errors='coerce')
df = df.interpolate(method='time')

# Add time-based features
df['hour'] = df.index.hour
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month
df['day_of_year'] = df.index.dayofyear

# Function to prepare dataframe for LSTM
def prepare_dataframe_for_lstm(df, n_steps, target_col):
    df = dc(df)
    target = df[target_col]
    df = df.drop(columns=[target_col])
    
    for col in df.columns:
        for i in range(1, n_steps + 1):
            df[f'{col}(t-{i})'] = df[col].shift(i)
    
    df[target_col] = target
    df.dropna(inplace=True)
    return df

lookback = 24  # Increased lookback for daily patterns
target_col = 'PM2.5'
shifted_df = prepare_dataframe_for_lstm(df, lookback, target_col)

# Splitting data
train_size = int(len(shifted_df) * 0.7)
val_size = int(len(shifted_df) * 0.15)

train_df = shifted_df.iloc[:train_size]
val_df = shifted_df.iloc[train_size:train_size + val_size]
test_df = shifted_df.iloc[train_size + val_size:]

# Normalizing data
scaler = MinMaxScaler(feature_range=(0, 1))
train_scaled = pd.DataFrame(scaler.fit_transform(train_df), columns=shifted_df.columns, index=train_df.index)
val_scaled = pd.DataFrame(scaler.transform(val_df), columns=shifted_df.columns, index=val_df.index)
test_scaled = pd.DataFrame(scaler.transform(test_df), columns=shifted_df.columns, index=test_df.index)

X_train, y_train = train_scaled.drop(columns=[target_col]).values, train_scaled[target_col].values
X_val, y_val = val_scaled.drop(columns=[target_col]).values, val_scaled[target_col].values
X_test, y_test = test_scaled.drop(columns=[target_col]).values, test_scaled[target_col].values

# Calculate the number of features
num_features = X_train.shape[1] // lookback

# Reshape for LSTM
X_train = X_train.reshape((-1, lookback, num_features))
X_val = X_val.reshape((-1, lookback, num_features))
X_test = X_test.reshape((-1, lookback, num_features))
y_train = y_train.reshape((-1, 1))
y_val = y_val.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

# Convert to PyTorch tensors
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).float()
X_val = torch.tensor(X_val).float()
y_val = torch.tensor(y_val).float()
X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).float()

# Dataset and DataLoader
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

# LSTM Model with Attention
class AttentionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout):
        super(AttentionLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.attention = nn.MultiheadAttention(hidden_size, num_heads=4, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        att_out, _ = self.attention(out, out, out)
        out = self.fc(att_out[:, -1, :])
        return out

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, early_stopping_patience, device):
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
    best_loss = float('inf')
    patience_counter = 0
    best_model = None
    scaler = GradScaler()

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            x_batch, y_batch = batch[0].to(device), batch[1].to(device)

            optimizer.zero_grad()
            with autocast():
                output = model(x_batch)
                loss = criterion(output, y_batch)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                x_batch, y_batch = batch[0].to(device), batch[1].to(device)
                output = model(x_batch)
                val_loss += criterion(output, y_batch).item()

        val_loss /= len(val_loader)
        scheduler.step(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
            best_model = model.state_dict()
        else:
            patience_counter += 1

        if patience_counter >= early_stopping_patience:
            print(f'Early stopping at epoch {epoch + 1}')
            break

    model.load_state_dict(best_model)
    return model

# Evaluation function
def evaluate_model(model, data_loader, criterion, device, scaler):
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in data_loader:
            x_batch, y_batch = batch[0].to(device), batch[1].to(device)
            output = model(x_batch)
            total_loss += criterion(output, y_batch).item()
            all_preds.append(output.cpu().numpy())
            all_targets.append(y_batch.cpu().numpy())

    avg_loss = total_loss / len(data_loader)

    all_preds = np.concatenate(all_preds, axis=0)
    all_targets = np.concatenate(all_targets, axis=0)

    # Inverse transform predictions and targets
    all_preds = scaler.inverse_transform(all_preds)
    all_targets = scaler.inverse_transform(all_targets)

    mae = np.mean(np.abs(all_preds - all_targets))
    mse = np.mean((all_preds - all_targets) ** 2)
    rmse = np.sqrt(mse)
    r2 = r2_score(all_targets, all_preds)

    print(f"Loss: {avg_loss:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")

    return avg_loss, mae, mse, rmse, r2

# Hyperparameters
input_size = num_features
hidden_size = 128
num_layers = 2
output_size = 1
dropout = 0.2
batch_size = 64
learning_rate = 0.001
weight_decay = 1e-5
num_epochs = 200
early_stopping_patience = 20

# Create DataLoaders
train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)
test_dataset = TimeSeriesDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create and train the model
model = AttentionLSTM(input_size, hidden_size, num_layers, output_size, dropout).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.MSELoss()

# Train the model
model = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, early_stopping_patience, device)

# Evaluate the model
print("Validation Set Metrics:")
evaluate_model(model, val_loader, criterion, device, scaler)

print("\nTest Set Metrics:")
evaluate_model(model, test_loader, criterion, device, scaler)

  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{i})'] = df[col].shift(i)
  df[f'{col}(t-{

ValueError: cannot reshape array of size 5523600 into shape (24,12)