In [12]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

df = pd.read_csv('./dados_tratados/combinado/Piratininga/Piratininga_tratado_combinado.csv',
                 usecols=['PM2.5', 'Data e Hora'])


df['Data e Hora'] = pd.to_datetime(df['Data e Hora'])
df['hour'] = df['Data e Hora'].dt.hour
df['minute'] = df['Data e Hora'].dt.minute
df['year'] = df['Data e Hora'].dt.year
df['month'] = df['Data e Hora'].dt.month
df['day'] = df['Data e Hora'].dt.day
df['day_of_week'] = df['Data e Hora'].dt.dayofweek
df['day_of_year'] = df['Data e Hora'].dt.dayofyear
df['week'] = df['Data e Hora'].dt.isocalendar().week

df.drop('Data e Hora', axis=1, inplace=True)

# Normalizando os dados de PM2.5
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(df[['PM2.5']].values)


def create_sequences(data, seq_length):
    xs = []
    ys = []
    for i in range(len(data) - seq_length - 1):
        x = data[i:(i + seq_length)]
        y = data[i + seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)


seq_length = 8
X, y = create_sequences(scaled_values, seq_length)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float().reshape(-1, 1)
X_val = torch.from_numpy(X_val).float()
y_val = torch.from_numpy(y_val).float().reshape(-1, 1)
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float().reshape(-1, 1)

train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
test_data = TensorDataset(X_test, y_test)

batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)


class LSTM(nn.Module):
    def __init__(self, input_size, hidden_layer_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_layer_size = hidden_layer_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_layer_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_layer_size, output_size)

    def forward(self, input_seq):
        lstm_out, _ = self.lstm(input_seq)
        predictions = self.linear(lstm_out[:, -1, :])
        return predictions


input_size = X_train.shape[2]
model = LSTM(input_size=input_size, hidden_layer_size=100, num_layers=2, output_size=1).to(device)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def train_model(model, train_loader, val_loader, loss_function, optimizer, epochs=150, patience=10):
    train_losses = []
    val_losses = []
    best_val_loss = np.inf
    counter = 0
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for seq, labels in train_loader:
            seq, labels = seq.to(device), labels.to(device)
            optimizer.zero_grad()
            y_pred = model(seq)
            single_loss = loss_function(y_pred, labels)
            single_loss.backward()
            optimizer.step()
            train_loss += single_loss.item() * seq.size(0)

        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for seq, labels in val_loader:
                seq, labels = seq.to(device), labels.to(device)
                y_pred = model(seq)
                single_loss = loss_function(y_pred, labels)
                val_loss += single_loss.item() * seq.size(0)

        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)

        print(f'Epoch {epoch} train loss: {train_loss}, val loss: {val_loss}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            counter += 1

        if counter >= patience:
            print('Early stopping')
            break
    return train_losses, val_losses


train_losses, val_losses = train_model(model, train_loader, val_loader, loss_function, optimizer, epochs=150,
                                       patience=10)

model.load_state_dict(torch.load('best_model.pth'))

model.eval()
test_predictions = []

with torch.no_grad():
    for seq, labels in test_loader:
        seq, labels = seq.to(device), labels.to(device)
        y_pred = model(seq)
        test_predictions.append(y_pred.cpu().numpy())

test_predictions = np.concatenate(test_predictions, axis=0)

test_predictions = scaler.inverse_transform(test_predictions.reshape(-1, 1))
y_test = scaler.inverse_transform(y_test.cpu().numpy().reshape(-1, 1))

mae = np.mean(np.abs(test_predictions - y_test))
mse = np.mean((test_predictions - y_test) ** 2)
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')

# Plotagem das perdas de treinamento e validação
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.legend()
plt.show()


           Data e Hora  PM2.5
0  2009-01-01 01:00:00    NaN
1  2009-01-01 02:00:00    NaN
2  2009-01-01 03:00:00    NaN
3  2009-01-01 04:00:00    NaN
4  2009-01-01 05:00:00    NaN
-------------------
               Data e Hora  PM2.5
59962  2015-11-04 10:30:00   25.0
59963  2015-11-04 11:30:00   13.0
59964  2015-11-04 12:30:00   12.0
59965  2015-11-04 13:30:00   14.0
59966  2015-11-04 14:30:00   12.0


KeyboardInterrupt: 

In [5]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Carregar dados
df = pd.read_csv(
    './dados_tratados/combinado/Piratininga/Piratininga_tratado_combinado.csv',
    usecols=['PM2.5', 'PM10', 'Dióxido de Enxofre', 'Monóxido de Carbono', 'Monóxido de Nitrogênio', 'Data e Hora'],
    parse_dates=['Data e Hora'],
    dtype={'PM2.5': str, 'PM10': str, 'Dióxido de Enxofre': str,
           'Monóxido de Carbono': str, 'Monóxido de Nitrogênio': str})

cols = ['PM2.5', 'PM10', 'Dióxido de Enxofre', 'Monóxido de Carbono', 'Monóxido de Nitrogênio']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df = df['PM2.5'].dropna()


# Convertendo a coluna de data e hora para datetime
df['Data e Hora'] = pd.to_datetime(df['Data e Hora'])
df = df.set_index('Data e Hora')

features = ['PM2.5', 'PM10', 'Dióxido de Enxofre', 'Monóxido de Carbono', 'Monóxido de Nitrogênio']
data_values = df[features].values

# Normalizando os dados
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(data_values)


def create_sequences(data, window_size):
    xs = []
    ys = []
    for i in range(len(data) - window_size - 1):
        x = data[i:(i + window_size)]
        y = data[i + window_size][0]  # Prevendo o valor de PM2.5
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)


seq_length = 8
X, y = create_sequences(scaled_values, seq_length)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float().reshape(-1, 1)
X_val = torch.from_numpy(X_val).float()
y_val = torch.from_numpy(y_val).float().reshape(-1, 1)
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float().reshape(-1, 1)

train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
test_data = TensorDataset(X_test, y_test)

batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)


class LSTM(nn.Module):
    def __init__(self, input_size, hidden_layer_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_layer_size = hidden_layer_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_layer_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_layer_size, output_size)

    def forward(self, input_seq):
        lstm_out, _ = self.lstm(input_seq)
        predictions = self.linear(lstm_out[:, -1, :])
        return predictions


input_size = X_train.shape[2]
model = LSTM(input_size=input_size, hidden_layer_size=100, num_layers=2, output_size=1).to(device)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def train_model(model, train_loader, val_loader, loss_function, optimizer, epochs=150, patience=10):
    train_losses = []
    val_losses = []
    best_val_loss = np.inf
    counter = 0
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for seq, labels in train_loader:
            seq, labels = seq.to(device), labels.to(device)
            optimizer.zero_grad()
            y_pred = model(seq)
            single_loss = loss_function(y_pred, labels)
            single_loss.backward()
            optimizer.step()
            train_loss += single_loss.item() * seq.size(0)

        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for seq, labels in val_loader:
                seq, labels = seq.to(device), labels.to(device)
                y_pred = model(seq)
                single_loss = loss_function(y_pred, labels)
                val_loss += single_loss.item() * seq.size(0)

        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)

        print(f'Epoch {epoch} train loss: {train_loss}, val loss: {val_loss}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            counter += 1

        if counter >= patience:
            print('Early stopping')
            break
    return train_losses, val_losses


train_losses, val_losses = train_model(model, train_loader, val_loader, loss_function, optimizer, epochs=150,
                                       patience=10)

model.load_state_dict(torch.load('best_model.pth'))

model.eval()
test_predictions = []

with torch.no_grad():
    for seq, labels in test_loader:
        seq, labels = seq.to(device), labels.to(device)
        y_pred = model(seq)
        test_predictions.append(y_pred.cpu().numpy())

test_predictions = np.concatenate(test_predictions, axis=0)

# Inversão da normalização apenas para o PM2.5
test_predictions_pm25 = scaler.inverse_transform(
    np.concatenate([test_predictions, np.zeros((test_predictions.shape[0], scaled_values.shape[1] - 1))], axis=1))[:, 0]
y_test_pm25 = scaler.inverse_transform(
    np.concatenate([y_test.cpu().numpy(), np.zeros((y_test.shape[0], scaled_values.shape[1] - 1))], axis=1))[:, 0]

mae = np.mean(np.abs(test_predictions_pm25 - y_test_pm25))
mse = np.mean((test_predictions_pm25 - y_test_pm25) ** 2)
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')

# Plotagem das perdas de treinamento e validação
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Losses')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


cuda


KeyError: 'Data e Hora'