In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

In [11]:
# Load the CO2 concentration data
df = pd.read_csv('../../data/processed/airport_cleaned.csv', index_col='Unnamed: 0', parse_dates=['From Date'])
# df.drop("To Date", axis=1, inplace=True)
df.drop(['NH3','SO2','CO','Ozone','Benzene','PM2.5','NO','NO2','NOx','Eth-Benzene','MP-Xylene','O','WS','WD','AT','HOUR','MONTH'], axis=1, inplace=True)
df

Unnamed: 0,From Date,PM10
0,2019-06-01 00:00:00,58.661489
1,2019-06-01 01:00:00,51.717188
2,2019-06-01 02:00:00,54.967634
3,2019-06-01 03:00:00,53.554946
4,2019-06-01 04:00:00,53.374362
...,...,...
33832,2023-04-10 16:00:00,237.620000
33833,2023-04-10 17:00:00,219.950000
33834,2023-04-10 18:00:00,205.440000
33835,2023-04-10 19:00:00,185.990000


In [12]:
# Convert the 'date' column to a datetime object and set it as the index

df['From Date'] = pd.to_datetime(df['From Date'])
df.set_index('From Date', inplace=True)

In [16]:
scaler = MinMaxScaler()
df['PM10'] = scaler.fit_transform(df['PM10'].values.reshape(-1,1))


In [17]:
# Define the LSTM model
n_timesteps = 7 # length of each time series
n_features = 1 # number of variables in each time step
n_units = 64 # number of memory cells in LSTM layer
n_outputs = 1 # number of predicted values


In [18]:
model = Sequential()
model.add(LSTM(n_units, input_shape=(n_timesteps, n_features)))
model.add(Dense(n_outputs))
model.compile(loss='mean_squared_error', optimizer='adam')

In [20]:
# Split the data into training and testing sets
train_size = int(len(df) * 0.7)
train_data = df.iloc[:train_size, :]
test_data = df.iloc[train_size:, :]

In [21]:
# Generate the training sequences and labels
def generate_sequences(data, n_timesteps):
    X, y = [], []
    for i in range(len(data) - n_timesteps):
        X.append(data[i:i+n_timesteps])
        y.append(data[i+n_timesteps])
    return np.array(X), np.array(y)

In [22]:
X_train, y_train = generate_sequences(train_data['PM10'].values, n_timesteps)
X_test, y_test = generate_sequences(test_data['PM10'].values, n_timesteps)


In [23]:
# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x22a252caa40>

In [24]:
# Make predictions on the testing set
y_pred = model.predict(X_test)



In [25]:

# Rescale the predictions to the original range
y_pred = scaler.inverse_transform(y_pred)

In [27]:
# Evaluate the model
mse = np.mean(np.square(y_pred - test_data['PM10'].values))
print('MSE: %.3f' % mse)

MSE: 29633.815
