In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

In [5]:
# Load the CO2 concentration data
df = pd.read_csv('../../data/processed/airport_cleaned.csv', parse_dates=['From Date'])
# df.drop("To Date", axis=1, inplace=True)
df.head()

Unnamed: 0,From Date,NH3,SO2,CO,Ozone,Benzene,PM2.5,PM10,NO,NO2,NOx,Eth-Benzene,MP-Xylene,O,WS,WD,AT,HOUR,MONTH
0,2019-06-01 00:00:00,8.968447,12.179626,0.373962,13.710286,6.367238,17.086,58.661489,59.723048,16.579524,76.262476,4.171932,21.8771,87.716774,1.227097,184.529677,27.810215,0,6
1,2019-06-01 01:00:00,9.016863,11.966923,0.349231,13.906923,5.444466,16.093861,51.717188,59.477379,15.399406,74.491359,4.104333,21.602857,87.869348,1.153152,187.873913,27.74413,1,6
2,2019-06-01 02:00:00,9.504804,12.471415,0.305619,13.750286,4.365825,15.65932,54.967634,56.430673,14.60901,70.510865,4.089186,21.7,88.223226,1.124731,181.79871,27.602903,2,6
3,2019-06-01 03:00:00,9.89505,12.776481,0.294231,14.580286,2.354608,15.780962,53.554946,56.006408,13.140686,68.903107,4.638222,23.246042,88.747065,1.141848,176.929457,27.411304,3,6
4,2019-06-01 04:00:00,10.109903,13.154128,0.298302,14.143962,4.111058,14.980377,53.374362,58.782404,13.3354,71.507692,4.162989,21.476224,88.678602,1.12828,173.737204,27.353011,4,6


In [None]:
# Convert the 'date' column to a datetime object and set it as the index

df['From Date'] = pd.to_datetime(df['From Date'])
df.set_index('From Date', inplace=True)

In [None]:
scaler = MinMaxScaler()
df['PM10'] = scaler.fit_transform(df['PM10'].values.reshape(-1,1))


In [None]:
# Define the LSTM model
n_timesteps = 7 # length of each time series
n_features = 1 # number of variables in each time step
n_units = 64 # number of memory cells in LSTM layer
n_outputs = 1 # number of predicted values


In [None]:
model = Sequential()
model.add(LSTM(n_units, input_shape=(n_timesteps, n_features)))
model.add(Dense(n_outputs))
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Split the data into training and testing sets
train_size = int(len(df) * 0.7)
train_data = df.iloc[:train_size, :]
test_data = df.iloc[train_size:, :]

In [None]:
# Generate the training sequences and labels
def generate_sequences(data, n_timesteps):
    X, y = [], []
    for i in range(len(data) - n_timesteps):
        X.append(data[i:i+n_timesteps])
        y.append(data[i+n_timesteps])
    return np.array(X), np.array(y)

In [22]:
X_train, y_train = generate_sequences(train_data['PM10'].values, n_timesteps)
X_test, y_test = generate_sequences(test_data['PM10'].values, n_timesteps)


In [23]:
# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x22a252caa40>

In [24]:
# Make predictions on the testing set
y_pred = model.predict(X_test)



In [25]:

# Rescale the predictions to the original range
y_pred = scaler.inverse_transform(y_pred)

In [27]:
# Evaluate the model
mse = np.mean(np.square(y_pred - test_data['PM10'].values))
print('MSE: %.3f' % mse)

MSE: 29633.815
