In [1]:
# Type hints
from typing import Tuple

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import seaborn as sn
pd.options.plotting.backend = 'plotly'

# Neural network and machine learning
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

# Our data pre-processing
from utils.generate_dataset import *


In [2]:
dataset = pd.read_csv('data_v/dataset.csv')
#dataset["Date/time"] = pd.to_datetime(dataset["Date/time"])
dataset.set_index("Date/time", inplace=True)

In [3]:
dataset = get_unprocessed_dataset()

In [5]:
dataset.plot(width=1920, height=1000)

In [6]:
corrMatrix = dataset.corr()
fig = px.imshow(corrMatrix)
fig.show()
pio.write_image(fig, "plots/correlogrma.svg")

In [7]:
temperature_series, precipitation_series, pm25_series, pm10_series = extract_time_series(dataset)

In [10]:
series = [temperature_series, precipitation_series, pm25_series, pm10_series]

fig = make_subplots(
    rows=4, 
    cols=1, 
    #subplot_titles=["Temperature (in °C)", "Precipitation (in mm)", "PM2.5 (in µg/m3)", "PM10 (in µg/m3)"]
)
for s, idx in zip(series, range(1, 5)):
    fig.add_trace(
        go.Scatter(
            x=s.index,
            y=s.values,
        ),
        row=idx,
        col=1,
    )
fig.update_layout(width=1700, height=1700, showlegend=False)
fig.show()

In [15]:
def split_into_train_test_validation_sets(X: np.array) -> Tuple[np.array, np.array, np.array]:
    train_size = int(0.7 * len(X))
    validation_size = int(0.1 * len(X))
    test_size = len(X) - validation_size - train_size
    

    train_set, validation_set, test_set = X[0: train_size, :], X[train_size: train_size + validation_size,:], X[train_size + validation_size: train_size + test_size + validation_size, :]
    
    print(f"""
    Total dataset length: {len(X)}

    Number of features: {n_features}
    
    Train set shape: {train_set.shape} ({train_set.shape[0] / len(X) * 100:0.2f}% of dataset)
    Validation set shape: {validation_set.shape} ({validation_set.shape[0] / len(X) * 100:0.2f}% of dataset)
    Test set shape: {test_set.shape} ({test_set.shape[0] / len(X) * 100:0.2f}% of dataset)
    """)

    return train_set, validation_set, test_set

In [11]:
def normalize_dataset(values) -> Tuple[np.array, MinMaxScaler]:
    scaler = MinMaxScaler(feature_range=(0, 1))
    
    return (scaler.fit_transform(values), scaler)

In [12]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		for j in range(n_vars):
			names += [('var%d(t-%d)' % (j+1, i))]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg


In [13]:
def train_LSTM_model(
    train_X, 
    train_y, 
    validation_X,
    validation_y,
    epochs=500,
    batch_size=72,
    verbose=2,
    neurons=50,
    ) -> Tuple[Sequential, np.array]:
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')

    history = model.fit(
        train_X, train_y, 
        epochs=epochs, 
        batch_size=batch_size, 
        validation_data=(validation_X, validation_y), 
        verbose=verbose, 
        shuffle=False,
    )

    print(f"Model trained with {epochs} epochs")

    return (model, history)

## Error metrics

$$ MAPE = \frac{1}{n} \sum_{i=1}^n \frac{y_i - \hat{y}_i}{y_i} $$

In [13]:
#Defining MAPE function
def MAPE(Y_actual,Y_Predicted):
    mape = np.mean(np.abs((Y_actual + 1 - (Y_Predicted + 1)) / (Y_actual + 1))) * 100
    return mape

# Use a smaller dataset

In [8]:
reduction_size = 10000

reduced_dataset = dataset[-reduction_size:]

fig = reduced_dataset.plot(width=1920, height=300)
fig.show()
# pio.write_image(fig, 'plots/reduced_plot.png', height=1200, scale=1)

In [9]:
train_size = int(0.7 * len(reduced_dataset))
validation_size = int(0.1 * len(reduced_dataset))
test_size = len(reduced_dataset) - validation_size - train_size


train_set, validation_set, test_set = reduced_dataset[0: train_size], reduced_dataset[train_size: train_size + validation_size], reduced_dataset[train_size + validation_size: train_size + test_size + validation_size]


In [15]:
fig = go.Figure()

sets = {"blue": train_set, "green": validation_set, "red": test_set}

for color, s in sets.items():
    fig.add_trace(
        go.Scatter(x=s.index, y=s['PM10 [ug/m3]'], line=dict(color=color))
    )
    fig.add_trace(
        go.Scatter(x=s.index, y=s['PM2.5 [ug/m3]'], line=dict(color=color))
    )
    fig.add_trace(
        go.Scatter(x=s.index, y=s['TEMP [C]'], line=dict(color=color))
    )
    fig.add_trace(
        go.Scatter(x=s.index, y=s['PREC [mm]'], line=dict(color=color))
    )

fig.update_layout(width=1920, showlegend=False)

fig.show()

In [16]:
scaled_data, scaler = normalize_dataset(reduced_dataset.values)


reframed = series_to_supervised(scaled_data, n_in=1, n_out=1)
reframed.drop(reframed.columns[[4, 6, 7]], axis=1, inplace=True)
print(reframed.head())
print(reframed.shape)
n_features = len(reframed.columns)

train_set, validation_set, test_set = split_into_train_test_validation_sets(X=reframed.values)

train_X, train_y = train_set[:, :-1], train_set[:, -1]
validation_X, validation_y = validation_set[:, :-1], validation_set[:, -1]
test_X, test_y = test_set[:, :-1], test_set[:, -1]

train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
validation_X = validation_X.reshape((validation_X.shape[0], 1, validation_X.shape[1]))

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape, validation_X.shape, validation_y.shape)

   var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)   var2(t)
1   0.434657   0.321680   0.727919        0.0  0.138442
2   0.192373   0.138442   0.772046        0.0  0.148237
3   0.171954   0.148237   0.822332        0.0  0.185866
4   0.239259   0.185866   0.760790        0.0  0.235019
5   0.273417   0.235019   0.757989        0.0  0.098100
(9999, 5)

    Total dataset length: 9999

    Number of features: 5
    
    Train set shape: (6999, 5) (70.00% of dataset)
    Validation set shape: (999, 5) (9.99% of dataset)
    Test set shape: (2001, 5) (20.01% of dataset)
    
(6999, 1, 4) (6999,) (2001, 1, 4) (2001,) (999, 1, 4) (999,)


In [17]:
neurons = [1, 2, 10, 25, 50, 100, 150, 200]
epochs = [1, 2, 10, 25, 50, 100, 150, 200]

errors = dict()
val_loss = dict()
loss = dict()
inv_yhats = dict()

test_X_r = test_X.reshape((test_X.shape[0], test_X.shape[2]))
 
# invert scaling for actual
test_y_r = test_y.reshape((len(test_y), 1))

inv_y = np.concatenate((test_y_r, test_X_r[:, :-1]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:, 0]

for neuron in neurons:
    for epoch in  epochs:
        dict_key = f"Epochs: {epoch}, neurons: {neuron}"
        print(dict_key)
        
        model, history = train_LSTM_model(
            train_X, 
            train_y,
            validation_X, 
            validation_y, 
            epochs=epoch, 
            neurons=neuron, 
            verbose=0
        )

        # make a prediction
        yhat = model.predict(test_X)

        
        # invert scaling for forecast
        inv_yhat = np.concatenate((yhat, test_X_r[:, :-1]), axis=1)
        inv_yhat = scaler.inverse_transform(inv_yhat)
        inv_yhat = inv_yhat[:, 0]

        inv_yhats[dict_key] = inv_yhat

        # calculate RMSE
        rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))

        print('Validation RMSE: %.3f' % rmse)

        error_percentage = MAPE(inv_y, inv_yhat)
        
        errors[dict_key] = error_percentage
        
        print(f"Mean absolute error percentage for {epoch} epochs and {neuron} neurons: {error_percentage:.2f}%\n")

        val_loss[dict_key] = history.history["val_loss"]
        loss[dict_key] = history.history["loss"]

        new_fig = go.Figure()
        new_fig.add_trace(go.Scatter(x=list(dataset.index)[-len(inv_y):], y=inv_y,
                            mode='lines',
                            name='Truth value'))
        new_fig.add_trace(go.Scatter(x=list(dataset.index)[-len(inv_y):], y=inv_yhat,
                            mode='lines',
                            name='Prediction'))
        new_fig.update_layout(width=1200, yaxis_title="PM2.5", title=dict_key)

        # new_fig.show()
        pio.write_image(new_fig, f'plots/predictions/{neuron}-{epoch}p.png', width=1100, scale=2)

        print("===============================================================================================================")

errors = {v: k for (k, v) in errors.items()}
min_idx = np.min(list(errors.keys()))
best_conf = errors[min_idx]

print(f"Best configuration for LSTM among the tested parameters is : {best_conf}")

Epochs: 1, neurons: 1
Model trained with 1 epochs
Validation RMSE: 7.196
Mean absolute error percentage for 1 epochs and 1 neurons: 51.43%

Epochs: 2, neurons: 1
Model trained with 2 epochs
Validation RMSE: 7.524
Mean absolute error percentage for 2 epochs and 1 neurons: 81.67%

Epochs: 10, neurons: 1
Model trained with 10 epochs
Validation RMSE: 7.261
Mean absolute error percentage for 10 epochs and 1 neurons: 86.17%

Epochs: 25, neurons: 1
Model trained with 25 epochs
Validation RMSE: 5.483
Mean absolute error percentage for 25 epochs and 1 neurons: 49.33%

Epochs: 50, neurons: 1
Model trained with 50 epochs
Validation RMSE: 4.646
Mean absolute error percentage for 50 epochs and 1 neurons: 37.75%

Epochs: 100, neurons: 1
Model trained with 100 epochs
Validation RMSE: 4.575
Mean absolute error percentage for 100 epochs and 1 neurons: 36.71%

Epochs: 150, neurons: 1
Model trained with 150 epochs
Validation RMSE: 4.512
Mean absolute error percentage for 150 epochs and 1 neurons: 36.67%


## Prediction and losses graphs for the best parameter

In [22]:
vals = [v.strip() for v in "Epochs: 10, Neurons: 200".split(",")]
best_epochs, best_neurons = [int(v.split(" ")[1]) for v in vals]
best_epochs, best_neurons

(10, 200)

In [44]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(1, best_epochs + 1), y=loss[best_conf], name="Training loss", marker_color="#ed7953"))
fig.add_trace(go.Scatter(x=np.arange(1, best_epochs + 1), y=val_loss[best_conf], name="Validation loss", marker_color="#7201a8"))
fig.update_layout(title=f"Best parameters: {best_neurons} neurons with {best_epochs} epochs", xaxis_title="Epochs", yaxis_title="Loss")
fig.show()

In [20]:
with open('results/errors_lstm.csv', 'w') as f:
    for key, value in errors.items():
        f.write(f"{key} : {value:.3f}\n")


In [29]:
print(errors[best_conf])

33.79453051137226


In [48]:
for conf, error in errors.items():
    print(conf, f"\nError : {error:.2f}%\n")

Epochs: 1, neurons: 1 
Error : 51.43%

Epochs: 2, neurons: 1 
Error : 81.67%

Epochs: 10, neurons: 1 
Error : 86.17%

Epochs: 25, neurons: 1 
Error : 49.33%

Epochs: 50, neurons: 1 
Error : 37.75%

Epochs: 100, neurons: 1 
Error : 36.71%

Epochs: 150, neurons: 1 
Error : 36.67%

Epochs: 200, neurons: 1 
Error : 35.56%

Epochs: 1, neurons: 2 
Error : 46.65%

Epochs: 2, neurons: 2 
Error : 70.60%

Epochs: 10, neurons: 2 
Error : 63.66%

Epochs: 25, neurons: 2 
Error : 38.32%

Epochs: 50, neurons: 2 
Error : 38.69%

Epochs: 100, neurons: 2 
Error : 36.31%

Epochs: 150, neurons: 2 
Error : 35.23%

Epochs: 200, neurons: 2 
Error : 35.60%

Epochs: 1, neurons: 10 
Error : 72.70%

Epochs: 2, neurons: 10 
Error : 83.38%

Epochs: 10, neurons: 10 
Error : 42.62%

Epochs: 25, neurons: 10 
Error : 39.41%

Epochs: 50, neurons: 10 
Error : 38.50%

Epochs: 100, neurons: 10 
Error : 35.77%

Epochs: 150, neurons: 10 
Error : 36.48%

Epochs: 200, neurons: 10 
Error : 34.41%

Epochs: 1, neurons: 25 
Error

In [30]:

print(f"Mean average percentage error for best parameters: {errors[best_conf]:.3f}% for {best_conf}")

Mean average percentage error for best parameters: 33.795% for Epochs: 10, neurons: 200


### Visualize MAPE as a function of parameters

In [73]:
mape_data = pd.read_csv('results/errors_lstm.csv')

In [84]:
fig = go.Figure()

for neuron in neurons:
    vals = mape_data.loc[mape_data['Neurons'] == neuron]
    fig.add_trace(go.Scatter(x=vals['Epochs'], y=vals['Error'], name=f"{neuron} neurons"))

fig.update_layout(xaxis_title="Epochs", yaxis_title="MAPE error percentages", width=1100)

fig.show()