# LSTM model

### variant with 2 delays

##### Example: feature data from 2019 + feature data from 2020 -> target CO2 for 2021

## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src_new.config import Config
from utils.comparing_actual_vs_prediction import PredictionEvaluator
from utils.model_charts import ModelCharts
#from model.lstm_model import LSTMModelBuilder

In [2]:
class DataReshaperLSTM:
    """Split flat train/test frames into 3D LSTM inputs and 2D targets.

    The window length (``time_steps``) and the number of trailing target
    columns (``no_of_targets``) are read from ``Config`` at construction.
    """

    def __init__(self):
        config = Config()
        self.time_steps = config.window_size
        self.no_of_targets = config.pred_horizon

    def _split_features_targets(self, frame):
        """Return ``(X, y)``; ``y`` is the last ``no_of_targets`` columns of *frame*."""
        # Materialise the underlying array once instead of once per slice.
        values = frame.values
        return values[:, :-self.no_of_targets], values[:, -self.no_of_targets:]

    def _to_3d(self, features, padding):
        """Zero-pad *features* on the right by *padding* columns and reshape to 3D."""
        if padding:
            zeros = np.zeros((features.shape[0], padding))
            features = np.hstack((features, zeros))
        per_step = features.shape[1] // self.time_steps
        return features.reshape((features.shape[0], self.time_steps, per_step))

    def reshape_data(self, train_df, test_df):
        """Reshape both frames to ``[samples, timesteps, features]``.

        The last ``no_of_targets`` columns of each frame are the targets; all
        preceding columns are features. If the feature count is not divisible
        by ``time_steps``, zero-filled columns are appended after the last
        feature column until it is.

        NOTE(review): the padding is appended at the end of each row, so when
        it applies, the columns are re-chunked across timesteps rather than
        padded per timestep -- confirm this is the intended layout.

        Args:
            train_df: Training frame (features followed by target columns).
            test_df: Test frame with the same column layout as ``train_df``.

        Returns:
            Tuple ``(x_train, x_test, y_train, y_test)`` where the ``x_*``
            arrays are 3D and the ``y_*`` arrays are 2D.

        Raises:
            ValueError: If ``time_steps`` or ``no_of_targets`` is not
                positive, or the two frames have different feature counts.
        """
        # Explicit checks instead of ``assert``: asserts vanish under ``-O``
        # and the original one could never fire after padding anyway.
        if self.time_steps <= 0:
            raise ValueError(
                f"time_steps must be a positive integer, got {self.time_steps!r}"
            )
        if self.no_of_targets <= 0:
            # ``[:, :-0]`` would silently return an empty feature matrix.
            raise ValueError(
                f"no_of_targets must be a positive integer, got {self.no_of_targets!r}"
            )

        X_train, y_train = self._split_features_targets(train_df)
        X_test, y_test = self._split_features_targets(test_df)

        n_features = X_train.shape[1]
        if X_test.shape[1] != n_features:
            # The same padding is applied to both splits, so they must agree.
            raise ValueError(
                "train and test frames must have the same number of feature "
                f"columns, got {n_features} and {X_test.shape[1]}"
            )

        # Columns of zeros needed to reach the next multiple of time_steps
        # (0 when the feature count is already divisible).
        padding = -n_features % self.time_steps

        x_train = self._to_3d(X_train, padding)
        x_test = self._to_3d(X_test, padding)

        return x_train, x_test, y_train, y_test

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

class LSTMModelBuilder:
    """Builds a compiled two-layer stacked LSTM network.

    Args:
        input_shape: Per-sample input shape passed to the first LSTM layer.
        output_units: Width of the final Dense layer (defaults to 1).
    """

    def __init__(self, input_shape, output_units=1):
        self.input_shape, self.output_units = input_shape, output_units

    def build_model(self):
        """Assemble and compile the network, then return it.

        Layers, in order: LSTM(50, returning sequences) -> Dropout(0.2) ->
        LSTM(50) -> Dropout(0.2) -> Dense(output_units, relu). Compiled with
        the Adam optimizer and mean-squared-error loss.
        """
        network = Sequential()
        # First recurrent layer keeps the full sequence so the second LSTM
        # receives one vector per timestep.
        network.add(LSTM(50, input_shape=self.input_shape, return_sequences=True))
        network.add(Dropout(0.2))
        network.add(LSTM(50))
        network.add(Dropout(0.2))
        # relu on the output layer means predictions are never negative.
        network.add(Dense(self.output_units, activation='relu'))
        network.compile(optimizer="adam", loss="mean_squared_error")
        return network

# Data

In [4]:
config = Config()
# Read the train and test CSVs in that order.
train_df, test_df = map(
    pd.read_csv,
    (
        "../output/0_data_cleaning/lstm/train_lstm.csv",
        "../output/0_data_cleaning/lstm/test_lstm.csv",
    ),
)

In [5]:
# Notebook display of the training frame loaded from CSV.
train_df

Unnamed: 0,year,country_index,country_t-3,population_t-3,gdp_t-3,temperature_change_from_co2_t-3,cement_co2_t-3,coal_co2_t-3,flaring_co2_t-3,gas_co2_t-3,...,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2_t+1,co2_t+2
0,1931-1932,Argentina,0,0.006986,0.185214,0.507331,0.016797,0.048410,0.000000,0.003933,...,0.166327,0.481904,0.024072,0.045068,0.000000,0.005711,0.252516,0.021200,0.199768,0.179355
1,1932-1933,Argentina,0,0.007221,0.188748,0.493851,0.024525,0.047334,0.000000,0.005904,...,0.150338,0.471364,0.033014,0.037719,0.000000,0.006939,0.217711,0.026236,0.179355,0.165318
2,1933-1934,Argentina,0,0.007440,0.175768,0.481904,0.025995,0.045068,0.000000,0.005711,...,0.141933,0.462130,0.030168,0.033701,0.000000,0.009276,0.198031,0.029199,0.165318,0.155812
3,1934-1935,Argentina,0,0.007643,0.158872,0.471364,0.035651,0.037719,0.000000,0.006939,...,0.146288,0.454119,0.030486,0.031171,0.000000,0.012256,0.184591,0.029855,0.155812,0.147009
4,1935-1936,Argentina,0,0.007828,0.149989,0.462130,0.032579,0.033701,0.000000,0.009276,...,0.155917,0.446248,0.033024,0.033827,0.000000,0.014548,0.174709,0.030048,0.147009,0.141011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029,2003-2004,Vietnam,44,0.059400,0.067103,0.071533,0.114904,0.017069,0.001639,0.005719,...,0.072114,0.070078,0.153245,0.022190,0.001236,0.007499,0.089355,0.022043,0.068187,0.070668
3030,2004-2005,Vietnam,44,0.060031,0.071460,0.070801,0.132844,0.020349,0.001616,0.004740,...,0.077062,0.069361,0.171695,0.025545,0.001445,0.009425,0.092293,0.022477,0.070668,0.064508
3031,2005-2006,Vietnam,44,0.060668,0.076207,0.070078,0.165488,0.022190,0.001236,0.007499,...,0.082705,0.068657,0.186919,0.027515,0.001409,0.015698,0.093135,0.025559,0.064508,0.061692
3032,2006-2007,Vietnam,44,0.061312,0.081437,0.069361,0.185412,0.025545,0.001445,0.009425,...,0.088530,0.067971,0.215267,0.030893,0.001344,0.016666,0.086350,0.025280,0.061692,0.065353


In [6]:
# Copy "country_index" into the configured additional-index column, then
# index the training frame by (year, additional index).
train_index_columns = ["year", config.additional_index]
train_df[train_index_columns[1]] = train_df["country_index"]
train_df = train_df.set_index(train_index_columns)

In [7]:
# Copy "country_index" into the configured additional-index column, then
# index the test frame by (year, additional index).
test_index_columns = ["year", config.additional_index]
test_df[test_index_columns[1]] = test_df["country_index"]
test_df = test_df.set_index(test_index_columns)

In [8]:
# Turn both frames into 3D LSTM inputs plus 2D targets.
data_resherper = DataReshaperLSTM()
x_train, x_test, y_train, y_test = data_resherper.reshape_data(
    train_df=train_df,
    test_df=test_df,
)

In [9]:
# Notebook display of the reshaped training inputs.
x_train

array([[[0.00000000e+00, 6.98612119e-03, 1.85214099e-01, ...,
         3.93290870e-03, 3.11606196e-01, 2.25196873e-02],
        [0.00000000e+00, 7.13110326e-03, 1.80486080e-01, ...,
         5.90374525e-03, 3.06015032e-01, 2.26591021e-02],
        [0.00000000e+00, 7.25949876e-03, 1.66327412e-01, ...,
         5.71060913e-03, 2.52516163e-01, 2.12004556e-02]],

       [[0.00000000e+00, 7.22095525e-03, 1.88747694e-01, ...,
         5.90374525e-03, 3.06015032e-01, 2.26591021e-02],
        [0.00000000e+00, 7.34887138e-03, 1.68074818e-01, ...,
         5.71060913e-03, 2.52516163e-01, 2.12004556e-02],
        [0.00000000e+00, 7.45952988e-03, 1.50338440e-01, ...,
         6.93907702e-03, 2.17711097e-01, 2.62355153e-02]],

       [[0.00000000e+00, 7.44007089e-03, 1.75768316e-01, ...,
         5.71060913e-03, 2.52516163e-01, 2.12004556e-02],
        [0.00000000e+00, 7.55015263e-03, 1.51917869e-01, ...,
         6.93907702e-03, 2.17711097e-01, 2.62355153e-02],
        [0.00000000e+00, 7.64228324e

In [10]:
# Build the LSTM network: input shape is (timesteps, features per step),
# output width is the configured prediction horizon.
lstm_model_builder = LSTMModelBuilder(
    input_shape=x_train.shape[1:3],
    output_units=config.pred_horizon,
)
model = lstm_model_builder.build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 3, 50)             12200     
                                                                 
 dropout (Dropout)           (None, 3, 50)             0         
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense (Dense)               (None, 2)                 102       
                                                                 
Total params: 32,502
Trainable params: 32,502
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Fit the network on the training split; epochs and batch size come from Config.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=config.epochs,
    batch_size=config.batch_size,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [12]:
# Persist the trained network under the configured LSTM output directory.
model_path = f'{config.output_lstm}/lstm_model.h5'
model.save(model_path)

In [13]:
# Compute the loss on the held-out test split without progress output.
loss = model.evaluate(
    x=x_test,
    y=y_test,
    verbose=0,
)

In [14]:
import pickle

# Load the fitted DataScaler object (TODO: needs fixing).
# pickle.load can execute arbitrary code from the file; only load artifacts
# produced by this project's own pipeline.
preprocessor_path = '../output/0_data_cleaning/lstm/data_preprocessor.pkl'
with open(preprocessor_path, 'rb') as preprocessor_file:
    data_preprocessor = pickle.load(preprocessor_file)

In [15]:
# Make predictions
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

# Row count and column count handed to the preprocessor for the train split.
# Presumably inverse_transform_data maps scaled values back to original
# units -- confirm against the preprocessor implementation.
train_rows = train_predictions.shape[0]
train_cols = train_df.shape[1] - config.window_size
inverted_data_predicted_train_y = data_preprocessor.inverse_transform_data(
    train_predictions, train_rows, train_cols
)
inverted_data_train_y = data_preprocessor.inverse_transform_data(
    y_train, train_rows, train_cols
)

# Same arguments for the test split.
test_rows = test_predictions.shape[0]
test_cols = test_df.shape[1] - config.window_size
inverted_data_predicted_test_y = data_preprocessor.inverse_transform_data(
    test_predictions, test_rows, test_cols
)
inverted_data_test_y = data_preprocessor.inverse_transform_data(
    y_test, test_rows, test_cols
)



In [16]:
# Initialize the evaluator with the configured prediction horizon
evaluator = PredictionEvaluator(config.pred_horizon)

In [17]:
# Compare actual vs. predicted targets on the training split and print the
# summary metrics.
train_evaluation = evaluator.evaluate_predictions(
    inverted_data_train_y,
    inverted_data_predicted_train_y,
    train_df.index,
)
train_predictions_df, train_summary_metrics = train_evaluation
print(train_summary_metrics)

  Metric       Overall
0    MAE  1.284443e-06
1    MSE  4.850558e-12
2   RMSE  2.202398e-06
3     R2  9.359387e-01


In [18]:
# Compare actual vs. predicted targets on the test split and print the
# summary metrics.
test_evaluation = evaluator.evaluate_predictions(
    inverted_data_test_y,
    inverted_data_predicted_test_y,
    test_df.index,
)
test_predictions_df, test_summary_metrics = test_evaluation
print(test_summary_metrics)

  Metric       Overall
0    MAE  1.149342e-06
1    MSE  3.075133e-12
2   RMSE  1.753606e-06
3     R2  9.014893e-01


In [19]:
# Write both prediction frames under the configured LSTM output directory.
train_csv_path = f'{config.output_lstm}/3_train_predictions.csv'
train_predictions_df.to_csv(train_csv_path)
test_csv_path = f'{config.output_lstm}/3_test_predictions.csv'
test_predictions_df.to_csv(test_csv_path)

In [20]:
# Build the chart helper from both prediction frames and the horizon.
charts = ModelCharts(
    train_predictions_df,
    test_predictions_df,
    config.pred_horizon,
)
# NOTE(review): config.year_index is passed as both the first and second
# argument -- confirm this matches load_and_process_data's signature.
processed_data = charts.load_and_process_data(
    config.year_index,
    config.year_index,
    config.additional_index,
)
# Render line and scatter plots into the LSTM output directory.
charts.generate_line_and_scatter_plots(
    processed_data,
    config.year_index,
    config.additional_index,
    config.output_lstm,
)
