In [1]:
import os
import pickle

import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

from src.data_preparation.data_preprocessing import DataReshaperLSTM
from src.visualization.charts import ModelCharts
from src.visualization.metrics import PredictionEvaluator
from src.config import Config

## Load the data

In [2]:
config = Config()
variant_co2 = 'co2_per_capita'

# --- LSTM inputs -----------------------------------------------------------
# Train/test frames are keyed by (year, <config.additional_index>).
lstm_index_cols = ["year", config.additional_index]
train_lstm_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/train.csv')
test_lstm_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/test.csv')
train_lstm_df = pd.read_csv(train_lstm_path).set_index(lstm_index_cols)
test_lstm_df = pd.read_csv(test_lstm_path).set_index(lstm_index_cols)

# Preprocessor object stored next to the LSTM data.
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this project.
preprocessor_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/data_preprocessor_lstm.pkl')
with open(preprocessor_path, 'rb') as preprocessor_file:
    data_preprocessor_lstm = pickle.load(preprocessor_file)

# --- ARIMAX residuals ------------------------------------------------------
# The residual CSVs carry a "country" column; after indexing, that level is renamed
# to config.additional_index so it matches the LSTM frames.
residual_level_names = {"country": config.additional_index}
train_arimax_residuals_df = (
    pd.read_csv(os.path.join(config.output_cleaned_hybrid, f'arimax_lstm/{variant_co2}/pca_residuals_train.csv'))
    .set_index(["year", "country"])
    .rename_axis(index=residual_level_names)
)
test_arimax_residuals_df = (
    pd.read_csv(os.path.join(config.output_cleaned_hybrid, f'arimax_lstm/{variant_co2}/pca_residuals_test.csv'))
    .set_index(["year", "country"])
    .rename_axis(index=residual_level_names)
)

# Run both residual frames through the loaded preprocessor.
train_arimax_residuals_preprocessed, test_arimax_residuals_preprocessed = data_preprocessor_lstm.preprocess_data(
    train_arimax_residuals_df,
    test_arimax_residuals_df,
)

In [3]:
# Show the LSTM training frame loaded above.
train_lstm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,country_t-2,population_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,oil_co2_t-2,...,population_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1930,Argentina,0,0.006943,0.175266,0.507331,0.016325,0.048410,0.000000,0.003933,0.311606,0.022520,...,0.007087,0.174392,0.493851,0.022711,0.047334,0.000000,0.005904,0.306015,0.022659,0.236081
1931,Argentina,0,0.007176,0.178610,0.493851,0.023836,0.047334,0.000000,0.005904,0.306015,0.022659,...,0.007303,0.162400,0.481904,0.024072,0.045068,0.000000,0.005711,0.252516,0.021200,0.199768
1932,Argentina,0,0.007394,0.166327,0.481904,0.025265,0.045068,0.000000,0.005711,0.252516,0.021200,...,0.007503,0.146788,0.471364,0.033014,0.037719,0.000000,0.006939,0.217711,0.026236,0.179355
1933,Argentina,0,0.007595,0.150338,0.471364,0.034650,0.037719,0.000000,0.006939,0.217711,0.026236,...,0.007686,0.138581,0.462130,0.030168,0.033701,0.000000,0.009276,0.198031,0.029199,0.165318
1934,Argentina,0,0.007779,0.141933,0.462130,0.031663,0.033701,0.000000,0.009276,0.198031,0.029199,...,0.007850,0.142834,0.454119,0.030486,0.031171,0.000000,0.012256,0.184591,0.029855,0.155812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2003,Vietnam,44,0.059659,0.067622,0.070801,0.129113,0.020349,0.001616,0.004740,0.093405,0.019882,...,0.059873,0.070411,0.070078,0.153245,0.022190,0.001236,0.007499,0.089355,0.022043,0.068187
2004,Vietnam,44,0.060292,0.072114,0.070078,0.160839,0.022190,0.001236,0.007499,0.089355,0.022043,...,0.060508,0.075243,0.069361,0.171695,0.025545,0.001445,0.009425,0.092293,0.022477,0.070668
2005,Vietnam,44,0.060932,0.077062,0.069361,0.180204,0.025545,0.001445,0.009425,0.092293,0.022477,...,0.061146,0.080752,0.068657,0.186919,0.027515,0.001409,0.015698,0.093135,0.025559,0.064508
2006,Vietnam,44,0.061573,0.082705,0.068657,0.196182,0.027515,0.001409,0.015698,0.093135,0.025559,...,0.061780,0.086439,0.067971,0.215267,0.030893,0.001344,0.016666,0.086350,0.025280,0.061692


In [4]:
# Show the ARIMAX training residuals returned by data_preprocessor_lstm.preprocess_data.
train_arimax_residuals_preprocessed

Unnamed: 0_level_0,Unnamed: 1_level_0,residual
year,country_index,Unnamed: 2_level_1
1930,Argentina,0.972903
1931,Argentina,0.859663
1932,Argentina,0.859195
1933,Argentina,0.865234
1934,Argentina,0.864851
...,...,...
2003,Vietnam,0.866074
2004,Vietnam,0.866237
2005,Vietnam,0.864723
2006,Vietnam,0.865210


In [5]:
# Combine the train frames: index-on-index inner join of the preprocessed
# residuals (left) with the LSTM features (right).
train_hybrid_df = pd.merge(
    left=train_arimax_residuals_preprocessed,
    right=train_lstm_df,
    how="inner",
    left_index=True,
    right_index=True,
)
train_hybrid_df

Unnamed: 0_level_0,Unnamed: 1_level_0,residual,country_t-2,population_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,...,population_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1930,Argentina,0.972903,0,0.006943,0.175266,0.507331,0.016325,0.048410,0.000000,0.003933,0.311606,...,0.007087,0.174392,0.493851,0.022711,0.047334,0.000000,0.005904,0.306015,0.022659,0.236081
1931,Argentina,0.859663,0,0.007176,0.178610,0.493851,0.023836,0.047334,0.000000,0.005904,0.306015,...,0.007303,0.162400,0.481904,0.024072,0.045068,0.000000,0.005711,0.252516,0.021200,0.199768
1932,Argentina,0.859195,0,0.007394,0.166327,0.481904,0.025265,0.045068,0.000000,0.005711,0.252516,...,0.007503,0.146788,0.471364,0.033014,0.037719,0.000000,0.006939,0.217711,0.026236,0.179355
1933,Argentina,0.865234,0,0.007595,0.150338,0.471364,0.034650,0.037719,0.000000,0.006939,0.217711,...,0.007686,0.138581,0.462130,0.030168,0.033701,0.000000,0.009276,0.198031,0.029199,0.165318
1934,Argentina,0.864851,0,0.007779,0.141933,0.462130,0.031663,0.033701,0.000000,0.009276,0.198031,...,0.007850,0.142834,0.454119,0.030486,0.031171,0.000000,0.012256,0.184591,0.029855,0.155812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2003,Vietnam,0.866074,44,0.059659,0.067622,0.070801,0.129113,0.020349,0.001616,0.004740,0.093405,...,0.059873,0.070411,0.070078,0.153245,0.022190,0.001236,0.007499,0.089355,0.022043,0.068187
2004,Vietnam,0.866237,44,0.060292,0.072114,0.070078,0.160839,0.022190,0.001236,0.007499,0.089355,...,0.060508,0.075243,0.069361,0.171695,0.025545,0.001445,0.009425,0.092293,0.022477,0.070668
2005,Vietnam,0.864723,44,0.060932,0.077062,0.069361,0.180204,0.025545,0.001445,0.009425,0.092293,...,0.061146,0.080752,0.068657,0.186919,0.027515,0.001409,0.015698,0.093135,0.025559,0.064508
2006,Vietnam,0.865210,44,0.061573,0.082705,0.068657,0.196182,0.027515,0.001409,0.015698,0.093135,...,0.061780,0.086439,0.067971,0.215267,0.030893,0.001344,0.016666,0.086350,0.025280,0.061692


In [6]:
# Combine the test frames: index-on-index inner join of the preprocessed
# residuals (left) with the LSTM features (right).
test_hybrid_df = pd.merge(
    left=test_arimax_residuals_preprocessed,
    right=test_lstm_df,
    how="inner",
    left_index=True,
    right_index=True,
)
test_hybrid_df

Unnamed: 0_level_0,Unnamed: 1_level_0,residual,country_t-2,population_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,...,population_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2004,Argentina,0.868758,0,0.027473,0.316497,0.335627,0.071543,0.001888,0.003866,0.243811,0.102414,...,0.027557,0.335041,0.332180,0.088965,0.003384,0.003287,0.266162,0.139762,0.096260,0.176022
2005,Argentina,0.871657,0,0.027774,0.343145,0.332180,0.093373,0.003384,0.003287,0.266162,0.139762,...,0.027855,0.363731,0.328824,0.104327,0.003700,0.002737,0.316271,0.159878,0.108860,0.185939
2006,Argentina,0.873330,0,0.028074,0.372528,0.328824,0.109497,0.003700,0.002737,0.316271,0.159878,...,0.028161,0.394123,0.325443,0.123898,0.005445,0.002267,0.309455,0.168585,0.114077,0.190046
2007,Argentina,0.862541,0,0.028382,0.403655,0.325443,0.130038,0.005445,0.002267,0.309455,0.168585,...,0.028471,0.424181,0.322093,0.142410,0.006375,0.001963,0.313377,0.169177,0.128804,0.142005
2008,Argentina,0.860927,0,0.028694,0.434440,0.322093,0.149467,0.006375,0.001963,0.313377,0.169177,...,0.028776,0.461103,0.318868,0.149749,0.005975,0.001691,0.322480,0.123591,0.121350,0.122854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,Vietnam,0.870780,44,0.069875,0.160141,0.075854,0.619830,0.074690,0.001631,0.032261,0.090407,...,0.070087,0.166282,0.075123,0.678791,0.074917,0.001788,0.028043,0.102411,0.038999,0.098931
2019,Vietnam,0.871526,44,0.070571,0.170304,0.075123,0.712429,0.074917,0.001788,0.028043,0.102411,...,0.070760,0.177784,0.074425,0.781100,0.083753,0.001403,0.028835,0.101226,0.043189,0.109148
2020,Vietnam,0.873483,44,0.071247,0.182084,0.074425,0.819808,0.083753,0.001403,0.028835,0.101226,...,0.071417,0.189888,0.088506,0.881685,0.130114,0.001472,0.029117,0.101077,0.047605,0.111395
2021,Vietnam,0.875045,44,0.071909,0.194481,0.088506,0.925378,0.130114,0.001472,0.029117,0.101077,...,0.072083,0.193794,0.087708,0.928252,0.145356,0.001262,0.025840,0.101007,0.044492,0.109799


## Reshape the data

In [7]:
# Hand both hybrid frames to the project reshaper; it returns the four arrays
# in the order (x_train, x_test, y_train, y_test).
data_reshaper = DataReshaperLSTM()
reshaped_splits = data_reshaper.reshape_data(train_hybrid_df, test_hybrid_df)
x_train, x_test, y_train, y_test = reshaped_splits

## Build the model

In [8]:
# Per-sample input shape is taken from axes 1 and 2 of x_train.
n_steps, n_inputs = x_train.shape[1], x_train.shape[2]
input_shape = (n_steps, n_inputs)
output_units = 1

# Two stacked 50-unit LSTM layers, each followed by 20% dropout, then a
# single-unit dense head with ReLU activation.
model = Sequential()
model.add(LSTM(50, input_shape=input_shape, return_sequences=True))  # emits the full sequence for the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(output_units, activation='relu'))

model.compile(optimizer="adam", loss="mean_squared_error")
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 50)             14400     
                                                                 
 dropout (Dropout)           (None, 1, 50)             0         
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 34,651
Trainable params: 34,651
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [9]:
# Fit on the training arrays; epoch count and batch size come from the project config.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=config.epochs,
    batch_size=config.batch_size,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [10]:
# Save the model as an .h5 file under the configured models folder.
model_path = os.path.join(config.models_folder, f'{variant_co2}_pca_arimax_lstm_model.h5')
model.save(model_path)

# Evaluate the model on the test set. The model is compiled with a loss and no
# extra metrics, so evaluate() yields that loss value for x_test / y_test.
loss = model.evaluate(x_test, y_test, verbose = 0)

# Make the test loss the cell output so the evaluation result is actually visible.
loss

## Predictions

In [11]:
# Model outputs for both splits.
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

# Row / column counts handed to the preprocessor's inverse transform.
# NOTE(review): the "- 2" column offset is not explained in this notebook —
# confirm against the inverse_transform_data implementation.
n_train_rows = train_predictions.shape[0]
n_train_cols = train_hybrid_df.shape[1] - 2
n_test_rows = test_predictions.shape[0]
n_test_cols = test_hybrid_df.shape[1] - 2

# Predictions and targets go through the same inverse transform
# (presumably undoing the preprocessor's scaling — TODO confirm).
inverted_data_predicted_train_y = data_preprocessor_lstm.inverse_transform_data(train_predictions, n_train_rows, n_train_cols)
inverted_data_train_y = data_preprocessor_lstm.inverse_transform_data(y_train, n_train_rows, n_train_cols)

inverted_data_predicted_test_y = data_preprocessor_lstm.inverse_transform_data(test_predictions, n_test_rows, n_test_cols)
inverted_data_test_y = data_preprocessor_lstm.inverse_transform_data(y_test, n_test_rows, n_test_cols)



In [12]:
# Move the (year, <config.additional_index>) index levels back into ordinary
# columns so the row identifiers can be copied into the result tables.
train_df_reset = train_hybrid_df.reset_index()
test_df_reset = test_hybrid_df.reset_index()

# The last column of each inverse-transformed array is used as the CO2 value.
co2_predicted_train = inverted_data_predicted_train_y[:, -1]
co2_actual_train = inverted_data_train_y[:, -1]
co2_predicted_test = inverted_data_predicted_test_y[:, -1]
co2_actual_test = inverted_data_test_y[:, -1]

# The country index level is named from config.additional_index when the frames
# are loaded, so look the column up through the same config value instead of a
# hard-coded label that would break if the config changes.
country_col = config.additional_index

# Create DataFrames for train and test
train_results = pd.DataFrame({
    "country": train_df_reset[country_col].values,
    "year": train_df_reset["year"].values,
    "co2_predicted": co2_predicted_train,
    "co2_actual": co2_actual_train
})

test_results = pd.DataFrame({
    "country": test_df_reset[country_col].values,
    "year": test_df_reset["year"].values,
    "co2_predicted": co2_predicted_test,
    "co2_actual": co2_actual_test
})


# Write both result tables to the hybrid ARIMAX-LSTM predictions folder.
train_results.to_csv(os.path.join(config.predictions_hybrid_arimax_lstm, f'arimax_lstm_{variant_co2}_train.csv'))
test_results.to_csv(os.path.join(config.predictions_hybrid_arimax_lstm, f'arimax_lstm_{variant_co2}_test.csv'))

## Charts

In [13]:
charts = ModelCharts(train_results, test_results)

# Every chart call shares the same variant label and output location.
chart_kwargs = {
    "variant": f'{variant_co2}_pca',
    "model_output_file": config.predictions_hybrid_arimax_lstm,
}

# Line plots: combined, then one per dataset.
charts.generate_line_plot(**chart_kwargs)
charts.generate_line_plot_one_dataset(dataset_type='train', **chart_kwargs)
charts.generate_line_plot_one_dataset(dataset_type='test', **chart_kwargs)

# Scatter plots: combined, then one per dataset.
charts.generate_scatter_plot(**chart_kwargs)
charts.generate_scatter_plot_one_dataset(dataset_type='train', **chart_kwargs)
charts.generate_scatter_plot_one_dataset(dataset_type='test', **chart_kwargs)

## Metrics

In [14]:
# Score actual vs. predicted CO2 for both result tables; the output location
# comes from config.metrics_hybrid.
evaluator = PredictionEvaluator()
evaluator.evaluate(
    train_results,
    test_results,
    actual_col='co2_actual',
    predicted_col='co2_predicted',
    variant=f'{variant_co2}_pca',
    model_output_file=config.metrics_hybrid,
)