In [1]:
import os
import pickle

import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

from src.data_preparation.data_preprocessing import DataReshaperLSTM
from src.visualization.charts import ModelCharts
from src.visualization.metrics import PredictionEvaluator
from src.config import Config

## Load the data

In [2]:
# Project configuration and the target variant this notebook works on.
config = Config()
variant_co2 = 'co2'

# --- LSTM inputs -----------------------------------------------------------
# Train/test CSVs are indexed by (year, <config.additional_index>).
lstm_index_cols = ["year", config.additional_index]
train_lstm_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/train.csv')
test_lstm_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/test.csv')
preprocessor_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/data_preprocessor_lstm.pkl')

train_lstm_df = pd.read_csv(train_lstm_path).set_index(lstm_index_cols)
test_lstm_df = pd.read_csv(test_lstm_path).set_index(lstm_index_cols)

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load preprocessor artifacts written by this project's own pipeline.
with open(preprocessor_path, 'rb') as preprocessor_file:
    data_preprocessor_lstm = pickle.load(preprocessor_file)

# --- LightGBM residuals ----------------------------------------------------
# The residual CSVs use a "country" column; after indexing, that level is
# renamed to config.additional_index so it lines up with the LSTM frames.
residual_index_cols = ["year", "country"]
residual_level_rename = {"country": config.additional_index}
train_residuals_path = os.path.join(config.output_cleaned_hybrid, f'lightgbm_lstm/{variant_co2}/residuals_train.csv')
test_residuals_path = os.path.join(config.output_cleaned_hybrid, f'lightgbm_lstm/{variant_co2}/residuals_test.csv')

train_lightgbm_residuals_df = (
    pd.read_csv(train_residuals_path)
    .set_index(residual_index_cols)
    .rename_axis(index=residual_level_rename)
)
test_lightgbm_residuals_df = (
    pd.read_csv(test_residuals_path)
    .set_index(residual_index_cols)
    .rename_axis(index=residual_level_rename)
)

# NOTE(review): if preprocess_data re-fits the preprocessor's internal scaler on
# the residuals, the later inverse_transform_data calls would use that new
# scaling rather than the one stored in the pickle -- confirm.
train_lightgbm_residuals_preprocessed, test_lightgbm_residuals_preprocessed = (
    data_preprocessor_lstm.preprocess_data(train_lightgbm_residuals_df, test_lightgbm_residuals_df)
)

In [3]:
# Inspect the LSTM training frame (a bare last expression gets the rich table display).
train_lstm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,country_t-2,population_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,oil_co2_t-2,...,population_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1930,Argentina,0,0.006943,0.008760,0.026846,0.000282,0.001872,0.000000,0.000253,0.189716,0.001601,...,0.007087,0.008291,0.02649,0.000370,0.001696,0.000000,0.000390,0.190037,0.001654,0.050794
1931,Argentina,0,0.007176,0.009169,0.026846,0.000423,0.001881,0.000000,0.000390,0.190037,0.001654,...,0.007303,0.007940,0.02649,0.000402,0.001655,0.000000,0.000386,0.170616,0.001586,0.046617
1932,Argentina,0,0.007394,0.008782,0.026846,0.000459,0.001835,0.000000,0.000386,0.170616,0.001586,...,0.007503,0.007374,0.02649,0.000564,0.001416,0.000000,0.000480,0.157812,0.002007,0.044323
1933,Argentina,0,0.007595,0.008158,0.026846,0.000644,0.001570,0.000000,0.000480,0.157812,0.002007,...,0.007686,0.007125,0.02649,0.000525,0.001291,0.000000,0.000655,0.150694,0.002278,0.042756
1934,Argentina,0,0.007779,0.007883,0.026846,0.000600,0.001431,0.000000,0.000655,0.150694,0.002278,...,0.007850,0.007466,0.02649,0.000540,0.001215,0.000000,0.000880,0.145852,0.002371,0.041755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2003,Vietnam,44,0.059659,0.027092,0.026846,0.015970,0.005640,0.017690,0.002183,0.159475,0.010126,...,0.059873,0.026209,0.02649,0.017601,0.005604,0.013673,0.003490,0.148809,0.011342,0.055115
2004,Vietnam,44,0.060292,0.028933,0.026846,0.020100,0.006213,0.013673,0.003490,0.148809,0.011342,...,0.060508,0.028055,0.02649,0.019924,0.006518,0.016147,0.004431,0.157641,0.011685,0.057655
2005,Vietnam,44,0.060932,0.030970,0.026846,0.022752,0.007227,0.016147,0.004431,0.157641,0.011685,...,0.061146,0.030158,0.02649,0.021913,0.007093,0.015907,0.007456,0.160666,0.013424,0.052390
2006,Vietnam,44,0.061573,0.033289,0.026846,0.025024,0.007864,0.015907,0.007456,0.160666,0.013424,...,0.061780,0.032354,0.02649,0.025491,0.008044,0.015322,0.007996,0.141767,0.013411,0.050074


In [4]:
# Inspect the preprocessed LightGBM training residuals returned by preprocess_data.
train_lightgbm_residuals_preprocessed

Unnamed: 0_level_0,Unnamed: 1_level_0,residual
year,country_index,Unnamed: 2_level_1
1930,Argentina,0.306904
1931,Argentina,0.309740
1932,Argentina,0.311444
1933,Argentina,0.308531
1934,Argentina,0.306986
...,...,...
2003,Vietnam,0.318426
2004,Vietnam,0.322770
2005,Vietnam,0.296870
2006,Vietnam,0.305532


In [5]:
# Combine the training frames: residual column first, then the LSTM columns.
# The join is on the index of both frames; pd.merge defaults to an inner join,
# so only index entries present in both frames survive.
train_hybrid_df = pd.merge(
    left=train_lightgbm_residuals_preprocessed,
    right=train_lstm_df,
    left_index=True,
    right_index=True,
)
train_hybrid_df

Unnamed: 0_level_0,Unnamed: 1_level_0,residual,country_t-2,population_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,...,population_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1930,Argentina,0.306904,0,0.006943,0.008760,0.026846,0.000282,0.001872,0.000000,0.000253,0.189716,...,0.007087,0.008291,0.02649,0.000370,0.001696,0.000000,0.000390,0.190037,0.001654,0.050794
1931,Argentina,0.309740,0,0.007176,0.009169,0.026846,0.000423,0.001881,0.000000,0.000390,0.190037,...,0.007303,0.007940,0.02649,0.000402,0.001655,0.000000,0.000386,0.170616,0.001586,0.046617
1932,Argentina,0.311444,0,0.007394,0.008782,0.026846,0.000459,0.001835,0.000000,0.000386,0.170616,...,0.007503,0.007374,0.02649,0.000564,0.001416,0.000000,0.000480,0.157812,0.002007,0.044323
1933,Argentina,0.308531,0,0.007595,0.008158,0.026846,0.000644,0.001570,0.000000,0.000480,0.157812,...,0.007686,0.007125,0.02649,0.000525,0.001291,0.000000,0.000655,0.150694,0.002278,0.042756
1934,Argentina,0.306986,0,0.007779,0.007883,0.026846,0.000600,0.001431,0.000000,0.000655,0.150694,...,0.007850,0.007466,0.02649,0.000540,0.001215,0.000000,0.000880,0.145852,0.002371,0.041755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2003,Vietnam,0.318426,44,0.059659,0.027092,0.026846,0.015970,0.005640,0.017690,0.002183,0.159475,...,0.059873,0.026209,0.02649,0.017601,0.005604,0.013673,0.003490,0.148809,0.011342,0.055115
2004,Vietnam,0.322770,44,0.060292,0.028933,0.026846,0.020100,0.006213,0.013673,0.003490,0.148809,...,0.060508,0.028055,0.02649,0.019924,0.006518,0.016147,0.004431,0.157641,0.011685,0.057655
2005,Vietnam,0.296870,44,0.060932,0.030970,0.026846,0.022752,0.007227,0.016147,0.004431,0.157641,...,0.061146,0.030158,0.02649,0.021913,0.007093,0.015907,0.007456,0.160666,0.013424,0.052390
2006,Vietnam,0.305532,44,0.061573,0.033289,0.026846,0.025024,0.007864,0.015907,0.007456,0.160666,...,0.061780,0.032354,0.02649,0.025491,0.008044,0.015322,0.007996,0.141767,0.013411,0.050074


In [6]:
# Combine the test frames: residual column first, then the LSTM columns.
# The join is on the index of both frames; pd.merge defaults to an inner join,
# so only index entries present in both frames survive.
test_hybrid_df = pd.merge(
    left=test_lightgbm_residuals_preprocessed,
    right=test_lstm_df,
    left_index=True,
    right_index=True,
)
test_hybrid_df

Unnamed: 0_level_0,Unnamed: 1_level_0,residual,country_t-2,population_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,...,population_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2004,Argentina,0.319805,0,0.027473,0.053334,0.060403,0.004200,0.000248,0.020084,0.053300,0.138354,...,0.027557,0.052814,0.059603,0.004850,0.000406,0.017251,0.058790,0.188310,0.023510,0.084815
2005,Argentina,0.325793,0,0.027774,0.058281,0.060403,0.005539,0.000450,0.017251,0.058790,0.188310,...,0.027855,0.057789,0.059603,0.005746,0.000448,0.014511,0.070571,0.216190,0.026859,0.089670
2006,Argentina,0.331326,0,0.028074,0.063769,0.060403,0.006561,0.000497,0.014511,0.070571,0.216190,...,0.028161,0.063138,0.059603,0.006894,0.000666,0.012143,0.069768,0.229215,0.028439,0.092114
2007,Argentina,0.294222,0,0.028382,0.069668,0.060403,0.007873,0.000739,0.012143,0.069768,0.229215,...,0.028471,0.068539,0.059603,0.008007,0.000788,0.010627,0.071386,0.231403,0.032444,0.071866
2008,Argentina,0.287459,0,0.028694,0.075626,0.060403,0.009144,0.000874,0.010627,0.071386,0.231403,...,0.028776,0.075119,0.059603,0.008505,0.000746,0.009244,0.074203,0.169793,0.030876,0.063909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,Vietnam,0.300232,44,0.069875,0.068733,0.033557,0.089450,0.024151,0.020829,0.017337,0.160261,...,0.070087,0.066628,0.033113,0.090909,0.022062,0.023050,0.015216,0.199982,0.023399,0.091968
2019,Vietnam,0.326799,44,0.070571,0.073518,0.033557,0.103814,0.024460,0.023050,0.015216,0.199982,...,0.070760,0.071618,0.033113,0.105591,0.024895,0.018262,0.015793,0.197052,0.026156,0.103267
2020,Vietnam,0.333081,44,0.071247,0.079022,0.033557,0.120581,0.027602,0.018262,0.015793,0.197052,...,0.071417,0.076904,0.039735,0.120271,0.039027,0.019326,0.016092,0.197468,0.029092,0.106353
2021,Vietnam,0.328863,44,0.071909,0.084853,0.040268,0.137345,0.043270,0.019326,0.016092,0.197468,...,0.072083,0.079115,0.039735,0.127776,0.043996,0.016719,0.014411,0.198151,0.027438,0.105343


## Reshape the data

In [7]:
# Convert the merged train/test frames into the feature and target arrays
# consumed by the network.
data_reshaper = DataReshaperLSTM()
reshaped_arrays = data_reshaper.reshape_data(train_hybrid_df, test_hybrid_df)
x_train, x_test, y_train, y_test = reshaped_arrays

## Build the model

In [8]:
# Seed TensorFlow's global RNG before any layer is created so that weight
# initialisation and dropout masks repeat across "Restart & Run All"
# (bit-exact repeatability can still depend on the hardware/backend).
RANDOM_SEED = 42
tf.random.set_seed(RANDOM_SEED)

# Keras recurrent layers take (timesteps, features) per sample; both are read
# from axes 1 and 2 of x_train.
input_shape = (x_train.shape[1], x_train.shape[2])
output_units = 1

# Two stacked LSTM layers; the first returns the full sequence so the second
# LSTM receives a 3-D input. Dropout follows each recurrent layer.
# NOTE(review): the relu output layer cannot emit negative values; that is only
# appropriate if the scaled target is never negative -- confirm.
model = Sequential([
    LSTM(50, input_shape=input_shape, return_sequences = True),
    Dropout(0.2),
    LSTM(50),
    Dropout(0.2),
    Dense(output_units, activation='relu')
])

model.compile(optimizer="adam", loss="mean_squared_error")
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 50)             14400     
                                                                 
 dropout (Dropout)           (None, 1, 50)             0         
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 34,651
Trainable params: 34,651
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [9]:
# Fit on the training arrays; epoch count and batch size come from the shared config.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=config.epochs,
    batch_size=config.batch_size,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [10]:
# Save the model (HDF5 file inside the configured models folder)
model_path = os.path.join(config.models_folder, f'{variant_co2}_lightgbm_lstm_model.h5')
model.save(model_path)

# Evaluate the model on the test set. The model is compiled with
# mean_squared_error and no extra metrics, so this is the test MSE in the
# scaled target space.
loss = model.evaluate(x_test, y_test, verbose = 0)

# Show the value: previously it was computed but never displayed or used.
loss

## Predictions

In [11]:
# Raw network outputs for both splits (still in the preprocessor's scaled space).
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

# Arguments handed to inverse_transform_data: number of rows and a width taken
# from the hybrid frames.
# NOTE(review): the `- 2` offset presumably matches the column count the
# preprocessor was fitted on -- confirm against inverse_transform_data.
n_train_rows = train_predictions.shape[0]
n_test_rows = test_predictions.shape[0]
train_width = train_hybrid_df.shape[1] - 2
test_width = test_hybrid_df.shape[1] - 2

# Undo the preprocessing for predictions and ground truth of each split.
inverted_data_predicted_train_y = data_preprocessor_lstm.inverse_transform_data(
    train_predictions, n_train_rows, train_width
)
inverted_data_train_y = data_preprocessor_lstm.inverse_transform_data(
    y_train, n_train_rows, train_width
)

inverted_data_predicted_test_y = data_preprocessor_lstm.inverse_transform_data(
    test_predictions, n_test_rows, test_width
)
inverted_data_test_y = data_preprocessor_lstm.inverse_transform_data(
    y_test, n_test_rows, test_width
)



In [12]:
# Flatten the (year, <config.additional_index>) index into ordinary columns so
# the identifiers can be copied into the result tables.
train_df_reset = train_hybrid_df.reset_index()
test_df_reset = test_hybrid_df.reset_index()

# The last column of each inverse-transformed array is used as the co2 value.
co2_predicted_train = inverted_data_predicted_train_y[:, -1]
co2_actual_train = inverted_data_train_y[:, -1]
co2_predicted_test = inverted_data_predicted_test_y[:, -1]
co2_actual_test = inverted_data_test_y[:, -1]

# Create DataFrames for train and test.
# The country level is looked up through config.additional_index, the same
# setting that names this index level when the data is loaded, instead of a
# hard-coded "country_index" that would break if the config value changed.
train_results = pd.DataFrame({
    "country": train_df_reset[config.additional_index].values,
    "year": train_df_reset["year"].values,
    "co2_predicted": co2_predicted_train,
    "co2_actual": co2_actual_train
})

test_results = pd.DataFrame({
    "country": test_df_reset[config.additional_index].values,
    "year": test_df_reset["year"].values,
    "co2_predicted": co2_predicted_test,
    "co2_actual": co2_actual_test
})


# Persist both tables. to_csv is left at its defaults, so the positional row
# index is written as the first (unnamed) CSV column.
train_results.to_csv(os.path.join(config.predictions_hybrid_lightgbm_lstm, f'lightgbm_lstm_{variant_co2}_train.csv'))
test_results.to_csv(os.path.join(config.predictions_hybrid_lightgbm_lstm, f'lightgbm_lstm_{variant_co2}_test.csv'))

## Charts

In [None]:
charts = ModelCharts(train_results, test_results)

# Keyword arguments shared by every chart call in this cell.
chart_kwargs = {
    "variant": variant_co2,
    "model_output_file": config.predictions_hybrid_lightgbm_lstm,
}

#Line plot: combined view first, then one plot per dataset
charts.generate_line_plot(**chart_kwargs)
for dataset_type in ('train', 'test'):
    charts.generate_line_plot_one_dataset(dataset_type=dataset_type, **chart_kwargs)

#Scatter plot: combined view first, then one plot per dataset
charts.generate_scatter_plot(**chart_kwargs)
charts.generate_scatter_plot_one_dataset(dataset_type='train', **chart_kwargs)
# Kept as the cell's final bare expression so the cell output is unchanged.
charts.generate_scatter_plot_one_dataset(dataset_type='test', **chart_kwargs)

## Metrics

In [None]:
# Score the train and test result tables; the call stays the cell's last
# expression so whatever it returns is displayed.
evaluator = PredictionEvaluator()
evaluator.evaluate(
    train_results,
    test_results,
    actual_col='co2_actual',
    predicted_col='co2_predicted',
    variant=variant_co2,
    model_output_file=config.metrics_hybrid,
)