In [1]:
import os
import pickle

import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping

from src.data_preparation.data_preprocessing import DataReshaperLSTM
from src.visualization.metrics import PredictionEvaluator, GlobalResults
from src.config import Config

## Load the data

In [2]:
config = Config()
variant_co2 = 'co2_per_capita'

# --- LSTM inputs -------------------------------------------------------------
# Both splits are indexed by (year, <configured country index>) and lose the
# helper column 'country_order'.
lstm_index_columns = ["year", config.additional_index]
train_lstm_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/train.csv')
test_lstm_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/test.csv')

train_lstm_df = (
    pd.read_csv(train_lstm_path)
    .set_index(lstm_index_columns)
    .drop(columns=['country_order'])
)
test_lstm_df = (
    pd.read_csv(test_lstm_path)
    .set_index(lstm_index_columns)
    .drop(columns=['country_order'])
)

# The same pickle is deserialized twice on purpose, giving two independent objects:
# one is used later to invert predictions, the other to preprocess the LightGBM output.
# NOTE(review): presumably preprocess_data() changes the preprocessor's state, which is
# why a second untouched copy is kept - confirm against the preprocessor class.
# pickle.load executes arbitrary code; only load files produced by this project.
preprocessor_path = os.path.join(config.output_cleaned_lstm, f'{variant_co2}/data_preprocessor_lstm.pkl')

with open(preprocessor_path, 'rb') as preprocessor_file:
    data_preprocessor_lstm_for_prediction = pickle.load(preprocessor_file)

with open(preprocessor_path, 'rb') as preprocessor_file:
    data_preprocessor_lstm = pickle.load(preprocessor_file)

# --- LightGBM predictions ----------------------------------------------------
# These files name the second index level "country"; rename it so that it matches
# the index of the LSTM frames.
lightgbm_index_rename = {"country": config.additional_index}
train_lightgbm_path = os.path.join(config.output_cleaned_hybrid, f'lightgbm_lstm/{variant_co2}/lightgbm_pred_train.csv')
test_lightgbm_path = os.path.join(config.output_cleaned_hybrid, f'lightgbm_lstm/{variant_co2}/lightgbm_pred_test.csv')

train_lightgbm_pred_df = (
    pd.read_csv(train_lightgbm_path)
    .set_index(["year", "country"])
    .rename_axis(index=lightgbm_index_rename)
)
test_lightgbm_pred_df = (
    pd.read_csv(test_lightgbm_path)
    .set_index(["year", "country"])
    .rename_axis(index=lightgbm_index_rename)
)

train_lightgbm_pred_preprocessed, test_lightgbm_pred_preprocessed = data_preprocessor_lstm.preprocess_data(
    train_lightgbm_pred_df,
    test_lightgbm_pred_df,
)

In [3]:
# Expose the LightGBM prediction once per LSTM time step (t-2 and t-1).
# NOTE(review): both lag columns receive the same, unshifted prediction of the row -
# confirm this duplication is intended rather than a true lag.
lightgbm_lag_columns = ['lightgbm_pred_t-2', 'lightgbm_pred_t-1']

# Rebind instead of mutating in place, and skip the work when the lag columns already
# exist, so re-running this cell no longer fails on the dropped 'lightgbm_pred' column.
# A missing 'lightgbm_pred' on the first run still raises KeyError, as before.
if not set(lightgbm_lag_columns).issubset(train_lightgbm_pred_preprocessed.columns):
    train_lightgbm_pred_preprocessed = (
        train_lightgbm_pred_preprocessed
        .assign(**{
            lag_column: train_lightgbm_pred_preprocessed['lightgbm_pred']
            for lag_column in lightgbm_lag_columns
        })
        .drop(columns=['lightgbm_pred'])
    )
train_lightgbm_pred_preprocessed

Unnamed: 0_level_0,Unnamed: 1_level_0,lightgbm_pred_t-2,lightgbm_pred_t-1
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1
1961,Arabia Saudyjska,0.078933,0.078933
1962,Arabia Saudyjska,0.103684,0.103684
1963,Arabia Saudyjska,0.105361,0.105361
1964,Arabia Saudyjska,0.111449,0.111449
1965,Arabia Saudyjska,0.127403,0.127403
...,...,...,...
2000,Włochy,0.188417,0.188417
2001,Włochy,0.187318,0.187318
2002,Włochy,0.190610,0.190610
2003,Włochy,0.180628,0.180628


In [4]:
# Expose the LightGBM prediction once per LSTM time step (t-2 and t-1).
# NOTE(review): both lag columns receive the same, unshifted prediction of the row -
# confirm this duplication is intended rather than a true lag.
lightgbm_lag_columns = ['lightgbm_pred_t-2', 'lightgbm_pred_t-1']

# Rebind instead of mutating in place, and skip the work when the lag columns already
# exist, so re-running this cell no longer fails on the dropped 'lightgbm_pred' column.
# A missing 'lightgbm_pred' on the first run still raises KeyError, as before.
if not set(lightgbm_lag_columns).issubset(test_lightgbm_pred_preprocessed.columns):
    test_lightgbm_pred_preprocessed = (
        test_lightgbm_pred_preprocessed
        .assign(**{
            lag_column: test_lightgbm_pred_preprocessed['lightgbm_pred']
            for lag_column in lightgbm_lag_columns
        })
        .drop(columns=['lightgbm_pred'])
    )
test_lightgbm_pred_preprocessed

Unnamed: 0_level_0,Unnamed: 1_level_0,lightgbm_pred_t-2,lightgbm_pred_t-1
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1
1989,Stany Zjednoczone,0.467747,0.467747
1990,Stany Zjednoczone,0.466720,0.466720
1991,Stany Zjednoczone,0.466320,0.466320
1992,Stany Zjednoczone,0.472049,0.472049
1993,Stany Zjednoczone,0.473386,0.473386
...,...,...,...
2019,Szwecja,0.164062,0.164062
2020,Szwecja,0.163468,0.163468
2021,Szwecja,0.148943,0.148943
2022,Szwecja,0.147648,0.147648


In [5]:
# Combine the train frames: LightGBM lag features on the left, LSTM features on the
# right, matched on their shared index. The join is an inner join (pandas' default),
# so rows present in only one of the frames are dropped.
train_hybrid_df = train_lightgbm_pred_preprocessed.merge(
    train_lstm_df,
    how="inner",
    left_index=True,
    right_index=True,
)
train_hybrid_df

Unnamed: 0_level_0,Unnamed: 1_level_0,lightgbm_pred_t-2,lightgbm_pred_t-1,country_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,...,country_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1961,Arabia Saudyjska,0.078933,0.078933,0,0.243528,0.000000,0.019437,0.000000,0.000000,0.000000,0.074544,...,0,0.258822,0.000000,0.021676,0.000000,0.000000,0.000000,0.071374,0.062017,0.067313
1962,Arabia Saudyjska,0.103684,0.103684,0,0.258822,0.000000,0.022903,0.000000,0.000000,0.000000,0.071374,...,0,0.279879,0.000000,0.024220,0.000000,0.000000,0.000000,0.072538,0.079918,0.079858
1963,Arabia Saudyjska,0.105361,0.105361,0,0.279879,0.000000,0.025592,0.000000,0.000000,0.000000,0.072538,...,0,0.302746,0.000000,0.041657,0.000000,0.000000,0.001262,0.073308,0.134421,0.083187
1964,Arabia Saudyjska,0.111449,0.111449,0,0.302746,0.000000,0.044016,0.000000,0.000000,0.001262,0.073308,...,0,0.317662,0.000000,0.043238,0.000000,0.000000,0.001603,0.074590,0.143784,0.082177
1965,Arabia Saudyjska,0.127403,0.127403,0,0.317662,0.000000,0.045686,0.000000,0.000000,0.001603,0.074590,...,0,0.333562,0.000000,0.053995,0.000000,0.000000,0.065958,0.074268,0.115466,0.069495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000,Włochy,0.188417,0.188417,44,0.703614,0.166694,0.322117,0.062423,0.003817,0.311536,0.066203,...,44,0.719492,0.166619,0.320211,0.061341,0.003052,0.338965,0.063718,0.260213,0.139462
2001,Włochy,0.187318,0.187318,44,0.719492,0.166619,0.338340,0.061341,0.003052,0.338965,0.063718,...,44,0.751454,0.190318,0.332290,0.063685,0.003200,0.353906,0.062837,0.255294,0.138231
2002,Włochy,0.190610,0.190610,44,0.751454,0.190318,0.351103,0.063685,0.003200,0.353906,0.062837,...,44,0.770000,0.190206,0.343234,0.067046,0.003011,0.354470,0.061673,0.251337,0.138098
2003,Włochy,0.180628,0.180628,44,0.770000,0.190206,0.362667,0.067046,0.003011,0.354470,0.061673,...,44,0.776193,0.189826,0.340353,0.068037,0.002782,0.355745,0.060207,0.256716,0.142459


In [6]:
# Combine the test frames: LightGBM lag features on the left, LSTM features on the
# right, matched on their shared index. The join is an inner join (pandas' default),
# so rows present in only one of the frames are dropped.
test_hybrid_df = test_lightgbm_pred_preprocessed.merge(
    test_lstm_df,
    how="inner",
    left_index=True,
    right_index=True,
)
test_hybrid_df

Unnamed: 0_level_0,Unnamed: 1_level_0,lightgbm_pred_t-2,lightgbm_pred_t-1,country_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,...,country_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1989,Stany Zjednoczone,0.467747,0.467747,34,0.791559,0.846847,0.169824,0.452087,0.001514,0.563915,0.082400,...,34,0.817759,0.850208,0.160594,0.469840,0.001729,0.583915,0.082995,0.517934,0.308212
1990,Stany Zjednoczone,0.466720,0.466720,34,0.817759,0.850208,0.169686,0.469840,0.001729,0.583915,0.082995,...,34,0.838605,0.853340,0.159209,0.470866,0.001702,0.616149,0.082725,0.512904,0.305718
1991,Stany Zjednoczone,0.466320,0.466320,34,0.838605,0.853340,0.168223,0.470866,0.001702,0.616149,0.082725,...,34,0.844259,0.860384,0.158555,0.466174,0.010276,0.607288,0.083405,0.493612,0.297345
1992,Stany Zjednoczone,0.472049,0.472049,34,0.844259,0.860384,0.167532,0.466174,0.010276,0.607288,0.083405,...,34,0.831598,0.860616,0.153140,0.456131,0.009935,0.611097,0.080776,0.474745,0.297198
1993,Stany Zjednoczone,0.473386,0.473386,34,0.831598,0.860616,0.161811,0.456131,0.009935,0.611097,0.080776,...,34,0.850993,0.860721,0.152479,0.455670,0.009743,0.625705,0.078382,0.480823,0.298140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,Szwecja,0.164062,0.164062,35,1.034912,0.406395,0.187046,0.053815,0.004475,0.027086,0.075266,...,35,1.042982,0.401701,0.189482,0.051773,0.004556,0.027867,0.075373,0.164163,0.101755
2020,Szwecja,0.163468,0.163468,35,1.042982,0.401701,0.200210,0.051773,0.004556,0.027867,0.075373,...,35,1.053059,0.397648,0.158390,0.051390,0.002930,0.026035,0.075950,0.159296,0.096965
2021,Szwecja,0.148943,0.148943,35,1.053059,0.397648,0.167357,0.051390,0.002930,0.026035,0.075950,...,35,1.022503,0.394784,0.147399,0.041803,0.002415,0.020940,0.076321,0.146172,0.099935
2022,Szwecja,0.147648,0.147648,35,1.022503,0.394784,0.155745,0.041803,0.002415,0.020940,0.076321,...,35,1.071581,0.392417,0.145018,0.043696,0.000049,0.027439,0.077285,0.154059,0.098720


In [7]:
# Regroup the columns by time step: every '...t-2' column first, then every '...t-1'
# column, then the columns ending in 'co2' (lagged names end in 't-1'/'t-2', so they
# are not picked up by the last filter).
train_column_names = list(train_hybrid_df.columns)
columns_train_t2 = list(filter(lambda name: name.endswith('t-2'), train_column_names))
columns_train_t1 = list(filter(lambda name: name.endswith('t-1'), train_column_names))
co2_train = list(filter(lambda name: name.endswith('co2'), train_column_names))

train_column_order = [*columns_train_t2, *columns_train_t1, *co2_train]
train_hybrid_df_sorted = train_hybrid_df[train_column_order].copy()
train_hybrid_df_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,lightgbm_pred_t-2,country_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,oil_co2_t-2,...,country_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1961,Arabia Saudyjska,0.078933,0,0.243528,0.000000,0.019437,0.000000,0.000000,0.000000,0.074544,0.044424,...,0,0.258822,0.000000,0.021676,0.000000,0.000000,0.000000,0.071374,0.062017,0.067313
1962,Arabia Saudyjska,0.103684,0,0.258822,0.000000,0.022903,0.000000,0.000000,0.000000,0.071374,0.062017,...,0,0.279879,0.000000,0.024220,0.000000,0.000000,0.000000,0.072538,0.079918,0.079858
1963,Arabia Saudyjska,0.105361,0,0.279879,0.000000,0.025592,0.000000,0.000000,0.000000,0.072538,0.079918,...,0,0.302746,0.000000,0.041657,0.000000,0.000000,0.001262,0.073308,0.134421,0.083187
1964,Arabia Saudyjska,0.111449,0,0.302746,0.000000,0.044016,0.000000,0.000000,0.001262,0.073308,0.134421,...,0,0.317662,0.000000,0.043238,0.000000,0.000000,0.001603,0.074590,0.143784,0.082177
1965,Arabia Saudyjska,0.127403,0,0.317662,0.000000,0.045686,0.000000,0.000000,0.001603,0.074590,0.143784,...,0,0.333562,0.000000,0.053995,0.000000,0.000000,0.065958,0.074268,0.115466,0.069495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000,Włochy,0.188417,44,0.703614,0.166694,0.322117,0.062423,0.003817,0.311536,0.066203,0.265250,...,44,0.719492,0.166619,0.320211,0.061341,0.003052,0.338965,0.063718,0.260213,0.139462
2001,Włochy,0.187318,44,0.719492,0.166619,0.338340,0.061341,0.003052,0.338965,0.063718,0.260213,...,44,0.751454,0.190318,0.332290,0.063685,0.003200,0.353906,0.062837,0.255294,0.138231
2002,Włochy,0.190610,44,0.751454,0.190318,0.351103,0.063685,0.003200,0.353906,0.062837,0.255294,...,44,0.770000,0.190206,0.343234,0.067046,0.003011,0.354470,0.061673,0.251337,0.138098
2003,Włochy,0.180628,44,0.770000,0.190206,0.362667,0.067046,0.003011,0.354470,0.061673,0.251337,...,44,0.776193,0.189826,0.340353,0.068037,0.002782,0.355745,0.060207,0.256716,0.142459


In [8]:
# Regroup the columns by time step: every '...t-2' column first, then every '...t-1'
# column, then the columns ending in 'co2' (lagged names end in 't-1'/'t-2', so they
# are not picked up by the last filter).
test_column_names = list(test_hybrid_df.columns)
columns_test_t2 = list(filter(lambda name: name.endswith('t-2'), test_column_names))
columns_test_t1 = list(filter(lambda name: name.endswith('t-1'), test_column_names))
co2_test = list(filter(lambda name: name.endswith('co2'), test_column_names))

test_column_order = [*columns_test_t2, *columns_test_t1, *co2_test]
test_hybrid_df_sorted = test_hybrid_df[test_column_order].copy()
test_hybrid_df_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,lightgbm_pred_t-2,country_t-2,gdp_t-2,temperature_change_from_co2_t-2,cement_co2_t-2,coal_co2_t-2,flaring_co2_t-2,gas_co2_t-2,land_use_change_co2_t-2,oil_co2_t-2,...,country_t-1,gdp_t-1,temperature_change_from_co2_t-1,cement_co2_t-1,coal_co2_t-1,flaring_co2_t-1,gas_co2_t-1,land_use_change_co2_t-1,oil_co2_t-1,co2
year,country_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1989,Stany Zjednoczone,0.467747,34,0.791559,0.846847,0.169824,0.452087,0.001514,0.563915,0.082400,0.499771,...,34,0.817759,0.850208,0.160594,0.469840,0.001729,0.583915,0.082995,0.517934,0.308212
1990,Stany Zjednoczone,0.466720,34,0.817759,0.850208,0.169686,0.469840,0.001729,0.583915,0.082995,0.517934,...,34,0.838605,0.853340,0.159209,0.470866,0.001702,0.616149,0.082725,0.512904,0.305718
1991,Stany Zjednoczone,0.466320,34,0.838605,0.853340,0.168223,0.470866,0.001702,0.616149,0.082725,0.512904,...,34,0.844259,0.860384,0.158555,0.466174,0.010276,0.607288,0.083405,0.493612,0.297345
1992,Stany Zjednoczone,0.472049,34,0.844259,0.860384,0.167532,0.466174,0.010276,0.607288,0.083405,0.493612,...,34,0.831598,0.860616,0.153140,0.456131,0.009935,0.611097,0.080776,0.474745,0.297198
1993,Stany Zjednoczone,0.473386,34,0.831598,0.860616,0.161811,0.456131,0.009935,0.611097,0.080776,0.474745,...,34,0.850993,0.860721,0.152479,0.455670,0.009743,0.625705,0.078382,0.480823,0.298140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,Szwecja,0.164062,35,1.034912,0.406395,0.187046,0.053815,0.004475,0.027086,0.075266,0.167806,...,35,1.042982,0.401701,0.189482,0.051773,0.004556,0.027867,0.075373,0.164163,0.101755
2020,Szwecja,0.163468,35,1.042982,0.401701,0.200210,0.051773,0.004556,0.027867,0.075373,0.164163,...,35,1.053059,0.397648,0.158390,0.051390,0.002930,0.026035,0.075950,0.159296,0.096965
2021,Szwecja,0.148943,35,1.053059,0.397648,0.167357,0.051390,0.002930,0.026035,0.075950,0.159296,...,35,1.022503,0.394784,0.147399,0.041803,0.002415,0.020940,0.076321,0.146172,0.099935
2022,Szwecja,0.147648,35,1.022503,0.394784,0.155745,0.041803,0.002415,0.020940,0.076321,0.146172,...,35,1.071581,0.392417,0.145018,0.043696,0.000049,0.027439,0.077285,0.154059,0.098720


## Reshape the data

In [9]:
# Two lagged time steps (t-2, t-1) are present in the column-ordered frames.
num_lags = 2

# Turn the ordered frames into model inputs and targets.
# NOTE(review): the exact output layout is defined by DataReshaperLSTM; the model cell
# reads x_train.shape[1] and x_train.shape[2], so x_train is assumed to be 3-D.
data_reshaper = DataReshaperLSTM()
x_train, x_test, y_train, y_test = data_reshaper.reshape_data(
    train_hybrid_df_sorted,
    test_hybrid_df_sorted,
    num_lags=num_lags,
)

## Build the model

In [10]:
# Per-sample input shape is taken from axes 1 and 2 of the reshaped training array.
n_timesteps = x_train.shape[1]
n_features = x_train.shape[2]
input_shape = (n_timesteps, n_features)
output_units = 1

# Two stacked LSTM layers, each followed by dropout, and a single-unit dense head.
# The first LSTM returns the full sequence so the second LSTM can consume it.
# NOTE(review): no random seed is set anywhere in this notebook, so weights and
# training results will differ between runs.
model = Sequential()
model.add(LSTM(48, input_shape=input_shape, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(40))
model.add(Dropout(0.2))
model.add(Dense(output_units, activation='relu'))

model.compile(loss="mean_absolute_error", optimizer="adam")
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 2, 48)             11328     
                                                                 
 dropout (Dropout)           (None, 2, 48)             0         
                                                                 
 lstm_1 (LSTM)               (None, 40)                14240     
                                                                 
 dropout_1 (Dropout)         (None, 40)                0         
                                                                 
 dense (Dense)               (None, 1)                 41        
                                                                 
Total params: 25,609
Trainable params: 25,609
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [11]:
# Stop once the training loss has not improved for 10 epochs and roll back to the
# best weights. The training loss is monitored because fit() is given no validation data.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    x_train,
    y_train,
    epochs=config.epochs,
    batch_size=config.batch_size,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


In [12]:
# Save the model
model.save(os.path.join(config.models_folder, f'{variant_co2}_lightgbm_lstm_model.h5'))

# Evaluate the model on the test set (the compiled loss is mean absolute error,
# computed on the scaled target values in y_test).
loss = model.evaluate(x_test, y_test, verbose = 0)

# Show the test loss: previously it was computed and then silently discarded.
loss

## Predictions

In [13]:
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

# Arguments passed to inverse_transform_data besides the values themselves: the number
# of predicted rows and a column count derived from the hybrid frame.
# NOTE(review): the "- 4" offset is not explained anywhere in this notebook - confirm
# which columns it is meant to exclude.
n_train_rows = train_predictions.shape[0]
n_train_columns = train_hybrid_df.shape[1] - 4
n_test_rows = test_predictions.shape[0]
n_test_columns = test_hybrid_df.shape[1] - 4

# The untouched preprocessor copy maps predictions and targets back through its
# inverse transform.
invert = data_preprocessor_lstm_for_prediction.inverse_transform_data

inverted_data_predicted_train_y = invert(train_predictions, n_train_rows, n_train_columns)
inverted_data_train_y = invert(y_train, n_train_rows, n_train_columns)

inverted_data_predicted_test_y = invert(test_predictions, n_test_rows, n_test_columns)
inverted_data_test_y = invert(y_test, n_test_rows, n_test_columns)



In [14]:
def _build_results(index_frame, co2_predicted, co2_actual):
    """Line up predicted and actual values with the country and year of each row.

    index_frame must expose the country and year as regular columns; the value arrays
    must have one entry per row of index_frame.
    """
    return pd.DataFrame({
        # Use the configured index name (the one the frames were indexed with when
        # they were loaded) instead of a hard-coded "country_index".
        "country": index_frame[config.additional_index].values,
        "year": index_frame["year"].values,
        "co2_predicted": co2_predicted,
        "co2_actual": co2_actual,
    })


def _multiply_by_population(results, population):
    """Multiply both CO2 columns by the matching population and drop the population.

    Population is matched on (year, country) with a left join, so a row without a
    population entry is kept and ends up with NaN in both CO2 columns.
    """
    merged = results.merge(
        population[['year', 'country', 'population']],
        on=['year', 'country'],
        how='left'
    )
    for value_column in ('co2_predicted', 'co2_actual'):
        merged[value_column] = merged[value_column] * merged['population']
    return merged.drop(columns=['population'])


train_df_reset = train_hybrid_df.reset_index()
test_df_reset = test_hybrid_df.reset_index()

# The CO2 value is the last column of each inverse-transformed array.
co2_predicted_train = inverted_data_predicted_train_y[:, -1]
co2_actual_train = inverted_data_train_y[:, -1]
co2_predicted_test = inverted_data_predicted_test_y[:, -1]
co2_actual_test = inverted_data_test_y[:, -1]

# Create DataFrames for train and test
train_results = _build_results(train_df_reset, co2_predicted_train, co2_actual_train)
test_results = _build_results(test_df_reset, co2_predicted_test, co2_actual_test)

population_df = pd.read_csv(os.path.join(config.output_cleaned, 'population_data.csv'))
population_df = population_df.rename(columns={'country_index': 'country'})

# variant_co2 is a per-capita variant, so multiplying by population yields totals.
train_df_with_population = _multiply_by_population(train_results, population_df)
test_df_with_population = _multiply_by_population(test_results, population_df)

# NOTE(review): these hybrid LightGBM+LSTM predictions are written into
# config.predictions_lightgbm under a model-agnostic file name - confirm this does not
# overwrite the predictions of the standalone LightGBM model.
train_df_with_population.to_csv(os.path.join(config.predictions_lightgbm, f'{variant_co2}_train.csv'))
test_df_with_population.to_csv(os.path.join(config.predictions_lightgbm, f'{variant_co2}_test.csv'))

In [15]:
current_model = "lightgbm_lstm"

global_csv_path = os.path.join(config.predictions, f'combined_results_{variant_co2}.csv')
global_results = GlobalResults(global_csv_path, keys=["country", "year", "set"])

# The value columns get a per-model suffix in the combined results.
model_column_names = {
    "co2_predicted": f"co2_predicted_{current_model}",
    "co2_actual": f"co2_actual_{current_model}"
}


def _label_split(results, split_name):
    """Return a copy of `results` tagged with its split name and model-specific columns."""
    labelled = results.copy()
    # Make sure "year" is a regular column before the rows are combined and sorted.
    if "year" not in labelled.columns:
        labelled = labelled.reset_index()
    labelled["set"] = split_name
    return labelled.rename(columns=model_column_names)


train = _label_split(train_df_with_population, "train")
test = _label_split(test_df_with_population, "test")

# Stack both splits and order the rows by the keys used for the combined results.
new_results_df = pd.concat([train, test], axis=0).sort_values(by=["year", "country", "set"])

global_results.append_results(new_results_df)

## Metrics

In [16]:
# Score the population-scaled train and test predictions; the evaluator is given
# config.metrics_hybrid as its output file for this model variant.
evaluator = PredictionEvaluator()
evaluator.evaluate(
    train_df_with_population,
    test_df_with_population,
    actual_col='co2_actual',
    predicted_col='co2_predicted',
    variant=f'lightgbm_lstm_{variant_co2}',
    model_output_file=config.metrics_hybrid,
)