In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tsgm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers






In [2]:
#Set Global Random Seed
# One shared seed for NumPy and TensorFlow so the random draws made later in the
# notebook are repeatable from a fresh kernel.
global_seed = 8
np.random.seed(global_seed)
tf.random.set_seed(global_seed)

In [3]:
#Load Kyoto_Gases data (2020-2100), consider the case of C1-C8 for simplicity.
# Merge the eight categories into three class labels: C1-C3 -> 0, C4-C6 -> 1, C7-C8 -> 2.
mapping = {'C1':0,'C2':0,'C3':0,'C4':1,'C5':1,'C6':1,'C7':2,'C8':2}
Kyoto_Gases = pd.read_csv('Kyoto Gases.csv')
# Keep only rows whose category is one of the mapped keys. The explicit .copy()
# makes the result an independent frame, so the assignment below is not a
# chained update on a slice.
Kyoto_Gases = Kyoto_Gases.loc[Kyoto_Gases['Category'].isin(list(mapping))].copy()
# Assign the relabelled column back instead of calling
# Series.replace(..., inplace=True) on a column selection: that form is a
# chained in-place update and does not modify the frame under pandas
# Copy-on-Write. Every remaining value is a key of `mapping`, so map() yields
# integer labels without NaN.
Kyoto_Gases['Category'] = Kyoto_Gases['Category'].map(mapping)
Kyoto_Gases = Kyoto_Gases.reset_index(drop=True)

In [4]:
# Category_name is not used by any later cell; remove it.
# NOTE: re-running this cell raises KeyError because the column is already gone.
Kyoto_Gases = Kyoto_Gases.drop(columns=['Category_name'])

In [5]:
#Load a dataset of individual variables
# One CSV per variable, read in the same order as before.
# NOTE(review): 'PrimaryEnergy_Coal.csv' has no space after "Primary", unlike the
# Gas/Oil file names -- confirm this matches the file on disk.
(
    CarbonSequestration,
    FinalEnergy_Liquid,
    PrimaryEnergy_Gas,
    PrimaryEnergy_Oil,
    PrimaryEnergy_Coal,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Carbon_Sequestration_CCS_imputed.csv',
        'Final Energy_Liquids.csv',
        'Primary Energy_Gas.csv',
        'Primary Energy_Oil.csv',
        'PrimaryEnergy_Coal.csv',
    )
)

In [6]:
#Get the intersection of the models and scenarios contained in each variable
# Start from the (Model, Scenario) pairs of Kyoto_Gases and inner-join against
# each variable table in turn, so only pairs present in every table survive.
# NOTE(review): if a (Model, Scenario) pair occurs more than once in any table,
# these joins multiply rows -- verify the keys are unique.
Variables = [CarbonSequestration,FinalEnergy_Liquid,PrimaryEnergy_Coal,PrimaryEnergy_Gas,PrimaryEnergy_Oil]
key_columns = ['Model','Scenario']
Model_Scenario = Kyoto_Gases[key_columns]
for table in Variables:
    Model_Scenario = Model_Scenario.merge(table[key_columns], on=key_columns, how='inner')

In [7]:
# Restrict every variable table to the shared (Model, Scenario) pairs, in the
# row order of Model_Scenario, then remove the Category_name column.
for position, table in enumerate(Variables):
    aligned = Model_Scenario.merge(table, on=['Model','Scenario'], how='inner')
    Variables[position] = aligned.drop(columns=['Category_name'])

In [8]:
# Keep only the Kyoto_Gases rows whose (Model, Scenario) pair is in the shared set.
Kyoto_Gases = Kyoto_Gases.merge(Model_Scenario, on=['Model','Scenario'], how='inner')

In [9]:
#Variables [CarbonSequestration,FinalEnergy_Liquid,PrimaryEnergy_Coal,PrimaryEnergy_Gas,PrimaryEnergy_Oil,Kyoto_Gases]
# Kyoto_Gases becomes the sixth entry. NOTE: running this cell twice adds it twice.
Variables += [Kyoto_Gases]

In [10]:
#Generate feature matrices, the values of each variable during 2020-2100. 9 time steps, 6 features
# X has shape (n_samples, 9, 6): axis 1 is the time step and axis 2 follows the
# order of `Variables` (the five input variables, then Kyoto_Gases).
# The number of samples is taken from the data rather than the hard-coded 1160,
# and the element-wise triple loop is replaced by a single stack. `Variables`
# is no longer overwritten with arrays, so the cell can be re-run.
N_TIME_STEPS = 9
N_FEATURES = 6
X = np.stack(
    [
        # Positional slice: skip the first three columns and the last one.
        # NOTE(review): confirm this matches the column layout of every table.
        np.asarray(frame.iloc[:, 3:-1].values, dtype=float)[:, :N_TIME_STEPS]
        for frame in Variables[:N_FEATURES]
    ],
    axis=-1,
)
# np.stack already fails if the tables disagree on row count; this additionally
# guards against a table with fewer than 9 value columns or a short list.
assert X.shape[1:] == (N_TIME_STEPS, N_FEATURES), X.shape

In [11]:
# Class labels (0, 1, 2 after the category mapping), one per row of Kyoto_Gases.
Y = Kyoto_Gases['Category'].to_numpy()

In [12]:
#Separate datasets by category.
# Boolean masks on Y select the samples of X for labels 0, 1 and 2 respectively.
C123_DataSet, C456_DataSet, C78_DataSet = (X[Y == label] for label in (0, 1, 2))

In [13]:
# VAE architecture for the C123 class. The arguments 9, 6, 8 line up with the
# 9 time steps / 6 features of X and the latent dimension noted below.
architecture1 = tsgm.models.zoo["vae_conv5"](9, 6, 8)#Latent Dim = 8
encoder1 = architecture1.encoder
decoder1 = architecture1.decoder







In [14]:
# Fit a scaler on the C123 samples with the range (0, 1); the fitted scaler is
# reused later to map generated samples back through inverse_transform.
unit_range = (0,1)
scaler_C123 = tsgm.utils.TSFeatureWiseScaler(unit_range)
scaled_C123_data = scaler_C123.fit_transform(C123_DataSet)

In [15]:
# Same setup for the C456 class: architecture with its encoder/decoder, plus a
# scaler fitted on the C456 samples.
architecture2 = tsgm.models.zoo["vae_conv5"](9, 6, 8)
encoder2 = architecture2.encoder
decoder2 = architecture2.decoder
scaler_C456 = tsgm.utils.TSFeatureWiseScaler((0,1))
scaled_C456_data = scaler_C456.fit_transform(C456_DataSet)

In [16]:
# Same setup for the C78 class: architecture with its encoder/decoder, plus a
# scaler fitted on the C78 samples.
architecture3 = tsgm.models.zoo["vae_conv5"](9, 6, 8)
encoder3 = architecture3.encoder
decoder3 = architecture3.decoder
scaler_C78 = tsgm.utils.TSFeatureWiseScaler((0,1))
scaled_C78_data = scaler_C78.fit_transform(C78_DataSet)

In [17]:
#Load model parameters
# Weight files are read from the working directory, encoders first and then
# decoders, in the same order as before.
for network, weights_file in (
    (encoder1, 'encoder1_weights.h5'),
    (encoder2, 'encoder2_weights.h5'),
    (encoder3, 'encoder3_weights.h5'),
    (decoder1, 'decoder1_weights.h5'),
    (decoder2, 'decoder2_weights.h5'),
    (decoder3, 'decoder3_weights.h5'),
):
    network.load_weights(weights_file)

In [18]:
#Generate data using generative models (500 for each class)
# Draw the three latent batches first, in the original order, so the random
# stream is consumed exactly as before.
z1, z2, z3 = (tf.random.normal((500, 8)) for _ in range(3))
# Decode each batch with its class decoder and undo that class's scaling.
Gen_C123 = scaler_C123.inverse_transform(decoder1(z1))
Gen_C456 = scaler_C456.inverse_transform(decoder2(z2))
Gen_C78 = scaler_C78.inverse_transform(decoder3(z3))

In [19]:
# Stack the generated samples class by class: C123, then C456, then C78.
Gen_Data = np.concatenate((Gen_C123,Gen_C456,Gen_C78),axis=0)
# Build integer labels 0/1/2 in the same order. The previous code called
# Gen_Labels.astype(np.int32) without keeping the result (astype returns a new
# array), so the labels stayed float64. The class sizes are also read from the
# generated arrays instead of the fixed 500/1000/1500 offsets.
class_sizes = [len(Gen_C123), len(Gen_C456), len(Gen_C78)]
Gen_Labels = np.repeat(np.arange(3, dtype=np.int32), class_sizes)
Gen_Labels

array([0, 0, 0, ..., 2, 2, 2])

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [21]:
#Compute the cumulative value of the five input variables as a feature of the real dataset
features_names = ['CarbonSequestration','FinalEnergy_Liquid','PrimaryEnergy_Coal','PrimaryEnergy_Gas','PrimaryEnergy_Oil']
# For every sample and each of the first five features, sum
# (x_k + x_{k+1}) * 5 over the eight consecutive time-step pairs k = 0..7.
# (Trapezoid-style accumulation; the factor 5 presumably is half of a 10-year
# step over 2020-2100 -- confirm.)
# Vectorised over all rows of X, so the sample count is no longer hard-coded to
# 1160; accumulation is done in float64 as in the zero-initialised original.
n_inputs = len(features_names)
Real_Data_Sum = ((X[:, :8, :n_inputs] + X[:, 1:9, :n_inputs]) * 5).sum(axis=1, dtype=np.float64)

In [22]:
# Wrap the cumulative features of the real data in a frame with named columns.
Real_DataSet = pd.DataFrame(data=Real_Data_Sum, columns=features_names)

In [23]:
#Construct the feature matrix of the feature cumulative values of the generated dataset
# Same accumulation as for the real data: for each generated sample and each of
# the first five features, sum (x_k + x_{k+1}) * 5 over the time-step pairs
# k = 0..7. Vectorised over all rows of Gen_Data, so the sample count is no
# longer hard-coded to 1500. The sum is accumulated in float64 to match the
# float64 zero-initialised accumulator used previously.
n_gen_inputs = len(features_names)
Gen_Data_Sum = ((Gen_Data[:, :8, :n_gen_inputs] + Gen_Data[:, 1:9, :n_gen_inputs]) * 5).sum(axis=1, dtype=np.float64)

In [24]:
# Wrap the cumulative features of the generated data in a frame with the same column names.
Gen_DataSet = pd.DataFrame(data=Gen_Data_Sum, columns=features_names)

In [25]:
#First train the model with real datasets to predict the generated data.Cross-validation using grid search
# Base estimator with a fixed random_state, plus the hyper-parameter grid
# searched over (5 * 6 * 4 = 120 combinations).
R_G = RandomForestClassifier(random_state=42)
parameters_1 = dict(
    n_estimators=[10,100,200,500,1000],
    max_depth=[6,8,10,12,14,16],
    min_samples_split=[3,4,5,6],
)

In [26]:
# 3-fold grid search over parameters_1, using all cores and verbose progress output.
clf_RG = GridSearchCV(estimator=R_G, param_grid=parameters_1, cv=3, n_jobs=-1, verbose=2)

In [27]:
# Fit the grid search on the real cumulative features and their class labels.
clf_RG.fit(X=Real_DataSet, y=Y)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


In [28]:
# Report the hyper-parameters selected by the grid search (header and value on separate lines).
print('Best parameters found:', clf_RG.best_params_, sep='\n')

Best parameters found:
{'max_depth': 8, 'min_samples_split': 4, 'n_estimators': 10}


In [29]:
# Evaluate the real-data model on the same real data it was fitted on.
class_names = ['C123','C456','C78']
y_pred = clf_RG.predict(Real_DataSet)
print("Test results on the training dataset：")
print(classification_report(Y, y_pred, target_names=class_names))

Test results on the training dataset：
              precision    recall  f1-score   support

        C123       0.96      0.97      0.97       524
        C456       0.96      0.95      0.96       464
         C78       1.00      0.98      0.99       172

    accuracy                           0.97      1160
   macro avg       0.97      0.97      0.97      1160
weighted avg       0.97      0.97      0.97      1160



In [30]:
# Evaluate the real-data model on the generated samples and their labels.
class_names = ['C123','C456','C78']
y_pred = clf_RG.predict(Gen_DataSet)
print("Test results on the test set：")
print(classification_report(Gen_Labels, y_pred, target_names=class_names))

Test results on the test set：
              precision    recall  f1-score   support

        C123       0.90      0.93      0.92       500
        C456       0.83      0.90      0.86       500
         C78       1.00      0.89      0.94       500

    accuracy                           0.91      1500
   macro avg       0.91      0.91      0.91      1500
weighted avg       0.91      0.91      0.91      1500



In [31]:
#Use the generated dataset to train models to predict real data
# Base estimator and hyper-parameter grid for the reverse experiment; the grid
# has the same values as the one used for the real-data model.
G_R = RandomForestClassifier(random_state=42)
parameters_2 = dict(
    n_estimators=[10,100,200,500,1000],
    max_depth=[6,8,10,12,14,16],
    min_samples_split=[3,4,5,6],
)

In [32]:
# 3-fold grid search over parameters_2, using all cores and verbose progress output.
clf_GR = GridSearchCV(estimator=G_R, param_grid=parameters_2, cv=3, n_jobs=-1, verbose=2)

In [33]:
# Fit the grid search on the generated cumulative features and their labels.
clf_GR.fit(X=Gen_DataSet, y=Gen_Labels)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


In [34]:
# Report the hyper-parameters selected by the grid search (header and value on separate lines).
print('Best parameters found:', clf_GR.best_params_, sep='\n')

Best parameters found:
{'max_depth': 14, 'min_samples_split': 3, 'n_estimators': 100}


In [35]:
# Evaluate the generated-data model on the same generated data it was fitted on.
class_names = ['C123','C456','C78']
y_pred = clf_GR.predict(Gen_DataSet)
print("Test results on the training dataset：")
print(classification_report(Gen_Labels, y_pred, target_names=class_names))

Test results on the training dataset：
              precision    recall  f1-score   support

        C123       1.00      1.00      1.00       500
        C456       1.00      1.00      1.00       500
         C78       1.00      1.00      1.00       500

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500



In [36]:
# Evaluate the generated-data model on the real samples and their labels.
class_names = ['C123','C456','C78']
y_pred = clf_GR.predict(Real_DataSet)
print("Test results on the test set：")
print(classification_report(Y, y_pred, target_names=class_names))

Test results on the test set：
              precision    recall  f1-score   support

        C123       0.82      0.99      0.90       524
        C456       0.92      0.74      0.82       464
         C78       0.93      0.87      0.90       172

    accuracy                           0.87      1160
   macro avg       0.89      0.87      0.87      1160
weighted avg       0.88      0.87      0.87      1160

