In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tsgm
import tensorflow as tf
from tensorflow import keras

In [2]:
# Load the Kyoto Gases table (2020-2100) and keep only categories C1-C8 for simplicity.
# The eight categories are aggregated into three integer classes:
#   0 -> C1, C2, C3, C4    1 -> C5, C6    2 -> C7, C8
mapping = {'C1':0,'C2':0,'C3':0,'C4':0,'C5':1,'C6':1,'C7':2,'C8':2}
Kyoto_Gases = pd.read_csv('Kyoto Gases.csv')
# Build a fresh frame instead of mutating a filtered slice in place. The previous
# `Kyoto_Gases['Category'].replace(mapping, inplace=True)` operated on a temporary
# Series taken from a filtered frame: it triggers SettingWithCopyWarning and leaves
# the frame unchanged when pandas Copy-on-Write is active.
Kyoto_Gases = (
    Kyoto_Gases[Kyoto_Gases['Category'].isin(list(mapping))]
    # Every remaining category is a key of `mapping`, so `.map` yields plain integers.
    .assign(Category=lambda frame: frame['Category'].map(mapping))
    .drop(columns=['Category_name'])
    .reset_index(drop=True)
)

In [3]:
# Read the eight input-variable tables, one CSV file per variable.
# -- carbon sequestration
CarbonSequestration = pd.read_csv('Carbon_Sequestration_CCS_imputed.csv')
# -- primary energy tables
PrimaryEnergy = pd.read_csv('PrimaryEnergy_imputed.csv')
PrimaryEnergy_Coal = pd.read_csv('PrimaryEnergy_Coal.csv')
PrimaryEnergy_Gas = pd.read_csv('Primary Energy_Gas.csv')
PrimaryEnergy_Oil = pd.read_csv('Primary Energy_Oil.csv')
# -- secondary energy table
SecondaryEnergy_Gas = pd.read_csv('Secondary Energy_Gases.csv')
# -- final energy tables
FinalEnergy_Liquid = pd.read_csv('Final Energy_Liquids.csv')
FinalEnergy_Solid = pd.read_csv('Final Energy_Solids.csv')

In [4]:
# Collect the input-variable tables in a fixed order, then find the (Model, Scenario)
# pairs that appear in Kyoto_Gases and in every one of those tables by repeatedly
# inner-joining on the two key columns.
Variables = [
    CarbonSequestration,
    FinalEnergy_Liquid,
    PrimaryEnergy_Coal,
    PrimaryEnergy_Gas,
    PrimaryEnergy_Oil,
    PrimaryEnergy,
    SecondaryEnergy_Gas,
    FinalEnergy_Solid,
]
Model_Scenario = Kyoto_Gases[['Model','Scenario']]
for frame in Variables:
    Model_Scenario = Model_Scenario.merge(
        frame[['Model','Scenario']], how='inner', on=['Model','Scenario']
    )

In [5]:
# Restrict each variable table to the shared (Model, Scenario) pairs ...
for idx, frame in enumerate(Variables):
    Variables[idx] = Model_Scenario.merge(frame, how='inner', on=['Model','Scenario'])
# ... and then discard the 'Category_name' column from every restricted table.
for frame in Variables:
    frame.drop(columns=['Category_name'], inplace=True)

In [6]:
# Keep only the Kyoto Gases rows whose (Model, Scenario) pair is shared by all variables.
Kyoto_Gases = Kyoto_Gases.merge(Model_Scenario, how='inner', on=['Model','Scenario'])

In [None]:
# Add Kyoto_Gases as the ninth table, giving
# Variables = [CarbonSequestration, FinalEnergy_Liquid, PrimaryEnergy_Coal, PrimaryEnergy_Gas,
#              PrimaryEnergy_Oil, PrimaryEnergy, SecondaryEnergy_Gas, FinalEnergy_Solid, Kyoto_Gases]
# The length guard keeps the cell idempotent: re-running it used to append
# Kyoto_Gases a second time and silently shift what later cells index.
if len(Variables) == 8:
    Variables.append(Kyoto_Gases)
# The feature tensor is built by reading row i of every table as the same sample,
# so all tables must have the same number of rows.
row_counts = sorted({table.shape[0] for table in Variables})
assert len(row_counts) == 1, f"tables are not row-aligned, row counts found: {row_counts}"
# Number of samples.
data_num = Variables[0].shape[0]
data_num

In [8]:
# Build the feature tensor X of shape (data_num, 9 time steps, 9 features) holding the
# values of each variable during 2020-2100: X[i, j, k] is the value of variable k
# (position k in Variables) for sample i at time step j.
n_steps, n_features = 9, 9
for i in range(len(Variables)):
    # Keep only the value columns: drop the first three columns and the last one.
    Variables[i] = Variables[i].iloc[:,3:-1].values
# Fail loudly (as element-wise indexing would) when a table is missing or too small,
# instead of letting slicing silently produce a smaller tensor.
if len(Variables) < n_features:
    raise IndexError(f"expected at least {n_features} tables in Variables, got {len(Variables)}")
for table in Variables[:n_features]:
    if table.shape[0] < data_num or table.shape[1] < n_steps:
        raise IndexError(
            f"table of shape {table.shape} is smaller than the required ({data_num}, {n_steps})"
        )
# Stack the per-variable (sample, time) matrices along a new last axis in one
# vectorised step rather than filling the tensor element by element in Python.
X = np.stack(
    [table[:data_num, :n_steps] for table in Variables[:n_features]], axis=-1
).astype(np.float64)

In [9]:
# Class label (0, 1 or 2) of every sample, as a NumPy array.
Y = Kyoto_Gases['Category'].to_numpy()

In [10]:
# Split the feature tensor by class label (0 -> C1234, 1 -> C56, 2 -> C78).
# Training does not use these per-class subsets; it is not class-separated like the VAE.
C1234_DataSet, C56_DataSet, C78_DataSet = (X[Y == label] for label in (0, 1, 2))

In [11]:
# Problem dimensions and training hyper-parameters.
seq_len, feature_dim = 9, 9   # time steps per sample, features per time step
output_dim = 3                # number of condition classes
latent_dim = 36               # size of the generator's noise vector
batch_size = 100
# Input widths once the one-hot condition is appended to each network's own input.
discriminator_in_channels = output_dim + feature_dim
generator_in_channels = output_dim + latent_dim

In [12]:
# Rescale the features with a (0, 1) feature-wise scaler and one-hot encode the
# three class labels; both arrays are stored as float32.
scaler = tsgm.utils.TSFeatureWiseScaler((0,1))
X_train = scaler.fit_transform(X).astype(np.float32)
Y_train = keras.utils.to_categorical(Y, num_classes=3).astype(np.float32)
# Shuffled mini-batches of (features, one-hot label) pairs used for training.
dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, Y_train))
    .shuffle(buffer_size=1024)
    .batch(batch_size)
)

In [13]:
# Instantiate the conditional-GAN LSTM architecture for the configured dimensions.
architecture = tsgm.models.architectures.cGAN_LSTMnArchitecture(
    seq_len=seq_len, feat_dim=feature_dim,
    latent_dim=latent_dim, output_dim=output_dim)
# Use the public `discriminator` / `generator` properties of the architecture
# rather than reaching into its private `_discriminator` / `_generator` attributes.
discriminator, generator = architecture.discriminator, architecture.generator

In [None]:
# Optimisation strategy: the learning rate starts at 1e-3 and follows an
# exponential-decay schedule (decay_rate 0.95, decay_steps 200).
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=200,
    decay_rate=0.95,
)
# Wrap the two networks in the conditional GAN trainer.
cond_gan = tsgm.models.cgan.ConditionalGAN(
    discriminator=discriminator,
    generator=generator,
    latent_dim=latent_dim,
)
# Each network gets its own Adam optimizer; both follow the schedule above.
cond_gan.compile(
    d_optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    g_optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    loss_fn=keras.losses.BinaryCrossentropy(),
)
cond_gan.fit(dataset, epochs=1200)

In [None]:
# Seed NumPy's global random generator (this seeds NumPy only).
np.random.seed(8)
# Class labels for the samples to generate: 1000 each of class 0, 1 and 2, in that order.
# The labels are built as int32 directly; the previous `Gen_Labels.astype(np.int32)`
# discarded its result, so the labels silently stayed float64.
Gen_Labels = np.repeat(np.arange(3, dtype=np.int32), 1000)

In [16]:
# Generate one synthetic sample per entry of Gen_Labels (1000 for each class).
# Subsequent random forest models partially refer to "RCGAN Random forest.ipynb"
# Sizes come from Gen_Labels / latent_dim / output_dim instead of the repeated
# literals 3000 / 36 / 3, so this cell stays consistent with the configuration.
n_generated = len(Gen_Labels)
noise = np.random.randn(n_generated, latent_dim)
conditions = keras.utils.to_categorical(Gen_Labels, output_dim)
# Generator input: the noise vector followed by the one-hot class condition.
z = np.concatenate([noise, conditions], axis=1)
Gen_Data = generator(z).numpy()
# Map the generated values back through the scaler's inverse transform.
Gen_Data = scaler.inverse_transform(Gen_Data)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [18]:
# Cumulative value of each of the eight input variables, used as the features of the
# real dataset. Feature order follows the first eight slots of X's last axis.
features_names = ['CarbonSequestration','FinalEnergy_Liquid','PrimaryEnergy_Coal','PrimaryEnergy_Gas','PrimaryEnergy_Oil','PrimaryEnergy',
                 'SecondaryEnergy_Gas','FinalEnergy_Solids']
n_features = 8   # the eight input variables; the ninth slot of X is not used here
n_intervals = 8  # intervals between the first nine consecutive time steps
# Sum (x_k + x_{k+1}) * 5 over consecutive time steps for every sample and feature,
# as one vectorised expression instead of a triple Python loop. The factor 5 is
# presumably half of a 10-year step (trapezoid rule) -- TODO confirm.
Real_Data_Sum = (
    (X[:, :n_intervals, :n_features] + X[:, 1:n_intervals + 1, :n_features]) * 5
).sum(axis=1, dtype=np.float64)

In [19]:
# Tabulate the real cumulative features, one named column per input variable.
Real_DataSet = pd.DataFrame(data=Real_Data_Sum, columns=features_names)

In [20]:
# Cumulative-value feature matrix of the generated dataset, computed the same way as
# for the real data: sum (x_k + x_{k+1}) * 5 over consecutive time steps.
n_generated = 3000  # number of generated samples
n_features = 8      # the eight input variables; the ninth slot is not used here
n_intervals = 8     # intervals between the first nine consecutive time steps
# One vectorised expression replaces the triple Python loop; accumulation is done in
# float64, matching the float64 accumulator the loop version used.
Gen_Data_Sum = (
    (Gen_Data[:n_generated, :n_intervals, :n_features]
     + Gen_Data[:n_generated, 1:n_intervals + 1, :n_features]) * 5
).sum(axis=1, dtype=np.float64)

In [21]:
# Tabulate the generated cumulative features, one named column per input variable.
Gen_DataSet = pd.DataFrame(data=Gen_Data_Sum, columns=features_names)

In [22]:
# Experiment 1: fit a classifier on the real dataset and use it to predict the
# generated data. Hyper-parameters are chosen by a cross-validated grid search
# over the grid below.
parameters_1 = dict(
    n_estimators=[10, 100, 200, 500, 1000],
    max_depth=[6, 8, 10, 12, 14, 16],
    min_samples_split=[3, 4, 5, 6],
)
R_G = RandomForestClassifier(random_state=42)

In [23]:
# 3-fold grid search over parameters_1, using all available cores.
clf_RG = GridSearchCV(estimator=R_G, param_grid=parameters_1, cv=3, n_jobs=-1, verbose=2)

In [None]:
# Run the grid search on the real cumulative features and their class labels.
clf_RG.fit(X=Real_DataSet, y=Y)

In [None]:
# Report the hyper-parameter combination selected by the grid search.
print('Best parameters found:', clf_RG.best_params_, sep='\n')

In [None]:
# Score the real-data classifier on the same real data it was fitted on.
y_pred = clf_RG.predict(Real_DataSet)
print("Test results on the training dataset：")
print(classification_report(y_true=Y, y_pred=y_pred, target_names=['C1234','C56','C78']))

In [None]:
# Score the real-data classifier on the generated samples, taking the labels the
# samples were generated for (Gen_Labels) as ground truth.
y_pred = clf_RG.predict(Gen_DataSet)
print("Test results on the test set：")
print(classification_report(y_true=Gen_Labels, y_pred=y_pred, target_names=['C1234','C56','C78']))

In [28]:
# Experiment 2: fit a classifier on the generated dataset and use it to predict the
# real data. The hyper-parameter grid has the same values as in experiment 1.
parameters_2 = dict(
    n_estimators=[10, 100, 200, 500, 1000],
    max_depth=[6, 8, 10, 12, 14, 16],
    min_samples_split=[3, 4, 5, 6],
)
G_R = RandomForestClassifier(random_state=42)

In [29]:
# 3-fold grid search over parameters_2, using all available cores.
clf_GR = GridSearchCV(estimator=G_R, param_grid=parameters_2, cv=3, n_jobs=-1, verbose=2)

In [None]:
# Run the grid search on the generated cumulative features and their labels.
clf_GR.fit(X=Gen_DataSet, y=Gen_Labels)

In [None]:
# Report the hyper-parameter combination selected by the grid search.
print('Best parameters found:', clf_GR.best_params_, sep='\n')

In [None]:
# Score the generated-data classifier on the same generated data it was fitted on.
y_pred = clf_GR.predict(Gen_DataSet)
print("Test results on the training dataset：")
print(classification_report(y_true=Gen_Labels, y_pred=y_pred, target_names=['C1234','C56','C78']))

In [None]:
# Score the generated-data classifier on the real samples and their true labels.
y_pred = clf_GR.predict(Real_DataSet)
print("Test results on the test set：")
print(classification_report(y_true=Y, y_pred=y_pred, target_names=['C1234','C56','C78']))

In [34]:
# Persist the weights of both networks, generator first, to HDF5 files.
for network, weights_path in (
    (generator, 'Policy-Top9-RCGAN_generator.h5'),
    (discriminator, 'Policy-Top9-RCGAN_discriminator.h5'),
):
    network.save_weights(weights_path)