### Libraries

In [1]:
import imageio
from imblearn.over_sampling import SMOTE
from ipywidgets import interact
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import moviepy.editor as mpy
import numpy as np
from numpy import genfromtxt
from numpy import ones
from numpy import zeros
from numpy.random import randint
from numpy.random import randn
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import L1

### SMOTE

In [2]:
Xy_train_file = './Datasets/Outputs/Xy_train.csv'
Xy_train = pd.read_csv(Xy_train_file, index_col=False)

# split the training dataset based on the label column
Xy_train_positive = Xy_train.loc[Xy_train['label']==1]
Xy_train_unlabelled = Xy_train.loc[Xy_train['label']==0]
print('Number of positive training samples before removing outliers:', Xy_train_positive.shape[0])
print('Number of unlabelled training samples before removing outliers:', Xy_train_unlabelled.shape[0])

features_label_list = Xy_train.columns.tolist()
num_features_list = []
cat_features_list = []

for column in features_label_list:
    if column.startswith('Intrusions_') or column.startswith('MetamorphicFacies') or column.startswith('RockUnits'):
        cat_features_list.append(column)
    else:
        num_features_list.append(column)

num_features = Xy_train[num_features_list]
cat_features = Xy_train[cat_features_list]

# remove outliers
Xy_train_positive_num = Xy_train_positive[num_features_list]
Xy_train_unlabelled_num = Xy_train_unlabelled[num_features_list]

Xy_train_positive_num_out_ind = (np.abs(stats.zscore(Xy_train_positive_num.drop(columns=['label']))) < 3).all(axis=1)
Xy_train_unlabelled_num_out_ind = (np.abs(stats.zscore(Xy_train_unlabelled_num.drop(columns=['label']))) < 3).all(axis=1)

Xy_train_positive_num_out_ind = Xy_train_positive_num_out_ind.index[Xy_train_positive_num_out_ind == False].tolist()
Xy_train_unlabelled_num_out_ind = Xy_train_unlabelled_num_out_ind.index[Xy_train_unlabelled_num_out_ind == False].tolist()
Xy_train_out = Xy_train_positive_num_out_ind + Xy_train_unlabelled_num_out_ind

Xy_train_positive_num = Xy_train_positive_num.drop(index=Xy_train_positive_num_out_ind)
Xy_train_unlabelled_num = Xy_train_unlabelled_num.drop(index=Xy_train_unlabelled_num_out_ind)

print('Number of positive training samples after removing outliers:', Xy_train_positive_num.shape[0])
print('Number of unlabelled training samples after removing outliers:', Xy_train_unlabelled_num.shape[0])

Xy_train_num = pd.concat([Xy_train_positive_num, Xy_train_unlabelled_num]).reset_index(drop=True)
Xy_train_num = Xy_train_num.to_numpy()
features_num = Xy_train_num[:, :-1]
labels_num = Xy_train_num[:, -1]

# SMOTE
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(features_num, labels_num)
smote_samples = np.concatenate((X_sm, y_sm.reshape(-1, 1)), axis=1)
X_positive = smote_samples[np.where(smote_samples[:, -1]==1)]
X_positive = X_positive[:, 0:-1]

Number of positive training samples before removing outliers: 36
Number of unlabelled training samples before removing outliers: 1761
Number of positive training samples after removing outliers: 31
Number of unlabelled training samples after removing outliers: 1151


### GAN

In [3]:
learning_rate = 1e-4

# define the standalone discriminator model
def define_discriminator(n_inputs):
    model = Sequential()
    model.add(Dense(int(n_inputs*10), activation='LeakyReLU', activity_regularizer=L1(learning_rate), kernel_initializer='he_uniform', input_dim=n_inputs))
    model.add(Dense(1, activation='sigmoid', activity_regularizer=L1(learning_rate)))
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model

# define the standalone generator model
def define_generator(latent_dim, n_outputs):
    model = Sequential()
    model.add(Dense(int(n_outputs*5), activation='LeakyReLU', activity_regularizer=L1(learning_rate), kernel_initializer='he_uniform', input_dim=latent_dim))
    model.add(Dense(n_outputs, activation='linear', activity_regularizer=L1(learning_rate)))
    return model

# define the combined generator and discriminator model for updating the generator
def define_gan(generator, discriminator):
    # make weights in the discriminator not trainable
    discriminator.trainable = False
    # connect them
    model = Sequential()
    # add generator
    model.add(generator)
    # add the discriminator
    model.add(discriminator)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

n_samples = int(features_num.shape[0]-(2*labels_num.sum()))
# sample real data
def sample_real_data(n=n_samples):
    X_rand = X_positive[randint(X_positive.shape[0], size=n), :]
    y_rand = ones((n, 1))
    return X_rand, y_rand

# generate points in the latent space as input for the generator
def generate_latent_points(latent_dim, n=n_samples):
    # generate points in the latent space
    x_input = randn(latent_dim * n)
    # reshape into a batch of inputs for the network
    x_input = x_input.reshape(n, latent_dim)
    return x_input

# use the generator to generate n fake examples with class labels
def generate_fake_samples(generator, latent_dim, n=n_samples):
    # generate points in the latent space
    x_input = generate_latent_points(latent_dim, n)
    # predict outputs
    X_fake = generator.predict(x_input, verbose=0)
    # create class labels
    y_fake = zeros((n, 1))
    return X_fake, y_fake

# evaluate the discriminator
def summarize_performance(epoch, generator, discriminator, latent_dim):
    # sample real data
    x_real, y_real = sample_real_data()
    # evaluate the discriminator on real examples
    _, acc_real = discriminator.evaluate(X_sm, y_sm, verbose=0)
    # prepare fake examples
    x_fake, y_fake = generate_fake_samples(generator, latent_dim)
    # evaluate the discriminator on fake examples
    _, acc_fake = discriminator.evaluate(x_fake, y_fake, verbose=0)
    return x_real, x_fake, acc_real, acc_fake

# train the generator and discriminator
def train(g_model, d_model, gan_model, latent_dim, n_epochs, n_batch=128, n_eval=100):
    # determine half the size of one batch, for updating the discriminator
    half_batch = int(n_batch / 2)
    acc_real_all = []
    acc_fake_all = []
    # manually enumerate epochs
    for i in range(n_epochs):
        # prepare real samples
        x_real, y_real = sample_real_data(half_batch)
        # prepare fake examples
        x_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
        # update discriminator
        d_model.train_on_batch(x_real, y_real)
        d_model.train_on_batch(x_fake, y_fake)
        # prepare points in the latent space as input for the generator
        x_gan = generate_latent_points(latent_dim, n_batch)
        # create inverted labels for the fake samples
        y_gan = ones((n_batch, 1))
        # update the generator via the discriminator's error
        gan_model.train_on_batch(x_gan, y_gan)
        
        # plot data points
        # ----------------
        # evaluate the model every n_eval epochs
        if (i+1) % n_eval == 0:
            # summarize discriminator performance
            x_real, x_fake, acc_real, acc_fake = summarize_performance(i, g_model, d_model, latent_dim)
            print(i+1, acc_real, acc_fake)
    
        if i == n_epochs-1:
            # remove outliers
            x_fake = x_fake[(np.abs(stats.zscore(x_fake)) < 3).all(axis=1)]
            x_fake_ones = ones((x_fake.shape[0], 1))
            x_fake = np.concatenate((x_fake, x_fake_ones), axis=1)
            x_fake_num = x_fake.shape[0]
            smote_gan_samples = np.concatenate((Xy_train_num, x_fake), axis=0)
            smote_gan_samples = pd.DataFrame(smote_gan_samples, columns=num_features_list)
    
    return smote_gan_samples, x_fake_num

### Multiple Iterations

In [4]:
# size of the latent space
latent_dim = int(features_num.shape[1]*0.5)
# create the discriminator
discriminator = define_discriminator(features_num.shape[1])
# create the generator
generator = define_generator(latent_dim, features_num.shape[1])
# create the gan
gan_model = define_gan(generator, discriminator)

n_iter = 10

Xy_train = Xy_train.drop(index=Xy_train_out)
Xy_train_cat = Xy_train[cat_features_list]
Xy_train_cat_positive = Xy_train_positive[cat_features_list]
Xy_train_cat_positive_mode = Xy_train_cat_positive.mode()

for i in range(n_iter):
    print('--------------------')
    print(f'Iteration {i+1}')
    print('--------------------')
    
    # train model
    smote_gan_samples, x_fake_num = train(generator, discriminator, gan_model, latent_dim, n_epochs=10000)
    
    Xy_train_cat_positive_mode_ = Xy_train_cat_positive_mode.loc[Xy_train_cat_positive_mode.index.repeat(x_fake_num)].reset_index(drop=True)
    Xy_train_cat_ = pd.concat([Xy_train_cat, Xy_train_cat_positive_mode_]).reset_index(drop=True)
    smote_gan_samples = pd.concat([smote_gan_samples, Xy_train_cat_], axis=1).reset_index(drop=True)
    smote_gan_samples = smote_gan_samples[features_label_list]
    smote_gan_samples.to_csv(f'./Datasets/Outputs/smote_gan_{i+1}.csv', index=False)

--------------------
Iteration 1
--------------------
100 0.8649001121520996 0.9928571581840515
200 0.907471776008606 0.9982143044471741
300 0.8397046327590942 0.9973214268684387
400 0.8583840131759644 0.9803571701049805
500 0.9322328567504883 0.9678571224212646
600 0.9960903525352478 0.9669643044471741
700 0.9908775091171265 0.9741071462631226
800 0.9986968040466309 0.9642857313156128
900 0.9982624053955078 0.9857142567634583
1000 0.9986968040466309 0.981249988079071
1100 0.99782794713974 0.9821428656578064
1200 0.9956559538841248 0.9767857193946838
1300 0.9743701219558716 0.9982143044471741
1400 0.9761077165603638 0.9973214268684387
1500 0.9695916771888733 0.9919642806053162
1600 0.9348392486572266 1.0
1700 0.9396177530288696 0.9883928298950195
1800 0.9474369883537292 0.981249988079071
1900 0.9704604744911194 0.9982143044471741
2000 0.9439617991447449 0.9892857074737549
2100 0.9187663197517395 0.9964285492897034
2200 0.9165942668914795 0.9991071224212646
2300 0.9513466358184814 0.999

9800 0.9126846194267273 0.9982143044471741
9900 0.9417897462844849 0.9857142567634583
10000 0.9287576079368591 0.9928571581840515
--------------------
Iteration 3
--------------------
100 0.9205039143562317 0.9955357313156128
200 0.9335360527038574 0.9901785850524902
300 0.9348392486572266 0.9821428656578064
400 0.9317984580993652 0.9937499761581421
500 0.9500434398651123 0.9517857432365417
600 0.9348392486572266 0.9928571581840515
700 0.9317984580993652 0.9928571581840515
800 0.9300608038902283 0.9821428656578064
900 0.9239791631698608 0.9964285492897034
1000 0.9396177530288696 0.9785714149475098
1100 0.9391832947731018 0.9723214507102966
1200 0.9474369883537292 0.956250011920929
1300 0.9387488961219788 0.9848214387893677
1400 0.944830596446991 0.9535714387893677
1500 0.9496090412139893 0.9696428775787354
1600 0.9483058452606201 0.9580357074737549
1700 0.9304952025413513 0.9758928418159485
1800 0.9456993937492371 0.987500011920929
1900 0.927888810634613 0.9928571581840515
2000 0.93223

8900 0.9339704513549805 0.9937499761581421
9000 0.9365769028663635 0.9892857074737549
9100 0.942658543586731 0.9883928298950195
9200 0.9396177530288696 0.9919642806053162
9300 0.9483058452606201 0.9857142567634583
9400 0.927888810634613 0.9973214268684387
9500 0.942658543586731 0.9901785850524902
9600 0.943092942237854 0.9803571701049805
9700 0.9331016540527344 0.9955357313156128
9800 0.9404865503311157 0.9785714149475098
9900 0.944830596446991 0.9830357432365417
10000 0.9396177530288696 0.9964285492897034
--------------------
Iteration 5
--------------------
100 0.9439617991447449 0.9848214387893677
200 0.9317984580993652 0.9973214268684387
300 0.9396177530288696 0.9964285492897034
400 0.9565595388412476 0.9651785492897034
500 0.9235447645187378 0.9964285492897034
600 0.9456993937492371 0.9848214387893677
700 0.9465681910514832 0.9821428656578064
800 0.9331016540527344 0.9964285492897034
900 0.92745441198349 0.9964285492897034
1000 0.9500434398651123 0.9633928537368774
1100 0.93701130

8000 0.9526498913764954 0.9901785850524902
8100 0.9391832947731018 0.9991071224212646
8200 0.944830596446991 0.9937499761581421
8300 0.9483058452606201 0.9830357432365417
8400 0.9439617991447449 0.9973214268684387
8500 0.944830596446991 0.9964285492897034
8600 0.9344048500061035 0.9973214268684387
8700 0.9474369883537292 0.9901785850524902
8800 0.9530842900276184 0.9705356955528259
8900 0.9404865503311157 0.9973214268684387
9000 0.9461337924003601 0.9928571581840515
9100 0.9326672554016113 0.9964285492897034
9200 0.9483058452606201 0.9803571701049805
9300 0.9504778385162354 0.9803571701049805
9400 0.9530842900276184 0.9866071343421936
9500 0.943092942237854 0.9928571581840515
9600 0.9483058452606201 0.9928571581840515
9700 0.9383144974708557 0.9928571581840515
9800 0.9526498913764954 0.9732142686843872
9900 0.9487402439117432 0.9919642806053162
10000 0.9491746425628662 0.9946428537368774
--------------------
Iteration 7
--------------------
100 0.9526498913764954 0.9660714268684387
200

7200 0.9383144974708557 0.9973214268684387
7300 0.943527340888977 0.9973214268684387
7400 0.9465681910514832 0.9973214268684387
7500 0.9504778385162354 0.9750000238418579
7600 0.9470025897026062 0.9964285492897034
7700 0.9470025897026062 0.9919642806053162
7800 0.9517810344696045 0.9785714149475098
7900 0.9439617991447449 0.9991071224212646
8000 0.9522154927253723 0.9892857074737549
8100 0.9543874859809875 0.9830357432365417
8200 0.9517810344696045 0.9848214387893677
8300 0.9409209489822388 0.9991071224212646
8400 0.9470025897026062 0.9946428537368774
8500 0.9439617991447449 0.9928571581840515
8600 0.9465681910514832 0.9839285612106323
8700 0.9443961977958679 0.9928571581840515
8800 0.9509122371673584 0.9910714030265808
8900 0.9509122371673584 0.9785714149475098
9000 0.9439617991447449 0.9964285492897034
9100 0.9470025897026062 0.9937499761581421
9200 0.9470025897026062 0.9955357313156128
9300 0.9461337924003601 0.9901785850524902
9400 0.9339704513549805 1.0
9500 0.9491746425628662 0.9

7000 0.9483058452606201 0.9883928298950195
7100 0.9474369883537292 0.9937499761581421
7200 0.9535186886787415 0.9857142567634583
7300 0.944830596446991 0.9973214268684387
7400 0.9474369883537292 0.9973214268684387
7500 0.9461337924003601 0.9964285492897034
7600 0.9461337924003601 0.9964285492897034
7700 0.9413553476333618 1.0
7800 0.9500434398651123 0.9901785850524902
7900 0.9539530873298645 0.9821428656578064
8000 0.9478713870048523 0.9919642806053162
8100 0.9504778385162354 0.9919642806053162
8200 0.942658543586731 0.9964285492897034
8300 0.9474369883537292 0.9982143044471741
8400 0.9456993937492371 0.9973214268684387
8500 0.9500434398651123 0.9866071343421936
8600 0.9474369883537292 0.9901785850524902
8700 0.9478713870048523 0.9910714030265808
8800 0.9504778385162354 0.9857142567634583
8900 0.9470025897026062 0.9910714030265808
9000 0.9352736473083496 0.9928571581840515
9100 0.944830596446991 0.9919642806053162
9200 0.9539530873298645 0.9892857074737549
9300 0.9400521516799927 1.0
9