### Libraries

In [1]:
import imageio
from imblearn.over_sampling import SMOTE
from ipywidgets import interact
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import moviepy.editor as mpy
import numpy as np
from numpy import genfromtxt
from numpy import ones
from numpy import zeros
from numpy.random import randint
from numpy.random import randn
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import L1

### SMOTE

In [2]:
# Load the training table, split it by class, drop per-class numeric outliers
# and oversample the positive class with SMOTE.
#
# Names defined here and used by later cells: Xy_train, Xy_train_positive,
# features_label_list, num_features_list, cat_features_list, Xy_train_out,
# Xy_train_positive_num, Xy_train_unlabelled_num, Xy_train_num, features_num,
# labels_num, X_sm, y_sm, X_positive.
Xy_train_file = './Datasets/Outputs/Xy_train.csv'
Xy_train = pd.read_csv(Xy_train_file, index_col=False)

# split the training dataset based on the label column
Xy_train_positive = Xy_train.loc[Xy_train['label']==1]
Xy_train_unlabelled = Xy_train.loc[Xy_train['label']==0]
print('Number of positive training samples before removing outliers:', Xy_train_positive.shape[0])
print('Number of unlabelled training samples before removing outliers:', Xy_train_unlabelled.shape[0])

# columns whose name starts with one of these prefixes are treated as categorical;
# every other column (including 'label') is treated as numeric
cat_prefixes = ('Intrusions_', 'MetamorphicFacies', 'RockUnits')
features_label_list = Xy_train.columns.tolist()
cat_features_list = [column for column in features_label_list if column.startswith(cat_prefixes)]
# 'label' is explicitly placed LAST: the positional slicing below ([:, :-1] / [:, -1])
# and the GAN code (which appends the label as the final column of generated rows)
# both rely on that position, whatever the column order of the CSV is.
num_features_list = [
    column for column in features_label_list
    if not column.startswith(cat_prefixes) and column != 'label'
] + ['label']

num_features = Xy_train[num_features_list]
cat_features = Xy_train[cat_features_list]

# remove outliers
Xy_train_positive_num = Xy_train_positive[num_features_list]
Xy_train_unlabelled_num = Xy_train_unlabelled[num_features_list]

# a row is an inlier when |z| < 3 in every numeric feature; z-scores are computed
# separately within each class and the label column is excluded
positive_inlier_mask = (np.abs(stats.zscore(Xy_train_positive_num.drop(columns=['label']))) < 3).all(axis=1)
unlabelled_inlier_mask = (np.abs(stats.zscore(Xy_train_unlabelled_num.drop(columns=['label']))) < 3).all(axis=1)

# original (CSV) index labels of the rows flagged as outliers
Xy_train_positive_num_out_ind = positive_inlier_mask.index[~positive_inlier_mask].tolist()
Xy_train_unlabelled_num_out_ind = unlabelled_inlier_mask.index[~unlabelled_inlier_mask].tolist()
Xy_train_out = Xy_train_positive_num_out_ind + Xy_train_unlabelled_num_out_ind

Xy_train_positive_num = Xy_train_positive_num.drop(index=Xy_train_positive_num_out_ind)
Xy_train_unlabelled_num = Xy_train_unlabelled_num.drop(index=Xy_train_unlabelled_num_out_ind)

print('Number of positive training samples after removing outliers:', Xy_train_positive_num.shape[0])
print('Number of unlabelled training samples after removing outliers:', Xy_train_unlabelled_num.shape[0])

# numeric block as an array: positive inliers first, then unlabelled inliers
Xy_train_num = pd.concat([Xy_train_positive_num, Xy_train_unlabelled_num]).reset_index(drop=True)
Xy_train_num = Xy_train_num.to_numpy()
features_num = Xy_train_num[:, :-1]   # every numeric column except the label
labels_num = Xy_train_num[:, -1]      # the label column (guaranteed last, see above)

# SMOTE
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(features_num, labels_num)
smote_samples = np.concatenate((X_sm, y_sm.reshape(-1, 1)), axis=1)
# feature rows of the resampled set that carry the positive label;
# these are the "real" samples the GAN is trained on
X_positive = X_sm[y_sm == 1]

Number of positive training samples before removing outliers: 36
Number of unlabelled training samples before removing outliers: 225
Number of positive training samples after removing outliers: 35
Number of unlabelled training samples after removing outliers: 195


### GAN

In [3]:
# NOTE(review): despite its name, this value is only passed to L1(...) as the
# activity-regularisation factor in define_discriminator / define_generator.
# The optimisers are requested by name ('Adam' / 'adam'), so this constant is
# not handed to them anywhere in this notebook. Consider renaming.
learning_rate = 1e-4

# define the standalone discriminator model
def define_discriminator(n_inputs):
    """Build and compile the discriminator network.

    Architecture: one hidden Dense layer with ``n_inputs * 10`` units
    (LeakyReLU, he_uniform initialisation) followed by a single sigmoid unit.
    Both layers carry an L1 activity regulariser whose factor is the
    module-level ``learning_rate`` value.

    Args:
        n_inputs: number of input features per sample.

    Returns:
        A Sequential model compiled with binary cross-entropy, the 'Adam'
        optimiser (requested by name) and the accuracy metric.
    """
    hidden_layer = Dense(
        int(n_inputs*10),
        activation='LeakyReLU',
        activity_regularizer=L1(learning_rate),
        kernel_initializer='he_uniform',
        input_dim=n_inputs,
    )
    output_layer = Dense(1, activation='sigmoid', activity_regularizer=L1(learning_rate))
    net = Sequential([hidden_layer, output_layer])
    # compile model
    net.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return net

# define the standalone generator model
def define_generator(latent_dim, n_outputs):
    """Build the (uncompiled) generator network.

    Architecture: one hidden Dense layer with ``n_outputs * 5`` units
    (LeakyReLU, he_uniform initialisation) followed by a linear Dense layer
    with ``n_outputs`` units. Both layers carry an L1 activity regulariser
    whose factor is the module-level ``learning_rate`` value.

    Args:
        latent_dim: dimensionality of the latent input vector.
        n_outputs: number of features in each generated sample.

    Returns:
        An uncompiled Sequential model; it is compiled only as part of the
        combined model built by define_gan.
    """
    hidden_layer = Dense(
        int(n_outputs*5),
        activation='LeakyReLU',
        activity_regularizer=L1(learning_rate),
        kernel_initializer='he_uniform',
        input_dim=latent_dim,
    )
    output_layer = Dense(n_outputs, activation='linear', activity_regularizer=L1(learning_rate))
    return Sequential([hidden_layer, output_layer])

# define the combined generator and discriminator model for updating the generator
def define_gan(generator, discriminator):
    """Stack generator and discriminator into one model for generator updates.

    The discriminator's ``trainable`` flag is set to False before stacking so
    that training the combined model only adjusts the generator. Note that
    this mutates the ``discriminator`` object passed in.

    Args:
        generator: model mapping latent points to samples.
        discriminator: model scoring samples.

    Returns:
        A Sequential model (generator followed by discriminator) compiled with
        binary cross-entropy and the 'adam' optimiser.
    """
    # make weights in the discriminator not trainable
    discriminator.trainable = False
    # connect them: generator output feeds the discriminator
    stacked = Sequential([generator, discriminator])
    # compile model
    stacked.compile(loss='binary_crossentropy', optimizer='adam')
    return stacked

# total rows minus twice the sum of the label column; with 0/1 labels this is
# (#unlabelled rows - #positive rows) of the outlier-filtered training set
n_samples = int(features_num.shape[0]-(2*labels_num.sum()))
# sample real data
def sample_real_data(n=n_samples):
    """Draw ``n`` rows from ``X_positive`` uniformly at random, with replacement.

    Args:
        n: number of rows to draw; defaults to the value ``n_samples`` had
            when this function was defined.

    Returns:
        Tuple ``(X_rand, y_rand)``: the sampled rows and an ``(n, 1)`` array of
        ones used as the "real" class label.
    """
    row_ids = randint(X_positive.shape[0], size=n)
    return X_positive[row_ids, :], ones((n, 1))

# generate points in the latent space as input for the generator
def generate_latent_points(latent_dim, n=n_samples):
    """Sample standard-normal latent vectors for the generator.

    Args:
        latent_dim: dimensionality of each latent vector.
        n: number of latent vectors; defaults to the value ``n_samples`` had
            when this function was defined.

    Returns:
        Array of shape ``(n, latent_dim)``.
    """
    # draw one flat vector, then fold it into a batch of network inputs
    return randn(latent_dim * n).reshape(n, latent_dim)

# use the generator to generate n fake examples with class labels
def generate_fake_samples(generator, latent_dim, n=n_samples):
    """Generate ``n`` synthetic samples together with "fake" class labels.

    Args:
        generator: model exposing ``predict``; fed latent points.
        latent_dim: dimensionality of the latent space.
        n: number of samples; defaults to the value ``n_samples`` had when
            this function was defined.

    Returns:
        Tuple ``(X_fake, y_fake)``: the generator's predictions and an
        ``(n, 1)`` array of zeros used as the "fake" class label.
    """
    latent_batch = generate_latent_points(latent_dim, n)
    return generator.predict(latent_batch, verbose=0), zeros((n, 1))

# evaluate the discriminator
def summarize_performance(epoch, generator, discriminator, latent_dim):
    """Report discriminator accuracy and return a real and a fake sample batch.

    Args:
        epoch: accepted for call-site compatibility; not read in the body.
        generator: generator model used to produce the fake batch.
        discriminator: compiled discriminator whose ``evaluate`` yields
            ``[loss, accuracy]``.
        latent_dim: dimensionality of the latent space.

    Returns:
        Tuple ``(x_real, x_fake, acc_real, acc_fake)``.
    """
    # sampled real rows are only returned to the caller; their labels are unused
    x_real, _ = sample_real_data()
    # NOTE(review): "real" accuracy is measured on the module-level SMOTE set
    # (X_sm, y_sm), which holds both classes, not on the batch sampled above.
    # Confirm this is intended.
    acc_real = discriminator.evaluate(X_sm, y_sm, verbose=0)[1]
    # accuracy on a freshly generated batch labelled as fake
    x_fake, y_fake = generate_fake_samples(generator, latent_dim)
    acc_fake = discriminator.evaluate(x_fake, y_fake, verbose=0)[1]
    return x_real, x_fake, acc_real, acc_fake

# train the generator and discriminator
def train(g_model, d_model, gan_model, latent_dim, n_epochs, n_batch=128, n_eval=100):
    """Train the GAN and return the training data augmented with generated positives.

    Each epoch updates the discriminator on one half-batch of real and one
    half-batch of generated samples, then updates the generator through the
    combined model using "real" labels for generated points. Every ``n_eval``
    epochs the discriminator accuracies are printed.

    After the last epoch a fresh batch is drawn from the trained generator
    (default size of generate_fake_samples, i.e. ``n_samples``), rows with
    |z| >= 3 in any feature are discarded, a label column of ones is appended
    and the result is stacked underneath ``Xy_train_num``.

    Previously the returned synthetic rows were whatever ``x_fake`` happened to
    hold in the final epoch: the evaluation batch when ``n_epochs`` was a
    multiple of ``n_eval``, otherwise the (smaller, pre-update) training
    half-batch; with ``n_epochs == 0`` the return values were undefined.

    Args:
        g_model: generator model.
        d_model: compiled discriminator model.
        gan_model: combined model used to update the generator.
        latent_dim: dimensionality of the latent space.
        n_epochs: number of training iterations.
        n_batch: batch size for the generator update; the discriminator is
            updated on ``n_batch // 2`` real and fake samples each.
        n_eval: evaluate and print accuracies every ``n_eval`` epochs.

    Returns:
        Tuple ``(smote_gan_samples, x_fake_num)``: a DataFrame with columns
        ``num_features_list`` (rows of ``Xy_train_num`` followed by the kept
        generated rows), and the number of generated rows kept.
    """
    # determine half the size of one batch, for updating the discriminator
    half_batch = int(n_batch / 2)
    # manually enumerate epochs
    for i in range(n_epochs):
        # prepare real and fake samples for the discriminator update
        x_real, y_real = sample_real_data(half_batch)
        x_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
        # update discriminator
        d_model.train_on_batch(x_real, y_real)
        d_model.train_on_batch(x_fake, y_fake)
        # prepare points in the latent space as input for the generator
        x_gan = generate_latent_points(latent_dim, n_batch)
        # create inverted labels for the fake samples
        y_gan = ones((n_batch, 1))
        # update the generator via the discriminator's error
        gan_model.train_on_batch(x_gan, y_gan)

        # evaluate the model every n_eval epochs
        if (i+1) % n_eval == 0:
            # the evaluation batches are kept in their own names so they can
            # never leak into the final synthetic set
            eval_real, eval_fake, acc_real, acc_fake = summarize_performance(i, g_model, d_model, latent_dim)
            print(i+1, acc_real, acc_fake)
            # hook point for per-epoch diagnostic plots of eval_real / eval_fake
            # (the GIF cells further down expect frames named epoch_<n>.jpg)

    # final synthetic set: always drawn from the fully trained generator
    x_fake, _ = generate_fake_samples(g_model, latent_dim)
    # remove outliers: keep rows with |z| < 3 in every feature
    x_fake = x_fake[(np.abs(stats.zscore(x_fake)) < 3).all(axis=1)]
    # generated rows are labelled as positives (label column appended last)
    x_fake_ones = ones((x_fake.shape[0], 1))
    x_fake = np.concatenate((x_fake, x_fake_ones), axis=1)
    x_fake_num = x_fake.shape[0]
    smote_gan_samples = np.concatenate((Xy_train_num, x_fake), axis=0)
    smote_gan_samples = pd.DataFrame(smote_gan_samples, columns=num_features_list)

    return smote_gan_samples, x_fake_num

### Single Iteration

In [None]:
# Train one GAN, then attach categorical columns to the augmented numeric table
# and write the result to disk.

# size of the latent space
latent_dim = int(features_num.shape[1]*0.5)
# create the discriminator
discriminator = define_discriminator(features_num.shape[1])
# create the generator
generator = define_generator(latent_dim, features_num.shape[1])
# create the gan
gan_model = define_gan(generator, discriminator)
# train model
smote_gan_samples, x_fake_num = train(generator, discriminator, gan_model, latent_dim, n_epochs=10000)

# The numeric block returned by train() is ordered: positive inliers, unlabelled
# inliers, generated rows. Select the categorical rows in that same order by
# index label instead of relying on the row order of the CSV. Selecting (rather
# than dropping outliers from Xy_train in place) also keeps this cell re-runnable
# and independent of whether another cell already dropped the outliers.
inlier_row_order = Xy_train_positive_num.index.append(Xy_train_unlabelled_num.index)
Xy_train_cat = Xy_train.loc[inlier_row_order, cat_features_list]

# generated rows receive the per-column mode of the positive class;
# mode() can return several rows when values tie, so keep only the first
Xy_train_cat_positive = Xy_train_positive[cat_features_list]
Xy_train_cat_positive_mode = Xy_train_cat_positive.mode().head(1)
Xy_train_cat_positive_mode = Xy_train_cat_positive_mode.loc[Xy_train_cat_positive_mode.index.repeat(x_fake_num)].reset_index(drop=True)
Xy_train_cat = pd.concat([Xy_train_cat, Xy_train_cat_positive_mode]).reset_index(drop=True)

# the side-by-side concat below pairs rows by position, so the counts must agree
assert len(Xy_train_cat) == len(smote_gan_samples), 'categorical and numeric blocks differ in length'
smote_gan_samples = pd.concat([smote_gan_samples, Xy_train_cat], axis=1).reset_index(drop=True)
# restore the column order of the original training table
smote_gan_samples = smote_gan_samples[features_label_list]
smote_gan_samples.to_csv(f'./Datasets/Outputs/smote_gan.csv', index=False)

### Multiple Iterations

In [4]:
# Train the GAN in several rounds and write one augmented dataset per round.
# The same generator/discriminator objects are passed to train() every round,
# so each round continues training from where the previous one stopped.

# size of the latent space
latent_dim = int(features_num.shape[1]*0.5)
# create the discriminator
discriminator = define_discriminator(features_num.shape[1])
# create the generator
generator = define_generator(latent_dim, features_num.shape[1])
# create the gan
gan_model = define_gan(generator, discriminator)

n_iter = 2

# The numeric block returned by train() is ordered: positive inliers, unlabelled
# inliers, generated rows. Select the categorical rows in that same order by
# index label instead of relying on the row order of the CSV. Selecting (rather
# than dropping outliers from Xy_train in place) also keeps this cell re-runnable
# and independent of whether another cell already dropped the outliers.
inlier_row_order = Xy_train_positive_num.index.append(Xy_train_unlabelled_num.index)
Xy_train_cat = Xy_train.loc[inlier_row_order, cat_features_list]

# generated rows receive the per-column mode of the positive class;
# mode() can return several rows when values tie, so keep only the first
Xy_train_cat_positive = Xy_train_positive[cat_features_list]
Xy_train_cat_positive_mode = Xy_train_cat_positive.mode().head(1)

for i in range(n_iter):
    print('--------------------')
    print(f'Iteration {i+1}')
    print('--------------------')

    # train model
    smote_gan_samples, x_fake_num = train(generator, discriminator, gan_model, latent_dim, n_epochs=5000)

    # one mode row per generated sample kept in this round
    Xy_train_cat_positive_mode_ = Xy_train_cat_positive_mode.loc[Xy_train_cat_positive_mode.index.repeat(x_fake_num)].reset_index(drop=True)
    Xy_train_cat_ = pd.concat([Xy_train_cat, Xy_train_cat_positive_mode_]).reset_index(drop=True)
    # the side-by-side concat below pairs rows by position, so the counts must agree
    assert len(Xy_train_cat_) == len(smote_gan_samples), 'categorical and numeric blocks differ in length'
    smote_gan_samples = pd.concat([smote_gan_samples, Xy_train_cat_], axis=1).reset_index(drop=True)
    # restore the column order of the original training table
    smote_gan_samples = smote_gan_samples[features_label_list]
    smote_gan_samples.to_csv(f'./Datasets/Outputs/smote_gan_{i+1}.csv', index=False)

--------------------
Iteration 1
--------------------
100 0.6384615302085876 0.856249988079071
200 0.7589743733406067 0.925000011920929
300 0.8153846263885498 0.8812500238418579
400 0.9179487228393555 0.9125000238418579
500 0.892307698726654 0.7749999761581421
600 0.9179487228393555 0.7562500238418579
700 0.9589743614196777 0.8062499761581421
800 0.9358974099159241 0.75
900 0.9358974099159241 0.8374999761581421
1000 0.9564102292060852 0.856249988079071
1100 0.9256410002708435 0.9312499761581421
1200 0.9564102292060852 0.7562500238418579
1300 0.892307698726654 0.731249988079071
1400 0.7589743733406067 0.7250000238418579
1500 0.7589743733406067 0.8125
1600 0.7538461685180664 0.7875000238418579
1700 0.7769230604171753 0.768750011920929
1800 0.7461538314819336 0.862500011920929
1900 0.6435897350311279 0.9312499761581421
2000 0.7435897588729858 0.925000011920929
2100 0.7948718070983887 0.9624999761581421
2200 0.728205144405365 0.90625
2300 0.7820512652397156 0.84375
2400 0.8846153616905212 

In [None]:
# Assemble per-epoch scatter frames (epochs 100, 200, ..., 10000) into an
# animated GIF with moviepy at 5 frames per second.
# NOTE(review): the frame files must already exist on disk; the only code in
# this notebook that would save them is commented out inside train().
# The imageio cell below writes to the same epochs.gif path.
frame_list = [f'./augmentation/muller2019/epoch_{i}.jpg' for i in range(100, 10001, 100)]
clip = mpy.ImageSequenceClip(frame_list, fps=5)
clip.write_gif('./augmentation/muller2019/epochs.gif')

In [None]:
# Alternative GIF assembly using imageio: read every per-epoch frame
# (epochs 100, 200, ..., 10000) into memory and save them as one GIF.
# Writes to the same epochs.gif path as the moviepy cell.
frame_list = [f'./augmentation/muller2019/epoch_{i}.jpg' for i in range(100, 10001, 100)]
images = [imageio.imread(frame_path) for frame_path in frame_list]

imageio.mimsave('./augmentation/muller2019/epochs.gif', images, loop=5)

In [None]:
# Reload the augmented dataset written by the "Single Iteration" cell
# (smote_gan.csv); the "Multiple Iterations" cell writes smote_gan_<k>.csv instead.
smote_gan_samples_file = f'./Datasets/Outputs/smote_gan.csv'
smote_gan_samples = pd.read_csv(smote_gan_samples_file, index_col=False)

def interactive_hist(df, columns, colorby=None):
    """Show an interactive histogram with a widget for choosing the column.

    Input:
      df: a pandas dataframe containing numerical columns
      columns: the column names offered in the selection widget
      colorby: name of the dataframe column used to colour (stack) the bars
    Output:
      None

    Original author: Jack Maughan, DATAROCK (2/5/2022)
    """

    def update(x):
        # 100-bin histogram of the selected column, stacked by `colorby`
        histogram = px.histogram(df, x=x, nbins=100, color=colorby)
        layout_options = dict(
            xaxis=dict(title=x+' Values'),
            yaxis=dict(title='Counts'),
            title="Distribution Data - "+x,
            barmode='stack',
            autosize=True,
            width=800,
            height=600,
        )
        histogram.update_layout(**layout_options)
        histogram.show('notebook')

    # build the selection widget and render the plot on every change
    interact(update, x=columns)

def plot_3D_scatter(df, columns, colorby=None):
    """Show an interactive 3D scatter plot with widgets for the three axes.

    Rows are drawn as three traces according to the value of the dataframe's
    'color_label' column: 0 -> 'Background', 1 -> 'GAN Deposits',
    2 -> 'Actual Deposits'.

    Input:
      df: a pandas dataframe containing a 'color_label' column
      columns: the column names offered for the x, y and z axes
      colorby: accepted for compatibility; not read by this function
    Output:
      None

    Original author: Jack Maughan, DATAROCK (2/5/2022)
    """
    # (rows, legend name, marker colour) per trace, in drawing order;
    # the subsets are computed once, not on every widget change
    trace_specs = [
        (df[df['color_label']==0], 'Background', '#EF553B'),
        (df[df['color_label']==1], 'GAN Deposits', '#00CC96'),
        (df[df['color_label']==2], 'Actual Deposits', '#636EFA'),
    ]

    def update(x, y, z):
        fig = go.Figure()
        for subset, trace_name, marker_colour in trace_specs:
            marker_style = dict(color=marker_colour, size=6, line=dict(width=2, color='DarkSlateGrey'))
            fig.add_trace(go.Scatter3d(
                x=subset[columns][x],
                y=subset[columns][y],
                z=subset[columns][z],
                name=trace_name,
                marker=marker_style,
                mode='markers',
            ))

        # axis titles follow the selected columns
        fig.update_layout(height=1200, width=1200, showlegend=True, scene1=dict(xaxis_title=x, yaxis_title=y, zaxis_title=z))
        fig.show('notebook')

    # build the three selection widgets and render the plot on every change
    interact(update, x=columns, y=columns, z=columns)

# colour code: 0 = unlabelled (background), 1 = GAN-generated positive, 2 = actual positive
smote_gan_samples['color_label'] = smote_gan_samples['label']
# The augmented table is written as: actual positives, unlabelled rows, generated
# positives (see how train() stacks Xy_train_num and the generated rows). The
# actual positives are therefore the leading run of label == 1 rows. Counting
# that run replaces the former hard-coded `.loc[:36]`, which is an inclusive
# label slice (37 rows) and did not match the number of positives left after
# outlier removal. If the file holds no unlabelled rows at all, every positive
# is counted as actual because the two groups cannot be told apart.
n_actual_positive = int((smote_gan_samples['label'] == 1).astype(int).cumprod().sum())
smote_gan_samples.iloc[:n_actual_positive, smote_gan_samples.columns.get_loc('color_label')] = 2
interactive_hist(smote_gan_samples, smote_gan_samples.columns, colorby='color_label')

In [None]:
# Interactive 3D view of the augmented dataset; requires the 'color_label'
# column added in the previous cell. The colorby argument is passed for
# completeness but plot_3D_scatter does not read it.
plot_3D_scatter(smote_gan_samples, smote_gan_samples.columns, colorby=smote_gan_samples['color_label'])