### Libraries

In [117]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from numpy import genfromtxt
from numpy import ones
from numpy import zeros
from numpy.random import randint
from numpy.random import randn

import pandas as pd
import numpy as np
import plotly.express as px
from ipywidgets import interact, interact_manual
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from imblearn.over_sampling import SMOTE

def interactive_hist(df, columns, colorby=None):
    """Show a dropdown-driven histogram for the columns of a dataframe.

    An ipywidgets dropdown listing ``columns`` is created; every selection
    redraws a stacked Plotly histogram (nbins=100) of the chosen column.

    Input:
      df: pandas dataframe holding the data to plot
      columns: column names offered in the dropdown
      colorby: optional column of ``df`` passed to Plotly as the bar colour
    Output:
      None (the figure is rendered with the 'notebook' renderer)

    Original author: Jack Maughan, DATAROCK (2/5/2022)
    """

    @interact(x=columns)
    def redraw(x):
        histogram = px.histogram(df, x=x, nbins=100, color=colorby)
        # Collect the layout options first, then apply them in one call.
        layout_options = {
            'xaxis': {'title': x + ' Values'},
            'yaxis': {'title': 'Counts'},
            'title': "Distribution Data - " + x,
            'barmode': 'stack',
            'autosize': True,
            'width': 800,
            'height': 600,
        }
        histogram.update_layout(**layout_options)
        histogram.show('notebook')


# pretty-print a confusion matrix
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    '''Pretty-print a confusion matrix as an aligned text table.

    The first line holds the ``pred_<label>`` column headers; each following
    line starts with ``true_<label>`` and lists that row of ``cm`` formatted
    with one decimal place. All cells share one width so columns line up.

    Input:
      cm: square 2-D array-like (numpy array, nested lists, ...) indexed as
          cm[true, predicted]
      labels: class names, in the same order as the rows/columns of ``cm``
      hide_zeroes: blank out cells whose value is 0
      hide_diagonal: blank out the cells on the main diagonal
      hide_threshold: when not None, blank out cells whose value is not
          greater than this threshold (a threshold of 0 is honoured)
    Output:
      None (the table is written to stdout)
    '''
    cm = np.asarray(cm)  # allows nested lists as well as arrays
    names = [str(label) for label in labels]

    # Size every column for the longest *prefixed* label. Padding only the
    # bare prefix (as '%Ns' % 'pred_' + label does, because % binds tighter
    # than +) makes header cells and row labels vary in width and breaks
    # the alignment of the matrix.
    columnwidth = max((len(name) for name in names), default=0) + len('pred_')
    empty_cell = ' ' * columnwidth

    # Header row: blank corner cell, then one right-aligned cell per class.
    header = ['    ' + empty_cell]
    header.extend(('pred_' + name).rjust(columnwidth) for name in names)
    print(' '.join(header))

    # Data rows
    for i, name in enumerate(names):
        row = ['    ' + ('true_' + name).rjust(columnwidth)]
        for j in range(len(names)):
            value = cm[i, j]
            hidden = (
                (hide_zeroes and float(value) == 0)
                or (hide_diagonal and i == j)
                or (hide_threshold is not None and not value > hide_threshold)
            )
            if hidden:
                row.append(empty_cell)
            else:
                row.append('%{0}.1f'.format(columnwidth) % value)
        print(' '.join(row))


def plot_3D_scatter(df, columns, colorby=None):
    """Interactive 3D scatter plot with one trace per 'colour_label' class.

    The rows of ``df`` are split on the 'colour_label' column (values 0, 1
    and 2). Three dropdowns (x, y, z) choose which of ``columns`` go on each
    axis, and every selection redraws the figure.
    Input:
    df: pandas dataframe; must contain a 'colour_label' column
    columns: list of columns within dataframe offered for each axis
    colorby: accepted but not used by this function
    Output:
    None
    Original author: Jack Maughan, DATAROCK (2/5/2022)"""
    # (colour_label value, legend name, marker colour), in drawing order
    class_styles = (
        (0, 'Background', '#EF553B'),
        (1, 'GAN Deposits', '#00CC96'),
        (2, 'Actual Deposits', '#636EFA'),
    )
    # Split once up front; the widget callback only picks columns.
    class_frames = [
        (df[df['colour_label'] == code], legend_name, colour)
        for code, legend_name, colour in class_styles
    ]

    @interact(x=columns, y=columns, z=columns)
    def update(x, y, z):
        fig = go.Figure()
        for frame, legend_name, colour in class_frames:
            selectable = frame[columns]
            marker_style = dict(
                color=colour,
                size=6,
                line=dict(width=2, color='DarkSlateGrey'),
            )
            fig.add_trace(
                go.Scatter3d(
                    x=selectable[x],
                    y=selectable[y],
                    z=selectable[z],
                    name=legend_name,
                    marker=marker_style,
                    mode='markers',
                )
            )

        # Axis titles follow the selected column names
        axis_titles = dict(xaxis_title=x, yaxis_title=y, zaxis_title=z)
        fig.update_layout(height=1200, width=1200, showlegend=True, scene1=axis_titles)
        fig.show('notebook')

### SMOTE

In [118]:
commodity = 'Co'

# Build the file name from `commodity` so that switching commodity is a
# one-line change instead of two edits that can drift apart.
df = pd.read_csv(f'smote_gan_columns_{commodity}.csv')
# .loc slicing is label-based and inclusive: this keeps index labels 0..3104.
# NOTE(review): 3104 is a hard-coded cut-off -- presumably the rows after it
# are not part of the base dataset; confirm and name the constant.
df = df.loc[:3104]
df

Unnamed: 0,Ag_soil,Al_soil,As_soil,Au_soil,Ba_soil,Be_soil,Bi_soil,Ca_soil,Cd_soil,Ce_soil,...,"Neoproterozoic - Ordovician polygons_Slope to basinal shale, siltstone, lime mudstone; minor sandstone and pebbly sandstone at top",Neoproterozoic - Ordovician polygons_Syn-Delamerian mafic intrusive rocks,"Neoproterozoic - Ordovician polygons_Undifferentiated Cambro-Ordovician metasediments, including Kanmantoo Group?",Neoproterozoic - Ordovician polygons_Undifferentiated Neoproterozoic rocks,"Neoproterozoic - Ordovician polygons_Undifferentiated Neoproterozoic to Cambrian metasediments, Neoproterozoic-Ordovician volcanics and mafic intrusives",Neoproterozoic - Ordovician polygons_Undifferentiated Proterozoic rocks,"Neoproterozoic - Ordovician polygons_Volcaniclastics, porphyritic trachyte, dacite","Neoproterozoic - Ordovician polygons_Vuggy dolomite; shelf limestone, dolomite, siltstone, shale and sandstone; glauconitic sandstone","Neoproterozoic - Ordovician polygons_Within-plate amygdaloidal basalt, hyaloclastite, porphyritic and trachytic basalt",label
0,-0.148653,-0.205735,0.133028,-0.133012,-0.181980,0.078759,-0.199128,-0.117539,-0.083921,-0.306286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.505113,2.635810,-1.171303,-0.157419,1.122872,-2.105673,-0.408191,0.926316,-0.598847,2.835151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.351863,-0.205735,-1.886645,-0.156791,-0.181980,0.078759,-0.433241,-1.119143,-0.533638,2.312036,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.551521,-0.205735,2.450703,-0.155771,-0.181980,0.078759,-0.585509,2.671770,-0.643158,-0.303559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.148653,-0.205735,0.133028,-0.133012,-0.181980,0.078759,-0.199128,-0.117539,-0.083921,-0.306286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3100,-0.148653,-0.205735,0.133028,-0.133012,-0.181980,0.078759,-0.199128,-0.117539,-0.083921,-0.306286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3101,-0.148653,-0.205735,0.133028,-0.133012,-0.181980,0.078759,-0.199128,-0.117539,-0.083921,-0.306286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3102,-0.148653,-0.205735,0.133028,-0.133012,-0.181980,0.078759,-0.199128,-0.117539,-0.083921,-0.306286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3103,-0.148653,-0.205735,0.133028,-0.133012,-0.181980,0.078759,-0.199128,-0.117539,-0.083921,-0.306286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
# Feature matrix: every column except 'label'; target vector: the 'label' column.
X = df.drop(columns=['label'])
Y = df['label']

In [120]:
# SMOTE resampling with a fixed random_state
smote = SMOTE(random_state=1)
X_sm, y_sm = smote.fit_resample(X.values, Y.values)
# Resampled features with the label appended as the last column
smote_samples = np.concatenate((X_sm, y_sm.reshape(-1, 1)), axis=1)
# np.savetxt('SMOTE_Samples.csv', smote_samples, delimiter=',')

# Feature columns of the rows whose label (last column) equals 1
X_positive = smote_samples[smote_samples[:, -1] == 1, :-1]

### GAN

In [121]:
# define the standalone discriminator model
def define_discriminator(n_inputs):
    """Build and compile the discriminator network.

    Architecture: Dense(25, relu, he_uniform) taking ``n_inputs`` features,
    followed by a single sigmoid output unit. Compiled with binary
    cross-entropy loss, the Adam optimizer and an accuracy metric.

    Returns the compiled Sequential model.
    """
    hidden = Dense(25, activation='relu', kernel_initializer='he_uniform', input_dim=n_inputs)
    output = Dense(1, activation='sigmoid')
    model = Sequential([hidden, output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# define the standalone generator model
def define_generator(latent_dim, n_outputs):
    """Build the (uncompiled) generator network.

    Architecture: Dense(15, relu, he_uniform) taking ``latent_dim`` inputs,
    followed by a linear layer with ``n_outputs`` units.

    Returns the Sequential model; it is not compiled here.
    """
    hidden = Dense(15, activation='relu', kernel_initializer='he_uniform', input_dim=latent_dim)
    output = Dense(n_outputs, activation='linear')
    return Sequential([hidden, output])

# define the combined generator and discriminator model for updating the generator
def define_gan(generator, discriminator):
    """Stack generator and discriminator into one model for generator updates.

    Side effect: sets ``discriminator.trainable = False`` on the object that
    was passed in, before the combined model is compiled.

    Returns a Sequential model (generator followed by discriminator) compiled
    with binary cross-entropy loss and the Adam optimizer.
    """
    # make weights in the discriminator not trainable
    discriminator.trainable = False
    # generator output feeds straight into the discriminator
    combined = Sequential([generator, discriminator])
    combined.compile(loss='binary_crossentropy', optimizer='adam')
    return combined

# sample real data
def sample_real_data(n):
    """Pick ``n`` random rows of the global ``X_positive`` and label them 1.

    Row indices come from ``randint``, so the same row may be picked more
    than once.

    Returns (rows, labels), where ``labels`` is an (n, 1) array of ones.
    """
    row_ids = randint(X_positive.shape[0], size=n)
    return X_positive[row_ids, :], ones((n, 1))

# generate points in the latent space as input for the generator
def generate_latent_points(latent_dim, n=None):
    """Draw ``n`` standard-normal latent vectors of length ``latent_dim``.

    Input:
      latent_dim: number of values per latent vector
      n: number of vectors; when omitted it is X.shape[0] - 2 * Y.sum(),
         read from the globals ``X`` and ``Y`` at call time
         (presumably negatives minus positives for 0/1 labels -- confirm)
    Output:
      array of shape (n, latent_dim)
    """
    if n is None:
        # Resolve the default when the function is called, not when it is
        # defined: a default expression in the signature is evaluated once,
        # so it would go stale if X / Y are reloaded and would require the
        # data to exist before this cell can run.
        n = int(X.shape[0] - (2 * Y.sum()))
    # Same draws as randn(latent_dim * n).reshape(n, latent_dim)
    return randn(n, latent_dim)

# use the generator to generate n fake examples with class labels
def generate_fake_samples(generator, latent_dim, n=None):
    """Generate ``n`` samples with the generator and label them 0 (fake).

    Input:
      generator: model exposing ``predict``
      latent_dim: size of each latent input vector
      n: number of samples; when omitted it is X.shape[0] - 2 * Y.sum(),
         read from the globals ``X`` and ``Y`` at call time
    Output:
      (X_fake, y_fake): generator predictions and an (n, 1) array of zeros
    """
    if n is None:
        # Resolved per call so the default follows the data currently loaded;
        # a default expression in the signature is evaluated only once, when
        # the function is defined.
        n = int(X.shape[0] - (2 * Y.sum()))
    # generate points in the latent space
    x_input = generate_latent_points(latent_dim, n)
    # predict outputs
    X_fake = generator.predict(x_input)
    # create class labels
    y_fake = zeros((n, 1))
    return X_fake, y_fake

# evaluate the discriminator
def summarize_performance(epoch, generator, discriminator, latent_dim):
    """Print discriminator accuracy and return the data augmented with fakes.

    Prints ``epoch``, the discriminator accuracy on the globals
    (X_sm, y_sm) and its accuracy on freshly generated samples.

    The augmented sample array is returned on every call. Returning it only
    when ``epoch == 999`` tied the result to a 1000-epoch schedule and
    produced ``None`` for any other training length.

    Input:
      epoch: epoch index, used only in the printed summary
      generator, discriminator: the two models
      latent_dim: size of the generator's latent input
    Output:
      2-D array: the rows of the global ``df`` followed by the generated
      rows, each generated row carrying a trailing 1 in the last column
      (assumes the label is the last column of ``df`` -- confirm)
    """
    # evaluate the discriminator on real examples
    # NOTE(review): this scores the discriminator against y_sm (class labels)
    # rather than an all-ones real/fake target -- confirm this is intended.
    _, acc_real = discriminator.evaluate(X_sm, y_sm, verbose=0)
    # prepare fake examples
    x_fake, y_fake = generate_fake_samples(generator, latent_dim)
    # evaluate discriminator on fake examples
    _, acc_fake = discriminator.evaluate(x_fake, y_fake, verbose=0)
    # summarize the discriminator performance
    print(epoch, acc_real, acc_fake)

    # append a column of ones to the generated rows, then stack under df
    positive_label = ones((x_fake.shape[0], 1))
    labelled_fake = np.concatenate((x_fake, positive_label), axis=1)
    smote_gan_samples = np.concatenate((df.values, labelled_fake), axis=0)
    #np.savetxt(f'../Gawler_MPM/Gawler/{commodity}/smote_gan_{commodity}_JM.csv', smote_gan_samples, delimiter=',')
    return smote_gan_samples

# train the generator and discriminator
def train(g_model, d_model, gan_model, latent_dim, n_epochs=1000, n_batch=128, n_eval=100):
    """Alternate discriminator and generator updates for ``n_epochs`` steps.

    Each step trains the discriminator on half a batch of real rows and half
    a batch of generated rows, then trains the generator through
    ``gan_model`` on a full batch of latent points labelled 1. Every
    ``n_eval`` steps ``summarize_performance`` is called.

    Input:
      g_model, d_model, gan_model: generator, discriminator, combined model
      latent_dim: size of the generator's latent input
      n_epochs: number of training steps
      n_batch: batch size for the generator update
      n_eval: evaluation interval, in steps
    Output:
      the most recent non-None value returned by ``summarize_performance``,
      or None when no evaluation produced one (e.g. n_epochs < n_eval)
    """
    # determine half the size of one batch for updating the discriminator
    half_batch = int(n_batch / 2)
    # Bound before the loop so a run without any evaluation returns None
    # instead of raising UnboundLocalError.
    smote_gan_samples = None
    # manually enumerate epochs
    for i in range(n_epochs):
        # prepare real samples
        x_real, y_real = sample_real_data(half_batch)
        # prepare fake examples
        x_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
        # update the discriminator
        d_model.train_on_batch(x_real, y_real)
        d_model.train_on_batch(x_fake, y_fake)
        # prepare points in the latent space as input for the generator
        x_gan = generate_latent_points(latent_dim, n_batch)
        # create inverted labels for the fake samples
        y_gan = ones((n_batch, 1))
        # update the generator via the discriminator's error
        gan_model.train_on_batch(x_gan, y_gan)
        # evaluate the model every n_eval epochs
        if (i+1) % n_eval == 0:
            evaluated = summarize_performance(i, g_model, d_model, latent_dim)
            # Never let an evaluation that returned nothing discard an
            # earlier result.
            if evaluated is not None:
                smote_gan_samples = evaluated

    return smote_gan_samples

In [122]:
# size of the latent space
latent_dim = 10000
# NOTE(review): no numpy / TensorFlow seed is set anywhere before training,
# so results differ from run to run -- consider seeding.
# Build order: discriminator, generator, then the combined GAN model.
# Both networks are sized from the number of feature columns in X.
discriminator = define_discriminator(n_inputs=X.shape[1])
generator = define_generator(latent_dim=latent_dim, n_outputs=X.shape[1])
gan_model = define_gan(generator=generator, discriminator=discriminator)


In [123]:
# train the model with the default schedule and keep what train() returns
dfout = train(
    g_model=generator,
    d_model=discriminator,
    gan_model=gan_model,
    latent_dim=latent_dim,
)

99 0.8395739186571982 1.0
199 0.9244673983214977 0.990294403105791
299 0.974176888315042 1.0
399 0.9887023886378309 1.0
499 0.9898321497740478 1.0
599 0.9890251775338929 1.0
699 0.974499677211104 1.0
799 0.9774047772756617 0.9996764801035264
899 0.9667527437056165 1.0
999 0.9769205939315687 0.999029440310579


In [124]:
# Wrap the array returned by train() in a DataFrame, reusing df's column names.
dfout = pd.DataFrame(data=dfout, columns=df.columns)

# geol_cols = [col for col in list(dfout.columns) if any(x in col for x in ['Neoproterozoic', 'Mesoproterozoic'])]
# geol_cols = [col for col in geol_cols if 'Contact' not in col]
# dfout[geol_cols] = dfout[geol_cols].round()

# chem_cols = [col for col in list(dfout.columns) if any(x in col for x in ['_rock', '_soil', '_stream'])]

# dfout = dfout[[col for col in dfout.columns if col not in chem_cols]]

In [125]:
# colour_label copies 'label', except index labels 0..7, which are set to 2
# (the code that plot_3D_scatter draws as 'Actual Deposits').
dfout['colour_label'] = dfout['label'].where(dfout.index > 7, 2)
dfout

Unnamed: 0,Ag_soil,Al_soil,As_soil,Au_soil,Ba_soil,Be_soil,Bi_soil,Ca_soil,Cd_soil,Ce_soil,...,Neoproterozoic - Ordovician polygons_Syn-Delamerian mafic intrusive rocks,"Neoproterozoic - Ordovician polygons_Undifferentiated Cambro-Ordovician metasediments, including Kanmantoo Group?",Neoproterozoic - Ordovician polygons_Undifferentiated Neoproterozoic rocks,"Neoproterozoic - Ordovician polygons_Undifferentiated Neoproterozoic to Cambrian metasediments, Neoproterozoic-Ordovician volcanics and mafic intrusives",Neoproterozoic - Ordovician polygons_Undifferentiated Proterozoic rocks,"Neoproterozoic - Ordovician polygons_Volcaniclastics, porphyritic trachyte, dacite","Neoproterozoic - Ordovician polygons_Vuggy dolomite; shelf limestone, dolomite, siltstone, shale and sandstone; glauconitic sandstone","Neoproterozoic - Ordovician polygons_Within-plate amygdaloidal basalt, hyaloclastite, porphyritic and trachytic basalt",label,colour_label
0,-0.148653,-0.205735,0.133028,-0.133012,-0.181980,0.078759,-0.199128,-0.117539,-0.083921,-0.306286,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,2.0
1,-0.505113,2.635810,-1.171303,-0.157419,1.122872,-2.105673,-0.408191,0.926316,-0.598847,2.835151,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,2.0
2,-0.351863,-0.205735,-1.886645,-0.156791,-0.181980,0.078759,-0.433241,-1.119143,-0.533638,2.312036,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,2.0
3,-0.551521,-0.205735,2.450703,-0.155771,-0.181980,0.078759,-0.585509,2.671770,-0.643158,-0.303559,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,2.0
4,-0.148653,-0.205735,0.133028,-0.133012,-0.181980,0.078759,-0.199128,-0.117539,-0.083921,-0.306286,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6191,0.058677,0.207723,-1.500996,-0.165690,0.410535,-0.560511,0.267749,1.512168,-0.526435,0.621749,...,-0.344134,0.103029,0.163540,0.466295,-0.029796,-0.012015,0.074537,0.538003,1.0,1.0
6192,-0.064937,0.112651,-2.138408,0.154473,0.548312,-0.929416,-0.955932,0.849504,-0.366380,0.021879,...,-0.537636,0.027211,0.073783,0.365769,0.384416,-0.400565,0.262109,0.175127,1.0,1.0
6193,-0.402564,0.843360,-1.670890,-0.190899,0.588073,-0.887988,-0.490653,1.091639,-0.610281,1.211450,...,-0.306854,0.341653,0.194375,0.592679,0.522326,0.065116,0.207429,0.609635,1.0,1.0
6194,0.251078,-0.073178,-1.719808,0.170403,0.280495,-0.706585,-0.151677,1.488154,-0.336283,0.014044,...,-0.480600,-0.227420,0.079443,0.452700,0.105450,-0.355213,0.071087,0.195553,1.0,1.0


In [126]:
# Histogram browser over every column of dfout, coloured by 'colour_label'.
interactive_hist(df=dfout, columns=dfout.columns, colorby='colour_label')

interactive(children=(Dropdown(description='x', options=('Ag_soil', 'Al_soil', 'As_soil', 'Au_soil', 'Ba_soil'…

In [116]:
# 3D scatter browser over every column of dfout.
# NOTE(review): the saved output of this cell carries an earlier execution
# count and lists different column names than dfout shows above -- re-run
# the notebook top to bottom to refresh it.
plot_3D_scatter(df=dfout, columns=dfout.columns, colorby=dfout['colour_label'])

interactive(children=(Dropdown(description='x', options=('SA_TMI_GDA94_mean', 'SA_TMI_GDA94_std', 'SA_TMI_GDA9…