# Combination of synthetic minority oversampling technique and generative adversarial networks for data augmentation

### Ehsan Farahbakhsh<sup>1</sup>, Sabin Zahirovic<sup>1</sup>, Brent I. A. McInnes<sup>2</sup>, Sara Polanco<sup>1</sup>, Fabian Kohlmann<sup>3</sup>, Maria Seton<sup>1</sup>, R. Dietmar M&uuml;ller<sup>1</sup>

<sup>1</sup>*EarthByte Group, School of Geosciences, The University of Sydney, Sydney, Australia*

<sup>2</sup>*John de Laeter Centre, Faculty of Science and Engineering, Curtin University, Perth, Australia*

<sup>3</sup>*Lithodat Pty. Ltd., Melbourne, Australia*

This notebook enables the user to combine two data augmentation methods, the synthetic minority oversampling technique (SMOTE) and generative adversarial networks (GANs), to increase the number of positive samples. The output of this notebook is used to create a spatio-temporal mineral prospectivity model of the Papua New Guinea and Solomon Islands region using plate motion models.

### Libraries

In [None]:
import imageio
from imblearn.over_sampling import SMOTE
from ipywidgets import interact
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import moviepy.editor as mpy
import numpy as np
from numpy import genfromtxt
from numpy import ones
from numpy import zeros
from numpy.random import randint
from numpy.random import randn
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import L1, L2, L1L2

### SMOTE

In [None]:
Xy_train_file = './augmentation/Xy_train_muller2019.csv'
# Xy_train_file = './augmentation/Xy_train_muller2016.csv'


def _remove_outliers(samples, max_abs_zscore=3):
    """Return the rows of `samples` that have no outlying feature value.

    A value is an outlier when the absolute z-score within its column reaches
    `max_abs_zscore`. The 'label' column is excluded from the z-score computation
    but kept in the returned frame.
    """
    z_scores = np.abs(np.asarray(stats.zscore(samples.drop(columns=['label']))))
    # a constant column has zero variance, so its z-scores are NaN; NaN < threshold is
    # False and would discard every row, hence NaN z-scores are counted as inliers
    is_inlier = (z_scores < max_abs_zscore) | np.isnan(z_scores)
    return samples[is_inlier.all(axis=1)]


Xy_train = pd.read_csv(Xy_train_file, index_col=False)
# column names are kept so that the augmented samples can be written with the same header
features_label_list = Xy_train.columns.tolist()

# split the training dataset based on the label column
Xy_train_positive = Xy_train.loc[Xy_train['label']==1]
Xy_train_unlabelled = Xy_train.loc[Xy_train['label']==0]
print('Number of positive training samples before removing outliers:', Xy_train_positive.shape[0])
print('Number of unlabelled training samples before removing outliers:', Xy_train_unlabelled.shape[0])
# remove outliers separately within each class
Xy_train_positive = _remove_outliers(Xy_train_positive)
Xy_train_unlabelled = _remove_outliers(Xy_train_unlabelled)
print('Number of positive training samples after removing outliers:', Xy_train_positive.shape[0])
print('Number of unlabelled training samples after removing outliers:', Xy_train_unlabelled.shape[0])

Xy_train = pd.concat([Xy_train_positive, Xy_train_unlabelled]).reset_index(drop=True)
Xy_train = Xy_train.to_numpy()
# NOTE(review): the split below is positional and assumes 'label' is the last column
# of the CSV - confirm against the input files
features = Xy_train[:, :-1]
labels = Xy_train[:, -1]

# SMOTE
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(features, labels)
# resampled features with the label appended as the last column
smote_samples = np.concatenate((X_sm, y_sm.reshape(-1, 1)), axis=1)
# feature rows of the positive class after resampling; used as the real data of the GAN
X_positive = X_sm[y_sm == 1]

### GAN

In [None]:
# NOTE(review): despite its name, this value is only passed to the L1/L2 activity
# regularizers of the layers defined below; the optimizers are selected by name
# ('Adam'/'adam') and do not receive it
learning_rate = 1e-2

# define the standalone discriminator model
def define_discriminator(n_inputs):
    """Build and compile the discriminator network.

    The network has one hidden dense layer with `n_inputs*10` units and a single
    sigmoid output unit. Both layers carry an L2 activity regularizer whose factor
    is the module-level `learning_rate`.

    Parameters
    ----------
    n_inputs : number of input features per sample.

    Returns
    -------
    The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    hidden_layer = Dense(
        int(n_inputs*10),
        activation='LeakyReLU',
        activity_regularizer=L2(learning_rate),
        kernel_initializer='he_uniform',
        input_dim=n_inputs,
    )
    output_layer = Dense(1, activation='sigmoid', activity_regularizer=L2(learning_rate))
    model = Sequential([hidden_layer, output_layer])
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model

# define the standalone generator model
def define_generator(latent_dim, n_outputs):
    """Build the (uncompiled) generator network.

    The network has one hidden dense layer with `n_outputs*5` units and a linear
    output layer with `n_outputs` units. Both layers carry an L1 activity
    regularizer whose factor is the module-level `learning_rate`.

    Parameters
    ----------
    latent_dim : size of the latent vector fed to the generator.
    n_outputs : number of features of each generated sample.

    Returns
    -------
    The Sequential model; it is not compiled here.
    """
    hidden_layer = Dense(
        int(n_outputs*5),
        activation='LeakyReLU',
        activity_regularizer=L1(learning_rate),
        kernel_initializer='he_uniform',
        input_dim=latent_dim,
    )
    # alternative output layer without activity regularization:
    # Dense(n_outputs, activation='linear')
    output_layer = Dense(n_outputs, activation='linear', activity_regularizer=L1(learning_rate))
    return Sequential([hidden_layer, output_layer])

# define the combined generator and discriminator model for updating the generator
def define_gan(generator, discriminator):
    """Stack the generator and the discriminator into one compiled model.

    The discriminator is marked as not trainable before the stacked model is
    built, and the stacked model is compiled with binary cross-entropy loss.

    Parameters
    ----------
    generator : model mapping latent points to samples.
    discriminator : model scoring samples; its `trainable` flag is set to False here.

    Returns
    -------
    The compiled Sequential model generator -> discriminator.
    """
    # freeze the discriminator before it becomes part of the stacked model
    discriminator.trainable = False
    # the generator output feeds directly into the discriminator
    stacked = Sequential([generator, discriminator])
    stacked.compile(loss='binary_crossentropy', optimizer='adam')
    return stacked

# default number of samples used by the sampling helpers below: the total number of
# training rows minus twice the sum of the labels; since the labels are 0/1 this equals
# (number of unlabelled rows - number of positive rows)
n_samples = int(features.shape[0]-(2*labels.sum()))
# sample real data
def sample_real_data(n=n_samples):
    """Draw `n` random rows from the module-level `X_positive` array.

    Row indices are drawn with `randint`, so the same row may be picked more
    than once.

    Returns
    -------
    A tuple (rows, targets) where `targets` is an (n, 1) array of ones.
    """
    row_ids = randint(X_positive.shape[0], size=n)
    real_targets = ones((n, 1))
    return X_positive[row_ids, :], real_targets

# generate points in the latent space as input for the generator
def generate_latent_points(latent_dim, n=n_samples):
    """Return an (n, latent_dim) array of standard-normal latent points."""
    # drawing the batch directly in its final shape consumes the same n*latent_dim
    # random values, in the same order, as drawing a flat vector and reshaping it
    return randn(n, latent_dim)

# use the generator to generate n fake examples with class labels
def generate_fake_samples(generator, latent_dim, n=n_samples):
    """Generate `n` samples with the generator and label them as fake.

    Parameters
    ----------
    generator : model whose `predict` maps latent points to samples.
    latent_dim : size of each latent point.
    n : number of samples to generate.

    Returns
    -------
    A tuple (samples, targets) where `targets` is an (n, 1) array of zeros.
    """
    latent_batch = generate_latent_points(latent_dim, n)
    fake_targets = zeros((n, 1))
    fake_samples = generator.predict(latent_batch, verbose=0)
    return fake_samples, fake_targets

# evaluate the discriminator
def summarize_performance(epoch, generator, discriminator, latent_dim):
    """Measure the discriminator accuracy on real and on generated samples.

    A default-size batch of real samples (target 1) and a default-size batch of
    generated samples (target 0) are drawn and the discriminator is evaluated on
    each of them separately.

    Parameters
    ----------
    epoch : index of the current epoch; not used by this function.
    generator : generator model used to produce the fake samples.
    discriminator : compiled discriminator whose `evaluate` returns (loss, accuracy).
    latent_dim : size of the latent points fed to the generator.

    Returns
    -------
    A tuple (x_real, x_fake, acc_real, acc_fake).
    """
    # sample real data
    x_real, y_real = sample_real_data()
    # evaluate the discriminator on the sampled real examples; evaluating on the whole
    # resampled dataset instead would also score unlabelled rows (target 0) that the
    # discriminator is never trained on, so the result would not be a real-sample accuracy
    _, acc_real = discriminator.evaluate(x_real, y_real, verbose=0)
    # prepare fake examples
    x_fake, y_fake = generate_fake_samples(generator, latent_dim)
    # evaluate the discriminator on fake examples
    _, acc_fake = discriminator.evaluate(x_fake, y_fake, verbose=0)
    return x_real, x_fake, acc_real, acc_fake

# train the generator and discriminator
def train(g_model, d_model, gan_model, latent_dim, n_epochs, smote_gan_file, n_batch=128, n_eval=100):
    """Train the GAN and write the augmented dataset to `smote_gan_file`.

    Each epoch updates the discriminator on half a batch of real samples and half a
    batch of generated samples, then updates the generator through `gan_model` using
    a full batch of latent points labelled as real. Every `n_eval` epochs the
    accuracies returned by `summarize_performance` are printed.

    After the last epoch, generated samples are filtered for outliers, given the
    label value 2, appended below the rows of the module-level `Xy_train` array and
    written as CSV with the column names in `features_label_list`.

    Parameters
    ----------
    g_model : generator model.
    d_model : compiled discriminator model.
    gan_model : compiled stacked model used to update the generator.
    latent_dim : size of the latent points.
    n_epochs : number of training iterations; nothing is written when it is not positive.
    smote_gan_file : path of the CSV file to write.
    n_batch : batch size for the generator update; the discriminator uses half of it.
    n_eval : evaluation interval in epochs.

    Returns
    -------
    None
    """
    if n_epochs <= 0:
        # no training step runs, so there is nothing to export
        return

    # determine half the size of one batch, for updating the discriminator
    half_batch = int(n_batch / 2)
    x_generated = None
    # manually enumerate epochs
    for i in range(n_epochs):
        # prepare real samples
        x_real, y_real = sample_real_data(half_batch)
        # prepare fake examples
        x_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
        # update discriminator
        d_model.train_on_batch(x_real, y_real)
        d_model.train_on_batch(x_fake, y_fake)
        # prepare points in the latent space as input for the generator
        x_gan = generate_latent_points(latent_dim, n_batch)
        # create inverted labels for the fake samples
        y_gan = ones((n_batch, 1))
        # update the generator via the discriminator's error
        gan_model.train_on_batch(x_gan, y_gan)

        # evaluate the model every n_eval epochs
        if (i+1) % n_eval == 0:
            # summarize discriminator performance and keep the samples it generated
            _, x_generated, acc_real, acc_fake = summarize_performance(i, g_model, d_model, latent_dim)
            print(i+1, acc_real, acc_fake)

    # when the last epoch was an evaluation epoch its generated samples are exported;
    # otherwise draw a fresh default-size set from the final generator instead of
    # exporting the small half batch that was generated before the last weight updates
    if n_epochs % n_eval != 0:
        x_generated, _ = generate_fake_samples(g_model, latent_dim)

    # remove outliers; a NaN z-score fails the comparison, so such rows are dropped too
    x_generated = x_generated[(np.abs(stats.zscore(x_generated)) < 3).all(axis=1)]
    # NOTE(review): generated rows get the label value 2 - presumably to tell them apart
    # from the 0/1 labels of the original rows; confirm with the downstream consumer
    generated_labels = np.full((x_generated.shape[0], 1), 2)
    x_generated = np.concatenate((x_generated, generated_labels), axis=1)
    smote_gan_samples = np.concatenate((Xy_train, x_generated), axis=0)
    smote_gan_samples = pd.DataFrame(smote_gan_samples, columns=features_label_list)
    smote_gan_samples.to_csv(smote_gan_file, index=False)

    return

### Multiple Iterations

In [None]:
# size of the latent space: half the number of feature columns, rounded down
n_features = features.shape[1]
latent_dim = n_features // 2

n_iter = 10
acc_real_all = []
acc_fake_all = []

rule = '--------------------'
for i in range(n_iter):
    print(rule)
    print(f'Iteration {i+1}')
    print(rule)

    # every iteration starts from freshly initialised networks
    discriminator = define_discriminator(n_features)
    generator = define_generator(latent_dim, n_features)
    gan_model = define_gan(generator, discriminator)
    # train the models and write one augmented dataset per iteration
    # NOTE(review): the output name always contains 'muller2019', also when
    # Xy_train_file points to another plate model - confirm this is intended
    output_file = f'./augmentation/smote_gan_muller2019_{i+1}.csv'
    train(generator, discriminator, gan_model, latent_dim, n_epochs=2000,
          smote_gan_file=output_file)