In [None]:
"""
GAN oversampling for banknote authentication

"""



import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pandas as pd
from pandas import DataFrame
from numpy import ndarray,random
from sklearn import preprocessing

np.random.seed(10)


In [None]:
#function to normalize data

def normalize(x):
    """Min-max scale every column of ``x`` to the interval [0, 1].

    Parameters
    ----------
    x : numpy.ndarray
        2-D array of shape (n_samples, n_features). The input is not
        modified; a float copy is scaled and returned.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``x`` where each column has been
        mapped through ``(value - column_min) / (column_max - column_min)``.
        A column whose values are all equal is mapped to 0 (instead of the
        NaN that a 0/0 division would produce). An empty array is returned
        unchanged (as float).
    """
    x = x.astype(float)  # astype copies, so the caller's array is untouched
    if x.size == 0:
        return x

    col_min = x.min(axis=0)
    col_range = x.max(axis=0) - col_min
    # A constant column has zero range; divide by 1 so it becomes all zeros.
    col_range = np.where(col_range == 0, 1.0, col_range)
    return (x - col_min) / col_range
            

In [None]:
# load the banknote authentication data and preprocess it
banknote = pd.read_csv('banknote_auth.csv')

# input data: the first four columns, standardized with sklearn's
# preprocessing.scale and then min-max scaled by normalize()
X = normalize(preprocessing.scale(np.asarray(banknote.iloc[:, 0:4])))

# training output: the fifth column as a flat 1-D array
y = np.ravel(np.asarray(banknote.iloc[:, 4]))



In [None]:
def shuffle(x,y):
    """Shuffle the rows of ``x`` and the entries of ``y`` with one shared permutation.

    ``y`` is appended to ``x`` as an extra column, the combined rows are
    shuffled in place with ``np.random.shuffle`` (so the global NumPy seed
    controls the result), and the two parts are split again. Because both
    parts pass through one combined array, the returned labels take that
    array's common dtype. The inputs themselves are left untouched.

    Returns the shuffled feature matrix and the shuffled 1-D label vector.
    """
    labels_column = np.reshape(y, (y.shape[0], 1))
    stacked = np.concatenate((x, labels_column), axis=1)
    np.random.shuffle(stacked)  # row-wise, in place
    features, labels = stacked[:, 0:-1], stacked[:, -1]
    return features, np.ravel(labels)

In [None]:
#train_test_split: stratified 60/40 split, so the class proportions of y are kept in both parts
X,y=shuffle(X,y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,shuffle=True,stratify=y, random_state=np.random.randint(0,100))

# The class masks must be built from y_train (the labels that belong to the
# rows of X_train). train_test_split subsets and reorders the samples, so the
# full label vector y is not aligned with X_train row by row.

#extract class 1 data from training set
hazardous = X_train[y_train == 1]

#extract class 0 data from training set
non_hazardous = X_train[y_train == 0]


In [None]:
#dictionary of all models used in this notebook
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from collections import OrderedDict
models = OrderedDict([
#('Knn 2', KNeighborsClassifier(2)),
#('Naive Bayes', GaussianNB()),
#('Logistic Regression', LogisticRegression()),
#('Classification Tree', DecisionTreeClassifier(max_depth=18)),
#('Random Forest', RandomForestClassifier(max_depth=5, n_estimators=50)),
# One hidden layer with 4 units. Note the trailing comma: `(4)` would be the
# plain int 4, not a tuple, so the layer sizes are spelled as the 1-tuple (4,).
('Multilayer Perceptron', MLPClassifier(hidden_layer_sizes=(4,), activation='logistic',solver='adam', max_iter=100000,learning_rate_init=0.01, random_state=np.random.randint(0,100)))
])

In [None]:
#performs GAN oversampling with Tensorflow

def GAN_ups(hazardous,name,ups_size):
    """Train a small GAN on ``hazardous`` and return ``ups_size`` generated samples.

    Built with the TensorFlow 1.x graph/session API (``tf.placeholder``,
    ``tf.Session``). The generator maps 4-dimensional uniform noise through
    one hidden ReLU layer of 10 units to 4 sigmoid outputs. Training makes a
    single pass over ``hazardous`` with batch size 1; the generator and the
    discriminator are both updated in the same ``sess.run`` call at each step.

    Parameters
    ----------
    hazardous : numpy.ndarray
        Real samples the generator should imitate; fed to a placeholder of
        shape (None, 4), so it must have 4 columns.
    name : str
        Selects the discriminator architecture. 'Multilayer Perceptron' uses
        one hidden ReLU layer of 10 units followed by a sigmoid unit; any
        other value uses a single sigmoid unit on the raw input.
    ups_size : int
        Number of samples to generate. If 0, nothing is trained and
        ``hazardous`` itself is returned.

    Returns
    -------
    numpy.ndarray
        Generated samples with shape (ups_size, 4), or ``hazardous`` when
        ``ups_size`` is 0.
    """
    if ups_size==0:
        return hazardous

    # Training Params
    num_steps = hazardous.shape[0]
    learning_rate = 1e-4
    batch_size = 1
    # Network Params
    image_dim = 4         # number of features per sample
    gen_hidden_dim = 10
    disc_hidden_dim = 10
    noise_dim = 4         # dimension of the noise fed to the generator

    use_mlp_disc = (name == 'Multilayer Perceptron')

    # Build everything in a private graph. Without this, every call would add
    # another copy of the variables and ops to the process-wide default graph.
    with tf.Graph().as_default():

        # A custom initialization (see Xavier Glorot init)
        def glorot_init(shape):
            return tf.random_normal(shape=shape,seed=np.random.randint(0,100),stddev=1. / tf.sqrt(shape[0] / 2.))

        # Store layers weight & bias
        weights = {
            'gen_hidden1': tf.Variable(glorot_init([noise_dim, gen_hidden_dim])),
            'gen_out': tf.Variable(glorot_init([gen_hidden_dim, image_dim])),
            'disc_hidden1': tf.Variable(glorot_init([image_dim, disc_hidden_dim])),
            'disc_out': tf.Variable(glorot_init([disc_hidden_dim, 1])),
            'log_disc': tf.Variable(glorot_init([image_dim, 1]))
        }
        biases = {
            'gen_hidden1': tf.Variable(tf.zeros([gen_hidden_dim])),
            'gen_out': tf.Variable(tf.zeros([image_dim])),
            'disc_hidden1': tf.Variable(tf.zeros([disc_hidden_dim])),
            'disc_out': tf.Variable(tf.zeros([1])),
            'log_disc': tf.Variable(tf.zeros([1]))
        }

        # Generator: noise -> ReLU hidden layer -> sigmoid output
        def generator(x):
            hidden_layer = tf.matmul(x, weights['gen_hidden1'])
            hidden_layer = tf.add(hidden_layer, biases['gen_hidden1'])
            hidden_layer = tf.nn.relu(hidden_layer)
            out_layer = tf.matmul(hidden_layer, weights['gen_out'])
            out_layer = tf.add(out_layer, biases['gen_out'])
            return tf.nn.sigmoid(out_layer)

        # Discriminator: probability that the input is a real sample
        def discriminator(x):
            if use_mlp_disc:
                hidden_layer = tf.matmul(x, weights['disc_hidden1'])
                hidden_layer = tf.add(hidden_layer, biases['disc_hidden1'])
                hidden_layer = tf.nn.relu(hidden_layer)
                out_layer = tf.matmul(hidden_layer, weights['disc_out'])
                out_layer = tf.add(out_layer, biases['disc_out'])
            else:
                out_layer = tf.matmul(x, weights['log_disc'])
                out_layer = tf.add(out_layer, biases['log_disc'])
            return tf.nn.sigmoid(out_layer)

        # Network Inputs
        gen_input = tf.placeholder(tf.float32, shape=[None, noise_dim], name='input_noise')
        disc_input = tf.placeholder(tf.float32, shape=[None, image_dim], name='disc_input')

        # Build Generator Network
        gen_sample = generator(gen_input)

        # Build 2 Discriminator Networks (one on real input, one on generated samples)
        disc_real = discriminator(disc_input)
        disc_fake = discriminator(gen_sample)

        # Build Loss
        gen_loss = -tf.reduce_mean(tf.log(disc_fake))
        disc_loss = -(tf.reduce_mean(tf.log(disc_real) + tf.log(1. - disc_fake)))

        # Build Optimizers
        optimizer_gen = tf.train.AdamOptimizer(learning_rate=learning_rate)
        optimizer_disc = tf.train.AdamOptimizer(learning_rate=learning_rate)

        # Each optimizer only updates the variables of its own network.
        gen_vars = [weights['gen_hidden1'], weights['gen_out'],
                    biases['gen_hidden1'], biases['gen_out']]
        if use_mlp_disc:
            disc_vars = [weights['disc_hidden1'], weights['disc_out'],
                         biases['disc_hidden1'], biases['disc_out']]
        else:
            disc_vars = [weights['log_disc'], biases['log_disc']]

        # Create training operations
        train_gen = optimizer_gen.minimize(gen_loss, var_list=gen_vars)
        train_disc = optimizer_disc.minimize(disc_loss, var_list=disc_vars)

        init = tf.global_variables_initializer()
        # The `with` block closes the session on exit, including on return.
        with tf.Session() as sess:

            # Run the initializer
            sess.run(init)

            # Batches start at 0 .. num_steps - batch_size inclusive, so that
            # the final rows of `hazardous` are also used for training.
            last_step = num_steps - batch_size
            for i in range(last_step + 1):
                batch_x = hazardous[i:i+batch_size]

                # Generate noise to feed to the generator
                z = np.random.uniform(-1, 1., size=[batch_size, noise_dim])

                # Train generator and discriminator on this batch
                feed_dict = {disc_input: batch_x, gen_input: z}
                _, _, gl, dl = sess.run([train_gen, train_disc, gen_loss, disc_loss], feed_dict=feed_dict)
                if i == last_step:
                    print('Step %i: Generator Loss: %f, Discriminator Loss: %f' % (i, gl, dl))

            #generate new data
            z = np.random.uniform(-1, 1., size=[ups_size, noise_dim])
            g = sess.run(gen_sample, feed_dict={gen_input: z})
            return np.asarray(g).reshape(ups_size, image_dim)

In [None]:
#concatenates old data and generated one in a unique dataset;

def new_data(X_train,y_train,x):
    """Append the generated rows ``x`` to the training set as class-1 samples.

    Returns the row-wise concatenation of ``X_train`` and ``x`` together with
    ``y_train`` extended by one label 1.0 per row of ``x``.
    """
    generated_labels = np.ones(x.shape[0])
    features = np.concatenate([X_train, x], axis=0)
    labels = np.concatenate([y_train, generated_labels], axis=0)
    return features, labels
        

In [None]:
#performs bootstrap of a matrix of datapoints
def bootstrap(x,boot_size):
    """Draw ``boot_size`` rows of ``x`` uniformly at random, with replacement.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array whose rows are the data points to resample.
    boot_size : int
        Number of rows to draw. If 0, ``x`` itself is returned unchanged.

    Returns
    -------
    numpy.ndarray
        Array of shape (boot_size, x.shape[1]) holding the sampled rows
        (or ``x`` when ``boot_size`` is 0). Sampling uses the global NumPy
        random state.
    """
    if boot_size==0:
        return x

    # np.random.randint excludes its upper bound, so the bound must be the
    # number of rows itself for the last row to be eligible.
    picks = np.random.randint(0, x.shape[0], boot_size)
    return x[picks]

In [None]:
#discretize some features
def discretize(x):
    """Replace selected columns of ``x`` by evenly spaced discrete levels, in place.

    Columns 0, 1, 2, 7, 8, 9, 10 and 11 are binned with ``np.digitize``
    against fixed edges, and each value is replaced by
    ``(bin_index - 1) / (number_of_edges - 2)``. Columns 3 to 6 are left
    untouched. ``x`` therefore needs at least 12 columns; it is modified in
    place and also returned.
    """
    edges_by_column = {
        0: np.arange(-0.5,2.5,1),
        1: np.arange(-0.5,3.5,1)/2.0,
        2: np.arange(-0.5,2.5,1),
        7: np.arange(-0.5,3.5,1)/2.0,
        8: np.arange(-0.5,10.5,1)/9.0,
        9: np.arange(-0.5,9.5,1)/8.0,
        10: np.arange(-0.5,8.5,1)/7.0,
        11: np.arange(-0.5,4.5,1)/3.0,
    }

    for column, edges in edges_by_column.items():
        bin_index = np.digitize(x[:, column], edges)
        x[:, column] = (bin_index - 1) / (edges.shape[0] - 2.0)

    return x


In [None]:
#shuffle data
def shuffle(x,y):
    """Jointly shuffle the rows of ``x`` and the labels ``y``.

    Same behavior as the ``shuffle`` defined earlier in this notebook: ``y``
    is attached to ``x`` as a last column, the rows of the combined array are
    permuted in place by ``np.random.shuffle``, and the features and the
    flattened label column are returned separately.
    """
    joined = np.concatenate((x, np.reshape(y, (y.shape[0], 1))), axis=1)
    np.random.shuffle(joined)
    return joined[:, 0:-1], np.ravel(joined[:, -1])

In [None]:

#initialization
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

# classifier picked from the `models` dictionary by its key
name = 'Multilayer Perceptron'
clf = models[name]

# lists of candidate sizes
b_ord_of_mag = [100000, 300000, 500000]
boot_nh_mag = [1000, 5000, 10000, 20000, 50000]
u_ord_of_mag = [1000, 1500, 2000, 5000, 10000]

# sizes used by the experiment: rows drawn by bootstrap() and
# samples generated by GAN_ups()
boot_size = 100000
ups_size = 3000


    






In [None]:
from sklearn import decomposition
pca=decomposition.PCA(n_components=4)

# Bootstrap the class-1 training rows so the GAN has boot_size samples to train on.
hazards=np.asarray(hazardous)
hazards= bootstrap(hazards,boot_size)

trials=1
n_runs=0      # number of (trial, model) evaluations accumulated below
av_acc=0
av_rec=0
for i in range(trials):
    np.random.seed(i)
    for name, clf in models.items():
        # Generate synthetic class-1 samples and rescale them to [0, 1].
        hazard_GAN= GAN_ups(hazards,name,ups_size)
        hazard_GAN=normalize(hazard_GAN)

        # Train on the original training split plus the generated samples;
        # the test split stays untouched.
        Xu_train, yu_train= new_data(X_train,y_train,hazard_GAN)
        Xu_train,yu_train=shuffle(Xu_train,yu_train)
        clf.fit(Xu_train, yu_train)
        score = clf.score(X_test, y_test)
        y_pred=clf.predict(X_test)
        rec_score= recall_score(y_test,y_pred)
        print(score,rec_score,name," boot_size:",boot_size," ups_size:",ups_size)
        av_acc=av_acc+score
        av_rec=av_rec +rec_score
        n_runs=n_runs+1

        # Fit the PCA once on the real data and project both sets with it, so
        # that the two point clouds share the same axes and can be compared.
        pca.fit(hazards)
        data={'original_data':hazards,'gen_data':hazard_GAN}
        for key,value in data.items():
            value_new=pca.transform(value)
            plt.scatter(value_new[:,0],value_new[:,1],label=key,alpha=0.5)

        plt.legend()
        plt.show()

# Average over every evaluation that was accumulated (trials x models).
av_acc=av_acc/max(n_runs,1)
av_rec=av_rec/max(n_runs,1)
print("average accuracy: ",av_acc,"average_recall: ",av_rec)
    

In [None]:
from numpy import linalg as LA


# Compare the covariance structure of the real and the generated samples.
data={'original_data':hazards,'generated data':hazard_GAN}
for key,value in data.items():
    cov=np.cov(value.T)
    print(cov.shape)
    # A covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues in ascending order. Reverse them (and the matching
    # eigenvector columns) so that index 0 is the largest eigenvalue.
    w,v=LA.eigh(cov)
    w,v=w[::-1],v[:,::-1]
    print(key,w[0:2])
    print(v[:,0:3])

   

In [None]:
# Display the most recently generated (and re-normalized) GAN samples.
hazard_GAN

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import linalg as LA


my_range=[0,2]
# One figure per feature: overlay the distribution of the real samples and of
# the GAN-generated samples.
for j in range(4):
    z=hazards[:,j]
    w=hazard_GAN[:,j]

    # `density=True` normalizes each histogram to unit area; it replaces the
    # `normed` keyword, which current Matplotlib versions no longer accept.
    plt.hist(z,density=True,bins=100,alpha=0.5,label='original_data')
    plt.hist(w,density=True,bins=100,alpha=0.5,label='gen_data')
    plt.title('feature %d' % j)
    plt.legend()

    plt.show()
     
      