In [None]:
"""
Artificial noise upsampling for a comparison with GAN

"""

from __future__ import division, print_function, absolute_import

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pandas as pd
from pandas import DataFrame
from numpy import ndarray,random
import random

# Seed NumPy's global RNG so that later np.random draws in this notebook
# (e.g. the random_state values drawn with np.random.randint) repeat across runs.
np.random.seed(1)




In [None]:
def normalize(x):
    """Min-max scale every column of a 2-D array into the range [0, 1].

    Parameters
    ----------
    x : array-like of shape (n_rows, n_columns)
        Numeric data; it is converted to a float copy, so the caller's
        array is never modified.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape where each column has been mapped
        with ``(value - column_min) / (column_max - column_min)``.
        A column whose values are all equal is mapped to 0.0 instead of
        producing NaN from a 0/0 division.

    Raises
    ------
    ValueError
        If ``x`` is not two-dimensional.
    """
    x = np.asarray(x).astype(float)
    if x.ndim != 2:
        raise ValueError("normalize expects a 2-D array, got %d-D" % x.ndim)
    if x.shape[0] == 0:
        # Nothing to scale; min/max are undefined on zero rows.
        return x

    col_min = x.min(axis=0)
    col_range = x.max(axis=0) - col_min
    # A constant column has zero range; substitute 1 so the division yields
    # 0.0 for that column rather than NaN.
    safe_range = np.where(col_range == 0, 1.0, col_range)
    return (x - col_min) / safe_range
            

In [None]:
# Load the banknote dataset from CSV and build the feature matrix / label vector.
from sklearn import preprocessing

banknote = pd.read_csv('banknote_auth.csv')

# Features: first four columns, standardized with sklearn and then
# min-max scaled column by column with normalize().
X = normalize(preprocessing.scale(np.asarray(banknote.iloc[:, 0:4])))

# Labels: fifth column, flattened to a 1-D array.
y = np.ravel(np.asarray(banknote.iloc[:, 4]))


In [None]:
# Stratified train/test split (40% held out for testing); stratify=y keeps the
# class proportions of y in both subsets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,shuffle=True,random_state=np.random.randint(0,100),stratify=y)

# Separate the TRAINING rows by class. The mask has to be built from y_train:
# the split shuffles the samples, so position i of the full label vector y no
# longer corresponds to row i of X_train.

#extract class 1 data from training set
hazardous = np.asarray(X_train[y_train == 1])

#extract class 0 data from training set
non_hazardous = np.asarray(X_train[y_train == 0])

In [None]:
#dictionary of all models used in this notebook
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from collections import OrderedDict
# Registry of classifiers to evaluate, keyed by display name (insertion order
# is the evaluation order). Alternatives are kept commented out for quick toggling.
models = OrderedDict()
#models['Knn 2'] = KNeighborsClassifier(2)
#models['Naive Bayes'] = GaussianNB()
#models['Logistic Regression'] = LogisticRegression()
#models['Classification Tree'] = DecisionTreeClassifier(max_depth=18)
#models['Random Forest'] = RandomForestClassifier(max_depth=5, n_estimators=50)
models['Multilayer Perceptron'] = MLPClassifier(
    hidden_layer_sizes=4,
    activation='logistic',
    solver='adam',
    max_iter=100000,
    learning_rate_init=0.01,
    # Seed drawn from NumPy's global RNG at definition time.
    random_state=np.random.randint(0,100),
)

In [None]:
def new_data(X_train,y_train,x):
    """Append generated samples to an existing training set.

    The rows of ``x`` are stacked below ``X_train`` and every appended row
    receives the label 1.0.

    Returns
    -------
    (Xu_train, yu_train)
        The enlarged feature array and the matching enlarged label array.
    """
    generated_labels = np.ones(x.shape[0])
    features = np.concatenate((X_train, x), axis=0)
    labels = np.concatenate((y_train, generated_labels), axis=0)
    return features, labels
    
        
        

In [None]:
def bootstrap(x,boot_size,noise):
    """Draw a bootstrap sample (rows sampled with replacement) from ``x``.

    Parameters
    ----------
    x : array-like
        Data points, one per row.
    boot_size : int
        Number of rows to draw. ``0`` means "no resampling": ``x`` itself is
        returned unchanged.
    noise : bool
        If true, each drawn row is shifted by one scalar offset drawn
        uniformly from [-0.05, 0.05); the same offset is added to every
        feature of that row.

    Returns
    -------
    numpy.ndarray
        Array with ``boot_size`` rows (or ``x`` itself when ``boot_size`` is 0).

    Raises
    ------
    ValueError
        If ``x`` has no rows and ``boot_size`` is non-zero.
    """
    if boot_size==0:
        return x

    x=np.asarray(x)
    n_rows=x.shape[0]
    if n_rows==0:
        raise ValueError("cannot bootstrap from an empty array")

    # randint's upper bound is exclusive, so pass n_rows (not n_rows-1):
    # otherwise the last row could never be drawn and a single-row input
    # would raise.
    s=np.random.randint(0,n_rows,boot_size)
    w=x[s]
    if noise:
        # One offset per sampled row, shaped so it broadcasts across the
        # remaining axes of that row.
        offsets=np.random.uniform(-0.05,0.05,size=(s.shape[0],)+(1,)*(x.ndim-1))
        w=w+offsets
    return w

In [None]:

from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix


# Experiment: enlarge the training set with noisy bootstrap copies of the
# class-1 training rows, refit every model in `models`, and report accuracy and
# recall on the untouched test set, averaged over all evaluation runs.
b_ord_of_mag=[10000,100000]
u_ord_of_mag=[0,500,1000,2000,5000,10000,50000,100000]
boot_size=3000
noise=True


hazards=np.asarray(hazardous)

trials=10
av_acc=0
av_rec=0
n_runs=0
for i in range(trials):
    # Re-seed per trial so each trial draws its own reproducible bootstrap sample.
    np.random.seed(i)
    # Use boot_size here so the value printed below is the value actually used.
    hazards1= bootstrap(hazards,boot_size,noise)
    # The augmented training set does not depend on the model; build it once per trial.
    Xu_train, yu_train= new_data(X_train,y_train,hazards1)
    for name, clf in models.items():
        clf.fit(Xu_train, yu_train)
        score = clf.score(X_test, y_test)
        y_pred=clf.predict(X_test)
        rec_score= recall_score(y_test,y_pred)
        print(score,rec_score,name," boot_size:",boot_size)
        av_acc=av_acc+score
        av_rec=av_rec +rec_score
        n_runs=n_runs+1
# Scores are accumulated once per (trial, model) pair, so divide by the number
# of evaluations rather than by `trials`; guard against an empty model registry.
av_acc=av_acc/max(n_runs,1)
av_rec=av_rec/max(n_runs,1)
print("average accuracy: ",av_acc,"average_recall: ",av_rec)
    

In [None]:
from numpy import linalg as LA


# Compare the covariance structure of the original class-1 rows with that of
# the generated (bootstrapped) rows via an eigen-decomposition.
# NOTE(review): numpy's eig does not guarantee any ordering of the eigenvalues,
# so the slices printed below are "the first returned", not necessarily the largest.
data={'original_data':hazards,'generated data':hazards1}
for key in data:
    value=data[key]
    cov=np.cov(value.T)   # transpose so that columns (features) are the variables
    print(cov.shape)
    w,v=LA.eig(cov)       # w: eigenvalues, v: eigenvectors as columns
    print(key,w[:2])
    print(v[:,:3])

   

In [None]:
from sklearn import decomposition
# 2-D PCA view of the class-0 training rows against the generated class-1 rows.
pca=decomposition.PCA(n_components=2)


data={'0 data ':non_hazardous ,'1 data ': hazards1}
# Fit ONE projection on all points and reuse it for both groups. Fitting a
# separate PCA per group would place each group in its own coordinate system,
# making positions on the shared axes incomparable.
pca.fit(np.concatenate(list(data.values()),axis=0))
for key,value in data.items():
    value_new=pca.transform(value)
    plt.scatter(value_new[:,0],value_new[:,1],label=key)

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()
plt.show()