In [None]:
# Created by: Adam Fabo
# Date: 22.5.2022
# Created at HMU Crete
# Class: Neural Networks
# File contains a script to load and plot the data, remove outliers and show the differences (Chapter 2 in the documentation)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import neurolab as nl
import pandas as pd
from sklearn import preprocessing

# Notebook-wide plot defaults: wide, short figures ([8, 2]) rendered at 100 dpi
plt.rcParams.update({
    'figure.figsize': [8, 2],
    'figure.dpi': 100,
})

In [None]:
# Column names, in file order. Data description:
#   1. variance of Wavelet Transformed image (continuous)
#   2. skewness of Wavelet Transformed image (continuous)
#   3. curtosis of Wavelet Transformed image (continuous)
#   4. entropy of image (continuous)
#   5. class (integer)
column_names = ["Variance", "Skewness", "Curtosis", "Entropy", "Class"]

# Load the data; the file is read with header=None, so the columns are named
# by hand right after loading.
data = pd.read_csv(
    'data_banknote_authentication.txt',
    sep=",",
    header=None,
)
data.columns = column_names

# preview the first rows (last expression of the cell, so it is displayed)
data.head()



In [None]:
# visualise the data

# different colors for different graphs
colors = ["b", "g", "r", "c", "m"]

# for every column show the raw samples (left) and their histogram (right)
for counter, col in enumerate(data.columns):
    np_arr = data[col].to_numpy()

    # cycle through the palette so that extra columns cannot cause an IndexError
    color = colors[counter % len(colors)]

    fig, (ax1, ax2) = plt.subplots(1, 2)

    # dotted graph
    ax1.set_title("Graph of " + col)
    ax1.set_xlabel("Sample number")
    ax1.set_ylabel("Sample value")
    ax1.plot(np_arr, ".", alpha=0.5, color=color)

    # histogram
    ax2.set_title("Histogram of " + col)
    ax2.set_xlabel("Values")
    ax2.set_ylabel("Frequency")
    ax2.hist(np_arr, bins=20, alpha=0.5, histtype='bar', ec='black', color=color)

    # tight_layout() is a one-shot adjustment computed at call time, so it has
    # to run after titles and axis labels are set; otherwise they are ignored
    # and the two subplots can overlap.
    fig.tight_layout()

    # plt.savefig('images/' +  col +'.png',bbox_inches='tight')

# Original author's observation: since a banknote can belong to only 2 classes,
# the differences between the classes can be seen at first sight,
# mainly in variance and skewness.

plt.show()

In [None]:
# detection of outliers using the z-score (number of standard deviations from the mean)

# https://www.youtube.com/watch?v=rzR_cKnkD18&ab_channel=KrishNaik
# https://github.com/krishnaik06/Finding-an-Outlier
# https://en.wikipedia.org/wiki/Standard_score

# detects outliers in dataset and returns indexes of the rows of outliers
def detect_outliers(dataset, threshold=3):
    """Return the positions of z-score outliers in a 1-D collection of numbers.

    A value counts as an outlier when the absolute value of its standard
    score, ``(value - mean) / std``, is strictly greater than ``threshold``.
    The standard deviation is computed with ``np.std`` defaults.

    Parameters
    ----------
    dataset : array-like
        One-dimensional numeric values (list, NumPy array, pandas Series, ...).
    threshold : float, optional
        Maximum allowed absolute z-score. Defaults to 3.

    Returns
    -------
    list of int
        Zero-based positions of the outliers within ``dataset``, in ascending
        order. Empty when ``dataset`` is empty or all values are equal.
    """
    values = np.asarray(dataset, dtype=float)

    # Nothing to score; returning early also avoids NumPy's
    # "mean of empty slice" warning.
    if values.size == 0:
        return []

    mean = values.mean()  # get mean
    std = values.std()    # get standard deviation

    # Constant data: the z-score would be 0/0. No value deviates from the
    # mean, so there are no outliers.
    if std == 0:
        return []

    z_scores = (values - mean) / std

    # flatnonzero gives the positions where the condition holds; tolist()
    # converts them to plain Python ints, as callers concatenate the result
    # with ordinary lists.
    return np.flatnonzero(np.abs(z_scores) > threshold).tolist()
    

In [None]:
# finds and removes outliers from dataset
# (safe to re-run: `data` itself is never modified, a new frame is built each time)

# a set collects row positions flagged in any column, without duplicates
outlier_positions = set()

# take columns without class (the last column)
for col in data.columns[:-1]:
    outlier_positions.update(detect_outliers(data[col].to_numpy()))

# sorted so that the result is deterministic between runs
outlier_indexes = sorted(outlier_positions)
print(f"dropping {len(outlier_indexes)} rows, which is {round(len(outlier_indexes) / len(data.index) * 100, 2)}% of original table")

# detect_outliers returns row *positions*, not index labels, so the rows are
# filtered with a positional boolean mask instead of a label-based drop().
# The surviving rows keep their original index labels.
keep_mask = np.ones(len(data), dtype=bool)
keep_mask[np.asarray(outlier_indexes, dtype=int)] = False
data_wo_outliers = data[keep_mask]

In [None]:
# histograms before and after removing outliers

# different colors for different graphs; the original ("old") data is always
# drawn in the same neutral colour
colors = ["b", "g", "r", "c", "m"]
colors_old = ["whitesmoke"] * len(colors)

# TODO: tidy this cell up a bit

# one figure per column, comparing the cleaned ("new") and original ("old") data
for counter, col in enumerate(data.columns):
    new = data_wo_outliers[col].to_numpy()
    old = data[col].to_numpy()

    # cycle through the palettes so that extra columns cannot cause an IndexError
    palette_pos = counter % len(colors)

    # Open the figure at the START of the iteration. Creating it at the end
    # (after plotting) leaves one empty figure behind after the last column.
    fig, ax = plt.subplots()

    ax.set_title("Histogram of " + col)
    ax.set_xlabel("Values")
    ax.set_ylabel("Frequency")

    ax.hist([new, old], bins=20, alpha=0.75, histtype='bar', ec='black',
            color=[colors[palette_pos], colors_old[palette_pos]])
    ax.legend(["new", "old"])

    #fig.savefig('images/histogram_of_' +  col +'.png',bbox_inches='tight')

plt.show()

In [None]:
# 3D scatter plot of three features, coloured by class.
# Original note: this works better in a Python script than in a Jupyter
# notebook (presumably because the plot window is interactive there - TODO confirm).

# the three features shown on the x, y and z axes
x_col, y_col, z_col = "Variance", "Skewness", "Curtosis"

# The notebook-wide default figure size is wide and short, which squashes a
# 3D plot, so this figure gets its own size.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(projection='3d')

ax.scatter(
    data[x_col].to_numpy(),
    data[y_col].to_numpy(),
    data[z_col].to_numpy(),
    c=data["Class"].to_numpy(),
)

# label each axis with the feature it actually shows (instead of placeholders)
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
ax.set_zlabel(z_col)

plt.show()