In [1]:
import numpy as np
import pandas as pd
import copy
import numpy as np
from collections import Counter
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import datetime 

In [2]:
def load_data(set_type, test_size=30):
    if   set_type == 'BreastTissue':
        dataset = np.loadtxt('Datasets\BreastTissue.txt', delimiter = '\t', dtype='str') 
        n = 9
    elif set_type == 'Diabetes':
        dataset = np.loadtxt('Datasets\Diabetes.txt', delimiter = '\t', dtype='str')
        n = 8
    elif set_type == 'Glass':
        dataset = np.loadtxt('Datasets\Glass.txt', delimiter = '\t', dtype='str')
        n = 9
    elif set_type == 'Ionosphere':
        dataset = np.loadtxt('Datasets\Ionosphere.txt', delimiter = ',', dtype='str')
        n = 34
    elif set_type == 'Sonar':
        dataset = np.loadtxt('Datasets\Sonar.txt', delimiter = ',', dtype='str')
        n = 60
    elif set_type == 'Wine':
        dataset = np.loadtxt('Datasets\Wine.txt', delimiter = ', ', dtype='str')
        n = 13
        
    x = dataset[:,:n].astype(np.float)
    y = dataset[:,n]
    
    y_unique = np.unique(y)
    k = len(np.unique(y))
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
    
    m = X_train.shape[0]
    m_test = X_test.shape[0]
    return X_train, y_train, X_test, y_test, y_unique, m, m_test, k



In [3]:
# ONE Nearest Neighbour

def min_euclidean_distance(X, x):
    return np.argmin(np.linalg.norm(X-x, axis=1))
    
def oneNN(X_train, X_test, y_train):
    min_indices = np.array(list(map(lambda x: min_euclidean_distance(X_train,x) , X_test)))
    return y_train[min_indices]


datasets = ['BreastTissue','Diabetes','Glass','Ionosphere','Sonar','Wine']

for d in datasets:
    date1 = datetime.datetime.now()
    pred_list = []
    for i in range(10):
        X_train, y_train, X_test, y_test, _, _,_,_ = load_data(d, test_size=30)
        predicted = oneNN(X_train, X_test, y_train)
        accuracy = accuracy_score(y_test, predicted)
        pred_list.append(accuracy)
    date2 = datetime.datetime.now()
    print('Total run time of 10 individual runs for ',d,' dataset = ', (date2-date1).total_seconds(), ' seconds')
    print('Mean accuracy of 10 individual runs for ',d,' dataset = ',np.mean(pred_list))
    print('___________________________________________________________________________________________________')

Total run time of 10 individual runs for  BreastTissue  dataset =  0.032002  seconds
Mean accuracy of 10 individual runs for  BreastTissue  dataset =  0.5533333333333333
___________________________________________________________________________________________________
Total run time of 10 individual runs for  Diabetes  dataset =  0.247014  seconds
Mean accuracy of 10 individual runs for  Diabetes  dataset =  0.6933333333333332
___________________________________________________________________________________________________
Total run time of 10 individual runs for  Glass  dataset =  0.056003  seconds
Mean accuracy of 10 individual runs for  Glass  dataset =  0.7333333333333332
___________________________________________________________________________________________________
Total run time of 10 individual runs for  Ionosphere  dataset =  0.16301  seconds
Mean accuracy of 10 individual runs for  Ionosphere  dataset =  0.8400000000000001
_______________________________________________

In [4]:
def pairwise_euclidean_distance(X, Y, squared=False):
    XX = np.dot(X,X.T).diagonal()[:, np.newaxis]
    YY = np.dot(Y,Y.T).diagonal()[np.newaxis, :]
    distances = (-2 * np.dot(X,Y.T)) + XX + YY
    np.maximum(distances, 0, out=distances)
    if X is Y:
        np.fill_diagonal(distances, 0)
    return distances if squared else np.sqrt(distances, out=distances)

def pairwise_kernelized_euclidean_distance(X, Y, kernel, squared=False):
    XX = kernel(X,X).diagonal()[:, np.newaxis]
    YY = kernel(Y,Y).diagonal()[np.newaxis, :]
    distances = (-2 * kernel(X,Y)) + XX + YY
    np.maximum(distances, 0, out=distances)
    if X is Y:
        np.fill_diagonal(distances, 0)
    return distances if squared else np.sqrt(distances, out=distances)


SIGMA = 0.1

def linear_kernel(X, Y):
    return X@Y.T

def rbf_kernel(X, Y):
    K = pairwise_euclidean_distance(X, Y, squared=True)
    gamma = -1/(SIGMA**2)
#     gamma = 1.0 / X.shape[1]
    K *= gamma
    np.exp(K, K)
    return K

def first_polynomial_kernel(X, Y):
    gamma = 1.0 / X.shape[1]
    coef0 = 1
    degree = 1
    K = X@Y.T
    K *= gamma
    K += coef0
    K **= degree
    return K
def second_polynomial_kernel(X, Y):
    gamma = 1.0 / X.shape[1]
    coef0 = 1
    degree = 2
    K = X@Y.T
    K *= gamma
    K += coef0
    K **= degree
    return K
def third_polynomial_kernel(X, Y):
    gamma = 1.0 / X.shape[1]
    coef0 = 1
    degree = 2
    K = X@Y.T
    K *= gamma
    K += coef0
    K **= degree
    return K



In [5]:
import sys


print('___________________________________________________________________________________________________')
print('###################################################################################################')
print('###################################################################################################')
print('####################################    ',end='')
sys.stdout.write( "\033[;1m" + "\033[0;32m")
print('Linear Kernel',end='')
sys.stdout.write("\033[0;0m")
print('    ##########################################')
print('###################################################################################################')
print('###################################################################################################')
print('\n\n\n\n')


datasets = ['BreastTissue','Diabetes','Glass','Ionosphere','Sonar','Wine']
for d in datasets:
    date1 = datetime.datetime.now()
    pred_list = []
    for i in range(10):
        X_train, y_train, X_test, y_test, _, _,_,_ = load_data(d, test_size=30)
        distsances = pairwise_kernelized_euclidean_distance(X_train, X_test, linear_kernel, squared=False)
        nearest_neibours = np.argmin(distsances, axis=0)
        predicted = y_train[nearest_neibours]
        accuracy = accuracy_score(y_test, predicted)
        pred_list.append(accuracy)
    date2 = datetime.datetime.now()
    print('Total run time of 10 individual runs for ',d,' dataset = ', (date2-date1).total_seconds(), ' seconds')
    print('Mean accuracy of 10 individual runs for ',d,' dataset = ',np.mean(pred_list))
    print('___________________________________________________________________________________________________')

___________________________________________________________________________________________________
###################################################################################################
###################################################################################################
####################################    [;1m[0;32mLinear Kernel[0;0m    ##########################################
###################################################################################################
###################################################################################################





Total run time of 10 individual runs for  BreastTissue  dataset =  0.047003  seconds
Mean accuracy of 10 individual runs for  BreastTissue  dataset =  0.5466666666666666
___________________________________________________________________________________________________
Total run time of 10 individual runs for  Diabetes  dataset =  0.17401  seconds
Mean accuracy of 10 individ

In [6]:
import sys


print('___________________________________________________________________________________________________')
print('###################################################################################################')
print('###################################################################################################')
print('####################################    ',end='')
sys.stdout.write( "\033[;1m" + "\033[1;34m")
print('RBF Kernel',end='')
sys.stdout.write("\033[0;0m")
print('    #############################################')
print('###################################################################################################')
print('###################################################################################################')
print('\n\n\n\n')


datasets = [('BreastTissue',1),('Diabetes',0.1),('Glass',1),('Ionosphere',1),('Sonar',1),('Wine',1)]
for d,s in datasets:
    date1 = datetime.datetime.now()
    SIGMA = s
    pred_list = []
    for i in range(10):
        X_train, y_train, X_test, y_test, _, _,_,_ = load_data(d, test_size=30)
        distsances = pairwise_kernelized_euclidean_distance(X_train, X_test, rbf_kernel, squared=False)
        nearest_neibours = np.argmin(distsances, axis=0)
        predicted = y_train[nearest_neibours]
        accuracy = accuracy_score(y_test, predicted)
        pred_list.append(accuracy)
    date2 = datetime.datetime.now()
    print('Total run time of 10 individual runs for ',d,' dataset = ', (date2-date1).total_seconds(), ' seconds')
    print('Mean accuracy of 10 individual runs for ',d,' dataset = ',np.mean(pred_list))
    print('___________________________________________________________________________________________________')

___________________________________________________________________________________________________
###################################################################################################
###################################################################################################
####################################    [;1m[1;34mRBF Kernel[0;0m    #############################################
###################################################################################################
###################################################################################################





Total run time of 10 individual runs for  BreastTissue  dataset =  0.038002  seconds
Mean accuracy of 10 individual runs for  BreastTissue  dataset =  0.16333333333333336
___________________________________________________________________________________________________
Total run time of 10 individual runs for  Diabetes  dataset =  0.518029  seconds
Mean accuracy of 10 indiv

In [7]:
import sys


print('___________________________________________________________________________________________________')
print('###################################################################################################')
print('###################################################################################################')
print('####################################    ',end='')
sys.stdout.write( "\033[;1m" + "\033[0;32m")
print('Polynomial (𝑑 = 1) Kernel',end='')
sys.stdout.write("\033[0;0m")
print('    ##############################')
print('###################################################################################################')
print('###################################################################################################')
print('\n\n\n\n')


datasets = ['BreastTissue','Diabetes','Glass','Ionosphere','Sonar','Wine']
for d in datasets:
    date1 = datetime.datetime.now()
    pred_list = []
    for i in range(10):
        X_train, y_train, X_test, y_test, _, _,_,_ = load_data(d, test_size=30)
        distsances = pairwise_kernelized_euclidean_distance(X_train, X_test, first_polynomial_kernel, squared=False)
        nearest_neibours = np.argmin(distsances, axis=0)
        predicted = y_train[nearest_neibours]
        accuracy = accuracy_score(y_test, predicted)
        pred_list.append(accuracy)
    date2 = datetime.datetime.now()
    print('Total run time of 10 individual runs for ',d,' dataset = ', (date2-date1).total_seconds(), ' seconds')
    print('Mean accuracy of 10 individual runs for ',d,' dataset = ',np.mean(pred_list))
    print('___________________________________________________________________________________________________')

___________________________________________________________________________________________________
###################################################################################################
###################################################################################################
####################################    [;1m[0;32mPolynomial (𝑑 = 1) Kernel[0;0m    ##############################
###################################################################################################
###################################################################################################





Total run time of 10 individual runs for  BreastTissue  dataset =  0.042002  seconds
Mean accuracy of 10 individual runs for  BreastTissue  dataset =  0.5333333333333333
___________________________________________________________________________________________________
Total run time of 10 individual runs for  Diabetes  dataset =  0.17801  seconds
Mean accuracy of 10 individ

In [8]:
import sys


print('___________________________________________________________________________________________________')
print('###################################################################################################')
print('###################################################################################################')
print('####################################    ',end='')
sys.stdout.write( "\033[;1m" + "\033[1;34m")
print('Polynomial (𝑑 = 2) Kernel',end='')
sys.stdout.write("\033[0;0m")
print('    ##############################')
print('###################################################################################################')
print('###################################################################################################')
print('\n\n\n\n')


datasets = ['BreastTissue','Diabetes','Glass','Ionosphere','Sonar','Wine']
for d in datasets:
    date1 = datetime.datetime.now()
    pred_list = []
    for i in range(10):
        X_train, y_train, X_test, y_test, _, _,_,_ = load_data(d, test_size=30)
        distsances = pairwise_kernelized_euclidean_distance(X_train, X_test, second_polynomial_kernel, squared=False)
        nearest_neibours = np.argmin(distsances, axis=0)
        predicted = y_train[nearest_neibours]
        accuracy = accuracy_score(y_test, predicted)
        pred_list.append(accuracy)
    date2 = datetime.datetime.now()
    print('Total run time of 10 individual runs for ',d,' dataset = ', (date2-date1).total_seconds(), ' seconds')
    print('Mean accuracy of 10 individual runs for ',d,' dataset = ',np.mean(pred_list))
    print('___________________________________________________________________________________________________')

___________________________________________________________________________________________________
###################################################################################################
###################################################################################################
####################################    [;1m[1;34mPolynomial (𝑑 = 2) Kernel[0;0m    ##############################
###################################################################################################
###################################################################################################





Total run time of 10 individual runs for  BreastTissue  dataset =  0.035002  seconds
Mean accuracy of 10 individual runs for  BreastTissue  dataset =  0.5166666666666666
___________________________________________________________________________________________________
Total run time of 10 individual runs for  Diabetes  dataset =  0.154009  seconds
Mean accuracy of 10 indivi

In [9]:
import sys


print('___________________________________________________________________________________________________')
print('###################################################################################################')
print('###################################################################################################')
print('####################################    ',end='')
sys.stdout.write( "\033[;1m" + "\033[0;32m")
print('Polynomial (𝑑 = 3) Kernel',end='')
sys.stdout.write("\033[0;0m")
print('    ##############################')
print('###################################################################################################')
print('###################################################################################################')
print('\n\n\n\n')


datasets = ['BreastTissue','Diabetes','Glass','Ionosphere','Sonar','Wine']
for d in datasets:
    date1 = datetime.datetime.now()
    pred_list = []
    for i in range(10):
        X_train, y_train, X_test, y_test, _, _,_,_ = load_data(d, test_size=30)
        distsances = pairwise_kernelized_euclidean_distance(X_train, X_test, third_polynomial_kernel, squared=False)
        nearest_neibours = np.argmin(distsances, axis=0)
        predicted = y_train[nearest_neibours]
        accuracy = accuracy_score(y_test, predicted)
        pred_list.append(accuracy)
    date2 = datetime.datetime.now()
    print('Total run time of 10 individual runs for ',d,' dataset = ', (date2-date1).total_seconds(), ' seconds')
    print('Mean accuracy of 10 individual runs for ',d,' dataset = ',np.mean(pred_list))
    print('___________________________________________________________________________________________________')

___________________________________________________________________________________________________
###################################################################################################
###################################################################################################
####################################    [;1m[0;32mPolynomial (𝑑 = 3) Kernel[0;0m    ##############################
###################################################################################################
###################################################################################################





Total run time of 10 individual runs for  BreastTissue  dataset =  0.039002  seconds
Mean accuracy of 10 individual runs for  BreastTissue  dataset =  0.5366666666666667
___________________________________________________________________________________________________
Total run time of 10 individual runs for  Diabetes  dataset =  0.17901  seconds
Mean accuracy of 10 individ