In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import StratifiedKFold as KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree


# Data file names; each dataset cell below prepends 'data/' and picks one by index.
datasetList = [
    'abalone.data',
    'balance-scale.data',
    'transfusion.data',
    'australian.dat',
    'car.data',
    'breast-cancer-wisconsin.data',
    'pop_failures.dat',
    'german.data',
]
# Neighbourhood sizes tried by the kNN baseline loop.
const_ks    = [
    1, 2, 3, 4, 5,
    10, 20, 30, 50,
    100, 255, 400,
]

# Dataset

### Abalone

In [None]:
# Abalone: read without a header row. Column 0 is one-hot encoded, the other
# feature columns go through sklearn's `normalize`, and the last column of the
# file is used as the target.
dtLabel     = datasetList[0]
dtName      = 'data/' + dtLabel

df          = pd.read_csv(dtName, header=None)
X           = df.iloc[:, :-1].values
y           = df.iloc[:, -1].values

enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, [0]])  # keep the column 2-D for the encoder
one_hot_arr = one_hot.toarray()
X           = np.hstack((one_hot_arr, normalize(X[:, 1:])))

# np.corrcoef treats every row as a variable, so this is a sample-by-sample matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### Transfusion

In [None]:
# Transfusion: read with pandas' default header handling; all feature columns
# are passed to sklearn's `normalize` and the last column is the target.
dtLabel     = datasetList[2]
dtName      = 'data/' + dtLabel

df          = pd.read_csv(dtName)
y           = df.iloc[:, -1].values
X           = normalize(df.iloc[:, :-1].values)

# np.corrcoef treats every row as a variable, so this is a sample-by-sample matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### Australian

In [None]:
# Australian: whitespace-separated file read without a header row. Column 0 is
# one-hot encoded, the other feature columns go through sklearn's `normalize`,
# and the last column is the target.
dtName      = 'data/' + datasetList[3]
dtLabel     = datasetList[3]

# sep=r'\s+' replaces delim_whitespace=True, which pandas deprecated in 2.2;
# both split fields on runs of whitespace.
df          = pd.read_csv(dtName, header=None, sep=r'\s+')
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])  # None keeps the column 2-D
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)
# np.corrcoef treats every row as a variable, so this is a sample-by-sample matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### Car

In [None]:
# Car: read without a header row. Every column is label-encoded in place
# (values are cast to str first), then column 0 is one-hot encoded and the
# other feature columns go through sklearn's `normalize`.
dtLabel     = datasetList[4]
dtName      = 'data/' + dtLabel
df          = pd.read_csv(dtName, header=None)
number      = LabelEncoder()

# With header=None the column labels are the integer positions.
for col in df.columns:
    df[col] = number.fit_transform(df[col].astype('str'))

X           = df.iloc[:, :-1].values
y           = df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, [0]])  # keep the column 2-D for the encoder
one_hot_arr = one_hot.toarray()
X           = np.hstack((one_hot_arr, normalize(X[:, 1:])))

# np.corrcoef treats every row as a variable, so this is a sample-by-sample matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### BreastOri

In [None]:
# Breast cancer (original): rows containing the '?' placeholder are dropped,
# the remaining features are cast to float and passed to sklearn's `normalize`.
dtName      = 'data/' + datasetList[5]
dtLabel     = datasetList[5]

# NOTE(review): unlike the other header-less loaders, read_csv is called here
# without header=None, so the first line of the file becomes the column names.
# Confirm the file on disk really starts with a header row.
df          = pd.read_csv(dtName)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values

# Build the "no missing value" row mask once and apply it to both X and y.
complete_rows = np.all(X != '?', axis=1)
y           = y[complete_rows]
# The np.float alias was removed in NumPy 1.24; the builtin float gives the
# same float64 dtype on every NumPy version.
X           = X[complete_rows].astype(float)
X           = normalize(X)
# np.corrcoef treats every row as a variable, so this is a sample-by-sample matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### Climate

In [None]:
# Climate (pop_failures): whitespace-separated file read with pandas' default
# header handling; all feature columns go through sklearn's `normalize` and the
# last column is the target.
dtName      = 'data/' + datasetList[6]
dtLabel     = datasetList[6]

# sep=r'\s+' replaces delim_whitespace=True, which pandas deprecated in 2.2;
# both split fields on runs of whitespace.
df          = pd.read_csv(dtName, sep=r'\s+')
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
X           = normalize(X)
# np.corrcoef treats every row as a variable, so this is a sample-by-sample matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()


### German

In [None]:
# German: whitespace-separated file read without a header row. Every column is
# label-encoded in place (values are cast to str first), then column 0 is
# one-hot encoded and the other feature columns go through sklearn's `normalize`.
dtName      = 'data/' + datasetList[7]
dtLabel     = datasetList[7]

# sep=r'\s+' replaces delim_whitespace=True, which pandas deprecated in 2.2;
# both split fields on runs of whitespace.
df          = pd.read_csv(dtName, header=None, sep=r'\s+')
number      = LabelEncoder()

for i in range(len(df.columns)):
    df[i]   = number.fit_transform(df[i].astype('str'))

X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])  # None keeps the column 2-D
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)
# np.corrcoef treats every row as a variable, so this is a sample-by-sample matrix.
corr_x      = np.corrcoef(X)
# Shape report added so this loader prints the same summary as the other dataset cells.
print(X.shape)
print(corr_x)
df.head()

# KNN

In [None]:
# ---------------------------------------------------------------------------
# kNN baseline: stratified 5-fold cross-validated accuracy for every k in
# const_ks, evaluated on whichever dataset cell was executed last (X, y, dtLabel).
# ---------------------------------------------------------------------------
print(dtLabel.split(".")[0])
print("K, Acurácia, Obs.")

n_splits = 5
# Seeded and built once: every k (and the KTree run below) sees the same folds.
kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

for k in const_ks:
    acc        = []
    train_time = []
    test_time  = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        neigh = KNeighborsClassifier(n_neighbors=k)

        # Train
        start_time = time.time()
        neigh.fit(X_train, y_train)
        train_time.append(time.time() - start_time)

        # Test
        start_time = time.time()
        pred = neigh.predict(X_test)
        test_time.append(time.time() - start_time)

        acc.append((pred == y_test).sum() / pred.shape[0])

    acc = np.array(acc)
    print(f"{k:5}, {acc.mean():0.4f} +/- {acc.std():0.4f}")


# ---------------------------------------------------------------------------
# KTree: a decision tree learns a per-sample k, which then drives the kNN vote.
# The training target for the tree is, for each training sample, the number of
# training samples whose correlation with it exceeds th_value * max correlation.
# ---------------------------------------------------------------------------
th_value = 0.999
# Seeded like the folds so repeated runs of the cell report the same accuracy.
clf = tree.DecisionTreeClassifier(random_state=42)

acc        = []
train_time = []
test_time  = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train: per-sample k labels, the tree that predicts them, and the kNN index.
    start_time = time.time()
    corr_x_train = np.corrcoef(X_train)
    corr_y_train = (corr_x_train > (corr_x_train.max() * th_value)).sum(axis=0)
    clf.fit(X_train, corr_y_train)
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(X_train, y_train)
    train_time.append(time.time() - start_time)

    # k predicted by the tree for every test sample
    corr_y_test = clf.predict(X_test)

    sum_acc = 0
    # Test samples sharing the same predicted k are classified together.
    for k in np.unique(corr_y_test):
        same_k = corr_y_test == k

        # set_params only changes the vote size; the fitted index is reused.
        neigh.set_params(n_neighbors=int(k))

        # Test
        start_time = time.time()
        pred = neigh.predict(X_test[same_k])
        test_time.append(time.time() - start_time)

        sum_acc += (pred == y_test[same_k]).sum()

    acc.append(sum_acc / len(corr_y_test))
acc = np.array(acc)
print(f"KTree, {acc.mean():0.4f} +/- {acc.std():0.4f}, TH: {th_value}")

## K*Tree Helper Functions

In [None]:
def kstartleafGen(A,classifier,dataset):
    """Create the leaves of a k*tree for the sample ``A``.

    ``classifier.kneighbors`` is called once with ``[A]`` and once per
    neighbour it reports. From each result, element ``[1][0]`` is read as the
    sequence of row positions in ``dataset``.

    Parameters
    ----------
    A : sequence
        The query sample.
    classifier : object
        Anything exposing ``kneighbors(batch)`` as described above.
    dataset : indexable
        Rows addressed by the positions the classifier returns.

    Returns
    -------
    dict
        Keys are the neighbours of ``A`` converted to tuples (an array cannot
        be a dict key); each value is the list of that neighbour's own
        neighbours, taken from ``dataset``.
    """
    def neighbour_positions(sample):
        # single-sample query -> take the index row of the first (only) query
        return classifier.kneighbors([sample])[1][0]

    return {
        tuple(dataset[pos]): [dataset[inner] for inner in neighbour_positions(dataset[pos])]
        for pos in neighbour_positions(A)
    }


### K*tree

In [None]:
# Demo of kstartleafGen on the first sample of the currently loaded dataset.
#
# The neighbour index is fitted on the full X so that the row positions
# returned by kneighbors address rows of X (the `dataset` argument). The
# leftover `neigh` from the evaluation cell was fitted on the last training
# fold only, so its positions refer to X_train and would select the wrong rows.
kstar_nn = NearestNeighbors(n_neighbors=5).fit(X)
teste = kstartleafGen(X[0], kstar_nn, X)

# X[0] is at distance 0 from itself, so a row equal to it is always among its
# neighbours and this key exists. The previously hard-coded X[9] raised
# KeyError unless row 9 happened to be a neighbour of row 0.
print(teste[tuple(X[0])])

### Graphs

In [None]:
def plot_bar(dataset,acc_knn,acc_ktree,acc_kstar):
    """Draw a three-bar chart comparing kNN, Ktree and K*tree accuracy.

    Parameters
    ----------
    dataset : str
        Name inserted into the figure title.
    acc_knn, acc_ktree, acc_kstar : number
        Bar heights, drawn left to right in that order; each value is also
        written as text just above its bar.
    """
    heights = [acc_knn, acc_ktree, acc_kstar]
    positions = np.arange(3)

    fig, ax = plt.subplots()
    ax.bar(positions, heights, color=['blue', 'orange', 'red'])
    ax.set_xticks(positions)
    ax.set_xticklabels(('Acc. kNN', 'Acc. Ktree', 'Acc. K*tree'))
    ax.set_ylabel('Accuracy')

    # Annotate every bar with its value, nudged slightly left of centre and above the top.
    for pos, height in zip(positions, heights):
        ax.text(pos - 0.05, height + 0.001, str(height))

    ax.set_title('Accuracy comparison on ' + dataset + ' dataset')
    plt.show()
    

In [None]:
# Smoke test of plot_bar with placeholder accuracies.
plot_bar('Abalone', acc_knn=.1, acc_ktree=.2, acc_kstar=.3)

In [None]:
def lines_plot(runtime_kNN, runtime_kTree, runtime_Kstar, iterations):
    """Plot the runtime of kNN, Ktree and K*tree against the iteration index.

    Parameters
    ----------
    runtime_kNN, runtime_kTree, runtime_Kstar : sequence of numbers
        One runtime per iteration for each method.
    iterations : int-like
        Number of iterations; the x axis is ``0 .. int(iterations) - 1``.
    """
    steps = np.arange(int(iterations))
    fig, ax = plt.subplots()

    # (values, matplotlib format string, legend label) for each curve, in draw order
    curves = (
        (runtime_kNN, '--', 'kNN'),
        (runtime_kTree, '-.', 'Ktree'),
        (runtime_Kstar, 'red', 'K*tree'),
    )
    for runtimes, fmt, name in curves:
        ax.plot(steps, runtimes, fmt, label=name)

    ax.legend(loc='lower right')
    ax.set_ylabel('Runtime')
    ax.set_xlabel('Iterations')
    plt.show()

In [None]:
# Smoke test of lines_plot with three placeholder runtimes per method.
lines_plot(
    runtime_kNN=[1., 2., 8.],
    runtime_kTree=[2.2, 3., 1.],
    runtime_Kstar=[3., 4., 3.],
    iterations=3,
)