In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import StratifiedKFold as KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn import datasets

# File names of the raw datasets; the loading cells prefix them with 'data/'
# and pick one by position, so the order of this list matters.
datasetList = [
    'abalone.data',
    'balance-scale.data',
    'transfusion.data',
    'australian.dat',
    'car.data',
    'breast-cancer-wisconsin.data',
    'pop_failures.dat',
    'german.data',
]

# Fixed k values kept as a notebook-level constant.
const_ks = [1, 5, 10, 20]

# Dataset

### Abalone

In [None]:
# Abalone: the last column is the target, the first feature column is one-hot
# encoded and the remaining feature columns are normalized.
dtLabel     = datasetList[0]
dtName      = 'data/' + dtLabel

df          = pd.read_csv(dtName, header=None)
X           = df.iloc[:, :-1].values
y           = df.iloc[:, -1].values

# One-hot encode column 0 (kept 2-D so the encoder sees a single feature).
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, [0]])
one_hot_arr = one_hot.toarray()

# Final feature matrix: encoded column first, normalized columns after it.
X           = np.hstack((one_hot_arr, normalize(X[:, 1:])))

# np.corrcoef treats every row as a variable, so this is a
# sample-by-sample correlation matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### Transfusion

In [None]:
# Transfusion: the file is read with pandas' default header handling; the last
# column is the target and every other column is a feature.
dtLabel     = datasetList[2]
dtName      = 'data/' + dtLabel

df          = pd.read_csv(dtName)
y           = df.iloc[:, -1].values
X           = normalize(df.iloc[:, :-1].values)

# Rows are the variables for np.corrcoef: sample-by-sample correlation.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### Australian

In [None]:
# Australian: whitespace-separated file without a header row. The last column
# is the target, column 0 is one-hot encoded and the remaining feature columns
# are normalized.
dtName      = 'data/' + datasetList[3]
dtLabel     = datasetList[3]

# `sep=r'\s+'` replaces `delim_whitespace=True`, which pandas deprecated in 2.2.
df          = pd.read_csv(dtName, header=None, sep=r'\s+')
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])   # keep the column 2-D for the encoder
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)

# np.corrcoef treats rows as variables: sample-by-sample correlation matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### Car

In [None]:
# Car: every column is label-encoded to integers, then the first feature column
# is one-hot encoded and the remaining feature columns are normalized.
dtLabel     = datasetList[4]
dtName      = 'data/' + dtLabel
df          = pd.read_csv(dtName, header=None)

# One LabelEncoder instance is refitted per column; after the loop it holds the
# mapping of the last column only.
number      = LabelEncoder()
for col in df.columns:
    df[col] = number.fit_transform(df[col].astype('str'))

X           = df.iloc[:, :-1].values
y           = df.iloc[:, -1].values

enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, [0]])
one_hot_arr = one_hot.toarray()
X           = np.hstack((one_hot_arr, normalize(X[:, 1:])))

# Sample-by-sample correlation (np.corrcoef uses rows as variables).
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### BreastOri

In [None]:
# Breast Cancer Wisconsin (original): rows containing the '?' marker are
# dropped, then the remaining features are cast to float and normalized.
dtName      = 'data/' + datasetList[5]
dtLabel     = datasetList[5]

# NOTE(review): this file is read with pandas' default header inference while
# several other raw files in this notebook pass header=None -- confirm the file
# really has a header row, otherwise its first record becomes the column names.
df          = pd.read_csv(dtName)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values

# Build the row mask once, before X is filtered, and apply it to both arrays.
complete    = np.all(X != '?', axis = 1)
# The builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
X, y        = X[complete].astype(float), y[complete]
X           = normalize(X)

# np.corrcoef treats rows as variables: sample-by-sample correlation matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()

### Climate

In [None]:
# Climate: whitespace-separated file read with pandas' default header handling.
# The last column is the target; all other columns are normalized features.
dtName      = 'data/' + datasetList[6]
dtLabel     = datasetList[6]

# `sep=r'\s+'` replaces `delim_whitespace=True`, which pandas deprecated in 2.2.
df          = pd.read_csv(dtName, sep=r'\s+')
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
X           = normalize(X)

# np.corrcoef treats rows as variables: sample-by-sample correlation matrix.
corr_x      = np.corrcoef(X)
print(X.shape)
print(corr_x)
df.head()


### German

In [None]:
# German: whitespace-separated file without a header row. Every column is
# label-encoded to integers, then column 0 is one-hot encoded and the remaining
# feature columns are normalized.
dtName      = 'data/' + datasetList[7]
dtLabel     = datasetList[7]

# `sep=r'\s+'` replaces `delim_whitespace=True`, which pandas deprecated in 2.2.
df          = pd.read_csv(dtName, header=None, sep=r'\s+')
number      = LabelEncoder()

# The same encoder object is refitted for each column in turn.
for i in range(len(df.columns)):
    df[i]   = number.fit_transform(df[i].astype('str'))

X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])   # keep the column 2-D for the encoder
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)

# np.corrcoef treats rows as variables: sample-by-sample correlation matrix.
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

### Wine

In [None]:
# Wine dataset shipped with scikit-learn; features are normalized before the
# sample-by-sample correlation matrix is computed.
dtLabel = "wine"
dataset = datasets.load_wine()
y       = dataset.target
X       = normalize(dataset.data)
corr_x  = np.corrcoef(X)
print(corr_x)

### breast_cancer

In [2]:
# Breast-cancer dataset shipped with scikit-learn; features are normalized
# before the sample-by-sample correlation matrix is computed.
dtLabel = "breast_cancer"
dataset = datasets.load_breast_cancer()
y       = dataset.target
X       = normalize(dataset.data)
corr_x  = np.corrcoef(X)
print(corr_x)

[[1.         0.9892786  0.98703483 ... 0.97919942 0.98785229 0.97683484]
 [0.9892786  1.         0.999713   ... 0.99774113 0.99972889 0.98469916]
 [0.98703483 0.999713   1.         ... 0.99878867 0.99988251 0.98591229]
 ...
 [0.97919942 0.99774113 0.99878867 ... 1.         0.99875905 0.98800384]
 [0.98785229 0.99972889 0.99988251 ... 0.99875905 1.         0.98765632]
 [0.97683484 0.98469916 0.98591229 ... 0.98800384 0.98765632 1.        ]]


### Iris

In [None]:
# Iris dataset shipped with scikit-learn; features are normalized before the
# sample-by-sample correlation matrix is computed.
dtLabel = "iris"
iris    = load_iris()
y       = iris.target
X       = normalize(iris.data)
corr_x  = np.corrcoef(X)
print(corr_x)

# K*Tree

In [3]:
def getKS(X_train, y_train, tree_clf=None):
    """Build one local training batch per leaf of the fitted k-predicting tree.

    For every leaf that receives training samples, the batch contains:
    the samples of that leaf, their k nearest training neighbours (k is the
    tree's prediction for the first sample of the leaf), and the single
    nearest training neighbour of each of those neighbours. Duplicate rows
    are removed.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, n_features)
        Training features.
    y_train : ndarray of shape (n_samples,)
        Training labels; must be 1-D because they are reshaped to a column.
    tree_clf : fitted estimator with ``apply`` and ``predict``, optional
        Tree that maps a sample to a leaf and to a k value. When omitted, the
        notebook-level ``clf`` is used, which keeps existing two-argument
        calls working.

    Returns
    -------
    ktree_s : list of ndarray
        One array per leaf; each row is a sample with its features followed by
        its label in the last column.
    leaf_ids : ndarray
        Sorted unique leaf ids; ``ktree_s[i]`` belongs to ``leaf_ids[i]``.
    """
    if tree_clf is None:
        tree_clf = clf  # legacy behaviour: rely on the notebook-level tree

    feat = X_train.shape[1]
    neigh = KNeighborsClassifier(1)
    neigh.fit(X_train, y_train)
    leafs = tree_clf.apply(X_train)
    k_values = tree_clf.predict(X_train)
    leaf_ids = np.unique(leafs)  # computed once, reused for the return value

    ktree_s = []
    for leaf in leaf_ids:
        in_leaf       = leafs == leaf
        # k of the leaf, taken from its first training sample.
        tree_node_k   = int(k_values[in_leaf][0])
        tree_node_e   = X_train[in_leaf]
        tree_node_e_y = y_train[in_leaf][:, None]

        # k nearest training neighbours of every sample in the leaf,
        # flattened to one row per neighbour.
        NN_idx  = neigh.kneighbors(tree_node_e, tree_node_k, return_distance=False)
        NN      = X_train[NN_idx].reshape(-1, feat)
        NN_y    = y_train[NN_idx].reshape(-1, 1)

        # Nearest training neighbour of each of those neighbours.
        NNN_idx = neigh.kneighbors(NN, 1, return_distance=False)
        NNN     = X_train[NNN_idx][:, 0]
        NNN_y   = y_train[NNN_idx]

        batch_x = np.concatenate((tree_node_e, NN, NNN))
        batch_y = np.concatenate((tree_node_e_y, NN_y, NNN_y))
        # Append the label as the last column and drop duplicated rows.
        batch   = np.unique(np.concatenate((batch_x, batch_y), axis=1), axis=0)

        ktree_s.append(batch)
    return ktree_s, leaf_ids
    

In [7]:
# K*Tree evaluation with stratified k-fold cross-validation.
# Per fold: (1) derive a target k for each training sample from the
# sample-by-sample correlation matrix, (2) fit a decision tree that predicts
# that k, (3) build one local training batch per tree leaf with getKS, and
# (4) classify each test sample with a KNN fitted on the batch of its leaf,
# using the k the tree predicts for it.
print(dtLabel)
clf = tree.DecisionTreeClassifier()
n_splits   = 10
th_value   = 0.9999   # fraction of the largest correlation used as threshold

acc        = []
# NOTE(review): the next three lists are initialised but never filled here.
acurancias = []
train_time = []
test_time  = []
kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    start_time = time.time()
    # Target k of a training sample: how many training samples have a
    # correlation with it above th_value times the largest coefficient.
    corr_x_train    = np.corrcoef(X_train)
    corr_y_train    = (corr_x_train > (corr_x_train.max() * th_value)).sum(axis=0)
    clf.fit(X_train, corr_y_train)

    # getKS reads the notebook-level `clf` fitted just above.
    ktree_batch, leafs = getKS(X_train, y_train)
    # Explicit leaf -> batch mapping: an unknown leaf raises KeyError instead
    # of silently selecting the first batch.
    leaf_to_batch = dict(zip(np.asarray(leafs).tolist(), ktree_batch))

    corr_y_test     = clf.predict(X_test)
    leaf_test       = clf.apply(X_test)

    # Fit one KNN per leaf and reuse it for every test sample of that leaf,
    # without overwriting the fold-level X_train / y_train.
    leaf_models = {}
    sum_acc     = 0
    for idx, leaf in enumerate(leaf_test.tolist()):
        if leaf not in leaf_models:
            batch = leaf_to_batch[leaf]
            neigh = KNeighborsClassifier(n_neighbors=1)
            neigh.fit(batch[:, :-1], batch[:, -1])
            leaf_models[leaf] = (neigh, len(batch))
        neigh, batch_size = leaf_models[leaf]

        # k cannot exceed the number of samples in the leaf batch.
        n_neighbors = int(min(corr_y_test[idx], batch_size))
        neigh.set_params(n_neighbors=n_neighbors)

        pred = neigh.predict(X_test[idx, None])
        # Scalar hit count, so `acc` ends up 1-D.
        sum_acc += int(pred[0] == y_test[idx])
    acc.append(sum_acc / len(corr_y_test))
acc        = np.array(acc)

print(f"K*Tree, {acc.mean():0.4f} +/- {acc.std():0.4f}, TH: {th_value}")


breast_cancer
K*Tree, 0.0000 +/- 0.0000, TH: 0.9999
