In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import StratifiedKFold as KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree

# File names of the datasets used below; the dataset cells pick one by index
# and prefix it with 'data/'.
datasetList = [
    'abalone.data',
    'balance-scale.data',
    'transfusion.data',
]

# Dataset
### Balance

In [None]:
# Balance-scale: the file has no header row; column 0 is taken as the target
# and every remaining column as a feature.
dtName = 'data/' + datasetList[1]
df = pd.read_csv(dtName, header=None)

y = df.iloc[:, 0].values
X = normalize(df.iloc[:, 1:].values)  # normalize() is used with its defaults
x_pd = pd.DataFrame(X)

# NOTE(review): np.corrcoef is called with its defaults, which (per the NumPy
# docs) treat each ROW as a variable, so corr_x compares samples rather than
# feature columns -- confirm this is intended.
corr_x = np.corrcoef(X)

print(df.head(5))
print(corr_x)

### Abalone

In [None]:
# Abalone: the file has no header row; the last column is taken as the target.
# Feature column 0 is one-hot encoded, the remaining feature columns are
# passed through normalize(), and the two parts are joined side by side.
dtName = 'data/' + datasetList[0]
df = pd.read_csv(dtName, header=None)

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

enc = OneHotEncoder(handle_unknown='ignore')
one_hot = enc.fit_transform(X[:, :1])  # keep column 0 two-dimensional
one_hot_arr = one_hot.toarray()        # densify so it can be stacked below

X = np.hstack((one_hot_arr, normalize(X[:, 1:])))

# NOTE(review): with np.corrcoef's defaults each ROW is a variable, so corr_x
# compares samples rather than feature columns -- confirm this is intended.
corr_x = np.corrcoef(X)
print(corr_x)
df.head()

### Transfusion

In [101]:
# Transfusion: this file is read with its header row; the last column is
# taken as the target and all preceding columns as features.
dtName = 'data/' + datasetList[2]
df = pd.read_csv(dtName)

y = df.iloc[:, -1].values
X = normalize(df.iloc[:, :-1].values)  # normalize() is used with its defaults

# corr_x is a sample-by-sample matrix here (the next cell indexes column 234).
# NOTE(review): pass rowvar=False if feature-to-feature correlation was meant.
corr_x = np.corrcoef(X)
print(corr_x)
df.head()

[[1.         0.99999966 0.99999966 ... 0.99795436 0.98739345 0.94906298]
 [0.99999966 1.         0.99999998 ... 0.99799809 0.98743441 0.94914984]
 [0.99999966 0.99999998 1.         ... 0.99800393 0.9874638  0.94920777]
 ...
 [0.99795436 0.99799809 0.99800393 ... 1.         0.99329296 0.96313755]
 [0.98739345 0.98743441 0.9874638  ... 0.99329296 1.         0.98696177]
 [0.94906298 0.94914984 0.94920777 ... 0.96313755 0.98696177 1.        ]]


Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [122]:
# Count the entries in column 234 of corr_x that exceed 0.9999.
np.sum(corr_x[:, 234] > 0.9999)

401

## Dataset division: stratified k-fold cross-validation with KNN

In [None]:
# Cross-validated evaluation of a k-nearest-neighbours classifier.
# Operates on whichever X, y were assigned by the most recently executed
# dataset cell (Balance, Abalone and Transfusion all overwrite the same names).
n_splits   = 5    # number of folds
k          = 10   # n_neighbors passed to the classifier
acc        = []   # accuracy of each fold
train_time = []   # seconds spent in fit() for each fold
test_time  = []   # seconds spent in predict() for each fold

# KFold is the notebook's import alias for StratifiedKFold, so the split is
# stratified on y; the fixed random_state keeps the shuffled folds repeatable.
kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    neigh = KNeighborsClassifier(n_neighbors=k)
    #neigh = KNN(K=k, ktype=ktype)

    # Train. perf_counter() is used rather than time.time(): it is the
    # high-resolution clock meant for measuring short intervals, whereas the
    # wall clock can be coarse and may jump if the system time is adjusted.
    start_time = time.perf_counter()
    neigh.fit(X_train, y_train)
    train_time.append(time.perf_counter() - start_time)

    # Test
    start_time = time.perf_counter()
    pred = neigh.predict(X_test)
    test_time.append(time.perf_counter() - start_time)

    # Fraction of test samples whose predicted label matches the true label.
    acc.append((pred == y_test).mean())

acc = np.array(acc)
print(f"Acc: {acc.mean()} +/- {acc.std()}")

## Tree

In [None]:
# Fit a decision tree on the iris data and look up the leaf that a few
# hand-picked samples fall into.
from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()

# random_state is fixed so the fitted tree -- and therefore the leaf ids
# displayed below -- are the same on every run. Without it scikit-learn may
# break ties between equally good splits differently from run to run.
# 42 matches the seed used for the K-fold split elsewhere in this notebook.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(iris.data, iris.target)

# To visualise the fitted tree: tree.plot_tree(clf)

# Leaf index reached by each selected row (row 100 is deliberately kept twice,
# as in the original selection).
clf.apply(iris.data[[1, 100, 100, 60]])