In [7]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import StratifiedKFold as KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree

datasetList = ['abalone.data', 'balance-scale.data', 'transfusion.data', 'australian.dat', 'car.data','breast-cancer-wisconsin.data','pop_failures.dat','german.data']
const_ks    = [1, 5, 10, 20]

# Dataset
### Balance

In [127]:
dtName      = 'data/' + datasetList[1]
df          = pd.read_csv(dtName, header=None)
X, y        = df.iloc[:,1:].values, df.iloc[:, 0].values
X           = normalize(X)
x_pd        = pd.DataFrame(X)
corr_x      = np.corrcoef(X)

print(df.head(5))
print(corr_x)

   0  1  2  3  4
0  B  1  1  1  1
1  R  1  1  1  2
2  R  1  1  1  3
3  R  1  1  1  4
4  R  1  1  1  5
[[nan nan nan ... nan nan nan]
 [nan  1.  1. ... -1. -1. nan]
 [nan  1.  1. ... -1. -1. nan]
 ...
 [nan -1. -1. ...  1.  1. nan]
 [nan -1. -1. ...  1.  1. nan]
 [nan nan nan ... nan nan nan]]


  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[None, :]


### Abalone

In [128]:
dtName      = 'data/' + datasetList[0]

df          = pd.read_csv(dtName, header=None)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

[[ 1.          0.9648594  -0.09247761 ...  0.96146651 -0.11040126
   0.91951738]
 [ 0.9648594   1.         -0.11612443 ...  0.86190065 -0.17811506
   0.78973203]
 [-0.09247761 -0.11612443  1.         ... -0.1137173   0.9810195
  -0.10408575]
 ...
 [ 0.96146651  0.86190065 -0.1137173  ...  1.         -0.08250344
   0.99014047]
 [-0.11040126 -0.17811506  0.9810195  ... -0.08250344  1.
  -0.04863324]
 [ 0.91951738  0.78973203 -0.10408575 ...  0.99014047 -0.04863324
   1.        ]]


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### Transfusion

In [129]:
dtName      = 'data/' + datasetList[2]

df          = pd.read_csv(dtName)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
X           = normalize(X)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

[[1.         0.99999966 0.99999966 ... 0.99795436 0.98739345 0.94906298]
 [0.99999966 1.         0.99999998 ... 0.99799809 0.98743441 0.94914984]
 [0.99999966 0.99999998 1.         ... 0.99800393 0.9874638  0.94920777]
 ...
 [0.99795436 0.99799809 0.99800393 ... 1.         0.99329296 0.96313755]
 [0.98739345 0.98743441 0.9874638  ... 0.99329296 1.         0.98696177]
 [0.94906298 0.94914984 0.94920777 ... 0.96313755 0.98696177 1.        ]]


Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


### Australian

In [130]:
dtName      = 'data/' + datasetList[3]

df          = pd.read_csv(dtName, header=None,delim_whitespace=True)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

[[ 1.         -0.13768124 -0.12807721 ... -0.14836166 -0.11967189
   0.46265743]
 [-0.13768124  1.          0.99911646 ...  0.99889511  0.99513334
   0.40281011]
 [-0.12807721  0.99911646  1.         ...  0.99634918  0.99125768
   0.41050089]
 ...
 [-0.14836166  0.99889511  0.99634918 ...  1.          0.99711021
   0.39164472]
 [-0.11967189  0.99513334  0.99125768 ...  0.99711021  1.
   0.37115108]
 [ 0.46265743  0.40281011  0.41050089 ...  0.39164472  0.37115108
   1.        ]]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,22.08,11.46,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.0,2,8,4,0.165,0,0,0,0,2,160,1,0
2,0,29.58,1.75,1,4,4,1.25,0,0,0,1,2,280,1,0
3,0,21.67,11.5,1,5,3,0.0,1,1,11,1,2,0,1,1
4,1,20.17,8.17,2,6,4,1.96,1,1,14,0,2,60,159,1


### Car

In [131]:
dtName      = 'data/' + datasetList[4]
df          = pd.read_csv(dtName, header=None)
number      = LabelEncoder()

for i in range(len(df.columns)):
    df[i]   = number.fit_transform(df[i].astype('str'))
    
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

[[ 1.          0.97794009  0.97519736 ... -0.45280778 -0.44902063
  -0.43220777]
 [ 0.97794009  1.          0.90821661 ... -0.45657083 -0.41075502
  -0.48077724]
 [ 0.97519736  0.90821661  1.         ... -0.42724425 -0.46952502
  -0.3585276 ]
 ...
 [-0.45280778 -0.45657083 -0.42724425 ...  1.          0.97666378
   0.97464817]
 [-0.44902063 -0.41075502 -0.46952502 ...  0.97666378  1.
   0.90456968]
 [-0.43220777 -0.48077724 -0.3585276  ...  0.97464817  0.90456968
   1.        ]]


Unnamed: 0,0,1,2,3,4,5,6
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2


### BreastOri

In [132]:
dtName      = 'data/' + datasetList[5]

df          = pd.read_csv(dtName)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
X           = X[ np.all(X != '?', axis = 1)].astype(np.float)
X           = normalize(X)
corr_x      = np.corrcoef(X)
print(corr_x)
print(X)

[[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]
[[1.00000000e+00 4.98531824e-06 3.98825459e-06 ... 2.99119094e-06
  1.99412729e-06 9.97063647e-07]
 [1.00000000e+00 2.95442795e-06 9.84809316e-07 ... 2.95442795e-06
  9.84809316e-07 9.84809316e-07]
 [1.00000000e+00 5.90390218e-06 7.87186958e-06 ... 2.95195109e-06
  6.88788588e-06 9.83983697e-07]
 ...
 [1.00000000e+00 5.62543597e-06 1.12508719e-05 ... 9.00069755e-06
  1.12508719e-05 2.25017439e-06]
 [1.00000000e+00 4.45696853e-06 8.91393705e-06 ... 1.11424213e-05
  6.68545279e-06 1.11424213e-06]
 [1.00000000e+00 4.45696853e-06 8.91393705e-06 ... 1.11424213e-05
  4.45696853e-06 1.11424213e-06]]


### Climate

In [133]:
dtName      = 'data/' + datasetList[6]

df          = pd.read_csv(dtName,delim_whitespace=True)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
X           = normalize(X)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

[[1.         0.31257614 0.25278135 ... 0.29870223 0.29716219 0.29833474]
 [0.31257614 1.         0.82038705 ... 0.77736799 0.78051254 0.7784493 ]
 [0.25278135 0.82038705 1.         ... 0.91147616 0.9123951  0.9112097 ]
 ...
 [0.29870223 0.77736799 0.91147616 ... 1.         0.99994788 0.99995182]
 [0.29716219 0.78051254 0.9123951  ... 0.99994788 1.         0.99994268]
 [0.29833474 0.7784493  0.9112097  ... 0.99995182 0.99994268 1.        ]]


Unnamed: 0,Study,Run,vconst_corr,vconst_2,vconst_3,vconst_4,vconst_5,vconst_7,ah_corr,ah_bolus,...,efficiency_factor,tidal_mix_max,vertical_decay_scale,convect_corr,bckgrnd_vdc1,bckgrnd_vdc_ban,bckgrnd_vdc_eq,bckgrnd_vdc_psim,Prandtl,outcome
0,1,1,0.859036,0.927825,0.252866,0.298838,0.170521,0.735936,0.428325,0.567947,...,0.245675,0.104226,0.869091,0.997518,0.44862,0.307522,0.85831,0.796997,0.869893,0
1,1,2,0.606041,0.457728,0.359448,0.306957,0.843331,0.934851,0.444572,0.828015,...,0.61687,0.975786,0.914344,0.845247,0.864152,0.346713,0.356573,0.438447,0.512256,1
2,1,3,0.9976,0.373238,0.517399,0.504993,0.618903,0.605571,0.746225,0.195928,...,0.679355,0.803413,0.643995,0.718441,0.924775,0.315371,0.250642,0.285636,0.365858,1
3,1,4,0.783408,0.104055,0.197533,0.421837,0.742056,0.490828,0.005525,0.392123,...,0.471463,0.597879,0.761659,0.362751,0.912819,0.977971,0.845921,0.699431,0.475987,1
4,1,5,0.40625,0.513199,0.061812,0.635837,0.844798,0.441502,0.191926,0.487546,...,0.551543,0.743877,0.312349,0.650223,0.522261,0.043545,0.37666,0.280098,0.132283,1


### German

In [134]:
dtName      = 'data/' + datasetList[7]
df          = pd.read_csv(dtName, header=None,delim_whitespace=True)
number      = LabelEncoder()

for i in range(len(df.columns)):
    df[i]   = number.fit_transform(df[i].astype('str'))
    
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

[[1.         0.23901034 0.25908761 ... 0.23990432 0.83783045 0.24103225]
 [0.23901034 1.         0.44385627 ... 0.44903919 0.44485234 0.99996892]
 [0.25908761 0.44385627 1.         ... 0.9986736  0.43901699 0.44369842]
 ...
 [0.23990432 0.44903919 0.9986736  ... 1.         0.44395236 0.44864486]
 [0.83783045 0.44485234 0.43901699 ... 0.44395236 1.         0.44439982]
 [0.24103225 0.99996892 0.44369842 ... 0.44864486 0.44439982 1.        ]]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,27,4,4,50,4,4,3,2,0,...,0,48,2,1,1,2,0,1,0,0
1,1,24,2,4,725,0,2,1,1,0,...,0,3,2,1,0,2,0,0,0,1
2,3,2,4,7,323,0,3,1,2,0,...,0,30,2,1,0,1,1,0,0,0
3,0,21,2,3,846,0,3,1,2,2,...,1,26,2,2,0,2,1,0,0,0
4,0,11,3,0,684,0,2,2,2,0,...,3,34,2,2,1,2,1,0,0,1


### MicePE


In [135]:
dtName      = 'data/' + datasetList[7]
df          = pd.read_csv(dtName, header=None,delim_whitespace=True)
number      = LabelEncoder()
print(df)
for i in range(len(df.columns)):
    if(df[i]):
        df[i]   = number.fit_transform(df[i].astype('str'))
    
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

      0   1    2    3      4    5    6   7    8     9  ...    11  12    13  \
0    A11   6  A34  A43   1169  A65  A75   4  A93  A101 ...  A121  67  A143   
1    A12  48  A32  A43   5951  A61  A73   2  A92  A101 ...  A121  22  A143   
2    A14  12  A34  A46   2096  A61  A74   2  A93  A101 ...  A121  49  A143   
3    A11  42  A32  A42   7882  A61  A74   2  A93  A103 ...  A122  45  A143   
4    A11  24  A33  A40   4870  A61  A73   3  A93  A101 ...  A124  53  A143   
5    A14  36  A32  A46   9055  A65  A73   2  A93  A101 ...  A124  35  A143   
6    A14  24  A32  A42   2835  A63  A75   3  A93  A101 ...  A122  53  A143   
7    A12  36  A32  A41   6948  A61  A73   2  A93  A101 ...  A123  35  A143   
8    A14  12  A32  A43   3059  A64  A74   2  A91  A101 ...  A121  61  A143   
9    A12  30  A34  A40   5234  A61  A71   4  A94  A101 ...  A123  28  A143   
10   A12  12  A32  A40   1295  A61  A72   3  A92  A101 ...  A123  25  A143   
11   A11  48  A32  A49   4308  A61  A72   3  A92  A101 ...  A122

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## Dataset division

In [137]:
for i in range(0, len(const_ks)):
    n_splits   = 5
    k          = const_ks[i]
    print(k)
    acc        = []
    acurancias = []
    train_time = []
    test_time  = []
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

    for train_index, test_index in (kf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        neigh = KNeighborsClassifier(n_neighbors=k)
        #neigh = KNN(K=k, ktype=ktype)

        #Train
        start_time = time.time()
        neigh.fit(X_train, y_train)
        train_time.append( time.time() - start_time )

        #Test
        start_time = time.time()
        pred = neigh.predict(X_test)
        test_time.append( time.time() - start_time )

        acc.append( (pred == y_test).sum() / pred.shape[0] )

    acc = np.array(acc)
    print(f"Acc: {acc.mean()} +/- {acc.std()}")

1
Acc: 0.6759999999999999 +/- 0.030886890422960982
5
Acc: 0.704 +/- 0.02034698994937578
10
Acc: 0.719 +/- 0.01319090595827293
20
Acc: 0.698 +/- 0.018055470085267752


## Tree

In [126]:
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf.fit(iris.data, iris.target)
clf.apply(iris.data[[1,100, 100, 60]])
#tree.plot_tree(clf.fit(iris.data, iris.target)) 

array([ 1, 16, 16,  5])