In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import StratifiedKFold as KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder
datasetList = ['abalone.data', 'balance-scale.data', 'transfusion.data']

# Dataset
### Balance

In [2]:
dtName      = 'data/' + datasetList[1]
df          = pd.read_csv(dtName, header=None)
X, y        = df.iloc[:,1:].values, df.iloc[:, 0].values
X           = normalize(X)
x_pd        = pd.DataFrame(X)
corr_x      = np.corrcoef(X)

print(df.head(5))
print(corr_x)

   0  1  2  3  4
0  B  1  1  1  1
1  R  1  1  1  2
2  R  1  1  1  3
3  R  1  1  1  4
4  R  1  1  1  5
[[nan nan nan ... nan nan nan]
 [nan  1.  1. ... -1. -1. nan]
 [nan  1.  1. ... -1. -1. nan]
 ...
 [nan -1. -1. ...  1.  1. nan]
 [nan -1. -1. ...  1.  1. nan]
 [nan nan nan ... nan nan nan]]


  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[None, :]


### Abalone

In [45]:
dtName      = 'data/' + datasetList[0]

df          = pd.read_csv(dtName, header=None)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, 0, None])
one_hot_arr = one_hot.toarray()
X           = normalize(X[:, 1:])
X           = np.concatenate((one_hot_arr, X), axis=1)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

[[ 1.          0.9648594  -0.09247761 ...  0.96146651 -0.11040126
   0.91951738]
 [ 0.9648594   1.         -0.11612443 ...  0.86190065 -0.17811506
   0.78973203]
 [-0.09247761 -0.11612443  1.         ... -0.1137173   0.9810195
  -0.10408575]
 ...
 [ 0.96146651  0.86190065 -0.1137173  ...  1.         -0.08250344
   0.99014047]
 [-0.11040126 -0.17811506  0.9810195  ... -0.08250344  1.
  -0.04863324]
 [ 0.91951738  0.78973203 -0.10408575 ...  0.99014047 -0.04863324
   1.        ]]


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### Transfusion

In [27]:
dtName      = 'data/' + datasetList[2]

df          = pd.read_csv(dtName)
X, y        = df.iloc[:,:-1].values, df.iloc[:, -1].values
X           = normalize(X)
corr_x      = np.corrcoef(X)
print(corr_x)
df.head()

[[1.         0.99999966 0.99999966 ... 0.99795436 0.98739345 0.94906298]
 [0.99999966 1.         0.99999998 ... 0.99799809 0.98743441 0.94914984]
 [0.99999966 0.99999998 1.         ... 0.99800393 0.9874638  0.94920777]
 ...
 [0.99795436 0.99799809 0.99800393 ... 1.         0.99329296 0.96313755]
 [0.98739345 0.98743441 0.9874638  ... 0.99329296 1.         0.98696177]
 [0.94906298 0.94914984 0.94920777 ... 0.96313755 0.98696177 1.        ]]


Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


## Dataset division

In [55]:
n_splits   = 5
k          = 10
acc        = []
train_time = []
test_time  = []

kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

for train_index, test_index in (kf.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    neigh = KNeighborsClassifier(n_neighbors=k)
    #neigh = KNN(K=k, ktype=ktype)

    #Train
    start_time = time.time()
    neigh.fit(X_train, y_train)
    train_time.append( time.time() - start_time )

    #Test
    start_time = time.time()
    pred = neigh.predict(X_test)
    test_time.append( time.time() - start_time )
    
    acc.append( (pred == y_test).sum() / pred.shape[0] )

acc = np.array(acc)
print(f"Acc: {acc.mean()} +/- {acc.std()}")

Acc: 0.23513034771705987 +/- 0.011031068825019844


