In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import StratifiedKFold as KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder

# Dataset

In [10]:
# Read the abalone CSV (the file has no header row) from the local data/ folder.
datasetList = ['abalone.data']
dtName      = f"data/{datasetList[0]}"
df          = pd.read_csv(dtName, header=None)

# The last column is the target; every column before it is a feature.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# One-hot encode the first feature column; it is sliced as a 2-D
# (n_samples, 1) array because the encoder expects 2-D input.
enc         = OneHotEncoder(handle_unknown='ignore')
one_hot     = enc.fit_transform(X[:, [0]])
one_hot_arr = one_hot.toarray()

# Scale the remaining columns, then put the one-hot columns in front.
# NOTE(review): normalize() defaults to axis=1, so each *sample* (row) is
# rescaled to unit L2 norm rather than each feature column -- confirm this is
# the intended preprocessing for a distance-based classifier.
X = normalize(X[:, 1:])
X = np.hstack((one_hot_arr, X))

# Show the first raw rows as the cell output.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


## Dataset division

In [11]:
# Single hold-out evaluation of a k-nearest-neighbours classifier.
# The first `train_index` rows of X / y are used for training and the remaining
# rows for testing; the rows are taken in file order (no shuffling here).
k           = 5
train_time  = []   # seconds spent in fit(), one entry per run
test_time   = []   # seconds spent in predict(), one entry per run
train_index = 3133

X_train, X_test = X[:train_index], X[train_index:]
y_train, y_test = y[:train_index], y[train_index:]
neigh = KNeighborsClassifier(n_neighbors=k)
#neigh = KNN(K=k, ktype=ktype)

# Train.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which matters for very short fit/predict
# calls that a wall clock may report as 0.0 or even as a negative duration.
start_time = time.perf_counter()
neigh.fit(X_train, y_train)
train_time.append(time.perf_counter() - start_time)

# Test.
start_time = time.perf_counter()
pred = neigh.predict(X_test)
test_time.append(time.perf_counter() - start_time)

# Accuracy is the fraction of test samples whose predicted label matches the
# true label. It is a single scalar in this cell, so no list is created for it.
acc = np.mean(pred == y_test)

print(f"Acc: {acc}")

Acc: 0.21743295019157088


In [86]:
# Cross-validated evaluation of a k-nearest-neighbours classifier.
# `KFold` is imported in this notebook as an alias for sklearn's
# StratifiedKFold, so the folds are stratified on y.
# NOTE(review): StratifiedKFold warns when a class has fewer members than
# n_splits -- check whether any target value in y is that rare.
n_splits   = 5
k          = 1
acc        = []   # per-fold accuracy
train_time = []   # per-fold seconds spent in fit()
test_time  = []   # per-fold seconds spent in predict()

# A fixed random_state together with shuffle=True keeps the fold assignment
# reproducible between runs.
kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    neigh = KNeighborsClassifier(n_neighbors=k)
    #neigh = KNN(K=k, ktype=ktype)

    # Train.
    # time.perf_counter() replaces time.time(): it is monotonic and has the
    # highest available resolution, so very short fit/predict calls are timed
    # reliably instead of being rounded to 0.0 or skewed by clock adjustments.
    start_time = time.perf_counter()
    neigh.fit(X_train, y_train)
    train_time.append(time.perf_counter() - start_time)

    # Test.
    start_time = time.perf_counter()
    pred = neigh.predict(X_test)
    test_time.append(time.perf_counter() - start_time)

    # Fraction of this fold's test samples that were predicted correctly.
    acc.append(np.mean(pred == y_test))

# Report mean and standard deviation of the accuracy across folds.
acc = np.array(acc)
print(f"Acc: {acc.mean()} +/- {acc.std()}")

Acc: 0.19491166668079113 +/- 0.009837267926792797


