In [1]:
import pandas as pd
import numpy as np
import random
from collections import Counter
from numpy.random import choice
from sklearn import datasets

## Load iris dataset

In [2]:
# Load the iris dataset through scikit-learn; X holds the feature rows and
# y the matching class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Preview the first five entries of the features, then of the labels.
for preview in (X, y):
    print(preview[:5])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0]


## Split the data

### Method #1

In [4]:
def shuffle_split(X, y, ratio=0.75):
    """Randomly partition X and y into a training part and a test part.

    One random permutation of the row indices of X is drawn. Its leading
    ``int(ratio * len(...))`` entries select the training rows and the rest
    select the test rows; the same permutation is applied to both X and y,
    so features and labels stay aligned.

    Args:
        X: indexable array of samples (indexed with an index array).
        y: indexable array of labels, indexed with the same permutation.
        ratio: share of the rows that goes to the training part.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    order = np.random.permutation(len(X))
    # The cut points are computed from X and y separately.
    x_cut = int(ratio * len(X))
    y_cut = int(ratio * len(y))
    return X[order[:x_cut]], X[order[x_cut:]], y[order[:y_cut]], y[order[y_cut:]]

In [5]:
# Split with the permutation-based helper (default ratio) and report the
# shape of every resulting part.
X_train, X_test, y_train, y_test = shuffle_split(X, y)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(112, 4)
(38, 4)
(112,)
(38,)


### Method #2

In [3]:
def train_test_split(X, y, ratio=0.75):
    """Randomly assign each row of X (and its label) to train or test.

    Every row is sent to the training part with probability ``ratio`` and
    to the test part otherwise, independently of the other rows. The split
    sizes are therefore random; only their expected proportion is
    ``ratio``. Rows keep their original relative order inside each part.

    Args:
        X: array-like of samples, one per row (lists are accepted).
        y: array-like of labels with the same length as X.
        ratio: probability, in [0, 1], that a row lands in the training part.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.
        An empty part keeps the trailing dimensions and dtype of its input
        (e.g. shape ``(0, n_features)``) rather than collapsing to ``(0,)``.

    Raises:
        ValueError: if X and y have different lengths, or (from
            ``np.random.choice``) if ``ratio`` does not yield valid
            probabilities.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same length, got %d and %d" % (len(X), len(y))
        )

    # One draw per row: 0 means "train", 1 means "test".
    assignment = np.random.choice(2, len(X), p=[ratio, 1 - ratio])
    in_train = assignment == 0

    # Boolean-mask indexing keeps dtype and trailing shape even when a part
    # ends up empty.
    return X[in_train], X[~in_train], y[in_train], y[~in_train]

In [6]:
# Re-split with the per-row random assignment helper; these are the splits
# used by the cells below. No seed is set, so the sizes vary between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(116, 4)
(34, 4)
(116,)
(34,)


In [7]:
# Peek at the first five entries of every part of the split.
for part in (X_train, X_test, y_train, y_test):
    print(part[:5])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[[4.6 3.4 1.4 0.3]
 [4.8 3.4 1.6 0.2]
 [4.6 3.6 1.  0.2]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]]
[0 0 0 0 0]
[0 0 0 0 0]


## The Model

In [8]:
def dist(vect1, vect2):
    """Return the Euclidean distance between two vectors.

    Computes ``sqrt(sum((vect1 - vect2) ** 2))`` with a vectorised NumPy
    reduction instead of a Python loop.

    Args:
        vect1: array-like of numbers (ndarray, list or tuple).
        vect2: array-like of numbers, broadcast-compatible with ``vect1``.

    Returns:
        The distance as a NumPy scalar for 1-D inputs. The reduction runs
        over the first axis only, so higher-dimensional inputs yield an
        array of distances along that axis.
    """
    # np.asarray lets plain Python sequences be subtracted element-wise.
    vect_diff = np.asarray(vect1) - np.asarray(vect2)
    return np.sqrt(np.sum(vect_diff * vect_diff, axis=0))

In [9]:
def most_similar(sample, data_features, data_labels, n=5):
    """Predict the label of ``sample`` by majority vote of its n nearest rows.

    The distance from ``sample`` to every row of ``data_features`` is
    computed with ``dist``; the rows are ranked by distance and the most
    frequent label among the ``n`` closest ones is returned.

    Args:
        sample: feature vector to classify.
        data_features: sequence of reference feature vectors.
        data_labels: labels aligned with ``data_features``.
        n: number of nearest neighbours that take part in the vote.

    Returns:
        The most common label among the ``n`` nearest neighbours.
    """
    distances = [dist(sample, row) for row in data_features]

    # Rank (distance, label) pairs; equal distances are ordered by label.
    ranked = sorted(zip(distances, data_labels))

    # Take the n closest rows starting at rank 0. The query sample is not
    # assumed to be part of data_features, so the nearest row must not be
    # skipped.
    neighbor_labels = [label for _, label in ranked[:n]]

    most_common, _ = Counter(neighbor_labels).most_common(1)[0]
    return most_common

In [16]:
def accuray(X_train, X_test, y_train, y_test, n=5):
    """Return the share of test samples that ``most_similar`` labels correctly.

    Each row of ``X_test`` is classified against ``X_train`` / ``y_train``
    using ``n`` neighbours, and the prediction is compared with the entry of
    ``y_test`` at the same position.

    Returns:
        Number of correct predictions divided by ``len(y_test)``.
    """
    hits = 0
    for idx, sample in enumerate(X_test):
        predicted = most_similar(sample, X_train, y_train, n)
        if predicted == y_test[idx]:
            hits += 1

    return hits / len(y_test)

In [18]:
# Accuracy of the hand-written classifier on the held-out part, 5 neighbours.
print(accuray(X_train, X_test, y_train, y_test, n=5))

0.9411764705882353


In [19]:
# Label every test sample with the hand-written classifier (default n);
# `test` stays a plain list, the array is built only for display.
test = [most_similar(sample, X_train, y_train) for sample in X_test]
np.array(test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2])

## Compare to sklearn

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Reference model: scikit-learn's k-nearest-neighbours classifier with the
# same neighbour count, fitted on the same training part.
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
print(neigh.score(X_test, y_test))

# Keep the reference predictions for the comparison below.
res = neigh.predict(X_test)
res

0.9411764705882353


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2])

## Sklearn vs Mine

In [20]:
# Element-wise gap between my predictions and scikit-learn's; a zero entry
# means both classifiers agree on that sample.
diff = np.subtract(test, res)
diff

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [21]:
# Count the samples on which the two classifiers disagree. A plain
# diff.sum() can be 0 even when predictions differ, because positive and
# negative label gaps cancel each other out.
np.count_nonzero(diff)

0