## Exploration on the Iris dataset

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter

## Performing Exploratory Data Analysis on the Iris dataset
Attributes
- Sepal length (in cm)
- Sepal width (in cm)
- Petal length (in cm)
- Petal width (in cm)

Classes
- Iris setosa
- Iris versicolor
- Iris virginica

In [2]:
# Load the Iris dataset bundled with scikit-learn. The returned object is
# indexed further down by its 'data', 'target' and 'feature_names' keys.

iris_dataset = datasets.load_iris()
iris_dataset

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [3]:
# Build a DataFrame with the four measurement columns plus an int64 'target'
# column, then separate it into the feature table `x` and the label series `y`.
iris_df = (
    pd.DataFrame(iris_dataset['data'], columns=iris_dataset['feature_names'])
    .assign(target=iris_dataset['target'].astype(np.int64))
)
x = iris_df.drop(columns='target')
y = iris_df['target']
x

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [4]:
# Hold out 20% of the samples as a test set (shuffled with a fixed seed) and
# convert each of the four resulting splits to a NumPy array.
xtrain, xtest, ytrain, ytest = map(
    np.asarray,
    train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True),
)

print(f'Size of train set: {xtrain.shape[0]}\nSize of test set: {xtest.shape[0]}')

Size of train set: 120
Size of test set: 30


In [5]:
# Normalize the data. NOTE: sklearn's Normalizer rescales each *sample* (row)
# to unit norm (L2 by default) rather than scaling each feature column, so
# fit() learns nothing from xtrain and is kept only for API consistency.
# Per-feature scaling would need e.g. StandardScaler or MinMaxScaler instead.
scaler = Normalizer().fit(xtrain)
norm_xtrain = scaler.transform(xtrain)
norm_xtest = scaler.transform(xtest)

# The y arrays hold class labels, which do not need to be normalized.

In [6]:
# Show the first three training rows before and after normalization.
print("Before normalization...", xtrain[:3], sep="\n")
print("\nAfter normalization...", norm_xtrain[:3], sep="\n")

Before normalization...
[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]]

After normalization...
[[0.77577075 0.60712493 0.16864581 0.03372916]
 [0.77381111 0.59732787 0.2036345  0.05430253]
 [0.76945444 0.35601624 0.50531337 0.16078153]]


## Implementing KNN from scratch

### Defining the distance function

In [7]:
import math

def dist_euclid(xtrain, xpoint, ytrain):
    """
    Computes the Euclidean distance from one test point to every training point.

    Inputs
    ----------------------
    xtrain: corresponding to training data, 2-D array of shape (n_samples, n_features)
    xpoint: corresponding to the test point, 1-D array of shape (n_features,)
    ytrain: corresponding to training labels, one label per row of xtrain

    Outputs
    ----------------------
    distances_df: DataFrame of distances between test point and each point in the training dataset, with labels.
                  Columns are 'distance' and 'label'; rows follow the order of xtrain.
                  Numeric labels are stored as floats because both columns are stacked into one array.
    """
    # Broadcasting subtracts xpoint from every training row at once, so the
    # per-row distances are computed without a Python-level loop.
    diffs = np.asarray(xtrain) - np.asarray(xpoint)
    distances = np.sqrt(np.sum(np.square(diffs), axis=1))

    # np.c_ places the two 1-D arrays side by side as columns.
    distances_df = pd.DataFrame(data=np.c_[distances, ytrain], columns=['distance', 'label'])
    return distances_df

In [8]:
distances_df = dist_euclid(norm_xtrain, norm_xtest[0], ytrain)
distances_df

Unnamed: 0,distance,label
0,0.495361,0.0
1,0.457402,0.0
2,0.074329,1.0
3,0.395442,0.0
4,0.425854,0.0
...,...,...
115,0.075112,1.0
116,0.120023,2.0
117,0.477492,0.0
118,0.050868,1.0


### Finding K-Nearest Neighbours

In [9]:
def nearest_neighbours(distances, K):
    """
    Predicts a label by majority vote among the K nearest training points.

    Inputs
    --------------
    distances: DataFrame with a 'distance' column and a 'label' column, one row per training point
    K: corresponding to number of nearest neighbours (must be >= 1); if K exceeds the number of rows, every row votes

    Outputs
    --------------
    ypred: the most common label among the K nearest points; on a tied vote,
           the tied label that appears first in nearest-first order wins

    Raises
    --------------
    ValueError: if K is smaller than 1 or `distances` has no rows
    """
    # A negative K would otherwise slice "all but the last |K|" rows and vote
    # over almost the whole training set; K == 0 would leave nothing to vote on.
    if K < 1:
        raise ValueError(f'K must be a positive integer, got {K}')
    if len(distances) == 0:
        raise ValueError('distances is empty: there are no neighbours to vote')

    # A stable sort keeps rows with equal distances in their original order,
    # which makes the selected neighbours deterministic.
    nearest_first = distances.sort_values(by='distance', kind='mergesort')
    k_labels = nearest_first['label'].to_numpy()[:K]

    # Counter preserves first-seen order among equal counts, so ties go to the
    # label encountered first in nearest-first order.
    ypred = Counter(k_labels).most_common(1)[0][0]
    return ypred

In [10]:
nearest_neighbours(distances_df, 3)

1.0

In [11]:
def KNN(xtrain, ytrain, xtest, K):
    """
    K-Nearest Neighbours classifier: labels each test point by a vote among
    its K closest training points.

    Inputs
    --------------
    xtrain: corresponding to training data
    ytrain: corresponding to training labels
    xtest: corresponding to testing data
    K: corresponding to number of nearest neighbours

    Outputs
    --------------
    preds: list of prediction labels, one per test point, in the order of xtest
    """
    # For every query point: compute its distances to the training set, then
    # let the K nearest training points vote on the label.
    return [
        nearest_neighbours(dist_euclid(xtrain, query, ytrain), K)
        for query in xtest
    ]

In [12]:
# Testing accuracy of KNN algorithm
ypreds = KNN(norm_xtrain, ytrain, norm_xtest, 3)
ypreds = np.array(ypreds, dtype=np.int64)

### Implementing with `sklearn`

In [13]:
# Reference implementation: scikit-learn's classifier with the same K=3,
# fitted on the same normalized training data.
knn = KNeighborsClassifier(n_neighbors=3).fit(norm_xtrain, ytrain)
ypred_sklearn = knn.predict(norm_xtest)
ypred_sklearn

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 1, 2, 2, 2, 0, 0], dtype=int64)

In [14]:
ypreds

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 1, 2, 2, 2, 0, 0], dtype=int64)

In [15]:
np.array_equal(ypreds, ypred_sklearn)

True

In [16]:
print(f'Accuracy: {accuracy_score(ytest, ypreds)*100:.2f}%')

Accuracy: 96.67%


## K-Fold cross validation to tune hyper-parameters
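- Every sample in the training set is used for both training and validation: the set is split into a number of folds, and each fold serves once as the validation set while the remaining folds are used for training
- For each candidate hyperparameter value (here, the number of neighbours `k`), the validation scores from all folds are averaged, and the value with the best average is selected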

In [17]:
n_splits = 4
kf = KFold(n_splits=n_splits)

# Candidate neighbour counts: the odd values 1, 3, ..., 29. Odd values reduce
# tied votes (with three classes a tie is still possible).
k_values = [k for k in range(1, 30) if k % 2 == 1]
accuracy = []

for k in k_values:
    # Score this k on every fold, then store the mean validation accuracy.
    fold_scores = []
    for train_idx, valid_idx in kf.split(norm_xtrain):
        fold_preds = KNN(norm_xtrain[train_idx], ytrain[train_idx], norm_xtrain[valid_idx], k)
        fold_scores.append(accuracy_score(ytrain[valid_idx], fold_preds))
    accuracy.append(sum(fold_scores) / n_splits)

In [18]:
# Report the mean cross-validation accuracy obtained for each candidate k.
for k, mean_acc in zip(k_values, accuracy):
    print(f'For k={k}: {mean_acc*100:.2f}%')

For k=1: 96.67%
For k=3: 98.33%
For k=5: 97.50%
For k=7: 95.83%
For k=9: 96.67%
For k=11: 97.50%
For k=13: 97.50%
For k=15: 97.50%
For k=17: 97.50%
For k=19: 97.50%
For k=21: 97.50%
For k=23: 96.67%
For k=25: 96.67%
For k=27: 96.67%
For k=29: 96.67%


In [19]:
# Position of the highest mean accuracy; argmax returns the first one on ties.
best_idx = np.argmax(accuracy)
print(f'Best accuracy score {accuracy[best_idx]*100:.2f}% when k={k_values[best_idx]}')

Best accuracy score 98.33% when k=3


## Distance
- Euclidean distance: use when the features measure similar kinds of quantities (e.g. height, length, depth)
- Manhattan distance: use when the features measure different kinds of quantities (e.g. weight, height, speed)