In [10]:
import numpy as np
import seaborn as sns
from typing import Tuple
from scipy.stats import mode
from sklearn.metrics import confusion_matrix


# Relative path of the CSV file that is passed to load_csv().
csv_path = "iris.csv"

Load and clean the data (also try loading without shuffling)

In [11]:
def load_csv(csv_path:str) ->Tuple[np.ndarray,np.ndarray]:
    """Read a comma-separated file and return shuffled features and labels.

    The global NumPy random state is re-seeded with 42 on every call, so the
    row order after shuffling is the same each time for a given file.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A ``(features, labels)`` pair: the first four columns of every row,
        and the last column of every row.
    """
    np.random.seed(42)
    rows = np.genfromtxt(csv_path, delimiter=',')
    # Shuffle whole rows in place so each label stays with its features.
    np.random.shuffle(rows)
    features = rows[:, :4]
    labels = rows[:, -1]
    return features, labels

In [12]:
# Load the shuffled feature matrix (x) and label vector (y) from the CSV.
x,y = load_csv(csv_path)

In [13]:
# Column-wise mean and variance; plain np.mean/np.var propagate any NaN entries.
np.mean(x,axis=0),np.var(x,axis=0)

(array([nan, nan, nan, nan]), array([nan, nan, nan, nan]))

In [14]:
# Column-wise mean and variance computed with the NaN-ignoring variants.
np.nanmean(x,axis=0),np.nanvar(x,axis=0)

(array([ 355.46503497, -280.09189189,    2.95      ,   21.74726027]),
 array([1.73561968e+07, 1.18405444e+07, 1.51049922e+04, 6.11729208e+04]))

In [15]:
# Overwrite every NaN in x, in place, with the constant 3.5.
# NOTE(review): the same fill value is used for all four feature columns —
# confirm 3.5 is the intended imputation rather than a per-column statistic.
x[np.isnan(x)] = 3.5
x.shape

(150, 4)

In [16]:
# Column-wise mean and variance again, now that no NaN entries remain in x.
np.mean(x,axis=0),np.var(x,axis=0)

(array([ 339.04      , -276.31066667,    2.95733333,   21.26066667]),
 array([1.65517522e+07, 1.16837285e+07, 1.49035963e+04, 5.95502852e+04]))

In [17]:
# Number of entries above 13.0 and number of entries below 0.0.
(x > 13.0).sum(), (x < 0.0).sum()

(4, 2)

In [18]:
# Show the actual values that fall outside the [0.0, 13.0] range.
x[np.where(np.logical_or(x > 13.0,x < 0.0))]

array([ -1111.,    100.,   1000.,  50000.,   3000., -42000.])

In [19]:
# np.where on a 2-D condition returns a (row_indices, column_indices) pair
# locating each matching entry.
less_than = np.where(x < 0.0)
higher_than = np.where(x > 13.0)
less_than,higher_than

((array([  4, 140], dtype=int64), array([2, 1], dtype=int64)),
 (array([14, 27, 28, 62], dtype=int64), array([1, 2, 0, 3], dtype=int64)))

In [20]:
# Drop every sample (row) that contains at least one out-of-range feature.
# One boolean row mask is computed from the current x and applied to both
# arrays, so x and y stay aligned row-for-row.  Deleting from y in two passes
# with indices taken from the not-yet-filtered x removes the wrong labels in
# the second pass, because the first deletion has already shifted positions.
outlier_rows = np.any(np.logical_or(x < 0.0, x > 13.0), axis=1)
x, y = x[~outlier_rows], y[~outlier_rows]
x.shape,y.shape

((144, 4), (144,))

Train test split
try optim

In [21]:
def train_test_split(features:np.ndarray,
                     labels:np.ndarray,
                     test_split_ratio:float) -> Tuple[np.ndarray,np.ndarray,np.ndarray,np.ndarray]:
    """Split features and labels into a leading train part and a trailing test part.

    No shuffling happens here; rows are taken in their existing order.

    Args:
        features: 2-D array with one row per sample.
        labels: Array with one entry per sample.
        test_split_ratio: Fraction of the samples, between 0.0 and 1.0, that
            goes to the test part.  The test size is
            ``int(len(features) * test_split_ratio)``.

    Returns:
        ``(x_train, y_train, x_test, y_test)``.

    Raises:
        ValueError: If ``test_split_ratio`` is not within ``[0, 1]`` or if
            ``features`` and ``labels`` have different lengths.
    """
    # Written as a negated chained comparison so that NaN is rejected too.
    if not 0.0 <= test_split_ratio <= 1.0:
        raise ValueError(
            f"test_split_ratio must be between 0 and 1, got {test_split_ratio!r}"
        )
    if len(features) != len(labels):
        raise ValueError("Size mismatch!")

    test_size = int(len(features) * test_split_ratio)
    train_size = len(features) - test_size

    x_train, y_train = features[:train_size, :], labels[:train_size]
    # Everything after the train part is the test part (exactly test_size rows).
    x_test, y_test = features[train_size:, :], labels[train_size:]
    return (x_train, y_train, x_test, y_test)

Distance

In [22]:
def euclidean(points:np.ndarray,element_of_x:np.ndarray) -> np.ndarray:
    """Return the Euclidean distance from each row of ``points`` to ``element_of_x``.

    Args:
        points: 2-D array, one point per row.
        element_of_x: A single point, broadcast against every row of ``points``.

    Returns:
        1-D array holding one distance per row of ``points``.
    """
    offsets = points - element_of_x
    squared_lengths = np.sum(np.square(offsets), axis=1)
    return np.sqrt(squared_lengths)

Prediction function

In [23]:
def predict(x_train:np.ndarray,y_train:np.ndarray,x_test:np.ndarray,k:int) -> np.ndarray:
    """Classify each row of ``x_test`` by the most common label among its ``k`` nearest training rows.

    Distances are computed with ``euclidean``.  Training rows are ranked by
    distance, with equal distances ordered by label value.

    Args:
        x_train: 2-D array of training samples, one per row.
        y_train: Labels of the training samples, one per row of ``x_train``.
        x_test: 2-D array of samples to classify, one per row.
        k: Number of nearest neighbours that vote; must be at least 1.  If it
            exceeds the number of training rows, every training row votes.

    Returns:
        1-D ``int32`` array with one predicted label per row of ``x_test``.

    Raises:
        ValueError: If ``k`` is smaller than 1, the training set is empty, or
            ``x_train`` and ``y_train`` have different lengths.
    """
    if k < 1:
        # A negative k would silently slice neighbours off the wrong end.
        raise ValueError(f"k must be at least 1, got {k!r}")
    y_train = np.asarray(y_train)
    if len(x_train) == 0:
        raise ValueError("x_train must contain at least one sample")
    if len(x_train) != len(y_train):
        raise ValueError("x_train and y_train must have the same length")

    labels_pred = []
    for x_test_element in x_test:
        distances = euclidean(x_train,x_test_element)
        # lexsort treats its LAST key as the primary one: sort by distance,
        # then by label — the same order as sorting (distance, label) pairs.
        order = np.lexsort((y_train, distances))
        nearest_labels = y_train[order[:k]]
        label_pred = mode(nearest_labels,keepdims=False).mode
        labels_pred.append(label_pred)
    return np.array(labels_pred,dtype=np.int32)

Metrics

In [24]:
def accuracy(y_test:np.ndarray,y_preds:np.ndarray) -> float:
    """Return the percentage of predictions that equal the true labels.

    Args:
        y_test: True labels.
        y_preds: Predicted labels, compared element-wise with ``y_test``.

    Returns:
        Share of matching entries, scaled to the range 0 to 100.
    """
    matches = y_test == y_preds
    n_correct = matches.sum()
    fraction_correct = n_correct / len(y_test)
    return fraction_correct * 100

In [25]:
def plot_confusion_matrix(y_test:np.ndarray,y_preds:np.ndarray):
    """Draw the confusion matrix of true versus predicted labels as an annotated heatmap.

    Args:
        y_test: True labels.
        y_preds: Predicted labels.

    The matrix comes from ``confusion_matrix`` and is drawn with
    ``sns.heatmap(..., annot=True)``; nothing is returned.
    """
    sns.heatmap(confusion_matrix(y_test, y_preds), annot=True)