In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
from typing import Tuple
from scipy.stats import mode
from sklearn.metrics import confusion_matrix

# Location of the CSV file that load_csv is called with below.
csv_path = "diabetes.csv"

Load the data and clean it. TODO: try loading without shuffling.

In [26]:
def load_csv(csv_path: str,
             n_features: int = 4,
             shuffle: bool = True,
             random_state: int = 42) -> Tuple[pd.DataFrame, pd.Series]:
    """Read a comma-separated file and split it into features and labels.

    Args:
        csv_path: Path (or file-like object) handed to ``pd.read_csv``.
        n_features: Number of leading columns to return as features.
            Defaults to 4, the value that used to be hard-coded.
        shuffle: When True (default) the rows are shuffled before splitting
            into features and labels, and the index is reset to 0..n-1.
        random_state: Seed used for the shuffle so results are reproducible.

    Returns:
        ``(x, y)`` where ``x`` is a DataFrame holding the first ``n_features``
        columns and ``y`` is a Series holding the last column.
    """
    dataset = pd.read_csv(csv_path, delimiter=',')
    if shuffle:
        dataset = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Return independent copies: callers assign into x in place, and doing that
    # on a slice of `dataset` risks SettingWithCopyWarning / silent no-ops.
    x = dataset.iloc[:, :n_features].copy()
    y = dataset.iloc[:, -1].copy()
    return x, y

In [27]:
# Load the features (x) and the labels (y) from the CSV file at csv_path.
x,y = load_csv(csv_path)

# Dump both objects as plain text to inspect what was loaded.
print(x, y)

     Pregnancies  Glucose  BloodPressure  SkinThickness
0              6       98             58             33
1              2      112             75             32
2              2      108             64              0
3              8      107             80              0
4              7      136             90              0
..           ...      ...            ...            ...
763            5      139             64             35
764            1       96            122              0
765           10      101             86             37
766            0      141              0              0
767            0      125             96              0

[768 rows x 4 columns] 0      0
1      0
2      0
3      0
4      0
      ..
763    0
764    0
765    1
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [28]:
# Per-column mean and variance of the features, ignoring NaN entries.
np.nanmean(x,axis=0),np.nanvar(x,axis=0)

(array([  3.84505208, 120.89453125,  69.10546875,  20.53645833]),
 array([  11.33927239, 1020.91726176,  374.15944926,  254.14189996]))

In [29]:
# Overwrite every NaN entry of x, in place, with the constant 3.5.
# NOTE(review): 3.5 is an unexplained magic value — confirm it is a sensible
# fill for every column. The statistics recorded before and after this cell
# are identical, so no entry appears to have been replaced in the recorded run.
x[np.isnan(x)] = 3.5
x.shape

(768, 4)

In [30]:
# Per-column mean and variance again, this time after the NaN replacement.
np.mean(x,axis=0),np.var(x,axis=0)

(Pregnancies        3.845052
 Glucose          120.894531
 BloodPressure     69.105469
 SkinThickness     20.536458
 dtype: float64,
 Pregnancies        11.339272
 Glucose          1020.917262
 BloodPressure     374.159449
 SkinThickness     254.141900
 dtype: float64)

In [34]:
# Count, per column, the entries above 13.0 and the entries below 0.0.
# NOTE(review): the 13.0 / 0.0 bounds are unexplained — confirm they are meaningful for these columns.
(x > 13.0).sum(), (x < 0.0).sum()

(Pregnancies        4
 Glucose          763
 BloodPressure    733
 SkinThickness    508
 dtype: int64,
 Pregnancies      0
 Glucose          0
 BloodPressure    0
 SkinThickness    0
 dtype: int64)

In [35]:
# (row, column) index arrays of the entries below 0.0 and of the entries above 13.0.
less_than = np.where(x < 0.0)
higher_than = np.where(x > 13.0)
less_than,higher_than

((array([], dtype=int64), array([], dtype=int64)),
 (array([  0,   0,   0, ..., 766, 767, 767], dtype=int64),
  array([1, 2, 3, ..., 1, 1, 2], dtype=int64)))

Train/test split

TODO: try to optimize this step.

In [38]:
def train_test_split(features: pd.DataFrame,
                     labels: pd.Series,
                     test_split_ratio: float) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Split features and labels into a leading train part and a trailing test part.

    The split is positional and does not shuffle: the last
    ``int(len(features) * test_split_ratio)`` rows form the test set and
    everything before them forms the training set.

    Args:
        features: Feature rows (pandas object or NumPy array).
        labels: One label per feature row, in the same order.
        test_split_ratio: Fraction of rows, in ``[0, 1]``, to put in the test set.

    Returns:
        ``(x_train, y_train, x_test, y_test)``.

    Raises:
        ValueError: If the ratio is outside ``[0, 1]`` or the inputs differ in length.
    """
    if not 0.0 <= test_split_ratio <= 1.0:
        raise ValueError("test_split_ratio must be between 0 and 1")
    if len(features) != len(labels):
        raise ValueError("features and labels must have the same length")

    test_size = int(len(features) * test_split_ratio)
    train_size = len(features) - test_size

    def _rows(data, start, stop):
        # pandas objects need .iloc for positional row slicing; NumPy-style
        # `data[a:b, :]` raises on a DataFrame. Plain slicing covers arrays.
        return data.iloc[start:stop] if hasattr(data, "iloc") else data[start:stop]

    x_train, y_train = _rows(features, 0, train_size), _rows(labels, 0, train_size)
    x_test, y_test = (_rows(features, train_size, train_size + test_size),
                      _rows(labels, train_size, train_size + test_size))
    return (x_train, y_train, x_test, y_test)

Distance

In [45]:
def euclidean(points: pd.DataFrame, element_of_x: pd.Series) -> pd.Series:
    """Euclidean distance from every row of ``points`` to one query point.

    Args:
        points: One point per row, one coordinate per column.
        element_of_x: The query point: either a Series whose index matches the
            columns of ``points`` or a 1-D array with one value per column.

    Returns:
        One distance per row of ``points``.
    """
    # pandas has no top-level sqrt/sum; use the frame's own reduction and NumPy's ufunc.
    squared_diff = (points - element_of_x) ** 2
    return np.sqrt(squared_diff.sum(axis=1))

Prediction function

In [46]:
def predict(x_train: pd.DataFrame, y_train: pd.DataFrame, x_test: pd.DataFrame, k: int) -> pd.DataFrame:
    """Classify every row of ``x_test`` by majority vote among its k nearest training rows.

    Args:
        x_train: Training features, one row per sample.
        y_train: Training labels, one per row of ``x_train`` (same order).
        x_test: Rows to classify; columns in the same order as ``x_train``.
        k: Number of nearest neighbours that vote; must be at least 1.

    Returns:
        A single-column DataFrame (int32) with one predicted label per row of
        ``x_test``, in the same order.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Work positionally on plain arrays: iterating a DataFrame directly yields
    # its column labels, not its rows.
    train_labels = np.asarray(y_train).ravel()
    test_rows = np.asarray(x_test)

    labels_pred = []
    for x_test_element in test_rows:
        distances = np.asarray(euclidean(x_train, x_test_element))
        # Order by distance, breaking ties by label — the same ordering
        # sorted(zip(distances, labels)) would produce.
        nearest = np.lexsort((train_labels, distances))[:k]
        label_pred = mode(train_labels[nearest], keepdims=False).mode
        labels_pred.append(label_pred)
    return pd.DataFrame(labels_pred, dtype=np.int32)

Metrics

In [24]:
def accuracy(y_test: pd.DataFrame, y_preds: pd.DataFrame) -> float:
    """Percentage of predictions that equal the true label.

    Both inputs are flattened and compared position by position, so a Series
    carrying the index of a data split and a freshly indexed prediction frame
    compare correctly instead of being aligned by label.

    Args:
        y_test: True labels.
        y_preds: Predicted labels, in the same order as ``y_test``.

    Returns:
        Accuracy as a percentage in ``[0, 100]``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    truth = np.asarray(y_test).ravel()
    preds = np.asarray(y_preds).ravel()
    if truth.shape != preds.shape:
        raise ValueError("y_test and y_preds must have the same length")
    if truth.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    # Counts every correct prediction (true positives and true negatives alike).
    correct = (truth == preds).sum()
    return float(correct / len(truth) * 100)

In [25]:
def plot_confusion_matrix(y_test: pd.DataFrame, y_preds: pd.DataFrame) -> np.ndarray:
    """Compute the confusion matrix of the predictions against the true labels.

    Args:
        y_test: True labels.
        y_preds: Predicted labels, in the same order as ``y_test``.

    Returns:
        The array produced by ``confusion_matrix(y_test, y_preds)``.
    """
    # The annotation must be np.ndarray: `np.Array` does not exist, and because
    # annotations are evaluated when the function is defined, it broke the def.
    conf_matrix = confusion_matrix(y_test, y_preds)
    return conf_matrix