In [1]:
import pandas as pd
import numpy as np

## Save Data To Files For CV

In [6]:
# processed.cleveland.data has no header row, so columns are numbered 0..13.
# The UCI heart-disease files encode missing values as '?'; mapping them to NaN
# keeps every column numeric instead of silently turning a column into strings.
# NOTE: rows containing NaN still have to be dropped or imputed before any
# distance computation.
df = pd.read_csv('processed.cleveland.data', header=None, na_values='?')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [7]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(303, 14)

In [8]:
# Column names in file order, taken from heart-disease.names (item 7).
# The final entry, 'num', is the label column; the 13 before it are inputs.
features = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'num',
]

In [9]:
# Replace the positional column labels (0..13) with the attribute names,
# then preview the first rows to check the mapping.
df.columns = features

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [10]:
# Collapse the label to binary: 0 stays 0, every non-zero value becomes 1.
df['num'] = df['num'].ne(0).astype('int64')

In [11]:
# Class balance of the binarised label.
df['num'].value_counts()

0    164
1    139
Name: num, dtype: int64

In [12]:
# Feature matrix: every column except the last one (the label).
data_X = df[df.columns[:-1]].to_numpy()
data_X.shape

(303, 13)

In [13]:
# Label vector: the last column only.
data_Y = df[df.columns[-1]].to_numpy()
data_Y.shape

(303,)

In [14]:
# enumerate 0 ... 9 repeatedly until length of df

from itertools import cycle

# Round-robin fold ids 0, 1, ..., 9, 0, 1, ... with exactly one id per row of df.
lst_cycle = cycle(range(10))
enum = [fold_id for _, fold_id in zip(range(len(df)), lst_cycle)]

In [15]:
# Permute samples: frac=1 draws every row once, and the fixed random_state
# makes the permutation reproducible. The original row labels stay in the index.
# NOTE: this rebinds `df`, so re-running the cell shuffles the already-shuffled
# frame again; run it once per kernel session.

df = df.sample(frac=1, random_state=42)

In [16]:
# Tag every (shuffled) row with its fold id, then write one .npz archive per fold.
# Each archive stores the features as `x`, the binary label as `y`, and the
# original row labels as `example_names`.

df['enum'] = enum
for fold_id, fold_rows in df.groupby('enum'):
    # The last two columns are the label ('num') and the fold id ('enum').
    fold_features = fold_rows.iloc[:, :-2].to_numpy()
    fold_labels = fold_rows['num'].to_numpy()
    fold_row_ids = fold_rows.index.to_numpy()
    np.savez(f'heart_fold{fold_id}', x=fold_features, y=fold_labels, example_names=fold_row_ids)

## Naive KNN Algorithm

In [17]:
# np.load on an .npz archive returns a lazily-read NpzFile that holds the file
# open; the `with` block closes it once the arrays have been read.
# SECURITY: allow_pickle=True permits unpickling object arrays, which can run
# arbitrary code. Only load archives written by this notebook.
with np.load('heart_fold0.npz', allow_pickle=True) as test_data:
    X = test_data['x']
    y = test_data['y']
    example_names = test_data['example_names']

In [18]:
# Confirm shapes look reasonable: X should be (rows, features), while y and
# example_names should each hold one entry per row.

X.shape, y.shape, example_names.shape

((31, 13), (31,), (31,))

In [19]:
from collections import Counter
import numpy as np

class NaiveKNNClassifier:
    """Brute-force k-nearest-neighbours classifier for binary labels.

    Distances are Euclidean. Labels are collapsed at fit time so that 0 stays 0
    and every non-zero label becomes 1. Prediction is a majority vote among the
    k closest training rows.
    """

    def __init__(self, k):
        """Store the neighbourhood size.

        Args:
            k: number of neighbours that vote; must be at least 1.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k
        self.data = None
        self.labels = None

    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
        """Memorise the training set.

        Args:
            X_train: array-like of shape (n_samples, n_features). It is
                converted to float so object-dtype input is handled.
            y_train: array-like of shape (n_samples,); non-zero labels are
                treated as class 1.

        Raises:
            ValueError: if ``X_train`` is not a non-empty 2-D array, if
                ``y_train`` does not hold one label per row, or if a feature
                value cannot be converted to float.
        """
        data = np.asarray(X_train, dtype=float)
        labels = np.asarray(y_train)
        if data.ndim != 2 or data.shape[0] == 0:
            raise ValueError(
                f"X_train must be a non-empty 2-D array, got shape {data.shape}"
            )
        if labels.shape != (data.shape[0],):
            raise ValueError(
                f"y_train must have shape ({data.shape[0]},), got {labels.shape}"
            )
        self.data = data
        self.labels = np.where(labels == 0, labels, 1)  # cast to binary

    def predict_one(self, test_example: np.ndarray):
        """Predict the label of a single example of shape (n_features,).

        Returns:
            The most common label among the k nearest training rows. If
            ``k`` exceeds the training-set size, every training row votes.
        """
        assert self.data is not None, "fit() must be called before predicting"
        test_example = np.asarray(test_example, dtype=float)
        # The example must broadcast against every training row.
        assert test_example.shape[0] == self.data.shape[1]
        norms = np.linalg.norm(self.data - test_example, axis=1)
        # A stable sort makes equal distances resolve by training-row order.
        nearest_idx = np.argsort(norms, kind='stable')[:self.k]
        votes = Counter(self.labels[nearest_idx])
        return votes.most_common(1)[0][0]

    def predict(self, test_examples: np.ndarray):
        """Predict labels for a batch of shape (n_examples, n_features).

        Returns:
            1-D array with one predicted label per row; empty for an empty
            batch.
        """
        assert self.data is not None, "fit() must be called before predicting"
        test_examples = np.asarray(test_examples, dtype=float)
        assert test_examples.shape[1] == self.data.shape[1]
        if test_examples.shape[0] == 0:
            # np.apply_along_axis cannot iterate over zero rows, so the empty
            # batch is answered directly.
            return np.empty(0, dtype=self.labels.dtype)
        return np.array([self.predict_one(row) for row in test_examples])
        

# unit test: four 2-D training points (two per class) queried with k = 3
train_points = np.array([[3, 4], [1, 2], [5, 6], [7, 8]])
train_labels = np.array([1, 1, 0, 0])
query_points = np.array([[1, 1], [2, 2], [7, 7]])

clf = NaiveKNNClassifier(3)
clf.fit(train_points, train_labels)
clf.predict(query_points)

array([1, 1, 0])

## CV with KNN

1. Implement the kNN algo (choose whatever distance function you want)
2. Treat each of your 10 data folds from HW0 as a test set and
compute the accuracy of kNN on it (and report the chosen k)
3. Use the other 9 folds to choose a good k
4. Consider k in {1, 3, 7, 15, 25, 51, 101}; each test set might get a different k
5. Nine times, use 8 of these 9 folds as the train set and
the 9th as the tune set. Find the single k that works best on average across all 9 tune sets
6. Then use all 9 folds as the train set plus the best k 
to compute the accuracy on the fold that is the test set

In [20]:
# utility functions for loading files

from typing import List

def load_one_file(filename: str):
    """Load one fold archive and return ``(X, y, example_names)``.

    Args:
        filename: path to an ``.npz`` file containing the arrays ``x``, ``y``
            and ``example_names``.

    Returns:
        Tuple of the three arrays, in that order.
    """
    # np.load returns a lazily-read NpzFile that keeps the file open; the
    # `with` block closes it after the arrays have been materialised.
    # SECURITY: allow_pickle=True permits unpickling object arrays, which can
    # run arbitrary code. Only pass files from a trusted source.
    with np.load(filename, allow_pickle=True) as data:
        return data['x'], data['y'], data['example_names']

def load_folds(folds: List[int]):
    """Load several fold files and stack them into one data set.

    Each fold ``n`` is read from ``heart_fold{n}.npz``. The returned features,
    labels and example names are concatenated in the order given by ``folds``.
    """
    loaded = [load_one_file(f'heart_fold{fold}.npz') for fold in folds]

    X_folds = np.concatenate([part[0] for part in loaded], axis=0)
    y_folds = np.concatenate([part[1] for part in loaded])
    example_names_folds = np.concatenate([part[2] for part in loaded])

    return X_folds, y_folds, example_names_folds

# Smoke test: stacking two folds should double the row count of a single fold.
# NOTE: this rebinds X, y and example_names from the earlier single-fold cell.
X, y, example_names = load_folds([0, 1])
X.shape, y.shape, example_names.shape

((62, 13), (62,), (62,))

In [21]:
# utility functions for calculating and displaying metrics

def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels, same shape as ``y_true``.

    Returns:
        Accuracy in [0, 1] as a NumPy float.

    Raises:
        AssertionError: if the two inputs differ in length or shape.
        ValueError: if the inputs are empty (accuracy is undefined).
    """
    # Accept plain lists as well as arrays; `list == list` would yield a single bool.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    assert len(y_true) == len(y_pred)
    # Equal lengths are not enough: (n,) against (n, 1) would broadcast to (n, n).
    assert y_true.shape == y_pred.shape
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    return np.mean(y_true == y_pred)

In [None]:
def get_tune_results(train_tune_folds: List[int], k_values=(1, 3, 7, 15, 25, 51, 101)):
    """Score every candidate k with leave-one-fold-out tuning.

    Each fold id in ``train_tune_folds`` serves once as the tune set while the
    remaining ids form the train set. For every k a classifier is fitted on the
    train set and its accuracy on the tune set is recorded.

    Args:
        train_tune_folds: ids of the folds available for training and tuning
            (i.e. every fold except the held-out test fold).
        k_values: candidate neighbourhood sizes.

    Returns:
        Dict mapping each k to a list of tune accuracies, one per tune fold,
        in the order of ``train_tune_folds``.

    Raises:
        ValueError: if fewer than two folds are supplied, since no train set
            would remain.
    """
    if len(train_tune_folds) < 2:
        raise ValueError("need at least two folds: one to tune on and one to train on")

    k_values = list(k_values)
    results_dict = {k: [] for k in k_values}
    # Iterate over the fold ids themselves, not over positions 0..n-1, so the
    # held-out test fold is never used as a tune set.
    for tune_fold in train_tune_folds:
        X_tune, y_tune, _ = load_folds([tune_fold])
        train_folds = [fold for fold in train_tune_folds if fold != tune_fold]
        X_train, y_train, _ = load_folds(train_folds)

        for k in k_values:
            clf = NaiveKNNClassifier(k)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_tune)
            score = accuracy_score(y_tune, y_pred)
            results_dict[k].append(score)

    return results_dict
        
        