## HW 3a: Implement Perceptron for Heart Disease

1. Use step function and $\eta = 0.1$ and batch size = 4 (ok to use 1 if need be)
2. Use TUNE set to choose ‘early stopping’ epoch for each of 10 test folds (maxEpochs = 10,000 but can stop if / when all trainset examples correct)
3. Ok to just use ONE tune set (but feel free to use 9 as done in HW1 and HW2)
4. Once epochsToUse estimated, train on all 9 folds; use (8/9) epochsToUse
5. Report best epoch and min, mean, and max accuracy per train-tune fold 
6. Compare to Random Forests and kNN test set min, mean, and max

In [26]:
import numpy as np

def step_function(x: int):
    return 1 if x >= 0 else 0

class Perceptron:
    def __init__(self, num_weights, activation_function, learning_rate=0.1):
        self.weights = np.zeros(num_weights + 1)
        self.activation_function = activation_function
        self.learning_rate = learning_rate

    def fit(self, X: np.array, y: np.array): # X is 3-d array, y is 2-d array with batch size in first dimension
        for X_batch, y_batch in zip(X, y):
            # print(f'weights before: {self.weights}')
            updates = []
            assert X_batch.shape[0] == y_batch.shape[0]
            for example, y in zip(X_batch, y_batch):
                example = np.append(example, -1)
                y_bar = self.predict_one(example)
                # print(y_bar)
                update = self.learning_rate * (y - y_bar) * example
                # print(f'example: {example}, update: {update}')
                updates.append(update)
            self.weights = self.weights + np.sum(updates, axis=0) / X_batch.shape[0]
            # print(f'weights after: {self.weights}')

    def predict_one(self, example: np.array):
        return self.activation_function(self.weights.T @ example)

    def predict(self, examples: np.array):
        output = []
        for example in examples:
            example = np.append(example, -1)
            output.append(self.predict_one(example))
        return np.array(output)

In [27]:
# unit tests

clf = Perceptron(4, step_function)
x = np.array([[[1, 2, 3, 1], [2, 3, 4, 1], [2, 3, 4, 1]], [[5, 6, 7, 1], [8, 9, 10, 1], [2, 3, 4, 1]]])
y = np.array([[1, 0, 1], [0, 0, 1]])

clf.fit(x, y)

In [28]:
from typing import List
from tqdm import trange

def accuracy_score(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return (y_true == y_pred).sum() / len(y_true)

def load_one_file(filename: str):
    data = np.load(filename, allow_pickle=True)
    X = data['x']
    y = data['y']
    example_names = data['example_names']
    return X, y, example_names

def load_folds(folds: List[int]):
    all_X = []
    all_y = []
    all_example_names = []
    for fold in folds:
        filename = f'/Users/brwang/Desktop/ml_class/hw0/data/heart_fold{fold}.npz'
        X, y, example_names = load_one_file(filename)
        all_X.append(X)
        all_y.append(y)
        all_example_names.append(example_names)

    X_folds = np.concatenate(all_X, axis=0)
    y_folds = np.concatenate(all_y)
    example_names_folds = np.concatenate(all_example_names)
        
    return X_folds, y_folds, example_names_folds

X, y, example_names = load_folds([0, 1])
X.shape

(60, 13)

In [29]:
def create_batched_data(X: np.array, y: np.array, batch_size):
    assert len(X.shape) == 2 and len(y.shape) == 1
    # losing some data here
    num_batches = X.shape[0] // batch_size
    X = X[:num_batches * batch_size]
    y = y[:num_batches * batch_size]
    X = X.reshape(-1, batch_size, X.shape[1])
    y = y.reshape(-1, batch_size)

    return X, y

X, y = create_batched_data(np.ones((7, 3)), np.ones(7), 2)
X.shape, y.shape

((3, 2, 3), (3, 2))

In [30]:
import pandas as pd
import time

def get_tune_results(train_tune_folds: List[int], batch_size=4, MAX_EPOCHS=5000): # TODO: set batch size 1 for debugging 
    for tune_fold in range(len(train_tune_folds)):
        X_tune, y_tune, example_names_tune = load_folds([tune_fold])
        train_folds = [fold for fold in train_tune_folds if fold != tune_fold]
        X_train, y_train, example_names_train = load_folds(train_folds)

        clf = Perceptron(X_train.shape[1], step_function)
        best_epoch = 0
        best_accuracy = 0
        for i in trange(MAX_EPOCHS):
            perm = np.random.permutation(X_train.shape[0])
            X_train = X_train[perm]
            y_train = y_train[perm]
            X_batched, y_batched = create_batched_data(X_train, y_train, batch_size=batch_size)
            clf.fit(X_batched, y_batched)
            y_pred_train = clf.predict(X_train)
            train_accuracy = accuracy_score(y_train, y_pred_train)
            if train_accuracy == 1:
                print(f"achieved perfect train accuracy at epoch {i}")
                break
            y_pred = clf.predict(X_tune)
            tune_accuracy = accuracy_score(y_tune, y_pred)
            if tune_accuracy > best_accuracy:
                best_epoch = i
                best_accuracy = tune_accuracy
                # print(best_accuracy, best_epoch)
            # if i % 1000 == 0:
            #     print(f'epoch {i}: ')
            #     print(np.abs(clf.weights).sum(), np.abs(clf.weights).max(), np.abs(clf.weights).min()) # print min max sum of absolute value of weights per epoch
            # print absolute value of weight updates
            # okay try eta / 10, eta / 100
        break
        
    print(f'best epoch: {best_epoch}')
    print(f'last train accuracy: {train_accuracy}')
    print(f'best tune accuracy: {best_accuracy}')
    return best_epoch

def cv(num_folds=10):
    cv_results = []
    ks = []
    for test_fold in range(num_folds):
        print(f'starting fold {test_fold}')
        start = time.time()
        X_test, y_test, example_names_test = load_folds([test_fold])
        train_folds = [fold for fold in range(num_folds) if fold != test_fold]
        best_epoch = get_tune_results(train_folds)
        X_train, y_train, example_names_train = load_folds(train_folds)
        clf = Perceptron(X_train.shape[1], step_function)
        for i in range(int(best_epoch * (8/9))):
            perm = np.random.permutation(X_train.shape[0])
            X_train = X_train[perm]
            y_train = y_train[perm]
            X_batched, y_batched = create_batched_data(X_train, y_train, batch_size=4)
            clf.fit(X_batched, y_batched)
        y_pred = clf.predict(X_test)
        print(y_pred)
        acc = accuracy_score(y_test, y_pred)
        cv_results.append(acc)
        end = time.time()
        print(f'time for fold: {end - start}')
        # break

    print(ks)
    return cv_results

cv_scores = cv()
print(f'average cv score: {np.mean(cv_scores)}')

  0%|          | 12/5000 [00:00<00:43, 115.45it/s]

starting fold 0


100%|██████████| 5000/5000 [00:42<00:00, 118.71it/s]


best epoch: 336
last train accuracy: 0.8127340823970037
best tune accuracy: 0.8333333333333334


  0%|          | 15/5000 [00:00<00:35, 142.16it/s]

[0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 0]
time for fold: 43.482462882995605
starting fold 1


100%|██████████| 5000/5000 [00:35<00:00, 139.52it/s]


best epoch: 493
last train accuracy: 0.8523206751054853
best tune accuracy: 0.8333333333333334


  0%|          | 15/5000 [00:00<00:34, 143.68it/s]

[1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 0]
time for fold: 37.922394037246704
starting fold 2


100%|██████████| 5000/5000 [00:41<00:00, 119.13it/s]


best epoch: 574
last train accuracy: 0.759493670886076
best tune accuracy: 0.8333333333333334


  0%|          | 15/5000 [00:00<00:34, 143.77it/s]

[1 1 0 0 1 0 1 1 1 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 0 0 0 1 1 1]
time for fold: 44.2390398979187
starting fold 3


100%|██████████| 5000/5000 [00:39<00:00, 126.52it/s]


best epoch: 476
last train accuracy: 0.8396624472573839
best tune accuracy: 0.8333333333333334


  0%|          | 13/5000 [00:00<00:39, 127.18it/s]

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
time for fold: 41.660305976867676
starting fold 4


100%|██████████| 5000/5000 [00:35<00:00, 140.77it/s]


best epoch: 358
last train accuracy: 0.8607594936708861
best tune accuracy: 0.8333333333333334


  0%|          | 14/5000 [00:00<00:36, 137.50it/s]

[1 0 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 0 0 0 1 1 0 1]
time for fold: 36.98588418960571
starting fold 5


100%|██████████| 5000/5000 [00:35<00:00, 141.90it/s]


best epoch: 457
last train accuracy: 0.8649789029535865
best tune accuracy: 0.8333333333333334


  0%|          | 15/5000 [00:00<00:35, 142.29it/s]

[0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
time for fold: 37.04566192626953
starting fold 6


100%|██████████| 5000/5000 [53:14<00:00,  1.56it/s]  


best epoch: 697
last train accuracy: 0.8734177215189873
best tune accuracy: 0.8333333333333334


  0%|          | 10/5000 [00:00<00:54, 91.27it/s]

[0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 1 0 1 0 0 0 0]
time for fold: 3200.5784060955048
starting fold 7


 18%|█▊        | 888/5000 [00:08<00:28, 146.38it/s]

In [None]:
print(f'min cv score: {np.min(cv_scores)}')
print(f'max cv score: {np.max(cv_scores)}')

# before bugfix
# min cv score: 0.6551724137931034
# max cv score: 0.9333333333333333

min cv score: 0.6551724137931034
max cv score: 0.9333333333333333


# HW 3b

1. Repeat above but use one layer of 128 ReLU hidden units & backprop
2. Use Sigmoid as the activation function for the output
3. If maxEpochs = 10,000 runs too slowly, use 5,000 or 1,000
4. Optional: try #HUs in {16, 32, 64, 128, 256, 512, 1024, …, YouChooseMax} and use the best #HUs + stopping epoch on tune set per train-test fold
5. Compare to results discussed in HW3a

In [25]:
import numpy as np
import math

class NeuralNetwork:
    def __init__(self):
        kaiming_std = np.sqrt(2/13) # kaiming initialization
        self.hidden_weights = np.random.normal(0, kaiming_std, size=(128, 13))
        self.output_weights = np.random.normal(0, kaiming_std, size=128)
        self.relu = lambda x: np.maximum(x, 0) # vectorized function
        self.sigmoid = lambda x: 1 / (1 + math.exp(-x)) # scalar function

    def predict(self, examples: np.array):
        output = []
        for example in examples:
            example = np.append(example, 1)
            output.append(self.predict_one(example))
        return np.array(output)
    
    def predict_one(self, example: np.array):
        v = self.hidden_weights @ example
        v = self.relu(v)
        v = self.output_weights @ v
        return self.sigmoid(v)

clf = NeuralNetwork()
clf.predict_one(np.ones(13))

0.8993195841921252