# Online Predictive Coding
In this notebook, we implement the first version of our prototype for learning from varying feature spaces.

## Model Design
- add the plot we have on the desktop computer here.

In [71]:
import numpy as np

class error_module:
    def __init__(self, size, lr):
        self.w = np.zeros(size)
        self.lr = lr
        
    def predict(self, x):
        return np.dot(self.w, x)
    
    def update(self, x, y):
        yhat = self.predict(x)  # regression
        loss = 0.5 * (y - yhat)**2
        self.w += self.lr * (y - yhat)
        return loss
        
        
class classifier_module:
    def __init__(self, size, lr):
        self.w = np.zeros(size)
        self.lr = lr
        
    def predict(self, x):
        return np.dot(self.w, x)

    def update(self, x, y):
        loss = np.maximum(0, 1.0 - y * np.dot(self.w, x))
        if loss > 0: self.w += x * y * self.lr
        return loss


In [72]:
dataset_names = ["german", "ionosphere", "spambase", "magic", "a8a"]
root_path, extension = "./datasets/", "_numeric"


def get_path(name):
    '''returns a path pair to the preprocessed datasets
    X and y csv files.'''
    path = root_path + name + extension
    return path + "_X.csv", path + "_y.csv"


def read_dataset(X_path, y_path):
    '''reads and returns numpy arrays in a given pair of paths for 
    X and y.'''
    X = pd.read_csv(X_path).values
    y = pd.read_csv(y_path)['0'].values
    return X, y


def simulate_varying(X):  # multivariate normal distribution
    '''Get the data and generate a varying feature space pattern.
    Possible concerns: thresholding messing up the distribution?'''
    
    # create a covariance matrix
    cov = np.random.rand(num_features, num_features)
    cov = np.dot(cov, cov.transpose())  # to have a positive semi-definite matrix
    
    # create a mean vector
    mean = np.random.rand(len(X[0]))
    
    # sample from multivariate gaussian w/ given mean and cov
    spaces = np.random.multivariate_normal(mean, cov, len(X))
    
    # threshold samples for 1-hot encoding
    spaces[spaces < 0] = 0
    spaces[spaces != 0] = 1

    return spaces


def simulate_random_varying(X): # discrete uniform distribution
    matrix = np.random.randint(2, size=(len(X), len(X[0])))  
    return matrix

In [79]:
X_path, y_path = get_path("german")
X, y = read_dataset(X_path, y_path)
num_features = len(X[0])
folds = 20
learning_rate = 0.001

In [80]:
# multivariate gaussian mask with threshold 0

fold_error_rates = []
predictions = []
losses = []

for f in range(folds):
    error_count = 0
    
    # shuffle for each fold
    l = list(range(len(X)))
    np.random.shuffle(l)
    X, y = X[l], y[l]
    mask = simulate_varying(X)  # multivariate
    
    # initialize model
    #model = classifier_module(num_features, learning_rate)
    model = error_module(num_features, learning_rate)

    for i in range(len(X)):
        # predict and suffer
        yhat = model.predict(X[i] * mask[i])
        loss = model.update(X[i] * mask[i], y[i])
        
        # bookkeeping
        predictions.append(yhat)
        losses.append(loss)
        
        if np.sign(yhat) != y[i]:
            error_count += 1
        
    fold_error_rates.append(error_count/len(X))

print(learning_rate, np.mean(fold_error_rates))

0.001 0.3903
