In [1]:
import numpy as np

In [31]:
# activation functions and their derivatives
class ReLU:
  def activation(self,x):
    return np.maximum(0,x)

  def derivative(self,x):
    return np.where(x > 0, 1, 0)


class Tanh:
  def activation(self,x):
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

  def derivative(self, x):
    return 1 - np.square(self.activation(x))


class LeakyReLU:
  def __init__(self, y):
    self.gamma = y

  def activation(self, x):
    return np.maximum(0,x) + self.gamma*np.minimum(0,x)

  def derivative(self, x):
    return np.where(x > 0, 1, self.gamma)


class Softmax:
  def activation(self,x):
    sum = np.sum(np.exp(x), axis=1, keepdims=True) # should work for a matrix now
    return np.exp(x) / sum

  def derivative(self,x): #assumes that x is already a softmax vector
    n = x.shape[-1]
    jacobian = np.zeros((n,n))
    for i in range(0,n):
      for j in range(0,n):
        if i == j:
          jacobian[i,j] = x[i] * (1-x[i])
        else:
          jacobian[i,j] = -x[i] * x[j]
    return jacobian

class MultiClassLoss:
  def loss(self, pred, true): # assumes pred and true are vectors of the same size
    return -np.sum(true * np.log(pred))

  def derivative(self, pred, true): #returns the partial derivatives w.r.t. all the class probabilies
    return -true / pred

In [75]:
class MLPBackpropagation:

    """
    Parameters:
    hidden_sizes = an int list that describes the number of units in each hidden layer
    inner_activation = an object that will be the activation function for all hidden layers
    final_activation = an object that will be the activation function for the output
    loss = an object that represents the loss function
    """
    def __init__(self, hidden_sizes, inner_activation, final_activation, loss, bias = True):
        self.hidden_sizes = hidden_sizes
        self.w_matrices = [None]
        self.biases = [None]

        for k in range(len(hidden_sizes)-1): #initialize V later when we recieved x
          w = np.random.randn(hidden_sizes[k+1], hidden_sizes[k]) * .01   #intialize with random gaussian noise
          self.w_matrices.append(w)

          if (bias):
            b = np.ones((1,hidden_sizes[k+1])) # keep separate for easier gradient calulation
            self.biases.append(b)

        # intialize the inner activation and the last activation (both are objects)
        self.inner_fn = inner_activation
        self.outer_fn = final_activation

        #need to intialize what is the loss function
        self.loss_fn = loss

    def forward(self, x):
      self.hidden_units = []
      self.activated_units = []

      units = x.reshape(-1, 1)

      #go through the hidden layers
      for i in range(len(self.w_matrices) - 1):
        # linear transformation on input
        units = np.dot(self.w_matrices[i], units).T + self.biases[i]
        units = units.reshape(-1, 1)
        self.hidden_units.append(units.copy())

        # activate
        units = self.inner_fn.activation(units)
        self.activated_units.append(units.copy())

      #produce the prediction
      y = np.dot(self.w_matrices[-1], units) + self.biases[-1].reshape(-1,1)
      self.hidden_units.append(y.copy())

      y = self.outer_fn.activation(y)
      self.activated_units.append(y.copy())

      return y


    def backward(self, x, pred_y, true_y): # assumes we are given just 1 instance
      # calculate the loss w.r.t. the output
      dy_hat = self.loss_fn.derivative(pred_y,true_y)

      # calculate the loss w.r.t. the outer activation
      if (isinstance(self.outer_fn, Softmax)):
        # simplify the formula so we dont need to calculate the jacobian
        da_list = [(pred_y.reshape(-1, 1) - true_y.reshape(-1, 1)).T]
      else:
        da_list = [np.dot(dy_hat,self.outer_fn.derivative(self.activated_units[-1])).T]

      # check if the model has no hidden layers
      if (len(self.w_matrices) == 1):
        dw_list = [np.dot(x.reshape(-1,1), da_list[0]).T]
        db_list = [da_list[0]]
        return dw_list, db_list

      # calculate for the outer layer -> y = fn(a) = f(Wc + b) = f(W * g(d) + b) = ....
      dw_list = [np.dot(self.activated_units[-2], da_list[0]).T]
      db_list = [da_list[0]]

      # follow a similar formula for the hidden layers
      tot = len(self.activated_units)
      for i in range(1, tot - 1):
        da = np.dot(da_list[-1], self.w_matrices[tot-i])
        da = self.inner_fn.derivative(self.activated_units[tot-i-1]).T * da
        da_list.append(da)
        dw_list.append(np.dot(self.activated_units[tot-i-2], da).T)
        db_list.append(da)

      #repeat one more time for the input layer
      da = np.dot(da_list[-1], self.w_matrices[1])
      da = self.inner_fn.derivative(self.activated_units[0]).T * da
      dw_list.append(np.dot(x.reshape(-1,1),da).T)
      db_list.append(da)

      return dw_list, db_list

    def fit(self, X, Y, epoch, learning_rate = 0.05, testX = None, testY = None):
      features = X.shape[-1]
      classes = Y.shape[-1]

      # initalize the input weight matrix and bias
      if (self.hidden_sizes != []):
        self.w_matrices[0] = np.random.randn(self.hidden_sizes[0], features) * .01
        self.biases[0] = np.ones((1,self.hidden_sizes[0]))

        # intialize the output weight matrix
        self.w_matrices.append(np.random.randn(classes, self.hidden_sizes[-1]) * .01)
        self.biases.append(np.ones((1,classes)))

      else: # no hidden layers
        self.w_matrices[0] = np.random.randn(classes, features) * .01
        self.biases[0] = np.ones((1,classes))


      # tranning setup
      self.train_acc = []
      self.test_acc = []
      matrices = len(self.w_matrices)
      evaluate = testX != None and testY != None

      if (X.ndim == 1):
        instances = 1
      else:
        instances = X.shape[0]

      # SDG
      for i in range(epoch):
        for j in range(instances):

          # get gradient for each instance
          if (X.ndim == 1):
            pred = self.forward(X).flatten()
            grad_w, grad_b = self.backward(X, pred.reshape(-1,1), Y)
          else:
            pred = self.forward(X[j,:]).flatten()
            grad_w, grad_b = self.backward(X[j,:], pred.reshape(-1,1), Y[j,:])

          #update the weight backwards due to how grad_w and grad_b are stored
          for k in range(matrices):
            self.w_matrices[k] -= learning_rate * grad_w[matrices-1-k]
            self.biases[k] -= learning_rate * grad_b[matrices-1-k]

        # calculate performance per epoch
        if (evaluate):
          train_result = self.predict(X)
          test_result = self.predict(testX)
          self.train_acc.append(self.evaluate_acc(train_result, Y))
          self.test_acc.append(self.evaluate_acc(test_result, testY))

    def evaluate_acc(self, pred, true):
      # assumes model is doing mulit-classification where classes = 1,2,...,k
      correct = 0
      total = len(pred)
      for i in range(total):
        # determine which class is chosen
        c = np.argmax(pred[i])
        t = np.argmax(true[i])

        # check true label
        if c == t:
          correct += 1
      return correct / total


    def predict(self, X):
      if (X.ndim == 1):
        return self.forward(X)
      else:
        # need to do more than 1 forward pass
        predictions = np.zeros(X.shape)

        for i in range(X.shape[0]):
          p = self.forward(X[i,:]).flatten()
          print(p.shape)
          print(predictions[i,:].shape)
          predictions[i,:] = self.forward(X[i,:]).flatten()

        return predictions


In [77]:
# example usage:
fn = ReLU()
g = Softmax()
loss = MultiClassLoss()
model = MLPBackpropagation([3,4], fn, g, loss)

x = np.array([[1,2], [3,2]])
y = np.array([[0,1], [0,1]])
model.fit(x,y,4)

print(model.w_matrices)
print(model.biases)





[array([[0.02737603, 0.01838725],
       [0.0119414 , 0.03444603],
       [0.01934491, 0.01441589]]), array([[0.07962928, 0.07754506, 0.08036094],
       [0.0601658 , 0.05521002, 0.08625459],
       [0.08424674, 0.07830913, 0.07799915],
       [0.07557669, 0.08331489, 0.08377116]]), array([[-0.43307044, -0.418664  , -0.43611015, -0.44512337],
       [-0.01564098, -0.00394903,  0.00196755,  0.00162914]])]
[array([[1.00879205, 1.00848064, 1.01124931]]), array([[1.0718843 , 1.06913304, 1.07288489, 1.07653005]]), array([[0.6, 1. ]])]
