<a href="https://colab.research.google.com/github/returaj/cs6910/blob/assginment1_akash/momentum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

momentum based minibatch stochastic gradient

In [3]:
class mgd(object):
  def __init__(self, model, alpha):
    self.model = model
    self.alpha = alpha
  def optimize(self, X, y):
    """
    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    gamma=0.9
    model = self.model
    layer_output = model.forward(X)
    dw, db = model.backward(X, y, layer_output)
    num_layers = len(model.weight)
    layers=num_layers
    prev_w=[]
    prev_b=[]
    v_w=[]
    v_b=[]
    for i in range(num_layers):
      m, n = model.weight[i].shape
      prev_w.append(np.zeros((m,n)))
      prev_b.append(np.zeros(n))
    for l in range(num_layers):
      v_w.append( gamma*prev_w[l]+self.alpha*dw[l])
      v_b.append(gamma*prev_b[l]+self.alpha*db[l])
      model.weight[l]-=v_w[l]
      model.bias[l]-=v_b[l]
      prev_w[l]=v_w[l]
      prev_b[l]=v_b[l]
  def error(self, X, y):
    """
    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    batch_size = X.shape[0]
    prob = self.model.forward(X)[-1]
    err = - np.sum(np.log(prob[np.arange(batch_size), y])) / batch_size
    return err

  

In [4]:
import numpy as np
from keras.datasets import fashion_mnist
##from optimizer import SGD


class FNN(object):
  def __init__(self, input_size, output_size, hidden_layers_size):
    self.input_size = input_size
    self.output_size = output_size
    self.weight, self.bias = None, None
    self.initialize(input_size, hidden_layers_size, output_size)

  def initialize(self, input_size, hidden_layers_size, output_size):
    self.weight, self.bias = [], []
    prev_layer_size = input_size
    hidden_layers_size.append(output_size)
    for curr_layer_size in hidden_layers_size:
      self.weight.append(np.random.normal(0, 1, size=(prev_layer_size, curr_layer_size)))
      self.bias.append(np.zeros(curr_layer_size))
      prev_layer_size = curr_layer_size

  def reset(self):
    num_layers = len(self.weight)
    for l in range(num_layers):
      m, n = self.weight[l].shape
      self.weight[l] = np.random.normal(0, 1, size=(m, n))
      self.bias[l] = np.zeros(n)

  @staticmethod
  def sigmoid(x):
    return 1./(1+np.exp(-x))

  @staticmethod
  def softmax(x):
    """
    x: (batch_size(B), data_size(N))
    """
    exp_prob = np.exp(x)
    prob = exp_prob / np.sum(exp_prob, axis=1, keepdims=True)
    return prob

  def forward(self, X):
    """
    X: (batch_size(B), data_size(N))
    """
    layer_output = []
    prev_layer = X
    num_hidden_layers = last_layer = len(self.weight) - 1
    for t in range(num_hidden_layers):
      w, b = self.weight[t], self.bias[t]
      next_layer = self.sigmoid(np.dot(prev_layer, w) + b)
      layer_output.append(next_layer)
      prev_layer = next_layer
    w, b = self.weight[last_layer], self.bias[last_layer]
    prob = self.softmax(np.dot(prev_layer, w) + b)
    layer_output.append(prob)
    return layer_output

  def backward(self, X, y, layer_output):
    """
    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    batch_size, _ = X.shape
    num_hidden_layers = last_layer = len(layer_output)-1
    dw, db = [None]*(num_hidden_layers+1), [None]*(num_hidden_layers+1)
    for t in range(num_hidden_layers, -1, -1):
      if t == last_layer:
        dh = layer_output[t] / batch_size
        dh[np.arange(batch_size), y] -= 1/batch_size
      else:
        dh = np.dot(dh_fwd, self.weight[t+1].T) * layer_output[t] * (1-layer_output[t])
      prev_layer_output = X if t==0 else layer_output[t-1]
      dw[t] = np.dot(prev_layer_output.T, dh)
      db[t] = np.sum(dh, axis=0)
      dh_fwd = dh
    return dw, db


if __name__ == '__main__':
  (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
  consider = 1000
  X = np.array([x_train[i].flatten() for i in range(consider)])
  Y = y_train[:consider]
  batch_size, epochs = 16, 20
  model = FNN(784, 10, [50, 20])
  ##sgd = SGD(model, 0.01)
  mgd=mgd(model,0.01)
  for ep in range(1, epochs+1):
    ids = np.arange(consider)
    np.random.shuffle(ids)
    start, end = 0, batch_size
    while end > start:
      x, y = X[ids[start:end]], Y[ids[start:end]]
      mgd.optimize(x, y)
      start, end = end, min(consider, end+batch_size)
    err = mgd.error(X, Y)
    print(f'epoch: {ep}, error: {err}')




epoch: 1, error: 3.334025413120928
epoch: 2, error: 2.928422100640473
epoch: 3, error: 2.7243637772563134
epoch: 4, error: 2.5419737636236004
epoch: 5, error: 2.406307523444947
epoch: 6, error: 2.3458551254621436
epoch: 7, error: 2.310097041666402
epoch: 8, error: 2.2609962143649907
epoch: 9, error: 2.2237388192393004
epoch: 10, error: 2.1899706720781515
epoch: 11, error: 2.1597585888732227
epoch: 12, error: 2.1409272145535034
epoch: 13, error: 2.0865662449001623
epoch: 14, error: 2.0717354390394065
epoch: 15, error: 2.0493594164758826
epoch: 16, error: 2.0166256740931354
epoch: 17, error: 1.9918191011995359
epoch: 18, error: 1.9652783860031127
epoch: 19, error: 1.9425971295840103
epoch: 20, error: 1.9158059960913894
