<a href="https://colab.research.google.com/github/returaj/cs6910/blob/assginment1_akash/nesterov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
class nest_gd(object):
  def __init__(self, model, alpha):
    self.model = model
    self.alpha = alpha
    self.initialize()
  def initialize(self):
    self.prev_w=[]
    self.prev_b=[]
    num_layers = len(model.weight)
    for i in range(num_layers):
      m, n = self.model.weight[i].shape
      self.prev_w.append(np.zeros((m,n)))
      self.prev_b.append(np.zeros(n))
  def optimize(self, X, y):
    """
    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    gamma=0.9
    model = self.model
    v_w=[]
    v_b=[]
    num_layers = len(model.weight)
    layers=num_layers
    for j in range(num_layers):
      v_w.append(gamma*self.prev_w[j])
      v_b.append(gamma*self.prev_b[j])
    w=model.weight
    b=model.bias
    model.bias=[]
    model.weight=[]
    for k in range(num_layers):
      model.weight.append(w[k]-v_w[k])
      model.bias.append(b[k]-v_b[k])
    layer_output = model.forward(X)
    dw, db = model.backward(X, y, layer_output)
    for l in range(num_layers):
      v_w[l]=gamma*self.prev_w[l]+self.alpha*dw[l]
      v_b[l]=gamma*self.prev_b[l]+self.alpha*db[l]
      model.weight[l]-=v_w[l]
      model.bias[l]-=v_b[l]
      self.prev_w[l]=v_w[l]
      self.prev_b[l]=v_b[l]
  def error(self, X, y):
    """
    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    batch_size = X.shape[0]
    prob = self.model.forward(X)[-1]
    err = - np.sum(np.log(prob[np.arange(batch_size), y])) / batch_size
    return err

In [13]:
import numpy as np
from keras.datasets import fashion_mnist
##from optimizer import SGD


class FNN(object):
  def __init__(self, input_size, output_size, hidden_layers_size):
    self.input_size = input_size
    self.output_size = output_size
    self.weight, self.bias = None, None
    self.initialize(input_size, hidden_layers_size, output_size)

  def initialize(self, input_size, hidden_layers_size, output_size):
    self.weight, self.bias = [], []
    prev_layer_size = input_size
    hidden_layers_size.append(output_size)
    for curr_layer_size in hidden_layers_size:
      self.weight.append(np.random.normal(0, 1, size=(prev_layer_size, curr_layer_size)))
      self.bias.append(np.zeros(curr_layer_size))
      prev_layer_size = curr_layer_size

  def reset(self):
    num_layers = len(self.weight)
    for l in range(num_layers):
      m, n = self.weight[l].shape
      self.weight[l] = np.random.normal(0, 1, size=(m, n))
      self.bias[l] = np.zeros(n)

  @staticmethod
  def sigmoid(x):
    return 1./(1+np.exp(-x))

  @staticmethod
  def softmax(x):
    """
    x: (batch_size(B), data_size(N))
    """
    x_max = np.max(x, axis=1, keepdims=True)
    exp_prob = np.exp(x - x_max)
    prob = exp_prob / np.sum(exp_prob, axis=1, keepdims=True)
    return prob


  def forward(self, X):
    """
    X: (batch_size(B), data_size(N))
    """
    layer_output = []
    prev_layer = X
    num_hidden_layers = last_layer = len(self.weight) - 1
    for t in range(num_hidden_layers):
      w, b = self.weight[t], self.bias[t]
      next_layer = self.sigmoid(np.dot(prev_layer, w) + b)
      layer_output.append(next_layer)
      prev_layer = next_layer
    w, b = self.weight[last_layer], self.bias[last_layer]
    prob = self.softmax(np.dot(prev_layer, w) + b)
    layer_output.append(prob)
    return layer_output

  def backward(self, X, y, layer_output):
    """
    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    batch_size, _ = X.shape
    num_hidden_layers = last_layer = len(layer_output)-1
    dw, db = [None]*(num_hidden_layers+1), [None]*(num_hidden_layers+1)
    for t in range(num_hidden_layers, -1, -1):
      if t == last_layer:
        dh = layer_output[t] / batch_size
        dh[np.arange(batch_size), y] -= 1/batch_size
      else:
        dh = np.dot(dh_fwd, self.weight[t+1].T) * layer_output[t] * (1-layer_output[t])
      prev_layer_output = X if t==0 else layer_output[t-1]
      dw[t] = np.dot(prev_layer_output.T, dh)
      db[t] = np.sum(dh, axis=0)
      dh_fwd = dh
    return dw, db


if __name__ == '__main__':
  (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
  consider = 10000
  X = np.array([(x_train[i].flatten())/255 for i in range(consider)])
  Y = y_train[:consider]
  batch_size, epochs = 16, 20
  model = FNN(784, 10, [50, 20])
  ##sgd = SGD(model, 0.01)
  nest_gd=nest_gd(model,0.01)


  for ep in range(1, epochs+1):
    ids = np.arange(consider)
    np.random.shuffle(ids)
    start, end = 0, batch_size
    while end > start:
      x, y = X[ids[start:end]], Y[ids[start:end]]
      nest_gd.optimize(x, y)
      start, end = end, min(consider, end+batch_size)
    err = nest_gd.error(X, Y)
    print(f'epoch: {ep}, error: {err}')


epoch: 1, error: 0.9022863988199566
epoch: 2, error: 0.7375622299036497
epoch: 3, error: 0.6694436072901908
epoch: 4, error: 0.6225547693740252
epoch: 5, error: 0.5965219942049218
epoch: 6, error: 0.5574668815721634
epoch: 7, error: 0.5344249708474574
epoch: 8, error: 0.513837668279311
epoch: 9, error: 0.5077528854045938
epoch: 10, error: 0.48675290678395133
epoch: 11, error: 0.46501250599515215
epoch: 12, error: 0.4542538429194883
epoch: 13, error: 0.46618177659811233
epoch: 14, error: 0.43407716827659504
epoch: 15, error: 0.42445590752539036
epoch: 16, error: 0.41467349303222883
epoch: 17, error: 0.39938699829987157
epoch: 18, error: 0.39900238314553005
epoch: 19, error: 0.39569144214027663
epoch: 20, error: 0.37659968099900837
