<a href="https://colab.research.google.com/github/returaj/cs6910/blob/assginment1_akash/adagrad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
class adagrad_gd(object):
  """AdaGrad optimizer for a layered model.

  The wrapped model must expose ``weight`` and ``bias`` (lists of arrays,
  one entry per layer) and ``forward`` / ``backward`` methods with the
  call signatures used in ``optimize`` below.

  model:   the network whose parameters are updated in place
  alpha:   base learning rate
  epsilon: small constant added under the square root so the very first
           step never divides by zero
  """

  def __init__(self, model, alpha, epsilon=1e-6):
    self.model = model
    self.alpha = alpha
    self.epsilon = epsilon
    self.initialize()

  def initialize(self):
    """Reset the per-parameter squared-gradient accumulators to zero."""
    # gamma is not used by the AdaGrad update rule; the attribute is kept
    # only so that code reading ``optimizer.gamma`` keeps working.
    self.gamma = 0.9
    self.v_w = []
    self.v_b = []
    # Size the accumulators from the model this optimizer owns, not from
    # whatever global variable happens to be called ``model``.
    for w in self.model.weight:
      m, n = w.shape
      self.v_w.append(np.zeros((m, n)))
      self.v_b.append(np.zeros(n))

  def optimize(self, X, y):
    """
    Run one AdaGrad step on a mini-batch, updating the model in place.

    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    model = self.model
    layer_output = model.forward(X)
    dw, db = model.backward(X, y, layer_output)
    for l in range(len(model.weight)):
      # Accumulate the running sum of squared gradients ...
      self.v_w[l] = self.v_w[l] + np.power(dw[l], 2)
      self.v_b[l] = self.v_b[l] + np.power(db[l], 2)
      # ... and scale each parameter's step by 1/sqrt(accumulator + eps).
      model.weight[l] -= (self.alpha / np.sqrt(self.v_w[l] + self.epsilon)) * dw[l]
      model.bias[l] -= (self.alpha / np.sqrt(self.v_b[l] + self.epsilon)) * db[l]

  def error(self, X, y):
    """
    Return the mean negative log of the probability the model assigns to
    the label index of each sample.

    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    batch_size = X.shape[0]
    prob = self.model.forward(X)[-1]
    err = - np.sum(np.log(prob[np.arange(batch_size), y])) / batch_size
    return err

In [5]:
import numpy as np
from keras.datasets import fashion_mnist
##from optimizer import SGD


class FNN(object):
  """Fully connected feed-forward network.

  Hidden layers use a sigmoid activation and the output layer uses
  softmax. ``weight[t]`` has shape (fan_in, fan_out) and ``bias[t]`` has
  shape (fan_out,) for every layer ``t``.
  """

  def __init__(self, input_size, output_size, hidden_layers_size):
    self.input_size = input_size
    self.output_size = output_size
    self.weight, self.bias = None, None
    self.initialize(input_size, hidden_layers_size, output_size)

  def initialize(self, input_size, hidden_layers_size, output_size):
    """
    Create the weights (standard normal) and biases (zeros) of each layer.

    hidden_layers_size: sequence with the width of each hidden layer; it is
    only read, never modified.
    """
    self.weight, self.bias = [], []
    prev_layer_size = input_size
    # Build a private list of layer widths: appending to the argument
    # itself would leak the output layer into the caller's list.
    layer_sizes = list(hidden_layers_size) + [output_size]
    for curr_layer_size in layer_sizes:
      self.weight.append(np.random.normal(0, 1, size=(prev_layer_size, curr_layer_size)))
      self.bias.append(np.zeros(curr_layer_size))
      prev_layer_size = curr_layer_size

  def reset(self):
    """Re-draw every weight matrix and zero every bias, keeping the shapes."""
    num_layers = len(self.weight)
    for l in range(num_layers):
      m, n = self.weight[l].shape
      self.weight[l] = np.random.normal(0, 1, size=(m, n))
      self.bias[l] = np.zeros(n)

  @staticmethod
  def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x))."""
    return 1./(1+np.exp(-x))

  @staticmethod
  def softmax(x):
    """
    Row-wise softmax.

    x: (batch_size(B), data_size(N))
    """
    # Subtracting the row maximum leaves the result unchanged but keeps
    # exp() from overflowing on large inputs.
    x_max = np.max(x, axis=1, keepdims=True)
    exp_prob = np.exp(x - x_max)
    prob = exp_prob / np.sum(exp_prob, axis=1, keepdims=True)
    return prob

  def forward(self, X):
    """
    Return the list of every layer's activation; the last entry is the
    softmax output.

    X: (batch_size(B), data_size(N))
    """
    layer_output = []
    prev_layer = X
    num_hidden_layers = last_layer = len(self.weight) - 1
    for t in range(num_hidden_layers):
      w, b = self.weight[t], self.bias[t]
      next_layer = self.sigmoid(np.dot(prev_layer, w) + b)
      layer_output.append(next_layer)
      prev_layer = next_layer
    w, b = self.weight[last_layer], self.bias[last_layer]
    prob = self.softmax(np.dot(prev_layer, w) + b)
    layer_output.append(prob)
    return layer_output

  def backward(self, X, y, layer_output):
    """
    Back-propagate through the network and return ``(dw, db)``, two lists
    holding one gradient array per layer.

    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    layer_output: the list returned by ``forward(X)``
    """
    batch_size, _ = X.shape
    num_hidden_layers = last_layer = len(layer_output)-1
    dw, db = [None]*(num_hidden_layers+1), [None]*(num_hidden_layers+1)
    for t in range(num_hidden_layers, -1, -1):
      if t == last_layer:
        # Output layer: (prob - one_hot(y)) / batch_size. The division
        # creates a new array, so layer_output itself is not modified.
        dh = layer_output[t] / batch_size
        dh[np.arange(batch_size), y] -= 1/batch_size
      else:
        # Hidden layer: push the next layer's delta back through its
        # weights, then through the sigmoid derivative out * (1 - out).
        dh = np.dot(dh_fwd, self.weight[t+1].T) * layer_output[t] * (1-layer_output[t])
      prev_layer_output = X if t==0 else layer_output[t-1]
      dw[t] = np.dot(prev_layer_output.T, dh)
      db[t] = np.sum(dh, axis=0)
      dh_fwd = dh
    return dw, db


if __name__ == '__main__':
  # Train a small network on the first `consider` Fashion-MNIST images
  # with AdaGrad and print the training error after every epoch.
  (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
  consider = 1000
  # Flatten each image to a vector and scale the pixel values to [0, 1].
  X = x_train[:consider].reshape(consider, -1) / 255
  Y = y_train[:consider]
  batch_size, epochs = 16, 20
  model = FNN(784, 10, [50, 20])
  # Bind the instance to its own name: reusing the class name would
  # shadow the class and break the cell when it is run a second time.
  optimizer = adagrad_gd(model, 0.01)

  for ep in range(1, epochs+1):
    # Visit the samples in a fresh random order every epoch.
    ids = np.arange(consider)
    np.random.shuffle(ids)
    # Step through the shuffled ids in mini-batches; the final batch is
    # shorter when `consider` is not a multiple of `batch_size`.
    for start in range(0, consider, batch_size):
      batch = ids[start:start + batch_size]
      optimizer.optimize(X[batch], Y[batch])
    err = optimizer.error(X, Y)
    print(f'epoch: {ep}, error: {err}')


epoch: 1, error: 1.891393324809104
epoch: 2, error: 1.6731522557733212
epoch: 3, error: 1.5702276674305553
epoch: 4, error: 1.48864564292644
epoch: 5, error: 1.4167115269645232
epoch: 6, error: 1.358962937337205
epoch: 7, error: 1.3090137037016238
epoch: 8, error: 1.2633800540024367
epoch: 9, error: 1.2209330711161241
epoch: 10, error: 1.182332899285462
epoch: 11, error: 1.146646311810267
epoch: 12, error: 1.1138633569704761
epoch: 13, error: 1.085409581348194
epoch: 14, error: 1.059705682871445
epoch: 15, error: 1.035507148428007
epoch: 16, error: 1.0139363677363091
epoch: 17, error: 0.9942293573439184
epoch: 18, error: 0.9762299364129362
epoch: 19, error: 0.9591688632228595
epoch: 20, error: 0.9431457993973645
