<a href="https://colab.research.google.com/github/returaj/cs6910/blob/assginment1_akash/NADAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
class NADAM(object):
  """NADAM-style optimizer for a model exposing `weight`, `bias`,
  `forward(X)` and `backward(X, y, layer_output)`.

  Keeps exponential moving averages of the gradients (`m_*`) and of the
  squared gradients (`v_*`) per layer, bias-corrects both, and applies a
  look-ahead blend of the corrected first moment with the current gradient.
  The model's parameters are updated in place.
  """

  def __init__(self, model, alpha, beta1=0.9, beta2=0.99, epsilon=0.0000001):
    """
    model: object with list attributes `weight` / `bias` and methods
           `forward` / `backward`
    alpha: learning rate
    beta1: decay rate of the first-moment (gradient) average
    beta2: decay rate of the second-moment (squared gradient) average
    epsilon: small constant added under the square root for stability
    """
    self.model = model
    self.alpha = alpha
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self.found = 0  # number of optimize() calls completed so far
    self.initialize()

  def initialize(self):
    """Allocate zero-filled optimizer state matching each layer's shape."""
    # Read shapes from self.model (not a module-level `model`) so the
    # optimizer works for whichever model instance it was given.
    shapes = [w.shape for w in self.model.weight]

    def weight_zeros():
      return [np.zeros((m, n)) for m, n in shapes]

    def bias_zeros():
      return [np.zeros(n) for _, n in shapes]

    # Raw second / first moment estimates.
    self.v_w, self.v_b = weight_zeros(), bias_zeros()
    self.m_w, self.m_b = weight_zeros(), bias_zeros()
    # Bias-corrected moments.
    self.m_w_hat, self.m_b_hat = weight_zeros(), bias_zeros()
    self.v_w_hat, self.v_b_hat = weight_zeros(), bias_zeros()
    # Look-ahead blend of the corrected first moment and the gradient.
    self.mw_cap, self.mb_cap = weight_zeros(), bias_zeros()

  def optimize(self, X, y):
    """
    Run one forward/backward pass on the batch and update the model in place.

    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    model = self.model
    layer_output = model.forward(X)
    dw, db = model.backward(X, y, layer_output)

    # Bias-correction factors depend only on the step count, so compute
    # them once instead of once per layer.
    step = self.found + 1
    m_scale = 1 / (1 - (self.beta1 ** step))
    v_scale = 1 / (1 - (self.beta2 ** step))

    for l in range(len(model.weight)):
      self.v_w[l] = self.beta2 * self.v_w[l] + (1 - self.beta2) * np.power(dw[l], 2)
      self.v_b[l] = self.beta2 * self.v_b[l] + (1 - self.beta2) * np.power(db[l], 2)
      self.m_w[l] = self.beta1 * self.m_w[l] + (1 - self.beta1) * dw[l]
      self.m_b[l] = self.beta1 * self.m_b[l] + (1 - self.beta1) * db[l]

      self.m_w_hat[l] = m_scale * self.m_w[l]
      self.m_b_hat[l] = m_scale * self.m_b[l]
      self.v_w_hat[l] = v_scale * self.v_w[l]
      self.v_b_hat[l] = v_scale * self.v_b[l]

      # NOTE(review): published Nadam formulations also divide the raw
      # gradient term by (1 - beta1**t) and add epsilon outside the square
      # root; the arithmetic below is kept as-is so training results do
      # not change. Confirm which variant is intended.
      self.mw_cap[l] = self.beta1 * self.m_w_hat[l] + (1 - self.beta1) * dw[l]
      self.mb_cap[l] = self.beta1 * self.m_b_hat[l] + (1 - self.beta1) * db[l]

      model.weight[l] -= (self.alpha / np.sqrt(self.v_w_hat[l] + self.epsilon)) * self.mw_cap[l]
      model.bias[l] -= (self.alpha / np.sqrt(self.v_b_hat[l] + self.epsilon)) * self.mb_cap[l]
    self.found = step

  def error(self, X, y):
    """
    Mean negative log-probability the model assigns to the true labels.

    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    batch_size = X.shape[0]
    prob = self.model.forward(X)[-1]
    err = - np.sum(np.log(prob[np.arange(batch_size), y])) / batch_size
    return err

In [2]:
import numpy as np
from keras.datasets import fashion_mnist
##from optimizer import SGD


class FNN(object):
  """Fully connected feed-forward network with sigmoid hidden layers and a
  softmax output layer.

  `weight[t]` has shape (inputs_to_layer_t, units_in_layer_t) and `bias[t]`
  has shape (units_in_layer_t,). `reg` is stored but not applied anywhere:
  the regularisation terms in `backward` and `error` are commented out.
  """

  def __init__(self, input_size, output_size, hidden_layers_size, reg=0.001):
    """
    input_size: number of input features
    output_size: number of output classes
    hidden_layers_size: sequence with the width of each hidden layer
    reg: regularisation strength (currently unused by the computations)
    """
    self.input_size = input_size
    self.output_size = output_size
    self.weight, self.bias = None, None
    self.initialize(input_size, hidden_layers_size, output_size)
    self.reg = reg

  def initialize(self, input_size, hidden_layers_size, output_size):
    """Create N(0, 1) weights and zero biases for every layer."""
    self.weight, self.bias = [], []
    # Build a private list of layer widths; appending to the caller's
    # `hidden_layers_size` would leak the output layer into their list.
    layer_sizes = list(hidden_layers_size) + [output_size]
    prev_layer_size = input_size
    for curr_layer_size in layer_sizes:
      self.weight.append(np.random.normal(0, 1, size=(prev_layer_size, curr_layer_size)))
      self.bias.append(np.zeros(curr_layer_size))
      prev_layer_size = curr_layer_size

  def reset(self):
    """Re-draw all weights from N(0, 1) and zero all biases, keeping shapes."""
    for l, w in enumerate(self.weight):
      m, n = w.shape
      self.weight[l] = np.random.normal(0, 1, size=(m, n))
      self.bias[l] = np.zeros(n)

  @staticmethod
  def sigmoid(x):
    """Element-wise logistic function."""
    return 1./(1+np.exp(-x))

  @staticmethod
  def softmax(x):
    """
    Row-wise softmax.

    x: (batch_size(B), data_size(N))
    """
    # Subtract the row maximum before exponentiating to avoid overflow.
    x_max = np.max(x, axis=1, keepdims=True)
    exp_prob = np.exp(x - x_max)
    prob = exp_prob / np.sum(exp_prob, axis=1, keepdims=True)
    return prob

  def forward(self, X):
    """
    Return the list of activations of every layer; the last entry holds the
    softmax class probabilities.

    X: (batch_size(B), data_size(N))
    """
    layer_output = []
    prev_layer = X
    num_hidden_layers = last_layer = len(self.weight) - 1
    for t in range(num_hidden_layers):
      w, b = self.weight[t], self.bias[t]
      next_layer = self.sigmoid(np.dot(prev_layer, w) + b)
      layer_output.append(next_layer)
      prev_layer = next_layer
    w, b = self.weight[last_layer], self.bias[last_layer]
    prob = self.softmax(np.dot(prev_layer, w) + b)
    layer_output.append(prob)
    return layer_output

  def backward(self, X, y, layer_output):
    """
    Back-propagate the mean cross-entropy loss and return `(dw, db)`, lists
    of per-layer gradients for the weights and biases.

    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    layer_output: activations returned by `forward(X)`
    """
    batch_size, _ = X.shape
    num_hidden_layers = last_layer = len(layer_output)-1
    dw, db = [None]*(num_hidden_layers+1), [None]*(num_hidden_layers+1)
    for t in range(num_hidden_layers, -1, -1):
      if t == last_layer:
        # Softmax + cross-entropy: gradient w.r.t. the pre-activation is
        # (prob - one_hot(y)) / batch_size. The division creates a new
        # array, so `layer_output` itself is not modified.
        dh = layer_output[t] / batch_size
        dh[np.arange(batch_size), y] -= 1/batch_size
      else:
        # Propagate through the next layer's weights, then through the
        # sigmoid derivative a * (1 - a).
        dh = np.dot(dh_fwd, self.weight[t+1].T) * layer_output[t] * (1-layer_output[t])
      prev_layer_output = X if t==0 else layer_output[t-1]
      dw[t] = np.dot(prev_layer_output.T, dh)
      # dw[t] = np.dot(prev_layer_output.T, dh) + self.reg*self.weight[t]
      db[t] = np.sum(dh, axis=0)
      dh_fwd = dh
    return dw, db

  def error(self, X, y):
    """
    Mean negative log-probability assigned to the true labels.

    X: (batch_size(B), data_size(N))
    y: (batch_size(B))
    """
    batch_size = X.shape[0]
    prob = self.forward(X)[-1]
    err = - np.sum(np.log(prob[np.arange(batch_size), y])) / batch_size
    # for w in self.weight:
    #   err += 0.5 * self.reg * np.sum(np.power(w,2))
    return err


# if __name__ == '__main__':
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
consider = 10000  # number of training images used
# Flatten each image to a row vector and divide the pixel values by 255,
# in one vectorized step instead of a per-image Python loop.
X = x_train[:consider].reshape(consider, -1) / 255
Y = y_train[:consider]
batch_size, epochs = 16, 20
model = FNN(784, 10, [50, 20])
##sgd = SGD(model, 0.01)
# Use a distinct name for the instance so the NADAM class is not shadowed.
optimizer = NADAM(model, 0.0001)


for ep in range(1, epochs+1):
  # Visit the samples in a fresh random order every epoch.
  ids = np.arange(consider)
  np.random.shuffle(ids)
  for start in range(0, consider, batch_size):
    # Slicing past the end is safe: the last batch is simply shorter.
    batch_ids = ids[start:start+batch_size]
    optimizer.optimize(X[batch_ids], Y[batch_ids])
  # print(model.weight[0])
  err = optimizer.error(X, Y)
  print(f'epoch: {ep}, error: {err}')


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
epoch: 1, error: 2.815581300642716
epoch: 2, error: 2.2039960443389863
epoch: 3, error: 1.8399932342971297
epoch: 4, error: 1.6273174956360354
epoch: 5, error: 1.4747348506262705
epoch: 6, error: 1.3588426840754724
epoch: 7, error: 1.2665572882317733
epoch: 8, error: 1.1917473062823982
epoch: 9, error: 1.1279871964211157
epoch: 10, error: 1.0717075881650653
epoch: 11, error: 1.022238320426339
epoch: 12, error: 0.9779592715017479
epoch: 13, error: 0.9398399162238704
epoch: 14, error: 0.9062665476048284
epoch: 15, error: 0.876704999813121
epoch: 16, error: 0.850