<a href="https://colab.research.google.com/github/BNarayanaReddy/CS7015/blob/main/Chapter5_Moment_NAG_Adagrad_RMS_ADAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [112]:
height = np.random.normal(loc = 168, scale = 10, size=500) # average 168, deviation 10, size 50
weight = np.random.normal(loc = 68, scale = 5, size=500)

In [113]:
bmi = weight / (height/100)**2

In [114]:
labels = (bmi >= 25).astype(int) # obese = 1 else 0

### Network - Multi Label Classification
Input (shape: 50, 2) \
      || \
Layer1 (Neurons = 5) \
|| \
Layer 2 (Neurons = 5) \
|| \
Output Layer (Neurons = 1)

In [115]:
height = height.reshape(-1, 1)
weight = weight.reshape(-1, 1)

In [116]:
X = np.concatenate([height/np.max(height), weight/np.max(weight)], axis = 1)

In [117]:
Y = np.copy(labels).reshape(1,-1)

In [118]:
X[0]

array([0.92143501, 0.78773796])

In [119]:
Y.shape

(1, 500)

In [120]:
# weights = [  [<Weights of layer 1, unit 1>, <Weights of layer 1, unit 2>...], [<Weights of layer 2, unit 1>, <Weights of layer 2, unit 2>...]        ]

In [121]:
def initialize_weights(units, layers = 2, input_dim = 2):
  weights = {}
  biases = {}
  # Hidden layers
  for layer in range(1, layers+1):
    if layer == 1:
      weights[layer] = np.random.randn(units, input_dim) * 0.5
      biases[layer] = np.random.randn(1, units) * 0.5
    else:
      weights[layer] = np.random.randn(units, units) * 0.5
      biases[layer] = np.random.randn(1, units) * 0.5

  # Output layer
  weights[layers+1] = np.random.randn(1, units) * 0.5
  biases[layers+1] = np.random.randn(1, 1) * 0.5

  return weights, biases

In [122]:
weights, biases = initialize_weights(5, 2)

In [123]:
weights[1].shape, weights[2].shape, weights[3].shape, biases[1].shape, biases[2].shape, biases[3].shape

((5, 2), (5, 5), (1, 5), (1, 5), (1, 5), (1, 1))

In [124]:
def activation(z, act_fn):
  if act_fn == 'relu':
    return np.maximum(0,z)
  if act_fn == 'sigmoid':
    return 1/(1+np.exp(-z))

In [125]:
def forward_prop(X, weights, biases, hidden_activation='relu', output_activation = 'sigmoid'):
  # m, n = X.shape
  a_op = {}
  h_op = {}
  h_op[0] = X
  # hidden
  for i in range(1, len(weights)):
    w = weights[i]
    b = biases[i]
    a_op[i] = np.matmul(h_op[i-1], w.T) + b
    h_op[i] = activation(a_op[i], hidden_activation)
  # output
  a_op[len(weights)] = np.dot(weights[len(weights)], h_op[len(weights)-1].T) + biases[len(weights)]
  h_op[len(weights)] = activation(a_op[len(weights)], output_activation)

  return a_op, h_op

In [126]:
a_op, h_op = forward_prop(X, weights, biases)

In [127]:
def compute_activation_gradient(a, activ_fn):
  grad = np.zeros(a.shape)
  if activ_fn == 'relu':
    grad[a > 0] = 1
  if activ_fn == 'sigmoid':
    return a*(1-a)
  return grad

In [128]:
def backpropagation(X, Y, weights, biases, y_pred, output_activation='sigmoid', hidden_activation='relu'):
  a_op, h_op = y_pred
  op_layer = 3
  output_gradient = h_op[op_layer] - Y

  grad_w = {}
  grad_b = {}

  for layer in range(op_layer, 0, -1):
    # print("Layer: ", layer)
    grad_w[layer] = np.dot(output_gradient, h_op[layer-1])
    grad_b[layer] = np.sum(output_gradient, axis=1) # 1, 10

    hidden_grad = np.dot(weights[layer].T, output_gradient)

    prev_op = a_op[layer]
    aggregate_grad = hidden_grad * compute_activation_gradient(h_op[layer-1].T, hidden_activation)


    output_gradient = aggregate_grad

  return grad_w, grad_b

In [129]:
def compute_cost(Y_pred, Y):
  return np.mean(-Y*np.log10(Y_pred + 1e-8)-(1-Y)*np.log10(1-Y_pred + 1e-8))

In [130]:
def vanilla_gd(X, Y, weights, biases, epochs = 100, lr = 1e-3, output_activation='sigmoid', hidden_activation='relu'):
  for epoch in range(epochs):
    y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)

    grad_w, grad_b = backpropagation(X, Y, weights, biases, y_pred, output_activation, hidden_activation)

    for layer in range(1, len(weights)+1):
      weights[layer] -= lr*grad_w[layer]
      # print(grad_b[layer].shape)
      biases[layer] -= lr*grad_b[layer]

  y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)
  gd_cost = compute_cost(y_pred[1][3], Y)
  return weights, biases, gd_cost

In [131]:
weights, biases = initialize_weights(5, 2)
vanilla_gd(X, Y, weights, biases)

({1: array([[ 0.43698617, -0.07581409],
         [ 0.65381624, -0.06136731],
         [-1.01563952,  0.07041402],
         [-0.58471875, -0.08562356],
         [ 0.04171964,  0.36163176]]),
  2: array([[-0.20219152, -0.72614578, -0.30531518, -0.00924112,  0.18379936],
         [ 0.30440054,  0.46828597, -0.06115064, -0.32691393, -0.36912133],
         [-0.94820832, -0.53875948, -0.17020645, -0.00293365,  0.14781074],
         [-0.32083909,  0.77151213, -1.08870002, -0.28345569, -0.83709831],
         [-0.91551348,  0.26636017,  0.41453549,  0.10179388, -0.23694052]]),
  3: array([[ 0.2727478 ,  0.07895604, -0.05008925,  0.01951893,  0.18592804]])},
 {1: array([[ 0.76741771,  0.52582109,  0.0968249 , -0.05451993,  1.07737662]]),
  2: array([[ 0.15816034, -0.36983766, -0.83614025,  0.40669893, -0.39424499]]),
  3: array([[-0.49804249]])},
 np.float64(0.28797047169854556))

In [132]:
def moment_gd(X, Y, weights, biases, epochs = 100, lr = 1e-3, output_activation='sigmoid', hidden_activation='relu', gamma = 0.98):
  for epoch in range(epochs):
    y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)

    grad_w, grad_b = backpropagation(X, Y, weights, biases, y_pred, output_activation, hidden_activation)


    update_w = {}
    update_b = {}

    for layer in range(1, len(weights)+1):
      update_w[layer] = np.zeros((weights[layer].shape))
      update_b[layer] = np.zeros((biases[layer].shape))

    for layer in range(1, len(weights)+1):

      update_w[layer] = (gamma*update_w[layer]) + lr*grad_w[layer]
      update_b[layer] = (gamma*update_b[layer]) + lr*grad_b[layer]

      weights[layer] -= update_w[layer]
      biases[layer] -= update_b[layer]

  y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)
  mgd_cost = compute_cost(y_pred[1][3], Y)
  return weights, biases, mgd_cost

In [133]:
weights, biases = initialize_weights(5, 2)
moment_gd(X, Y, weights, biases, gamma = 0.99)

({1: array([[-0.4301708 , -0.55158947],
         [ 0.37868323,  0.13363311],
         [ 1.14082322,  0.30786474],
         [ 0.05590133,  0.0542555 ],
         [-0.12511918,  0.1090524 ]]),
  2: array([[ 0.47443898, -0.23419657,  0.07998146,  0.77726405, -0.63848996],
         [-0.04783648, -0.64972155,  0.23788968, -0.7751783 ,  0.6997581 ],
         [ 1.32025805, -0.01048379,  0.69633826,  0.0693517 ,  0.15249587],
         [-0.21312267, -0.32422692,  0.21684579, -0.53144651, -0.05994411],
         [-1.33016147, -0.12086501, -0.68811105, -0.44488353,  0.04638416]]),
  3: array([[-0.98233767,  0.16582054, -0.22151506,  0.3348588 ,  0.3152293 ]])},
 {1: array([[-0.47869285,  0.10894568, -0.2701648 ,  0.14631273, -0.20568637]]),
  2: array([[-0.46452029, -0.95221885,  0.17964414, -0.33193775, -0.07395891]]),
  3: array([[-0.32926668]])},
 np.float64(0.2869972484639911))

In [134]:
def nesterov_gd(X, Y, weights, biases, epochs = 100, lr = 1e-3, output_activation='sigmoid', hidden_activation='relu', gamma = 0.98):
  for epoch in range(epochs):
    y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)

    update_w = {}
    update_b = {}
    w_lookahead = {}
    b_lookahead = {}
    for layer in range(1, len(weights)+1):
      update_w[layer] = np.zeros((weights[layer].shape))
      update_b[layer] = np.zeros((biases[layer].shape))


    for layer in range(1, len(weights)+1):
      w_lookahead[layer] = weights[layer] - gamma*update_w[layer]
      b_lookahead[layer] = biases[layer] - gamma*update_b[layer]


    grad_w, grad_b = backpropagation(X, Y, w_lookahead, b_lookahead, y_pred, output_activation, hidden_activation)

    for layer in range(1, len(weights)+1):
      update_w[layer] = (gamma*update_w[layer]) + lr*grad_w[layer]
      update_b[layer] = (gamma*update_b[layer]) + lr*grad_b[layer]

      weights[layer] -= update_w[layer] + lr*grad_w[layer]
      biases[layer] -= update_b[layer] + lr*grad_b[layer]

  y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)
  mgd_cost = compute_cost(y_pred[1][3], Y)
  return weights, biases, mgd_cost

In [135]:
weights, biases = initialize_weights(5, 2)
nesterov_gd(X, Y, weights, biases, gamma = 0.99)

({1: array([[ 0.32716012,  0.12732854],
         [ 1.71993214, -0.99959609],
         [-0.14650644, -1.08317801],
         [ 0.19497428, -0.67370671],
         [-0.28660563,  0.3036672 ]]),
  2: array([[-0.12420708, -0.6672138 ,  0.44149706,  0.37627987,  0.15834825],
         [ 0.03028567, -1.19195039,  0.79172105,  0.07388626, -0.00695355],
         [ 0.29206552,  1.04236548, -0.3714885 , -0.49928835, -0.59093016],
         [ 0.17557186, -0.50279375, -0.48661442, -0.09009045, -0.09720017],
         [-0.25552899,  0.11931919, -0.50446871,  0.85182566,  0.04728329]]),
  3: array([[ 0.61890209,  0.9145727 , -0.86258011, -0.99250901, -0.13594265]])},
 {1: array([[ 0.03885869,  0.35335083, -0.09083193, -0.19514739,  0.13772654]]),
  2: array([[ 0.4345529 ,  0.13090139,  0.06790481, -0.31575656,  0.40818658]]),
  3: array([[-0.52806561]])},
 np.float64(0.32692971326014914))

In [136]:
def adagrad(X, Y, weights, biases, epochs = 100, lr = 1e-3, output_activation='sigmoid', hidden_activation='relu'):
  for epoch in range(epochs):
    y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)

    update_w = {}
    update_b = {}

    for layer in range(1, len(weights)+1):
      update_w[layer] = np.zeros((weights[layer].shape))
      update_b[layer] = np.zeros((biases[layer].shape))


    grad_w, grad_b = backpropagation(X, Y, weights, biases, y_pred, output_activation, hidden_activation)

    for layer in range(1, len(weights)+1):
      update_w[layer] += (grad_w[layer])**2
      update_b[layer] += (grad_b[layer])**2

      weights[layer] -= (lr/((update_w[layer] + 1e-8)**0.5)) * grad_w[layer]
      biases[layer] -= (lr/((update_b[layer] + 1e-8)**0.5)) * grad_b[layer]

  y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)
  adagrad_cost = compute_cost(y_pred[1][3], Y)
  return weights, biases, adagrad_cost

In [137]:
weights, biases = initialize_weights(5, 2)
adagrad(X, Y, weights, biases)

({1: array([[-0.24714276,  0.04486096],
         [ 0.47431511,  0.3406276 ],
         [ 0.18579491, -0.66803038],
         [ 0.35642939,  0.19006158],
         [ 0.10550974, -0.33215126]]),
  2: array([[-0.30564955, -0.2428932 ,  0.88481615,  0.88593697,  0.02598832],
         [-0.66590332,  0.59485525,  0.1485907 , -0.42371483,  0.46601335],
         [-0.8822107 , -0.47016569, -0.30239002, -0.7136859 ,  0.08422289],
         [-0.20657334, -0.34753255,  0.34640142,  0.44832596,  0.29316841],
         [ 0.52863577,  0.19090321, -0.05828584, -0.12014932, -0.56198489]]),
  3: array([[-0.17917926, -1.11243024, -0.68306875,  0.48032388, -0.77428947]])},
 {1: array([[0.18235215, 0.64262175, 0.45484002, 0.19914385, 0.08790305]]),
  2: array([[ 0.38490212,  0.17196445, -0.55862831, -0.01584433, -0.6116621 ]]),
  3: array([[1.04542321]])},
 np.float64(0.3071612581418124))

In [138]:
def rmsprop(X, Y, weights, biases, epochs = 100, lr = 1e-3, output_activation='sigmoid', hidden_activation='relu', beta = 0.9):
  for epoch in range(epochs):
    y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)

    update_w = {}
    update_b = {}

    for layer in range(1, len(weights)+1):
      update_w[layer] = np.zeros((weights[layer].shape))
      update_b[layer] = np.zeros((biases[layer].shape))


    grad_w, grad_b = backpropagation(X, Y, weights, biases, y_pred, output_activation, hidden_activation)

    for layer in range(1, len(weights)+1):
      update_w[layer] = beta*update_w[layer] + (1-beta)*(grad_w[layer])**2
      update_b[layer] = beta*update_b[layer] + (1-beta)*(grad_b[layer])**2

      weights[layer] -= (lr/((update_w[layer] + 1e-8)**0.5)) * grad_w[layer]
      biases[layer] -= (lr/((update_b[layer] + 1e-8)**0.5)) * grad_b[layer]

  y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)
  rmsprop_cost = compute_cost(y_pred[1][3], Y)
  return weights, biases, rmsprop_cost

In [139]:
weights, biases = initialize_weights(5, 2)
rmsprop(X, Y, weights, biases, beta = 0.99)

({1: array([[ 8.11074857e-02, -8.40230836e-01],
         [-1.01824732e+00,  7.17165934e-01],
         [ 1.35179868e+00, -1.16162592e+00],
         [-8.07099957e-01, -2.75056631e-01],
         [-4.27292892e-04, -2.90621713e-01]]),
  2: array([[-0.50529966,  0.55448097,  1.14623451,  0.94424462, -0.29454386],
         [-0.00153593,  1.12934986, -1.07924576,  0.00726724,  0.28518244],
         [-0.107936  , -0.77886482,  0.29215288,  0.4478823 ,  0.10839842],
         [-0.67181038,  1.20500197, -1.21965356,  0.07379449, -0.64140427],
         [-0.45626023,  0.00663744, -0.31320651,  0.80786187,  0.42224474]]),
  3: array([[ 0.50758493,  1.10217295, -0.32353367,  1.68657993,  0.03872414]])},
 {1: array([[-0.52884034,  0.48751937, -0.00062787, -0.44264814, -0.18266504]]),
  2: array([[-0.65272271,  0.25237377, -0.32131012,  0.25332547, -0.43918043]]),
  3: array([[-1.08568802]])},
 np.float64(0.2216368953353209))

In [140]:
def ADAM(X, Y, weights, biases, epochs = 100, lr = 1e-3, output_activation='sigmoid', hidden_activation='relu', beta1 = 0.9, beta2 = 0.99):
  for epoch in range(epochs):
    y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)

    lr_w = {}
    lr_b = {}

    moment_w = {}
    moment_b = {}

    for layer in range(1, len(weights)+1):
      moment_w[layer] = np.zeros((weights[layer].shape))
      moment_b[layer] = np.zeros((biases[layer].shape))

    for layer in range(1, len(weights)+1):
      lr_w[layer] = np.zeros((weights[layer].shape))
      lr_b[layer] = np.zeros((biases[layer].shape))


    grad_w, grad_b = backpropagation(X, Y, weights, biases, y_pred, output_activation, hidden_activation)

    for layer in range(1, len(weights)+1):

      momentum_bias_correction = 1 - beta1**(epoch+1)
      lr_correction = 1 - beta2**(epoch+1)

      # moment
      moment_w[layer] = (beta1*moment_w[layer] + (1-beta1)*grad_w[layer])/momentum_bias_correction
      moment_b[layer] = (beta1*moment_b[layer] + (1-beta1)*grad_b[layer])/momentum_bias_correction

      # LR - Adaption
      lr_w[layer] = (beta2*lr_w[layer] + (1-beta2)*(grad_w[layer])**2)/lr_correction
      lr_b[layer] = (beta2*lr_b[layer] + (1-beta2)*(grad_b[layer])**2)/lr_correction

      # Update
      weights[layer] -= (lr/((lr_w[layer] + 1e-8)**0.5)) * moment_w[layer]
      biases[layer] -= (lr/((lr_b[layer] + 1e-8)**0.5)) * moment_b[layer]

  y_pred = forward_prop(X, weights, biases, hidden_activation, output_activation)
  adam_cost = compute_cost(y_pred[1][3], Y)
  return weights, biases, adam_cost

In [141]:
weights, biases = initialize_weights(5, 2)
ADAM(X, Y, weights, biases, beta1 = 0.91, beta2 = 0.99)

({1: array([[ 0.62261371,  0.15493885],
         [ 0.18368862,  0.5555347 ],
         [-1.08075172, -0.59681157],
         [-0.2026865 , -0.15525941],
         [-0.3467897 ,  0.55655151]]),
  2: array([[ 0.72727929,  0.21529042, -0.24563052, -0.48860513, -0.44193537],
         [ 0.18325049, -0.09992685, -0.19680009,  0.30027889, -0.5575378 ],
         [-0.45370309,  1.22847173,  0.27170622, -0.18823191,  0.31375255],
         [-0.06430194, -0.77282963,  1.21199431,  0.26673197,  0.22424152],
         [ 0.48440491, -0.02766997, -0.360063  ,  0.38303944,  1.15805186]]),
  3: array([[-1.4008536 , -1.40197882, -0.86294694,  0.38610047,  0.49899965]])},
 {1: array([[ 0.50363692, -0.15453439, -0.8356804 , -0.04716253, -0.46264989]]),
  2: array([[-0.26073063, -0.14691104, -0.15567666,  0.3212611 ,  1.21416471]]),
  3: array([[-0.39607107]])},
 np.float64(0.2845596105541566))

In [142]:
# Result Format: Weights, Biases, Cost