<a href="https://colab.research.google.com/github/AndrewstheBuilder/grokking_deeplearning/blob/main/LSTM_CH13_CH14_Grokking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Deep Learning Framework Begin

In [15]:
import numpy as np
class Tensor (object):
  def __init__(self, data,
               autograd=False,
               creators=None,
               creation_op=None,
               id=None):
    self.data = np.array(data)
    self.creation_op = creation_op
    self.creators = creators
    self.grad = None

  def backward(self,grad):
    self.grad = grad

    if(self.creation_op == "add"):
      self.creators[0].backward(grad)
      self.creators[1].backward(grad)

  def __add__(self, other):
    return Tensor(self.data + other.data,
                  creators=[self, other],
                  creation_op="add")

  def __repr__(self):
    # This method calls the self.data's repr method
    return str(self.data.__repr__())

  def __str__(self):
    return str(self.data.__str__())

x = Tensor([1,2,3,4,5])
y = Tensor([2,2,2,2,2])

z = x+y
z.backward(Tensor(np.array([1,1,1,1,1])))
print(x.grad)
print(y.grad)
print(z.creators)
print(z.creation_op)

[1 1 1 1 1]
[1 1 1 1 1]
[array([1, 2, 3, 4, 5]), array([2, 2, 2, 2, 2])]
add


A toy example of RNN Backpropagation with exploding and vanishing gradients

In [None]:
import numpy as np

In [None]:
(sigmoid,relu) = (lambda x: 1/1+np.exp(-x)), lambda x: (x>0).astype(float)*x
weights = np.array([[1,4],[4,1]])
activation = sigmoid(np.array([1, 0.01]))

print("Sigmoid Activations")
activations = list()
for iter in range(10):
  activation = sigmoid(activation.dot(weights))
  activations.append(activation)
  print(activation)
print("\nSigmoid Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
  # The derivative of sigmoid causes very small gradients when activation is very near 0 or 1
  sigmoid_deriv = (activation) * (1-activation)
   # Chain Rule
  gradient = sigmoid_deriv * gradient
  gradient = gradient.dot(weights.transpose()) # So this is also part of the chain rule???
  print(gradient)

print("\nRelu Activations")
activations = list()
for iter in range(10):
  # The matrix multiplication causes exploding gradients that don't get squashed by a nonlinearity as in sigmoid
  activation = relu(activation.dot(weights))

  activations.append(activation)
  print(activation)
print("\n Relu Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
  gradient = ((activation > 0) * gradient).dot(weights.transpose())
  print(gradient)

# Adding gates to RNN will replace all of the nonlinearies and matrix multiplications

Sigmoid Activations
[1.00008889 1.00057475]
[1.00672188 1.00673168]
[1.006515   1.00651519]
[1.00652199 1.00652199]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]

Sigmoid Gradients
[-0.03282154 -0.03282154]
[0.00107725 0.00107725]
[-3.53571093e-05 -3.53571093e-05]
[1.16047468e-06 1.16047468e-06]
[-3.80885641e-08 -3.80885641e-08]
[1.25012386e-09 1.25012385e-09]
[-4.10323732e-11 -4.10323591e-11]
[1.34536847e-12 1.34534485e-12]
[-4.55737812e-14 -4.55341556e-14]
[1.08795551e-16 4.23921767e-17]

Relu Activations
[5.00238791 5.00093033]
[25.00610921 25.01048197]
[125.04803709 125.03491883]
[625.1877124  625.22706719]
[3126.09598115 3125.97791678]
[15630.00764826 15630.36184138]
[78151.45501378 78150.39243441]
[390753.02475143 390756.21248955]
[1953777.87470964 1953768.31149529]
[9768851.12069078 9768879.81033384]

 Relu Gradients
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15