<a href="https://colab.research.google.com/github/AndrewstheBuilder/grokking_deeplearning/blob/main/LSTM_CH13_CH14_Grokking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Grokking Deep Learning Framework Begin

In [49]:
import numpy as np
class Tensor (object):
  def __init__(self,data,label='None',
               autograd=False,
               creators=None,
               creation_op=None,
               id=None):
    # print('label',label)
    self.label = label
    self.data = np.array(data)
    self.creation_op = creation_op
    self.creators = creators
    self.grad = None
    self.autograd = autograd
    self.children = {}
    if(id is None):
      id = np.random.randint(0,100000) # What is the likelyhood of producing the same id for a tensor in the same session?
    self.id = id

    if(creators is not None):
      # print('creators',creators)
      for c in creators:
        # keeps track of how many children a tensor has
        if(self.id not in c.children):
          # Initialize c.children[self.id]
          # We are giving the creator the children property
          c.children[self.id] = 1
        else:
          # Update counter for children
          # What is this scenario ???
          # Each child should have 2 separate parents and that's it
          # Another way of saying this ^ is that all parents have one child
          print('when does this get called')
          print('c',c)
          print('self',self)
          c.children[self.id] += 1

  def all_children_grads_accounted_for(self):
    '''
    Checks whether a tensor has received the correct
    number of gradients from each child
    '''
    for id,cnt in self.children.items():
      if(cnt != 0):
        return False
    return True

  def backward(self,grad=None,grad_origin=None):
    print('self',self)
    print('grad',grad)
    print('grad_origin',grad_origin)
    print()
    if(self.autograd):
      if(grad_origin is not None):
        if(self.children[grad_origin.id] == 0):
          raise Exception("cannot backprop more than once")
        else:
          self.children[grad_origin.id] -= 1

      if(self.grad is None):
        self.grad = grad
      else:
        # accumulates gradients from several children
        self.grad += grad

      if(self.creators is not None and
         (self.all_children_grads_accounted_for() or
          grad_origin is None)):
          # begins actual back propagation
          if(self.creation_op == "add"):
            self.creators[0].backward(grad,self)
            self.creators[1].backward(grad,self)

          if(self.creation_op == "neg"):
            self.creators[0].backward(self.grad.__neg__())
    # old code before adding support for multiuse tensors
    # self.grad = grad

    # if(self.creation_op == "add"):
    #   self.creators[0].backward(grad)
    #   self.creators[1].backward(grad)

  def __add__(self, other):
    if(self.autograd and other.autograd):
      return Tensor(self.data + other.data,
                    label=self.label+' + '+other.label,
                    autograd = True,
                    creators=[self, other],
                    creation_op="add")
    # print("When would this case ever be true?")
    # print('self',self)
    # print('other',other)
    # This case gets called when we are accumulating the gradients and add the self.grad += grad
    # if((self.autograd == False and other.autograd == True) or (self.autograd == True and other.autograd == False)):
    #   print('how did this happen??')
    #   print('self',self)
    #   print('other',other)
    return Tensor(self.data + other.data)

  def __neg__(self):
    if(self.autograd):
      # I think this tensor replaces the tensor I have as I initially declared it
      # print('neg self',self)
      # print('self.data',self.data)
      return Tensor(self.data*-1,
                    label='-'+self.label,
                    autograd=True,
                    creators=[self],
                    creation_op="neg",)
    # print('when does neg this else statement occur', self)
    # It happens for the gradient calculation. When we are backpropagating the grad from the child.
    return Tensor(self.data*-1)

  def __sub__(self, other):
    if(self.autograd and other.autograd):
      return Tensor(self.data - other.data,
                    label = self.label + ' - ' + other.label,
                    autograd=True,
                    creators=[self, other],
                    creation_op="sub")
    # if((self.autograd == False and other.autograd == True) or (self.autograd == True and other.autograd == False)):
    #   print('how did this happen??')
    #   print('self',self)
    #   print('other',other)

    return Tensor(self.data - other.data)

  def __mul__(self, other):
    if(self.autograd and other.autograd):
      return Tensor(self.data * other.data,
                    label = self.label+'*'+other.label,
                    autograd=True,
                    creators=[self, other],
                    creation_op="mul")
    # if((self.autograd == False and other.autograd == True) or (self.autograd == True and other.autograd == False)):
    #   print('how did this happen??')
    #   print('self',self)
    #   print('other',other)

    return Tensor(self.data - other.data)

  def sum(self, dim):
    if(self.autograd):
      return Tensor(self.data.sum(dim),
                    label = self.label+'.sum_'+str(dim)+')',
                    autograd=True,
                    creators=[self],
                    creation_op="sum_"+str(dim))
    # if((self.autograd == False)):
    #   print('how did this happen?? in sum')
    #   print('self',self)
    #   print('dim',dim)

    return Tensor(self.data.sum(dim))

  def expand(self, dim, copies):

    trans_cmd = list(range(0, len(self.data.shape)))
    trans_cmd.insert(dim, len(self.data.shape))
    new_shape = list(self.data.shape) + [copies]
    new_data = self.data.repeat(copies).reshape(new_shape)
    new_data = new_data.transpose(trans_cmd)

    if(self.autograd):
      return Tensor(new_data,
                    label=self.label+".expand_"+str(dim),
                    autograd=True,
                    creators=[self],
                    creation_op="expand_"+str(dim))
    # print('How the heck did you get here in expand')
    # print('self',self)
    # print('dim',dim)
    # print('copies',copies)
    return new_data

  def transpose(self):
    if(self.autograd):
      return Tensor(self.data.transpose(),
                    label=self.label+".transpose",
                    autograd=True,
                    creators=self,
                    creation_op="transpose")
    # print("How did you get here in transpose()")
    # print('self',self)
    return Tensor(self.data.transpose())

  def mm(self,x):
    if(self.autograd):
      return Tensor(self.data.dot(x.data),
                    label=self.label+".dot_"+x.data,
                    autograd=True,
                    creators=self,
                    creation_op="mm")
    # print("How did you get here in mm()")
    # print('self',self)
    return Tensor(self.data.dot(x.data))

  def __repr__(self):
    # This method calls the self.data's repr method
    return str(self.label.__repr__() + ":" + self.data.__repr__())

  def __str__(self):
    return str(self.label.__repr__() + ":" + self.data.__str__() + ' Creators:'+self.creators.__str__())

x = Tensor(np.array([[1,2,3],
                     [4,5,6]]), label='x')
x.sum(0)

# a = Tensor(([1,2,3,4,5]), label='a', autograd=True)
# b = Tensor(([2,2,2,2,2]), label='b', autograd=True)
# c = Tensor(([5,4,3,2,1]), label='c', autograd=True)

# d = a + (-b)
# e = (-b) + c
# f = d + e
# f.backward(Tensor(np.array([1,1,1,1,1]), label='initial grad'))

# # f.backward(Tensor(np.array([1,1,1,1,1])))

# print('b.grad.data',b.grad.data)
# d = a + b
# e = b + c
# f = d + e

# f.backward(Tensor(np.array([1,1,1,1,1])))
# # f.backward(Tensor(np.array([1,1,1,1,1])))
# print(b.grad.data == np.array([2,2,2,2,2]))
# print('f.grad',f.grad)
# print('e.grad',e.grad)
# print('d.grad',d.grad)
# print('b.grad',b.grad)
# Old code before adding support for multiuse tensors
# x = Tensor([1,2,3,4,5])
# y = Tensor([2,2,2,2,2])

# z = x+y
# z.backward(Tensor(np.array([1,1,1,1,1])))
# print(x.grad)
# print(y.grad)
# print(z.creators)
# print(z.creation_op)

'None':array([5, 7, 9])

A toy example of RNN Backpropagation with exploding and vanishing gradients

In [None]:
import numpy as np

In [None]:
(sigmoid,relu) = (lambda x: 1/1+np.exp(-x)), lambda x: (x>0).astype(float)*x
weights = np.array([[1,4],[4,1]])
activation = sigmoid(np.array([1, 0.01]))

print("Sigmoid Activations")
activations = list()
for iter in range(10):
  activation = sigmoid(activation.dot(weights))
  activations.append(activation)
  print(activation)
print("\nSigmoid Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
  # The derivative of sigmoid causes very small gradients when activation is very near 0 or 1
  sigmoid_deriv = (activation) * (1-activation)
   # Chain Rule
  gradient = sigmoid_deriv * gradient
  gradient = gradient.dot(weights.transpose()) # So this is also part of the chain rule???
  print(gradient)

print("\nRelu Activations")
activations = list()
for iter in range(10):
  # The matrix multiplication causes exploding gradients that don't get squashed by a nonlinearity as in sigmoid
  activation = relu(activation.dot(weights))

  activations.append(activation)
  print(activation)
print("\n Relu Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
  gradient = ((activation > 0) * gradient).dot(weights.transpose())
  print(gradient)

# Adding gates to RNN will replace all of the nonlinearies and matrix multiplications

Sigmoid Activations
[1.00008889 1.00057475]
[1.00672188 1.00673168]
[1.006515   1.00651519]
[1.00652199 1.00652199]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]

Sigmoid Gradients
[-0.03282154 -0.03282154]
[0.00107725 0.00107725]
[-3.53571093e-05 -3.53571093e-05]
[1.16047468e-06 1.16047468e-06]
[-3.80885641e-08 -3.80885641e-08]
[1.25012386e-09 1.25012385e-09]
[-4.10323732e-11 -4.10323591e-11]
[1.34536847e-12 1.34534485e-12]
[-4.55737812e-14 -4.55341556e-14]
[1.08795551e-16 4.23921767e-17]

Relu Activations
[5.00238791 5.00093033]
[25.00610921 25.01048197]
[125.04803709 125.03491883]
[625.1877124  625.22706719]
[3126.09598115 3125.97791678]
[15630.00764826 15630.36184138]
[78151.45501378 78150.39243441]
[390753.02475143 390756.21248955]
[1953777.87470964 1953768.31149529]
[9768851.12069078 9768879.81033384]

 Relu Gradients
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15