<a href="https://colab.research.google.com/github/AndrewstheBuilder/grokking_deeplearning/blob/main/LSTM_CH14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Deep Learning Framework

In [1]:
# Tensor Class. The Foundation of the Deep Learning Framework
import numpy as np
class Tensor(object):
  """
  A numpy-backed tensor with a small reverse-mode autograd engine.

  Every operation on autograd-enabled tensors returns a new Tensor that
  remembers its inputs (`creators`) and the name of the operation that
  produced it (`creation_op`). `backward` walks that graph from an output
  back to the leaves and accumulates gradients in `.grad`.
  """

  def __init__(self,data,label,
               autograd=False,
               creators=None,
               creation_op=None,
               id=None):
    """
    data:        anything np.array() accepts; copied into self.data
    label:       human readable name, only used for debugging
    autograd:    whether operations on this tensor are recorded
    creators:    list of Tensors this tensor was computed from
    creation_op: name of the operation that produced this tensor
    id:          optional explicit identifier; random when omitted
    """
    self.label = label
    self.data = np.array(data)
    self.creation_op = creation_op
    self.creators = creators
    self.grad = None
    self.autograd = autograd
    # Maps child id -> number of gradients still expected from that child.
    self.children = {}
    if(id is None):
      # Ids are random, so two tensors can end up with the same id. If two
      # children of the same creator collide, their counters are simply
      # merged below, so the gradient bookkeeping still adds up.
      id = np.random.randint(0,100000)
    self.id = id

    if(creators is not None):
      for c in creators:
        # Register this tensor as a child of each creator. A creator that is
        # used twice by the same op (e.g. y*y) is counted twice.
        c.children[self.id] = c.children.get(self.id, 0) + 1

  def all_children_grads_accounted_for(self):
    '''
    Checks whether a tensor has received the correct
    number of gradients from each child
    '''
    return all(cnt == 0 for cnt in self.children.values())

  def backward(self,grad=None,grad_origin=None):
    """
    Accumulates `grad` into self.grad and pushes gradients to the creators.

    grad:        Tensor holding the gradient w.r.t. this tensor. Defaults to
                 a tensor of ones with the shape of self.data.
    grad_origin: the child Tensor the gradient comes from. When given, the
                 child's counter is decremented and the gradient is only
                 pushed further once every registered child has reported.
                 When None, the gradient is pushed further right away.

    Raises Exception if `grad_origin` already delivered all its gradients.
    Does nothing when autograd is disabled for this tensor.
    """
    if(not self.autograd):
      return

    if(grad is None):
      grad = Tensor(np.ones_like(self.data), 'grad'+str(self.data.shape))

    if(grad_origin is not None):
      if(self.children[grad_origin.id] == 0):
        raise Exception("cannot backprop more than once")
      self.children[grad_origin.id] -= 1

    if(self.grad is None):
      self.grad = grad
    else:
      # accumulates gradients from several children
      # (Tensor has no __iadd__, so this builds a new Tensor)
      self.grad += grad

    if(self.creators is None):
      return
    if(not (self.all_children_grads_accounted_for() or grad_origin is None)):
      return

    # begins actual back propagation
    op = self.creation_op

    if(op == "add"):
      # Forward the accumulated gradient (not just the most recent one) and
      # give each creator its own copy, so zeroing one gradient in place
      # cannot wipe the gradient of the other creator.
      self.creators[0].backward(Tensor(self.grad.data, 'add_grad'), self)
      self.creators[1].backward(Tensor(self.grad.data, 'add_grad2'), self)

    elif(op == "neg"):
      self.creators[0].backward(-self.grad)

    elif(op == "sub"):
      new = Tensor(self.grad.data, label='sub_grad')
      self.creators[0].backward(new, self)
      new = Tensor((-self.grad).data, label='sub_grad2')
      self.creators[1].backward(new,self)

    elif(op == "mul"):
      # Product rule: each factor receives grad times the other factor.
      new = self.grad * self.creators[1]
      self.creators[0].backward(new, self)
      new = self.grad * self.creators[0]
      self.creators[1].backward(new, self)

    elif(op == "mm"):
      # Usually an activation
      act = self.creators[0]
      weights = self.creators[1]
      new = self.grad.mm(weights.transpose())
      act.backward(new)
      new = self.grad.transpose().mm(act).transpose()
      weights.backward(new)

    elif(op == "transpose"):
      self.creators[0].backward(self.grad.transpose())

    elif("sum" in op):
      # Summing over `dim` spreads the gradient back along that dimension.
      dim = int(op.split("_")[1])
      ds = self.creators[0].data.shape[dim]
      self.creators[0].backward(self.grad.expand(dim, ds))

    elif("expand" in op):
      # Copies made by expand add their gradients back together.
      dim = int(op.split("_")[1])
      self.creators[0].backward(self.grad.sum(dim))

    elif(op == "sigmoid"):
      # d sigmoid(x)/dx = y * (1 - y), with y = sigmoid(x) = self.data
      local = self.data * (1 - self.data)
      self.creators[0].backward(Tensor(self.grad.data * local, "sigmoid_grad2"))

    elif(op == "tanh"):
      # d tanh(x)/dx = 1 - y^2, with y = tanh(x) = self.data
      local = 1 - self.data * self.data
      self.creators[0].backward(Tensor(self.grad.data * local, "tanh_grad2"))

    elif(op == "index_select"):
      # Scatter every selected row's gradient back onto the row it came
      # from; rows selected several times accumulate.
      new_grad = np.zeros_like(self.creators[0].data)
      indices_ = self.index_select_indices.data.flatten()
      grad_ = grad.data.reshape(len(indices_), -1)
      for row, row_grad in zip(indices_, grad_):
        new_grad[row] += row_grad
      self.creators[0].backward(Tensor(new_grad, "index_select grad2"))

    elif(op == "cross_entropy"):
      # Gradient of softmax + cross entropy w.r.t. the logits:
      # softmax(logits) - one_hot(target).
      # NOTE: the incoming gradient is not used here, so this treats the
      # cross entropy tensor as the final loss of the graph.
      dx = self.softmax_output - self.target_dist
      self.creators[0].backward(Tensor(dx, "cross_entropy complicated deriv backprop dx"))

  def __add__(self, other):
    """Elementwise self + other (recorded when both tensors use autograd)."""
    if(self.autograd and other.autograd):
      return Tensor(self.data + other.data,
                    label=self.label+' + '+other.label,
                    autograd = True,
                    creators=[self, other],
                    creation_op="add")
    return Tensor(self.data + other.data, 'add no grad')

  def __neg__(self):
    """Elementwise negation."""
    if(self.autograd):
      return Tensor(self.data*-1,
                    label='-'+self.label,
                    autograd=True,
                    creators=[self],
                    creation_op="neg",)
    return Tensor(self.data*-1, 'neg no grad')

  def __sub__(self, other):
    """Elementwise self - other (recorded when both tensors use autograd)."""
    if(self.autograd and other.autograd):
      return Tensor(self.data - other.data,
                    label = self.label + ' - ' + other.label,
                    autograd=True,
                    creators=[self, other],
                    creation_op="sub")
    return Tensor(self.data - other.data, 'sub no grad')

  def __mul__(self, other):
    """Elementwise self * other (recorded when both tensors use autograd)."""
    if(self.autograd and other.autograd):
      return Tensor(self.data * other.data,
                    label = self.label+'*'+other.label,
                    autograd=True,
                    creators=[self, other],
                    creation_op="mul")
    return Tensor(self.data * other.data, 'mul no grad')

  def sum(self, dim):
    """Sums self.data over dimension `dim`."""
    if(self.autograd):
      return Tensor(self.data.sum(dim),
                    label = self.label+'.sum_'+str(dim)+')',
                    autograd=True,
                    creators=[self],
                    creation_op="sum_"+str(dim))
    return Tensor(self.data.sum(dim), 'sum no grad')

  def expand(self, dim, copies):
    """
    Inserts a new dimension at position `dim` that holds `copies` copies
    of the data. Always returns a Tensor.
    """
    # Build the copies along a new trailing axis, then move that axis
    # to position `dim`.
    trans_cmd = list(range(0, len(self.data.shape)))
    trans_cmd.insert(dim, len(self.data.shape))
    new_shape = list(self.data.shape) + [copies]
    new_data = self.data.repeat(copies).reshape(new_shape)
    new_data = new_data.transpose(trans_cmd)

    if(self.autograd):
      return Tensor(new_data,
                    label=self.label+".expand_"+str(dim),
                    autograd=True,
                    creators=[self],
                    creation_op="expand_"+str(dim))
    return Tensor(new_data, 'expand no grad')

  def transpose(self):
    """Reverses the order of the dimensions of self.data."""
    if(self.autograd):
      return Tensor(self.data.transpose(),
                    label=self.label+".transpose",
                    autograd=True,
                    creators=[self],
                    creation_op="transpose")
    return Tensor(self.data.transpose(), "transpose no grad")

  def mm(self,x):
    """Matrix product self.data.dot(x.data)."""
    if(self.autograd):
      return Tensor(self.data.dot(x.data),
                    label=self.label+".dot_"+x.label,
                    autograd=True,
                    creators=[self,x],
                    creation_op="mm")
    return Tensor(self.data.dot(x.data), "mm no grad")

  # Nonlinearities
  def sigmoid(self):
    """Elementwise logistic function 1/(1+exp(-x))."""
    if(self.autograd):
      return Tensor(1/(1+np.exp(-self.data)),
                    label="sigmoid_"+self.label,
                    autograd=True,
                    creators=[self],
                    creation_op="sigmoid")
    return Tensor(1/(1+np.exp(-self.data)), label="(no auto grad)sigmoid_"+self.label)

  def tanh(self):
    """Elementwise hyperbolic tangent."""
    if(self.autograd):
      return Tensor(np.tanh(self.data),
                    label="tanh_"+self.label,
                    autograd=True,
                    creators=[self],
                    creation_op="tanh")
    return Tensor(np.tanh(self.data), label="(no auto grad)tanh"+self.label)

  def index_select(self, indices):
    """Selects self.data[indices.data]; `indices` is a Tensor of integers."""
    if(self.autograd):
      new = Tensor(self.data[indices.data],
                   label="index_select w/ "+self.label,
                   autograd=True,
                   creators=[self],
                   creation_op="index_select")
      # Remembered so backward knows which rows were selected.
      new.index_select_indices = indices
      return new
    return Tensor(self.data[indices.data], "index_select no grad")

  def cross_entropy(self, target_indices):
    """
    Softmax over the last dimension followed by the mean negative
    log-likelihood of the classes given in `target_indices`.
    """
    last_axis = len(self.data.shape)-1
    # Softmax does not change when a constant is subtracted from every logit
    # of a row; subtracting the row maximum keeps np.exp from overflowing.
    shifted = self.data - self.data.max(axis=last_axis, keepdims=True)
    temp = np.exp(shifted)
    softmax_output = temp / np.sum(temp,
                                   axis=last_axis,
                                   keepdims=True)
    t = target_indices.data.flatten()
    p = softmax_output.reshape(len(t),-1)
    # One-hot rows of the target classes.
    target_dist = np.eye(p.shape[1])[t]
    loss = -(np.log(p) * (target_dist)).sum(1).mean()
    if(self.autograd):
      out = Tensor(loss,
                   label="cross_entropy",
                   autograd=True,
                   creators=[self],
                   creation_op="cross_entropy")
      # Both are needed by backward.
      out.softmax_output = softmax_output
      out.target_dist = target_dist
      return out
    return Tensor(loss, "cross_entropy no grad")

  def __repr__(self):
    # This method calls the self.data's repr method
    return str(self.data.__repr__())

  def __str__(self):
    return str(self.data.__str__())

# Other Classes for the Deep Learning (DL) Framework
class SGD(object):
  """
  Plain stochastic gradient descent.

  parameters: iterable of objects exposing `.data` (numpy array) and `.grad`
              (None, or an object with a numpy array in `.data`)
  alpha:      learning rate
  """

  def __init__(self, parameters, alpha=0.1):
    self.parameters = parameters
    self.alpha = alpha

  def zero(self):
    """Zeroes the gradient of every parameter in place."""
    for p in self.parameters:
      # A parameter that has not received a gradient yet has nothing to zero.
      if(p.grad is not None):
        p.grad.data *= 0

  def step(self, zero=True):
    """
    Updates each parameter's data from its gradient:
    data -= grad * alpha. Zeroes the gradient afterwards if zero=True.
    Parameters whose grad is still None are left untouched.
    """
    for p in self.parameters:
      if(p.grad is None):
        continue
      p.data -= p.grad.data * self.alpha
      if(zero):
        p.grad.data *= 0

# The Layer class: another foundation of the DL framework
class Layer(object):
  """Base class of all layers; keeps the list of trainable parameters."""

  def __init__(self):
    # Subclasses append their trainable tensors to this list.
    self.parameters = []

  def get_parameters(self):
    """Returns the layer's parameter list (the list itself, not a copy)."""
    return self.parameters

class Linear(Layer):
  """Fully connected layer: input.mm(weight) + bias."""

  def __init__(self, n_inputs, n_outputs):
    super().__init__()
    # Random normal weights scaled by sqrt(2 / n_inputs); bias starts at zero.
    scale = np.sqrt(2.0/(n_inputs))
    W = np.random.randn(n_inputs, n_outputs)*scale
    self.weight = Tensor(W, autograd=True, label='Linear W')
    self.bias = Tensor(np.zeros(n_outputs), autograd=True, label='Linear b')

    for param in (self.weight, self.bias):
      self.parameters.append(param)

  def forward(self, input):
    """Applies the layer to `input`, a Tensor with one row per example."""
    projected = input.mm(self.weight)
    # The bias is copied once per input row so it can be added elementwise.
    n_rows = len(input.data)
    bias_rows = self.bias.expand(0, n_rows)
    return projected + bias_rows

class Sequential(Layer):
  """Container that applies a list of layers one after another."""

  def __init__(self, layers=None):
    """
    layers: optional list of layers, used as given (not copied). When
            omitted, every instance starts with its own empty list; a
            default of `list()` in the signature would be created once and
            shared by all instances.
    """
    super().__init__()

    self.layers = list() if layers is None else layers

  def add(self, layer):
    """Appends `layer` to the end of the pipeline."""
    self.layers.append(layer)

  def forward(self, input):
    """Feeds `input` through every layer in order and returns the result."""
    for layer in self.layers:
      input = layer.forward(input)
    return input

  def get_parameters(self):
    """Returns a new list with the parameters of all contained layers."""
    params = list()
    for l in self.layers:
      params += l.get_parameters()
    return params

class Tanh(Layer):
  """Parameter-free layer that applies tanh to its input."""

  # No __init__ needed: Layer.__init__ already sets up the empty
  # parameter list.

  def forward(self, input):
    """Returns input.tanh()."""
    activated = input.tanh()
    return activated

class Sigmoid(Layer):
  """Parameter-free layer that applies the sigmoid to its input."""

  # No __init__ needed: Layer.__init__ already sets up the empty
  # parameter list.

  def forward(self, input):
    """Returns input.sigmoid()."""
    activated = input.sigmoid()
    return activated

class Embedding(Layer):
  """Lookup table that maps integer indices to rows of a weight matrix."""

  def __init__(self, vocab_size, dim):
    super().__init__()

    self.vocab_size, self.dim = vocab_size, dim

    # this initialization style is a convention from word2vec:
    # uniform values in [-0.5, 0.5) divided by the embedding size
    uniform = np.random.rand(vocab_size, dim)
    weight = (uniform - 0.5) / dim
    self.weight = Tensor(weight, label="Embedding weight", autograd=True)

    self.parameters.append(self.weight)

  def forward(self, input):
    """Returns the weight rows selected by `input`, a Tensor of indices."""
    selected = self.weight.index_select(input)
    return selected

# You can also create layers that are functions on the input
class MSELoss(Layer):
  """Squared error loss, summed over dimension 0."""

  # No __init__ needed: Layer.__init__ already sets up the empty
  # parameter list.

  def forward(self, pred, target):
    """Returns ((pred-target)*(pred-target)).sum(0)."""
    # The difference is deliberately computed twice, exactly as a single
    # expression would do it, so the same operations get recorded.
    left = pred-target
    right = pred-target
    squared = left*right
    return squared.sum(0)

class CrossEntropyLoss(object):
  """Thin wrapper that exposes the input's cross_entropy as a loss object."""

  # No __init__ needed: the class keeps no state of its own.

  def forward(self, input, target):
    """Returns input.cross_entropy(target)."""
    loss = input.cross_entropy(target)
    return loss

class RNNCell(Layer):
  """
  One step of a simple recurrent network: the new hidden state is
  activation(w_ih(input) + w_hh(hidden)) and the output is w_ho(new hidden).
  """

  def __init__(self,n_inputs,n_hidden,n_output,activation='sigmoid'):
    """
    activation: 'sigmoid' or 'tanh'; anything else raises Exception.
    """
    super().__init__()

    self.n_inputs, self.n_hidden, self.n_output = n_inputs, n_hidden, n_output

    # Pick the nonlinearity by name.
    for name, layer_class in (('sigmoid', Sigmoid), ('tanh', Tanh)):
      if(activation == name):
        self.activation = layer_class()
        break
    else:
      raise Exception("Non-linearity not found")

    # input->hidden, hidden->hidden and hidden->output projections
    self.w_ih = Linear(n_inputs, n_hidden)
    self.w_hh = Linear(n_hidden, n_hidden)
    self.w_ho = Linear(n_hidden, n_output)

    for projection in (self.w_ih, self.w_hh, self.w_ho):
      self.parameters += projection.get_parameters()

  def forward(self, input, hidden):
    """Returns (output, new_hidden) for one time step."""
    recurrent_part = self.w_hh.forward(hidden)
    input_part = self.w_ih.forward(input)
    new_hidden = self.activation.forward(input_part + recurrent_part)
    return self.w_ho.forward(new_hidden), new_hidden

  def init_hidden(self, batch_size=1):
    """
    Returns an all-zero hidden state of shape (batch_size, n_hidden), which
    can be passed as `hidden` to the first call of forward.
    """
    zeros = np.zeros((batch_size, self.n_hidden))
    return Tensor(zeros, label="RNN Hidden State", autograd=True)

In [2]:
import numpy as np

In [3]:
(sigmoid,relu) = (lambda x: 1/1+np.exp(-x)), lambda x: (x>0).astype(float)*x
weights = np.array([[1,4],[4,1]])
activation = sigmoid(np.array([1, 0.01]))

print("Sigmoid Activations")
activations = list()
for iter in range(10):
  activation = sigmoid(activation.dot(weights))
  activations.append(activation)
  print(activation)
print("\nSigmoid Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
  # The derivative of sigmoid causes very small gradients when activation is very near 0 or 1
  sigmoid_deriv = (activation) * (1-activation)
   # Chain Rule
  gradient = sigmoid_deriv * gradient
  gradient = gradient.dot(weights.transpose()) # So this is also part of the chain rule???
  print(gradient)

print("\nRelu Activations")
activations = list()
for iter in range(10):
  # The matrix multiplication causes exploding gradients that don't get squashed by a nonlinearity as in sigmoid
  activation = relu(activation.dot(weights))

  activations.append(activation)
  print(activation)
print("\n Relu Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
  gradient = ((activation > 0) * gradient).dot(weights.transpose())
  print(gradient)

# Adding gates to RNN will replace all of the nonlinearies and matrix multiplications

Sigmoid Activations
[1.00008889 1.00057475]
[1.00672188 1.00673168]
[1.006515   1.00651519]
[1.00652199 1.00652199]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]

Sigmoid Gradients
[-0.03282154 -0.03282154]
[0.00107725 0.00107725]
[-3.53571093e-05 -3.53571093e-05]
[1.16047468e-06 1.16047468e-06]
[-3.80885641e-08 -3.80885641e-08]
[1.25012386e-09 1.25012385e-09]
[-4.10323732e-11 -4.10323591e-11]
[1.34536847e-12 1.34534485e-12]
[-4.55737812e-14 -4.55341556e-14]
[1.08795551e-16 4.23921767e-17]

Relu Activations
[5.00238791 5.00093033]
[25.00610921 25.01048197]
[125.04803709 125.03491883]
[625.1877124  625.22706719]
[3126.09598115 3125.97791678]
[15630.00764826 15630.36184138]
[78151.45501378 78150.39243441]
[390753.02475143 390756.21248955]
[1953777.87470964 1953768.31149529]
[9768851.12069078 9768879.81033384]

 Relu Gradients
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15

LSTM Implementation CH 14