<a href="https://colab.research.google.com/github/AndrewstheBuilder/grokking_deeplearning/blob/main/LSTM_CH13_Grokking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Grokking Deep Learning Framework Begin

Tensor Object. This lays the groundwork for the autograd engine

In [49]:
import numpy as np
class Tensor (object):
  def __init__(self,data,label,
               autograd=False,
               creators=None,
               creation_op=None,
               id=None):
    # print('label',label)
    self.label = label
    self.data = np.array(data)
    self.creation_op = creation_op
    self.creators = creators
    self.grad = None
    self.autograd = autograd
    self.children = {}
    if(id is None):
      id = np.random.randint(0,100000) # What is the likelyhood of producing the same id for a tensor in the same session?
    self.id = id

    if(creators is not None):
      # print('creators',creators)
      for c in creators:
        # keeps track of how many children a tensor has
        if(self.id not in c.children):
          # Initialize c.children[self.id]
          # We are giving the creator the children property
          c.children[self.id] = 1
        else:
          # Update counter for children
          # What is this scenario ???
          # Each child should have 2 separate parents and that's it
          # Another way of saying this ^ is that all parents have one child
          # TODO: Investigate this. The only time this scenario has happened so far is in RNNCell
          print('when does this get called')
          labels_str = '\n'.join([(creator.label) for creator in creators])
          print('ALL CREATORS')
          print(labels_str)
          # print('c',c.label)
          print('child --->',self.label)
          print('child creation_op --->',self.creation_op)
          c.children[self.id] += 1
  def all_children_grads_accounted_for(self):
    '''
    Checks whether a tensor has received the correct
    number of gradients from each child
    '''
    for id,cnt in self.children.items():
      if(cnt != 0):
        return False
    return True

  def backward(self,grad=None,grad_origin=None):
    # Back Prop logging
    # Might want to enable this for future?
    # print('self',self)
    # print('grad',grad)
    # print('grad_origin',grad_origin)
    # print()
    if(self.autograd):

      if(grad is None):
        grad = Tensor(np.ones_like(self.data), 'grad'+str(self.data.shape))

      if(grad_origin is not None):
        if(self.children[grad_origin.id] == 0):
          raise Exception("cannot backprop more than once")
        else:
          self.children[grad_origin.id] -= 1

      if(self.grad is None):
        self.grad = grad
      else:
        # accumulates gradients from several children
        self.grad += grad

      if(self.creators is not None and
         (self.all_children_grads_accounted_for() or
          grad_origin is None)):
          # begins actual back propagation
          if(self.creation_op == "add"):
            self.creators[0].backward(grad,self)
            self.creators[1].backward(grad,self)

          if(self.creation_op == "neg"):
            self.creators[0].backward(self.grad.__neg__())

          if(self.creation_op == "sub"):
            new = Tensor(self.grad.data, label='sub_grad')
            self.creators[0].backward(new, self)
            new = Tensor(self.grad.__neg__().data, label='sub_grad2')
            self.creators[1].backward(new,self)

          if(self.creation_op == "mul"):
            new = self.grad * self.creators[1].data
            self.creators[0].backward(new, self)
            new = self.grad * self.creators[0].data
            self.creators[1].backward(new, self)

          if(self.creation_op == "mm"):
            # Usually an activation
            act = self.creators[0]
            # print('act in mm() backward', act)
            # print('self.creators',self.creators)
            # Usually a weight matrix
            weights = self.creators[1]
            new = self.grad.mm(weights.transpose())
            act.backward(new)
            new = self.grad.transpose().mm(act).transpose()
            weights.backward(new)

          if(self.creation_op == "transpose"):
            self.creators[0].backward(self.grad.transpose())

          if("sum" in self.creation_op):
            dim = int(self.creation_op.split("_")[1])
            # print('dim in sum backward',dim)
            # print('ds',ds)
            ds = self.creators[0].data.shape[dim]
            self.creators[0].backward(self.grad.expand(dim, ds))

          if("expand" in self.creation_op):
            dim = int(self.creation_op.split("_")[1])
            self.creators[0].backward(self.grad.sum(dim))

          if(self.creation_op == "sigmoid"):
            ones = Tensor(np.ones_like(self.grad.data), "ones used in sigmoid backprop")
            self.creators[0].backward(Tensor(self.grad.data * (self.data * (ones.data - self.data)), "sigmoid_grad2"))

          if(self.creation_op == "tanh"):
            ones = Tensor(np.ones_like(self.grad.data))
            self.creators[0].backward(Tensor(self.grad.data * (ones.data - self.data), "tanh_grad2"))

          if(self.creation_op == "index_select"):
            new_grad = np.zeros_like(self.creators[0].data)
            indices_ = self.index_select_indices.data.flatten()
            grad_ = grad.data.reshape(len(indices_), -1)
            for i in range(len(indices_)):
              new_grad[indices_[i]] += grad_[i]
            self.creators[0].backward(Tensor(new_grad, "index_select grad2"))

          if(self.creation_op == "cross_entropy"):
            # This is the complicated derivation we did in part 4 of the makemore series
            dx = self.softmax_output - self.target_dist
            self.creators[0].backward(Tensor(dx, "cross_entropy complicated deriv backprop dx"))

    # old code before adding support for multiuse tensors
    # self.grad = grad

    # if(self.creation_op == "add"):
    #   self.creators[0].backward(grad)
    #   self.creators[1].backward(grad)

  def __add__(self, other):
    if(self.autograd and other.autograd):
      return Tensor(self.data + other.data,
                    label=self.label+' + '+other.label,
                    autograd = True,
                    creators=[self, other],
                    creation_op="add")
    # print("When would this case ever be true?")
    # print('self',self)
    # print('other',other)
    # This case gets called when we are accumulating the gradients and add the self.grad += grad
    # if((self.autograd == False and other.autograd == True) or (self.autograd == True and other.autograd == False)):
    #   print('how did this happen??')
    #   print('self',self)
    #   print('other',other)
    return Tensor(self.data + other.data, 'add no grad')

  def __neg__(self):
    if(self.autograd):
      # I think this tensor replaces the tensor I have as I initially declared it
      # print('neg self',self)
      # print('self.data',self.data)
      return Tensor(self.data*-1,
                    label='-'+self.label,
                    autograd=True,
                    creators=[self],
                    creation_op="neg",)
    # print('when does neg this else statement occur', self)
    # It happens for the gradient calculation. When we are backpropagating the grad from the child.
    return Tensor(self.data*-1, 'neg no grad')

  def __sub__(self, other):
    if(self.autograd and other.autograd):
      return Tensor(self.data - other.data,
                    label = self.label + ' - ' + other.label,
                    autograd=True,
                    creators=[self, other],
                    creation_op="sub")
    # if((self.autograd == False and other.autograd == True) or (self.autograd == True and other.autograd == False)):
    #   print('how did this happen??')
    #   print('self',self)
    #   print('other',other)

    return Tensor(self.data - other.data, 'sub no grad')

  def __mul__(self, other):
    if(self.autograd and other.autograd):
      return Tensor(self.data * other.data,
                    label = self.label+'*'+other.label,
                    autograd=True,
                    creators=[self, other],
                    creation_op="mul")
    # if((self.autograd == False and other.autograd == True) or (self.autograd == True and other.autograd == False)):
    #   print('how did this happen??')
    #   print('self',self)
    #   print('other',other)

    return Tensor(self.data - other.data, 'mul no grad')

  def sum(self, dim):
    if(self.autograd):
      return Tensor(self.data.sum(dim),
                    label = self.label+'.sum_'+str(dim)+')',
                    autograd=True,
                    creators=[self],
                    creation_op="sum_"+str(dim))
    # if((self.autograd == False)):
    #   print('how did this happen?? in sum')
    #   print('self',self)
    #   print('dim',dim)

    return Tensor(self.data.sum(dim), 'sum no grad')

  def expand(self, dim, copies):

    trans_cmd = list(range(0, len(self.data.shape)))
    trans_cmd.insert(dim, len(self.data.shape))
    new_shape = list(self.data.shape) + [copies]
    new_data = self.data.repeat(copies).reshape(new_shape)
    new_data = new_data.transpose(trans_cmd)

    if(self.autograd):
      return Tensor(new_data,
                    label=self.label+".expand_"+str(dim),
                    autograd=True,
                    creators=[self],
                    creation_op="expand_"+str(dim))
    # print('How the heck did you get here in expand')
    # print('self',self)
    # print('dim',dim)
    # print('copies',copies)
    return new_data

  def transpose(self):
    if(self.autograd):
      return Tensor(self.data.transpose(),
                    label=self.label+".transpose",
                    autograd=True,
                    creators=[self],
                    creation_op="transpose")
    # print("How did you get here in transpose()")
    # print('self',self)
    return Tensor(self.data.transpose(), "transpose no grad")

  def mm(self,x):
    if(self.autograd):
      return Tensor(self.data.dot(x.data),
                    label=self.label+".dot_"+x.label,
                    autograd=True,
                    creators=[self,x],
                    creation_op="mm")
    # print("How did you get here in mm()")
    # print('self',self)
    return Tensor(self.data.dot(x.data), "mm no grad")

  # Nonlinearities
  def sigmoid(self):
    if(self.autograd):
      return Tensor(1/(1+np.exp(-self.data)),
                    label="sigmoid_"+self.label,
                    autograd=True,
                    creators=[self],
                    creation_op="sigmoid")
    return Tensor(1/(1+np.exp(-self.data)), label="(no auto grad)sigmoid_"+self.label)

  def tanh(self):
    if(self.autograd):
      return Tensor(np.tanh(self.data),
                    label="tanh_"+self.label,
                    autograd=True,
                    creators=[self],
                    creation_op="tanh")
    return Tensor(np.tanh(self.data), label="(no auto grad)tanh"+self.label)

  def index_select(self, indices):
    if(self.autograd):
      new = Tensor(self.data[indices.data],
                   label="index_select w/ "+self.label,
                   autograd=True,
                   creators=[self],
                   creation_op="index_select")
      new.index_select_indices = indices
      return new
    return Tensor(self.data[indices.data], "index_select no grad")

  def cross_entropy(self, target_indices):
    temp = np.exp(self.data)
    softmax_output = temp / np.sum(temp,
                                   axis=len(self.data.shape)-1,
                                   keepdims=True)
    t = target_indices.data.flatten()
    p = softmax_output.reshape(len(t),-1)
    # print('p shape',p.shape)
    # print('t',t)
    target_dist = np.eye(p.shape[1])[t]
    # print('target_dist',target_dist)

    # print('-(np.log(p) * (target_dist))',-(np.log(p) * (target_dist)))
    # print('-(np.log(p) * (target_dist)).sum(1)',-(np.log(p) * (target_dist)).sum(1))

    loss = -(np.log(p) * (target_dist)).sum(1).mean()
      # A lot I do not understand here. 1. Why do we take the log of p.
      # Why do we multiply with the target_dist -> get the probability of the right answer.
      # Why do we sum and then calculate the mean? And why is it negative(this should be easy to answer)
        #
        # We take the mean because there is a batch of inputs and we want to return a single number for loss which will be the average across the batch

    if(self.autograd):
      out = Tensor(loss,
                   label="cross_entropy",
                   autograd=True,
                   creators=[self],
                   creation_op="cross_entropy")
      out.softmax_output = softmax_output
      out.target_dist = target_dist
      return out
    return Tensor(loss, "cross_entropy no grad")
    # My WRONG cross_entropy implementation
    # predicted = self.data
    # if(self.autograd):
    #   return Tensor(-(target_indices * np.log(predicted)).sum(),
    #                label="cross_entropy w/ "+self.label,
    #                autograd=True,
    #                creators=[self],
    #                creation_op="cross_entropy")
    # return Tensor(-(target_indices * np.log(predicted)).sum())


  def __repr__(self):
    # This method calls the self.data's repr method
    return str(self.data.__repr__())
    # return str(self.label.__repr__() + ":" + self.data.__repr__())

  def __str__(self):
    return str(self.data.__str__())
    # return str(self.label.__repr__() + ":" + self.data.__str__() + ' Creators:'+self.creators.__str__())

# x = Tensor(np.eye(5), 'x', autograd=True)
# x.index_select(Tensor([[1,2,3],
#                        [2,3,4]], 'x_answers')).backward()
# print(x.grad)
# x = Tensor(np.array([[1,2,3],
#                      [4,5,6]]), label='x')
# y = x.sum(0)
# y.backward()
# x.expand(dim=0, copies=1)


# a = Tensor(([1,2,3,4,5]), label='a', autograd=True)
# b = Tensor(([2,2,2,2,2]), label='b', autograd=True)
# c = Tensor(([5,4,3,2,1]), label='c', autograd=True)

# d = a + (-b)
# e = (-b) + c
# f = d + e
# f.backward(Tensor(np.array([1,1,1,1,1]), label='initial grad'))

# # f.backward(Tensor(np.array([1,1,1,1,1])))

# print('b.grad.data',b.grad.data)
# d = a + b
# e = b + c
# f = d + e

# f.backward(Tensor(np.array([1,1,1,1,1])))
# # f.backward(Tensor(np.array([1,1,1,1,1])))
# print(b.grad.data == np.array([2,2,2,2,2]))
# print('f.grad',f.grad)
# print('e.grad',e.grad)
# print('d.grad',d.grad)
# print('b.grad',b.grad)
# Old code before adding support for multiuse tensors
# x = Tensor([1,2,3,4,5])
# y = Tensor([2,2,2,2,2])

# z = x+y
# z.backward(Tensor(np.array([1,1,1,1,1])))
# print(x.grad)
# print(y.grad)
# print(z.creators)
# print(z.creation_op)

Stochastic Gradient Descent Class

In [3]:
class SGD(object):
  def __init__(self, parameters, alpha=0.1):
    self.parameters = parameters
    self.alpha = alpha

  def zero(self):
    # zero parameters' gradients
    for p in self.parameters:
      p.grad.data *= 0

  def step(self, zero=True):
    # update parameters' data based on their gradients
    # zero out the gradient after if zero=True
    for p in self.parameters:
      p.data -= p.grad.data * self.alpha
      if(zero):
        p.grad.data *= 0

Toy Example that does not use the autograd system

Layer Abstract Class(Makes sure parameters are stored in the subclass because that is what a layer represents) + Linear Layer Subclass Implementing Layer

In [24]:
class Layer(object):

  def __init__(self):
    self.parameters = list()

  def get_parameters(self):
    return self.parameters

class Linear(Layer):

  def __init__(self, n_inputs, n_outputs):
    super().__init__()
    W = np.random.randn(n_inputs, n_outputs)*np.sqrt(2.0/(n_inputs))
    self.weight = Tensor(W, autograd=True, label='Linear W')
    self.bias = Tensor(np.zeros(n_outputs), autograd=True, label='Linear b')

    self.parameters.append(self.weight)
    self.parameters.append(self.bias)

  def forward(self, input):
    return input.mm(self.weight)+self.bias.expand(0, len(input.data))

Sequential Object that is a subclass of Layer. It will probably be used to stack a bunch of other layer subclass objects on top of each other.

In [5]:
np.random.seed(0)
class Sequential(Layer):

  def __init__(self, layers=list()):
    super().__init__()

    self.layers = layers

  def add(self, layer):
    self.layers.append(layer)

  def forward(self, input):
    for layer in self.layers:
      input = layer.forward(input)
    return input

  def get_parameters(self):
    params = list()
    for l in self.layers:
      params += l.get_parameters()
    return params

data = Tensor(np.array([[0,0], [0,1], [1,0], [1,1]]), autograd=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

model = Sequential([Linear(2,3), Linear(3,1)])

optim = SGD(parameters=model.get_parameters(), alpha=0.05)

for i in range(10):
  # predict
  pred = model.forward(data)

  # Compare Mean Squared Error
  loss = ((pred-target) * (pred-target)).sum(0)

  # Learn
  loss.backward()
  optim.step()
  print(loss)

[2.33428272]
[0.62282083]
[0.19680451]
[0.08915535]
[0.06028456]
[0.049625]
[0.04329267]
[0.03828787]
[0.0339512]
[0.03010911]


In [36]:
class Tanh(Layer):
  def __init__(self):
    super().__init__()

  def forward(self, input):
    return input.tanh()

class Sigmoid(Layer):
  def __init__(self):
    super().__init__()

  def forward(self, input):
    return input.sigmoid()

# Trying out the new non linearities 04/03/24. This should be the latest testing of the autograd framework.
# import numpy as np
# np.random.seed(0)

# data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
# target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

# model = Sequential([Linear(2,3), Tanh(), Linear(3,1), Sigmoid()])
# criterion = MSELoss()


# optim = SGD(parameters=model.get_parameters(), alpha=1)

# for i in range(10):
#   # predict
#   pred = model.forward(data)

#   # Compare Mean Squared Error
#   loss = criterion.forward(pred, target)

#   # Learn
#   loss.backward()
#   optim.step()
#   print(loss)

class Embedding(Layer):

  def __init__(self, vocab_size, dim):
    super().__init__()

    self.vocab_size = vocab_size
    self.dim = dim

    # this initialization style is a convention from word2vec
    weight = (np.random.rand(vocab_size, dim) - 0.5) / dim
    self.weight = Tensor(weight, label="Embedding weight", autograd=True)

    self.parameters.append(self.weight)

  def forward(self, input):
    return self.weight.index_select(input)

Layers without weights. Here we got the Mean Squared Error Function. Why do we extend the Layer class in MSELoss?

Other Loss Functions(CrossEntropyLoss etc...)

In [50]:
# You can also create layers that are functions on the input
class MSELoss(Layer):

  def __init__(self):
    super().__init__()

  def forward(self, pred, target):
    return ((pred-target)*(pred-target)).sum(0)

class CrossEntropyLoss(object):
  def __init__(self):
    super().__init__()

  def forward(self, input, target):
    return input.cross_entropy(target)

class RNNCell(Layer):
  def __init__(self,n_inputs,n_hidden,n_output,activation='sigmoid'):
    super().__init__()

    self.n_inputs = n_inputs
    self.n_hidden = n_hidden
    self.n_output = n_output

    if(activation == 'sigmoid'):
      self.activation = Sigmoid()
    elif(activation == 'tanh'):
      self.activation = Tanh()
    else:
      raise Exception("Non-linearity not found")

    self.w_ih = Linear(n_inputs, n_hidden)
    self.w_hh = Linear(n_hidden, n_hidden)
    self.w_ho = Linear(n_hidden, n_output)

    self.parameters += self.w_ih.get_parameters()
    self.parameters += self.w_hh.get_parameters()
    self.parameters += self.w_ho.get_parameters()

  def forward(self, input, hidden):
      from_prev_hidden = self.w_hh.forward(hidden)
      combined = self.w_ih.forward(input) + from_prev_hidden
      new_hidden = self.activation.forward(combined)
      output = self.w_ho.forward(new_hidden)
      return output, new_hidden

  def init_hidden(self, batch_size=1):
    # What is this used for?
    return Tensor(np.zeros((batch_size, self.n_hidden)), label="RNN Hidden State", autograd=True)

# Test RNN Implementation
import sys,random,math
from collections import Counter
import numpy as np

f = open('drive/MyDrive/grokking/qa1_single-supporting-fact_train.txt', 'r')
raw = f.readlines()
# print(raw[:10])
f.close()

tokens = list()
for line in raw[0:1000]:
  # We are starting at 1 and moving on because the first token is the number
  # indicating what example number we are on
  tokens.append(line.lower().replace("\n","").split(" ")[1:])

new_tokens = list()
for line in tokens:
  new_tokens.append(['-'] * (6 - len(line)) + line)
tokens = new_tokens
# print(new_tokens[:10])

vocab = set()
for sent in tokens:
  for word in sent:
    vocab.add(word)

vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
  word2index[word]=i

def words2indices(sentence):
  idx = list()
  for word in sentence:
    idx.append(word2index[word])
  return idx

indices = list()
for line in tokens:
  idx = list()
  for w in line:
    idx.append(word2index[w])
  indices.append(idx)

data = np.array(indices)

# Fit the training data
embed = Embedding(vocab_size=len(vocab), dim=16)
model = RNNCell(n_inputs=16, n_hidden=16, n_output=len(vocab))

criterion = CrossEntropyLoss()
params = model.get_parameters() + embed.get_parameters()
optim = SGD(parameters=params, alpha=0.05)

# Train
for iter in range(1000):
  batch_size = 100
  total_loss = 0

  hidden = model.init_hidden(batch_size=batch_size)

  for t in range(5):
    input = Tensor(data[0:batch_size,t], "RNN_Input", autograd=True)
    rnn_input = embed.forward(input=input)
    output, hidden = model.forward(input=rnn_input, hidden=hidden)

  target = Tensor(data[0:batch_size, t+1], "RNN_Target", autograd=True)
  loss = criterion.forward(output, target)
  loss.backward()
  optim.step()
  total_loss += loss.data
  if(iter % 200 == 0):
    p_correct = (target.data == np.argmax(output.data, axis=1)).mean()
    print_loss = total_loss / (len(data)/batch_size) # Why do we do the division by the batch_size
    print("Loss:",print_loss,"% Correct:",p_correct)

# Test
batch_size = 1
hidden = model.init_hidden(batch_size=batch_size)
for t in range(5):
  input = Tensor(data[0:batch_size, t], autograd=True)
  rnn_input = embed.forward(input=input)
  output, hidden = model.forward(input=rnn_input, hidden=hidden)

target = Tensor(data[0:batch_size, t+1], autograd=True)
loss = criterion.forward(output, target)

ctx = ""
for idx in data[0:batch_size][0][0:-1]:
  ctx += vocab[idx] + " "
print("Context:",ctx)
print("Pred:", vocab[output.data.argmax()])

# # Test Cross Entropy Loss
# import numpy as np
# np.random.seed(0)

# # data indices
# data = Tensor(np.array([1,2,1,2]), autograd=True)

# # target indices
# target = Tensor(np.array([0,1,0,1]), autograd=True)

# model = Sequential([Embedding(3,3), Tanh(), Linear(3,4)])
# criterion = CrossEntropyLoss()

# optim = SGD(parameters=model.get_parameters(), alpha=0.1)

# for i in range(10):
#   pred = model.forward(data)
#   loss = criterion.forward(pred, target)
#   loss.backward(Tensor(np.ones_like(loss.data)))
#   optim.step()
#   print(loss)
# Test Mean Squared Error(MSE) Loss
# import numpy
# np.random.seed(0)

# data = Tensor(np.array([[0,0], [0,1], [1,0], [1,1]]), autograd=True)
# target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

# model = Sequential([Linear(2,3), Linear(3,1)])
# criterion = MSELoss()

# optim = SGD(parameters=model.get_parameters(), alpha=0.05)

# for i in range(10):
#   # predict
#   pred = model.forward(data)

#   # Compare
#   # Using Mean Squared Error
#   loss = criterion.forward(pred, target)

#   # Learn
#   loss.backward()
#   optim.step()
#   print(loss)

Loss: 0.5197367346295365 % Correct: 0.0
when does this get called
ALL CREATORS
Linear b
child ---> Linear b.expand_0
child creation_op ---> expand_0
when does this get called
ALL CREATORS
sigmoid_index_select w/ Embedding weight.dot_Linear W + Linear b.expand_0 + sigmoid_index_select w/ Embedding weight.dot_Linear W + Linear b.expand_0 + RNN Hidden State.dot_Linear W + Linear b.expand_0.dot_Linear W + Linear b.expand_0
Linear W
child ---> sigmoid_index_select w/ Embedding weight.dot_Linear W + Linear b.expand_0 + sigmoid_index_select w/ Embedding weight.dot_Linear W + Linear b.expand_0 + RNN Hidden State.dot_Linear W + Linear b.expand_0.dot_Linear W + Linear b.expand_0.dot_Linear W
child creation_op ---> mm
when does this get called
ALL CREATORS
Linear b
child ---> Linear b.expand_0
child creation_op ---> expand_0
when does this get called
ALL CREATORS
Linear W
child ---> Linear W.transpose
child creation_op ---> transpose
when does this get called
ALL CREATORS
sigmoid_index_select w/ 

KeyboardInterrupt: 

Nonlinearities

In [41]:
np.random.seed(0)
# Embedding Layer test
data = Tensor(np.array([1,2,1,2]), autograd=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

embed = Embedding(5,3)
model = Sequential([embed, Tanh(), Linear(3,1), Sigmoid()])
criterion = MSELoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.5)

for i in range(10):
  # Predict
  pred = model.forward(data)

  # Compare
  loss = criterion.forward(pred, target)

  # Learn
  loss.backward(Tensor(np.ones_like(loss.data)))
  optim.step()
  print(loss)

[0.98874126]
[0.65968299]
[0.44360414]
[0.29737218]
[0.20766265]
[0.15315922]
[0.11857244]
[0.09539963]
[0.07908649]
[0.06711553]


In [33]:
import numpy
np.random.seed(0)

data = np.array([[0,0], [0,1], [1,0], [1,1]])
target = np.array([[0], [1], [0], [1]])

weights_0_1 = np.random.rand(2,3)
weights_1_2 = np.random.rand(3,1)

for i in range(10):

  # These 'layer' variables are intermediate variables that won't be needed
  # when we use the autograd engine
  # Predict
  layer_1 = data.dot(weights_0_1)
  layer_2 = layer_1.dot(weights_1_2)

  # Compare
  diff = (layer_2 - target)
  sqdiff = (diff * diff)
  # Mean Squared Error Loss
  loss = sqdiff.sum(0)

  # Learn this this is the backpropagation piece
  layer_1_grad = diff.dot(weights_1_2.transpose())
  weight_1_2_update = layer_1.transpose().dot(diff)
  weight_0_1_update = data.transpose().dot(layer_1_grad)

  # Update the weights
  weights_1_2 -= weight_1_2_update * 0.1
  weights_0_1 -= weight_0_1_update * 0.1
  print(loss[0])

5.066439994622396
0.4959907791902341
0.4180671892167177
0.35298133007809646
0.2972549636567376
0.24923260381633278
0.20785392075862477
0.17231260916265181
0.14193744536652994
0.11613979792168387


Autograd Backprop Toy Example

In [48]:
import numpy
np.random.seed(0)

# Why does the Tensor.data need to np.array() here?
# I think its unneccessary... I already got the same loss result without having np.array here
# My concern is that we are leveraging numpy in places in the autograd Tensor code.
  # Numpy functions get used in sum, expand, mm...
data = Tensor(np.array([[0,0], [0,1], [1,0], [1,1]]), autograd=True)
target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

w = list()
w.append(Tensor(np.random.rand(2,3), label='w1', autograd=True))
w.append(Tensor(np.random.rand(3,1), label='w2', autograd=True))

optim = SGD(parameters=w, alpha=0.1)
for i in range(10):

  # Predict
  pred = data.mm(w[0]).mm(w[1])

  # Compare
  loss = ((pred-target)* (pred-target)).sum(0)

  # Learn
  loss.backward(Tensor(np.ones_like(loss.data)))
  optim.step()
  # print('w',w)
  # Replace below code with call to SGD class 'optim.step()'
  # for w_ in w:
  #   # print(w_.data)
  #   # print(w_.grad)
  #   w_.data -= w_.grad.data * 0.1
  #   w_.grad.data *= 0

  print('loss',loss)

loss [0.58128304]
loss [0.48988149]
loss [0.41375111]
loss [0.34489412]
loss [0.28210124]
loss [0.2254484]
loss [0.17538853]
loss [0.1324231]
loss [0.09682769]
loss [0.06849361]


End Grokking Autograd Implementation

A toy example of RNN Backpropagation with exploding and vanishing gradients

In [None]:
import numpy as np

In [None]:
(sigmoid,relu) = (lambda x: 1/1+np.exp(-x)), lambda x: (x>0).astype(float)*x
weights = np.array([[1,4],[4,1]])
activation = sigmoid(np.array([1, 0.01]))

print("Sigmoid Activations")
activations = list()
for iter in range(10):
  activation = sigmoid(activation.dot(weights))
  activations.append(activation)
  print(activation)
print("\nSigmoid Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
  # The derivative of sigmoid causes very small gradients when activation is very near 0 or 1
  sigmoid_deriv = (activation) * (1-activation)
   # Chain Rule
  gradient = sigmoid_deriv * gradient
  gradient = gradient.dot(weights.transpose()) # So this is also part of the chain rule???
  print(gradient)

print("\nRelu Activations")
activations = list()
for iter in range(10):
  # The matrix multiplication causes exploding gradients that don't get squashed by a nonlinearity as in sigmoid
  activation = relu(activation.dot(weights))

  activations.append(activation)
  print(activation)
print("\n Relu Gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
  gradient = ((activation > 0) * gradient).dot(weights.transpose())
  print(gradient)

# Adding gates to RNN will replace all of the nonlinearies and matrix multiplications

Sigmoid Activations
[1.00008889 1.00057475]
[1.00672188 1.00673168]
[1.006515   1.00651519]
[1.00652199 1.00652199]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]
[1.00652177 1.00652177]

Sigmoid Gradients
[-0.03282154 -0.03282154]
[0.00107725 0.00107725]
[-3.53571093e-05 -3.53571093e-05]
[1.16047468e-06 1.16047468e-06]
[-3.80885641e-08 -3.80885641e-08]
[1.25012386e-09 1.25012385e-09]
[-4.10323732e-11 -4.10323591e-11]
[1.34536847e-12 1.34534485e-12]
[-4.55737812e-14 -4.55341556e-14]
[1.08795551e-16 4.23921767e-17]

Relu Activations
[5.00238791 5.00093033]
[25.00610921 25.01048197]
[125.04803709 125.03491883]
[625.1877124  625.22706719]
[3126.09598115 3125.97791678]
[15630.00764826 15630.36184138]
[78151.45501378 78150.39243441]
[390753.02475143 390756.21248955]
[1953777.87470964 1953768.31149529]
[9768851.12069078 9768879.81033384]

 Relu Gradients
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15