In [1]:
import numpy as np

import math
import random
import matplotlib.pyplot as plt

In [26]:
class Parameter:
    def __init__(self, value: float, name: str, _children = ()) -> None:
        self._value = value
        self._name = name

        self._grad = 0.0
        self._backward = lambda: None

        self._prev = set(_children)


    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"


    def __pow__(self, other):
      assert isinstance(other, (int, float)), "only supporting int/float powers for now"
      out = Parameter(
          self._value**other,
          f'**{other}',
          (self,)
      )

      def _backward():
          self._grad += other * (self._value ** (other - 1)) * out._grad
      out._backward = _backward

      return out


    def __mul__(self, other):
      other = other if isinstance(other, Parameter) else Parameter(other, '')
      out = Parameter(
          self._value * other._value,
          f'{self._name} * {other._name}',
          (self, other)
      )

      def _backward():
        self._grad += out._grad * other._value  # dL / dself
        other._grad += out._grad * self._value  # dL / dother
      out._backward = _backward

      return out


    def __add__(self, other):
      other = other if isinstance(other, Parameter) else Parameter(other, '')
      out = Parameter (
          self._value + other._value,
          f'[{self._name} + {other._name}]',
          (self, other)
      )

      def _backward():
        self._grad += 1.0 * out._grad   # dL / dself
        other._grad += 1.0 * out._grad  # dL / dother
      out._backward = _backward

      return out


    def __neg__(self): # -self
      return self * (-1)

    def __sub__(self, other):  # self - other
      other = other if isinstance(other, Parameter) else Parameter(other, '')
      return self + (-other)



    def sigmoid(self):
        # f(x) = 1 / (1 + exp(-self._value))
        # f'(x) = f(x) * (1 - f(x))

        val = 1.0 / (1.0 + math.exp(-self._value))

        result = Parameter(
            val,
             f"σ({self._name})",
            (self, )
        )

        def _backward():
            self._grad += result._grad * val * (1 - val)

        result._backward = _backward

        return result


    def silu(self):
        # silu(x) = self._value * sigmoid(self._value)
        # silu'(x) = sigmoid(self._value) * (1.0 + self._value - silu(self._value))

        sigmoid = self.sigmoid()._value
        val = self._value * sigmoid

        result = Parameter (
            val,
            f"SiLU({self._name})",
            (self, )
        )

        def _backward():
            self._grad += result._grad * (sigmoid * (1.0 + self._value - val))

        result._backward = _backward

        return result



    def softplus(self):
      # softplus(x) = ln(1 + e^x)
      # softplus'(x) = sigmoid(x)

      softplus_val = math.log(1 + math.exp(self._value))
      sigmoid_val = self.sigmoid()._value

      result = Parameter(
          softplus_val,
          f"Softplus({self._name})",
          (self, )
      )

      def _backward():
        self._grad += result._grad * sigmoid_val

      result._backward = _backward

      return result


    def backward(self):
        topo_sort = []
        visited = set()
        def build_topo_sort(v):
          visited.add(v)
          for child in v._prev:
            build_topo_sort(child)
          topo_sort.append(v)

        build_topo_sort(self)

        self._grad = 1.0
        for node in reversed(topo_sort):
          node._backward()



from typing import List

def gd(learning_rate: float, parameters: List[Parameter]) -> None:
  for p in parameters:
    p._value -= learning_rate * p._grad
    p.grad = 0.0



# Backpropagation (backward) test

In [27]:
a = Parameter(3.0, name='a')
b = Parameter(2.0, name='b')
c = Parameter(5.0, name='c')
d = Parameter(5.0, name='d')

In [28]:
print(a)
print(b)
print(c)
print(d)

Parameter a = 3.0; dL/d[a] = 0.0
Parameter b = 2.0; dL/d[b] = 0.0
Parameter c = 5.0; dL/d[c] = 0.0
Parameter d = 5.0; dL/d[d] = 0.0


In [29]:
u = a * b
v = u + c
L = v * d

print(u)
print(v)
print(L)

Parameter a * b = 6.0; dL/d[a * b] = 0.0
Parameter [a * b + c] = 11.0; dL/d[[a * b + c]] = 0.0
Parameter [a * b + c] * d = 55.0; dL/d[[a * b + c] * d] = 0.0


In [30]:
L.backward()

print(L)
print(v)
print(d)
print(u)
print(c)
print(a)
print(b)

Parameter [a * b + c] * d = 55.0; dL/d[[a * b + c] * d] = 1.0
Parameter [a * b + c] = 11.0; dL/d[[a * b + c]] = 5.0
Parameter d = 5.0; dL/d[d] = 11.0
Parameter a * b = 6.0; dL/d[a * b] = 5.0
Parameter c = 5.0; dL/d[c] = 5.0
Parameter a = 3.0; dL/d[a] = 10.0
Parameter b = 2.0; dL/d[b] = 15.0


In [31]:
x1 = Parameter(3.0, name = 'x1')
x2 = Parameter(4.0, name = 'x2')

w1 = Parameter(1.0, name = 'w1')
w2 = Parameter(2.0, name = 'w2')

x1w1 = x1 * w1;
x2w2 = x2 * w2;
xw = x1w1 + x2w2;
out = xw.silu();

print(out)
print(xw)
print(x1w1)
print(x2w2)
print(x1)
print(w1)
print(x2)
print(w2)

Parameter SiLU([x1 * w1 + x2 * w2]) = 10.999816284359673; dL/d[SiLU([x1 * w1 + x2 * w2])] = 0.0
Parameter [x1 * w1 + x2 * w2] = 11.0; dL/d[[x1 * w1 + x2 * w2]] = 0.0
Parameter x1 * w1 = 3.0; dL/d[x1 * w1] = 0.0
Parameter x2 * w2 = 8.0; dL/d[x2 * w2] = 0.0
Parameter x1 = 3.0; dL/d[x1] = 0.0
Parameter w1 = 1.0; dL/d[w1] = 0.0
Parameter x2 = 4.0; dL/d[x2] = 0.0
Parameter w2 = 2.0; dL/d[w2] = 0.0


In [32]:
out.backward()

print(out)
print(xw)
print(x1w1)
print(x2w2)
print(x1)
print(w1)
print(x2)
print(w2)

Parameter SiLU([x1 * w1 + x2 * w2]) = 10.999816284359673; dL/d[SiLU([x1 * w1 + x2 * w2])] = 1.0
Parameter [x1 * w1 + x2 * w2] = 11.0; dL/d[[x1 * w1 + x2 * w2]] = 1.0001670111501668
Parameter x1 * w1 = 3.0; dL/d[x1 * w1] = 1.0001670111501668
Parameter x2 * w2 = 8.0; dL/d[x2 * w2] = 1.0001670111501668
Parameter x1 = 3.0; dL/d[x1] = 1.0001670111501668
Parameter w1 = 1.0; dL/d[w1] = 3.0005010334505005
Parameter x2 = 4.0; dL/d[x2] = 2.0003340223003336
Parameter w2 = 2.0; dL/d[w2] = 4.000668044600667


# Checking gradient descent

In [33]:
w = Parameter(0.5, name='w')
b = Parameter(0.1, name='b')
x = Parameter(5.0, name='x')

learning_rate = 0.0001
y_target = Parameter(3.0, name='target')

for n in range(15):
  # forward pass
  y_pred = (x * w + b).softplus()
  loss = (y_pred - y_target) ** 2

  # backward pass
  loss.backward()

  # update
  gd(learning_rate, [w, b])

  print(f"loss {n} steps: \t {loss._value}")


loss 0 steps: 	 0.10781720831300634
loss 1 steps: 	 0.106847734907399
loss 2 steps: 	 0.10492582341195823
loss 3 steps: 	 0.10208527320076471
loss 4 steps: 	 0.09837610800843254
loss 5 steps: 	 0.09386377348935533
loss 6 steps: 	 0.08862808072992617
loss 7 steps: 	 0.08276190756761778
loss 8 steps: 	 0.07636967290825829
loss 9 steps: 	 0.06956560274459106
loss 10 steps: 	 0.062471810231867486
loss 11 steps: 	 0.05521621590712735
loss 12 steps: 	 0.04793033785728071
loss 13 steps: 	 0.04074698523165677
loss 14 steps: 	 0.03379789182232777


In [34]:
print(y_pred)
print(w)
print(b)

Parameter Softplus([x * w + b]) = 2.816157970468318; dL/d[Softplus([x * w + b])] = -0.36768405906336366
Parameter w = 0.5335636450314861; dL/d[w] = -38.60247457492748
Parameter b = 0.10671272900629727; dL/d[b] = -7.720494914985498
