In [164]:
import math
import random

import numpy as np
import matplotlib.pyplot as plt
from graphviz import Digraph

In [165]:
# Value - autograd wrapper
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._children = list(_children)
        self._op = _op
        self.label = label
        # if label:  # is labelled?
        #     self.label = label
        # elif len(_children) == 1:  # is unary op?
        #     self.label = f'{_op}({_children[0].label})'
        # elif len(_children) == 2:  # is binary op?
        #     self.label = f'{_children[0].label} {_op} {_children[1].label}'
        # else:  # is leaf?
        #     self.label = f"{data:.4f}"

    def __repr__(self):
        if self.label:
            return f"Value(label={self.label}, data={self.data:.4f})"
        else:
            return f"Value(data={self.data:.4f})"

    def tanh(self):
        t = (math.exp(2 * self.data) - 1) / (math.exp(2 * self.data) + 1)
        # t = math.tanh(x)
        out = Value(t, (self,), 'tanh')

        def _backward():
            self.grad += out.grad * (1 - out.data**2)
        out._backward = _backward

        return out

    def exp(self):
        out = Value(math.exp(self.data), (self,), 'exp')

        def _backward():
            self.grad += out.grad * out.data
        out._backward = _backward

        return out

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad * 1.0
            other.grad += out.grad * 1.0
        out._backward = _backward

        return out

    def __radd__(self, other):
        return Value(other) + self

    def __sub__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data - other.data, (self, other), '-')

        def _backward():
            self.grad += out.grad * 1.0
            other.grad += out.grad * (-1.0)
        out._backward = _backward

        return out

    def __rsub__(self, other):
        return Value(other) - self

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data
        out._backward = _backward

        return out

    def __rmul__(self, other):
        return Value(other) * self

    def __truediv__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data / other.data, (self, other), '/')

        def _backward():
            self.grad += out.grad * (1 / other.data)
            other.grad += out.grad * (-self.data / other.data**2)
        out._backward = _backward

        return out

    def __rtruediv__(self, other):
        return Value(other) / self

    def __pow__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data**other.data, (self, other), '**')

        def _backward():
            self.grad += out.grad * (other.data / self.data * out.data)
            # other.grad += out.grad * (math.log(self.data) * out.data)
        out._backward = _backward

        return out

    def __rpow__(self, other):
        return Value(other) ** self

    def __neg__(self):
        out = Value(-self.data, (self,), '-')

        def _backward():
            self.grad += out.grad * (-1.0)
        out._backward = _backward

        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(root):
            if root not in visited:
                visited.add(root)
                for child in root._children:
                    build_topo(child)
                topo.append(root)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()

    def zero_grad(self):
        visited = set()
        def _zero_grad(root):
            if root not in visited:
                visited.add(root)
                root.grad = 0.0
                for child in root._children:
                    _zero_grad(child)
        _zero_grad(self)

In [166]:
# draw_dot - render computation graph
def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})

    def trace(root):
        # builds a set of all nodes and edges in a graph
        nodes, edges = list(), set()
        def build(root):
            if root not in nodes:
                nodes.append(root)
                for child in root._children:
                    edges.add((child, root))
                    build(child)
        build(root)
        return nodes, edges

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(uid, f"{n.label} | data: {n.data:.4f} | grad: {n.grad:.4f}", shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(uid + n._op, n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot

In [167]:
# Neuron - neuron w/ autograd
class Neuron:
    def __init__(self, indim):
        self.w = [Value(random.uniform(-1, 1)) for _ in range(indim)]
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        act = sum([wi * xi for wi, xi in zip(self.w, x)], self.b)
        out = act.tanh()
        return out

    def parameters(self):
        return self.w + [self.b]

x = [2.0, 3.0]
n = Neuron(2)
n(x)

Value(data=-0.8466)

In [168]:
# Layer - layer of neurons w/ autograd
class Layer:
    def __init__(self, indim, outdim):
        self.neurons = [Neuron(indim) for _ in range(outdim)]

    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs

    def parameters(self):
        return [p for neuron in self.neurons for p in neuron.parameters()]

x = [2.0, 3.0]
n = Layer(2, 3)
n(x)

[Value(data=0.6620), Value(data=-0.2306), Value(data=0.9997)]

In [169]:
# Layer - multilayer perceptron w/ autograd
class MLP:
    def __init__(self, indim, outdims):
        sz = [indim] + outdims
        self.layers = [Layer(sz[i], sz[i + 1]) for i in range(len(outdims))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x[0] if len(x) == 1 else x

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

x = [2.0, 3.0]
n = MLP(2, [4, 4, 1])
print(n(x))
# draw_dot(n(x))

Value(data=-0.8618)


In [170]:
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]
ys = [1.0, -1.0, -1.0, 1.0]

n = MLP(2, [4, 4, 1])
y_pred = [n(x) for x in xs]
print(y_pred)

[Value(data=-0.3609),
 Value(data=-0.4790),
 Value(data=-0.5385),
 Value(data=-0.4557)]

In [176]:
loss = sum((y - y_true)**2 for y_true, y in zip(ys, y_pred))
print("Loss:", loss)

Loss: Value(data=4.4555)


In [794]:
# full optimization pass
loss.zero_grad()
loss.backward()
for p in n.parameters():
    p.data -= 1 * p.grad

y_pred = [n(x) for x in xs]
loss = sum((y - y_true)**2 for y_true, y in zip(ys, y_pred))
print("Loss:", loss)

Loss: Value(data=0.0003)


In [796]:
y_pred = [n(x) for x in xs]
print(y_pred)

[Value(data=0.9987), Value(data=-0.9914), Value(data=-0.9911), Value(data=0.9884)]


In [803]:
# Full model training procedure
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]
ys = [1.0, -1.0, -1.0, 1.0]

n = MLP(2, [4, 1])
for k in range(1000):
    # forward pass
    y_pred = [n(x) for x in xs]
    # calculate loss
    loss = sum((y - y_true)**2 for y_true, y in zip(ys, y_pred))
    # backward pass
    loss.zero_grad()
    loss.backward()
    # update parameters
    for p in n.parameters():
        p.data -= 0.05 * p.grad

    print(k, loss.data)

0 5.279132349537837
1 3.351747832684072
2 3.181184724585142
3 3.0626653703103917
4 2.953900362960349
5 2.8610413173869738
6 2.7864836297912845
7 2.7277333660186094
8 2.680535087135788
9 2.64117465161461
10 2.607037486625714
11 2.5763965172158434
12 2.5481073933500484
13 2.5213896744755466
14 2.4956922529341927
15 2.4706135338241033
16 2.445853874879779
17 2.4211867057220693
18 2.396440593136735
19 2.3714878603970555
20 2.3462372019659816
21 2.3206287295129315
22 2.294630426172941
23 2.268235289610041
24 2.2414586346699403
25 2.2143351755271485
26 2.1869156546585033
27 2.159262945604735
28 2.1314477203452302
29 2.1035439159551386
30 2.0756243288947096
31 2.047756686526988
32 2.0200004913692333
33 1.9924048234629324
34 1.965007153738398
35 1.9378331003855902
36 1.910896973645483
37 1.8842029093005113
38 1.8577463826498652
39 1.8315159124644929
40 1.8054947974165712
41 1.7796627668367608
42 1.753997466905622
43 1.7284757382800173
44 1.7030746693091074
45 1.6777724295767507
46 1.6525489018