In [1]:
from micrograd.engine import Value

In [3]:
# Build a small expression graph from two leaf Values (Value comes from
# micrograd.engine), evaluate it, then back-propagate from the result.
a = Value(-4.0)
b = Value(2.0)
c = a + b
d = a * b + b**3
# The augmented assignments below use the current c / d on the right-hand
# side as well, so each earlier value feeds into the next expression.
c += c + 1
c += 1 + c + (-a)
d += d * 2 + (b + a).relu()
d += 3 * d + (b - a).relu()
e = c - d
f = e**2
g = f / 2.0
g += 10.0 / f
print(f'{g.data:.4f}') # prints 24.7041, the outcome of this forward pass
# Backward pass starting at g; afterwards a.grad and b.grad are read below.
g.backward()
print(f'{a.grad:.4f}') # prints 138.8338, i.e. the numerical value of dg/da
print(f'{b.grad:.4f}') # prints 645.5773, i.e. the numerical value of dg/db


24.7041
138.8338
645.5773


In [4]:
# Important note (I'm not sure whether Karpathy mentions it):
# You MUST avoid in-place operations for autograd to process gradients correctly.
# So don't write x += Layer(x); write x = x + Layer(x) instead.

In [2]:
import math
import numpy as np
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [3]:
# Recalling derivatives:
def f(x):
    """Evaluate the quadratic 3x^2 - 4x + 5 at x."""
    # Terms are combined in the same left-to-right order as the one-line
    # formula so floating-point results are unchanged.
    quadratic_term = 3*x**2
    linear_term = 4*x
    return quadratic_term - linear_term + 5

# Forward-difference estimate of f'(a): slope of f between a and a + h.
h = 0.000001
a = 3
fprime = (f(a+h)-f(a))/h
print(fprime)

# Central (symmetric) difference over [a - h, a + h]; computed but not printed here.
fsymmprime = (f(a+h)-f(a-h))/(2*h)

14.000003002223593


In [3]:
# Value is a class where standard mathematical operations can be made
# and the new values "keep track" of how they were obtained from prior values.
class Value:
    """Scalar that remembers how it was computed so gradients can be back-propagated.

    Every arithmetic operation returns a new Value whose `_prev` holds the
    operands and whose `_backward` closure adds the local derivative times
    `out.grad` into each operand's `grad`.
    """

    def __init__(self, data, _children=(), _op=''):
        """Wrap `data`; `_children` are the operand Values and `_op` a label for the operation."""
        self.data = data
        self.grad = 0.0                  # accumulated gradient, filled in by backward()
        self._prev = set(_children)      # operands this value was computed from
        self._op = _op                   # operation label ('' for leaves)
        self._backward = lambda: None    # leaves have nothing to propagate

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return self + other; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # Addition passes the upstream gradient through unchanged.
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        """Return self * other; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # Product rule: each factor receives the other factor's value.
            self.grad += out.grad * other.data
            other.grad += out.grad * self.data
        out._backward = _backward
        return out

    def exp(self):
        """Return e**self."""
        x = self.data
        t = math.exp(x)
        out = Value(t, (self, ), 'exp')

        def _backward():
            # d/dx e^x = e^x
            self.grad += out.grad * t
        out._backward = _backward

        return out

    def log(self):
        """Return ln|self|. math.log raises ValueError when self.data is 0."""
        x = self.data
        t = math.log(abs(x))
        out = Value(t, (self, ), 'log')

        def _backward():
            # d/dx ln|x| = 1/x: the sign follows x, so do not divide by |x|.
            self.grad += out.grad / x
        out._backward = _backward

        return out

    def __pow__(self, other):
        """Return self ** other, where other may be a number or a Value."""
        other = other if isinstance(other, Value) else Value(other)
        t = self.data**other.data
        # The exponent is recorded as a child so that, when it is itself the
        # result of other operations, backward() keeps propagating through it.
        out = Value(t, (self, other), f'**{other.data}')

        def _backward():
            base, exponent = self.data, other.data
            if exponent != 0:
                # d/dx x^y = y * x^(y-1); for y == 0 the result is constant,
                # and skipping it avoids evaluating 0**-1 at base 0.
                self.grad += out.grad * (exponent * base**(exponent - 1))
            if base != 0:
                # d/dy x^y = x^y * ln|x|; ln is undefined at base 0, where
                # the contribution is left at zero instead of raising.
                other.grad += out.grad * (t * math.log(abs(base)))
        out._backward = _backward

        return out

    def tanh(self):
        """Return tanh(self)."""
        x = self.data
        # math.tanh saturates to +/-1 for large |x|; computing it from
        # exp(2x) raises OverflowError there.
        t = math.tanh(x)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)^2
            self.grad += out.grad * (1 - t**2)
        out._backward = _backward
        return out

    def algsigmoid(self):
        """Algebraic sigmoid: x / (2 + 2|x|) + 0.5, with values in (0, 1)."""
        x = self.data
        denom = 2 + 2*abs(x)
        t = x / denom + 0.5
        out = Value(t, (self, ), 'algsigmoid')

        def _backward():
            # d/dx [x / (2 + 2|x|)] = 2 / (2 + 2|x|)^2, which is also valid
            # at x == 0 (slope 1/2), so no special case is needed and the
            # gradient is always accumulated, never overwritten.
            self.grad += out.grad * 2 / denom**2
        out._backward = _backward
        return out

    def algelu(self):
        """Algebraic GELU-style activation: x * algsigmoid(2x)."""
        x = self.data
        denom = 1 + 2*abs(x)
        # algsigmoid(2x) = 2x / (2 + 4|x|) + 0.5 = x / (1 + 2|x|) + 0.5
        gate = x / denom + 0.5
        out = Value(x * gate, (self, ), 'algelu')

        def _backward():
            # Product rule: gate + x * d(gate)/dx, with d(gate)/dx = 1 / (1 + 2|x|)^2.
            self.grad += out.grad * (gate + x / denom**2)
        out._backward = _backward
        return out

    def backward(self):
        """Back-propagate from this value, accumulating into `grad` of every node it depends on.

        Gradients are added to whatever `grad` already holds, so reset them
        first when calling backward() more than once on shared nodes.
        """
        # Topologically sort the graph with an explicit stack (post-order DFS)
        # so long chains of operations do not hit Python's recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # Every operand of `node` has been emitted already.
                topo.append(node)
                stack.pop()

        # Initialize the final value to be 1, since its partial wrt itself is 1
        self.grad = 1.0
        # Then backpropagate from the output towards the leaves:
        for node in reversed(topo):
            node._backward()

    def __radd__(self, other):
        """Support `number + Value`."""
        other = other if isinstance(other, Value) else Value(other)
        return other + self

    def __rmul__(self, other):
        """Support `number * Value`."""
        other = other if isinstance(other, Value) else Value(other)
        return other * self

    def __truediv__(self, other):
        """Return self / other, expressed as self * other**-1."""
        return self * other**-1

    def __sub__(self, other):
        """Return self - other, expressed as self + (-1 * other)."""
        return self + (-1*other)

    def __rsub__(self, other):
        """Support `number - Value`."""
        return other + (-self)

    def __neg__(self):
        """Return -self."""
        return self * -1

    def __rtruediv__(self, other):
        """Support `number / Value`."""
        return other * self**-1

In [4]:
# More activation functions (draft Value methods, kept commented out):

    #  def logistic(self):
    #     x = self.data
    #     t = 1/(1 + math.exp(-1.702*x))
    #     out = Value(t, (self, ), 'logistic')
    #     def _backward():
    #         self.grad += out.grad * (t * (1-t))
    #     out._backward = _backward
    #     return out

    # def gelu(self):
    #     x = self.data
        
    #     l = self.logistic()
    #     t = x*l
    #     out = Value(t, (self, ), 'gelu')
    #     def _backward():
    #         self.grad += out.grad * (l + x*1.702*(l.grad))
    #     out._backward = _backward
    #     return out

    # def algelu(self):
    #     x = self.data
    #     algs = self.algsigmoid(2*x).data
    #     t = x*algs
    #     out = Value(t, (self, ), 'algelu')
    #     def _backward():
    #         if self.data == 0:
    #             self.grad += 0
    #         else:
    #             self.grad += out.grad * ((x**2 + tabs*abs) / (abs*tabs**2))
    #     out._backward = _backward
    #     return out

    # def elu(self):
    #     x = self.data
    #     t = 0
    #     if x >= 0:
    #         t = x
    #     else:
    #         t = e**x - 1
    #     out = Value(t, (self, ), 'elu')
    #     def _backward():
    #         if x >= 0:
    #             self.grad += out.grad
    #         else:
    #             self.grad += out.grad * (t + 1)
    #     out._backward = _backward
    #     return out

In [5]:
# Now doing everything above in PyTorch:

import torch

In [20]:
# Inputs, weights and bias of a single neuron as float64 leaf tensors that
# track gradients. They are created directly in float64: torch.Tensor(...)
# builds a float32 tensor first, so a later .double() would keep the
# float32 rounding of values such as 6.88137.
x1 = torch.tensor([2.0], dtype=torch.float64, requires_grad=True)
x2 = torch.tensor([0.0], dtype=torch.float64, requires_grad=True)
w1 = torch.tensor([-3.0], dtype=torch.float64, requires_grad=True)
w2 = torch.tensor([1.0], dtype=torch.float64, requires_grad=True)
b = torch.tensor([6.88137], dtype=torch.float64, requires_grad=True)
n = x1*w1 + x2*w2 + b # same as what we did in micrograd


In [6]:
class Neuron:
    """Single tanh unit holding one weight per input plus a bias."""

    def __init__(self, nin):
        """Create `nin` weights and then a bias, each a Value drawn from random.uniform(0, 1)."""
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(0,1)))
        self.w = weights
        self.b = Value(random.uniform(0,1))

    def __call__(self, x):
        """Return tanh(b + sum of w_i * x_i) for the inputs `x`."""
        # Start from the bias and add the weighted inputs one at a time.
        pre_activation = self.b
        for weight, feature in zip(self.w, x):
            pre_activation = pre_activation + weight*feature
        return pre_activation.tanh()

    def parameters(self):
        """Return the weights followed by the bias as one new list."""
        params = list(self.w)
        params.append(self.b)
        return params

class Layer:
    """Group of neurons that all receive the same input."""

    def __init__(self, nin, nout):
        """Create `nout` neurons, each taking `nin` inputs."""
        self.neurons = []
        for _ in range(nout):
            self.neurons.append(Neuron(nin))

    def __call__(self, x):
        """Return a list with each neuron's output for `x`, in neuron order."""
        activations = []
        for neuron in self.neurons:
            activations.append(neuron(x))
        return activations

    def parameters(self):
        """Return the parameters of every neuron as one flat list."""
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.parameters())
        return collected


class MultiPercep:
    """Multi-layer perceptron: a chain of Layer objects applied in sequence."""

    def __init__(self, nin, nouts):
        """Build one Layer per entry of the list `nouts`; the first layer takes `nin` inputs."""
        layer_sizes = [nin] + nouts
        # Consecutive size pairs give each layer's (inputs, outputs).
        self.layers = [
            Layer(fan_in, fan_out)
            for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:])
        ]

    def __call__(self, x):
        """Feed `x` through every layer in order and return the last layer's output."""
        activations = x
        for layer in self.layers:
            activations = layer(activations)
        return activations

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [None]:
# Quick check of a single layer: 2 inputs feeding 3 neurons.
x = [2.0, 3.0]
n = Layer(2,3)
# Calling the layer returns a list with one output per neuron.
n(x)

In [8]:
# Network with 3 inputs, two layers of 4 neurons, and a single output neuron.
n = MultiPercep(3, [4,4,1])
# Four training examples with three features each.
xs = [
    [0.2, 0.3, 0.7],
    [0.5, 0.5, 0.0],
    [0.0, 0.1, 0.0],
    [0.3, 0.1, 0.3]
]

# One target value per row of xs.
ys = [0.2, 0.3, 0.1, 0.4]

In [None]:
# Forward pass: n(x) returns a list of outputs per example; flatten them into one list.
ypred = [output for x in xs for output in n(x)]
# Loss: sum of squared differences between predictions and targets.
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
print(ypred)
print(loss.data)

In [None]:
for i in range(0,20000):

    # This step is the forward pass:
    ypred = [output for x in xs for output in n(x)]
    # The loss is the sum of squared errors over the examples:
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

    # backward() accumulates into p.grad, so clear the parameter gradients
    # first. Zeroing before the backward pass (rather than after the update)
    # keeps the first step correct even if gradients were left over from an
    # earlier or interrupted backward() call.
    for p in n.parameters():
        p.grad = 0.0

    # To minimize loss, use gradient descent:
    # This step is the backward pass:
    loss.backward()
    for p in n.parameters():
        # Step against the gradient with learning rate 0.05.
        p.data += -0.05 * p.grad
    if i % 1000 == 0:
        print(loss.data)

In [2]:
def softmax(logits):
    """Normalize `logits` into weights that sum to 1 via exp(l_i) / sum_j exp(l_j).

    Entries may be plain numbers or objects that provide an `.exp()` method
    and a `.data` attribute (such as Value); the outputs have the same kind
    as the inputs. An empty input gives an empty list.
    """
    import math  # local import so the function does not depend on cell order

    logits = list(logits)
    if not logits:
        return []

    def _is_node(v):
        # Autograd-style objects exponentiate themselves; plain numbers do not.
        return hasattr(v, 'exp')

    # Subtracting the largest logit leaves the result mathematically
    # unchanged but keeps every exponent <= 0, so exp() cannot overflow.
    shift = max(l.data if _is_node(l) else l for l in logits)
    counts = []
    for logit in logits:
        centered = logit - shift
        counts.append(centered.exp() if _is_node(centered) else math.exp(centered))
    denominator = sum(counts)
    return [c / denominator for c in counts]

In [None]:
# Call softmax on the targets list ys, which holds plain Python floats.
softmax(ys)