# Tutorial for auto differentiation package

This package was created by @Faris-ML.

### How to differentiate automatically?

The derivative is taken by going through the following steps:

1- Define the equation using the package's operators

2- Create a graph

3- Compute the forward pass and the backward pass

### Methodology:

To be able to take the derivative, the equation is first converted into a graph.

example:

Let's say that we want to evaluate this equation:

y = (x^2)+6

After converting the equation to a graph, it will look like this:

![image](images/graph.png)

Then, using the chain rule, we compute the gradients by taking the backward pass, as shown in the graph below:

![image](images/d_graph.png)

## Let's get started with the implementation

In [1]:
from ElhamMath import Tensor, mul, Constant, Variable, divide, exp, add, Graph
from typing import Union

Define variables and constants

In [None]:
def sigmoid(x: Union[Variable, Constant]):
    """Build the graph expression for the logistic function of ``x``.

    The expression is assembled from package operators as
    ``1 / (1 + exp(-1 * x))``.

    Args:
        x: Input node (a ``Variable`` or ``Constant``).

    Returns:
        The node produced by the final ``divide`` operator.
    """
    unit = Constant(Tensor(1.0), "ones")
    minus_one = Constant(Tensor(-1.0), "neg_ones")
    # Negation is written as a multiplication by the constant -1.
    negated = mul(minus_one, x, "neg_mul")
    denominator = add(unit, exp(negated), "add")
    return divide(unit, denominator, "devide")


# Sizes of the four nested list levels, outermost first.
B, C, H, W = 2, 3, 4, 5

# Nested B x C x H x W list; every entry is a linear function of its indices.
data = [
    [
        [
            [(b - 0.5 * c + 0.1 * h - 0.2 * w) for w in range(W)]
            for h in range(H)
        ]
        for c in range(C)
    ]
    for b in range(B)
]

# Wrap the nested list as the differentiable input and apply sigmoid to it.
x = Variable(Tensor(data), "x")
y = sigmoid(x)

Create a graph

In [3]:
# Build a Graph from y, the output node of the sigmoid expression above.
graph = Graph(y)

In [4]:
import time

Compute forward pass and backward pass

In [5]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() is wall-clock time and can jump
# when the system clock is adjusted.
s = time.perf_counter()
forward = graph.forward()
print("the forward pass : ", forward.data)
graph.backward()
print("the derivative with respect to x is : ", x.grad.data)
e = time.perf_counter()
# Elapsed time in milliseconds. Note that the measured interval also
# includes the two print calls above, not only the forward/backward passes.
print((e - s) * 1000)

the forward pass :  [0.5, 0.45016600268752216, 0.401312339887548, 0.35434369377420455, 0.31002551887238755, 0.52497918747894, 0.47502081252106, 0.425557483188341, 0.3775406687981454, 0.3318122278318339, 0.549833997312478, 0.5, 0.45016600268752216, 0.401312339887548, 0.35434369377420455, 0.574442516811659, 0.52497918747894, 0.47502081252106, 0.425557483188341, 0.3775406687981454, 0.3775406687981454, 0.3318122278318339, 0.289050497374996, 0.24973989440488234, 0.2141650169574414, 0.401312339887548, 0.35434369377420455, 0.31002551887238755, 0.2689414213699951, 0.23147521650098232, 0.425557483188341, 0.3775406687981454, 0.3318122278318339, 0.289050497374996, 0.24973989440488234, 0.45016600268752216, 0.401312339887548, 0.35434369377420455, 0.31002551887238755, 0.2689414213699951, 0.2689414213699951, 0.23147521650098238, 0.19781611144141825, 0.16798161486607552, 0.14185106490048777, 0.289050497374996, 0.24973989440488234, 0.2141650169574414, 0.18242552380635635, 0.15446526508353467, 0.3100255

In [6]:
from typing import Sequence, List, Union
from ElhamMath import Tensor, Variable, Constant, exp, mul, divide, sub, sqrt, matmul, Node, power, Graph, add


# ----- activations -----
def sigmoid(x: Node):
    """Return the graph node for ``1 / (1 + exp(-1 * x))``.

    Args:
        x: Input node.

    Returns:
        The node produced by the final ``divide`` operator.
    """
    unit = Constant(Tensor(1.0), "ones")
    minus_one = Constant(Tensor(-1.0), "neg_ones")
    # -x is expressed as (-1) * x with the package's mul operator.
    negated = mul(minus_one, x, "neg_mul")
    denominator = add(unit, exp(negated), "add")
    return divide(unit, denominator, "devide")


def tanh(x: Node):
    """Return the graph node for ``(exp(x) - exp(-x)) / (exp(x) + exp(-x))``.

    The expression is composed from ``exp``, ``mul``, ``sub``, ``add`` and
    ``divide``; no native tanh operator is used.

    Args:
        x: Input node.
    """
    grow = exp(x)
    minus_one = Constant(Tensor(-1), "tanh")
    decay = exp(mul(minus_one, x))
    numerator = sub(grow, decay)
    denominator = add(grow, decay)
    return divide(numerator, denominator)


def relu(x: Node):
    """Return the graph node for ``0.5 * (x + sqrt(x * x + 1e-12))``.

    ``sqrt(x * x + eps)`` stands in for ``|x|``, so the result approximates
    ``(x + |x|) / 2``.

    Args:
        x: Input node.
    """
    half = Constant(Tensor(0.5), "relu1")
    eps = Constant(Tensor(1e-12), "relu2")
    squared = mul(x, x)
    approx_abs = sqrt(add(squared, eps))
    return mul(half, add(x, approx_abs))


# ----- layers -----
class Linear:
    """Layer that computes ``add(matmul(x, W), b)`` as graph nodes.

    ``W`` is a Variable named "W" built from an ``in_features`` by
    ``out_features`` nested list of Gaussian samples (mean 0.0, standard
    deviation ``std``). ``b`` is a Variable named "b" built from
    ``out_features`` zeros.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        std: float = 0.01,
    ):
        import random

        def sample_row():
            # One row of the weight matrix: out_features Gaussian draws.
            return [random.gauss(0.0, std) for _ in range(out_features)]

        weight_rows = [sample_row() for _ in range(in_features)]
        bias_values = [0.0 for _ in range(out_features)]

        self.W = Variable(Tensor(weight_rows), "W")
        self.b = Variable(Tensor(bias_values), "b")

    def __call__(self, x: Node):
        """Apply the layer to ``x`` and return the resulting node."""
        projected = matmul(x, self.W)
        # The bias is combined through add(); per the original note this
        # relies on broadcasting.
        return add(projected, self.b)

    @property
    def params(self):
        """List holding the weight and bias Variables, in that order."""
        return [self.W, self.b]


class MLP:
    """Two ``Linear`` layers, each followed by ``sigmoid``."""

    def __init__(
        self,
        in_features: int,
        hidden_features: int,
        out_features: int,
    ):
        self.l1 = Linear(in_features, hidden_features)
        self.l2 = Linear(hidden_features, out_features)

    def __call__(self, x: Node):
        """Run ``x`` through both layers and return the output node."""
        # relu or tanh can be substituted for the hidden activation.
        hidden = sigmoid(self.l1(x))
        # Sigmoid on the output is intended for binary targets.
        return sigmoid(self.l2(hidden))

    @property
    def params(self):
        """Parameters of the first layer followed by those of the second."""
        return [*self.l1.params, *self.l2.params]


# ----- losses -----
def mse_loss(y_pred: Node, y_true: Node):
    """Return the node for ``(y_pred - y_true) ** 2``.

    Only ``sub`` and ``power`` are applied; this function performs no mean
    or sum reduction itself.

    Args:
        y_pred: Prediction node.
        y_true: Target node.
    """
    difference = sub(y_pred, y_true)
    exponent = Constant(Tensor(2), "loss_power")
    return power(difference, exponent)


# def bce_loss(y_pred: Variable, y_true: Constant, eps: float = 1e-7):
#     # Binary cross-entropy: -[y*log(p) + (1-y)*log(1-p)]
#     p = y_pred
#     one = 1.0
#     loss_tensor = -(y_true * (p + eps).ln() + (one - y_true) * (one - p + eps).ln())
#     return reduce_to_shape(loss_tensor, []) if reduce_to_shape else loss_tensor.sum()


# ===== Example wiring =====
# Toy data (N=4, D=2 -> binary label)
# Four 2-element input rows and their single-value targets.
X_py = [[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]
Y_py = [[0.0], [1.0], [1.0], [0.0]]

# Inputs and targets enter the graph as Constants.
X = Constant(Tensor(X_py), "x")
Y = Constant(Tensor(Y_py), "y")

# 2 input features -> 4 hidden features -> 1 output feature.
model = MLP(in_features=2, hidden_features=4, out_features=1)

# Wire prediction and loss together, then build the graph from the loss node.
pred = model(X)
loss = mse_loss(pred, Y)
g = Graph(loss)

In [11]:
# Run the forward pass; as the cell's last expression, its result is shown below.
g.forward()

Tensor(shape=[4, 1], device=Device.CPU, data=[[0.24487506552908433], [0.2551196574284689], [0.25516366884579317], [0.2449462829870329]])

In [18]:
# Run the backward pass; gradients are read from the Variables afterwards
# (e.g. model.l1.W.grad in the next cell).
g.backward()

In [19]:
# Display the gradient stored on the first Linear layer's weight Variable.
model.l1.W.grad

Tensor(shape=[2, 4], device=Device.CPU, data=[[3.9494740950173936e-05, 6.455336752157997e-06, 0.0001378081227428724, 8.167849802356636e-05], [3.939078921131868e-05, 6.438598401613453e-06, 0.00013641906167491087, 8.051873598420459e-05]])