# 13.3.3c MLP XOR: backprop

In [1]:
import numpy as np; np.set_printoptions(precision=4)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
W = np.array([[1, 1], [1, 1]]); b1 = np.array([-1,  .5])

## Forward

<div align="center">

|$x_1$|$x_2$|$\boldsymbol{z}=\mathbf{W}\boldsymbol{x}+\boldsymbol{b}_1$|$\boldsymbol{h}=\operatorname{ReLU}(\boldsymbol{z})$|$\boldsymbol{a}=\mathbf{V}\boldsymbol{h}+\boldsymbol{b}_2$|$\hat{\boldsymbol{y}}=\mathcal{S}(\boldsymbol{a})$|$-\boldsymbol{y}^t\log(\hat{\boldsymbol{y}})$|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|$0$|$0$|$(-1, 0.5)^t$|$(0, 0.5)^t$|$(0.5, -0.5)^t$|$(0.7311, 0.2689)^t$|$(0.3133, 0)^t$|
|$0$|$1$|$(0, 1.5)^t$ |$(0, 1.5)^t$|$(-0.5, 0.5)^t$|$(0.2689, 0.7311)^t$|$(0, 0.3133)^t$|
|$1$|$0$|$(0, 1.5)^t$ |$(0, 1.5)^t$|$(-0.5, 0.5)^t$|$(0.2689, 0.7311)^t$|$(0, 0.3133)^t$|
|$1$|$1$|$(1, 2.5)^t$ |$(1, 2.5)^t$|$(-0.5, 0.5)^t$|$(0.2689, 0.7311)^t$|$(1.3133, 0)^t$|

</div>

In [2]:
z = X @ W + b1; print('z =', str(z).replace('\n',','))
h = np.maximum(0, z); print('h =', str(h).replace('\n',','))
V = np.array([[1, -1], [-1, 1]]); b2 = np.array([ 1, -1])
a = h @ V + b2; print('a =', str(a).replace('\n',','))
p = np.exp(a); p = np.transpose(p.T / p.sum(axis=1)); print('p =', str(p).replace('\n',','))
Ln = -y * np.log(p); print('Ln =', str(Ln).replace('\n',','))
print('L =', np.sum(Ln)/4.)

z = [[-1.   0.5], [ 0.   1.5], [ 0.   1.5], [ 1.   2.5]]
h = [[0.  0.5], [0.  1.5], [0.  1.5], [1.  2.5]]
a = [[ 0.5 -0.5], [-0.5  0.5], [-0.5  0.5], [-0.5  0.5]]
p = [[0.7311 0.2689], [0.2689 0.7311], [0.2689 0.7311], [0.2689 0.7311]]
Ln = [[ 0.3133 -0.    ], [-0.      0.3133], [-0.      0.3133], [ 1.3133 -0.    ]]
L = 0.5632616875182226


## Backward

**Jacobiana con respecto a $\boldsymbol{a}$:** $\quad\displaystyle\mathbf{J}_{\mathcal{L}}(\boldsymbol{a})=(\boldsymbol{p}-\boldsymbol{y})^t\in\mathbb{R}^{m\times m_3}$

**VJP con respecto a $\boldsymbol{a}$:** $\qquad\underbrace{\boldsymbol{u}^t}_{1\times m}\underbrace{\mathbf{J}_{\mathcal{L}}(\boldsymbol{a})}_{m\times m_3}=\underbrace{\mathbf{J}_{\mathcal{L}}(\boldsymbol{a})}_{1\times m_3}\quad$ ya que $m=1$ e, inicialmente, $u=1;\; m_3=2$

In [3]:
Ja = p - y; print('Ja =', str(Ja).replace('\n',','))

Ja = [[-0.2689  0.2689], [ 0.2689 -0.2689], [ 0.2689 -0.2689], [-0.7311  0.7311]]


**Jacobiana con respecto a $\boldsymbol{z}$:** $\quad\displaystyle\mathbf{J}_{\boldsymbol{h}}(\boldsymbol{z})=\operatorname{diag}(\varphi'(\boldsymbol{z}))$

**VJP de $\boldsymbol{h}\,(\operatorname{ReLU})$ con respecto a $\boldsymbol{z}$:** $\qquad\underbrace{\mathbf{J}_{\boldsymbol{h}}(\boldsymbol{z})}_{m_2\times m_1};\;m_2=m_1=2$

In [None]:
import numpy as np; np.set_printoptions(precision=4)
z = np.array([[-1., 0.5], [0., 1.5], [0., 1.5], [1., 2.5]])
Jz = np.heaviside(z, 0.0); print('Jz =', str(Jz).replace('\n',','))

Jz = [[0. 1.], [0. 1.], [0. 1.], [1. 1.]]


### Respecto a la linealidad de la capa de salida $\mathbf{V}$
$$\boldsymbol{\nabla}_{\mathbf{V}}\mathcal{L}%
=\biggl[\frac{\partial\mathcal{L}}{\partial\mathbf{V}}\biggr]_{1,:}%
=\biggl[\frac{\partial\mathcal{L}}{\partial\boldsymbol{a}}\frac{\partial\boldsymbol{a}}{\partial\mathbf{V}}\biggr]_{1,:}%
=\biggl[\boldsymbol{u}_2^t\frac{\partial\boldsymbol{a}}{\partial\mathbf{V}}\biggr]_{1,:}%
=\boldsymbol{u}_2\boldsymbol{h}^t\in\mathbb{R}^{C\times K}$$

In [None]:
print(u2[0], h[0, :], tf.tensordot(u2[0], h[0, :], axes=0))
#gV = u2[0] @ tf.transpose(h[0])
#print(u2[0], tf.transpose(h[0]), gV)

tf.Tensor([-0.2689  0.2689], shape=(2,), dtype=float32) tf.Tensor([0.  0.5], shape=(2,), dtype=float32) tf.Tensor(
[[-0.     -0.1345]
 [ 0.      0.1345]], shape=(2, 2), dtype=float32)


In [None]:

z = L1_preact(X); print('z =', str(z).replace('\n',''))
h = L1(X); print('h =', str(h).replace('\n',''))
L2logits = L2_preact(h); print('logits =', str(L2logits).replace('\n','')) # softmax
y_pred = L2(h); print('y_pred =', str(y_pred).replace('\n',''))
eL2 = np.exp(L2logits)
y_pred_alt = np.transpose(eL2.T / eL2.sum(axis=1))
print('y_pred_alt =', str(y_pred_alt).replace('\n','')) # softmax a mano
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
print('loss =', loss(y, y_pred)); 
loss_alt_all = -y * np.log(y_pred) # loss a mano
print('loss_alt_data =', str(loss_alt_all).replace('\n',''))
print('loss_alt =', tf.reduce_sum(loss_alt_all) / 4.) # SUM_OVER_BATCH_SIZE