# 13.3.3c MLP XOR: backprop

$$\begin{align*}
\boldsymbol{x}&=(x_1,x_2)\in\{0, 1\}^n%
&&\text{donde}\quad n=2\\%
\boldsymbol{z}&=\mathbf{W}\boldsymbol{x}+\boldsymbol{b}_1\in\mathbb{R}^{m_1}%
&&\text{donde}\quad m_1=2,\,\mathbf{W}\in\mathbb{R}^{m_1\times n}\;\text{y}\;\boldsymbol{b}_1\in\mathbb{R}^{m_1}\\%
\boldsymbol{h}&=\operatorname{ReLU}(\boldsymbol{z})\in\mathbb{R}^{m_2}%
&&\text{donde}\quad m_2=2\\%
\boldsymbol{a}&=\mathbf{V}\boldsymbol{h}+\boldsymbol{b}_2\in\mathbb{R}^{m_3}%
&&\text{donde}\quad m_3=2,\,\mathbf{V}\in\mathbb{R}^{m_3\times m_2}\;\text{y}\;\boldsymbol{b}_2\in\mathbb{R}^{m_3}\\%
\hat{\boldsymbol{y}}&=\mathcal{S}(\boldsymbol{a})\in[0,1]^{m_3}%
&&\text{probabilidades de las clases $0$ y $1$}\\%
\mathcal{L}&=\operatorname{CrossEntropy}(\boldsymbol{y},\hat{\boldsymbol{y}})\in\mathbb{R}%
&&\text{salida escalar}\quad m=1%
\end{align*}$$

$$\mathbf{W}=\begin{pmatrix}1&1\\1&1\end{pmatrix}\quad \boldsymbol{b}_1=\begin{pmatrix}-1\\0.5\end{pmatrix}\quad \mathbf{V}=\begin{pmatrix}1&-1\\-1&1\end{pmatrix}\quad\boldsymbol{b}_2=\begin{pmatrix}1\\-1\end{pmatrix}$$

In [1]:
import numpy as np

# Show four decimals so printed arrays line up with the worked tables.
np.set_printoptions(precision=4)

# XOR truth table: one input pair per row, with one-hot targets (class 0, class 1).
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])
y = np.array([[1, 0],
              [0, 1],
              [0, 1],
              [1, 0]])

# Hidden-layer weights and bias.
W = np.array([[1, 1],
              [1, 1]])
b1 = np.array([-1, .5])

# Output-layer weights and bias.
V = np.array([[1, -1],
              [-1, 1]])
b2 = np.array([1, -1])

## Forward: $\small\quad\boldsymbol{x}\to\boldsymbol{z}(\mathbf{W},\boldsymbol{b}_1)\to\boldsymbol{h}\to\boldsymbol{a}(\mathbf{V},\boldsymbol{b}_2)\to\mathcal{L}$

<div align="center">

|$x_1$|$x_2$|$\boldsymbol{z}=\mathbf{W}\boldsymbol{x}+\boldsymbol{b}_1$|$\boldsymbol{h}=\operatorname{ReLU}(\boldsymbol{z})$|$\boldsymbol{a}=\mathbf{V}\boldsymbol{h}+\boldsymbol{b}_2$|$\hat{\boldsymbol{y}}=\mathcal{S}(\boldsymbol{a})$|$-\boldsymbol{y}\odot\log(\hat{\boldsymbol{y}})$|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|$0$|$0$|$(-1, 0.5)^t$|$(0, 0.5)^t$|$(0.5, -0.5)^t$|$(0.7311, 0.2689)^t$|$(0.3133, 0)^t$|
|$0$|$1$|$(0, 1.5)^t$ |$(0, 1.5)^t$|$(-0.5, 0.5)^t$|$(0.2689, 0.7311)^t$|$(0, 0.3133)^t$|
|$1$|$0$|$(0, 1.5)^t$ |$(0, 1.5)^t$|$(-0.5, 0.5)^t$|$(0.2689, 0.7311)^t$|$(0, 0.3133)^t$|
|$1$|$1$|$(1, 2.5)^t$ |$(1, 2.5)^t$|$(-0.5, 0.5)^t$|$(0.2689, 0.7311)^t$|$(1.3133, 0)^t$|

</div>

In [2]:
# Forward pass over the whole batch; each row of X is one sample x^t.
# In row form, z = W x + b1 becomes X @ W.T + b1 (likewise a = V h + b2).
# The transposes keep the code aligned with the formulas and with the backward
# pass; without them the result is only right because W and V are symmetric.
z = X @ W.T + b1
print('z =', str(z).replace('\n',','))

h = np.maximum(0, z)  # ReLU
print('h =', str(h).replace('\n',','))

a = h @ V.T + b2  # logits
print('a =', str(a).replace('\n',','))

# Row-wise softmax: keepdims lets each row's sum broadcast across that row.
y_pred = np.exp(a)
y_pred = y_pred / y_pred.sum(axis=1, keepdims=True)
print('y_pred =', str(y_pred).replace('\n',','))

# Per-sample, per-class cross-entropy terms; only the true-class entry is non-zero.
Ln = -y * np.log(y_pred)
# Mean loss over however many samples X holds, rather than a hard-coded 4.
print('Ln =', str(Ln).replace('\n',','), '\nL =', np.sum(Ln)/len(X))

z = [[-1.   0.5], [ 0.   1.5], [ 0.   1.5], [ 1.   2.5]]
h = [[0.  0.5], [0.  1.5], [0.  1.5], [1.  2.5]]
a = [[ 0.5 -0.5], [-0.5  0.5], [-0.5  0.5], [-0.5  0.5]]
y_pred = [[0.7311 0.2689], [0.2689 0.7311], [0.2689 0.7311], [0.2689 0.7311]]
Ln = [[ 0.3133 -0.    ], [-0.      0.3133], [-0.      0.3133], [ 1.3133 -0.    ]] 
L = 0.5632616875182226


## Backward $\small\quad\mathcal{L}\to\boldsymbol{a}(\mathbf{V},\boldsymbol{b}_2)\to\boldsymbol{h}\to\boldsymbol{z}(\mathbf{W},\boldsymbol{b}_1)\to\boldsymbol{x}$

**Paso backward para $\boldsymbol{x}=(0,0)^t$:** $\qquad\boldsymbol{u}_{K+1}=\boldsymbol{1};\qquad$**para** $\;k=K:1:\quad\boldsymbol{g}_k=\boldsymbol{u}_{k+1}^t\dfrac{\partial\boldsymbol{f}(\boldsymbol{x}_k,\boldsymbol{\theta}_k)}{\partial\boldsymbol{\theta}_k}\quad\boldsymbol{u}_k^t=\boldsymbol{u}_{k+1}^t\dfrac{\partial\boldsymbol{f}(\boldsymbol{x}_k,\boldsymbol{\theta}_k)}{\partial\boldsymbol{x}_k}$
$$\begin{align*}
\boldsymbol{u}^t%
&=1^t\frac{\partial\mathcal{L}}{\partial\boldsymbol{a}}%
=(\hat{\boldsymbol{y}}-\boldsymbol{y})^t=(0.7311, 0.2689)-(1, 0)=(-0.2689, 0.2689)\\%
%
\frac{\partial\boldsymbol{a}}{\partial\mathbf{V}}%
&=\left[\frac{\partial a_1}{\partial\mathbf{V}}, \frac{\partial a_2}{\partial\mathbf{V}}\right]^t%
=\left[\begin{pmatrix}\frac{\partial a_1}{\partial V_{11}}&\frac{\partial a_1}{\partial V_{12}}\\%
\frac{\partial a_1}{\partial V_{21}}&\frac{\partial a_1}{\partial V_{22}}\end{pmatrix},
\begin{pmatrix}\frac{\partial a_2}{\partial V_{11}}&\frac{\partial a_2}{\partial V_{12}}\\%
\frac{\partial a_2}{\partial V_{21}}&\frac{\partial a_2}{\partial V_{22}}\end{pmatrix}\right]^t%
=\left[\begin{pmatrix}h_1&h_2\\0&0\end{pmatrix},\begin{pmatrix}0&0\\h_1&h_2\end{pmatrix}\right]^t\\
%
\frac{\partial\boldsymbol{a}}{\partial\boldsymbol{b}_2}%
&=\left[\frac{\partial a_1}{\partial\boldsymbol{b}_2}, \frac{\partial a_2}{\partial\boldsymbol{b}_2}\right]^t%
=\left[\begin{pmatrix}\frac{\partial a_1}{\partial b_{21}}\\\frac{\partial a_1}{\partial b_{22}}\end{pmatrix},
\begin{pmatrix}\frac{\partial a_2}{\partial b_{21}}\\\frac{\partial a_2}{\partial b_{22}}\end{pmatrix}\right]^t%
=\left[\begin{pmatrix}1\\0\end{pmatrix},\begin{pmatrix}0\\1\end{pmatrix}\right]^t\\
%
\boldsymbol{g}_{\mathbf{V}}%
&=\boldsymbol{u}^t\frac{\partial\boldsymbol{a}}{\partial\mathbf{V}}%
=(-0.2689, 0.2689)\left[\begin{pmatrix}0&0.5\\0&0\end{pmatrix},\begin{pmatrix}0&0\\0&0.5\end{pmatrix}\right]^t%
=\begin{pmatrix}0&-0.1345\\0&0.1345\end{pmatrix}\\%
%
\boldsymbol{g}_{\boldsymbol{b}_2}%
&=\boldsymbol{u}^t\frac{\partial\boldsymbol{a}}{\partial\boldsymbol{b}_2}%
=(-0.2689, 0.2689)\left[\begin{pmatrix}1\\0\end{pmatrix},\begin{pmatrix}0\\1\end{pmatrix}\right]^t%
=\begin{pmatrix}-0.2689\\0.2689\end{pmatrix}\\%
%
\boldsymbol{u}^t%
&=\boldsymbol{u}^t\frac{\partial\boldsymbol{a}}{\partial\boldsymbol{h}}%
=(-0.2689, 0.2689)\mathbf{V}=(-0.5379, 0.5379)\\%
%
\frac{\partial\boldsymbol{h}}{\partial\boldsymbol{z}}%
&=\operatorname{diag}(H(\boldsymbol{z}))%
=\operatorname{diag}(H(-1), H(0.5))%
=\begin{pmatrix}0&0\\0&1\end{pmatrix}\\%
%
\boldsymbol{u}^t%
&=\boldsymbol{u}^t\frac{\partial\boldsymbol{h}}{\partial\boldsymbol{z}}%
=(-0.5379, 0.5379)\begin{pmatrix}0&0\\0&1\end{pmatrix}%
=(0, 0.5379)\\%
%
\frac{\partial\boldsymbol{z}}{\partial\mathbf{W}}%
&=\left[\frac{\partial z_1}{\partial\mathbf{W}}, \frac{\partial z_2}{\partial\mathbf{W}}\right]^t%
=\left[\begin{pmatrix}\frac{\partial z_1}{\partial W_{11}}&\frac{\partial z_1}{\partial W_{12}}\\%
\frac{\partial z_1}{\partial W_{21}}&\frac{\partial z_1}{\partial W_{22}}\end{pmatrix},
\begin{pmatrix}\frac{\partial z_2}{\partial W_{11}}&\frac{\partial z_2}{\partial W_{12}}\\%
\frac{\partial z_2}{\partial W_{21}}&\frac{\partial z_2}{\partial W_{22}}\end{pmatrix}\right]^t%
=\left[\begin{pmatrix}x_1&x_2\\0&0\end{pmatrix},\begin{pmatrix}0&0\\x_1&x_2\end{pmatrix}\right]^t\\%
%
\frac{\partial\boldsymbol{z}}{\partial\boldsymbol{b}_1}%
&=\left[\frac{\partial z_1}{\partial\boldsymbol{b}_1}, \frac{\partial z_2}{\partial\boldsymbol{b}_1}\right]^t%
=\left[\begin{pmatrix}\frac{\partial z_1}{\partial b_{11}}\\\frac{\partial z_1}{\partial b_{12}}\end{pmatrix},
\begin{pmatrix}\frac{\partial z_2}{\partial b_{11}}\\\frac{\partial z_2}{\partial b_{12}}\end{pmatrix}\right]^t%
=\left[\begin{pmatrix}1\\0\end{pmatrix},\begin{pmatrix}0\\1\end{pmatrix}\right]^t\\%
%
\boldsymbol{g}_{\mathbf{W}}%
&=\boldsymbol{u}^t\frac{\partial\boldsymbol{z}}{\partial\mathbf{W}}%
=(0, 0.5379)\left[\begin{pmatrix}0&0\\0&0\end{pmatrix},\begin{pmatrix}0&0\\0&0\end{pmatrix}\right]^t%
=\begin{pmatrix}0&0\\0&0\end{pmatrix}\\%
%
\boldsymbol{g}_{\boldsymbol{b}_1}%
&=\boldsymbol{u}^t\frac{\partial\boldsymbol{z}}{\partial\boldsymbol{b}_1}%
=(0, 0.5379)\left[\begin{pmatrix}1\\0\end{pmatrix},\begin{pmatrix}0\\1\end{pmatrix}\right]^t%
=\begin{pmatrix}0\\0.5379\end{pmatrix}\\%
%
\boldsymbol{u}^t%
&=\boldsymbol{u}^t\frac{\partial\boldsymbol{z}}{\partial\boldsymbol{x}}%
=(0, 0.5379)\mathbf{W}=(0.5379, 0.5379)%
\end{align*}$$

In [3]:
def _show(label, value, *extra):
    """Print `label = value` with the array's line breaks replaced by commas."""
    print(label + ' =', str(value).replace('\n',','), *extra)


n = 0  # index of the sample being backpropagated (row n of X)

# Gradient of the loss w.r.t. the logits (softmax + cross-entropy), as a 1 x 2 row.
ut = (y_pred[n] - y[n]).reshape(1, -1)
_show('uJLa', ut)

# Output layer: gV[i, j] = u_i * h_j; the bias gradient is u as a column.
gV = np.outer(ut, h[n])
_show('gV', gV)
gb2 = ut.T
_show('gb2', gb2)

# Pull the row vector back through a = V h + b2.
ut = ut @ V
_show('uJLaJah', ut)

# ReLU Jacobian: diagonal of step values, taken as 0 where z <= 0.
Jhz = np.diag(np.heaviside(z[n], 0.0))
_show('Jhz', Jhz)
ut = ut @ Jhz
_show('uJLaJahJhz', ut)

# Hidden layer: gW[i, j] = u_i * x_j; the bias gradient is u as a column.
gW = np.outer(ut, X[n, :])
_show('gW', gW)
gb1 = ut.T
_show('gb1', gb1)

# Pull back through z = W x + b1 to reach the input.
ut = ut @ W
_show('uJLaJahJhzJzx', ut, '\n')

uJLa = [[-0.2689  0.2689]]
gV = [[-0.     -0.1345], [ 0.      0.1345]]
gb2 = [[-0.2689], [ 0.2689]]
uJLaJah = [[-0.5379  0.5379]]
Jhz = [[0. 0.], [0. 1.]]
uJLaJahJhz = [[0.     0.5379]]
gW = [[0. 0.], [0. 0.]]
gb1 = [[0.    ], [0.5379]]
uJLaJahJhzJzx = [[0.5379 0.5379]] 



## Forward-Backward con keras

In [5]:
import tensorflow as tf
from tensorflow import keras

# Constant initializers reproduce the hand-picked parameters. They get their own
# names so the NumPy arrays W, b1, V, b2 (and z, h, a) from the cells above are
# not overwritten and those cells stay re-runnable after this one.
# Note: Dense applies inputs @ kernel, so the kernel plays the role of W^t; that
# makes no difference here because W and V are symmetric.
W_init = tf.constant_initializer([[1, 1], [1, 1]])
b1_init = tf.constant_initializer([-1,  .5])
V_init = tf.constant_initializer([[1, -1], [-1, 1]])
b2_init = tf.constant_initializer([ 1, -1])

# The model that is actually trained: Dense+ReLU followed by Dense+softmax.
L1 = keras.layers.Dense(2, activation=tf.nn.relu, input_dim=2, kernel_initializer=W_init, bias_initializer=b1_init)
L2 = keras.layers.Dense(2, activation=tf.nn.softmax, kernel_initializer=V_init, bias_initializer=b2_init)
M = keras.Sequential([L1, L2])

# Activation-free layers built from the same initializers expose the
# pre-activations z and the logits a for comparison with the NumPy forward pass.
L1_preact = keras.layers.Dense(2, activation=None, input_dim=2, kernel_initializer=W_init, bias_initializer=b1_init)
z_tf = L1_preact(X)
print('z =', str(z_tf).replace('\n',''))
h_tf = L1(X)
print('h =', str(h_tf).replace('\n',''))
L2_preact = keras.layers.Dense(2, activation=None, kernel_initializer=V_init, bias_initializer=b2_init)
a_tf = L2_preact(h_tf)  # logits, i.e. the input to the softmax
print('a =', str(a_tf).replace('\n',''))
p = L2(h_tf)  # softmax probabilities
print('p =', str(p).replace('\n',''))

# p already holds probabilities, hence from_logits=False.
L = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
print('L =', L(y, p))

# One epoch of plain SGD; the four samples fit in a single default-size batch.
optimizer = tf.optimizers.SGD(learning_rate=0.1)
M.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
M.fit(X, y, epochs=1, verbose=1)
print(L1.get_weights(), "\n", L2.get_weights())

z = tf.Tensor([[-1.   0.5] [ 0.   1.5] [ 0.   1.5] [ 1.   2.5]], shape=(4, 2), dtype=float32)
h = tf.Tensor([[0.  0.5] [0.  1.5] [0.  1.5] [1.  2.5]], shape=(4, 2), dtype=float32)
a = tf.Tensor([[ 0.5 -0.5] [-0.5  0.5] [-0.5  0.5] [-0.5  0.5]], shape=(4, 2), dtype=float32)
p = tf.Tensor([[0.7311 0.2689] [0.2689 0.7311] [0.2689 0.7311] [0.2689 0.7311]], shape=(4, 2), dtype=float32)
L = tf.Tensor(0.5632617, shape=(), dtype=float32)
[array([[1.0366, 0.9769],
       [1.0366, 0.9769]], dtype=float32), array([-0.9634,  0.4769], dtype=float32)] 
 [array([[ 1.0183, -1.0183],
       [-0.9711,  0.9711]], dtype=float32), array([ 1.0116, -1.0116], dtype=float32)]
