# Mirror Classification Deep Network

This notebook reproduces the experimental result in the seminal 1986
paper by Rumelhart, Hinton, and Williams.

The goal of this deep network is to detect 6-pixel bit patterns that are symmetric.

Here we create input data $X$ and output labels $y$ that contain all 64 of the cases:

In [None]:
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import FancyArrowPatch

##############################################################################
# 1. Generate the 6-bit mirror-symmetry dataset
##############################################################################
def is_symmetric(bits):
    """
    Return True when `bits` reads the same forwards and backwards.

    bits: an indexable sequence with a length (for example a list of
          '0'/'1' characters); elements are compared with ==.

    Every element in the first half is compared with its mirror partner
    counted from the end, so the check works for any length, not only 6.
    For an odd length the middle element has no partner and is ignored;
    an empty sequence counts as symmetric.
    """
    n = len(bits)
    return all(bits[i] == bits[n - 1 - i] for i in range(n // 2))

# Enumerate every 6-bit pattern (0..63) as a list of '0'/'1' characters and
# pair it with a one-element label: [1] if mirror-symmetric, [0] otherwise.
X_list = [list(format(code, "06b")) for code in range(64)]
y_list = [[int(is_symmetric(pattern))] for pattern in X_list]

# Float arrays used by the network: X is (64, 6), y is (64, 1).
y = np.array(y_list, dtype=float)
X = np.array(X_list, dtype=float)

# Show the whole dataset as an 8x8 grid: each cell is one pattern drawn as a
# single-row image, titled with its label.
fig, axs = plt.subplots(8, 8, figsize=(8,4))
for i, ax in enumerate(axs.flat):
    # X[i:i + 1] keeps a leading axis so imshow receives a 1x6 image
    ax.imshow(X[i:i + 1], cmap="Paired", vmin=0, vmax=1)
    ax.set_title(f"y={int(y[i, 0])}", fontsize=10)
    ax.axis("off")
plt.tight_layout()
plt.show()

# Define a two-layer neural network.

Now we are going to define a two-layer perceptron network (MLP) with the neural connections needed to try to solve this problem.

The $x_i$ nodes are inputs.  The $h_i$ nodes are interior "hidden" neurons, and the Out node is the visible "output" neuron.

The weights leading into the $h_i$ nodes and the Out node are not determined ahead of time.  We will initialize them randomly and apply backpropagation to learn them.

In [None]:
class Untrained_MLP:
    """
    A fully connected network with one hidden layer and sigmoid units in
    both the hidden and the output layer.  This class only does inference;
    the parameters keep the random values they are initialized with.
    """

    def __init__(self, n_input=6, n_hidden=2, n_output=1,
                 init_range=0.3, seed=10):
        """
        Create randomly initialized weights and biases.

        n_input, n_hidden, n_output: layer sizes (default 6 -> 2 -> 1).
        init_range: parameters are drawn uniformly between -init_range
                    and +init_range.
        seed: seed for the private RandomState, so equal seeds give equal
              initial parameters.
        """
        rng = np.random.RandomState(seed)

        def draw(shape):
            return rng.uniform(-init_range, init_range, shape)

        # The draw order (W1, b1, W2, b2) fixes the values for a given seed.
        self.W1 = draw((n_input, n_hidden))
        self.b1 = draw((n_hidden,))
        self.W2 = draw((n_hidden, n_output))
        self.b2 = draw((n_output,))

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^(-x)), applied elementwise."""
        denominator = 1.0 + np.exp(-x)
        return 1.0 / denominator

    def forward(self, X):
        """
        Run the network on X (one sample per row).

        Returns the tuple (hidden_out, final_out): the sigmoid activations
        of the hidden layer and of the output layer.
        """
        hidden_out = self.sigmoid(X @ self.W1 + self.b1)
        final_out = self.sigmoid(hidden_out @ self.W2 + self.b2)
        return hidden_out, final_out

    def predict(self, X):
        """Return integer 0/1 predictions: 1 where the output exceeds 0.5."""
        final_out = self.forward(X)[1]
        above_threshold = final_out > 0.5
        return above_threshold.astype(int)


# Define visualization functions

The following defines `draw_mirror_network`, which draws the network and shows its current weights.  It also defines `plot_training_error` that we will use to plot the progress of training.

In [None]:
def draw_mirror_network(W1, b1, W2, b2):
    """
    Draw the 6 -> 2 -> 1 network as circles joined by weight-labelled arrows.

    Layout, in data coordinates:
      - inputs x_0..x_5 in a row at y=3, at x = 0, 2, 4, 6, 8, 10
      - hidden node h_0 at (9, 6) and h_1 at (9, 0), each showing its bias
      - the output node at (13, 3), showing its bias
      - blue arrows input->hidden labelled with W1[i, j]
      - red arrows hidden->output labelled with W2[j, 0]

    W1: indexed as W1[input, hidden]; rows 0..5 and columns 0..1 are read.
    b1: hidden biases; entries 0 and 1 are read.
    W2: indexed as W2[hidden, 0]; rows 0 and 1 are read.
    b2: output bias; entry 0 is read.

    Shows the figure with plt.show() and returns nothing.
    """
    radius = 0.6
    input_nodes = [(2 * k, 3) for k in range(6)]
    hidden_nodes = [(9, 6), (9, 0)]
    output_node = (13, 3)

    fig, ax = plt.subplots(figsize=(12,5))
    ax.set_xlim(-1, 14)
    ax.set_ylim(-1, 7)
    ax.set_aspect("equal")
    ax.axis("off")

    def add_node(center, text):
        # An unfilled circle with its caption centred inside it.
        ax.add_patch(plt.Circle(center, radius=radius, fill=False, lw=1.5))
        ax.text(center[0], center[1], text,
                ha="center", va="center", fontsize=10)

    def connect(src, dst, label, color="blue"):
        (x0, y0), (x1, y1) = src, dst
        # Shorten the arrow by one radius at each end so it runs from circle
        # edge to circle edge instead of centre to centre.
        angle = np.arctan2(y1 - y0, x1 - x0)
        tail = (x0 + radius * np.cos(angle), y0 + radius * np.sin(angle))
        head = (x1 - radius * np.cos(angle), y1 - radius * np.sin(angle))
        ax.add_patch(FancyArrowPatch(tail, head,
                                     arrowstyle='-|>',
                                     mutation_scale=12,
                                     lw=1,
                                     color=color))
        # The weight label sits at the midpoint between the two centres, on
        # a white box so it stays readable where arrows cross.
        ax.text((x0 + x1) / 2, (y0 + y1) / 2, label, fontsize=9, color=color,
                ha="center", va="center",
                bbox=dict(boxstyle="square,pad=0.05", fc="white", ec="none"))

    # Nodes first: inputs, then hidden units, then the output unit.
    for k, center in enumerate(input_nodes):
        add_node(center, f"$x_{k}$")
    for j, center in enumerate(hidden_nodes):
        add_node(center, f"$h_{j}$\nb={b1[j]:.2f}")
    add_node(output_node, f"Out\nb={b2[0]:.2f}")

    # Input -> hidden connections, labelled with W1[input, hidden].
    for k, src in enumerate(input_nodes):
        for j, dst in enumerate(hidden_nodes):
            connect(src, dst, f"{W1[k, j]:.2f}", "blue")

    # Hidden -> output connections, labelled with W2[hidden, 0].
    for j, src in enumerate(hidden_nodes):
        connect(src, output_node, f"{W2[j,0]:.2f}", "red")

    plt.show()

def plot_training_error(history, label='MSE'):
    """
    Plot the per-epoch training error as a line chart and show it.

    history: sequence of error values, one per epoch, in training order.
    label: legend text for the curve.

    The y-axis runs from just below zero (-0.01) up to the largest value in
    `history`.  An empty history yields an empty chart with matplotlib's
    default y-limits instead of raising, because max() of an empty
    sequence is undefined.
    """
    plt.figure(figsize=(6,4))
    plt.plot(history, label=label)
    plt.xlabel("Epoch")
    plt.ylabel("Mean Squared Error")
    plt.title("Training (Sigmoid->Sigmoid + Momentum)")
    plt.grid(True)
    # len() rather than truthiness so numpy arrays are accepted as well.
    if len(history) > 0:
        plt.ylim(-0.01, max(history))
    plt.legend()
    plt.show()

# Test the random untrained MLP

Now let's create the MLP and test it on the data.

When I run it, I get about 12% accuracy, which is what would happen if it says "1" all the time. If it says "0" all the time it would get about 87%.

We'd hope to find a network that can do better than this.

In [None]:
# Build a network with the default random weights and score its 0/1
# predictions against the labels before any training has happened.
mlp = Untrained_MLP()
preds = mlp.predict(X)
acc = (preds == y).mean()
print(f"Initial training accuracy: {acc*100:.2f}%")

# Diagram of the initial (random) weights and biases.
draw_mirror_network(W1=mlp.W1, b1=mlp.b1, W2=mlp.W2, b2=mlp.b2)

# Define backpropagation functions

To apply Rumelhart, Hinton, and Williams' idea, we need to calculate the error $e = \sum_k (o_k - y_k)^2$, and then we also need to calculate the derivatives that lead to the error.

$$g_i = \frac{de}{dw_i} = \sum_k  \frac{\partial e}{\partial o_k} \frac{\partial o_k}{\partial w_i} = \sum_k 2 (o_k - y_k) \frac{\partial o_k} {\partial w_i} =  \sum_k 2 (o_k - y_k) \sum_j  \frac{\partial o_k}{\partial h_j} \frac{\partial h_j}{\partial w_i}$$

This derivative continues to expand out using the chain rule, and we can compute it all by computing and multiplying the local partial derivatives $\frac{\partial o_k}{\partial h_j}$ and $\frac{\partial h_j}{\partial w_i}$ for every intermediate step $h_j$ within the network.

Once we have the entire derivative $g_i$, then the paper recommends applying "momentum" to keep a running (decaying) sum of recent derivatives

$$m_i \leftarrow \mu m_i + g_i$$

And then finally we need to update the parameters in the direction of this accumulated derivative

$$w_i \leftarrow w_i - \lambda m_i$$

The class `Trainable_MLP` adds a `backward` method that calculates the errors, all these derivatives and the momentum.  It then applies this information to make a change in the weights.

A network is trained by applying this learning rule repeatedly, hundreds or thousands of times.


In [None]:
class Trainable_MLP(Untrained_MLP):
    """
    Untrained_MLP extended with backpropagation: full-batch gradient descent
    with momentum on the squared error.
    """

    def __init__(self, n_input=6, n_hidden=2, n_output=1,
                 lr=0.1, momentum=0.9, init_range=0.3, seed=10):
        """
        Initialize the parameters as the base class does, then add the
        training state.

        lr: learning rate that scales each gradient.
        momentum: fraction of the previous update carried into the next one.
        The remaining arguments are forwarded unchanged to Untrained_MLP.
        """
        super().__init__(n_input=n_input, n_hidden=n_hidden, n_output=n_output,
                         init_range=init_range, seed=seed)
        self.lr = lr
        self.momentum = momentum
        # One velocity buffer per parameter, same shape, starting at zero.
        self.W1_m, self.b1_m, self.W2_m, self.b2_m = (
            np.zeros_like(param)
            for param in (self.W1, self.b1, self.W2, self.b2)
        )

    def sigmoid_deriv(self, s):
        """Sigmoid derivative expressed through the sigmoid's output `s`."""
        return s * (1.0 - s)

    def _momentum_step(self, name, grad):
        """Refresh the velocity of parameter `name` and apply it in place."""
        velocity = self.momentum * getattr(self, name + "_m") - self.lr * grad
        setattr(self, name + "_m", velocity)
        param = getattr(self, name)
        param += velocity
        setattr(self, name, param)

    def backward(self, X, hidden_out, final_out, y):
        """
        Backpropagate the squared error and update all parameters.

        X: inputs, shape (batch, 6)
        hidden_out: hidden activations from forward(), shape (batch, 2)
        final_out: output activations from forward(), shape (batch, 1)
        y: target labels, shape (batch, 1)

        Gradients are averaged over the batch.  Both deltas are computed
        before any parameter changes, so the hidden delta uses the
        pre-update W2.
        """
        n_samples = X.shape[0]

        # Error signal at the output: d(o - y)^2/do times the sigmoid slope.
        delta_out = 2 * (final_out - y) * self.sigmoid_deriv(final_out)
        # Pushed back through W2 and the hidden sigmoid slope.
        delta_hid = (delta_out @ self.W2.T) * self.sigmoid_deriv(hidden_out)

        gradients = {
            "W2": hidden_out.T @ delta_out / n_samples,   # (2,1)
            "b2": np.mean(delta_out, axis=0),             # (1,)
            "W1": X.T @ delta_hid / n_samples,            # (6,2)
            "b1": np.mean(delta_hid, axis=0),             # (2,)
        }
        for name, grad in gradients.items():
            self._momentum_step(name, grad)

    def train_on_batch(self, X, y):
        """
        Run one forward pass and one parameter update on (X, y).

        Returns the mean squared error of the forward pass, i.e. the error
        measured before this call's update is applied.
        """
        hidden_out, final_out = self.forward(X)
        mse = np.mean((final_out - y)**2)
        self.backward(X, hidden_out, final_out, y)
        return mse

# Train the weights of the MLP

To train the weights, we repeatedly evaluate the error, calculate the derivatives, and update the parameters.

In [None]:
# 6 -> 2 -> 1 network.  lr, momentum and epochs are the knobs to experiment
# with when trying to improve the result.
mlp = Trainable_MLP(
    n_input=6, n_hidden=2, n_output=1,
    lr=0.1,
    momentum=0.9,
    init_range=0.5,
    seed=10,
)

epochs = 200
# Each call performs one full-batch update and returns the error measured
# on the forward pass that preceded it.
mse_history = [mlp.train_on_batch(X, y) for _ in range(epochs)]

# Accuracy of the thresholded predictions on the training patterns.
preds = mlp.predict(X)
acc = (preds == y).mean()
print(f"Final training accuracy: {acc*100:.2f}%")

plot_training_error(mse_history)

draw_mirror_network(mlp.W1, mlp.b1, mlp.W2, mlp.b2)


# Your turn: improve training

Questions:

1. What is the final accuracy of the mirror network?  Is it near 100\%?
2. Do the final neural network weights resemble the weights in the Rumelhart paper?
3. Can you find a change in hyperparameters (learning rate, momentum or training epochs) that improves accuracy?
4. Once the accuracy of the network is near 100%, do the weights resemble Rumelhart?
5. What is "overfitting"?  Is this model overfitting?