In [1]:
import random
import numpy as np

In [2]:
def sigmoid(z):
    """
    The sigmoid function.

    Parameters:
    z (numpy.ndarray): The input to the sigmoid function.

    Returns:
    numpy.ndarray: The sigmoid of the input.
    """
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    """
    Derivative of the sigmoid function.

    Parameters:
    z (numpy.ndarray): The input to the sigmoid function.

    Returns:
    numpy.ndarray: The derivative of the sigmoid function at z.
    """
    return sigmoid(z) * (1 - sigmoid(z))

## 3-layer NN
![This is an example image](http://neuralnetworksanddeeplearning.com/images/tikz10.png)

## Weights initilization

In [3]:
# Size contains the number of neurons in the respective layers
sizes = [3, 4, 1]
# Understand dimensions of weights
for x, y in zip(sizes[:-1], sizes[1:]):
    print(x, y)
# Initialize weights using Gaussian distribution with mean 0 and standard deviation 1
weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
print(weights)

3 4
4 1
[array([[ 0.45755238, -0.63902368, -1.42559792],
       [ 1.02520757, -0.22282446, -0.37816523],
       [ 0.17191112,  2.60076811, -0.61025091],
       [ 0.40555358, -1.18517198, -2.41139182]]), array([[ 1.03100247,  1.98062682, -1.7508068 , -0.23435663]])]


For the above network, the number of weights for each neuron in a layer is determined by the number of neurons in the previous layer. For instance, the second layer (i.e., hidden layer) in the above 3-layer NN has four neurons. For each neuron in this layer there are three inputs from the three neurons in the previous layer, which requires three weights. Therefore, the dimension of the array to store all the weights for this layer has shape of (4, 3), with the first dimension corresponds to the number of neurons in the current layer and second dimension corresponds to the total number of neurons in the previous layer.

## Bias initialization

In [4]:
# Understand dimensions of bias
for y in sizes[1:]:
    print(y)
# Initialize bias using Gaussian distribution with mean 0 and standard deviation 1    
bias = [np.random.randn(y, 1) for y in sizes[1:]]
print(bias)

4
1
[array([[0.98872931],
       [0.57179665],
       [0.27104466],
       [1.12685389]]), array([[0.11113971]])]


For bias, firstly no bias term is needed for the input layer since the neurons in this layer do not have any outputs coming in. The number of bias terms for all the subsequent layes, including output layer, is equal to the number of neurons in each layer, which is captured in the sizes of the network.

In [5]:
class Network:
    def __init__(self, sizes):
        """
        Initialize the neural network with the given sizes.
        
        Parameters:
        sizes (list): A list containing the number of neurons in each layer.
                      For example, [784, 30, 10] would create a network with
                      784 input neurons, one hidden layer with 30 neurons,
                      and an output layer with 10 neurons.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # Biases and weights are initialized with Gaussian distribution
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """
        Return the output of the network if 'a' is input.
        
        Parameters:
        a (numpy array): The input to the network.
        
        Returns:
        numpy array: The output of the network.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """
        Train the neural network using mini-batch stochastic gradient descent.
        
        Parameters:
        training_data (list): A list of tuples '(x, y)' representing the training inputs
                              and the desired outputs.
        epochs (int): The number of epochs to train for.
        mini_batch_size (int): The size of the mini-batches to use when sampling.
        eta (float): The learning rate.
        test_data (list, optional): If provided, the network will be evaluated
                                    against the test data after each epoch, and
                                    partial progress will be printed out.
        """
        if test_data: n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print(f"Epoch {j}: {self.evaluate(test_data)} / {n_test}")
            else:
                print(f"Epoch {j} complete")

    def update_mini_batch(self, mini_batch, eta):
        """
        Update the network's weights and biases by applying gradient descent
        using backpropagation to a single mini-batch.
        
        Parameters:
        mini_batch (list): A list of tuples '(x, y)'.
        eta (float): The learning rate.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w - (eta / len(mini_batch)) * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / len(mini_batch)) * nb for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """
        Return a tuple representing the gradient for the cost function.
        
        Parameters:
        x (numpy array): The input to the network.
        y (numpy array): The desired output.
        
        Returns:
        tuple: (nabla_b, nabla_w), representing the gradients for the biases and weights.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # Feedforward
        activation = x
        activations = [x]  # List to store all the activations, layer by layer
        zs = []  # List to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # Backward pass
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        # Update the gradients for the previous layers
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """
        Return the number of test inputs for which the neural network outputs
        the correct result.
        
        Parameters:
        test_data (list): A list of tuples '(x, y)' where 'x' is the input and 'y' is the desired output.
        
        Returns:
        int: The number of test inputs for which the network is correct.
        """
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        """
        Return the vector of partial derivatives ∂C/∂a for the output activations.
        
        Parameters:
        output_activations (numpy array): The output of the network.
        y (numpy array): The desired output.
        
        Returns:
        numpy array: The derivative of the cost function.
        """
        return (output_activations - y)

## Create a Network object

In [6]:
# A neural network with three layers: one input layer with three neurons, one hidden layer with four neurons and 
# one output layer with one neuron
net = Network([3, 4, 1])
# Weights connecting the second layer and third layers of neurons
net.weights[1].shape

(1, 4)

## Understand `feedforward` function of `Network` object
Let's walk through an example of the `feedforward` function using a neural network with sizes `[3, 4, 1]`. This means the network has:
- 3 neurons in the input layer
- 4 neurons in the hidden layer
- 1 neuron in the output layer

### Example

For the sake of this example, let's assume the following initial random weights and biases for the network:

#### Weights
- Between input layer and hidden layer (3 input neurons to 4 hidden neurons):
  ```python
  w1 = [[0.2, -0.5, 1.0],
        [1.5, -1.0, 0.5],
        [-1.5, 2.0, -1.0],
        [0.5, -0.5, 1.5]]
  ```
- Between hidden layer and output layer (4 hidden neurons to 1 output neuron):
  ```python
  w2 = [[0.3, -0.8, 0.5, 1.0]]
  ```

#### Biases
- For hidden layer (4 neurons):
  ```python
  b1 = [[0.1],
        [0.2],
        [0.3],
        [0.4]]
  ```
- For output layer (1 neuron):
  ```python
  b2 = [[0.5]]
  ```

### Input
Input layer of this NN has three neurons. Let's take a specific input training example for those three input neurons:
```python
input_vector = [[0.5],
                [0.1],
                [0.4]]
```

In [7]:
%%html
<svg width="800" height="400" viewBox="0 0 800 400" xmlns="http://www.w3.org/2000/svg">
  <svg viewBox="0 0 800 400" xmlns="http://www.w3.org/2000/svg">
  <!-- Input Layer -->
  <g id="input-layer">
    <circle cx="100" cy="80" r="20" fill="lightblue" stroke="black"/>
    <text x="100" y="85" text-anchor="middle" font-size="12">0.5</text>
    <circle cx="100" cy="160" r="20" fill="lightblue" stroke="black"/>
    <text x="100" y="165" text-anchor="middle" font-size="12">0.1</text>
    <circle cx="100" cy="240" r="20" fill="lightblue" stroke="black"/>
    <text x="100" y="245" text-anchor="middle" font-size="12">0.4</text>
  </g>
  
  <!-- Hidden Layer -->
  <g id="hidden-layer">
    <circle cx="300" cy="60" r="20" fill="lightgreen" stroke="black"/>
    <text x="300" y="65" text-anchor="middle" font-size="12">0.63</text>
    <circle cx="300" cy="140" r="20" fill="lightgreen" stroke="black"/>
    <text x="300" y="145" text-anchor="middle" font-size="12">0.74</text>
    <circle cx="300" cy="220" r="20" fill="lightgreen" stroke="black"/>
    <text x="300" y="225" text-anchor="middle" font-size="12">0.34</text>
    <circle cx="300" cy="300" r="20" fill="lightgreen" stroke="black"/>
    <text x="300" y="305" text-anchor="middle" font-size="12">0.77</text>
  </g>
  
  <!-- Output Layer -->
  <g id="output-layer">
    <circle cx="500" cy="160" r="20" fill="lightyellow" stroke="black"/>
    <text x="500" y="165" text-anchor="middle" font-size="12">0.74</text>
  </g>
  
  <!-- Connections -->
  <g id="connections" stroke="gray" stroke-width="1">
    <!-- Input to Hidden -->
    <line x1="120" y1="80" x2="280" y2="60"/>
    <line x1="120" y1="80" x2="280" y2="140"/>
    <line x1="120" y1="80" x2="280" y2="220"/>
    <line x1="120" y1="80" x2="280" y2="300"/>
    
    <line x1="120" y1="160" x2="280" y2="60"/>
    <line x1="120" y1="160" x2="280" y2="140"/>
    <line x1="120" y1="160" x2="280" y2="220"/>
    <line x1="120" y1="160" x2="280" y2="300"/>
    
    <line x1="120" y1="240" x2="280" y2="60"/>
    <line x1="120" y1="240" x2="280" y2="140"/>
    <line x1="120" y1="240" x2="280" y2="220"/>
    <line x1="120" y1="240" x2="280" y2="300"/>
    
    <!-- Hidden to Output -->
    <line x1="320" y1="60" x2="480" y2="160"/>
    <line x1="320" y1="140" x2="480" y2="160"/>
    <line x1="320" y1="220" x2="480" y2="160"/>
    <line x1="320" y1="300" x2="480" y2="160"/>
  </g>
  
  <!-- Weights -->
  <g id="weights" font-size="10" fill="red">
    <!-- Input to Hidden -->
    <text x="150" y="70">0.2</text>
    <text x="150" y="90">1.5</text>
    <text x="150" y="110">-1.5</text>
    <text x="150" y="130">0.5</text>
    
    <text x="180" y="100">-0.5</text>
    <text x="180" y="150">-1.0</text>
    <text x="180" y="180">2.0</text>
    <text x="180" y="210">-0.5</text>
    
    <text x="230" y="100">1.0</text>
    <text x="230" y="160">0.5</text>
    <text x="230" y="220">-1.0</text>
    <text x="230" y="280">1.5</text>
    
    <!-- Hidden to Output -->
    <text x="400" y="100">0.3</text>
    <text x="400" y="140">-0.8</text>
    <text x="400" y="180">0.5</text>
    <text x="400" y="220">1.0</text>
  </g>
  
  <!-- Labels -->
  <text x="100" y="20" text-anchor="middle" font-size="14" font-weight="bold">Input Layer</text>
  <text x="300" y="20" text-anchor="middle" font-size="14" font-weight="bold">Hidden Layer</text>
  <text x="500" y="20" text-anchor="middle" font-size="14" font-weight="bold">Output Layer</text>
  <text x="600" y="160" font-size="14">Output: 0.74</text>
</svg>
</svg>

### Feedforward Calculation

Let's break down the `feedforward` function step by step:

1. **Initialization**: Set the input vector `a` as the initial activation.

    ```python
    a = input_vector
    ```

2. **First Layer Calculation (Input to Hidden)**:
   - Compute the weighted input $z_1$ for the hidden layer:
     \begin{equation}
     z_1 = w_1 \cdot a + b_1
     \end{equation}
     Substituting the values:
     $$
     z_1 = \begin{bmatrix}
     0.2 & -0.5 & 1.0 \\
     1.5 & -1.0 & 0.5 \\
     -1.5 & 2.0 & -1.0 \\
     0.5 & -0.5 & 1.5 
     \end{bmatrix} \cdot \begin{bmatrix}
     0.5 \\
     0.1 \\
     0.4 
     \end{bmatrix} + \begin{bmatrix}
     0.1 \\
     0.2 \\
     0.3 \\
     0.4 
     \end{bmatrix}
     $$
     
     Calculate the dot product:
     
     $$
     w_1 \cdot a = \begin{bmatrix}
     0.2 \cdot 0.5 + -0.5 \cdot 0.1 + 1.0 \cdot 0.4 \\
     1.5 \cdot 0.5 + -1.0 \cdot 0.1 + 0.5 \cdot 0.4 \\
     -1.5 \cdot 0.5 + 2.0 \cdot 0.1 + -1.0 \cdot 0.4 \\
     0.5 \cdot 0.5 + -0.5 \cdot 0.1 + 1.5 \cdot 0.4
     \end{bmatrix} = \begin{bmatrix}
     0.1 + -0.05 + 0.4 \\
     0.75 + -0.1 + 0.2 \\
     -0.75 + 0.2 + -0.4 \\
     0.25 + -0.05 + 0.6
     \end{bmatrix} = \begin{bmatrix}
     0.45 \\
     0.85 \\
     -0.95 \\
     0.8 
     \end{bmatrix}
     $$
     Adding the bias:
     $$
     z_1 = \begin{bmatrix}
     0.45 \\
     0.85 \\
     -0.95 \\
     0.8 
     \end{bmatrix} + \begin{bmatrix}
     0.1 \\
     0.2 \\
     0.3 \\
     0.4 
     \end{bmatrix} = \begin{bmatrix}
     0.55 \\
     1.05 \\
     -0.65 \\
     1.2 
     \end{bmatrix}
     $$

   - Apply the sigmoid activation function to get the activation $a_1$ of the hidden layer:
     $$
     a_1 = \sigma(z_1) = \sigma\left(\begin{bmatrix}
     0.55 \\
     1.05 \\
     -0.65 \\
     1.2 
     \end{bmatrix}\right) = \begin{bmatrix}
     \sigma(0.55) \\
     \sigma(1.05) \\
     \sigma(-0.65) \\
     \sigma(1.2)
     \end{bmatrix} = \begin{bmatrix}
     0.63413559 \\
     0.7407749 \\
     0.34298954 \\
     0.76852478
     \end{bmatrix}
     $$

3. **Second Layer Calculation (Hidden to Output)**:
   - Compute the weighted input $z_2$ for the output layer:
     $$
     z_2 = w_2 \cdot a_1 + b_2
     $$
     
     Substituting the values:
     
     $$
     z_2 = \begin{bmatrix}
     0.3 & -0.8 & 0.5 & 1.0
     \end{bmatrix} \cdot \begin{bmatrix}
     0.63413559 \\
     0.7407749 \\
     0.34298954 \\
     0.76852478
     \end{bmatrix} + \begin{bmatrix}
     0.5
     \end{bmatrix}
     $$
     Calculate the dot product:
     $$
     w_2 \cdot a_1 = 0.3 \cdot 0.63413559 + -0.8 \cdot 0.7407749 + 0.5 \cdot 0.34298954 + 1.0 \cdot 0.76852478 = 0.19024068 + -0.59261992 + 0.17149477 + 0.76852478 = 0.53764031
     $$
     Adding the bias:
     $$
     z_2 = 0.53764031 + 0.5 = 1.03764031
     $$

   - Apply the sigmoid activation function to get the activation `a2` of the output layer:
     $$
     a_2 = \sigma(z_2) = \sigma(1.03764031) = 0.73841914
     $$

### Final Output
So, given the input vector `[[0.5], [0.1], [0.4]]`, the network produces an output of `0.73841914`.

### Summary
- **Input Layer**: Receives the input `[[0.5], [0.1], [0.4]]`.
- **Hidden Layer**: 
  - Computes $z_1$ using weights and biases.
  - Applies the sigmoid function to $z_1$ to get $a_1$.
- **Output Layer**:
  - Computes $z_2$ using weights and biases.
  - Applies the sigmoid function to $z_2$ to get $a_2$.

This step-by-step example demonstrates how the `feedforward` function processes an input through the layers of the network to produce the final output.

In [8]:
# Create a network with the same structure as our previous example
net = Network([3, 4, 1])

# Set the weights and biases to match our previous example
net.weights = [
    np.array([[0.2, -0.5, 1.0],
              [1.5, -1.0, 0.5],
              [-1.5, 2.0, -1.0],
              [0.5, -0.5, 1.5]]),
    np.array([[0.3, -0.8, 0.5, 1.0]])
]

net.biases = [
    np.array([[0.1], [0.2], [0.3], [0.4]]),
    np.array([[0.5]])
]

# Input vector
input_vector = np.array([[0.5], [0.1], [0.4]])

# Use the feedforward method
output = net.feedforward(input_vector)

print("Network Feedforward Result:")
print(f"Input: {input_vector.T}")
print(f"Output: {output[0][0]:.8f}")

# Compare with our previous manual calculation
print("\nComparison:")
print(f"Manual calculation result: 0.73841914")
print(f"Network feedforward result: {output[0][0]:.8f}")
print(f"Difference: {abs(0.73841914 - output[0][0]):.8f}")

Network Feedforward Result:
Input: [[0.5 0.1 0.4]]
Output: 0.73839445

Comparison:
Manual calculation result: 0.73841914
Network feedforward result: 0.73839445
Difference: 0.00002469


In [9]:
# Create a network with the same structure as our previous example
# The weights and biases are initialized by the Network object.
net = Network([3, 4, 1])

# Input vector
input_vector = np.array([[0.5], [0.1], [0.4]])

# Use the feedforward method
output = net.feedforward(input_vector)

print("Network Feedforward Result:")
print(f"Input: {input_vector.T}")
print(f"Output: {output[0][0]:.8f}")

Network Feedforward Result:
Input: [[0.5 0.1 0.4]]
Output: 0.78564259


## Understand `SGD` function of `Network` object
We'll demonstrate the workflow of the `SGD` function using a hypothetical dataset for the neural network structure described in the notebook (3 input neurons, 4 hidden neurons, 1 output neuron).

Let's start by creating a hypothetical dataset and then go through the SGD function step by step.
### Step 1: Create a hypothetical dataset

In [19]:
# Create 10 random input samples
inputs = np.random.rand(12, 3)
# Create corresponding random output samples (between 0 and 1)
outputs = np.random.rand(12, 1)

# Combine inputs and outputs into training data
training_data = list(zip(inputs, outputs))

print("Training Data:")
for i, (x, y) in enumerate(training_data):
    print(f"Sample {i+1}: Input {x}, Output {y[0]:.4f}")

Training Data:
Sample 1: Input [0.23984301 0.36823297 0.73021352], Output 0.9632
Sample 2: Input [0.84115672 0.84562775 0.57006856], Output 0.4770
Sample 3: Input [0.08615032 0.12027176 0.32592854], Output 0.2486
Sample 4: Input [0.25883698 0.08358081 0.40747107], Output 0.6444
Sample 5: Input [0.91296901 0.2102162  0.80432432], Output 0.8701
Sample 6: Input [0.08612347 0.385473   0.44872146], Output 0.2585
Sample 7: Input [0.72153488 0.63716517 0.61898078], Output 0.8623
Sample 8: Input [0.36620078 0.2239276  0.59579017], Output 0.9833
Sample 9: Input [0.95651739 0.31186137 0.41393432], Output 0.5588
Sample 10: Input [0.50652544 0.35704097 0.54453999], Output 0.0947
Sample 11: Input [0.46581827 0.33754085 0.21991484], Output 0.2355
Sample 12: Input [0.95287591 0.02280938 0.92499575], Output 0.8820


### Step 2: Initialize the network

In [20]:
net = Network([3, 4, 1])

### Step 3: Set up SGD parameters

In [21]:
epochs = 3
mini_batch_size = 4
learning_rate = 0.1

Let's walk through the `SGD` function step by step:

```python
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """
        Train the neural network using mini-batch stochastic gradient descent.
        
        Parameters:
        training_data (list): A list of tuples '(x, y)' representing the training inputs
                              and the desired outputs.
        epochs (int): The number of epochs to train for.
        mini_batch_size (int): The size of the mini-batches to use when sampling.
        eta (float): The learning rate.
        test_data (list, optional): If provided, the network will be evaluated
                                    against the test data after each epoch, and
                                    partial progress will be printed out.
        """
        if test_data: n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print(f"Epoch {j}: {self.evaluate(test_data)} / {n_test}")
            else:
                print(f"Epoch {j} complete")
```

Let's break this down:

> 1. The function takes training data, number of epochs, mini batch size, learning rate (eta), and optimal test data as input/arguments.
> 2. It iterates through the number of epochs specified. For instance, if the number of epochs is 20, then it iterates through the function 10 times, each with different mini batches.
> 3. At the beginning of each epoch, the function shuffles the training data for mini-batch selection.
> 4. After shuffling, the entire training data is split into mini batches with each batch size is specified by mini-batch size. For instance, if we have 100 training examples and mini batch size is 20. Then the entire training data is splitted into 5 mini batches, each containing 20 training examples.
> 5. For each mini batch, the function calls another function `update_mini_batch` to update weights and biases of the model.
> 6. If test data is provided, it evaluates the performance of the model with the updated weights and biases.
> 7. After processing all mini batches within an epoch, it prints the epoch completion status.

In [22]:
def simulate_sgd(training_data, epochs, mini_batch_size, eta):
    n = len(training_data)
    print(f"The total number of training examples in the training data is {n}.")
    print("Training Data:")
    for i, (x, y) in enumerate(training_data):
        print(f"Sample {i+1}: Input {x}, Output {y[0]:.4f}")
        
    for j in range(epochs):
        print(f"\nEpoch {j+1}:")
        random.shuffle(training_data)
        print("Shuffled data:")
        for i, (x, y) in enumerate(training_data):
            print(f"Sample {i+1}: Input {x}, Output {y[0]:.4f}")
        
        mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
        print(f"\nNumber of mini-batches: {len(mini_batches)}")
        
        for i, mini_batch in enumerate(mini_batches):
            print(f"\nMini-batch {i+1}:")
            for x, y in mini_batch:
                print(f"Input {x}, Output {y[0]:.4f}")
            print("Updating network weights and biases...")
        
        print(f"Epoch {j+1} complete")

# Run the simulation
simulate_sgd(training_data, epochs, mini_batch_size, learning_rate)

The total number of training examples in the training data is 12.
Training Data:
Sample 1: Input [0.23984301 0.36823297 0.73021352], Output 0.9632
Sample 2: Input [0.84115672 0.84562775 0.57006856], Output 0.4770
Sample 3: Input [0.08615032 0.12027176 0.32592854], Output 0.2486
Sample 4: Input [0.25883698 0.08358081 0.40747107], Output 0.6444
Sample 5: Input [0.91296901 0.2102162  0.80432432], Output 0.8701
Sample 6: Input [0.08612347 0.385473   0.44872146], Output 0.2585
Sample 7: Input [0.72153488 0.63716517 0.61898078], Output 0.8623
Sample 8: Input [0.36620078 0.2239276  0.59579017], Output 0.9833
Sample 9: Input [0.95651739 0.31186137 0.41393432], Output 0.5588
Sample 10: Input [0.50652544 0.35704097 0.54453999], Output 0.0947
Sample 11: Input [0.46581827 0.33754085 0.21991484], Output 0.2355
Sample 12: Input [0.95287591 0.02280938 0.92499575], Output 0.8820

Epoch 1:
Shuffled data:
Sample 1: Input [0.23984301 0.36823297 0.73021352], Output 0.9632
Sample 2: Input [0.84115672 0.845

In [30]:
def simulate_update_mini_batch(net, mini_batch, eta):
    print("\nSimulating update_mini_batch:")
    print(f"Mini-batch size: {len(mini_batch)}")
    print(f"Learning rate (eta): {eta}")
    for b in net.biases:
        print(f"The shape of the biases: {b.shape}")
    for w in net.weights:
        print(f"The shape of the weights: {w.shape}")
    
    nabla_b = [np.zeros(b.shape) for b in net.biases]
    nabla_w = [np.zeros(w.shape) for w in net.weights]
    
    for i, (x, y) in enumerate(mini_batch):
        print(f"\nProcessing example {i+1}:")
        print(f"Input: {x.flatten()}")
        print(f"Target output: {y.flatten()}")
        
        # Forward propagation
        activation = x
        for j, (b, w) in enumerate(zip(net.biases, net.weights)):
            activation = sigmoid(np.dot(w, activation) + b)
            print(f"Layer {j+1} output: {activation.flatten()}")
        
        print(f"Network output: {activation.flatten()}")
        
        # Backpropagation
        delta_nabla_b, delta_nabla_w = net.backprop(x, y)
        
        print("\nGradients:")
        for j, (dnb, dnw) in enumerate(zip(delta_nabla_b, delta_nabla_w)):
            print(f"Layer {j+1}:")
            print(f"  Bias gradients: {dnb.flatten()}")
            print(f"  Weight gradients shape: {dnw.shape}")
        
        nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
        nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
    
    print("\nUpdating weights and biases:")
    for i, (w, b, nw, nb) in enumerate(zip(net.weights, net.biases, nabla_w, nabla_b)):
        print(f"\nLayer {i+1}:")
        w_update = (eta / len(mini_batch)) * nw
        b_update = (eta / len(mini_batch)) * nb
        print(f"Weight update magnitude: {np.linalg.norm(w_update)}")
        print(f"Bias update magnitude: {np.linalg.norm(b_update)}")
        net.weights[i] = w - w_update
        net.biases[i] = b - b_update

def simulate_sgd(training_data, epochs, mini_batch_size, eta):
    net = Network([3, 4, 1])
    n = len(training_data)
    
    for j in range(epochs):
        print(f"\nEpoch {j+1}:")
        random.shuffle(training_data)
        mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
        
        for i, mini_batch in enumerate(mini_batches):
            print(f"\nProcessing mini-batch {i+1}:")
            simulate_update_mini_batch(net, mini_batch, eta)
        
        print(f"\nEpoch {j+1} complete")

Let's break this down:

>1. `simulate_sgd` works in the similar way as perviously explained in this notebook.
>2. `simulate_update_mini_batch` function starts with initializing gradients with zeros. This initialization corresponds to layers. For instance, if we have three-layer network with one input layer, one hidden layer and one output layer, then the initialization corresponds to the connection between input layer and hidden layer, and between hidden layer and output layer.
>3. Next step involves iterating through each training example in the training data. For each training example in the current mini-batch:
    - Implement forward propagation to compute neuron output using sigmpoid activation function and current network weights and biases, which is stored in the `activation` variable.
    - Implement backpropagation to compute `delta_nabla_b` and `delta_nabla_w`. `delta_nabla_b` and `delta_nabla_w` are partial derivatives of the cost function with respect to biases and weights, respectively, for a single training example.
    - The gradients are accumulated in `nabla_b` and `nabla_w`
>4. After processing all training examples in a mini-batch:
    - `w_update` and `b_update` represents the average of accumulated gradient, adjusted by the learning rate.
    - `weights` and `biases` are then updated based on the `w_update` and `b_update`.
    
Two important points here:
1. Single training example: The backprop method computes these gradients for one training example at a time. This is why we accumulate these gradients over the mini-batch.
2. Iteration specificity: These gradients are specific to the current state of the network (current weights and biases) and the particular training example being processed in that iteration.

In [31]:
# Create sample training data
np.random.seed(42)  # for reproducibility
training_data = [(np.random.rand(3, 1), np.random.rand(1, 1)) for _ in range(12)]
for i in range(len(training_data)):
    print(f"The {i+1}th training example is: {training_data[i]}")

# Run the simulation
simulate_sgd(training_data, epochs=2, mini_batch_size=4, eta=0.1)

The 1th training example is: (array([[0.37454012],
       [0.95071431],
       [0.73199394]]), array([[0.59865848]]))
The 2th training example is: (array([[0.15601864],
       [0.15599452],
       [0.05808361]]), array([[0.86617615]]))
The 3th training example is: (array([[0.60111501],
       [0.70807258],
       [0.02058449]]), array([[0.96990985]]))
The 4th training example is: (array([[0.83244264],
       [0.21233911],
       [0.18182497]]), array([[0.18340451]]))
The 5th training example is: (array([[0.30424224],
       [0.52475643],
       [0.43194502]]), array([[0.29122914]]))
The 6th training example is: (array([[0.61185289],
       [0.13949386],
       [0.29214465]]), array([[0.36636184]]))
The 7th training example is: (array([[0.45606998],
       [0.78517596],
       [0.19967378]]), array([[0.51423444]]))
The 8th training example is: (array([[0.59241457],
       [0.04645041],
       [0.60754485]]), array([[0.17052412]]))
The 9th training example is: (array([[0.06505159],
     