In [None]:
import numpy as np

# Activation functions
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``, element-wise.

    The value is evaluated as ``exp(-log(1 + exp(-x)))`` using
    ``np.logaddexp(0, -x)`` for the inner logarithm. That form is
    mathematically identical to the textbook formula but never feeds a large
    positive number to ``exp``, so very negative inputs smoothly return 0.0
    instead of triggering an overflow warning.

    Args:
        x: A number or NumPy array of pre-activation values.

    Returns:
        The sigmoid of ``x`` with the same shape as ``x``; values lie in [0, 1].
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), computed stably.
    return np.exp(-np.logaddexp(0.0, -x))

def sigmoid_derivative(x):
    """Return ``x * (1 - x)``, the sigmoid's slope expressed via its output.

    No sigmoid is applied here: ``x`` is used directly as the activation
    value ``s``, and the result is ``s * (1 - s)``. Pass the sigmoid output,
    not the pre-activation.

    Args:
        x: A number or array of sigmoid activations.

    Returns:
        ``x * (1 - x)``, with the same shape as ``x``.
    """
    complement = 1 - x
    return x * complement

class MultiLayerPerceptron:
    """Feed-forward network with two sigmoid hidden layers and a sigmoid output.

    Training is plain full-batch gradient descent on the halved mean squared
    error. The methods share state through attributes: ``forward`` caches the
    input and every layer's pre-activation/activation, ``backward`` reads
    those caches and stores the gradients, and ``update_weights`` applies the
    stored gradients. ``train_iteration`` runs the three in that order.
    """

    def __init__(self, input_size=2, hidden1_size=8, hidden2_size=6, output_size=2, learning_rate=0.1):
        """Record the layer sizes and create the initial parameters.

        Args:
            input_size: Number of input features.
            hidden1_size: Number of units in the first hidden layer.
            hidden2_size: Number of units in the second hidden layer.
            output_size: Number of output units.
            learning_rate: Step size used by ``update_weights``.
        """
        self.input_size, self.hidden1_size = input_size, hidden1_size
        self.hidden2_size, self.output_size = hidden2_size, output_size
        self.learning_rate = learning_rate

        # Weights are standard-normal draws from NumPy's global RNG scaled by
        # 0.01; biases start as zero row vectors. The draws happen in layer
        # order (w1, w2, w3).

        # Layer 1: input -> hidden1
        self.w1 = 0.01 * np.random.randn(input_size, hidden1_size)
        self.b1 = np.zeros((1, hidden1_size))

        # Layer 2: hidden1 -> hidden2
        self.w2 = 0.01 * np.random.randn(hidden1_size, hidden2_size)
        self.b2 = np.zeros((1, hidden2_size))

        # Layer 3: hidden2 -> output
        self.w3 = 0.01 * np.random.randn(hidden2_size, output_size)
        self.b3 = np.zeros((1, output_size))

    def forward(self, X):
        """Propagate ``X`` through the three layers and return the output.

        The input, each pre-activation (``z1``..``z3``), each hidden
        activation (``a1``, ``a2``) and the final ``output`` are stored on
        the instance for use by ``backward``.
        """
        self.X = X

        # Layer 1: affine map followed by sigmoid
        self.z1 = np.dot(self.X, self.w1) + self.b1
        self.a1 = sigmoid(self.z1)

        # Layer 2
        self.z2 = np.dot(self.a1, self.w2) + self.b2
        self.a2 = sigmoid(self.z2)

        # Output layer
        self.z3 = np.dot(self.a2, self.w3) + self.b3
        self.output = sigmoid(self.z3)

        return self.output

    def _mean_gradients(self, layer_input, delta, batch_size):
        """Return (weight gradient, bias gradient) for one layer, each divided by ``batch_size``."""
        weight_grad = np.dot(layer_input.T, delta) / batch_size
        bias_grad = np.sum(delta, axis=0, keepdims=True) / batch_size
        return weight_grad, bias_grad

    def backward(self, y):
        """Backpropagate the error of the last ``forward`` call against ``y``.

        Stores the gradients as ``dw1``/``db1`` .. ``dw3``/``db3`` and returns
        the loss ``sum((output - y) ** 2) / (2 * m)``, where ``m`` is the
        first dimension of the cached input.
        """
        batch_size = self.X.shape[0]

        residual = self.output - y
        loss = np.sum(residual ** 2) / (2 * batch_size)

        # Error signal at each layer, walking from the output back to layer 1.
        # sigmoid_derivative is fed the stored activations.
        delta_out = residual * sigmoid_derivative(self.output)
        delta_h2 = np.dot(delta_out, self.w3.T) * sigmoid_derivative(self.a2)
        delta_h1 = np.dot(delta_h2, self.w2.T) * sigmoid_derivative(self.a1)

        # Turn each error signal into parameter gradients using the
        # activations that fed the corresponding layer.
        self.dw3, self.db3 = self._mean_gradients(self.a2, delta_out, batch_size)
        self.dw2, self.db2 = self._mean_gradients(self.a1, delta_h2, batch_size)
        self.dw1, self.db1 = self._mean_gradients(self.X, delta_h1, batch_size)

        return loss

    def update_weights(self):
        """Take one gradient-descent step using the gradients stored by ``backward``."""
        step = self.learning_rate
        parameter_gradient_pairs = (
            (self.w1, self.dw1),
            (self.b1, self.db1),
            (self.w2, self.dw2),
            (self.b2, self.db2),
            (self.w3, self.dw3),
            (self.b3, self.db3),
        )
        for parameter, gradient in parameter_gradient_pairs:
            # In-place subtraction, so the arrays held by the instance are updated.
            parameter -= step * gradient

    def train_iteration(self, X, y):
        """Run forward, backward and the weight update once.

        Returns:
            The loss reported by ``backward``, i.e. measured before the update.
        """
        self.forward(X)
        loss = self.backward(y)
        self.update_weights()
        return loss


# Build a tiny reproducible dataset: 4 samples with 2 features and 2 targets each
np.random.seed(42)  # Fixed seed so every run prints the same numbers
X = np.random.randn(4, 2)
y = np.random.rand(4, 2)

# Instantiate the network; its weights are drawn from the same seeded stream
mlp = MultiLayerPerceptron(input_size=2, hidden1_size=8, hidden2_size=6, output_size=2)


def _print_weights(headings):
    """Print each heading followed by the current w1, w2 and w3 of ``mlp``, in that order."""
    for heading, matrix in zip(headings, (mlp.w1, mlp.w2, mlp.w3)):
        print(heading)
        print(matrix)


# Headings reused after every training step
_UPDATED_HEADINGS = ("\nUpdated W1:", "\nUpdated W2:", "\nUpdated W3:")

# Show the freshly initialised weights
print("Initial weights:")
_print_weights((
    "W1 (input -> hidden1):",
    "\nW2 (hidden1 -> hidden2):",
    "\nW3 (hidden2 -> output):",
))

# Training step 1: report the loss, then the weights after the update
error1 = mlp.train_iteration(X, y)
print("\n\nAfter iteration 1:")
print(f"Error: {error1:.6f}")
_print_weights(_UPDATED_HEADINGS)

# Training step 2
error2 = mlp.train_iteration(X, y)
print("\n\nAfter iteration 2:")
print(f"Error: {error2:.6f}")
_print_weights(_UPDATED_HEADINGS)

# Run a forward pass on the first sample only (kept 2-D by slicing)
print("\n\nPredict for the first sample:")
sample = X[:1]
prediction = mlp.forward(sample)
print(f"Input: {sample}")
print(f"Prediction: {prediction}")

Initial weights:
W1 (input -> hidden1):
[[-0.01724918 -0.00562288 -0.01012831  0.00314247 -0.00908024 -0.01412304
   0.01465649 -0.00225776]
 [ 0.00067528 -0.01424748 -0.00544383  0.00110923 -0.01150994  0.00375698
  -0.00600639 -0.00291694]]

W2 (hidden1 -> hidden2):
[[-0.00601707  0.01852278 -0.00013497 -0.01057711  0.00822545 -0.01220844]
 [ 0.00208864 -0.0195967  -0.01328186  0.00196861  0.00738467  0.00171368]
 [-0.00115648 -0.00301104 -0.01478522 -0.00719844 -0.00460639  0.01057122]
 [ 0.00343618 -0.0176304   0.00324084 -0.00385082 -0.00676922  0.00611676]
 [ 0.01031     0.0093128  -0.00839218 -0.00309212  0.00331263  0.00975545]
 [-0.00479174 -0.00185659 -0.01106335 -0.01196207  0.00812526  0.0135624 ]
 [-0.0007201   0.01003533  0.00361636 -0.0064512   0.00361396  0.01538037]
 [-0.00035826  0.01564644 -0.02619745  0.00821903  0.00087047 -0.00299007]]

W3 (hidden2 -> output):
[[ 0.00091761 -0.01987569]
 [-0.00219672  0.00357113]
 [ 0.01477894 -0.0051827 ]
 [-0.00808494 -0.0050175