In [110]:
import random
import seaborn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm


In [120]:

def init_params(nx, nh, ny):
    """
    Initialize weights for an MLP based on the number of neurons in each layer.
    
    Arguments:
    nx -- integer, number of neurons in input layer
    nh -- integer, number of neurons in hidden layer
    ny -- integer, number of neurons in output layer
    
    Returns:
    params -- dictionary containing the initialized weights for each layer
    """
    
    np.random.seed(1)
    
    # Initialize weights with normal distribution
    w1 = np.random.normal(loc=0, scale=0.3, size=(nh, nx))
    w2 = np.random.normal(loc=0, scale=0.3, size=(ny, nh))
    
    # Initialize biases with zeros
    b1 = np.zeros((nh, 1))
    b2 = np.zeros((ny, 1))
    
    # Store initialized weights and biases in dictionary
    params = {"W1": w1, "W2": w2, "b1": b1, "b2": b2}
    
    return params

In [121]:
init_params(2,3,2)

{'W1': array([[ 0.48730361, -0.18352692],
        [-0.15845153, -0.32189059],
        [ 0.25962229, -0.69046161]]),
 'W2': array([[ 0.52344353, -0.22836207,  0.09571173],
        [-0.07481111,  0.43863238, -0.61804221]]),
 'b1': array([[0.],
        [0.],
        [0.]]),
 'b2': array([[0.],
        [0.]])}

In [122]:
"""
Perform forward propagation for a multi-layer perceptron.

Parameters:
    params (dict): A dictionary containing the weights and biases for each layer of the MLP.
    X (ndarray): An input data matrix of shape (n_batch, n_x).

Returns:
    tuple: A tuple containing the predicted values y_pred and intermediate values.
"""

def forward(params, X):

    # Unpack parameters
    W1, b1, W2, b2 = params['W1'], params['b1'], params['W2'], params['b2']
    
    # Forward propagation
    z1 = np.dot(X, W1) + b1
    h1 = np.tanh(z1)
    z2 = np.dot(h1, W2) + b2
    ##print(z2)
    y_pred = softmax(z2)
    
    # Save intermediate values for backpropagation
    intermediate = {'z1': z1, 'h1': h1, 'z2': z2}
    
    return y_pred, intermediate
"""
Compute softmax activation function for input x.

Parameters:
    x (ndarray): Input array.

Returns:
    ndarray: Output array after applying softmax function.
"""
def softmax(x):

    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

In [123]:
# Input data
X = np.array([[0.5, 0.2]])
#print(X.shape)
# Parameters
params = {
    'W1': np.array([[0.1, 0.2], [0.3, 0.4]]),
    'b1': np.array([0.1, 0.2]),
    'W2': np.array([[0.5, 0.6], [0.7, 0.8]]),
    'b2': np.array([0.3, 0.4])
}

# Test the forward function
y_pred, intermediate = forward(params, X)
print("Predicted output:", y_pred)
print(intermediate)

Predicted output: [[0.46083853 0.53916147]]
{'z1': array([[0.21, 0.38]]), 'h1': array([[0.2069665 , 0.36270747]]), 'z2': array([[0.65737848, 0.81434587]])}


In [124]:
def loss_accuracy(Yhat, Y):
    """
    Compute cross-entropy loss and accuracy between predicted and true values.
    
    Parameters:
        Yhat (ndarray): Predicted values.
        Y (ndarray): True values.
        
    Returns:
        tuple: A tuple containing the cross-entropy loss and accuracy.
    """
    # Calculate cross-entropy loss
    loss = -np.sum(Y * np.log(Yhat)) / Y.shape[0]
    
    # Calculate accuracy
    Y_pred = np.argmax(Yhat, axis=1)
    Y_true = np.argmax(Y, axis=1)
    accuracy = np.sum(Y_pred == Y_true) / Y.shape[0]
    
    return loss, accuracy

In [125]:

# Generate random data for testing
n_batch = 5
n_y = 3
Yhat = np.random.rand(n_batch, n_y)
print(Yhat)
Y = np.random.randint(0, n_y, size=n_batch)
"""
Calculate the cross-entropy loss and accuracy of predicted values.

Parameters:
    Yhat (ndarray): Predicted values of shape (n_batch, n_y).
    Y (ndarray): True values of shape (n_batch,).

Returns:
    tuple: A tuple containing the loss and accuracy.
"""
def loss_accuracy(Yhat, Y):

    # Calculate cross-entropy loss
    N = len(Y) 
    print(Yhat)
    loss = -np.sum(np.log(Yhat[np.arange(N), Y])) / N
    
    # Calculate accuracy
   
    y_pred = np.argmax(Yhat, axis=1)
    accuracy = np.sum(y_pred == Y) / N
    
    return loss, accuracy

# Test the function
loss, accuracy = loss_accuracy(Yhat, Y)
print("Loss:", loss)
print("Accuracy:", accuracy)

[[0.14038694 0.19810149 0.80074457]
 [0.96826158 0.31342418 0.69232262]
 [0.87638915 0.89460666 0.08504421]
 [0.03905478 0.16983042 0.8781425 ]
 [0.09834683 0.42110763 0.95788953]]
[[0.14038694 0.19810149 0.80074457]
 [0.96826158 0.31342418 0.69232262]
 [0.87638915 0.89460666 0.08504421]
 [0.03905478 0.16983042 0.8781425 ]
 [0.09834683 0.42110763 0.95788953]]
Loss: 1.1750767324054703
Accuracy: 0.2


In [126]:
def backward(params, intermediate, Y):
    """
    Perform backward propagation for a multi-layer perceptron.
    
    Parameters:
        params (dict): A dictionary containing the weights and biases for each layer of the MLP.
        intermediate (dict): A dictionary containing intermediate values from the forward propagation.
        Y (ndarray): A target values matrix of shape (n_batch, n_y).
        
    Returns:
        dict: A dictionary containing the gradients of the loss with respect to the parameters.
    """
    # Unpack parameters
    W1, b1, W2, b2 = params['W1'], params['b1'], params['W2'], params['b2']
    
    # Unpack intermediate values
    z1, h1, z2 = intermediate['z1'], intermediate['h1'], intermediate['z2']
    
    # Compute gradients
    dL_dz2 = (1 / Y.shape[0]) * (softmax(z2) - Y)
    dL_dW2 = np.dot(h1.T, dL_dz2)
    dL_db2 = np.sum(dL_dz2, axis=0)
    dL_dh1 = np.dot(dL_dz2, W2.T) * (1 - h1**2)
    dL_dz1 = dL_dh1
    dL_dW1 = np.dot(X.T, dL_dz1)
    dL_db1 = np.sum(dL_dz1, axis=0)
    
    # Pack gradients into a dictionary
    gradients = {'W1': dL_dW1, 'b1': dL_db1, 'W2': dL_dW2, 'b2': dL_db2}
    
    return gradients


In [127]:
def loss_accuracy(params, X, Y):
    """
    Calculate the loss and accuracy of a multi-layer perceptron on a given dataset.
    
    Parameters:
        params (dict): A dictionary containing the weights and biases for each layer of the MLP.
        X (ndarray): A feature matrix of shape (n_batch, n_features).
        Y (ndarray): A target values matrix of shape (n_batch, n_y).
        
    Returns:
        tuple: A tuple containing the loss and accuracy of the MLP.
    """
    # Perform forward propagation
    intermediate = forward(params, X)
    Yhat = intermediate['z2']
    
    # Compute loss
    loss = -np.mean(np.sum(Y * np.log(softmax(Yhat)), axis=1))
    
    # Compute accuracy
    predictions = np.argmax(Yhat, axis=1)
    true_labels = np.argmax(Y, axis=1)
    accuracy = np.mean(predictions == true_labels)
    
    return loss, accuracy


In [128]:
# Initialize parameters
params = init_params(5, 3, 3)
# Input data
X = np.array([[0.5, 0.2]])
#print(X.shape)
# Parameters
params = {
    'W1': np.array([[0.1, 0.2], [0.3, 0.4]]),
    'b1': np.array([0.1, 0.2]),
    'W2': np.array([[0.5, 0.6], [0.7, 0.8]]),
    'b2': np.array([0.3, 0.4])
}
# Set hyperparameters
learning_rate = 0.01
n_epochs = 500

# Training loop
for epoch in range(n_epochs):
    # Forward pass
    Yhat, intermediate = forward(params, X)
    
    # Compute loss and accuracy
    loss, accuracy = loss_accuracy(Yhat,X, Y)
    
    # Backward pass
    grads = backward(params, intermediate, Y)
    
    # Update parameters
    params['W1'] -= learning_rate * grads['W1']
    params['b1'] -= learning_rate * grads['b1']
    params['W2'] -= learning_rate * grads['W2']
    params['b2'] -= learning_rate * grads['b2']
    
    # Print progress
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch + 1} -- Loss: {loss:.4f} -- Accuracy: {accuracy:.4f}")


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


$$\frac{\partial \mathcal{L}}{\partial z^{[2]}} = \frac{1}{m}(\hat{Y} - Y)$$

$$\frac{\partial \mathcal{L}}{\partial W^{[2]}} = h^{[1]T}\frac{\partial \mathcal{L}}{\partial z^{[2]}}$$

$$\frac{\partial \mathcal{L}}{\partial b^{[2]}} = \sum_{i=1}^{m} \frac{\partial \mathcal{L}}{\partial z_i^{[2]}}$$

$$\frac{\partial \mathcal{L}}{\partial h^{[1]}} = \frac{\partial \mathcal{L}}{\partial z^{[2]}}W^{[2]T}\odot (1 - (h^{[1]})^2)$$

$$\frac{\partial \mathcal{L}}{\partial z^{[1]}} = \frac{\partial \mathcal{L}}{\partial h^{[1]}}$$

$$\frac{\partial \mathcal{L}}{\partial W^{[1]}} = X^T\frac{\partial \mathcal{L}}{\partial z^{[1]}}$$

$$\frac{\partial \mathcal{L}}{\partial b^{[1]}} = \sum_{i=1}^{m} \frac{\partial \mathcal{L}}{\partial z_i^{[1]}}$$

Where:
<br>
$\mathcal{L}$ is the loss function
<br>$m$ is the batch size
<br>$Y$ is the true labels (target values)
<br>$\hat{Y}$ is the predicted labels (output of the forward propagation)
<br>$z^{[2]}$ is the weighted sum of the hidden layer, i.e., $z^{[2]} = W^{[2]}h^{[1]} + b^{[2]}$
<br>$W^{[2]}$ and $b^{[2]}$ are the weights and bias for the output layer
<br>$h^{[1]}$ is the output of the hidden layer, i.e., $h^{[1]} = \tanh(z^{[1]})$
<br>$z^{[1]}$ is the weighted sum of the input layer, i.e., $z^{[1]} = W^{[1]}X + b^{[1]}$
<br>$W^{[1]}$ and $b^{[1]}$ are the weights and bias for the hidden layer
<br>$X$ is the input data.
<br>