In [None]:
import numpy as np
def softmax(z):
    """
    Compute softmax probabilities for a matrix (or a single vector) of logits.

    Parameters:
    z (array-like): Logits (raw scores). A 2-D input of shape (m, n) is treated as
        m samples with n class scores each; a 1-D input of shape (n,) is treated as
        the class scores of a single sample.

    Returns:
    numpy.ndarray: Array with the same shape as z. For 2-D input every row sums to 1;
        for 1-D input the whole vector sums to 1.

    Notes:
    - The maximum along the class axis is subtracted before exponentiating. This does
      not change the result but keeps np.exp from overflowing on large logits.
    """
    z = np.asarray(z)
    # A lone 1-D vector has its classes on axis 0; otherwise classes live on axis 1.
    class_axis = 0 if z.ndim == 1 else 1
    z_max = np.max(z, axis=class_axis, keepdims=True)
    exp_z = np.exp(z - z_max)
    return exp_z / np.sum(exp_z, axis=class_axis, keepdims=True)
# Two rows of logits: one with distinct scores and one where every score is equal
z_test = np.array([
    [2.0, 1.0, 0.1],
    [1.0, 1.0, 1.0],
])
softmax_output = softmax(z_test)
# Every row is a probability distribution, so its entries have to add up to 1
row_sums = softmax_output.sum(axis=1)
assert np.allclose(row_sums, 1), f"Test failed: Row sums are {row_sums}"
print("Softmax function passed the test case!")

Softmax function passed the test case!


"""
Compute the softmax probabilities for a given input matrix.
Parameters:
z (numpy.ndarray): Logits (raw scores) of shape (m, n), where

- m is the number of samples.
- n is the number of classes.

Returns:
numpy.ndarray: Softmax probability matrix of shape (m, n), where
each row sums to 1 and represents the probability
distribution over classes.

Notes:
- The input to softmax is typically computed as: z = XW + b.
- Uses numerical stabilization by subtracting the max value per row.
"""

In [None]:
def predict_softmax(X, W, b):
    """
    Predict the class labels for a set of samples using the trained softmax model.

    Parameters:
    X (numpy.ndarray): Feature matrix of shape (n, d), where n is the number of samples
        and d is the number of features.
    W (numpy.ndarray): Weight matrix of shape (d, c), where c is the number of classes.
    b (numpy.ndarray): Bias vector of shape (c,).

    Returns:
    numpy.ndarray: Predicted class labels of shape (n,), where each value is the index
        of the predicted class.
    """
    # Logits for every sample/class pair, then row-wise probabilities.
    z = np.dot(X, W) + b
    probabilities = softmax(z)
    # The predicted label is the column holding the largest probability in each row.
    predicted_classes = np.argmax(probabilities, axis=1)
    return predicted_classes

In [None]:
# Shape check: the prediction vector must contain exactly one label per input sample.
# Inputs: 3 samples with 2 features each, mapped onto 3 classes.
X_test = np.array([
    [0.2, 0.8],
    [0.5, 0.5],
    [0.9, 0.1],
])
W_test = np.array([
    [0.4, 0.2, 0.1],
    [0.3, 0.7, 0.5],
])
b_test = np.array([0.1, 0.2, 0.3])
# The result is expected to be an array of class indices (0, 1, or 2)
y_pred_test = predict_softmax(X_test, W_test, b_test)
# One label per sample -> shape (3,)
assert y_pred_test.shape == (3,), f"Test failed: Expected shape (3,), got {y_pred_test.shape}"
# Show the labels that were predicted
print("Predicted class labels:", y_pred_test)

Predicted class labels: [1 1 0]


In [None]:
def loss_softmax(y_pred, y):
    """
    Cross-entropy between predicted probabilities and one-hot encoded targets.

    Parameters:
    y_pred (numpy.ndarray): Predicted probabilities, e.g. shape (c,) for one sample
        with c classes.
    y (numpy.ndarray): True labels (one-hot encoded), matching y_pred in shape.

    Returns:
    float: -sum(y * log(y_pred)) taken over every element. Passing a stacked (n, c)
        batch therefore yields the summed (not averaged) loss of all rows.
    """
    tiny = 1e-12
    # Keep every probability strictly inside (0, 1) so the logarithm stays finite.
    safe_probs = np.clip(y_pred, tiny, 1 - tiny)
    weighted_log = y * np.log(safe_probs)
    return -np.sum(weighted_log)

In [None]:
import numpy as np
  # This test case compares the loss for correct vs. incorrect predictions.
  # Expects low loss for correct predictions.
  # Expects high loss for incorrect predictions.
  # Define correct predictions (low loss scenario)
y_true_correct = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])  # True one-hot labels
y_pred_correct = np.array([[0.9, 0.05, 0.05], [0.1, 0.85, 0.05], [0.05, 0.1, 0.85]])  # High confidence in the correct class

# Define incorrect predictions (high loss scenario)
y_pred_incorrect = np.array([[0.05, 0.05, 0.9], [0.1, 0.05, 0.85], [0.85, 0.1, 0.05]])  # Highly confident in the wrong class

# Compute loss for both cases.
# loss_softmax sums over every element, so each value is the total loss of the 3 samples.
loss_correct = loss_softmax(y_pred_correct, y_true_correct)
loss_incorrect = loss_softmax(y_pred_incorrect, y_true_correct)
# Validate that incorrect predictions lead to a higher loss
assert loss_correct < loss_incorrect, (
    f"Test failed: Expected loss_correct < loss_incorrect, "
    f"but got {loss_correct:.4f} >= {loss_incorrect:.4f}"
)
# Print results
print(f"Cross-Entropy Loss (Correct Predictions): {loss_correct:.4f}")
print(f"Cross-Entropy Loss (Incorrect Predictions): {loss_incorrect:.4f}")

Cross-Entropy Loss (Correct Predictions): 0.4304
Cross-Entropy Loss (Incorrect Predictions): 8.9872


Implement the cost function


In [None]:
def cost_softmax(X, y, W, b):
    """
    Compute the average softmax regression cost (cross-entropy loss) over all samples.

    Parameters:
    X (numpy.ndarray): Feature matrix of shape (n, d), where n is the number of samples
        and d is the number of features.
    y (numpy.ndarray): True labels (one-hot encoded) of shape (n, c), where c is the
        number of classes.
    W (numpy.ndarray): Weight matrix of shape (d, c).
    b (numpy.ndarray): Bias vector of shape (c,).

    Returns:
    float: Average softmax cost (cross-entropy loss) over all samples.

    Notes:
    - The log-probabilities are computed directly as
      log_softmax(z) = (z - max) - log(sum(exp(z - max))).
      Taking np.log of softmax probabilities instead yields -inf whenever a
      probability underflows to 0, and 0 * -inf then turns the whole cost into NaN.
    """
    n = X.shape[0]  # Number of samples
    z = np.dot(X, W) + b
    # Shift each row by its maximum so np.exp cannot overflow.
    shifted = z - np.max(z, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    total_loss = -np.sum(y * log_probs)
    # Return average loss
    return total_loss / n

In [None]:
# The test case checks that the cost for the incorrect prediction is higher than for the correct prediction, confirming that the cost function behaves as expected.
import numpy as np
# Example 1: Correct Prediction (Closer predictions)
X_correct = np.array([[1.0, 0.0], [0.0, 1.0]])  # Feature matrix for correct predictions
y_correct = np.array([[1, 0], [0, 1]])  # True labels (one-hot encoded, matching predictions)
W_correct = np.array([[5.0, -2.0], [-3.0, 5.0]])  # Weights for correct prediction
b_correct = np.array([0.1, 0.1])  # Bias for correct prediction
# Example 2: Incorrect Prediction (Far off predictions)
X_incorrect = np.array([[0.1, 0.9], [0.8, 0.2]])  # Feature matrix for incorrect predictions
y_incorrect = np.array([[1, 0], [0, 1]])  # True labels (one-hot encoded, incorrect predictions)
W_incorrect = np.array([[0.1, 2.0], [1.5, 0.3]])  # Weights for incorrect prediction
b_incorrect = np.array([0.5, 0.6])  # Bias for incorrect prediction
# Compute cost for correct predictions
cost_correct = cost_softmax(X_correct, y_correct, W_correct, b_correct)
# Compute cost for incorrect predictions
cost_incorrect = cost_softmax(X_incorrect, y_incorrect, W_incorrect, b_incorrect)
# Check if the cost for incorrect predictions is greater than for correct predictions.
# This assert must be its own statement; otherwise "Test passed!" is printed unconditionally.
assert cost_incorrect > cost_correct, f"Test failed: Incorrect cost {cost_incorrect} is not greater than correct cost {cost_correct}"
# Print the costs for verification
print("Cost for correct prediction:", cost_correct)
print("Cost for incorrect prediction:", cost_incorrect)
print("Test passed!")

Cost for correct prediction: 0.0006234364133349324
Cost for incorrect prediction: 0.29930861359446115
Test passed!


Compute the gradient


In [None]:
def compute_gradient_softmax(X, y, W, b):
    """
    Gradients of the averaged softmax cross-entropy cost for weights and biases.

    Parameters:
    X (numpy.ndarray): Feature matrix of shape (n, d).
    y (numpy.ndarray): True labels (one-hot encoded) of shape (n, c).
    W (numpy.ndarray): Weight matrix of shape (d, c).
    b (numpy.ndarray): Bias vector of shape (c,).

    Returns:
    tuple: (grad_w, grad_b), the gradient with respect to the weights, shape (d, c),
        followed by the gradient with respect to the biases, shape (c,).
    """
    # Unpacking two values also enforces that X is two-dimensional.
    n, _ = X.shape
    # Prediction error (probabilities minus one-hot targets) drives both gradients.
    residual = softmax(np.dot(X, W) + b) - y
    grad_w = np.dot(X.T, residual) / n
    grad_b = np.sum(residual, axis=0) / n
    return grad_w, grad_b

In [None]:
import numpy as np
# Inputs: 3 samples with 2 features each, and one one-hot label row per sample (3 classes)
X_test = np.array([
    [0.2, 0.8],
    [0.5, 0.5],
    [0.9, 0.1],
])
y_test = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
])
# Model parameters: weights (2 features x 3 classes) and bias (3 classes)
W_test = np.array([
    [0.4, 0.2, 0.1],
    [0.3, 0.7, 0.5],
])
b_test = np.array([0.1, 0.2, 0.3])
# Gradients as returned by the function under test
grad_W, grad_b = compute_gradient_softmax(X_test, y_test, W_test, b_test)
# Reference values: redo the same computation step by step outside the function
z_test = X_test @ W_test + b_test
y_pred_test = softmax(z_test)
grad_W_manual = X_test.T @ (y_pred_test - y_test) / X_test.shape[0]
grad_b_manual = (y_pred_test - y_test).sum(axis=0) / X_test.shape[0]
# The function's gradients have to agree with the reference values
assert np.allclose(grad_W, grad_W_manual), f"Test failed: Gradients w.r.t. W are not equal.\nExpected: {grad_W_manual}\nGot: {grad_W}"
assert np.allclose(grad_b, grad_b_manual), f"Test failed: Gradients w.r.t. b are not equal.\nExpected: {grad_b_manual}\nGot: {grad_b}"
# Show the gradients
print("Gradient w.r.t. W:", grad_W)
print("Gradient w.r.t. b:", grad_b)
print("Test passed!")

Gradient w.r.t. W: [[ 0.1031051   0.01805685 -0.12116196]
 [-0.13600547  0.00679023  0.12921524]]
Gradient w.r.t. b: [-0.03290036  0.02484708  0.00805328]
Test passed!
