In [16]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)

# Function to create dummy data
def create_data(points, classes): 
    x = np.zeros((points*classes, 64))  # Adjusted to 64 features
    y = np.zeros(points*classes, dtype='uint8')
    for class_number in range(classes):
        ix = range(points*class_number, points*(class_number+1))
        r = np.linspace(0.0, 1, points)  # radius 
        t = np.linspace(class_number*4, (class_number+1)*4, points) + np.random.randn(points)*0.2
        x[ix] = np.c_[r*np.sin(t*2.5), r*np.cos(t*2.5)]  # Dummy features adjusted to 64
        y[ix] = class_number
    return x, y

# Layers and activations
class Layer:
    def __init__(self, input_size, neuron_size):
        self.weights = np.random.randn(input_size, neuron_size) * np.sqrt(2.0 / input_size)
        self.biases = np.zeros((1, neuron_size))
        self.dw = np.zeros_like(self.weights)
        self.db = np.zeros_like(self.biases)
        self.v_dw = np.zeros_like(self.weights)
        self.v_db = np.zeros_like(self.biases)
        self.s_dw = np.zeros_like(self.weights)
        self.s_db = np.zeros_like(self.biases)

    def forward(self, inputs):
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, inputs, grad_output, learning_rate):
        grad_inputs = np.dot(grad_output, self.weights.T)
        grad_weights = np.dot(inputs.T, grad_output)
        grad_biases = np.sum(grad_output, axis=0, keepdims=True)
        
        # Adam optimizer
        beta1 = 0.9
        beta2 = 0.999
        epsilon = 1e-7

        self.v_dw = beta1 * self.v_dw + (1 - beta1) * grad_weights
        self.v_db = beta1 * self.v_db + (1 - beta1) * grad_biases
        self.s_dw = beta2 * self.s_dw + (1 - beta2) * (grad_weights ** 2)
        self.s_db = beta2 * self.s_db + (1 - beta2) * (grad_biases ** 2)
        
        v_dw_corrected = self.v_dw / (1 - beta1)
        v_db_corrected = self.v_db / (1 - beta1)
        s_dw_corrected = self.s_dw / (1 - beta2)
        s_db_corrected = self.s_db / (1 - beta2)
        
        self.weights -= learning_rate * v_dw_corrected / (np.sqrt(s_dw_corrected) + epsilon)
        self.biases -= learning_rate * v_db_corrected / (np.sqrt(s_db_corrected) + epsilon)

class ReluActivation:
    def forward(self, inputs):
        self.output = np.maximum(0, inputs)

class SoftmaxActivation:
    def forward(self, inputs):
        normalized_inputs = np.exp(inputs - np.amax(inputs, axis=1, keepdims=True))
        self.output = normalized_inputs / np.sum(normalized_inputs, axis=1, keepdims=True)

class CrossEntropyLoss:
    @staticmethod
    def calculate_loss(y_pred, y_true):
        clipped_values = np.clip(y_pred, 1e-7, 1 - 1e-7)
        if len(y_true.shape) == 1:
            correct_confidences = clipped_values[range(len(clipped_values)), y_true]
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(clipped_values * y_true, axis=1)
        return np.mean(-np.log(correct_confidences))

# Create dummy data with adjusted features
X, y = create_data(100, 64)

# Initialize layers and activations
l1 = Layer(64, 32)  # Adjusted input_size to match features in X
ac1 = ReluActivation()
l2 = Layer(32, 64)
ac2 = SoftmaxActivation()

# Forward pass
l1.forward(X)
ac1.forward(l1.output)
l2.forward(ac1.output)
ac2.forward(l2.output)

# Calculate initial loss
initial_loss = CrossEntropyLoss.calculate_loss(ac2.output, y)
print("Initial Loss:", initial_loss)

# Backward pass (Gradient calculation)
grad_ac2 = ac2.output.copy()
grad_ac2[range(len(y)), y] -= 1
grad_ac2 /= len(y)

# Backpropagate gradients
ac2_backward = grad_ac2
l2.backward(ac1.output, ac2_backward, learning_rate=0.001)

ac1_backward = np.dot(ac2_backward, l2.weights.T)
l1.backward(X, ac1_backward, learning_rate=0.001)

# Re-calculate loss after optimization
l1.forward(X)
ac1.forward(l1.output)
l2.forward(ac1.output)
ac2.forward(l2.output)

final_loss = CrossEntropyLoss.calculate_loss(ac2.output, y)
print("Final Loss:", final_loss)


ValueError: shape mismatch: value array of shape (100,2) could not be broadcast to indexing result of shape (100,64)