In [None]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)

# Function to create dummy data
def create_data(points, classes, n_features=64):
    """Build a zero-padded spiral toy dataset for classification.

    Each class contributes ``points`` samples lying on one spiral arm. Only
    the first two feature columns carry the (sin, cos) spiral coordinates;
    every remaining column stays zero.

    Args:
        points: Number of samples generated per class.
        classes: Number of classes (spiral arms).
        n_features: Total number of feature columns in the returned matrix.
            Must be at least 2. Defaults to 64, the width used originally.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a float array of shape
        ``(points * classes, n_features)`` and ``y`` is a ``uint8`` array of
        shape ``(points * classes,)`` holding the class index of each row.

    Raises:
        ValueError: If ``n_features`` is smaller than 2.
    """
    if n_features < 2:
        raise ValueError("n_features must be at least 2 to hold the spiral coordinates")

    total = points * classes
    x = np.zeros((total, n_features))
    y = np.zeros(total, dtype='uint8')

    # The radius ramp is identical for every class, so compute it once.
    r = np.linspace(0.0, 1, points)

    for class_number in range(classes):
        rows = slice(points * class_number, points * (class_number + 1))
        # Angle sweep for this arm plus Gaussian jitter drawn from the global
        # NumPy RNG (one randn call per class, in class order).
        t = np.linspace(class_number * 4, (class_number + 1) * 4, points) + np.random.randn(points) * 0.2
        x[rows, 0] = r * np.sin(t * 2.5)
        x[rows, 1] = r * np.cos(t * 2.5)
        y[rows] = class_number
    return x, y

# Layers and activations
class Layer:
    """Fully connected (dense) layer that updates its own parameters with Adam."""

    def __init__(self, input_size, neuron_size):
        """Create the layer's parameters and optimizer state.

        Args:
            input_size: Number of input features per sample.
            neuron_size: Number of output units.
        """
        # Gaussian weights scaled by sqrt(2 / fan_in); biases start at zero.
        self.weights = np.random.randn(input_size, neuron_size) * np.sqrt(2.0 / input_size)
        self.biases = np.zeros((1, neuron_size))
        # Gradient placeholders; backward() does not write to these.
        self.dw = np.zeros_like(self.weights)
        self.db = np.zeros_like(self.biases)
        # Adam first-moment (v_*) and second-moment (s_*) running averages.
        self.v_dw = np.zeros_like(self.weights)
        self.v_db = np.zeros_like(self.biases)
        self.s_dw = np.zeros_like(self.weights)
        self.s_db = np.zeros_like(self.biases)
        # Number of Adam updates applied so far; drives the bias correction.
        self.t = 0

    def forward(self, inputs):
        """Compute ``inputs @ weights + biases`` and store it in ``self.output``."""
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, inputs, grad_output, learning_rate):
        """Apply one Adam update and return the gradient w.r.t. ``inputs``.

        Args:
            inputs: The array that was fed to ``forward`` for this step.
            grad_output: Gradient of the loss w.r.t. this layer's output.
            learning_rate: Adam step size.

        Returns:
            Gradient of the loss w.r.t. ``inputs``, computed with the weights
            as they were *before* this update.
        """
        # Must be computed before the weights are modified below.
        grad_inputs = np.dot(grad_output, self.weights.T)
        grad_weights = np.dot(inputs.T, grad_output)
        grad_biases = np.sum(grad_output, axis=0, keepdims=True)

        # Adam optimizer
        beta1 = 0.9
        beta2 = 0.999
        epsilon = 1e-7

        self.t += 1

        self.v_dw = beta1 * self.v_dw + (1 - beta1) * grad_weights
        self.v_db = beta1 * self.v_db + (1 - beta1) * grad_biases
        self.s_dw = beta2 * self.s_dw + (1 - beta2) * (grad_weights ** 2)
        self.s_db = beta2 * self.s_db + (1 - beta2) * (grad_biases ** 2)

        # Bias correction depends on the step count: dividing by (1 - beta)
        # instead of (1 - beta**t) is only right on the very first step and
        # inflates every later update.
        first_moment_scale = 1 - beta1 ** self.t
        second_moment_scale = 1 - beta2 ** self.t

        v_dw_corrected = self.v_dw / first_moment_scale
        v_db_corrected = self.v_db / first_moment_scale
        s_dw_corrected = self.s_dw / second_moment_scale
        s_db_corrected = self.s_db / second_moment_scale

        self.weights -= learning_rate * v_dw_corrected / (np.sqrt(s_dw_corrected) + epsilon)
        self.biases -= learning_rate * v_db_corrected / (np.sqrt(s_db_corrected) + epsilon)

        return grad_inputs

class ReluActivation:
    """Rectified linear unit: passes positive values through, floors the rest at zero."""

    def forward(self, inputs):
        """Store the element-wise ``max(0, inputs)`` in ``self.output``."""
        rectified = np.maximum(0, inputs)
        self.output = rectified

class SoftmaxActivation:
    """Row-wise softmax over a 2-D batch of scores."""

    def forward(self, inputs):
        """Store the softmax of each row of ``inputs`` in ``self.output``."""
        # Subtracting each row's maximum keeps exp() from overflowing.
        row_peak = np.amax(inputs, axis=1, keepdims=True)
        exp_shifted = np.exp(inputs - row_peak)
        row_total = np.sum(exp_shifted, axis=1, keepdims=True)
        self.output = exp_shifted / row_total

class CrossEntropyLoss:
    """Categorical cross-entropy over a batch of predicted probabilities."""

    @staticmethod
    def calculate_loss(y_pred, y_true):
        """Return the mean negative log-likelihood of the true classes.

        Args:
            y_pred: Predicted probabilities, one row per sample.
            y_true: Either a 1-D sequence of integer class indices or a 2-D
                one-hot (or soft-label) array with the same shape as
                ``y_pred``.

        Returns:
            The mean of ``-log(p_correct)`` over all samples.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        # Clip away exact 0 and 1 so the logarithm below stays finite.
        clipped_values = np.clip(y_pred, 1e-7, 1 - 1e-7)
        y_true = np.asarray(y_true)

        if y_true.ndim == 1:
            # Sparse labels: pick the predicted probability of each true class.
            correct_confidences = clipped_values[np.arange(len(clipped_values)), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: the mask selects the true-class probability.
            correct_confidences = np.sum(clipped_values * y_true, axis=1)
        else:
            raise ValueError(
                f"y_true must be 1-D class indices or 2-D one-hot labels, got {y_true.ndim} dimensions"
            )
        return np.mean(-np.log(correct_confidences))

# Create dummy data with adjusted features
X, y = create_data(100, 3)

# Initialize layers and activations
l1 = Layer(64, 32)  # input_size matches the number of feature columns in X
ac1 = ReluActivation()
l2 = Layer(32, 3)  # Output layer should match the number of classes (3)
ac2 = SoftmaxActivation()

# Training loop parameters
epochs = 50000
learning_rate = 0.01


def forward_pass(inputs):
    """Run inputs through l1 -> ReLU -> l2 -> softmax and return the class probabilities."""
    l1.forward(inputs)
    ac1.forward(l1.output)
    l2.forward(ac1.output)
    ac2.forward(l2.output)
    return ac2.output


sample_rows = np.arange(len(y))

# Training loop
for epoch in range(epochs):
    # Forward pass
    probabilities = forward_pass(X)

    # Calculate loss
    loss = CrossEntropyLoss.calculate_loss(probabilities, y)

    # Gradient of the mean cross-entropy w.r.t. the softmax inputs:
    # (probabilities - one_hot(y)) / batch_size.
    grad_ac2 = probabilities.copy()
    grad_ac2[sample_rows, y] -= 1
    grad_ac2 /= len(y)
    ac2_backward = grad_ac2

    # Propagate to the hidden layer BEFORE l2 is updated: Layer.backward
    # modifies l2.weights in place, and the chain rule needs the weights that
    # produced this forward pass.
    ac1_backward = np.dot(ac2_backward, l2.weights.T)
    # ReLU derivative: no gradient flows through units that were clamped to 0.
    ac1_backward[l1.output <= 0] = 0

    # Parameter updates
    l2.backward(ac1.output, ac2_backward, learning_rate)
    l1.backward(X, ac1_backward, learning_rate)

    # Print loss every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")

# Final loss
final_loss = CrossEntropyLoss.calculate_loss(forward_pass(X), y)

# axis=1 gives one predicted class per sample; without it argmax returns a
# single index into the flattened probability matrix.
predictions = np.argmax(ac2.output, axis=1)

print("acc", np.mean(predictions == y))
print("z", predictions)
print(predictions, y)
print("Final Loss:", final_loss)


Epoch 0, Loss: 1.1094670054316003
Epoch 100, Loss: 1.0745517483650424
Epoch 200, Loss: 1.0737399278277269
Epoch 300, Loss: 1.072957857797186
Epoch 400, Loss: 1.07022647757383
Epoch 500, Loss: 1.067985652857329
Epoch 600, Loss: 1.066444726521256
Epoch 700, Loss: 1.0653038545288402
Epoch 800, Loss: 1.0635180115419423
Epoch 900, Loss: 1.0617271866188989
Epoch 1000, Loss: 1.0600922990285588
Epoch 1100, Loss: 1.058547248483651
Epoch 1200, Loss: 1.0571115013287646
Epoch 1300, Loss: 1.0558425388814192
Epoch 1400, Loss: 1.054617773153044
Epoch 1500, Loss: 1.0535756702968293
Epoch 1600, Loss: 1.0526693364190107
Epoch 1700, Loss: 1.0516511572801914
Epoch 1800, Loss: 1.0507272538145553
Epoch 1900, Loss: 1.0499530320499788
Epoch 2000, Loss: 1.0489732408632213
Epoch 2100, Loss: 1.048221098874625
Epoch 2200, Loss: 1.0474372056622203
Epoch 2300, Loss: 1.0466452896246101
Epoch 2400, Loss: 1.045933907081059
Epoch 2500, Loss: 1.0454057862008945
Epoch 2600, Loss: 1.0447714434415507
Epoch 2700, Loss: 1.04