# In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so that the np.random.randn draws used below for
# the data noise and the weight initialisation are the same on every run.
np.random.seed(0)

# Function to create dummy data
def create_data(points, classes):
    """Generate a noisy multi-arm spiral dataset padded to 64 features.

    Each class is one spiral arm: the radius grows linearly from 0 to 1 and
    the angle sweeps a class-specific interval with Gaussian jitter added.
    Only the first two feature columns carry the (x, y) coordinates; the
    remaining 62 columns stay zero.

    Args:
        points: Number of samples generated per class.
        classes: Number of classes (spiral arms).

    Returns:
        A tuple ``(x, y)`` where ``x`` is a float array of shape
        ``(points * classes, 64)`` and ``y`` is a ``uint8`` label array of
        shape ``(points * classes,)``.
    """
    total = points * classes
    x = np.zeros((total, 64))
    y = np.zeros(total, dtype='uint8')

    # The radius profile is the same for every arm.
    radius = np.linspace(0.0, 1, points)

    for label in range(classes):
        start = points * label
        rows = slice(start, start + points)

        # One batch of `points` random draws per class.
        jitter = np.random.randn(points) * 0.2
        angle = np.linspace(label * 4, (label + 1) * 4, points) + jitter

        x[rows, 0] = radius * np.sin(angle * 2.5)
        x[rows, 1] = radius * np.cos(angle * 2.5)
        y[rows] = label

    return x, y

# Normalize data with a small constant to avoid division by zero
def normalize(X, epsilon=1e-8):
    """Standardise each column of ``X`` to zero mean and unit variance.

    Columns whose standard deviation is exactly zero are divided by
    ``epsilon`` instead, so constant columns come out as zeros rather than
    producing a division by zero.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        epsilon: Replacement divisor for zero-variance columns.

    Returns:
        A new array with the same shape as ``X``.
    """
    column_mean = np.mean(X, axis=0)
    column_std = np.std(X, axis=0)
    # Overwrite zero entries in place on the freshly computed std vector.
    np.putmask(column_std, column_std == 0, epsilon)
    centered = X - column_mean
    return centered / column_std

# Layers and activations
class Layer:
    """Fully connected layer that updates its own parameters with Adam.

    Attributes:
        weights: Array of shape ``(input_size, neuron_size)``.
        biases: Array of shape ``(1, neuron_size)``.
        dw, db: Most recent raw gradients of the weights / biases.
        v_dw, v_db: Adam first-moment (mean) estimates.
        s_dw, s_db: Adam second-moment (uncentred variance) estimates.
        t: Number of Adam updates applied so far (used for bias correction).
        output: Result of the last ``forward`` call (set by ``forward``).
    """

    def __init__(self, input_size, neuron_size):
        """Create a layer mapping ``input_size`` inputs to ``neuron_size`` outputs."""
        # He initialisation: standard normal scaled by sqrt(2 / fan_in).
        self.weights = np.random.randn(input_size, neuron_size) * np.sqrt(2.0 / input_size)
        self.biases = np.zeros((1, neuron_size))
        self.dw = np.zeros_like(self.weights)
        self.db = np.zeros_like(self.biases)
        self.v_dw = np.zeros_like(self.weights)
        self.v_db = np.zeros_like(self.biases)
        self.s_dw = np.zeros_like(self.weights)
        self.s_db = np.zeros_like(self.biases)
        # Adam time step; the bias correction depends on it.
        self.t = 0

    def forward(self, inputs):
        """Compute ``inputs @ weights + biases`` and store it in ``self.output``."""
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, inputs, grad_output, learning_rate,
                 beta1=0.9, beta2=0.999, epsilon=1e-7):
        """Apply one Adam update and return the gradient w.r.t. the inputs.

        Args:
            inputs: The array that was fed to ``forward`` for this step.
            grad_output: Gradient of the loss w.r.t. this layer's output.
            learning_rate: Adam step size.
            beta1: Decay rate of the first-moment estimate.
            beta2: Decay rate of the second-moment estimate.
            epsilon: Small constant added to the denominator for stability.

        Returns:
            Gradient of the loss w.r.t. ``inputs``, computed with the weights
            as they were *before* this update.
        """
        # Must be taken before the weights are modified below.
        grad_inputs = np.dot(grad_output, self.weights.T)
        grad_weights = np.dot(inputs.T, grad_output)
        grad_biases = np.sum(grad_output, axis=0, keepdims=True)
        self.dw = grad_weights
        self.db = grad_biases

        self.t += 1

        # Exponential moving averages of the gradient and its square.
        self.v_dw = beta1 * self.v_dw + (1 - beta1) * grad_weights
        self.v_db = beta1 * self.v_db + (1 - beta1) * grad_biases
        self.s_dw = beta2 * self.s_dw + (1 - beta2) * (grad_weights ** 2)
        self.s_db = beta2 * self.s_db + (1 - beta2) * (grad_biases ** 2)

        # Bias correction uses beta**t: the zero-initialised averages are
        # biased towards zero by exactly that factor after t steps. Dividing
        # by the constant (1 - beta) is only right for t == 1.
        v_correction = 1 - beta1 ** self.t
        s_correction = 1 - beta2 ** self.t
        v_dw_corrected = self.v_dw / v_correction
        v_db_corrected = self.v_db / v_correction
        s_dw_corrected = self.s_dw / s_correction
        s_db_corrected = self.s_db / s_correction

        self.weights -= learning_rate * v_dw_corrected / (np.sqrt(s_dw_corrected) + epsilon)
        self.biases -= learning_rate * v_db_corrected / (np.sqrt(s_db_corrected) + epsilon)

        return grad_inputs

class ReluActivation:
    """Rectified linear unit: ``max(0, x)`` applied element-wise."""

    def forward(self, inputs):
        """Store ``max(0, inputs)`` in ``self.output``."""
        self.output = np.maximum(0, inputs)

    def backward(self, grad_output):
        """Return the gradient w.r.t. the inputs of the last ``forward`` call.

        The derivative of ReLU is 1 where the input was positive and 0
        elsewhere. ``self.output > 0`` holds exactly where the input was
        positive, so the stored output is enough to build the mask.

        Args:
            grad_output: Gradient of the loss w.r.t. this activation's output;
                must be broadcastable against ``self.output``.

        Returns:
            ``grad_output`` with entries zeroed where the unit was inactive.
        """
        return grad_output * (self.output > 0)

class SoftmaxActivation:
    """Row-wise softmax that turns scores into probabilities."""

    def forward(self, inputs):
        """Store the softmax of each row of ``inputs`` in ``self.output``.

        The row maximum is subtracted before exponentiating so that large
        scores do not overflow; this does not change the result.
        """
        row_max = np.max(inputs, axis=1, keepdims=True)
        exponentials = np.exp(inputs - row_max)
        row_totals = exponentials.sum(axis=1, keepdims=True)
        self.output = exponentials / row_totals

class CrossEntropyLoss:
    """Categorical cross-entropy between predicted probabilities and labels."""

    @staticmethod
    def calculate_loss(y_pred, y_true):
        """Return the mean negative log-likelihood of the true classes.

        Args:
            y_pred: Array of shape ``(samples, classes)`` with predicted
                probabilities.
            y_true: Either a 1-D sequence of integer class indices or a 2-D
                one-hot (or soft-label) array matching ``y_pred``'s shape.

        Returns:
            The mean loss over all samples.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)
        # Clip away exact 0 and 1 so the logarithm stays finite.
        clipped_values = np.clip(y_pred, 1e-7, 1 - 1e-7)
        if y_true.ndim == 1:
            sample_index = np.arange(len(clipped_values))
            correct_confidences = clipped_values[sample_index, y_true]
        elif y_true.ndim == 2:
            correct_confidences = np.sum(clipped_values * y_true, axis=1)
        else:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError(
                f"y_true must be 1-D class indices or 2-D one-hot labels, "
                f"got an array with {y_true.ndim} dimensions"
            )
        return np.mean(-np.log(correct_confidences))

def calculate_accuracy(y_pred, y_true):
    """Return the fraction of samples whose arg-max prediction is correct.

    Args:
        y_pred: Array of shape ``(samples, classes)`` with class scores or
            probabilities.
        y_true: Either 1-D integer class indices or a 2-D one-hot array.
            These are the same two label formats that
            ``CrossEntropyLoss.calculate_loss`` accepts.

    Returns:
        Accuracy in ``[0, 1]``.
    """
    y_true = np.asarray(y_true)
    if y_true.ndim == 2:
        # Collapse one-hot rows to class indices. Comparing the 1-D
        # predictions against a 2-D array would broadcast and miscount.
        y_true = np.argmax(y_true, axis=1)
    predicted_classes = np.argmax(y_pred, axis=1)
    correct_predictions = np.sum(predicted_classes == y_true)
    return correct_predictions / len(y_true)

# Toy dataset: 3 classes with 100 samples each, standardised column by column.
X, y = create_data(points=100, classes=3)
X = normalize(X)

# Network architecture: 64 -> 128 (ReLU) -> 64 (ReLU) -> 3 (softmax).
# The layers are built in order l1, l2, l3 so the random weight draws happen
# in the same sequence as constructing them one by one.
layer_shapes = [(64, 128), (128, 64), (64, 3)]
l1, l2, l3 = [Layer(n_in, n_out) for n_in, n_out in layer_shapes]
ac1, ac2 = ReluActivation(), ReluActivation()
ac3 = SoftmaxActivation()

# Training hyperparameters
epochs = 11000        # number of full-batch gradient steps
learning_rate = 0.01  # step size handed to each layer's Adam update

# Full-batch training loop
for epoch in range(epochs):
    # Forward pass: dense -> ReLU -> dense -> ReLU -> dense -> softmax
    l1.forward(X)
    ac1.forward(l1.output)
    l2.forward(ac1.output)
    ac2.forward(l2.output)
    l3.forward(ac2.output)
    ac3.forward(l3.output)

    # Calculate loss
    loss = CrossEntropyLoss.calculate_loss(ac3.output, y)

    # Gradient of the mean cross-entropy w.r.t. the softmax inputs:
    # predicted probabilities minus the one-hot target, divided by batch size.
    grad_ac3 = ac3.output.copy()
    grad_ac3[np.arange(len(y)), y] -= 1
    grad_ac3 /= len(y)
    ac3_backward = grad_ac3

    # Compute every upstream gradient BEFORE any layer is updated, because
    # Layer.backward modifies the weights in place. Each hop through a ReLU
    # is masked by its derivative (1 where the pre-activation was positive,
    # 0 elsewhere); without the mask, gradients flow through inactive units.
    ac2_backward = np.dot(ac3_backward, l3.weights.T) * (l2.output > 0)
    ac1_backward = np.dot(ac2_backward, l2.weights.T) * (l1.output > 0)

    # Parameter updates (each layer applies its own Adam step).
    l3.backward(ac2.output, ac3_backward, learning_rate)
    l2.backward(ac1.output, ac2_backward, learning_rate)
    l1.backward(X, ac1_backward, learning_rate)

    # Print loss and accuracy every 500 epochs
    if epoch % 500 == 0:
        accuracy = calculate_accuracy(ac3.output, y)
        print(f"Epoch {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

# Evaluate the trained network: one more forward pass through every
# (layer, activation) pair, feeding each activation's output to the next layer.
_signal = X
for _layer, _activation in ((l1, ac1), (l2, ac2), (l3, ac3)):
    _layer.forward(_signal)
    _activation.forward(_layer.output)
    _signal = _activation.output

final_loss = CrossEntropyLoss.calculate_loss(ac3.output, y)
final_accuracy = calculate_accuracy(ac3.output, y)

# Predicted class per sample next to the ground-truth labels, then the metrics.
print(ac3.output.argmax(axis=1), "sss", y)
print("Final Loss:", final_loss)
print("Final Accuracy:", final_accuracy)