<a href="https://colab.research.google.com/github/DeepLearningSaeid/Grad/blob/main/Pure_implimentation_SWAG_Numpy_MNIST_Juan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Activation functions
def identity(x):
    return x

def square(x):
    return np.power(x, 2)/4

def square_(x):
    return np.power(x, 2)/24

def identity_derivative(x):
    return np.ones_like(x)

def square_derivative(x):
    return 2 * x

# Initialize network parameters
def initialize_parameters(input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
    return {
        'W1': np.random.randn(input_size, hidden_size1) * 0.1,
        'b1': np.zeros((1, hidden_size1)),
        'W2': np.random.randn(input_size, hidden_size2) * 0.1,
        'b2': np.zeros((1, hidden_size2)),
        'W3': np.random.randn(hidden_size1 + hidden_size2, hidden_size3) * 0.1,
        'b3': np.zeros((1, hidden_size3)),
        'W4': np.random.randn(hidden_size3 + hidden_size1 + hidden_size2, output_size) * 0.1,
        'b4': np.zeros((1, output_size))
    }

# Forward pass
def forward_pass(X, params):
    Z1 = np.dot(X, params['W1']) + params['b1']
    A1 = identity(Z1)

    Z2 = np.dot(X, params['W2']) + params['b2']
    A2 = square(Z2)

    concatenated = np.concatenate((A1, A2), axis=1)

    Z3 = np.dot(concatenated, params['W3']) + params['b3']
    A3 = square_(Z3)

    concatenated_A3 = np.concatenate((A3, concatenated), axis=1)

    Z4 = np.dot(concatenated_A3, params['W4']) + params['b4']
    A4 = identity(Z4)  # Linear activation

    return A4, (X, Z1, A1, Z2, A2, Z3, A3, concatenated_A3, Z4, A4, concatenated)

# Compute loss (Mean Squared Error)
def compute_loss(y_true, y_pred):
    return np.mean(np.square(y_true - y_pred))

# Backward pass
def backward_pass(y_true, cache, params):
    X, Z1, A1, Z2, A2, Z3, A3, concatenated_A3, Z4, A4, concatenated = cache

    dA4 = 2 * (A4 - y_true)
    dZ4 = dA4 * identity_derivative(Z4)  # Derivative of linear activation is 1
    dW4 = np.dot(concatenated_A3.T, dZ4)
    db4 = np.sum(dZ4, axis=0, keepdims=True)

    d_concatenated_A3 = np.dot(dZ4, params['W4'].T)
    dA3 = d_concatenated_A3[:, :hidden_size3]
    d_concatenated = d_concatenated_A3[:, hidden_size3:]

    dZ3 = dA3 * square_derivative(Z3)
    dW3 = np.dot(concatenated.T, dZ3)
    db3 = np.sum(dZ3, axis=0, keepdims=True)

    dA2 = d_concatenated[:, hidden_size1:]  # Corrected dA2 calculation
    d_concatenated_Z2 = dA2 * square_derivative(Z2)

    dA1 = d_concatenated[:, :hidden_size1]
    dZ1 = dA1 * identity_derivative(Z1)
    dW1 = np.dot(X.T, dZ1)
    db1 = np.sum(dZ1, axis=0, keepdims=True)

    dZ2 = d_concatenated_Z2
    dW2 = np.dot(X.T, dZ2)
    db2 = np.sum(dZ2, axis=0, keepdims=True)

    grads = {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2, 'dW3': dW3, 'db3': db3, 'dW4': dW4, 'db4': db4}
    return grads

# Update network parameters
def update_parameters(params, grads, learning_rate):
    for key in params.keys():
        params[key] -= learning_rate * grads['d' + key]
    return params

# Load and preprocess the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
Y_onehot = np.eye(3)[y]

# Split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_onehot, test_size=0.2, random_state=42)

# Network architecture
input_size = X_train.shape[1]
hidden_size1 = 5
hidden_size2 = 5
hidden_size3 = 5
output_size = 3

# Initialize parameters
params = initialize_parameters(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)

# Training settings
epochs = 200
learning_rate = 0.001

# Training loop
for epoch in range(epochs):
    output, cache = forward_pass(X_train, params)
    loss = compute_loss(Y_train, output)
    grads = backward_pass(Y_train, cache, params)
    params = update_parameters(params, grads, learning_rate)

    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# Evaluate the model
output_test, _ = forward_pass(X_test, params)
test_loss = compute_loss(Y_test, output_test)
predictions = np.argmax(output_test, axis=1)
predictions_onehot = np.eye(output_size)[predictions]
accuracy = np.mean(np.all(predictions_onehot == Y_test, axis=1))

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 0, Loss: 0.3247
Epoch 20, Loss: 0.0691
Epoch 40, Loss: 0.0475
Epoch 60, Loss: 0.0390
Epoch 80, Loss: 0.0357
Epoch 100, Loss: 0.0340
Epoch 120, Loss: 0.0331
Epoch 140, Loss: 0.0635
Epoch 160, Loss: 0.0568
Epoch 180, Loss: 0.0546
Test Loss: 0.0550
Test Accuracy: 0.9667


In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Activation functions
def identity(x):
    return x

def square(x):
    return np.power(x, 2)/4

def square_(x):
    return np.power(x, 2)/24

def identity_derivative(x):
    return np.ones_like(x)

def square_derivative(x):
    return 2 * x

# Initialize network parameters
def initialize_parameters(input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
    return {
        'W1': np.random.randn(input_size, hidden_size1) * 0.1,
        'b1': np.zeros((1, hidden_size1)),
        'W2': np.random.randn(input_size, hidden_size2) * 0.1,
        'b2': np.zeros((1, hidden_size2)),
        'W3': np.random.randn(hidden_size1 + hidden_size2, hidden_size3) * 0.1,
        'b3': np.zeros((1, hidden_size3)),
        'W4': np.random.randn(hidden_size3 + hidden_size1 + hidden_size2, output_size) * 0.1,
        'b4': np.zeros((1, output_size))
    }

# Forward pass
def forward_pass(X, params):
    Z1 = np.dot(X, params['W1']) + params['b1']
    A1 = identity(Z1)

    Z2 = np.dot(X, params['W2']) + params['b2']
    A2 = square(Z2)

    concatenated = np.concatenate((A1, A2), axis=1)

    Z3 = np.dot(concatenated, params['W3']) + params['b3']
    A3 = square_(Z3)

    concatenated_A3 = np.concatenate((A3, concatenated), axis=1)

    Z4 = np.dot(concatenated_A3, params['W4']) + params['b4']
    A4 = identity(Z4)  # Linear activation

    return A4, (X, Z1, A1, Z2, A2, Z3, A3, concatenated_A3, Z4, A4, concatenated)

# Compute loss (Mean Squared Error)
def compute_loss(y_true, y_pred):
    return np.mean(np.square(y_true - y_pred))

# Backward pass
def backward_pass(y_true, cache, params):
    X, Z1, A1, Z2, A2, Z3, A3, concatenated_A3, Z4, A4, concatenated = cache

    dA4 = 2 * (A4 - y_true)
    dZ4 = dA4 * identity_derivative(Z4)  # Derivative of linear activation is 1
    dW4 = np.dot(concatenated_A3.T, dZ4)
    db4 = np.sum(dZ4, axis=0, keepdims=True)

    d_concatenated_A3 = np.dot(dZ4, params['W4'].T)
    dA3 = d_concatenated_A3[:, :hidden_size3]
    d_concatenated = d_concatenated_A3[:, hidden_size3:]

    dZ3 = dA3 * square_derivative(Z3)
    dW3 = np.dot(concatenated.T, dZ3)
    db3 = np.sum(dZ3, axis=0, keepdims=True)

    dA2 = d_concatenated[:, hidden_size1:]  # Corrected dA2 calculation
    d_concatenated_Z2 = dA2 * square_derivative(Z2)

    dA1 = d_concatenated[:, :hidden_size1]
    dZ1 = dA1 * identity_derivative(Z1)
    dW1 = np.dot(X.T, dZ1)
    db1 = np.sum(dZ1, axis=0, keepdims=True)

    dZ2 = d_concatenated_Z2
    dW2 = np.dot(X.T, dZ2)
    db2 = np.sum(dZ2, axis=0, keepdims=True)

    grads = {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2, 'dW3': dW3, 'db3': db3, 'dW4': dW4, 'db4': db4}
    return grads

# Update network parameters
def update_parameters(params, grads, learning_rate):
    for key in params.keys():
        params[key] -= learning_rate * grads['d' + key]
    return params

# Load and preprocess the Breast Cancer Wisconsin dataset
breast_cancer = load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
Y_onehot = np.eye(2)[y]  # One-hot encode target

# Split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_onehot, test_size=0.2, random_state=42)

# Network architecture
input_size = X_train.shape[1]
hidden_size1 = 5
hidden_size2 = 5
hidden_size3 = 5
output_size = 2  # Two classes: benign and malignant

# Initialize parameters
params = initialize_parameters(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)

# Training settings
epochs = 200
learning_rate = 0.00001

# Training loop
for epoch in range(epochs):
    output, cache = forward_pass(X_train, params)
    loss = compute_loss(Y_train, output)
    grads = backward_pass(Y_train, cache, params)
    params = update_parameters(params, grads, learning_rate)

    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# Evaluate the model
output_test, _ = forward_pass(X_test, params)
test_loss = compute_loss(Y_test, output_test)
predictions = np.argmax(output_test, axis=1)
predictions_onehot = np.eye(output_size)[predictions]
accuracy = np.mean(np.all(predictions_onehot == Y_test, axis=1))

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 0, Loss: 0.5283
Epoch 20, Loss: 0.4137
Epoch 40, Loss: 0.3322
Epoch 60, Loss: 0.2670
Epoch 80, Loss: 0.2122
Epoch 100, Loss: 0.1678
Epoch 120, Loss: 0.1353
Epoch 140, Loss: 0.1135
Epoch 160, Loss: 0.0991
Epoch 180, Loss: 0.0894
Test Loss: 0.0797
Test Accuracy: 0.9386


In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time

class NeuralNetwork:
    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
        self.params = {
            'W1': np.random.randn(input_size, hidden_size1) * 0.01,
            'b1': np.zeros((1, hidden_size1)),
            'W2': np.random.randn(input_size, hidden_size2) * 0.01,
            'b2': np.zeros((1, hidden_size2)),
            'W3': np.random.randn(hidden_size1 + hidden_size2, hidden_size3) * 0.01,
            'b3': np.zeros((1, hidden_size3)),
            'W4': np.random.randn(hidden_size3 + hidden_size1 + hidden_size2, output_size) * 0.01,
            'b4': np.zeros((1, output_size))
        }

    def activation_identity(self, x):
        return x

    def activation_square(self, x):
        return np.power(x, 2) / 4

    def activation_square_(self, x):
        return np.power(x, 2) / 24

    def derivative_identity(self, x):
        return np.ones_like(x)

    def derivative_square(self, x):
        return 2 * x

    def forward_pass(self, X):
        Z1 = np.dot(X, self.params['W1']) + self.params['b1']
        A1 = self.activation_identity(Z1)

        Z2 = np.dot(X, self.params['W2']) + self.params['b2']
        A2 = self.activation_square(Z2)

        concatenated = np.concatenate((A1, A2), axis=1)

        Z3 = np.dot(concatenated, self.params['W3']) + self.params['b3']
        A3 = self.activation_square_(Z3)

        concatenated_A3 = np.concatenate((A3, concatenated), axis=1)

        Z4 = np.dot(concatenated_A3, self.params['W4']) + self.params['b4']
        A4 = self.activation_identity(Z4)  # Linear activation

        return A4, (X, Z1, A1, Z2, A2, Z3, A3, concatenated_A3, Z4, A4, concatenated)

    def compute_loss(self, y_true, y_pred):
        return np.mean(np.square(y_true - y_pred))

    def backward_pass(self, y_true, cache):
        X, Z1, A1, Z2, A2, Z3, A3, concatenated_A3, Z4, A4, concatenated = cache

        dA4 = 2 * (A4 - y_true)
        dZ4 = dA4 * self.derivative_identity(Z4)
        dW4 = np.dot(concatenated_A3.T, dZ4)
        db4 = np.sum(dZ4, axis=0, keepdims=True)

        d_concatenated_A3 = np.dot(dZ4, self.params['W4'].T)
        dA3 = d_concatenated_A3[:, :hidden_size3]
        d_concatenated = d_concatenated_A3[:, hidden_size3:]

        dZ3 = dA3 * self.derivative_square(Z3)
        dW3 = np.dot(concatenated.T, dZ3)
        db3 = np.sum(dZ3, axis=0, keepdims=True)

        dA2 = d_concatenated[:, hidden_size1:]
        dZ2 = dA2 * self.derivative_square(Z2)
        dW2 = np.dot(X.T, dZ2)
        db2 = np.sum(dZ2, axis=0, keepdims=True)

        dA1 = d_concatenated[:, :hidden_size1]
        dZ1 = dA1 * self.derivative_identity(Z1)
        dW1 = np.dot(X.T, dZ1)
        db1 = np.sum(dZ1, axis=0, keepdims=True)

        grads = {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2, 'dW3': dW3, 'db3': db3, 'dW4': dW4, 'db4': db4}
        return grads

    def update_parameters(self, grads, learning_rate):
        for key in self.params.keys():
            self.params[key] -= learning_rate * grads['d' + key]

# Main script
if __name__ == "__main__":
    breast_cancer = load_breast_cancer()
    X, y = breast_cancer.data, breast_cancer.target
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    Y_onehot = np.eye(2)[y]

    X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_onehot, test_size=0.2, random_state=42)

    input_size = X_train.shape[1]
    hidden_size1 = 5
    hidden_size2 = 5
    hidden_size3 = 5
    output_size = 2
    # Start the timer
    start_time = time.time()
    nn = NeuralNetwork(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)
    epochs = 5
    learning_rate = 0.0001

    for epoch in range(epochs):
        output, cache = nn.forward_pass(X_train)
        loss = nn.compute_loss(Y_train, output)
        grads = nn.backward_pass(Y_train, cache)
        nn.update_parameters(grads, learning_rate)

        if epoch % 20 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")

    end_time = time.time()

    # Calculate the execution time
    execution_time = end_time - start_time
    print(f"Execution Time: {execution_time:.2f} seconds")

    output_test, _ = nn.forward_pass(X_test)
    test_loss = nn.compute_loss(Y_test, output_test)
    predictions = np.argmax(output_test, axis=1)
    predictions_onehot = np.eye(output_size)[predictions]
    accuracy = np.mean(np.all(predictions_onehot == Y_test, axis=1))

    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")


Epoch 0, Loss: 0.5005
Execution Time: 0.01 seconds
Test Loss: 0.3344
Test Accuracy: 0.6228


In [5]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time

class NeuralNetwork:
    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
        glorot_factor = np.sqrt(6.0 / (input_size + hidden_size1))  # Glorot Uniform initialization factor
        self.params = {
            'W1': np.random.uniform(-glorot_factor, glorot_factor, size=(input_size, hidden_size1)),
            'b1': np.zeros((1, hidden_size1)),
            'W2': np.random.uniform(-glorot_factor, glorot_factor, size=(input_size, hidden_size2)),
            'b2': np.zeros((1, hidden_size2)),
            'W3': np.random.uniform(-glorot_factor, glorot_factor, size=(hidden_size1 + hidden_size2, hidden_size3)),
            'b3': np.zeros((1, hidden_size3)),
            'W4': np.random.uniform(-glorot_factor, glorot_factor, size=(hidden_size3 + hidden_size1 + hidden_size2, output_size)),
            'b4': np.zeros((1, output_size))
        }

    def activation_identity(self, x):
        return x

    def activation_square(self, x):
        return np.power(x, 2) / 4

    def activation_square_(self, x):
        return np.power(x, 2) / 24

    def derivative_identity(self, x):
        return np.ones_like(x)

    def derivative_square(self, x):
        return 2 * x

    def forward_pass(self, X):
        Z1 = np.dot(X, self.params['W1']) + self.params['b1']
        A1 = self.activation_identity(Z1)

        Z2 = np.dot(X, self.params['W2']) + self.params['b2']
        A2 = self.activation_square(Z2)

        concatenated = np.concatenate((A1, A2), axis=1)

        Z3 = np.dot(concatenated, self.params['W3']) + self.params['b3']
        A3 = self.activation_square_(Z3)

        concatenated_A3 = np.concatenate((A3, concatenated), axis=1)

        Z4 = np.dot(concatenated_A3, self.params['W4']) + self.params['b4']
        A4 = self.activation_identity(Z4)  # Linear activation

        return A4, (X, Z1, A1, Z2, A2, Z3, A3, concatenated_A3, Z4, A4, concatenated)

    def compute_loss(self, y_true, y_pred):
        return np.mean(np.square(y_true - y_pred))

    def backward_pass(self, y_true, cache):
        X, Z1, A1, Z2, A2, Z3, A3, concatenated_A3, Z4, A4, concatenated = cache

        dA4 = 2 * (A4 - y_true)
        dZ4 = dA4 * self.derivative_identity(Z4)
        dW4 = np.dot(concatenated_A3.T, dZ4)
        db4 = np.sum(dZ4, axis=0, keepdims=True)

        d_concatenated_A3 = np.dot(dZ4, self.params['W4'].T)
        dA3 = d_concatenated_A3[:, :hidden_size3]
        d_concatenated = d_concatenated_A3[:, hidden_size3:]

        dZ3 = dA3 * self.derivative_square(Z3)
        dW3 = np.dot(concatenated.T, dZ3)
        db3 = np.sum(dZ3, axis=0, keepdims=True)

        dA2 = d_concatenated[:, hidden_size1:]
        dZ2 = dA2 * self.derivative_square(Z2)
        dW2 = np.dot(X.T, dZ2)
        db2 = np.sum(dZ2, axis=0, keepdims=True)

        dA1 = d_concatenated[:, :hidden_size1]
        dZ1 = dA1 * self.derivative_identity(Z1)
        dW1 = np.dot(X.T, dZ1)
        db1 = np.sum(dZ1, axis=0, keepdims=True)

        grads = {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2, 'dW3': dW3, 'db3': db3, 'dW4': dW4, 'db4': db4}
        return grads

    def update_parameters(self, grads, learning_rate):
        for key in self.params.keys():
            self.params[key] -= learning_rate * grads['d' + key]

# Main script
if __name__ == "__main__":
    breast_cancer = load_breast_cancer()
    X, y = breast_cancer.data, breast_cancer.target
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    Y_onehot = np.eye(2)[y]

    X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_onehot, test_size=0.2, random_state=42)

    input_size = X_train.shape[1]
    hidden_size1 = 5
    hidden_size2 = 5
    hidden_size3 = 5
    output_size = 2
    # Start the timer
    start_time = time.time()
    nn = NeuralNetwork(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)
    epochs = 5
    learning_rate = 0.0001

    for epoch in range(epochs):
        output, cache = nn.forward_pass(X_train)
        loss = nn.compute_loss(Y_train, output)
        grads = nn.backward_pass(Y_train, cache)
        nn.update_parameters(grads, learning_rate)

        if epoch % 20 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")

    end_time = time.time()

    # Calculate the execution time
    execution_time = end_time - start_time
    print(f"Execution Time: {execution_time:.2f} seconds")

    output_test, _ = nn.forward_pass(X_test)
    test_loss = nn.compute_loss(Y_test, output_test)
    predictions = np.argmax(output_test, axis=1)
    predictions_onehot = np.eye(output_size)[predictions]
    accuracy = np.mean(np.all(predictions_onehot == Y_test, axis=1))

    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")


Epoch 0, Loss: 0.4084
Execution Time: 0.01 seconds
Test Loss: 0.1332
Test Accuracy: 0.9649


In [6]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, Dense, concatenate, Dropout, Flatten, Activation
from keras import backend as K
from keras.utils import get_custom_objects
from keras.utils import  to_categorical, plot_model
from tensorflow.keras.optimizers import Adam
from keras.datasets import mnist

In [None]:
batch_size = 128
num_classes = 10
epochs = 4

# Load the MNIST data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Reshape the input data
x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)

# Convert input data to float32
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

# Add 10 to input data
x_train += 10
x_test += 10

# Normalize input data
x_train /= 300
x_test /= 300

# Print the number of train and test samples
print(f'{x_train.shape[0]} train samples')
print(f'{x_test.shape[0]} test samples')

# Convert class vectors to binary class matrices
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# Set the number of train and test samples
num_train_samples = 60000
num_test_samples = 10000

# Select the required number of train and test samples
x_train = x_train[:num_train_samples, :]
x_test = x_test[:num_test_samples, :]

y_train = y_train[:num_train_samples]
y_test = y_test[:num_test_samples]

60000 train samples
10000 test samples


In [7]:
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import time

# Constants
batch_size = 128
num_classes = 10
epochs = 40

# Load the MNIST data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Reshape the input data
x_train = x_train.reshape(x_train.shape[0], 784)
x_test = x_test.reshape(x_test.shape[0], 784)

# Convert input data to float32
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

# Add 10 to input data
x_train += 10
x_test += 10

# Normalize input data
x_train /= 500
x_test /= 500

# Convert class vectors to binary class matrices
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)



if __name__ == "__main__":
    input_size = 784
    hidden_size1 = 300
    hidden_size2 = 300
    hidden_size3 = 300
    output_size = num_classes

    nn = NeuralNetwork(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)
    learning_rate = 0.000001

    start_time = time.time()

    for epoch in range(epochs):
        output, cache = nn.forward_pass(x_train)
        loss = nn.compute_loss(y_train, output)
        grads = nn.backward_pass(y_train, cache)
        nn.update_parameters(grads, learning_rate)

        if epoch % 1 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")

    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution Time: {execution_time:.2f} seconds")

    output_test, _ = nn.forward_pass(x_test)
    test_loss = nn.compute_loss(y_test, output_test)
    predictions = np.argmax(output_test, axis=1)
    accuracy = np.mean(predictions == np.argmax(y_test, axis=1))

    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
Epoch 0, Loss: 0.1212
Epoch 1, Loss: 0.1130
Epoch 2, Loss: 0.1051
Epoch 3, Loss: 0.0950
Epoch 4, Loss: 0.0869
Epoch 5, Loss: 0.0797
Epoch 6, Loss: 0.0748
Epoch 7, Loss: 0.0711
Epoch 8, Loss: 0.0685
Epoch 9, Loss: 0.0664
Epoch 10, Loss: 0.0647
Epoch 11, Loss: 0.0633
Epoch 12, Loss: 0.0620
Epoch 13, Loss: 0.0610
Epoch 14, Loss: 0.0600
Epoch 15, Loss: 0.0592
Epoch 16, Loss: 0.0584
Epoch 17, Loss: 0.0577
Epoch 18, Loss: 0.0570
Epoch 19, Loss: 0.0564
Epoch 20, Loss: 0.0559
Epoch 21, Loss: 0.0553
Epoch 22, Loss: 0.0548
Epoch 23, Loss: 0.0544
Epoch 24, Loss: 0.0540
Epoch 25, Loss: 0.0536
Epoch 26, Loss: 0.0532
Epoch 27, Loss: 0.0528
Epoch 28, Loss: 0.0525
Epoch 29, Loss: 0.0522
Epoch 30, Loss: 0.0519
Epoch 31, Loss: 0.0516
Epoch 32, Loss: 0.0513
Epoch 33, Loss: 0.0510
Epoch 34, Loss: 0.0508
Epoch 35, Loss: 0.0505
Epoch 36, Loss: 0.0503
Epoch 37, Loss: 0.0501
Epoch 38, Loss: 0.0499
Epoch 39, Loss: 0.049