In [1]:
# import numpy as np
import cupy as np
import math
import sklearn # Only for downloading MNIST Dataset and Accuracy Metrics
import sklearn.metrics
from keras.utils import to_categorical  # Only for categorical one hot encoding
import random
import matplotlib.pyplot as plt
import tensorflow as tf

2024-05-21 10:29:00.344764: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-21 10:29:00.793791: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
antiCategorical = lambda x: [np.argmax(i) for i in x]
linear = lambda x : x

In [3]:
xor_x_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [1, 0], [0, 0], [1, 1]])
xor_y_train = np.array([[0], [1], [1], [0], [1], [0], [0]])
xor_x_train.shape, xor_y_train.shape

((7, 2), (7, 1))

In [4]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# (x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
cy_train = np.array(to_categorical(y_train))
cy_test = np.array(to_categorical(y_test))

cx_train, cx_test = np.array(x_train.reshape(-1, 784)/255.), np.array(x_test.reshape(-1, 784)/255.)
cx_train.shape

(60000, 784)

# A Simple General Neural network Library

In [5]:
def antiCategorical(arr):
    return np.argmax(arr, axis=1)

class Layer:
    # Actual operation of the layer
    def __call__(self, inputs):
        raise NotImplementedError

    # Derivative of the layer w.r.t the output
    def derivative(self, x):
        raise NotImplementedError

    def forward(self, x):
        self.last_inputs = x
        return self.__call__(x)

    def backward(self, grad_output):
        return self.derivative(self.last_inputs) * grad_output, None

    def inverse(self, x):
        raise NotImplementedError

    def update_parameters(self, updates):
        raise NotImplementedError

def checkNan(arr, name):
    if np.any(np.isnan(arr)):
        print('NaN found in arr of shape {arr}, {name} {val}'.format(arr=arr.shape, name=name, val=arr))
        raise ValueError('NaN found in arr of shape {arr}, {name}'.format(arr=arr.shape, name=name))

class Activation(Layer):
    def update_parameters(self, updates):
        pass

class Sigmoid(Activation):
    def __call__(self, x):
        return 1 / (1 + np.exp(-x))

    def derivative(self, x):
        sigmoid = self.__call__(self.last_inputs)
        return sigmoid * (1 - sigmoid)

    def inverse(self, x):
        return np.log(x / (1 - x))

class TanH(Activation):
    def __call__(self, x):
        return np.tanh(x)

    def derivative(self, x):
        return 1 - np.tanh(x)**2

    def inverse(self, x):
        return np.log((1 + x) / (1 - x))

class Linear(Activation):
    def __call__(self, x):
        return x

    def derivative(self, x):
        return 1

    def inverse(self, x):
        return x

class ReLU(Activation):
    def __call__(self, x):
        return np.maximum(0, x)

    def derivative(self, x):
        return np.where(x > 0, 1, 0)

    def inverse(self, x):
        return np.where(x > 0, x, 0)

class LeakyReLU(Activation):
    def __init__(self, alpha=0.01):
        self.alpha = alpha

    def __call__(self, x):
        return np.where(x > 0, x, self.alpha * x)

    def derivative(self, x):
        return np.where(x > 0, 1, self.alpha)

    def inverse(self, x):
        return np.where(x > 0, x, x / self.alpha)

class Softmax(Activation):
    def __call__(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.output = exp_x / np.sum(exp_x, axis=1, keepdims=True) + 1e-8
        return np.nan_to_num(self.output)

    def derivative(self, x):
      # The derivative of softmax is complex to calculate so here we will just return the grad as is.
      return 1

    def inverse(self, x):
        return np.argmax(x, axis=1)

class Dense(Layer):
    def __init__(self, n_input, n_output, use_bias=True):
        self.weights = np.random.normal(0, scale=(1/float(math.sqrt(n_input))), size=(n_input, n_output)).astype(np.float64)
        self.biases = np.zeros((1, n_output)).astype(np.float64)
        self.use_bias = use_bias

    def __call__(self, x):
        output = np.dot(x, self.weights)
        if self.use_bias == True:
            output += self.biases
        return output

    # derivative of the layer w.r.t the output of the layer
    def derivative(self, x):
        return self.weights

    def inverse(self, x):
        inv_weight = np.linalg.pinv(self.weights)
        x = np.nan_to_num(np.dot(x, inv_weight))
        return x

    def backward(self, grad_output):
        grad_input = np.dot(grad_output, self.weights.T)

        grad_weights = np.dot(self.last_inputs.T, grad_output)
        grad_biases = np.mean(grad_output, axis=0, keepdims=True)

        return grad_input, (grad_weights, grad_biases)

    def update_parameters(self, updates):
        delta_weights, delta_biases = updates
        self.weights -= delta_weights
        self.biases -= delta_biases


In [6]:
class MeanSquaredError:
    @staticmethod
    def forward(predictions, targets):
        loss = np.mean(0.5 * (predictions - targets)**2)
        # print("Loss=>", loss)
        return loss

    @staticmethod
    def backward(predictions, targets):
        # return ((2 * (predictions - targets)) / predictions.size)# * np.ones_like(predictions)
        return predictions-targets

class SoftmaxCrossEntropy:
    @staticmethod
    def forward(predictions, targets):
        return np.nan_to_num(-np.mean(targets * np.log(predictions)) - np.mean((1 - targets) * np.log(1 - predictions)))

    @staticmethod
    def backward(predictions, targets):
        return (predictions - targets) / predictions.size


In [7]:
class NeuralNet:
    def __init__(self, layers):
        self.layers = layers

    def forward(self, inputs):
        for layer in self.layers:
            inputs = layer.forward(inputs)
        return inputs

    def __call__ (self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def predict(self, x, n_samples=1):
        return self.__call__(x)

    def get_layers(self):
        return self.layers

    def accuracy(self, x_test, y_test):
        preds = np.array(antiCategorical(self.__call__(x_test))).get()
        expected = np.array(antiCategorical(y_test)).get()
        acc = sklearn.metrics.accuracy_score(expected, preds)
        return acc

In [8]:
class Optimizer:
    def __init__(self, model, loss):
        self.model = model
        self.loss = loss

    def train_step(self, x_batch, y_batch):
        print("optimizer train_step")
        raise NotImplementedError

    def fit(self, train_data, test_data, epochs, batch_size=128, verbose=True, **kwargs):
        x_train, y_train = train_data
        x_test, y_test = test_data
        n_batches = max(1, len(x_train) // batch_size)

        self.on_train_start(**kwargs)

        for epoch in range(epochs):
            shuffle_indices = np.random.permutation(len(x_train))
            x_train = x_train[shuffle_indices]
            y_train = y_train[shuffle_indices]

            self.on_epoch_start(epoch)

            for i in range(n_batches):
                x_batch = x_train[i*batch_size:(i+1)*batch_size]
                y_batch = y_train[i*batch_size:(i+1)*batch_size]

                loss_value = self.train_step(x_batch, y_batch)

            if verbose:
                acc = self.model.accuracy(x_test, y_test)
                self.on_reporting(epoch, loss_value, acc)

            self.on_epoch_end(epoch)


    def on_reporting(self, epoch, loss_value, acc):
        print(f'Epoch {epoch}, Loss: {loss_value}, test_acc: {acc}')

    def on_epoch_end(self, epoch):
        pass

    def on_epoch_start(self, epoch):
        pass

    def on_train_start(self, **kwargs):
        pass

# Neural Network with SGD

In [39]:
class SGD_Optimizer(Optimizer):
    def __init__(self, model, learning_rate, loss, gamma=1, delta=2):
        super().__init__(model, loss)
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.delta = delta

    def train_step(self, x_batch, y_batch):
        predictions = self.model.forward(x_batch)
        loss_value = self.loss.forward(predictions, y_batch)
        grad_loss = self.loss.backward(predictions, y_batch)

        for layer in reversed(self.model.get_layers()):
            # Back propogate the loss to the layer
            grad_loss, deltas = layer.backward(grad_loss)
            if deltas != None:
                # Simple Stochastic Gradient Decent
                delta_weights, delta_biases = deltas
                deltas = (delta_weights * self.learning_rate, delta_biases * self.learning_rate)
            # Update the weights
            layer.update_parameters(deltas)

        return loss_value

    def on_epoch_end(self, epoch):
        # Decay Weight
        self.learning_rate *= self.gamma
        self.delta *= self.gamma

In [None]:
np.random.seed(4)
# MNIST
layers = [
    Dense(784, 128),
    ReLU(),
    Dense(128, 10),
    # Softmax(),
    Sigmoid(),
]

net = NeuralNet(layers)

optimizer = SGD_Optimizer(net, learning_rate = 0.01, loss=MeanSquaredError(), delta=0.95, gamma=1)
optimizer.fit((cx_train, cy_train), (cx_test, cy_test), epochs=10, batch_size=100, verbose=True)
# Infer on cx_test, cy_test
predictions = net.predict(cx_test)
preds = np.array(antiCategorical(predictions)).get()
expected = np.array(antiCategorical(cy_test)).get()

print(f'Accuracy: {sklearn.metrics.accuracy_score(expected, preds)}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected, preds)}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected, preds)}')

Epoch 0, Loss: 0.002965000749972247, test_acc: 0.9343
Epoch 1, Loss: 0.00454360453606925, test_acc: 0.9519
Epoch 2, Loss: 0.005682691445204189, test_acc: 0.9587
Epoch 3, Loss: 0.0020864411427078417, test_acc: 0.9635
Epoch 4, Loss: 0.0030289556967028116, test_acc: 0.9679
Epoch 5, Loss: 0.0026872328693654055, test_acc: 0.9692
Epoch 6, Loss: 0.0034305042118758034, test_acc: 0.9699
Epoch 7, Loss: 0.001227873028913653, test_acc: 0.9723
Epoch 8, Loss: 0.0022649712001616197, test_acc: 0.9725
Epoch 9, Loss: 0.0010420624870716318, test_acc: 0.973
Accuracy: 0.973
Confusion Matrix: [[ 967    0    2    1    0    1    6    1    1    1]
 [   0 1122    4    1    0    1    3    1    3    0]
 [   3    1 1011    2    2    0    1    7    5    0]
 [   0    0    9  980    0    8    0    4    4    5]
 [   1    0    5    0  948    0    5    1    2   20]
 [   4    1    1    6    1  864    7    0    5    3]
 [   5    3    0    1    2    5  940    0    2    0]
 [   2    9   16    2    1    0    0  991    1    6

In [None]:
np.random.seed(4)
# MNIST
layers = [
    Dense(784, 128, False),
    # Sigmoid(),
    LeakyReLU(),
    Dense(128, 10, False),
    Sigmoid(),
]

net = NeuralNet(layers)

optimizer = SGD_Optimizer(net, learning_rate = 0.01, loss=MeanSquaredError(), delta=0.95, gamma=1)
optimizer.fit((cx_train, cy_train), (cx_test, cy_test), epochs=10, batch_size=100, verbose=True)
# Infer on cx_test, cy_test
predictions = net.predict(cx_test)
preds = np.array(antiCategorical(predictions)).get()
expected = np.array(antiCategorical(cy_test)).get()

print(f'Accuracy: {sklearn.metrics.accuracy_score(expected, preds)}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected, preds)}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected, preds)}')

Epoch 0, Loss: 0.002961079860814418, test_acc: 0.9339
Epoch 1, Loss: 0.004579937559629234, test_acc: 0.9519
Epoch 2, Loss: 0.005732318261936945, test_acc: 0.959
Epoch 3, Loss: 0.0021536863313846733, test_acc: 0.9631
Epoch 4, Loss: 0.003113965179030122, test_acc: 0.9678
Epoch 5, Loss: 0.002703793325385058, test_acc: 0.9689
Epoch 6, Loss: 0.0034707400510052087, test_acc: 0.97
Epoch 7, Loss: 0.001220872071553772, test_acc: 0.9719
Epoch 8, Loss: 0.002196430221077278, test_acc: 0.9724
Epoch 9, Loss: 0.001037367388548445, test_acc: 0.9734
Accuracy: 0.9734
Confusion Matrix: [[ 969    0    2    0    0    0    5    1    2    1]
 [   0 1122    4    1    0    1    3    1    3    0]
 [   3    1 1011    2    2    0    1    7    5    0]
 [   0    0    9  979    0    8    0    4    5    5]
 [   1    0    5    0  950    0    4    1    2   19]
 [   4    1    1    6    1  864    7    1    4    3]
 [   5    3    0    1    2    5  940    0    2    0]
 [   2    9   16    1    1    0    0  991    1    7]
 [

# Bayesian Networks

In [None]:
class BayesianDense(Layer):
    def __init__(self, n_input, n_output, prior_std=1.0, posterior_std=0.1):
        self.n_input = n_input
        self.n_output = n_output
        self.prior_std = prior_std
        self.posterior_std = posterior_std

        self.W_mu = np.random.normal(0, scale=(1/float(math.sqrt(n_input))), size=(n_input, n_output)).astype(np.float64)
        self.b_mu = np.zeros((1, n_output)).astype(np.float64)

        self.W_sigma = np.full((n_input, n_output), np.log(self.posterior_std)).astype(np.float64)
        self.b_sigma = np.full((1, n_output), np.log(self.posterior_std)).astype(np.float64)
        print("W_mu", np.mean(self.W_mu),
              "b_mu", np.mean(self.b_mu),
              "W_sigma", np.mean(self.W_sigma), self.W_sigma.shape,
              "b_sigma", np.mean(self.b_sigma), self.b_sigma.shape)

    def sample_weights(self):
        weights_std = np.exp(0.5 * self.W_sigma)
        self.W = self.W_mu + weights_std * np.random.normal(size=(self.n_input, self.n_output))
        biases_std = np.exp(0.5 * self.b_sigma)
        self.b = self.b_mu + biases_std * np.random.normal(size=(1, self.n_output))
        # print("w", np.mean(self.W), np.mean(weights_std), "b", np.mean(self.b), np.mean(biases_std))

    def forward(self, inputs):
        self.inputs = inputs
        self.sample_weights()
        self.output = np.dot(inputs, self.W) + self.b
        return self.output

    def backward(self, grad_output, learning_rate):
        grad_input = np.dot(grad_output, self.W.T)

        grad_W = np.dot(self.inputs.T, grad_output)
        grad_b = np.mean(grad_output, axis=0, keepdims=True)

        grad_W_mu = grad_W / grad_output.shape[0]
        grad_b_mu = grad_b / grad_output.shape[0]

        grad_W_sigma = ((grad_W ** 2) - 1) / (2 * grad_output.shape[0])
        grad_b_sigma = ((grad_b_mu ** 2) - 1) / (2 * grad_output.shape[0])

        self.W_mu -= learning_rate * grad_W_mu
        self.b_mu -= learning_rate * grad_b_mu

        self.W_sigma -= learning_rate * grad_W_sigma
        self.b_sigma -= learning_rate * grad_b_sigma

        return grad_input

In [None]:
np.random.seed(4)
# MNIST
layers = [
    BayesianDense(784, 100),
    LeakyReLU(),
    BayesianDense(100, 50),
    LeakyReLU(),
    BayesianDense(50, 10),
    Softmax(),
]

net = NeuralNet(layers, learning_rate = 0.01, loss=MeanSquaredError(), delta=0.95, gamma=1)

net.fit(cx_train, cy_train, epochs=10, batch_size=100, verbose=True)
# Infer on cx_test, cy_test
predictions = net.predict(cx_test, 20)
preds = np.array(antiCategorical(predictions)).get()
expected = np.array(antiCategorical(cy_test)).get()


print(f'Accuracy: {sklearn.metrics.accuracy_score(expected, preds)}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected, preds)}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected, preds)}')

W_mu -0.0001789835031751473 b_mu 0.0 W_sigma -2.3025850929940455 (784, 100) b_sigma -2.3025850929940455 (1, 100)
W_mu 0.0011180262676438772 b_mu 0.0 W_sigma -2.3025850929940455 (100, 50) b_sigma -2.3025850929940455 (1, 50)
W_mu 0.0023444765023452477 b_mu 0.0 W_sigma -2.302585092994045 (50, 10) b_sigma -2.3025850929940455 (1, 10)
Epoch 0, Batch 0, Loss: 0.07363589103305021
Epoch 1, Batch 0, Loss: 0.03985098104954909
Epoch 2, Batch 0, Loss: 0.02593534829055494
Epoch 3, Batch 0, Loss: 0.02348652026844813
Epoch 4, Batch 0, Loss: 0.019979116766026435
Epoch 5, Batch 0, Loss: 0.01774217933964233
Epoch 6, Batch 0, Loss: 0.017500500468994637
Epoch 7, Batch 0, Loss: 0.01771578663478224
Epoch 8, Batch 0, Loss: 0.01647555074216599
Epoch 9, Batch 0, Loss: 0.013828134710165842
Accuracy: 0.7548
Confusion Matrix: [[ 866    0   10    1    1   11    3    2   13   73]
 [   0 1078    2    4    0    0    4    0   44    3]
 [  25   15  810   20    9    5   20   13   76   39]
 [  19    3   36  587    0  197 

In [None]:
predictions = net.predict(cx_test, 100)
preds = np.array(antiCategorical(predictions)).get()
expected = np.array(antiCategorical(cy_test)).get()

print(f'Accuracy: {sklearn.metrics.accuracy_score(expected, preds)}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected, preds)}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected, preds)}')

Accuracy: 0.6333
Confusion Matrix: [[747   0  19   1   0  16  84   1 111   1]
 [  0 982  22   7   0   0   3  27  94   0]
 [  4  54 706   5   6   8  41   1 207   0]
 [  9  37  40 685   2   9   8   2 217   1]
 [  4  17   6 122 518  34  17   4 243  17]
 [ 23  31  32  46   6 263  20   4 461   6]
 [  9   2  15   8   3   6 814   0 101   0]
 [  2   8  62  19   7  25   5 672 203  25]
 [  4  11  32   0   2   1  13   1 910   0]
 [  2   4  21 361   5   5   3 185 387  36]]
Classification Report:               precision    recall  f1-score   support

           0       0.93      0.76      0.84       980
           1       0.86      0.87      0.86      1135
           2       0.74      0.68      0.71      1032
           3       0.55      0.68      0.61      1010
           4       0.94      0.53      0.68       982
           5       0.72      0.29      0.42       892
           6       0.81      0.85      0.83       958
           7       0.75      0.65      0.70      1028
           8       0.31 

# Simulated Anealing

In [77]:
# import cupy as np
class SimulatedAnnealingLayer(Layer):
    def forward_with_perturbations(self, inputs, perturbations):
        raise NotImplementedError

    def perturb_parameters(self, stddev = 0.01):
        raise NotImplementedError

class SADense(SimulatedAnnealingLayer, Dense):
    def __init__(self, n_input, n_output, *args, **kwargs):
        super().__init__(n_input, n_output, *args, **kwargs)

    def forward_with_perturbations(self, inputs, perturbations):
        delta_w, delta_b = perturbations
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights + delta_w) + self.biases + delta_b
        return self.output

    def perturb_parameters(self, stddev = 0.1):
        """Randomly perturbs weights and biases with a given standard deviation."""
        perturbation_weights = np.random.randn(*self.weights.shape) * stddev
        perturbation_biases = np.random.randn(*self.biases.shape) * stddev
        return perturbation_weights, perturbation_biases

    def update_parameters(self, updates):
        delta_weights, delta_biases = updates
        self.weights += delta_weights
        self.biases += delta_biases

class SimulatedAnnealingOptimizer(Optimizer):
    def __init__(self, model, learning_rate, loss):
        super().__init__(model, loss)
        self.learning_rate = learning_rate
        self.best_loss = float("inf")

    def forward_with_perturbations(self, x_train, stddev):
        perturbations = []
        inputs = x_train
        for layer in self.model.get_layers():
            if isinstance(layer, SimulatedAnnealingLayer):
                perturbation = layer.perturb_parameters(stddev)
                perturbations.append(perturbation)
                inputs = layer.forward_with_perturbations(inputs, perturbation)
            else:
                inputs = layer.forward(inputs)
        return inputs, perturbations

    def update_parameters(self, perturbations):
      i = 0
      for layer in self.model.get_layers():
            if isinstance(layer, SimulatedAnnealingLayer):
                layer.update_parameters(perturbations[i])
                i += 1

    def cooling_schedule(self, old_loss, new_loss, current_temp, current_step_size, cooling_rate, step_decay_rate=0.9999):
        deltaLoss = new_loss - old_loss
        deltaLossScale = deltaLoss / new_loss

        if abs(deltaLossScale) > min(1e-4, current_temp):
            scalefactor = min(1, current_temp/new_loss)
            elastic_rate = 1.0 - scalefactor
            elastic_rate = (elastic_rate + scalefactor) / 2
            rate = max(cooling_rate, elastic_rate)
            new_temp = current_temp * rate
            # new_temp = current_temp * cooling_rate
            new_step_size = current_step_size * step_decay_rate
        else:
            new_temp = current_temp / cooling_rate
            new_step_size = self.learning_rate
            self.learning_rate *= step_decay_rate

        return new_temp, new_step_size

    def train_step(self, x_batch, y_batch):
        for j in range(self.sample_per_batch):
            predictions, perturbations = self.forward_with_perturbations(x_batch, self.step_size)
            loss = self.loss.forward(predictions, y_batch)
            delta_E = (loss - self.best_loss)
            energy_cost = np.exp(-delta_E / self.current_temp)
            acceptance_rate = energy_cost if delta_E > 0 else  1.0
            if delta_E < 0 or np.random.rand() <= acceptance_rate:
                # Accept proposal
                self.update_parameters(perturbations)
                self.best_loss = loss
                # print(f'Accepted proposal due to temperature {loss} exp: {acceptance_rate}, deltaE {delta_E}')
        return self.best_loss

    def on_epoch_start(self, epoch):
        self.old_loss = self.best_loss

    def on_epoch_end(self, epoch):
        self.current_temp, self.step_size = self.cooling_schedule(self.old_loss, self.best_loss, self.current_temp, self.step_size, self.cooling_rate, 0.9999)

    def on_reporting(self, epoch, loss_value, acc):
        print(f'Epoch {epoch}, BestLoss: {loss_value}, Temperature {self.current_temp}, step_size {self.step_size}, test_acc: {acc}')

    def on_train_start(self, sample_per_batch=1, initial_temp=1.0, cooling_rate=0.99):
        self.current_temp = initial_temp
        # self.best_loss = float("inf")
        self.step_size = self.learning_rate
        self.sample_per_batch = sample_per_batch
        self.cooling_rate = cooling_rate

In [None]:
# XOR
np.random.seed(4)
layers = [
    SADense(2, 3),
    Sigmoid(),
    # Dense(3, 3),
    # Sigmoid(),
    SADense(3, 1),
    Sigmoid(),
]

net = NeuralNet(layers)
optimizer = SimulatedAnnealingOptimizer(net, learning_rate = 0.1, loss=MeanSquaredError())

optimizer.fit((xor_x_train, xor_y_train), (xor_x_train, xor_y_train), epochs=1000, batch_size=60000, sample_per_batch=20, initial_temp=1.0, cooling_rate=0.95, verbose=True)
# Infer on cx_test, cy_test
preds = net(xor_x_train)
expected = xor_y_train

print("expected", expected)
print("preds", preds)

In [None]:
# MNIST
np.random.seed(4)
layers = [
    SADense(784, 128),
    ReLU(),
    SADense(128, 10),
    Softmax(),
]

net = NeuralNet(layers)
optimizer = SimulatedAnnealingOptimizer(net, learning_rate = 1, loss=MeanSquaredError())

optimizer.fit((cx_train, cy_train), (cx_test, cy_test), epochs=10000, batch_size=60000, sample_per_batch=1, initial_temp=1.0, cooling_rate=0.95, verbose=True)
# Infer on cx_test, cy_test
predictions = net.forward(cx_test)
preds = np.array(antiCategorical(predictions))
expected = np.array(antiCategorical(cy_test))

print(f'Accuracy: {sklearn.metrics.accuracy_score(expected.get(), preds.get())}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected.get(), preds.get())}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected.get(), preds.get())}')

# Simulated Annealing and Gradient based Hybrid Neural Network

In [None]:
class HybridNeuralNet(SANeuralNet):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.train_step = self.train_step_hybrid

    def train_step_SA(self, x_batch, y_batch, sample_per_batch, step_size, prev_loss, current_temp):
        for j in range(sample_per_batch):
            predictions, perturbations = self.forward_with_perturbations(x_batch, step_size)
            loss = self.loss.forward(predictions, y_batch)
            delta_E = (loss - prev_loss)
            energy_cost = np.exp(-delta_E / current_temp)
            acceptance_rate = energy_cost if delta_E > 0 else  1.0
            if delta_E < 0 or np.random.rand() <= acceptance_rate:
                # Accept proposal
                self.update_parameters(perturbations)
                prev_loss = loss
                # print(f'Accepted proposal due to temperature exp: {acceptance_rate}, deltaE {delta_E}')
        return prev_loss

    def train_step_hybrid(self, x_batch, y_batch, sample_per_batch, step_size, prev_loss, current_temp):
        prev_loss = self.train_step_SA(x_batch, y_batch, sample_per_batch, step_size, prev_loss, current_temp)
        prev_loss = self.train_step_sgd(x_batch, y_batch)
        return prev_loss


In [None]:
# XOR
np.random.seed(4)
layers = [
    SADense(2, 3),
    Sigmoid(),
    # Dense(3, 3),
    # Sigmoid(),
    SADense(3, 1),
    Sigmoid(),
]

net = HybridNeuralNet(layers, learning_rate = 0.1, loss=MeanSquaredError())

net.fit((xor_x_train, xor_y_train), (xor_x_train, xor_y_train), epochs=1000, batch_size=1000, sample_per_batch=10, initial_temp=1.0, cooling_rate=0.95, verbose=True)
# Infer on cx_test, cy_test
preds = net.forward(xor_x_train)
expected = xor_y_train

print("expected", expected)
print("preds", preds)

Epoch 0, BestLoss: 0.1285299620717038, Temperature 0.95, step_size 0.0999, deltaLossScale -inf, test_acc: 1.0
Epoch 1, BestLoss: 0.1334681927933293, Temperature 0.9025, step_size 0.0998001, deltaLossScale 0.03699930761235503, test_acc: 1.0
Epoch 2, BestLoss: 0.13779156697452147, Temperature 0.8573749999999999, step_size 0.0997002999, deltaLossScale 0.03137618851516205, test_acc: 1.0
Epoch 3, BestLoss: 0.15534785327297587, Temperature 0.8145062499999999, step_size 0.0996005996001, deltaLossScale 0.11301273837112286, test_acc: 1.0
Epoch 4, BestLoss: 0.1947260502499157, Temperature 0.7737809374999999, step_size 0.0995009990004999, deltaLossScale 0.20222356960663965, test_acc: 1.0
Epoch 5, BestLoss: 0.19652475344833104, Temperature 0.7350918906249998, step_size 0.0994014980014994, deltaLossScale 0.009152552881274712, test_acc: 1.0
Epoch 6, BestLoss: 0.22513125606397796, Temperature 0.6983372960937497, step_size 0.0993020965034979, deltaLossScale 0.1270658864334569, test_acc: 1.0
Epoch 7, B

In [None]:
# MNIST
np.random.seed(4)
layers = [
    SADense(784, 128),
    ReLU(),
    SADense(128, 10),
    Softmax(),
]

net = HybridNeuralNet(layers, learning_rate = 0.01, loss=MeanSquaredError())

net.fit((cx_train, cy_train), (cx_test, cy_test), epochs=2000, batch_size=100, sample_per_batch=1, initial_temp=1.0, cooling_rate=0.9, verbose=True)
# Infer on cx_test, cy_test
predictions = net.forward(cx_test)
preds = np.array(antiCategorical(predictions))
expected = np.array(antiCategorical(cy_test))

print(f'Accuracy: {sklearn.metrics.accuracy_score(expected.get(), preds.get())}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected.get(), preds.get())}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected.get(), preds.get())}')

Epoch 0, BestLoss: 0.004870264409361111, Temperature 0.9, step_size 0.00999, deltaLossScale -inf, test_acc: 0.9453
Epoch 1, BestLoss: 0.006203848175226041, Temperature 0.81, step_size 0.009980010000000001, deltaLossScale 0.21496073536911464, test_acc: 0.9388
Epoch 2, BestLoss: 0.004154569115699565, Temperature 0.7290000000000001, step_size 0.00997002999, deltaLossScale -0.493259108816489, test_acc: 0.9421
Epoch 3, BestLoss: 0.004539973677774611, Temperature 0.6561000000000001, step_size 0.00996005996001, deltaLossScale 0.08489136489090006, test_acc: 0.9418
Epoch 4, BestLoss: 0.006272982921689323, Temperature 0.5904900000000002, step_size 0.00995009990004999, deltaLossScale 0.27626557660833073, test_acc: 0.9385
Epoch 5, BestLoss: 0.0047939897615897975, Temperature 0.5314410000000002, step_size 0.00994014980014994, deltaLossScale -0.30850987041095745, test_acc: 0.9432
Epoch 6, BestLoss: 0.003446158576123292, Temperature 0.47829690000000014, step_size 0.00993020965034979, deltaLossScale -

# Extreme Learning Machines

In [85]:
# Extreme Learning Machines
class ELM_Optimizer(Optimizer):
    def __init__(self, model, learning_rate, loss):
        super().__init__(model, loss)
        self.learning_rate = learning_rate

    def train_step(self, x_batch, y_batch, alpha=0):
        predictions = self.model.forward(x_batch)
        loss_value = self.loss.forward(predictions, y_batch)

        # Smooth out y_batch
        y_batch = np.where(y_batch > 0.5, 0.9, 0.1)
        expected = y_batch
        for layer in reversed(self.model.get_layers()):
            if isinstance(layer, Dense):
                x_inv = np.linalg.pinv(layer.last_inputs)
                weight_approx = np.dot(x_inv, expected)
                layer.weights = layer.weights * alpha + weight_approx * (1 - alpha)
                # layer.weights = weight_approx
                # expected = layer.last_inputs
                expected = layer.inverse(expected)
            else:
                expected = layer.inverse(expected)
        return loss_value



In [86]:
# XOR
np.random.seed(4)
layers = [
    Dense(2, 3, False),
    LeakyReLU(),
    # Sigmoid(),
    # Dense(3, 3),
    # Sigmoid(),
    Dense(3, 1, False),
    LeakyReLU(),
]

net = NeuralNet(layers)
optimizer = ELM_Optimizer(net, learning_rate = 0.1, loss=MeanSquaredError())

optimizer.fit((xor_x_train, xor_y_train), (xor_x_train, xor_y_train), epochs=10, batch_size=60000, verbose=True)
# Infer on cx_test, cy_test
preds = net(xor_x_train)
expected = xor_y_train

print("expected", expected)
print("preds", preds)

Epoch 0, Loss: 0.16489561130438207, test_acc: 1.0
Epoch 1, Loss: 0.14321428571428574, test_acc: 1.0
Epoch 2, Loss: 0.1432142857142857, test_acc: 1.0
Epoch 3, Loss: 0.1432142857142857, test_acc: 1.0
Epoch 4, Loss: 0.1432142857142857, test_acc: 1.0
Epoch 5, Loss: 0.14321428571428574, test_acc: 1.0
Epoch 6, Loss: 0.1432142857142857, test_acc: 1.0
Epoch 7, Loss: 0.1432142857142857, test_acc: 1.0
Epoch 8, Loss: 0.1432142857142857, test_acc: 1.0
Epoch 9, Loss: 0.1432142857142857, test_acc: 1.0
expected [[0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]]
preds [[0.   ]
 [0.05 ]
 [0.475]
 [0.525]
 [0.475]
 [0.   ]
 [0.525]]


In [114]:
# MNIST
np.random.seed(4)
layers = [
    Dense(784, 128, False),
    # LeakyReLU(),
    # Linear(),
    # Dense(128, 128, False),
    # LeakyReLU(),
    # Dense(128, 10, False),
    # Sigmoid(),
]

net = NeuralNet(layers)
optimizer = ELM_Optimizer(net, learning_rate = 0.01, loss=MeanSquaredError())

optimizer.fit((cx_train, cy_train), (cx_test, cy_test), epochs=1, batch_size=60000, verbose=True)
# Infer on cx_test, cy_test
predictions = net.forward(cx_test)
preds = np.array(antiCategorical(predictions))
expected = np.array(antiCategorical(cy_test))

print(f'Accuracy: {sklearn.metrics.accuracy_score(expected.get(), preds.get())}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected.get(), preds.get())}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected.get(), preds.get())}')

Epoch 0, Loss: 0.10928068989420225, test_acc: 0.8509
Accuracy: 0.8509
Confusion Matrix: [[ 936    0    3    3    2    9   17    1    7    2]
 [   0 1108    2    2    1    1    5    1   14    1]
 [  18   59  805   29   15    0   38   24   39    5]
 [   5   19   24  875    1   19    9   20   24   14]
 [   0   24    8    4  866    5    9    2   14   50]
 [  20   15    6   77   16  632   22   14   66   24]
 [  19   10   10    0   19   21  870    0    9    0]
 [   5   40   18    6   22    1    1  868    5   62]
 [  16   56    9   32   26   47   15   11  736   26]
 [  19   11    3   15   69    0    1   67   11  813]]
Classification Report:               precision    recall  f1-score   support

           0       0.90      0.96      0.93       980
           1       0.83      0.98      0.89      1135
           2       0.91      0.78      0.84      1032
           3       0.84      0.87      0.85      1010
           4       0.84      0.88      0.86       982
           5       0.86      0.71

In [112]:
# Given matrices
a = cy_train[:30000]
b = cx_train[:30000]
c = cy_train[30000:]
d = cx_train[30000:]

# Randomly initialize x and y
x = np.random.rand(b.shape[1], c.shape[1])
y = np.random.rand(a.shape[0], c.shape[0])


In [115]:

# Calculate intermediate variables
# z1 = np.dot(b, x)
# z2 = np.dot(d, x)
z1 = b
z2 = d

# Form stacked matrices
A = np.vstack((a, c))
Z = np.vstack((z1, z2))

# Compute pseudoinverse and solve for y
Z_pseudo_inv = np.linalg.pinv(Z)
y = np.dot(Z_pseudo_inv, A)

# Substitute back to solve for x
# B = np.vstack((b, d))
# Z_prime = np.vstack((np.dot(b, x), np.dot(d, x)))
# B_pseudo_inv = np.linalg.pinv(B)
# x = np.dot(B_pseudo_inv, Z_prime)

# output = np.dot(cx_test, x)
output = np.dot(cx_test, y)

preds = np.array(antiCategorical(output))
expected = np.array(antiCategorical(cy_test))

print(f'Accuracy: {sklearn.metrics.accuracy_score(expected.get(), preds.get())}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected.get(), preds.get())}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected.get(), preds.get())}')

Accuracy: 0.8534
Confusion Matrix: [[ 942    0    2    2    1    7   15    2    7    2]
 [   0 1107    2    2    1    1    5    2   15    0]
 [  17   56  809   28   16    0   42   21   39    4]
 [   4   15   26  887    2   14    9   21   21   11]
 [   0   23    6    3  872    5   10    2   13   48]
 [  20   17    2   84   19  624   22   13   69   22]
 [  17    9   10    0   21   20  872    0    9    0]
 [   5   38   18    8   20    0    1  877    3   58]
 [  17   54    9   32   27   42   15   12  743   23]
 [  18   10    2   15   72    1    1   76   13  801]]
Classification Report:               precision    recall  f1-score   support

           0       0.91      0.96      0.93       980
           1       0.83      0.98      0.90      1135
           2       0.91      0.78      0.84      1032
           3       0.84      0.88      0.86      1010
           4       0.83      0.89      0.86       982
           5       0.87      0.70      0.78       892
           6       0.88      0.9

# ELM + Simulated Annealing Hybrid

In [82]:
# Extreme Learning Machines Hybrid
class ELM_SA_Optimizer(SimulatedAnnealingOptimizer):
    def __init__(self, model, learning_rate, loss):
        super().__init__(model, learning_rate, loss)

    def train_step(self, x_batch, y_batch, alpha=0.8):
        predictions = self.model.forward(x_batch)
        # loss_value = self.loss.forward(predictions, y_batch)

        # Smooth out y_batch
        y_batch = np.where(y_batch > 0.5, 0.9, 0.1)
        expected = y_batch
        for layer in reversed(self.model.get_layers()):
            if isinstance(layer, Dense):
                x_inv = np.linalg.pinv(layer.last_inputs)
                weight_approx = np.dot(x_inv, expected)
                layer.weights = layer.weights * alpha + weight_approx * (1 - alpha)
                # layer.weights = weight_approx
                # expected = layer.last_inputs
                expected = layer.inverse(expected)
            else:
                expected = layer.inverse(expected)

        loss_value = super().train_step(x_batch, y_batch)
        return loss_value



In [84]:
# MNIST
np.random.seed(4)
layers = [
    SADense(784, 128, False),
    # LeakyReLU(),
    # Linear(),
    # Dense(128, 128, False),
    # LeakyReLU(),
    SADense(128, 10, False),
    Sigmoid(),
]

net = NeuralNet(layers)
optimizer = ELM_SA_Optimizer(net, learning_rate = 1, loss=MeanSquaredError())

optimizer.fit((cx_train, cy_train), (cx_test, cy_test), epochs=10000, batch_size=60000, sample_per_batch=1, initial_temp=1.0, cooling_rate=0.95, verbose=True)
# Infer on cx_test, cy_test
predictions = net.forward(cx_test)
preds = np.array(antiCategorical(predictions))
expected = np.array(antiCategorical(cy_test))

print(f'Accuracy: {sklearn.metrics.accuracy_score(expected.get(), preds.get())}')
print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected.get(), preds.get())}')
print(f'Classification Report: {sklearn.metrics.classification_report(expected.get(), preds.get())}')

Epoch 0, BestLoss: 0.18332151059265014, Temperature 1.0, step_size 1, test_acc: 0.0949
Epoch 1, BestLoss: 0.2409566013195773, Temperature 0.95, step_size 0.9999, test_acc: 0.1291
Epoch 2, BestLoss: 0.23312620153584057, Temperature 0.9025, step_size 0.9998000100000001, test_acc: 0.101
Epoch 3, BestLoss: 0.21234784144866736, Temperature 0.8573749999999999, step_size 0.9997000299990001, test_acc: 0.0721
Epoch 4, BestLoss: 0.20036266550948745, Temperature 0.8145062499999999, step_size 0.9996000599960002, test_acc: 0.0824
Epoch 5, BestLoss: 0.22088384759242521, Temperature 0.7737809374999999, step_size 0.9995000999900007, test_acc: 0.0419
Epoch 6, BestLoss: 0.18670273352967434, Temperature 0.7350918906249998, step_size 0.9994001499800017, test_acc: 0.065
Epoch 7, BestLoss: 0.18920608779592904, Temperature 0.6983372960937497, step_size 0.9993002099650037, test_acc: 0.0987
Epoch 8, BestLoss: 0.19674538165629463, Temperature 0.6634204312890623, step_size 0.9992002799440072, test_acc: 0.0846
Ep

KeyboardInterrupt: 

# K-BFGS Algorithm

In [None]:
class Layer:
    def forward(self, inputs):
        raise NotImplementedError

    def backward(self, grad_output):
        raise NotImplementedError

class Dense(Layer):
    def __init__(self, n_input, n_output):
        self.weights = np.random.randn(n_input, n_output) * np.sqrt(2.0 / n_input)
        self.biases = np.zeros((1, n_output))

    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases
        return self.output

    def set_weights_biases(self, weights, biases):
        self.weights = weights
        self.biases = biases

    def get_params(self):
        return np.concatenate([self.weights.flatten(), self.biases.flatten()])

    def set_params(self, params):
        weight_size = self.weights.size
        self.weights = params[:weight_size].reshape(self.weights.shape)
        self.biases = params[weight_size:].reshape(self.biases.shape)

class Sigmoid:
    @staticmethod
    def forward(x):
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def backward(x):
        sigmoid = Sigmoid.forward(x)
        return sigmoid * (1 - sigmoid)

class ReLU:
    @staticmethod
    def forward(x):
        return np.maximum(0, x)

    @staticmethod
    def backward(x):
        return np.where(x > 0, 1, 0)

class Softmax:
    @staticmethod
    def forward(x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    @staticmethod
    def backward(x):
        pass

class MeanSquaredError:
    @staticmethod
    def forward(predictions, targets):
        return np.mean((predictions - targets)**2)

    @staticmethod
    def backward(predictions, targets):
        return 2 * (predictions - targets) / predictions.size

class NeuralNet:
    def __init__(self, layers, loss):
        self.layers = layers
        self.loss = loss

    def forward(self, inputs):
        for layer in self.layers:
            inputs = layer.forward(inputs)
        return inputs

    def get_params(self):
        """Returns all layer parameters as a single flattened array."""
        params = np.concatenate([layer.get_params() for layer in self.layers if isinstance(layer, Dense)])
        return params

    def set_params(self, params):
        """Sets all layer parameters from a single flattened array."""
        start_idx = 0
        for layer in self.layers:
            if isinstance(layer, Dense):
                layer_params_size = layer.get_params().size
                layer.set_params(params[start_idx:start_idx + layer_params_size])
                start_idx += layer_params_size

    def loss_and_grad(self, params, x_train, y_train):
        """Calculates loss and gradient based on provided parameters."""
        self.set_params(params)
        predictions = self.forward(x_train)

        loss_value = self.loss.forward(predictions, y_train)
        grad_loss = self.loss.backward(predictions, y_train)

        grad_params = []
        for layer in self.layers:
            if isinstance(layer, Dense):
                grad_weights = np.dot(layer.inputs.T, grad_loss)
                grad_biases = np.sum(grad_loss, axis=0, keepdims=True)

                grad_params.append(grad_weights.flatten())
                grad_params.append(grad_biases.flatten())

                grad_loss = np.dot(grad_loss, layer.weights.T)

        grad_params = np.concatenate(grad_params)
        return loss_value, grad_params

    def k_bfgs(self, x_train, y_train, epochs=1, verbose=True):
        """Trains the model using the K-BFGS algorithm."""
        params = self.get_params()

        # Initialize memory for Hessian approximation
        s_list, y_list = [], []

        for epoch in range(epochs):
            loss_value, grad_params = self.loss_and_grad(params, x_train, y_train)

            # Form Hessian approximation from s and y lists
            H = np.eye(len(params))

            for s, y in zip(s_list, y_list):
                rho = 1 / np.dot(y, s)
                V = H - rho * np.outer(np.dot(H, s), y)
                H = V + rho * np.outer(s, s)

            # Compute search direction
            direction = -np.dot(H, grad_params)

            # Line search to find suitable step size
            step_size = 1.0

            def line_search_loss(step):
                new_params = params + step * direction
                new_loss, _ = self.loss_and_grad(new_params, x_train, y_train)
                return new_loss

            # Try step size reductions until loss decreases
            while line_search_loss(step_size) >= loss_value:
                step_size *= 0.5
                if step_size < 1e-10:
                    break

            new_params = params + step_size * direction

            # Update s and y lists
            s = new_params - params
            new_loss, new_grad_params = self.loss_and_grad(new_params, x_train, y_train)
            y = new_grad_params - grad_params

            if len(s_list) >= 10:  # Limit to 10 recent updates
                s_list.pop(0)
                y_list.pop(0)

            s_list.append(s)
            y_list.append(y)

            params = new_params

            if verbose:
                print(f'Epoch {epoch}, Loss: {loss_value}')

        self.set_params(params)

def antiCategorical(arr):
    return np.argmax(arr, axis=1)

def main():
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

    y_train_c = np.array(to_categorical(y_train))
    y_test_c = np.array(to_categorical(y_test))

    x_train = np.array(x_train.reshape(-1, 784) / 255.)
    x_test = np.array(x_test.reshape(-1, 784) / 255.)

    layers = [
        Dense(784, 128),
        ReLU(),
        Dense(128, 10),
        Softmax(),
    ]

    nn = NeuralNet(layers, loss=MeanSquaredError())

    nn.k_bfgs(x_train, y_train_c, epochs=10, verbose=True)

    predictions = nn.forward(x_test)

    preds = antiCategorical(predictions)
    expected = antiCategorical(y_test_c)

    print(f'Accuracy: {sklearn.metrics.accuracy_score(expected, preds)}')
    print(f'Confusion Matrix: {sklearn.metrics.confusion_matrix(expected, preds)}')
    print(f'Classification Report: {sklearn.metrics.classification_report(expected, preds)}')

main()


In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Data Preprocessing
def preprocess_data():

    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

    y_train_c = np.array(to_categorical(y_train))
    y_test_c = np.array(to_categorical(y_test))

    x_train = np.array(x_train.reshape(-1, 784) / 255.)
    x_test = np.array(x_test.reshape(-1, 784) / 255.)

    # Split into training and testing sets
    return x_train, x_test, y_train_c, y_test_c

# Utility functions
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

# Mesh Neural Network Class
class MeshNeuralNetwork:
    def __init__(self, num_neurons, input_size, output_size, num_passes=3, learning_rate=0.001):
        self.num_neurons = num_neurons
        self.input_size = input_size
        self.output_size = output_size
        self.num_passes = num_passes
        self.learning_rate = learning_rate

        # Initialize neurons
        self.neurons = [None] * num_neurons
        self.connections = np.random.choice(
            np.arange(num_neurons),
            size=(num_neurons, num_neurons),
            replace=True
        )

        # Initialize weights and biases randomly
        self.weights = np.random.randn(num_neurons, num_neurons)
        self.biases = np.random.randn(num_neurons)

        # Select random input/output neurons
        self.input_neurons = np.random.choice(np.arange(num_neurons), size=input_size, replace=False)
        self.output_neurons = np.random.choice(np.arange(num_neurons), size=output_size, replace=False)

    def forward_pass(self, X):
        # Initialize neuron activations with input data
        self.neurons[self.input_neurons] = X

        # Propagate for multiple passes
        for _ in range(self.num_passes):
            for i in range(self.num_neurons):
                # Update each neuron activation based on connections
                inputs = self.weights[i, self.connections[i]]
                activation_input = np.dot(inputs, self.neurons[self.connections[i]]) + self.biases[i]
                self.neurons[i] = relu(activation_input)

        # Output activations
        return self.neurons[self.output_neurons]

    def backward_pass(self, X, y_true, y_pred):
        # Loss gradient (Cross-Entropy Loss derivative)
        loss_grad = y_pred - y_true

        # Backpropagate through output neurons first
        for i in self.output_neurons:
            # Compute gradients w.r.t weights and biases
            neuron_output = self.neurons[i]
            relu_grad = relu_derivative(neuron_output)
            grad_w = np.outer(loss_grad[i], self.neurons[self.connections[i]]) * relu_grad
            grad_b = loss_grad[i] * relu_grad

            # Update weights and biases
            self.weights[i, self.connections[i]] -= self.learning_rate * grad_w
            self.biases[i] -= self.learning_rate * grad_b

        # Backpropagate further into the network
        for i in reversed(range(self.num_neurons)):
            if i in self.output_neurons:
                continue

            neuron_output = self.neurons[i]
            relu_grad = relu_derivative(neuron_output)
            grad_w = np.outer(loss_grad[i], self.neurons[self.connections[i]]) * relu_grad
            grad_b = loss_grad[i] * relu_grad

            self.weights[i, self.connections[i]] -= self.learning_rate * grad_w
            self.biases[i] -= self.learning_rate * grad_b

    def fit(self, X_train, y_train, epochs):
        for epoch in range(epochs):
            for X_batch, y_batch in zip(X_train, y_train):
                y_pred = self.forward_pass(X_batch)
                self.backward_pass(X_batch, y_batch, y_pred)

            # Output training metrics
            train_loss, train_accuracy = self.evaluate(X_train, y_train)
            print(f"Epoch {epoch+1}/{epochs} - Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}")

    def evaluate(self, X, y):
        correct_predictions = 0
        total_loss = 0

        for X_sample, y_true in zip(X, y):
            y_pred = self.forward_pass(X_sample)

            # Compute loss
            loss = np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))
            total_loss += loss

            # Compute accuracy
            correct_predictions += np.argmax(y_pred) == np.argmax(y_true)

        return total_loss / len(X), correct_predictions / len(X)

# Main Training Loop
def main():
    X_train, X_test, y_train, y_test = preprocess_data()

    # Convert to arrays for easier manipulation
    X_train, X_test = np.array(X_train), np.array(X_test)
    y_train, y_test = np.array(y_train), np.array(y_test)

    # Initialize network with a suitable number of neurons, input size, and output size
    input_size, output_size = X_train.shape[1], y_train.shape[1]
    num_neurons = 2000  # Example number
    network = MeshNeuralNetwork(num_neurons=num_neurons, input_size=input_size, output_size=output_size, num_passes=3)

    # Train the network
    network.fit(X_train, y_train, epochs=10)

    # Evaluate on the test set
    test_loss, test_accuracy = network.evaluate(X_test, y_test)
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

if __name__ == "__main__":
    main()


TypeError: only integer scalar arrays can be converted to a scalar index