In [53]:
import numpy as np

class Optimizer:
    def __init__(self, optimizer_name, layers, learning_rate=0.01):
        self.optimizer_name = optimizer_name
        self.learning_rate = learning_rate
        self.weights_cache = {}
        self.bias_cache = {}
        self.accumulated_gradients = {}

        # Initialize caches for weights and biases
        for i in range(len(layers) - 1):  # len(layers) - 1 because we're connecting layer i to i+1
            weights_key = f'layer{i}_to_layer{i+1}_weights'
            bias_key = f'layer{i+1}_bias'

            # Initialize caches
            self.weights_cache[weights_key] = np.random.rand(layers[i], layers[i+1]) * 0.001  # Small random weights
            self.bias_cache[bias_key] = np.zeros(layers[i+1])
            self.accumulated_gradients[weights_key] = np.zeros((layers[i], layers[i+1]))

    def adagrad_update(self, gradients):
        for key in self.weights_cache.keys():
            # Update accumulated gradients 
            self.accumulated_gradients[key] += gradients[key] ** 2
            
            # Update weights
            adjusted_learning_rate = self.learning_rate / (np.sqrt(self.accumulated_gradients[key]) + 1e-8)  # Small value to prevent division by zero
            self.weights_cache[key] -= adjusted_learning_rate * gradients[key]

        # Update biases
        for bias_key in self.bias_cache.keys():
            self.bias_cache[bias_key] -= self.learning_rate * np.sum(gradients[bias_key], axis=0)

class NeuralNetwork:
    def __init__(self, layers, activations, optimizer):
        self.layers = layers
        self.activations = activations
        self.optimizer = optimizer  # Instance of the Optimizer class
        self.output_layer_index = len(layers) - 1

        self.weights = {}
        self.biases = {}

        # Initialize weights and biases
        for i in range(len(layers) - 1):
            weights_key = f'layer{i}_to_layer{i+1}_weights'
            bias_key = f'layer{i+1}_bias'
            self.weights[weights_key] = np.random.rand(layers[i], layers[i+1]) * 0.01
            self.biases[bias_key] = np.zeros(layers[i+1])

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        sig = self.sigmoid(x)
        return sig * (1 - sig)

    def ReLU(self, x):
        return np.maximum(0, x)

    def ReLU_derivative(self, x):
        return np.where(x > 0, 1, 0)

    def activation_function(self, activation_name):
        self.activation_dict = {'relu': self.ReLU, 'sigmoid': self.sigmoid}
        return self.activation_dict.get(activation_name)

    def activation_derivative(self, activation_name):
        self.activation_derivative_dict = {'relu': self.ReLU_derivative, 'sigmoid': self.sigmoid_derivative}
        return self.activation_derivative_dict.get(activation_name)

    def forward_pass(self, X):
        self.z = {}  # Post activation layer output 
        self.h = {}  # Pre activation layer output
        self.z['layer0_output'] = X 

        # Loop for forward propagation 
        for i in range(1, len(self.layers)):
            h_key = f'h{i}'
            z_key = f'layer{i}_output'
            weights_key = f'layer{i-1}_to_layer{i}_weights'

            self.h[h_key] = np.dot(self.z[f'layer{i-1}_output'], self.weights[weights_key]) + self.biases[f'layer{i}_bias']
            activation_func = self.activation_function(self.activations[i-1])
            self.z[z_key] = activation_func(self.h[h_key]) 

        self.y_hat = self.z[f'layer{self.output_layer_index}_output']

    def backward_pass(self, X, y):
        self.error = y - self.y_hat
        self.errors = {}
        self.gradients = {}
        activation_derivative = self.activation_derivative(self.activations[self.output_layer_index - 1])
        self.errors[f'layer{self.output_layer_index}_error'] = self.error * activation_derivative(self.h[f'h{self.output_layer_index}'])

        for i in range(self.output_layer_index, 0, -1):
            error_key = f'layer{i}_error'
            weights_key = f'layer{i-1}_to_layer{i}_weights'
            gradient_key = f'layer{i-1}_to_layer{i}_gradients'

            activation_derivative = self.activation_derivative(self.activations[i-1])
            self.errors[error_key] = np.dot(self.errors[f'layer{i+1}_error'], self.weights[weights_key].T) * activation_derivative(self.h[f'h{i}'])
            self.gradients[gradient_key] = np.dot(self.z[f'layer{i-1}_output'].T, self.errors[error_key])

        # Update the weights and biases using the optimizer
        self.optimizer.adagrad_update(self.gradients)



In [54]:
nn = optimizer('adam',[2,3,4,5])

In [39]:
nn_t = NeuralNetwork(layers=[2,3,4,5],optimizer='adam',activations=[])

In [55]:
nn.weights_cache

{'layer0_to_layer1_weights': 0,
 'layer1_to_layer2_weights': 0,
 'layer2_to_layer3_weights': 0,
 'layer3_to_layer4_weights': 0}

In [45]:
layer_sizes = [2,3,4,5,1]

In [51]:
weights_cache = {
    'layer0_to_layer1_weights': np.zeros_like((2,3)),
    'layer1_to_layer2_weights': np.random.rand(layer_sizes[1], layer_sizes[2]),
    'layer2_to_layer3_weights': np.random.rand(layer_sizes[2], layer_sizes[3]),
    'layer3_to_layer4_weights': np.random.rand(layer_sizes[3], layer_sizes[4])
}

In [52]:
weights_cache

{'layer0_to_layer1_weights': array([0, 0]),
 'layer1_to_layer2_weights': array([[0.13439699, 0.36540155, 0.75575309, 0.49598251],
        [0.63817102, 0.82166636, 0.0723674 , 0.08654648],
        [0.35314526, 0.60408918, 0.93035609, 0.47513007]]),
 'layer2_to_layer3_weights': array([[0.37584995, 0.80725366, 0.92150188, 0.41092753, 0.51875249],
        [0.06037035, 0.51526596, 0.62878656, 0.56231432, 0.51390748],
        [0.52039467, 0.49551973, 0.90495244, 0.71608265, 0.08290224],
        [0.7586491 , 0.70018291, 0.64170256, 0.1851104 , 0.85155092]]),
 'layer3_to_layer4_weights': array([[0.41696173],
        [0.16693957],
        [0.36960976],
        [0.49062498],
        [0.04437588]])}

In [47]:
weights_cache

{'layer0_to_layer1_weights': array([[0.9865235 , 0.30182431, 0.78561757],
        [0.08596021, 0.7050031 , 0.03825895]]),
 'layer1_to_layer2_weights': array([[0.39628808, 0.9524818 , 0.05238355, 0.952711  ],
        [0.17458104, 0.75306497, 0.0766882 , 0.10458555],
        [0.99134696, 0.56583861, 0.57057921, 0.91813118]]),
 'layer2_to_layer3_weights': array([[0.17360458, 0.43922869, 0.93335021, 0.35379363, 0.5940879 ],
        [0.80575696, 0.33497127, 0.09345009, 0.95929143, 0.93636888],
        [0.62821606, 0.92872066, 0.50534744, 0.4961844 , 0.24726944],
        [0.9064185 , 0.10386185, 0.57439881, 0.66205625, 0.08188162]]),
 'layer3_to_layer4_weights': array([[0.24282842],
        [0.66766086],
        [0.58732907],
        [0.2450482 ],
        [0.22654418]])}