# Optimizers implememtation(Manually)

# Basic Setup (Neural Network Functions)

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate a simple synthetic dataset
np.random.seed(1)
m = 1000 # Number of examples
n_x = 10  # Number of input features

X = np.random.randn(n_x, m)
Y = (np.sum(X, axis=0) > 0).astype(int).reshape(1, m)  # Simple rule-based labels

# Split dataset into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X.T, Y.T, test_size=0.2, random_state=1)
X_train, X_test = X_train.T, X_test.T
Y_train, Y_test = Y_train.T, Y_test.T


# Initialize parameters (weights)
def initialize_parameters(layer_dims):
    np.random.seed(1)
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * 0.01
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

    return parameters

# Forward Propagation (simple linear model for illustration)
def forward_propagation(X, parameters):
    W1 = parameters['W1']
    b1 = parameters['b1']
    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1)
    
    return A1

# Loss function (Mean Squared Error)
def compute_loss(A1, Y):
    m = Y.shape[1]
    loss = (1/m) * np.sum((A1 - Y)**2)
    return loss

# Backward Propagation (simple linear model)
def backward_propagation(X, Y, A1, parameters):
    m = X.shape[1]
    dZ1 = A1 - Y
    dW1 = (1/m) * np.dot(dZ1, X.T)
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {'dW1': dW1, 'db1': db1}
    return grads

# 1. Gradient Descent (GD)

In [2]:
def gradient_descent_update(parameters, grads, learning_rate):
    W1 = parameters['W1'] - learning_rate * grads['dW1']
    b1 = parameters['b1'] - learning_rate * grads['db1']

    parameters['W1'] = W1
    parameters['b1'] = b1
    return parameters

# 2. Stochastic Gradient Descent (SGD)

In [3]:
def sgd_update(parameters, grads, learning_rate):
    W1 = parameters['W1'] - learning_rate * grads['dW1']
    b1 = parameters['b1'] - learning_rate * grads['db1']

    parameters['W1'] = W1
    parameters['b1'] = b1
    return parameters

# 3. SGD with Momentum

In [4]:
def momentum_update(parameters, grads, velocity, beta, learning_rate):
    velocity['dW1'] = beta * velocity['dW1'] + (1 - beta) * grads['dW1']
    velocity['db1'] = beta * velocity['db1'] + (1 - beta) * grads['db1']
    
    parameters['W1'] = parameters['W1'] - learning_rate * velocity['dW1']
    parameters['b1'] = parameters['b1'] - learning_rate * velocity['db1']
    
    return parameters, velocity

# Initialize velocity for momentum
def initialize_velocity(parameters):
    velocity = {}
    velocity['dW1'] = np.zeros_like(parameters['W1'])
    velocity['db1'] = np.zeros_like(parameters['b1'])
    return velocity

# 4. RMSProp

In [5]:
def rmsprop_update(parameters, grads, s, beta2, epsilon, learning_rate):
    s['dW1'] = beta2 * s['dW1'] + (1 - beta2) * np.square(grads['dW1'])
    s['db1'] = beta2 * s['db1'] + (1 - beta2) * np.square(grads['db1'])
    
    parameters['W1'] -= learning_rate * grads['dW1'] / (np.sqrt(s['dW1']) + epsilon)
    parameters['b1'] -= learning_rate * grads['db1'] / (np.sqrt(s['db1']) + epsilon)
    
    return parameters, s

# Initialize the RMSProp cache
def initialize_rmsprop(parameters):
    s = {}
    s['dW1'] = np.zeros_like(parameters['W1'])
    s['db1'] = np.zeros_like(parameters['b1'])
    return s

# 5. Adam

In [6]:
def adam_update(parameters, grads, velocity, s, t, beta1, beta2, epsilon, learning_rate):
    # Momentum update
    velocity['dW1'] = beta1 * velocity['dW1'] + (1 - beta1) * grads['dW1']
    velocity['db1'] = beta1 * velocity['db1'] + (1 - beta1) * grads['db1']
    
    # RMSProp update
    s['dW1'] = beta2 * s['dW1'] + (1 - beta2) * np.square(grads['dW1'])
    s['db1'] = beta2 * s['db1'] + (1 - beta2) * np.square(grads['db1'])
    
    # Bias correction
    v_corrected_dW1 = velocity['dW1'] / (1 - beta1 ** t)
    v_corrected_db1 = velocity['db1'] / (1 - beta1 ** t)
    s_corrected_dW1 = s['dW1'] / (1 - beta2 ** t)
    s_corrected_db1 = s['db1'] / (1 - beta2 ** t)
    
    # Update parameters
    parameters['W1'] -= learning_rate * v_corrected_dW1 / (np.sqrt(s_corrected_dW1) + epsilon)
    parameters['b1'] -= learning_rate * v_corrected_db1 / (np.sqrt(s_corrected_db1) + epsilon)
    
    return parameters, velocity, s

# Initialize for Adam optimizer
def initialize_adam(parameters):
    velocity = initialize_velocity(parameters)
    s = initialize_rmsprop(parameters)
    return velocity, s

In [7]:
def train_model(optimizer, X_train, Y_train, X_test, Y_test, num_iterations=1000, learning_rate=0.01):
    layer_dims = [X_train.shape[0], 1]
    parameters = initialize_parameters(layer_dims)
    
    if optimizer == 'momentum':
        velocity = initialize_velocity(parameters)
        beta = 0.9
    elif optimizer == 'rmsprop':
        s = initialize_rmsprop(parameters)
        beta2 = 0.999
        epsilon = 1e-8
    elif optimizer == 'adam':
        velocity, s = initialize_adam(parameters)
        beta1 = 0.9
        beta2 = 0.999
        epsilon = 1e-8
    
    for t in range(1, num_iterations + 1):
        A1 = forward_propagation(X_train, parameters)
        loss = compute_loss(A1, Y_train)
        grads = backward_propagation(X_train, Y_train, A1, parameters)

        if optimizer == 'gd':
            parameters = gradient_descent_update(parameters, grads, learning_rate)
        elif optimizer == 'sgd':
            parameters = sgd_update(parameters, grads, learning_rate)
        elif optimizer == 'momentum':
            parameters, velocity = momentum_update(parameters, grads, velocity, beta, learning_rate)
        elif optimizer == 'rmsprop':
            parameters, s = rmsprop_update(parameters, grads, s, beta2, epsilon, learning_rate)
        elif optimizer == 'adam':
            parameters, velocity, s = adam_update(parameters, grads, velocity, s, t, beta1, beta2, epsilon, learning_rate)

    # Predict on test set
    A1_test = forward_propagation(X_test, parameters)
    predictions = (A1_test > 0.5).astype(int)
    accuracy = accuracy_score(Y_test.T, predictions.T)

    return accuracy

# Train models using different optimizers
optimizers = ['gd', 'sgd', 'momentum', 'rmsprop', 'adam']
for opt in optimizers:
    accuracy = train_model(opt, X_train, Y_train, X_test, Y_test, num_iterations=1000, learning_rate=0.1)
    print(f"Accuracy with {opt} optimizer: {accuracy}")

Accuracy with gd optimizer: 0.9
Accuracy with sgd optimizer: 0.9
Accuracy with momentum optimizer: 0.9
Accuracy with rmsprop optimizer: 0.865
Accuracy with adam optimizer: 0.9
