In [4]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Define the softmax function
def softmax(z):
    """Return the row-wise softmax of a 2-D array of scores.

    Each row's maximum is subtracted before exponentiating, which leaves
    the result unchanged mathematically but keeps ``np.exp`` from
    overflowing on large scores.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(z - row_max)
    row_total = np.sum(exp_z, axis=1, keepdims=True)
    return exp_z / row_total

# SGD with Momentum Optimizer
class SGDWithMomentum:
    """Stochastic gradient descent with a classical momentum term.

    A velocity array is kept per parameter name; each step decays the
    velocity by ``momentum``, subtracts the scaled gradient, and adds the
    resulting velocity to the parameter.
    """

    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.v = None  # per-parameter velocity, created on the first update

    def update(self, params, grads):
        """Apply one momentum step to ``params`` using ``grads``.

        Both arguments are dicts keyed by parameter name; the entries of
        ``params`` are modified through ``+=``.
        """
        if self.v is None:
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.learning_rate * grads[name]
            self.v[name] = velocity
            params[name] += velocity

# Adam Optimizer
class Adam:
    """Adam optimizer: bias-corrected running averages of the gradient and
    of its square drive a per-element step size.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # running average of gradients
        self.v = None  # running average of squared gradients
        self.iteration = 0

    def update(self, params, grads):
        """Apply one Adam step to ``params`` (dict of arrays) using ``grads``.

        State is created lazily on the first call; ``self.iteration`` is
        incremented once per call.
        """
        if self.m is None:
            self.m = {name: np.zeros_like(value) for name, value in params.items()}
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        self.iteration += 1
        # The bias-correction denominators depend only on the step count.
        m_correction = 1 - self.beta1 ** self.iteration
        v_correction = 1 - self.beta2 ** self.iteration

        for name in params:
            grad = grads[name]
            self.m[name] = self.beta1 * self.m[name] + (1 - self.beta1) * grad
            self.v[name] = self.beta2 * self.v[name] + (1 - self.beta2) * (grad ** 2)

            m_hat = self.m[name] / m_correction
            v_hat = self.v[name] / v_correction
            params[name] -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

            
            
class Adagrad:
    """Adagrad optimizer: each element's step is divided by the square root
    of the sum of all squared gradients seen so far for that element.
    """

    def __init__(self, learning_rate=0.01, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.G = None  # running sum of squared gradients per parameter

    def update(self, params, grads):
        """Apply one Adagrad step to ``params`` (dict of arrays) using ``grads``."""
        if self.G is None:
            self.G = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            self.G[name] += grad ** 2
            scale = np.sqrt(self.G[name]) + self.epsilon
            params[name] -= self.learning_rate * grad / scale

            
class RMSprop:
    """RMSprop optimizer: gradients are divided by the root of an
    exponentially decaying average of their squares.
    """

    def __init__(self, learning_rate=0.001, rho=0.9, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.rho = rho
        self.epsilon = epsilon
        self.Eg = None  # decaying average of squared gradients per parameter

    def update(self, params, grads):
        """Apply one RMSprop step to ``params`` (dict of arrays) using ``grads``."""
        if self.Eg is None:
            self.Eg = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            self.Eg[name] = self.rho * self.Eg[name] + (1 - self.rho) * (grad ** 2)
            scale = np.sqrt(self.Eg[name]) + self.epsilon
            params[name] -= self.learning_rate * grad / scale

            

class AdaMax:
    """AdaMax optimizer: like Adam, but the second-moment estimate is
    replaced by an exponentially weighted running maximum of |gradient|.
    """

    def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # running average of gradients
        self.u = None  # decayed running maximum of absolute gradients
        self.iteration = 0

    def update(self, params, grads):
        """Apply one AdaMax step to ``params`` (dict of arrays) using ``grads``."""
        state_missing = self.m is None or self.u is None
        if state_missing:
            self.m = {name: np.zeros_like(value) for name, value in params.items()}
            self.u = {name: np.zeros_like(value) for name, value in params.items()}

        self.iteration += 1
        # Fold the first-moment bias correction into the step size.
        step_size = self.learning_rate / (1 - self.beta1 ** self.iteration)

        for name in params:
            grad = grads[name]
            self.m[name] = self.beta1 * self.m[name] + (1 - self.beta1) * grad
            self.u[name] = np.maximum(self.beta2 * self.u[name], np.abs(grad))
            params[name] -= step_size * self.m[name] / (self.u[name] + self.epsilon)


class FTRL:
    """Follow-the-regularized-leader style optimizer with L1/L2 terms.

    Keeps two accumulators per parameter: ``n`` (sum of squared gradients)
    and ``z`` (adjusted gradient sum). Each step recomputes the parameter
    from ``z`` and ``n`` and zeroes every element whose ``|z|`` does not
    exceed ``l1``.
    """

    def __init__(self, learning_rate=0.01, beta=1.0, l1=0.1, l2=1.0):
        self.learning_rate = learning_rate
        self.beta = beta
        self.l1 = l1
        self.l2 = l2
        self.z = None
        self.n = None

    def update(self, params, grads):
        """Apply one FTRL step; ``params[key]`` is rebound to a new array."""
        state_missing = self.z is None or self.n is None
        if state_missing:
            self.z = {name: np.zeros_like(value) for name, value in params.items()}
            self.n = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            grad_sq = grad ** 2

            self.n[name] += grad_sq
            n_now = self.n[name]
            # The previous accumulator value is recovered as n_now - grad_sq.
            sigma = (np.sqrt(n_now) - np.sqrt(n_now - grad_sq)) / self.learning_rate

            self.z[name] += grad - sigma * params[name]
            z_now = self.z[name]

            shrunk = z_now - np.sign(z_now) * self.l1
            denom = (self.beta + np.sqrt(n_now)) / self.learning_rate + self.l2
            weights = - (shrunk / denom)
            # Elements with |z| <= l1 are forced to exactly zero.
            weights *= np.abs(z_now) > self.l1
            params[name] = weights

class NesterovAcceleratedGradient:
    """Momentum SGD with a Nesterov-style look-ahead.

    The parameter step combines the freshly updated velocity with the
    previous one: ``(1 + momentum) * v_new - momentum * v_prev``.
    """

    def __init__(self, learning_rate=0.01, momentum=0.9):
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.v = None  # per-parameter velocity, created on the first update

    def update(self, params, grads):
        """Apply one Nesterov momentum step to ``params`` using ``grads``."""
        if self.v is None:
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            previous = self.v[name]
            # Rebinding (not in-place) keeps `previous` pointing at the old velocity.
            current = self.momentum * previous - self.learning_rate * grads[name]
            self.v[name] = current
            params[name] += -self.momentum * previous + (1 + self.momentum) * current

class AdaDelta:
    """AdaDelta optimizer: the step is the gradient scaled by the ratio of
    the RMS of past updates to the RMS of past gradients, so no learning
    rate is needed.
    """

    def __init__(self, rho=0.95, epsilon=1e-6):
        self.rho = rho
        self.epsilon = epsilon
        self.Eg = None      # decaying average of squared gradients
        self.Edelta = None  # decaying average of squared updates

    def update(self, params, grads):
        """Apply one AdaDelta step to ``params`` (dict of arrays) using ``grads``."""
        state_missing = self.Eg is None or self.Edelta is None
        if state_missing:
            self.Eg = {name: np.zeros_like(value) for name, value in params.items()}
            self.Edelta = {name: np.zeros_like(value) for name, value in params.items()}

        keep = self.rho
        blend = 1 - self.rho
        for name in params:
            grad = grads[name]
            self.Eg[name] = keep * self.Eg[name] + blend * (grad ** 2)

            # The update uses the *previous* Edelta, before it is refreshed below.
            rms_update = np.sqrt(self.Edelta[name] + self.epsilon)
            rms_grad = np.sqrt(self.Eg[name] + self.epsilon)
            delta = - (rms_update / rms_grad) * grad

            self.Edelta[name] = keep * self.Edelta[name] + blend * (delta ** 2)
            params[name] += delta
            
class Nadam:
    """Adam with a Nesterov-style look-ahead on the first moment.

    The step direction blends the raw gradient with the bias-corrected
    first moment: ``(1 - beta1) * grad + beta1 * m_hat``. The raw-gradient
    term is used as-is, without a bias correction of its own.
    """

    def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # running average of gradients
        self.v = None  # running average of squared gradients
        self.iteration = 0

    def update(self, params, grads):
        """Apply one Nadam step to ``params`` (dict of arrays) using ``grads``."""
        state_missing = self.m is None or self.v is None
        if state_missing:
            self.m = {name: np.zeros_like(value) for name, value in params.items()}
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        self.iteration += 1
        # Bias-correction denominators depend only on the step count.
        m_correction = 1 - self.beta1 ** self.iteration
        v_correction = 1 - self.beta2 ** self.iteration

        for name in params:
            grad = grads[name]

            # Refresh both moment estimates.
            self.m[name] = self.beta1 * self.m[name] + (1 - self.beta1) * grad
            self.v[name] = self.beta2 * self.v[name] + (1 - self.beta2) * (grad ** 2)

            m_hat = self.m[name] / m_correction
            v_hat = self.v[name] / v_correction

            # Look-ahead direction: current gradient mixed with corrected momentum.
            m_bar = (1 - self.beta1) * grad + self.beta1 * m_hat

            params[name] -= self.learning_rate * m_bar / (np.sqrt(v_hat) + self.epsilon)

            
# Neural Network
class NeuralNetwork:
    """Two-layer fully connected classifier: ReLU hidden layer, softmax output.

    All trainable arrays live in ``self.params`` under the keys ``'W1'``,
    ``'b1'``, ``'W2'`` and ``'b2'``; optimizers update that dict directly.
    """

    def __init__(self, input_size, hidden_size, output_size, seed=None):
        """Create the weight matrices (small random values) and zero biases.

        Args:
            input_size: number of input features.
            hidden_size: number of hidden units.
            output_size: number of output classes.
            seed: optional seed for reproducible weights. When ``None`` the
                weights are drawn from NumPy's global random state, so
                ``np.random.seed`` still controls them.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.params = {
            'W1': rng.randn(input_size, hidden_size) * 0.1,
            'b1': np.zeros(hidden_size),
            'W2': rng.randn(hidden_size, output_size) * 0.1,
            'b2': np.zeros(output_size),
        }

    def forward(self, X):
        """Run the network on ``X`` and return ``(a1, a2)``.

        ``a1`` is the hidden-layer ReLU activation and ``a2`` the softmax
        output; both are needed by ``backward``.
        """
        z1 = np.dot(X, self.params['W1']) + self.params['b1']
        a1 = np.maximum(0, z1)  # ReLU activation
        z2 = np.dot(a1, self.params['W2']) + self.params['b2']
        a2 = softmax(z2)  # Softmax activation
        return a1, a2

    def backward(self, X, a1, a2, y):
        """Return gradients for every entry of ``self.params``.

        ``a1`` and ``a2`` are the values returned by ``forward(X)`` and ``y``
        holds the one-hot targets. The gradients are summed over the rows of
        ``X`` (no division by the batch size), whereas ``compute_loss``
        reports a mean, so the effective step grows with the batch size.
        """
        # Softmax combined with cross-entropy gives this simple output error.
        dz2 = a2 - y
        dW2 = np.dot(a1.T, dz2)
        db2 = np.sum(dz2, axis=0)

        da1 = np.dot(dz2, self.params['W2'].T)
        # Derivative of ReLU: no gradient flows through inactive units.
        dz1 = np.where(a1 <= 0, 0.0, da1)
        dW1 = np.dot(X.T, dz1)
        db1 = np.sum(dz1, axis=0)

        return {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}

    def compute_loss(self, y_hat, y):
        """Return the mean cross-entropy of ``y_hat`` against one-hot ``y``."""
        # Using a small epsilon to avoid log(0)
        epsilon = 1e-15
        true_class = y.argmax(axis=1)
        true_class_prob = y_hat[np.arange(len(y_hat)), true_class]
        return -np.mean(np.log(true_class_prob + epsilon))

    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        _, y_hat = self.forward(X)
        return y_hat.argmax(axis=1)

    def train(self, X_train, y_train, X_val, y_val, epochs, optimizer, print_every=10):
        """Run full-batch training for ``epochs`` steps.

        Args:
            X_train, y_train: training inputs and one-hot targets.
            X_val, y_val: validation inputs and one-hot targets.
            epochs: number of full-batch updates to perform.
            optimizer: object exposing ``update(params, grads)``.
            print_every: report losses on epochs divisible by this value;
                ``0`` or ``None`` disables reporting.

        The reported training loss is measured before that epoch's update,
        the validation loss after it.
        """
        for epoch in range(epochs):
            a1, a2 = self.forward(X_train)
            grads = self.backward(X_train, a1, a2, y_train)
            optimizer.update(self.params, grads)

            # Losses are only needed for reporting, so skip the extra
            # validation forward pass on epochs that print nothing.
            if print_every and epoch % print_every == 0:
                loss = self.compute_loss(a2, y_train)
                _, a2_val = self.forward(X_val)
                val_loss = self.compute_loss(a2_val, y_val)
                print(f'Epoch {epoch}, Loss: {loss}, Validation Loss: {val_loss}')

# Load Iris dataset
# Fetch the Iris measurements and their integer class labels
iris = load_iris()
X = iris.data
y = iris.target

# Turn the integer labels into dense one-hot rows (one column per class)
encoder = OneHotEncoder(sparse_output=False)  # Updated parameter
y_onehot = encoder.fit_transform(y[:, np.newaxis])

# Hold out 20% of the samples for validation; fixed seed for a repeatable split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_onehot,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training data only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Build the network: one input per feature, 10 hidden units,
# one output per class
nn = NeuralNetwork(
    input_size=X_train_scaled.shape[1],
    hidden_size=10,
    output_size=y_train.shape[1],
)

# Choose the optimizer (an Adam alternative is kept below for comparison)
optimizer = SGDWithMomentum(learning_rate=0.01, momentum=0.9)
# optimizer = Adam(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8)

# Run full-batch training
epochs = 200
nn.train(X_train_scaled, y_train, X_val_scaled, y_val, epochs, optimizer)

# Class predictions for the held-out samples
y_pred = nn.predict(X_val_scaled)

# Recover integer labels from the one-hot validation targets
y_val_labels = y_val.argmax(axis=1)

# Report validation accuracy
accuracy = accuracy_score(y_val_labels, y_pred)
print(f'Validation accuracy: {accuracy:.2f}')


Epoch 0, Loss: 1.105364304612469, Validation Loss: 1.0506513929506545
Epoch 10, Loss: 0.13816787720294677, Validation Loss: 0.044747994374113725
Epoch 20, Loss: 0.057654666599093664, Validation Loss: 0.022059327056193265
Epoch 30, Loss: 0.05058895655218387, Validation Loss: 0.02288998463529905
Epoch 40, Loss: 0.04833563733898785, Validation Loss: 0.009776008025516628
Epoch 50, Loss: 0.04744727936003065, Validation Loss: 0.013258128231685337
Epoch 60, Loss: 0.0470880181821431, Validation Loss: 0.014001155969004418
Epoch 70, Loss: 0.04677522108942112, Validation Loss: 0.012438450292568428
Epoch 80, Loss: 0.0466565855657776, Validation Loss: 0.015292039227321046
Epoch 90, Loss: 0.04660256431345639, Validation Loss: 0.014676075558936397
Epoch 100, Loss: 0.046587029974382674, Validation Loss: 0.01504352983200253
Epoch 110, Loss: 0.046582650519847044, Validation Loss: 0.015288605755247245
Epoch 120, Loss: 0.046581456208505, Validation Loss: 0.015298583893257828
Epoch 130, Loss: 0.04658101122