# use symthetic data
# keep in mind network should converge 
# use the best weight initialization strategy
# implement backpropogation from scratch 
# loss binary cross entropy

In [None]:

import numpy as np

# Activation functions
def sigmoid(x):
    x = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def Relu(x):
    return np.maximum(0 ,X)
def relu_deriv(x):
    return (x> 0 ).astype(float)


class MultiLayerPerceptron:
    def __init__(self, input_size=1, hidden1_size=4, output_size=1, learning_rate=0.01):
        self.input_size = input_size
        self.hidden1_size = hidden1_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        
        
        self.w1 = np.random.randn(input_size, hidden1_size) * 0.01
        self.b1 = np.zeros((1, hidden1_size))    
        
        # He initialization for second layer
        self.w2 = np.random.randn(hidden1_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))
        
        
        self.V_w1 = np.zeros_like
        
    def forward(self, X):
        self.X = X
        
        # Input to hidden layer
        self.z1 = np.dot(X, self.w1) + self.b1
        self.a1 = sigmoid(self.z1)
        
        # Hidden layer to output
        self.z2 = np.dot(self.a1, self.w2) + self.b2
        self.output = sigmoid(self.z2)
        
        return self.output
    
    def backward(self, y):
        m = self.X.shape[0] 
        
        
        epsilon = 1e-15  # to avoid log(0)
        self.output = np.clip(self.output, epsilon, 1 - epsilon)
        loss = -np.mean(y * np.log(self.output) + (1 - y) * np.log(1 - self.output))
        
        
        delta_output = self.output - y
        
        # Gradients for output layer
        delta2 = delta_output  
        dw2 = np.dot(self.a1.T, delta2) / m
        db2 = np.sum(delta2, axis=0, keepdims=True) / m
        
        # Gradients for first hidden layer
        delta1 = np.dot(delta2, self.w2.T) * sigmoid_derivative(self.a1)
        dw1 = np.dot(self.X.T, delta1) / m
        db1 = np.sum(delta1, axis=0, keepdims=True) / m
        
        # Store gradients
        self.dw1, self.db1 = dw1, db1
        self.dw2, self.db2 = dw2, db2
        
        return loss
    
    def update_weights(self):
        
        self.w1 -= self.learning_rate * self.dw1
        self.b1 -= self.learning_rate * self.db1
        
        self.w2 -= self.learning_rate * self.dw2
        self.b2 -= self.learning_rate * self.db2
    
    def train_iteration(self, X, y):
        # One iteration of training
        self.forward(X)
        loss = self.backward(y)
        self.update_weights()
        return loss
    
    def train(self, X, y, epochs=1000, verbose=True):
        """Train the network for multiple epochs"""
        losses = []
        for epoch in range(epochs):
            loss = self.train_iteration(X, y)
            losses.append(loss)
            
            if verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}")
                
        return losses


np.random.seed(42)
num_samples = 100


X = np.random.randn(num_samples, 1) * 2 

y = (X > 0).astype(float)  
print(y)

train_ratio = 0.8
train_size = int(num_samples * train_ratio)

X_train, X_test = X[:train_size], X[train_size:]
print(X_test)
y_train, y_test = y[:train_size], y[train_size:]


mlp = MultiLayerPerceptron(input_size=1, hidden1_size=4, output_size=1, learning_rate=0.1)

# Print initial weights
print("Initial weights:")
print("W1 (input -> hidden1):")
print(mlp.w1)
print("\nW2 (hidden1 -> output):")
print(mlp.w2)

# Train the model
losses = mlp.train(X_train, y_train, epochs=1000, verbose=True)

# Evaluate the model
y_pred_train = mlp.forward(X_train)
y_pred_train_binary = (y_pred_train > 0.5).astype(float)
train_accuracy = np.mean(y_pred_train_binary == y_train)

y_pred_test = mlp.forward(X_test)
y_pred_test_binary = (y_pred_test > 0.5).astype(float)
test_accuracy = np.mean(y_pred_test_binary == y_test)

print("\nTraining accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

# Display some sample predictions
print("\nSample predictions:")
for i in range(5):
    x_sample = X_test[i]
    y_true = y_test[i]
    y_pred = mlp.forward(x_sample.reshape(1, -1))[0][0]
    
    print(f"Input: {x_sample[0]:.4f}, True: {int(y_true[0])}, Predicted: {y_pred:.4f}, Binary Prediction: {int(y_pred > 0.5)}")

[[1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]]
[[-0.43934378]
 [ 0.71422514]
 [ 2.95578809]
 [-1.03654044]
 [-1.61698721]
 [-1.00351409]
 [ 1.83080424]
 [ 0.65750222]
 [-1.05952041]
 [ 1.02653487]
 [ 0.1941551 ]
 [ 1.93728998]
 [-1.40410619]
 [-0.65532429]
 [-0.78421631]
 [-2.9270299 ]
 [ 0.59224055]
 [ 0.52211054]
 [ 0.01022691]
 [-0.46917427]]
Initial weights:
W1 (input -> hidden1):
[[-0.01415371 -0.00420645 -0.00342715 -0.00802277]]

W2 (h

In [4]:
import numpy as np

# Activation functions
def sigmoid(x):
    x = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

class MultiLayerPerceptron:
    def __init__(self, input_size=1, hidden1_size=4, output_size=1, learning_rate=0.01, optimizer='adam'):
        self.input_size = input_size
        self.hidden1_size = hidden1_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.optimizer = optimizer.lower()

        # Initialize weights and biases
        self.w1 = np.random.randn(input_size, hidden1_size) * 0.01
        self.b1 = np.zeros((1, hidden1_size))    
        self.w2 = np.random.randn(hidden1_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

        # Common velocity terms
        self.v_w1 = np.zeros_like(self.w1)
        self.v_b1 = np.zeros_like(self.b1)
        self.v_w2 = np.zeros_like(self.w2)
        self.v_b2 = np.zeros_like(self.b2)

        # For AdaGrad, RMSProp, Adam
        self.eps = 1e-8
        self.G_w1 = np.zeros_like(self.w1)
        self.G_b1 = np.zeros_like(self.b1)
        self.G_w2 = np.zeros_like(self.w2)
        self.G_b2 = np.zeros_like(self.b2)

        # Adam specific
        self.m_w1 = np.zeros_like(self.w1)
        self.vv_w1 = np.zeros_like(self.w1)
        self.m_b1 = np.zeros_like(self.b1)
        self.vv_b1 = np.zeros_like(self.b1)
        self.m_w2 = np.zeros_like(self.w2)
        self.vv_w2 = np.zeros_like(self.w2)
        self.m_b2 = np.zeros_like(self.b2)
        self.vv_b2 = np.zeros_like(self.b2)
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.t = 0

        # RMSProp
        self.decay_rate = 0.9

    def forward(self, X):
        self.X = X
        self.z1 = np.dot(X, self.w1) + self.b1
        self.a1 = sigmoid(self.z1)
        self.z2 = np.dot(self.a1, self.w2) + self.b2
        self.output = sigmoid(self.z2)
        return self.output

    def backward(self, y):
        m = self.X.shape[0]
        output_clipped = np.clip(self.output, 1e-15, 1 - 1e-15)
        loss = -np.mean(y * np.log(output_clipped) + (1 - y) * np.log(1 - output_clipped))

        delta_output = self.output - y
        dw2 = np.dot(self.a1.T, delta_output) / m
        db2 = np.sum(delta_output, axis=0, keepdims=True) / m
        delta1 = np.dot(delta_output, self.w2.T) * sigmoid_derivative(self.a1)
        dw1 = np.dot(self.X.T, delta1) / m
        db1 = np.sum(delta1, axis=0, keepdims=True) / m

        return loss, dw1, db1, dw2, db2

    def update_weights(self, dw1, db1, dw2, db2):
        if self.optimizer == 'momentum':
            gamma = 0.9
            self.v_w1 = gamma * self.v_w1 + self.learning_rate * dw1
            self.v_b1 = gamma * self.v_b1 + self.learning_rate * db1
            self.v_w2 = gamma * self.v_w2 + self.learning_rate * dw2
            self.v_b2 = gamma * self.v_b2 + self.learning_rate * db2
            self.w1 -= self.v_w1
            self.b1 -= self.v_b1
            self.w2 -= self.v_w2
            self.b2 -= self.v_b2

        elif self.optimizer == 'nag':
            gamma = 0.9
            # Look ahead step
            w1_ahead = self.w1 - gamma * self.v_w1
            b1_ahead = self.b1 - gamma * self.v_b1
            w2_ahead = self.w2 - gamma * self.v_w2
            b2_ahead = self.b2 - gamma * self.v_b2

            z1 = np.dot(self.X, w1_ahead) + b1_ahead
            a1 = sigmoid(z1)
            z2 = np.dot(a1, w2_ahead) + b2_ahead
            output = sigmoid(z2)
            delta_output = output - self.y_batch
            dw2 = np.dot(a1.T, delta_output) / self.X.shape[0]
            db2 = np.sum(delta_output, axis=0, keepdims=True) / self.X.shape[0]
            delta1 = np.dot(delta_output, w2_ahead.T) * sigmoid_derivative(a1)
            dw1 = np.dot(self.X.T, delta1) / self.X.shape[0]
            db1 = np.sum(delta1, axis=0, keepdims=True) / self.X.shape[0]

            # Update
            self.v_w1 = gamma * self.v_w1 + self.learning_rate * dw1
            self.v_b1 = gamma * self.v_b1 + self.learning_rate * db1
            self.v_w2 = gamma * self.v_w2 + self.learning_rate * dw2
            self.v_b2 = gamma * self.v_b2 + self.learning_rate * db2
            self.w1 -= self.v_w1
            self.b1 -= self.v_b1
            self.w2 -= self.v_w2
            self.b2 -= self.v_b2

        elif self.optimizer == 'adagrad':
            self.G_w1 += dw1 ** 2
            self.G_b1 += db1 ** 2
            self.G_w2 += dw2 ** 2
            self.G_b2 += db2 ** 2
            self.w1 -= self.learning_rate * dw1 / (np.sqrt(self.G_w1) + self.eps)
            self.b1 -= self.learning_rate * db1 / (np.sqrt(self.G_b1) + self.eps)
            self.w2 -= self.learning_rate * dw2 / (np.sqrt(self.G_w2) + self.eps)
            self.b2 -= self.learning_rate * db2 / (np.sqrt(self.G_b2) + self.eps)

        elif self.optimizer == 'rmsprop':
            self.G_w1 = self.decay_rate * self.G_w1 + (1 - self.decay_rate) * (dw1 ** 2)
            self.G_b1 = self.decay_rate * self.G_b1 + (1 - self.decay_rate) * (db1 ** 2)
            self.G_w2 = self.decay_rate * self.G_w2 + (1 - self.decay_rate) * (dw2 ** 2)
            self.G_b2 = self.decay_rate * self.G_b2 + (1 - self.decay_rate) * (db2 ** 2)
            self.w1 -= self.learning_rate * dw1 / (np.sqrt(self.G_w1) + self.eps)
            self.b1 -= self.learning_rate * db1 / (np.sqrt(self.G_b1) + self.eps)
            self.w2 -= self.learning_rate * dw2 / (np.sqrt(self.G_w2) + self.eps)
            self.b2 -= self.learning_rate * db2 / (np.sqrt(self.G_b2) + self.eps)

        elif self.optimizer == 'adam':
            self.t += 1
            self.m_w1 = self.beta1 * self.m_w1 + (1 - self.beta1) * dw1
            self.vv_w1 = self.beta2 * self.vv_w1 + (1 - self.beta2) * (dw1 ** 2)
            m_hat_w1 = self.m_w1 / (1 - self.beta1 ** self.t)
            v_hat_w1 = self.vv_w1 / (1 - self.beta2 ** self.t)

            self.m_b1 = self.beta1 * self.m_b1 + (1 - self.beta1) * db1
            self.vv_b1 = self.beta2 * self.vv_b1 + (1 - self.beta2) * (db1 ** 2)
            m_hat_b1 = self.m_b1 / (1 - self.beta1 ** self.t)
            v_hat_b1 = self.vv_b1 / (1 - self.beta2 ** self.t)

            self.m_w2 = self.beta1 * self.m_w2 + (1 - self.beta1) * dw2
            self.vv_w2 = self.beta2 * self.vv_w2 + (1 - self.beta2) * (dw2 ** 2)
            m_hat_w2 = self.m_w2 / (1 - self.beta1 ** self.t)
            v_hat_w2 = self.vv_w2 / (1 - self.beta2 ** self.t)

            self.m_b2 = self.beta1 * self.m_b2 + (1 - self.beta1) * db2
            self.vv_b2 = self.beta2 * self.vv_b2 + (1 - self.beta2) * (db2 ** 2)
            m_hat_b2 = self.m_b2 / (1 - self.beta1 ** self.t)
            v_hat_b2 = self.vv_b2 / (1 - self.beta2 ** self.t)

            self.w1 -= self.learning_rate * m_hat_w1 / (np.sqrt(v_hat_w1) + self.eps)
            self.b1 -= self.learning_rate * m_hat_b1 / (np.sqrt(v_hat_b1) + self.eps)
            self.w2 -= self.learning_rate * m_hat_w2 / (np.sqrt(v_hat_w2) + self.eps)
            self.b2 -= self.learning_rate * m_hat_b2 / (np.sqrt(v_hat_b2) + self.eps)

        else:  # default: SGD
            self.w1 -= self.learning_rate * dw1
            self.b1 -= self.learning_rate * db1
            self.w2 -= self.learning_rate * dw2
            self.b2 -= self.learning_rate * db2

    def train(self, X, y, epochs=1000, batch_size=16, verbose=True):
        losses = []
        for epoch in range(epochs):
            indices = np.arange(X.shape[0])
            np.random.shuffle(indices)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            epoch_loss = 0
            for i in range(0, X.shape[0], batch_size):
                end = i + batch_size
                X_batch = X_shuffled[i:end]
                y_batch = y_shuffled[i:end]

                self.X = X_batch
                self.y_batch = y_batch
                self.forward(X_batch)
                loss, dw1, db1, dw2, db2 = self.backward(y_batch)
                self.update_weights(dw1, db1, dw2, db2)
                epoch_loss += loss

            avg_loss = epoch_loss / (X.shape[0] // batch_size)
            losses.append(avg_loss)

            if verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {avg_loss:.6f}")

        return losses

# === Usage Example ===
np.random.seed(0)
X = np.random.randn(100, 1)
y = (X > 0).astype(float)

mlp = MultiLayerPerceptron(input_size=1, hidden1_size=4, output_size=1, learning_rate=0.01, optimizer='nag')
mlp.train(X, y, epochs=1000, batch_size=16, verbose=True)

# Evaluate
y_pred = mlp.forward(X)
acc = np.mean((y_pred > 0.5) == y)
print("Accuracy:", acc)


Epoch 0, Loss: 0.808286
Epoch 100, Loss: 0.371731
Epoch 200, Loss: 0.132664
Epoch 300, Loss: 0.078281
Epoch 400, Loss: 0.060665
Epoch 500, Loss: 0.056182
Epoch 600, Loss: 0.039847
Epoch 700, Loss: 0.035158
Epoch 800, Loss: 0.032103
Epoch 900, Loss: 0.029068
Accuracy: 0.99
