<a href="https://colab.research.google.com/github/Colette-c/MAT-422/blob/main/HW_3_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Networks

In [9]:
import numpy as np

## Define activation functions and their derivatives
## Sigmoid functions
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``, element-wise.

    ``x`` may be a scalar or a NumPy array; every operation is element-wise,
    so an array input yields an array of the same shape.
    """
    ## exp(-x) overflows to inf for large negative x. The quotient still
    ## evaluates to the correct limit (0.0), so only that benign overflow
    ## warning is silenced; the returned values are unchanged.
    with np.errstate(over="ignore"):
        return 1 / (1 + np.exp(-x))

def sigmoid_derivative(sigmoid):
    """Return the sigmoid's derivative given the sigmoid's *output*.

    ``sigmoid`` is the already-activated value s = sigma(z), not the
    pre-activation z; the derivative is then s * (1 - s). Note that inside
    this function the parameter name hides the module-level ``sigmoid``
    function.
    """
    complement = 1 - sigmoid
    return complement * sigmoid

## Mean Squared Error Cost Functions
def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions.

    The average is taken over every element of ``y_true - y_pred``.
    """
    residual = y_true - y_pred
    squared_residual = residual ** 2
    return np.mean(squared_residual)

def mean_squared_error_derivative(y_true, y_pred):
    """Return the element-wise derivative of the squared error w.r.t. ``y_pred``.

    The result is 2 * (y_pred - y_true) for each element. It does not include
    the 1/n factor of the mean taken in ``mean_squared_error``, so it is the
    gradient of the *summed* squared error.
    """
    residual = y_pred - y_true
    return residual * 2

## Example dataset: every combination of two binary inputs, one sample per row
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])
## Targets as a column, one per sample: 1 only when both inputs are 1
y = np.array([0, 0, 0, 1]).reshape(-1, 1)

## Initialize parameters
np.random.seed(42)  ## fixed seed so the random initial weights are reproducible
## Number of neurons in the input, hidden, and output layers
input_size, hidden_size, output_size = 2, 2, 1

## Weights and biases initialization
## The arrays are drawn in the order W1, b1, W2, b2, each uniform in [0, 1)
W1, b1, W2, b2 = (
    np.random.rand(rows, cols)
    for rows, cols in (
        (input_size, hidden_size),
        (1, hidden_size),
        (hidden_size, output_size),
        (1, output_size),
    )
)

## Training parameters
learning_rate = 0.1  ## step size of each gradient-descent update
epochs = 10_000      ## number of passes over the whole dataset

## Training the neural network using backpropagation
## Each epoch is one full-batch step: forward pass, gradients, parameter update
for epoch in range(epochs):
    ## ---- Forward pass ----
    ## Hidden layer: affine map of the inputs followed by the sigmoid
    z1 = X.dot(W1) + b1
    a1 = sigmoid(z1)

    ## Output layer: affine map of the hidden activations followed by the sigmoid
    z2 = a1.dot(W2) + b2
    a2 = sigmoid(z2)

    ## Cost of this epoch's predictions (computed before the update below)
    loss = mean_squared_error(y, a2)

    ## ---- Backward pass ----
    ## Output layer: dLoss/dz2 by the chain rule, then the parameter gradients
    d_loss_a2 = mean_squared_error_derivative(y, a2)
    d_a2_z2 = sigmoid_derivative(a2)
    d_loss_z2 = d_loss_a2 * d_a2_z2
    d_loss_W2 = a1.T.dot(d_loss_z2)
    d_loss_b2 = d_loss_z2.sum(axis=0, keepdims=True)  ## summed over the samples

    ## Hidden layer: push the error back through W2 (still the pre-update
    ## weights at this point), then through the hidden sigmoid
    d_loss_a1 = d_loss_z2.dot(W2.T)
    d_a1_z1 = sigmoid_derivative(a1)
    d_loss_z1 = d_loss_a1 * d_a1_z1
    d_loss_W1 = X.T.dot(d_loss_z1)
    d_loss_b1 = d_loss_z1.sum(axis=0, keepdims=True)  ## summed over the samples

    ## ---- Gradient-descent step ----
    ## `-=` on a NumPy array updates it in place, so W2, b2, W1 and b1
    ## themselves are modified through the loop variable.
    for param, grad in (
        (W2, d_loss_W2),
        (b2, d_loss_b2),
        (W1, d_loss_W1),
        (b1, d_loss_b1),
    ):
        param -= learning_rate * grad

    ## Report progress every 1000 epochs
    if not epoch % 1000:
        print(f"Epoch {epoch}, Loss: {loss}")

## Final results: a2 holds the predictions from the last epoch's forward pass
print("\nFinal prediction values:", a2, sep="\n")
print("\nOriginal true values", y, sep="\n")

Epoch 0, Loss: 0.45420522091806426
Epoch 1000, Loss: 0.015174459356917566
Epoch 2000, Loss: 0.0034487176830037486
Epoch 3000, Loss: 0.0017542077230208244
Epoch 4000, Loss: 0.0011393208805266888
Epoch 5000, Loss: 0.0008311431778569072
Epoch 6000, Loss: 0.0006486945735062011
Epoch 7000, Loss: 0.0005290871120171327
Epoch 8000, Loss: 0.0004450843770327675
Epoch 9000, Loss: 0.0003830834304400625

Final prediction values:
[[0.00229235]
 [0.01868908]
 [0.01897543]
 [0.97494294]]

Original true values
[[0]
 [0]
 [0]
 [1]]


# Mathematical Formulation
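Each layer computes the pre-activation $z=xW+b$, where $x$ is the input (or the previous layer's output), $W$ is the weight matrix, and $b$ is the bias. The layer's output is $a=\sigma(z)$, and the output layer's activation is the prediction $\hat{y}$.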
# Activation Functions
For this example we used the sigmoid function for activation, $$\sigma(x)=\frac{1}{1+e^{-x}}$$ and its derivative, expressed in terms of the sigmoid function itself, is $$\sigma'(x)=\sigma(x)(1-\sigma(x))$$
# Cost Function
We used Mean Squared Error (MSE) as the cost function $$MSE=\frac{1}{n}\sum_{i=1}^{n}\left(y_{true}^{(i)}-y_{pred}^{(i)}\right)^2$$
# Backpropagation Algorithm
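Backpropagation applies the chain rule from the output layer back to the hidden layer to obtain the gradient of the cost with respect to each layer's weights and biases. Using the gradient calculated for each layer's weights and biases, we then update them:
$W=W-\text{learning rate} \times \text{gradient}$, $b=b-\text{learning rate} \times \text{gradient}$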
