<a href="https://colab.research.google.com/github/Daddy-senpaii/MLP-from-scratch/blob/main/Moon_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!git clone https://github.com/Daddy-senpaii/MLP-from-scatch.git

Cloning into 'MLP-from-scatch'...
fatal: could not read Username for 'https://github.com': No such device or address


# **Objective:**
- Build a Multi-Layer Perceptron (MLP) neural network from scratch to classify points from the make_moons dataset.
- Later, implement the same network using TensorFlow with mini-batch training.

# **1. Load the Dataset:**
- Dataset: make_moons from sklearn.datasets
- Each sample has 2 features and a binary label (0 or 1)
- Preprocess:
  - Transpose X to shape (n_features, n_samples)
  - Reshape Y to (1, n_samples) to match output layer dimension

In [None]:
import numpy as np

# **3. Parameter Initialization:**
- Weights W[l] initialized using **He initialization** for ReLU layers:
  W[l] = np.random.randn(layers_dims[l], layers_dims[l-1]) * sqrt(2 / layers_dims[l-1])
- Biases b[l] initialized as zeros: b[l] = zeros((layers_dims[l], 1))
- Store parameters in a dictionary:
  parameters = {"W1": ..., "b1": ..., "W2": ..., "b2": ..., ..., "W10": ..., "b10": ...}
- Proper initialization prevents vanishing/exploding gradients and accelerates learning

In [None]:
def initialize_parameters(dims, X):
  """Create He-initialised weights and zero biases for every layer.

  The first layer maps the X.shape[0] input features to dims[0] units; each
  following layer maps dims[i-1] units to dims[i] units, so len(dims) layers
  are created in total.

  Parameters:
  dims -- sequence of layer widths
  X -- input data of shape (n_features, n_samples); only X.shape[0] is read

  Returns:
  parameters -- dict with keys "W1", "b1", ..., "W<len(dims)>", "b<len(dims)>"
  """
  # Fan-in of each layer: the raw feature count first, then every width but the last.
  fan_ins = [X.shape[0]] + list(dims[:-1])
  parameters = {}
  for layer, (n_in, n_out) in enumerate(zip(fan_ins, dims), start=1):
    # He scaling uses the fan-in of this very layer (taken from the dims
    # argument, not from a notebook global).
    parameters["W" + str(layer)] = np.random.randn(n_out, n_in) * np.sqrt(2 / n_in)
    # Random weights already break symmetry, so the biases can start at zero.
    parameters["b" + str(layer)] = np.zeros((n_out, 1))
  return parameters



In [None]:
def relu(Z):
  """Apply the rectified linear unit element-wise.

  Parameters:
  Z -- pre-activation values

  Returns:
  A -- max(0, Z) computed element-wise
  cache -- Z itself, kept for the backward pass
  """
  rectified = np.maximum(0, Z)
  return rectified, Z

In [None]:
def sigmoid(Z):
  """Apply the logistic sigmoid element-wise.

  Parameters:
  Z -- pre-activation values

  Returns:
  A -- 1 / (1 + exp(-Z)) computed element-wise
  cache -- Z itself, kept for the backward pass
  """
  denominator = 1 + np.exp(-Z)
  return 1 / denominator, Z

In [None]:

def linear_activation_forward(A_prev , W, b , activations):
  """Run one layer: the affine step followed by its activation.

  Parameters:
  A_prev -- activations of the previous layer (or the input data)
  W -- weight matrix of this layer
  b -- bias of this layer
  activations -- name of the non-linearity, "relu" or "sigmoid"

  Returns:
  A -- output of the activation function
  cache -- ((A_prev, W, b), Z), everything the backward pass needs

  Raises:
  ValueError -- if activations is not a supported name
  """
  if activations == "relu":
    activation_fn = relu
  elif activations == "sigmoid":
    activation_fn = sigmoid
  else:
    # Fail with a clear message instead of hitting an unbound local below.
    raise ValueError("Unsupported activation: %r (expected 'relu' or 'sigmoid')" % (activations,))

  # The affine step is the same for every activation, so compute it once.
  Z = np.dot(W, A_prev) + b
  A, activation_cache = activation_fn(Z)
  return A, ((A_prev, W, b), activation_cache)



# **4. Forward Propagation:**
- Linear + Activation step for each layer:
  Z[l] = W[l] * A[l-1] + b[l]
  A[l] = activation(Z[l])
- Activations:
  - Hidden layers: ReLU
  - Output layer: Sigmoid
- Caching for Backprop:
  - Store at each layer: cache = ((A_prev, W, b), Z)
  - A_prev = input to the layer
  - W, b = parameters
  - Z = linear combination before activation
  - Store all caches in a list: caches = [cache1, cache2, ..., cacheL]
- Purpose: During backprop, caches allow efficient gradient computation without recomputation


In [None]:
def L_model_forward(X, parameters):
  """Run the full forward pass: ReLU on every layer but the last, sigmoid on the last.

  Parameters:
  X -- input data fed to the first layer
  parameters -- dict holding "W1", "b1", ..., "WL", "bL"

  Returns:
  AL -- output of the final sigmoid layer
  caches -- list with one cache per layer, in layer order
  """
  # Each layer contributes one W and one b entry.
  n_layers = len(parameters) // 2
  caches = []
  activation = X

  # Layers 1 .. L-1 use ReLU.
  for layer in range(1, n_layers):
    activation, layer_cache = linear_activation_forward(
        activation,
        parameters[f"W{layer}"],
        parameters[f"b{layer}"],
        activations="relu",
    )
    caches.append(layer_cache)

  # Layer L uses sigmoid.
  AL, layer_cache = linear_activation_forward(
      activation,
      parameters[f"W{n_layers}"],
      parameters[f"b{n_layers}"],
      activations="sigmoid",
  )
  caches.append(layer_cache)
  return AL, caches

# **5. Loss Function:**
- Binary cross-entropy loss:
  cost = -1/m * sum(Y * log(AL) + (1-Y) * log(1-AL))
- Ensure AL and Y shapes match (1, m)
- Clip AL to avoid log(0) issues
- Cost is a scalar, useful for tracking convergence

In [None]:
def calculate_cost(Y, AL):
  """Compute the mean binary cross-entropy between labels and predictions.

  Parameters:
  Y -- true labels (0 or 1) with as many entries as AL, e.g. shape (m, 1) or (1, m)
  AL -- predicted probabilities, shape (1, m)

  Returns:
  cost -- cross-entropy averaged over the m samples, as a 0-d array
  """
  # Align the labels with the predictions whatever orientation they arrive in.
  Y = Y.reshape(AL.shape)
  m = AL.shape[1]

  # Keep probabilities strictly inside (0, 1): log(0) is -inf and 0 * -inf is NaN.
  eps = 1e-15
  AL = np.clip(AL, eps, 1 - eps)

  cost = (-1/m) * (np.dot(Y, np.log(AL).T) + np.dot(1 - Y, np.log(1 - AL).T))
  return np.squeeze(cost)


In [None]:
def relu_backward(dA, cache):
  """Backpropagate through a ReLU activation.

  Parameters:
  dA -- gradient of the cost with respect to the ReLU output
  cache -- the pre-activation Z stored during the forward pass

  Returns:
  dZ -- copy of dA with entries zeroed wherever Z <= 0
  """
  inactive = cache <= 0
  # Work on a copy so the caller's dA is left untouched.
  grad = np.copy(dA)
  grad[inactive] = 0
  return grad

In [None]:
def sigmoid_backward(dA, cache):
  """Backpropagate through a sigmoid activation.

  Parameters:
  dA -- gradient of the cost with respect to the sigmoid output
  cache -- the pre-activation Z stored during the forward pass

  Returns:
  dZ -- dA * s * (1 - s), where s = sigmoid(Z)
  """
  # Recompute the sigmoid from the cached Z.
  sig = 1 / (1 + np.exp(-cache))
  return dA * sig * (1 - sig)

In [None]:
def linear_backward(dZ, cache):
  """Backpropagate through the affine part of a layer.

  Parameters:
  dZ -- gradient of the cost with respect to this layer's Z
  cache -- (A_prev, W, b) stored during the forward pass

  Returns:
  dA_prev -- gradient with respect to the layer input, W.T . dZ
  dW -- gradient with respect to W, averaged over the samples
  db -- gradient with respect to b, averaged over the samples
  """
  # b is part of the cache but is not needed for any of the gradients.
  layer_input, weights, _ = cache
  inv_m = 1 / layer_input.shape[1]

  grad_input = np.dot(weights.T, dZ)
  grad_weights = inv_m * np.dot(dZ, layer_input.T)
  grad_bias = inv_m * np.sum(dZ, axis=1, keepdims=True)
  return grad_input, grad_weights, grad_bias

In [None]:
def linear_activation_backward(dA, cache, activation):
  """Backpropagate through one layer: the activation first, then the affine step.

  Parameters:
  dA -- gradient of the cost with respect to this layer's output
  cache -- (linear_cache, activation_cache) stored during the forward pass
  activation -- name of the layer's non-linearity, "relu" or "sigmoid"

  Returns:
  dA_prev -- gradient with respect to the layer input
  dW -- gradient with respect to the layer's weights
  db -- gradient with respect to the layer's bias

  Raises:
  ValueError -- if activation is not a supported name
  """
  linear_cache, activation_cache = cache
  if activation == "relu":
    dZ = relu_backward(dA, activation_cache)
  elif activation == "sigmoid":
    dZ = sigmoid_backward(dA, activation_cache)
  else:
    # Fail with a clear message instead of hitting an unbound dZ below.
    raise ValueError("Unsupported activation: %r (expected 'relu' or 'sigmoid')" % (activation,))
  return linear_backward(dZ, linear_cache)

# **6. Backpropagation:**
- Goal: Compute gradients of cost w.r.t parameters dW, db
- Output layer:
  - Compute dAL = derivative of cost w.r.t AL
  - Backprop through sigmoid to get dZ
  - Compute dW, db, dA_prev using cached A_prev, W, b
- Hidden layers:
  - Backprop dA from next layer through ReLU
  - Compute dZ, dW, db, dA_prev using caches
- Store gradients in a dictionary:
  grads = {"dW1": ..., "db1": ..., "dW2": ..., "db2": ..., ..., "dW10": ..., "db10": ...}

In [None]:
def L_model_backward(AL, Y, caches):
  """Backpropagate the cross-entropy loss through the whole network.

  Parameters:
  AL -- output of the forward pass (sigmoid probabilities)
  Y -- true labels; reshaped to AL's shape
  caches -- list of per-layer caches returned by L_model_forward

  Returns:
  grads -- dict with "dW<l>" and "db<l>" for l = 1..L, plus "dA<l>" for
           l = 0..L-1, where "dA<l>" is the gradient with respect to the
           input of layer l+1
  """
  grads = {}
  L = len(caches)
  Y = Y.reshape(AL.shape)

  # A saturated sigmoid returns exactly 0 or 1; clipping keeps the divisions
  # below finite so inf/NaN cannot spread into every gradient.
  eps = 1e-15
  AL = np.clip(AL, eps, 1 - eps)
  dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

  # Output layer (sigmoid).
  grads[f"dA{L-1}"], grads[f"dW{L}"], grads[f"db{L}"] = linear_activation_backward(
      dAL, caches[L-1], "sigmoid")

  # Hidden layers (ReLU), walking from layer L-1 down to layer 1.
  for l in reversed(range(L-1)):
    dA_prev, dW, db = linear_activation_backward(grads[f"dA{l+1}"], caches[l], "relu")
    grads[f"dA{l}"] = dA_prev
    grads[f"dW{l+1}"] = dW
    grads[f"db{l+1}"] = db
  return grads

# **7. Parameter Update:**
- Update all parameters using gradient descent:
  W[l] = W[l] - learning_rate * dW[l]
  b[l] = b[l] - learning_rate * db[l]

In [None]:
def update_parameters(parameters, grads, learning_rate):
  """Take one gradient-descent step on every weight and bias.

  Parameters:
  parameters -- dict holding "W1", "b1", ..., "WL", "bL"; its entries are replaced
  grads -- dict holding "dW1", "db1", ..., "dWL", "dbL"
  learning_rate -- step size

  Returns:
  parameters -- the same dict object, now holding the updated arrays
  """
  n_layers = len(parameters) // 2
  for layer in range(1, n_layers + 1):
    for name in ("W", "b"):
      key = name + str(layer)
      # Build a new array rather than updating in place, so the previous
      # array object is left unmodified.
      parameters[key] = parameters[key] - learning_rate * grads["d" + key]
  return parameters


# **8. Training Loop:**
- For each epoch:
  1. Forward pass → compute AL and caches
  2. Compute cost
  3. Backprop → compute gradients using caches
  4. Update parameters
  5. Track cost over epochs for plotting and convergence

In [None]:
def nn_model(X, y, parameters, num_iterations=10000, learning_rate=0.001, print_cost=True):
  """Train the network with full-batch gradient descent.

  Parameters:
  X -- input data passed to L_model_forward
  y -- true labels
  parameters -- initial weights and biases
  num_iterations -- number of gradient-descent steps
  learning_rate -- step size handed to update_parameters
  print_cost -- when true, print the cost every 1000 iterations

  Returns:
  parameters -- the parameters after the final update
  """
  for step in range(num_iterations):
    # Forward pass, loss, backward pass, then one descent step.
    predictions, layer_caches = L_model_forward(X, parameters)
    cost = calculate_cost(y, predictions)
    gradients = L_model_backward(predictions, y, layer_caches)
    parameters = update_parameters(parameters, gradients, learning_rate)

    if step % 1000 == 0 and print_cost:
      print("Cost after iteration %i: %f" %(step, cost))
  return parameters

# ***Define Network Architecture:***
- layers_dims defines the network dynamically:
  Example: layers_dims = [2, 10, 10, 10, 10, 10, 10, 10, 10, 1]
  - Input layer: 2 neurons
  - Hidden layers: 10 neurons each (8 hidden layers total)
  - Output layer: 1 neuron (Sigmoid)
- This allows building a **deep network** with any number of layers by just changing layers_dims

In [None]:
from sklearn.datasets import make_moons
# Generate the two-moons training set.
X, y = make_moons(noise=0.2, n_samples=60000, random_state=42)
# Put the samples in columns and turn the labels into a column vector.
X, y = X.T, y.reshape(-1, 1)
# After the transpose X has only two rows, so this slice prints the whole (abbreviated) array.
print(X[:10])

# Layer widths handed to initialize_parameters: 2, eight times 10, then 1.
layers_dims = [2] + [10] * 8 + [1]
parameters = initialize_parameters(layers_dims, X)
# AL , caches = L_model_forward(X, parameters)



[[ 0.16792167  0.41235714  0.48607938 ...  1.11629467  0.10118756
   2.0014006 ]
 [ 0.92371313 -0.17840666 -0.1326211  ...  0.24992198  1.16326836
  -0.01490159]]


In [None]:
# Train with nn_model's default iteration count and learning rate.
simple_gd_parameters = nn_model(X=X, y=y, parameters=parameters)
# Prints the learned weights and biases (the label text is kept as recorded in the output).
print('updated_cost', simple_gd_parameters)

Cost after iteration 0: 0.730999
Cost after iteration 1000: 0.630428
Cost after iteration 2000: 0.575786
Cost after iteration 3000: 0.540739
Cost after iteration 4000: 0.519240
Cost after iteration 5000: 0.500552
Cost after iteration 6000: 0.483797
Cost after iteration 7000: 0.468850
Cost after iteration 8000: 0.455482
Cost after iteration 9000: 0.443378
updated_cost {'W1': array([[ 0.14640252,  0.19784595],
       [ 1.86896081, -0.31896918]]), 'b1': array([[-3.04382469],
       [ 1.63138978]]), 'W2': array([[ 1.39924082, -0.11715749],
       [-1.32255596, -1.65595481],
       [ 1.12836543,  0.06593403],
       [-0.33251112, -0.18852785],
       [-0.02684548,  0.79598643],
       [-1.30615884, -0.27663236],
       [-1.31471307,  0.22962215],
       [-0.96845126,  1.37894979],
       [ 0.75287148,  0.07424349],
       [-0.55792244, -1.67430971]]), 'b2': array([[ 0.51348878],
       [ 2.5035568 ],
       [ 1.156786  ],
       [ 0.10651423],
       [-0.65671659],
       [-1.72210378],
   

In [None]:
# Cost on the training data with the trained parameters.
AL, caches = L_model_forward(X=X, parameters=simple_gd_parameters)
cost = calculate_cost(Y=y, AL=AL)
print(cost)

0.43227618678778806


In [None]:
# NOTE(review): the evaluation set reuses random_state=42 from the training set —
# confirm whether an independent seed was intended.
X_test, y_test = make_moons(noise=0.2, n_samples=3000, random_state=42)
# Same layout as the training data: samples in columns, labels as a column vector.
X_test, y_test = X_test.T, y_test.reshape(-1, 1)
parameters = simple_gd_parameters

# Cost on the evaluation data with the trained parameters.
AL, cache = L_model_forward(X=X_test, parameters=parameters)
cost = calculate_cost(Y=y_test, AL=AL)
print(cost)

0.43887453181058694


In [None]:
def predict(X, parameters):
    """
    Turn the network's output probabilities into hard 0/1 labels.

    Parameters:
    X -- input data (n_x, m)
    parameters -- dict of trained weights and biases

    Returns:
    predictions -- integer array of 0/1 labels with the shape of the network output
    """
    probabilities, _caches = L_model_forward(X, parameters)
    # Strictly above 0.5 counts as class 1; exactly 0.5 maps to class 0.
    above_threshold = probabilities > 0.5
    return above_threshold.astype(int)

def get_accuracy(X, y, parameters):
    """
    Return the percentage of samples whose predicted label equals the true label.

    Parameters:
    X -- input data (n_x, m)
    y -- true labels (m, 1)
    parameters -- dict of trained weights and biases

    Returns:
    accuracy -- float, accuracy percentage
    """
    predicted = predict(X, parameters)
    # predict returns a row of labels, so y is transposed to (1, m) before comparing.
    matches = predicted == y.T
    return np.mean(matches) * 100

In [None]:
# Accuracy on the data the network was trained on.
accuracy = get_accuracy(X=X, y=y, parameters=simple_gd_parameters)
print(f"Training Accuracy: {accuracy:.2f}%")

Training Accuracy: 71.03%


In [None]:
# Accuracy on the separately generated evaluation set.
accuracy_test = get_accuracy(X=X_test, y=y_test, parameters=simple_gd_parameters)
print(f"Test Accuracy: {accuracy_test:.2f}%")

Test Accuracy: 71.10%
