In [1]:
import jax
print(jax.__version__)

0.4.30


## Activation Functions

In [2]:
import numpy as np

In [3]:
"""
Sigmoid function
Sigmoid squashes input to a range between O and 1 , fitting well for binary classification tasks.

"""
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [4]:
"""
Rectified Linear Unit (ReLU)
ReLU is a non-linear activation function that outputs the input directly if it is positive, otherwise, it outputs zero.
Enhancing effciency compared to sigmoid.

"""
def relu(x):
    return np.maximum(0, x)

In [5]:
"""
Tanh function
Tanh squashes input to a range between -1 and 1, which is useful for classification tasks.
(different than Sigmoid that squashes input to a range between O and 1 )

"""
def tanh(x):
    return np.tanh(x)

In [None]:
"""
Softmax function
Softmax squashes input to a range between O and 1, and the sum of the output is 1.
Used in the output layer for multi-class classification, converting outputs into probabilities.

"""
def softmax(x):
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exps / np.sum(exps, axis=1, keepdims=True)

## Backpropagation
Unraveling the Gradient Descent Algorithm.    
Backpropagation, the engine behind neural network training, iteratively adjusts weights and biases to minimize the
error between predicted and actual outputs.


In [6]:
# Assuming a simple neural network with one hidden layer and no bias terms
def backpropagation(inputs, targets, weights_input_hidden, weights_hidden_output,
                    learning_rate=1.0):
    """Run one gradient-descent step on a one-hidden-layer sigmoid network.

    Both layers use the sigmoid activation and have no bias terms. The two
    weight arrays are updated in place, so the caller's arrays change, and
    they are also returned for convenience.

    Args:
        inputs: 2-D array of samples, one row per sample (it is multiplied
            with ``weights_input_hidden`` and transposed for the update).
        targets: Desired outputs, broadcastable against the network output.
        weights_input_hidden: Float array mapping inputs to the hidden layer.
        weights_hidden_output: Float array mapping the hidden layer to outputs.
        learning_rate: Step size applied to both updates. The default of
            ``1.0`` reproduces the original unscaled update.

    Returns:
        Tuple ``(weights_input_hidden, weights_hidden_output)`` after the update.
    """
    # Forward pass
    hidden_outputs = sigmoid(np.dot(inputs, weights_input_hidden))
    final_outputs = sigmoid(np.dot(hidden_outputs, weights_hidden_output))

    # Error signal, oriented as (target - prediction)
    output_errors = targets - final_outputs

    # Backward pass: s * (1 - s) is the derivative of the sigmoid at output s
    output_grad = final_outputs * (1 - final_outputs) * output_errors
    # Must use the hidden->output weights from before they are updated below
    hidden_errors = np.dot(output_grad, weights_hidden_output.T)
    hidden_grad = hidden_outputs * (1 - hidden_outputs) * hidden_errors

    # Update weights in place. The terms are added because the error is
    # (target - prediction), which already points in the descent direction.
    weights_hidden_output += learning_rate * np.dot(hidden_outputs.T, output_grad)
    weights_input_hidden += learning_rate * np.dot(inputs.T, hidden_grad)
    return weights_input_hidden, weights_hidden_output

## Optimization techniques

### Stochastic Gradient Descent (SGD)
Stochastic Gradient Descent (SGD) is an optimization algorithm used to minimize a function by iteratively moving towards the minimum value of the function.     
It is particularly useful for training machine learning models.     
The primary goal of gradient descent is to find the model parameters that minimize the loss on the training data, so that the model is also accurate on unseen test data.  
Unlike traditional Gradient Descent, which uses the entire dataset to compute the gradient of the loss function,    
SGD randomly selects a subset of data at each step.     
This makes SGD faster and more scalable for large datasets, though it may introduce more noise into the optimization process.

In [9]:
# Skeleton of a Stochastic Gradient Descent (SGD) optimizer
def sgd_optimizer(inputs, targets, learning_rate=0.01, epochs=100):
    """Walk the per-sample SGD loop structure without training anything.

    This is a placeholder: for every sample in every epoch it only prints a
    single dot as a progress marker. ``targets`` and ``learning_rate`` are
    accepted but not used yet.

    Args:
        inputs: Sized collection of training samples.
        targets: Expected outputs (unused).
        learning_rate: Step size (unused).
        epochs: Number of passes over ``inputs``.
    """
    for _epoch in range(epochs):
        for _sample_index in range(len(inputs)):
            # The forward pass, backward pass and weight update for one
            # sample would go here.
            print('.', end='')

### Momentum
A training optimization technique used to accelerate the convergence of gradient descent algorithms.    
It helps to speed up the learning process by incorporating the direction and velocity of the previous gradients into the current update.    
Essentially, it adds a fraction of the previous update step to the current update step,    
allowing the algorithm to move faster through shallow regions and dampening oscillations in steep regions.     
This technique is inspired by the physical concept of momentum, where an object in motion tends to stay in motion.    
In the context of neural network training, it helps to overcome some of the limitations of standard gradient descent    
by making the path towards the minimum more direct and thus potentially reducing the number of iterations needed to reach convergence.


In [None]:
def momentum_optimizer(inputs, targets, learning_rate=0.01, epochs=100, momentum=0.9,
                       weights=None, gradient_fn=None):
    """Fit weights with per-sample gradient descent plus momentum.

    For every sample the velocity is refreshed as
    ``velocity = momentum * velocity + learning_rate * gradient`` and then
    subtracted from the weights.

    Args:
        inputs: Array-like of samples, one per entry along the first axis.
        targets: Array-like of expected outputs, aligned with ``inputs``.
        learning_rate: Step size applied to each gradient.
        epochs: Number of passes over the data.
        momentum: Fraction of the previous velocity carried into the next step.
        weights: Optional starting weights. Defaults to zeros shaped like a
            single sample. The caller's array is not modified.
        gradient_fn: Optional callable ``gradient_fn(weights, x, y)`` returning
            the loss gradient with respect to ``weights`` for one sample.
            Defaults to the squared-error gradient of a linear model.

    Returns:
        The weights after the final update.
    """
    inputs = np.asarray(inputs, dtype=float)
    targets = np.asarray(targets, dtype=float)
    if weights is None:
        weights = np.zeros(inputs.shape[1:])
    else:
        weights = np.array(weights, dtype=float)  # copy: leave the caller's data alone
    if gradient_fn is None:
        gradient_fn = _linear_squared_error_gradient

    velocity = np.zeros_like(weights)
    for _epoch in range(epochs):
        for sample, target in zip(inputs, targets):
            gradient = gradient_fn(weights, sample, target)
            velocity = momentum * velocity + learning_rate * gradient
            weights = weights - velocity
    return weights


def _linear_squared_error_gradient(weights, sample, target):
    """Return the gradient of ``0.5 * (sample . weights - target) ** 2``."""
    return (np.dot(sample, weights) - target) * sample

### Adaptive Learning Rate
A training optimization technique that adjusts the learning rate dynamically during the training of a model.    
Unlike fixed learning rate strategies, adaptive methods modify the learning rate for each parameter,    
based on the history of gradients for that parameter.     
This approach helps in addressing issues like choosing an appropriate learning rate or adjusting it during training,     
which can significantly affect the convergence speed and quality of the final model.    
Techniques such as AdaGrad, RMSprop, and Adam are examples of adaptive learning rate methods.     
They aim to decrease the learning rate for parameters with large gradients to avoid overshooting and   
increase it for parameters with small gradients to speed up the learning process.    
This results in a more efficient and effective training process, especially for complex models and large datasets.

In [10]:
def adaptive_learning_rate_optimizer(inputs, targets, learning_rate=0.01, epochs=100,
                                     decay_rate=None):
    """Compute an inverse-time-decay learning-rate schedule, one rate per epoch.

    The rate for epoch ``t`` is ``learning_rate / (1 + decay_rate * t)``. It is
    always derived from the initial rate, so the decay does not compound from
    one step to the next. No model is trained here: ``inputs`` and ``targets``
    are accepted for the training step that would use each rate.

    Args:
        inputs: Training samples (unused by the schedule itself).
        targets: Expected outputs (unused by the schedule itself).
        learning_rate: Initial learning rate, used at epoch 0.
        epochs: Number of epochs to produce rates for.
        decay_rate: Decay coefficient. Defaults to ``learning_rate``.

    Returns:
        List with the learning rate for each epoch, in order.
    """
    if decay_rate is None:
        decay_rate = learning_rate

    schedule = []
    for epoch in range(epochs):
        current_rate = learning_rate / (1.0 + decay_rate * epoch)
        schedule.append(current_rate)
        # The forward pass, backward pass and weight update over
        # inputs/targets would run here, using current_rate as the step size.
    return schedule

### Regularization Technique
A training optimization technique used to prevent overfitting in machine learning models.     
Overfitting occurs when a model learns the training data too well,     
capturing noise along with the underlying patterns, which results in poor performance on new, unseen data.     
Regularization addresses this issue by adding a penalty on the size of the model parameters to the loss function.     
This penalty encourages the model to keep the weights small, which can lead to simpler models that generalize better to new data.     
<br>
There are several types of regularization techniques, including:  
<br>
L1 Regularization (Lasso): Adds a penalty proportional to the sum of the absolute values of the coefficients.  
This can lead to sparse models where some weights become exactly zero, effectively performing feature selection.  
<br>
L2 Regularization (Ridge): Adds a penalty proportional to the sum of the squares of the coefficients.  
This discourages large weights but does not necessarily drive them to zero.  
<br>
Elastic Net: Combines L1 and L2 regularization, adding both penalties to the loss function.  
This method enjoys the feature selection property of L1 and the smoothing of L2 regularization.  
<br>
Regularization techniques are widely used in linear regression, logistic regression, neural networks, and     
many other machine learning algorithms to improve their generalization capabilities.

In [11]:
def dropout(inputs, dropout_rate=0.2):
    """Apply inverted dropout to ``inputs``.

    Each element is zeroed with probability ``dropout_rate``. Surviving
    elements are divided by ``1 - dropout_rate`` so the expected value of
    the output matches the input.

    Args:
        inputs: Array-like of activations.
        dropout_rate: Probability of dropping an element, within ``[0, 1]``.

    Returns:
        Float array with the shape of ``inputs``.

    Raises:
        ValueError: If ``dropout_rate`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= dropout_rate <= 1.0:
        raise ValueError(f"dropout_rate must be within [0, 1], got {dropout_rate}")

    inputs = np.asarray(inputs)
    keep_prob = 1.0 - dropout_rate
    if keep_prob == 0.0:
        # Everything is dropped. Return zeros directly, because rescaling by
        # 1 / keep_prob would compute 0 / 0 and fill the output with NaN.
        return np.zeros(inputs.shape, dtype=np.result_type(inputs.dtype, np.float64))

    mask = (np.random.rand(*inputs.shape) < keep_prob) / keep_prob
    return inputs * mask

def weight_decay(weights, decay_rate=0.001):
    """Shrink ``weights`` towards zero by the fraction ``decay_rate``.

    Args:
        weights: Scalar or array of weights.
        decay_rate: Fraction of each weight to remove.

    Returns:
        ``weights`` minus ``decay_rate`` times ``weights``.
    """
    shrinkage = decay_rate * weights
    return weights - shrinkage

## Implementing a Multilayer Perceptron (MLP)
Implement a simple multilayer perceptron (MLP) for a binary classification problem.    
Use NumPy for matrix operations and implement both forward and backward passes.     
Additionally, include a training loop to update the weights and biases using gradient descent.  <br>

#### Requirements:
1. Design a multilayer perceptron with:
   - Input layer with 5 neurons.
   - Hidden layer with 10 neurons, using a ReLU activation function.
   - Output layer with 1 neuron and a sigmoid activation function.
2. Implement forward pass logic to compute the predicted output.
3. Implement backward pass logic to calculate gradients and update weights and biases using gradient descent.
4. Create a simple dataset for binary classification (e.g., use NumPy to generate random data).
5. Train your MLP on the dataset for a specified number of epochs.

In [14]:
# Multilayer Perceptron (MLP)
import numpy as np

# Define the MLP architecture and training hyperparameters
input_size = 5
hidden_size = 10
output_size = 1
learning_rate = 0.01
epochs = 1000

# Initialize weights and biases.
# The weights come from a dedicated, seeded generator so every run starts from
# the same network. Drawing them from the global NumPy state before it is
# seeded would give different weights on each run.
init_rng = np.random.default_rng(0)
weights_input_hidden = init_rng.standard_normal((input_size, hidden_size))
biases_hidden = np.zeros((1, hidden_size))
weights_hidden_output = init_rng.standard_normal((hidden_size, output_size))
biases_output = np.zeros((1, output_size))

# Activation functions
def relu(x):
    """Return ``x`` with every negative entry replaced by zero."""
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The value is built from ``exp(-|x|)``, which never exceeds 1, because the
    naive form overflows in ``exp`` for large negative ``x``.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Values between 0 and 1 with the shape of ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # lies in [0, 1], so it cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: the algebraically equal e^x / (1 + e^x).
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return result[()]  # unwrap a 0-d array so scalar input yields a scalar

# Forward pass
def forward_pass(inputs):
    """Compute the network's prediction for ``inputs``.

    Reads the module-level weights and biases without modifying them.

    Args:
        inputs: Array of samples that can be multiplied with
            ``weights_input_hidden`` via ``np.dot``.

    Returns:
        Tuple ``(predicted_output, hidden_output)``: the sigmoid output of the
        final layer and the ReLU activations of the hidden layer.
    """
    # Hidden layer: affine transform followed by ReLU.
    hidden_output = relu(np.dot(inputs, weights_input_hidden) + biases_hidden)
    # Output layer: affine transform followed by sigmoid.
    predicted_output = sigmoid(np.dot(hidden_output, weights_hidden_output) + biases_output)
    return predicted_output, hidden_output

# Backward pass
def backward_pass(inputs, predicted_output, hidden_output, targets):
    """Backpropagate the error and update the module-level parameters in place.

    The output delta multiplies the prediction error by the sigmoid
    derivative, which is the gradient of a squared-error loss. Gradients are
    summed over the batch, not averaged.

    Args:
        inputs: 2-D array of samples that produced ``predicted_output``.
        predicted_output: Sigmoid outputs returned by the forward pass.
        hidden_output: ReLU activations returned by the forward pass.
        targets: Expected outputs, same shape as ``predicted_output``.
    """
    global weights_hidden_output, biases_output, weights_input_hidden, biases_hidden

    # Output layer: error scaled by the sigmoid derivative p * (1 - p).
    sigmoid_slope = predicted_output * (1 - predicted_output)
    output_delta = (predicted_output - targets) * sigmoid_slope

    # Hidden layer: the ReLU derivative is 1 where the unit was active, else 0.
    # Uses the hidden->output weights from before they are updated below.
    relu_active = hidden_output > 0
    hidden_delta = np.dot(output_delta, weights_hidden_output.T) * relu_active

    # All gradients are computed before any parameter is touched.
    grad_weights_output = np.dot(hidden_output.T, output_delta)
    grad_biases_output = output_delta.sum(axis=0, keepdims=True)
    grad_weights_hidden = np.dot(inputs.T, hidden_delta)
    grad_biases_hidden = hidden_delta.sum(axis=0, keepdims=True)

    # Gradient-descent step on every parameter.
    weights_hidden_output -= learning_rate * grad_weights_output
    biases_output -= learning_rate * grad_biases_output
    weights_input_hidden -= learning_rate * grad_weights_hidden
    biases_hidden -= learning_rate * grad_biases_hidden

# Build a toy binary-classification dataset
np.random.seed(42)  # fixed seed so the same samples are drawn on every run
X = np.random.rand(100, input_size)
# Label is 1 when the first two features sum to more than 1, else 0 (column vector)
y = np.reshape((X[:, 0] + X[:, 1] > 1).astype(int), (-1, 1))

# Training loop: full-batch gradient descent
for epoch in range(epochs):
    # Forward pass
    predicted_output, hidden_output = forward_pass(X)

    # Backward pass (updates the weights and biases in place)
    backward_pass(X, predicted_output, hidden_output, y)

    # Print the binary cross-entropy loss every 100 epochs
    if epoch % 100 == 0:
        # Keep predictions strictly inside (0, 1): a saturated sigmoid can
        # return exactly 0 or 1, and np.log(0) would make the loss inf/nan.
        clipped_output = np.clip(predicted_output, 1e-12, 1 - 1e-12)
        loss = -np.mean(y * np.log(clipped_output) + (1 - y) * np.log(1 - clipped_output))
        print(f"Epoch {epoch}, loss {loss}")

# Test the model on a single hand-written sample
data_point = np.array([[0.6, 0.7, 0.8, 0.9, 1.0]])
predicted_output, _ = forward_pass(data_point)
print(f"Predicted output for data point: {predicted_output}")

Epoch 0, loss 4.302259694208785
Epoch 100, loss 2.6849827636148813
Epoch 200, loss 0.29311894603362476
Epoch 300, loss 0.20835549379259646
Epoch 400, loss 0.1732184134278425
Epoch 500, loss 0.15290112611359258
Epoch 600, loss 0.13903928116198963
Epoch 700, loss 0.12842133841620013
Epoch 800, loss 0.11981254043894086
Epoch 900, loss 0.11262041788768858
Predicted output for data point: [[0.98945626]]
