In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Synthetic linear data: y = 4 + 3x + unit Gaussian noise, 100 samples.
np.random.seed(42)
X = np.random.randn(100, 1) * 2
y = 3 * X + 4 + np.random.randn(100, 1)

# Prepend a column of ones so that theta[0] acts as the intercept term.
X_b = np.hstack((np.ones((100, 1)), X))

# Batch Gradient Descent

#Batch Gradient Descent
def batch_gradient_descent(X, y, alpha = 0.1, num_iters = 1000):
  """Fit linear-regression parameters with full-batch gradient descent.

  Every step uses all rows of ``X`` to compute the gradient of the mean
  squared error of ``X.dot(theta)`` against ``y``.

  Args:
    X: Design matrix of shape (m, n). Include a column of ones if an
      intercept is wanted.
    y: Targets as a column vector of shape (m, 1).
    alpha: Learning rate (step size).
    num_iters: Number of gradient steps.

  Returns:
    Parameter vector ``theta`` of shape (n, 1), started from a random
    normal draw.
  """
  m = len(y)
  # Size theta from the design matrix instead of hard-coding two parameters,
  # so the routine works for any number of feature columns.
  theta = np.random.randn(X.shape[1], 1)
  for _ in range(num_iters):
    # MSE gradient over the whole data set: (2/m) * X^T (X theta - y).
    gradient = 2/m * X.T.dot(X.dot(theta) - y)
    theta = theta - alpha * gradient
  return theta

# Fit on the bias-augmented design matrix with the default alpha and
# num_iters, then print the learned parameter vector.
theta_bgd = batch_gradient_descent(X_b, y)
print("BATCH GRADIENT DESCENT THETA",theta_bgd)

# Stochastic Gradient Descent

#Stochastic Gradient Descent
def stochastic_gradient_descent(X, y, alpha = 0.1, n_epochs = 50):
  """Fit linear-regression parameters with stochastic gradient descent.

  Each update uses a single row drawn uniformly at random (with
  replacement); one epoch consists of ``m`` such updates.

  Args:
    X: Design matrix of shape (m, n). Include a column of ones if an
      intercept is wanted.
    y: Targets as a column vector of shape (m, 1).
    alpha: Constant learning rate; no decay schedule is applied.
    n_epochs: Number of passes, each made of ``m`` single-sample updates.

  Returns:
    Parameter vector ``theta`` of shape (n, 1), started from a random
    normal draw.
  """
  m = len(y)
  # Size theta from the design matrix instead of hard-coding two parameters,
  # so the routine works for any number of feature columns.
  theta = np.random.randn(X.shape[1], 1)
  for _ in range(n_epochs):
    for _ in range(m):
      random_index = np.random.randint(m)
      # Slice (rather than index) so xi and yi stay 2-D: (1, n) and (1, 1).
      xi = X[random_index:random_index+1]
      yi = y[random_index:random_index+1]
      # Squared-error gradient for this one sample.
      gradient = 2 * xi.T.dot(xi.dot(theta) - yi)
      theta = theta - alpha * gradient
  return theta

# Fit on the bias-augmented design matrix with the default alpha and
# n_epochs, then print the learned parameter vector.
theta_sgd = stochastic_gradient_descent(X_b, y)
print("STOCHASTIC GRADIENT DESCENT THETA",theta_sgd)

# Mini-Batch Gradient Descent

# Mini-Batch Gradient Descent
def mini_batch_gradient_descent(X, y, alpha = 0.1, num_iters = 1000, batch_size = 20):
  """Fit linear-regression parameters with mini-batch gradient descent.

  Each of the ``num_iters`` epochs reshuffles the rows and then performs one
  update per consecutive slice of ``batch_size`` rows (the last slice may be
  shorter).

  Args:
    X: Design matrix of shape (m, n). Include a column of ones if an
      intercept is wanted.
    y: Targets as a column vector of shape (m, 1).
    alpha: Learning rate (step size).
    num_iters: Number of epochs (full passes over the shuffled data).
    batch_size: Number of rows per update.

  Returns:
    Parameter vector ``theta`` of shape (n, 1), started from a random
    normal draw.
  """
  m = len(y)
  # Size theta from the design matrix instead of hard-coding two parameters.
  theta = np.random.randn(X.shape[1], 1)
  for _ in range(num_iters):
    # Fresh shuffle every epoch so the batches differ between passes.
    indices = np.random.permutation(m)
    X_shuffled = X[indices]
    y_shuffled = y[indices]
    for start in range(0, m, batch_size):
      xi = X_shuffled[start:start+batch_size]
      yi = y_shuffled[start:start+batch_size]
      # Normalise by the actual batch length; the final batch can be short.
      gradient = 2/len(xi) * xi.T.dot(xi.dot(theta) - yi)
      theta = theta - alpha * gradient
  # Return only after all epochs have run; returning from inside the epoch
  # loop would stop training after the first pass.
  return theta

# Fit on the bias-augmented design matrix with the default alpha, num_iters
# and batch_size, then print the learned parameter vector.
theta_mbgd = mini_batch_gradient_descent(X_b, y)
print("MINI-BATCH GRADIENT DESCENT THETA",theta_mbgd)

def plot_gradient_descent(X, y, theta_bgd, theta_sgd, theta_mbgd):
  """Scatter the data and overlay the lines fitted by the three GD variants.

  Args:
    X: Feature values used as x-coordinates of the scatter (without the
      bias column).
    y: Target values used as y-coordinates of the scatter.
    theta_bgd: Parameters from batch gradient descent; multiplied against a
      (2, 2) matrix of [1, x] rows, so two parameters (intercept, slope)
      are expected.
    theta_sgd: Parameters from stochastic gradient descent, same layout.
    theta_mbgd: Parameters from mini-batch gradient descent, same layout.
  """
  plt.plot(X, y, "b.")
  # Draw each fitted line across the observed feature range instead of a
  # fixed [0, 2] interval, so the lines cover all of the plotted data.
  X_new = np.array([[np.min(X)], [np.max(X)]])
  X_new_b = np.c_[np.ones((2, 1)), X_new]
  # (parameters, line style, line width, legend label); BGD is drawn thicker
  # so it remains visible underneath the other two lines.
  fits = (
    (theta_bgd, "r-", 4, "BGD"),
    (theta_sgd, "g-", 2, "SGD"),
    (theta_mbgd, "y-", 2, "MBGD"),
  )
  for theta, line_style, width, label in fits:
    plt.plot(X_new, X_new_b.dot(theta), line_style, linewidth = width, label = label)
  plt.xlabel("$x_1$", fontsize=18)
  plt.ylabel("$y$", rotation=0, fontsize=18)
  plt.legend(loc="upper left", fontsize=16)
  plt.title("Gradient Descent", fontsize=16)
  plt.show()

# Show the training data together with the three fitted regression lines.
plot_gradient_descent(X, y, theta_bgd, theta_sgd, theta_mbgd)

# Advanced Optimizers
# Momentum-based Gradient Descent

def gradient_descent_with_momentum(X, y, theta, alpha, gamma, num_iters):
  """Run gradient descent with a momentum (velocity) term on a linear model.

  Args:
    X: Design matrix with one row per sample.
    y: Target values, one per sample.
    theta: Initial parameters; not modified, a new array is returned.
    alpha: Learning rate applied to the current gradient.
    gamma: Momentum coefficient applied to the previous velocity.
    num_iters: Number of update steps.

  Returns:
    The parameters after ``num_iters`` updates.
  """
  n_samples = len(y)
  weights = theta
  step = np.zeros_like(weights)

  for _ in range(num_iters):
    residual = np.dot(X, weights) - y
    # Gradient scaled by 1/m (not 2/m).
    grad = 1/n_samples * np.dot(X.T, residual)
    # Blend the previous step with the new scaled gradient, then move.
    step = gamma * step + alpha * grad
    weights = weights - step
  return weights

# Example usage on a tiny 3-sample, 2-feature problem.
# NOTE: this rebinds the notebook-level names X, y, theta, alpha, gamma and
# num_iters that other cells also assign.
X = np.array([[1,2],[3,4],[5,6]])
y = np.array([1,2,3])
theta = np.zeros(X.shape[1])  # start from all-zero parameters, one per column
alpha = 0.01  # learning rate
gamma = 0.9  # momentum coefficient
num_iters = 1000

# Run the momentum optimiser and print the resulting parameters.
theta_momentum = gradient_descent_with_momentum(X, y, theta, alpha, gamma, num_iters)
print("Optimized paramters:", theta_momentum)


# Adagrad

# Regenerate the toy regression data: 100 features drawn with np.random.rand
# and scaled by 2, targets 4 + 3x plus Gaussian noise.
np.random.seed(42)
X = np.random.rand(100, 1) * 2
y = 3 * X + 4 + np.random.randn(100, 1)

# Design matrix with a leading column of ones for the intercept.
X_b = np.hstack((np.ones((100, 1)), X))

# Hyperparameters and random starting point.
alpha, n_epochs, epsilon = 0.1, 1000, 1e-8
theta = np.random.randn(2, 1)

# Running per-parameter sum of squared gradients.
gradient_accum = np.zeros_like(theta)

for iteration in range(n_epochs):
  gradient = 2/len(X_b) * np.dot(X_b.T, np.dot(X_b, theta) - y)
  gradient_accum = gradient_accum + np.square(gradient)
  # Scale each component by the root of its accumulated squared gradients;
  # epsilon guards against division by zero.
  adjusted_gradient = gradient / (epsilon + np.sqrt(gradient_accum))
  theta = theta - alpha * adjusted_gradient

print("Optimized paramters:", theta)

# Plot the data together with the line fitted by Adagrad.
plt.scatter(X, y)
plt.plot(X, X_b.dot(theta), "r-", linewidth = 2, label = 'Adagrad')
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
# Actually call show(): a bare `plt.show` only references the function and
# does nothing, which lets later plotting commands draw onto this same figure.
plt.show()

# RMSprop

# Regenerate the toy regression data: 100 features drawn with np.random.rand
# and scaled by 2, targets 4 + 3x plus Gaussian noise.
np.random.seed(42)
X = np.random.rand(100, 1) * 2
y = 3 * X + 4 + np.random.randn(100, 1)

# Design matrix with a leading column of ones for the intercept.
X_b = np.hstack((np.ones((100, 1)), X))

# Hyperparameters (gamma is the decay rate of the squared-gradient average)
# and random starting point.
alpha, n_epochs, epsilon, gamma = 0.1, 1000, 1e-8, 0.9
theta = np.random.randn(2, 1)

# Exponentially decayed average of squared gradients, one entry per parameter.
gradient_accum = np.zeros_like(theta)

for i in range(n_epochs):
  gradient = 2/len(X_b) * np.dot(X_b.T, np.dot(X_b, theta) - y)
  gradient_accum = gamma*gradient_accum + (1-gamma)*np.square(gradient)
  # Here epsilon is added inside the square root.
  adjusted_gradient = gradient / np.sqrt(gradient_accum + epsilon)
  theta = theta - alpha * adjusted_gradient

print("Optimized paramters:", theta)

# Plot the data together with the line fitted by RMSprop.
plt.scatter(X, y)
# Label the curve with the optimiser that actually produced theta here.
plt.plot(X, X_b.dot(theta), "r-", linewidth = 2, label = 'RMSprop')
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
# Actually call show(): a bare `plt.show` only references the function and
# does nothing.
plt.show()

In [None]:

# Optimizers Using MNIST Data

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Fetch the MNIST train/test split.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Add a trailing channel axis for the CNN, convert to float32 and scale the
# pixel values to [0, 1].
X_train = X_train.reshape((60000, 28, 28, 1)).astype('float32') / 255
X_test = X_test.reshape((10000, 28, 28, 1)).astype('float32') / 255

# One-hot encode the digit labels over the 10 classes.
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

# Create a simple CNN model
def create_model(optimizer):
    """Build and compile a small CNN classifier for 28x28x1 images.

    The network is two Conv2D + MaxPooling2D stages followed by a 64-unit
    dense layer and a 10-way softmax output.

    Args:
        optimizer: Optimizer name or instance forwarded to ``model.compile``.

    Returns:
        The compiled Sequential model, using categorical cross-entropy loss
        and tracking accuracy.
    """
    cnn = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ])

    cnn.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return cnn

# Optimizers to compare, keyed by the label used in the plots.
# The optimizer only defines the update rule; how many samples feed each
# update is set by `batch_size` in `model.fit`, so the "batch" and
# "stochastic" entries both run as mini-batch SGD in the training loop.
optimizers = {
    'Batch Gradient Descent': 'sgd',  # Using SGD as a representation (Keras default settings)
    # Plain SGD without momentum; with momentum=0.9 this entry would be an
    # exact duplicate of 'Momentum' and the comparison would be meaningless.
    'Stochastic Gradient Descent': tf.keras.optimizers.SGD(learning_rate=0.01),
    'Momentum': tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
    'Adagrad': tf.keras.optimizers.Adagrad(learning_rate=0.01),
}

# Train and evaluate models using different optimizers
results = {}  # optimizer label -> per-epoch metrics dict (history.history)
for name, optimizer in optimizers.items():
    # Build a fresh model for every optimizer so the runs share no weights.
    model = create_model(optimizer)
    print(f"\nTraining with {name}...")
    # validation_split=0.2 holds out 20% of the training data; the resulting
    # val_* metrics are what the plots read.
    history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=2)
    results[name] = history.history

# Draw one figure per validation metric (accuracy first, then loss), each
# with one curve per optimizer.
for metric_key, plot_title, y_label in (
    ('val_accuracy', 'Model Accuracy with Different Optimizers', 'Validation Accuracy'),
    ('val_loss', 'Model Loss with Different Optimizers', 'Validation Loss'),
):
    plt.figure(figsize=(12, 6))
    for name, history in results.items():
        plt.plot(history[metric_key], label=name)

    plt.title(plot_title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid()
    plt.show()