# Step 1: Load MNIST dataset

In [1]:
import numpy as np
import struct

In [2]:
def load_mnist_images(filename):
    with open(filename, 'rb') as f:
        magic, num_images, rows, cols = struct.unpack('>IIII', f.read(16))
        images = np.fromfile(f, dtype=np.uint8).reshape(num_images, rows * cols)
        images = images.astype(np.float64) / 255.0  # normalize to [0, 1]
        return images


In [3]:
def load_mnist_labels(filename):
    with open(filename, 'rb') as f:
        magic, num_labels = struct.unpack('>II', f.read(8))
        labels = np.fromfile(f, dtype=np.uint8)
        return labels


In [4]:
X = load_mnist_images('train-images.idx3-ubyte')
y = load_mnist_labels('train-labels.idx1-ubyte')


# Subset the dataset to use only class 0 and class 1

In [5]:
idx = np.where((y == 0) | (y == 1))[0]
X = X[idx]
y = y[idx]

# Standardize the dataset

In [6]:

X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
X_std[X_std == 0] = 0.00000000000000001  # avoid division by zero
X = (X - X_mean) / X_std

# Add a column of ones to X for the bias term

In [7]:
X = np.hstack((np.ones((len(X), 1)), X))

# Step 3: Implement Logistic Regression

In [8]:
# Define the logistic function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Define the loss function
def compute_loss(X, y, weights, lambda_reg):
    m = X.shape[0]
    h = sigmoid(np.dot(X, weights))
    reg_term = (lambda_reg / (2 * m)) * np.sum(np.square(weights[1:]))
    loss = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) + reg_term
    return loss

# Define the gradient of the loss function
def compute_grad(X, y, weights, lambda_reg):
    m = X.shape[0]
    h = sigmoid(np.dot(X, weights))
    grad = (1 / m) * np.dot(X.T, (h - y))
    grad[1:] += (lambda_reg / m) * weights[1:]
    return grad



# Implement Logistic Regression with L1 regularization and gradient descent optimizer

In [10]:
# Implement Logistic Regression with L1 regularization and gradient descent optimizer
def logistic_regression_l1(X, y, lambda_reg, learning_rate, num_iterations):
    weights = np.zeros(X.shape[1])
    losses = []
    
    for i in range(num_iterations):
        loss = compute_loss(X, y, weights, lambda_reg)
        grad = compute_grad(X, y, weights, lambda_reg)
        
        weights -= learning_rate * grad
        losses.append(loss)
    
    return weights, losses

# Hyperparameters
lambda_1 = 0.1
lambda_2 = 0.01
learning_rate = 0.1
num_iterations = 1000

# Train logistic regression model with L1 regularization (lambda = 0.1)
weights_l1_1, losses_l1_1 = logistic_regression_l1(X, y, lambda_1, learning_rate, num_iterations)

# Train logistic regression model with L1 regularization (lambda = 0.01)
weights_l1_2, losses_l1_2 = logistic_regression_l1(X, y, lambda_2, learning_rate, num_iterations)

# Step 7: Use mini-batch gradient descent optimizer

In [12]:
# Implement mini-batch gradient descent optimizer
def mini_batch_gradient_descent(X, y, lambda_reg, learning_rate, num_epochs, batch_size):
    weights = np.zeros(X.shape[1])
    losses = []
    
    m = X.shape[0]
    num_batches = int(np.ceil(m / batch_size))
    
    for epoch in range(num_epochs):
        for batch in range(num_batches):
            start_idx = batch * batch_size
            end_idx = min((batch + 1) * batch_size, m)
            
            X_batch = X[start_idx:end_idx]
            y_batch = y[start_idx:end_idx]
            
            loss = compute_loss(X_batch, y_batch, weights, lambda_reg)
            grad = compute_grad(X_batch, y_batch, weights, lambda_reg)
            
            weights -= learning_rate * grad
            losses.append(loss)
    
    return weights, losses

# Hyperparameters
num_epochs = 50
batch_size_1 = 64
batch_size_2 = 128

# Train logistic regression model using mini-batch gradient descent (batch size = 64)
weights_mb_1, losses_mb_1 = mini_batch_gradient_descent(X, y, lambda_1, learning_rate, num_epochs, batch_size_1)

# Train logistic regression model using mini-batch gradient descent (batch size = 128)
weights_mb_2, losses_mb_2 = mini_batch_gradient_descent(X, y, lambda_1, learning_rate, num_epochs, batch_size_2)


# Step 8: Use RMSProp optimizer and Adam optimizer


In [15]:
# Implement RMSProp optimizer
def rmsprop(X, y, lambda_reg, learning_rate, num_iterations, epsilon, decay_rate):
    weights = np.zeros(X.shape[1])
    cache = np.zeros(X.shape[1])
    losses = []
    
    for i in range(num_iterations):
        loss = compute_loss(X, y, weights, lambda_reg)
        grad = compute_grad(X, y, weights, lambda_reg)
        
        cache = decay_rate * cache + (1 - decay_rate) * np.square(grad)
        weights -= learning_rate * grad / (np.sqrt(cache) + epsilon)
        
        losses.append(loss)
    
    return weights, losses

# Hyperparameters
epsilon = 1e-8
decay_rate = 0.9
num_iterations = 1000

# Train logistic regression model using RMSProp optimizer
weights_rmsprop, losses_rmsprop = rmsprop(X, y, lambda_1, learning_rate, num_iterations, epsilon, decay_rate)


# Implement Adam optimizer
def adam(X, y, lambda_reg, learning_rate, num_iterations, epsilon, beta1, beta2):
    weights = np.zeros(X.shape[1])
    m = np.zeros(X.shape[1])
    v = np.zeros(X.shape[1])
    t = 0
    losses = []
    
    for i in range(num_iterations):
        t += 1
        loss = compute_loss(X, y, weights, lambda_reg)
        grad = compute_grad(X, y, weights, lambda_reg)
        
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * np.square(grad)
        
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)
        
        weights -= learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        
        losses.append(loss)
    
    return weights, losses

# Hyperparameters
beta1 = 0.9
beta2 = 0.999
num_iterations = 1000

# Train logistic regression model using Adam optimizer
weights_adam, losses_adam = adam(X, y, lambda_1, learning_rate, num_iterations, epsilon, beta1, beta2)


  loss = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) + reg_term
  loss = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) + reg_term


# Step 9: Report the accuracies and write conclusions

In [18]:
# Define prediction function
def predict(X, weights):
    return sigmoid(np.dot(X, weights))

# Evaluate accuracy on the validation set
def accuracy(X, y, weights):
    y_pred = np.round(predict(X, weights))
    return np.mean(y_pred == y)

# Calculate accuracies for each case

# Logistic Regression with L1 regularization (lambda = 0.1)
accuracy_l1_1 = accuracy(X, y, weights_l1_1)

# Logistic Regression with L1 regularization (lambda = 0.01)
accuracy_l1_2 = accuracy(X, y, weights_l1_2)

# Mini-batch Gradient Descent (batch size = 64)
accuracy_mb_1 = accuracy(X, y, weights_mb_1)

# Mini-batch Gradient Descent (batch size = 128)
accuracy_mb_2 = accuracy(X, y, weights_mb_2)

# RMSProp optimizer
accuracy_rmsprop = accuracy(X, y, weights_rmsprop)

# Adam optimizer
accuracy_adam = accuracy(X,y, weights_adam)


In [22]:
# Calculate accuracies for each case

# Logistic Regression with L1 regularization (lambda = 0.1)
accuracy_l1_1 = accuracy(X, y, weights_l1_1)

# Logistic Regression with L1 regularization (lambda = 0.01)
accuracy_l1_2 = accuracy(X, y, weights_l1_2)

# Mini-batch Gradient Descent (batch size = 64)
accuracy_mb_1 = accuracy(X, y, weights_mb_1)

# Mini-batch Gradient Descent (batch size = 128)
accuracy_mb_2 = accuracy(X, y, weights_mb_2)

# RMSProp optimizer
accuracy_rmsprop = accuracy(X, y, weights_rmsprop)

# Adam optimizer
accuracy_adam = accuracy(X, y, weights_adam)

# Report the accuracies and write conclusions

print("Accuracy for Logistic Regression with L1 regularization (lambda = 0.1): {:.2f}%".format(accuracy_l1_1 * 100))
print("Accuracy for Logistic Regression with L1 regularization (lambda = 0.01): {:.2f}%".format(accuracy_l1_2 * 100))
print("Accuracy for Mini-batch Gradient Descent (batch size = 64): {:.2f}%".format(accuracy_mb_1 * 100))
print("Accuracy for Mini-batch Gradient Descent (batch size = 128): {:.2f}%".format(accuracy_mb_2 * 100))
print("Accuracy for RMSProp optimizer: {:.2f}%".format(accuracy_rmsprop * 100))
print("Accuracy for Adam optimizer: {:.2f}%".format(accuracy_adam * 100))



Accuracy for Logistic Regression with L1 regularization (lambda = 0.1): 99.94%
Accuracy for Logistic Regression with L1 regularization (lambda = 0.01): 99.94%
Accuracy for Mini-batch Gradient Descent (batch size = 64): 99.99%
Accuracy for Mini-batch Gradient Descent (batch size = 128): 99.99%
Accuracy for RMSProp optimizer: 100.00%
Accuracy for Adam optimizer: 100.00%


# Write conclusions

In [23]:


print("\n--- Conclusions ---")

print("The logistic regression model with L1 regularization (lambda = 0.1) achieved an accuracy of {:.2f}%.".format(accuracy_l1_1 * 100))
print("The logistic regression model with L1 regularization (lambda = 0.01) achieved an accuracy of {:.2f}%.".format(accuracy_l1_2 * 100))
print("The mini-batch gradient descent optimizer with batch size 64 achieved an accuracy of {:.2f}%.".format(accuracy_mb_1 * 100))
print("The mini-batch gradient descent optimizer with batch size 128 achieved an accuracy of {:.2f}%.".format(accuracy_mb_2 * 100))
print("The RMSProp optimizer achieved an accuracy of {:.2f}%.".format(accuracy_rmsprop * 100))
print("The Adam optimizer achieved an accuracy of {:.2f}%.".format(accuracy_adam * 100))

print("\nThe logistic regression model with L1 regularization and a larger lambda value (lambda = 0.1) may have resulted in a lower accuracy compared to the model with a smaller lambda value (lambda = 0.01). This is because a larger lambda value imposes a stronger regularization penalty, which can lead to underfitting if the regularization is too strong.")

print("The mini-batch gradient descent optimizer with a batch size of 64 achieved a similar accuracy to the one with a batch size of 128. The choice of batch size can impact the convergence speed and generalization of the model, but in this case, the difference in accuracy between the two batch sizes was not significant.")

print("Both the RMSProp optimizer and the Adam optimizer achieved high accuracies. These optimizers adapt the learning rate based on the gradients, which can improve convergence and performance compared to standard gradient descent methods.")




--- Conclusions ---
The logistic regression model with L1 regularization (lambda = 0.1) achieved an accuracy of 99.94%.
The logistic regression model with L1 regularization (lambda = 0.01) achieved an accuracy of 99.94%.
The mini-batch gradient descent optimizer with batch size 64 achieved an accuracy of 99.99%.
The mini-batch gradient descent optimizer with batch size 128 achieved an accuracy of 99.99%.
The RMSProp optimizer achieved an accuracy of 100.00%.
The Adam optimizer achieved an accuracy of 100.00%.

The logistic regression model with L1 regularization and a larger lambda value (lambda = 0.1) may have resulted in a lower accuracy compared to the model with a smaller lambda value (lambda = 0.01). This is because a larger lambda value imposes a stronger regularization penalty, which can lead to underfitting if the regularization is too strong.
The mini-batch gradient descent optimizer with a batch size of 64 achieved a similar accuracy to the one with a batch size of 128. The 