In [59]:
# def SGD(X, y, gradient_function, num_epochs, lr):
#     num_rows, num_cols = X.shape[0], X.shape[1]
#     theta = np.random.randn(num_cols)

#     for e in range(num_epochs):
#         for i in range(num_rows):
#             xi = X[i]
#             yi = y[i]
#             grad = gradient_function(xi, yi, theta)
#             theta = theta - (lr * grad)
#     return theta


| Loss  | Loss Function (Matrix Form) | Gradient |
|-------|-----------------------------|------------------------------------------------------|
| **MSE** | $$\mathcal{L}_{MSE} = \frac{1}{N} \| Y - X\theta \|_2^2$$ | $$\frac{\partial \mathcal{L}_{MSE}}{\partial \theta} = -\frac{2}{N} X^T (Y - X\theta)$$ |
| **BCE** | $$\mathcal{L}_{BCE} = -\frac{1}{N} \left[ Y^T \log \sigma(X\theta) + (1 - Y)^T \log (1 - \sigma(X\theta)) \right]$$ | $$\frac{\partial \mathcal{L}_{BCE}}{\partial \theta} = -\frac{1}{N} X^T (Y - \sigma(X\theta))$$ |


In [79]:
import numpy as np

def MBGD(X, y, gradient_function, num_epochs, lr, batch_size, seed=None):
    """Fit a parameter vector with mini-batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (num_rows, num_cols)
        Design matrix. Include a column of ones if an intercept is wanted.
    y : array-like with num_rows elements
        Targets; a column vector of shape (num_rows, 1) is flattened to 1-D.
    gradient_function : callable
        Called as ``gradient_function(X_batch, y_batch, theta)`` and must
        return the gradient of the loss with respect to ``theta``.
    num_epochs : int
        Number of full passes over the data.
    lr : float
        Learning rate (step size).
    batch_size : int
        Number of rows per mini-batch; the last batch of an epoch may be
        smaller.
    seed : int or None, optional
        When given, a dedicated ``np.random.default_rng(seed)`` generator is
        used so the run is reproducible. When ``None`` (default) the global
        ``np.random`` state is used.

    Returns
    -------
    numpy.ndarray of shape (num_cols,)
        The parameter vector after the final update.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, ``y`` does not have one value per row of ``X``,
        or ``batch_size`` is not positive.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (num_rows, num_cols)")
    num_rows, num_cols = X.shape

    # Flatten once up front instead of once per batch.
    y = np.asarray(y).reshape(-1)
    if y.shape[0] != num_rows:
        raise ValueError("y must contain exactly one target per row of X")

    # A negative step would make range() empty and silently skip training.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Both the np.random module and a Generator expose standard_normal() and
    # permutation(), so either can serve as the random source.
    rng = np.random if seed is None else np.random.default_rng(seed)
    theta = rng.standard_normal(num_cols)

    for _ in range(num_epochs):
        # Draw a fresh ordering every epoch so the batches differ between
        # passes; indexing X and y with the same indices keeps rows aligned.
        order = rng.permutation(num_rows)
        for start in range(0, num_rows, batch_size):
            batch = order[start:start + batch_size]
            grad = gradient_function(X[batch], y[batch], theta)
            theta = theta - lr * grad

    return theta

In [76]:
def mse_grad(X, y, theta):
    """Gradient of the mean squared error of the linear model ``X @ theta``.

    Evaluates ``-(2 / n) * X.T @ (y - X @ theta)`` with ``n = len(y)``.
    """
    residual = y - X @ theta
    scale = -2 / len(y)
    return scale * X.T @ residual

# Synthetic single-feature regression data: y = 4 + 3*x + Gaussian noise (std 0.2).
X = np.random.randn(10000, 1)
noise = 0.2 * np.random.randn(10000, 1)
y = 4 + 3 * X + noise

# Prepend a column of ones so theta[0] acts as the intercept.
X = np.hstack([np.ones((10000, 1)), X])

MBGD(X, y, mse_grad, num_epochs=1000, lr=0.01, batch_size=8)

array([3.99886433, 3.00427567])

In [80]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : float or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid, same shape as ``z`` (a scalar for scalar input).

    Notes
    -----
    The naive formula evaluates ``exp(-z)``, which overflows (emitting a
    RuntimeWarning) for large negative ``z``. Here ``exp`` is only ever
    applied to non-positive values, so it cannot overflow.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple unwraps a 0-d result into a scalar and
    # leaves higher-dimensional arrays unchanged.
    return out[()]

def bce_grad(X, y, theta):
    """Gradient of the binary cross-entropy for the model ``sigmoid(X @ theta)``.

    Evaluates ``-(1 / n) * X.T @ (y - sigmoid(X @ theta))`` with ``n = len(y)``.
    """
    probs = sigmoid(X @ theta)
    scale = -1 / len(y)
    return scale * X.T @ (y - probs)

# Synthetic single-feature data whose targets are sigmoid(4 + 3*x + noise),
# with Gaussian noise of std 0.2 added to the logit.
X = np.random.randn(10000, 1)
noise = 0.2 * np.random.randn(10000, 1)
y = sigmoid(4 + 3 * X + noise)
# Targets are kept as probabilities; uncomment to threshold into 0/1 labels.
# y = (y > 0.5).astype(int)

# Prepend a column of ones so theta[0] acts as the intercept.
X = np.hstack([np.ones((10000, 1)), X])

MBGD(X, y, bce_grad, num_epochs=50, lr=0.01, batch_size=32)

array([3.57146676, 2.63263087])

# More Details

## Mean Squared Error (MSE)
### Loss Function
For predictions ŷ = f(X;θ) and true values y:
$$\mathcal{L}_{MSE}(\theta) = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 = \frac{1}{n} \|y - \hat{y}\|_2^2$$
In matrix form:
$$\mathcal{L}_{MSE}(\theta) = \frac{1}{n} (y - \hat{y})^T(y - \hat{y})$$
### Gradient
For a linear model where ŷ = Xθ:
$$\nabla_\theta\mathcal{L}_{MSE}(\theta) = -\frac{2}{n}X^T(y - X\theta)$$
For a general model where ŷ = f(X;θ):
$$\nabla_\theta\mathcal{L}_{MSE}(\theta) = -\frac{2}{n}J^T(y - \hat{y})$$
where J is the Jacobian matrix with elements $J_{ij} = \frac{\partial \hat{y}_i}{\partial \theta_j}$
## Binary Cross-Entropy (BCE)
### Loss Function
For binary classification with predicted probabilities ŷ = σ(f(X;θ)) where σ is the sigmoid function:
$$\mathcal{L}_{BCE}(\theta) = -\frac{1}{n}\sum_{i=1}^{n} \left[ y_i\log(\hat{y}_i) + (1-y_i)\log(1-\hat{y}_i) \right]$$
In matrix form:
$$\mathcal{L}_{BCE}(\theta) = -\frac{1}{n}\left[ y^T\log(\hat{y}) + (1-y)^T\log(1-\hat{y}) \right]$$
### Gradient
For a linear model where ŷ = σ(Xθ):
$$\nabla_\theta\mathcal{L}_{BCE}(\theta) = \frac{1}{n}X^T(\hat{y} - y)$$
For a general model where ŷ = σ(f(X;θ)):
$$\nabla_\theta\mathcal{L}_{BCE}(\theta) = \frac{1}{n}J^T(\hat{y} - y)$$
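where J is the Jacobian of the pre-sigmoid outputs (logits) $z = f(X;\theta)$, with elements $J_{ij} = \frac{\partial z_i}{\partial \theta_j}$. The sigmoid derivative $\hat{y}_i(1-\hat{y}_i)$ cancels against the derivative of the log terms, which is why only $(\hat{y} - y)$ remains and the sigmoid derivative does not appear in $J$.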

# Multiple Features

In [103]:
def mse_grad(X, y, theta):
    """Gradient of the mean squared error of the linear model ``X @ theta``.

    Evaluates ``-(2 / n) * X.T @ (y - X @ theta)`` with ``n = len(y)``.
    """
    residual = y - X @ theta
    scale = -2 / len(y)
    return scale * X.T @ residual

# Synthetic two-feature regression data:
# y = 4 + 3*x1 + 2*x2 + Gaussian noise (std 0.2), with y kept 1-D.
X = np.random.randn(10000, 2)
noise = 0.2 * np.random.randn(10000, 1).ravel()

y = 4 + (X @ np.array([3, 2])) + noise

# Prepend a column of ones so theta[0] acts as the intercept.
X = np.hstack([np.ones((10000, 1)), X])

MBGD(X, y, mse_grad, num_epochs=1000, lr=0.01, batch_size=8)

array([3.99366568, 3.00319916, 2.00144661])

In [109]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : float or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid, same shape as ``z`` (a scalar for scalar input).

    Notes
    -----
    The naive formula evaluates ``exp(-z)``, which overflows (emitting a
    RuntimeWarning) for large negative ``z``. Here ``exp`` is only ever
    applied to non-positive values, so it cannot overflow.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple unwraps a 0-d result into a scalar and
    # leaves higher-dimensional arrays unchanged.
    return out[()]

def bce_grad(X, y, theta):
    """Gradient of the binary cross-entropy for the model ``sigmoid(X @ theta)``.

    Evaluates ``-(1 / n) * X.T @ (y - sigmoid(X @ theta))`` with ``n = len(y)``.
    """
    probs = sigmoid(X @ theta)
    scale = -1 / len(y)
    return scale * X.T @ (y - probs)

# Synthetic two-feature classification data: the noisy logit
# 4 + 3*x1 + 2*x2 + noise (std 0.2) is passed through the sigmoid and
# thresholded at 0.5 to give hard 0/1 labels.
X = np.random.randn(10000, 2)
noise = 0.2 * np.random.randn(10000, 1).ravel()

y = (sigmoid(4 + (X @ np.array([3, 2])) + noise) > 0.5).astype(int)

# Prepend a column of ones so theta[0] acts as the intercept.
X = np.hstack([np.ones((10000, 1)), X])

MBGD(X, y, bce_grad, num_epochs=50, lr=0.01, batch_size=32)

array([4.56036152, 3.21328784, 2.18231314])