Let's define a function to calculate the gradients for a given set of data. The gradients are the partial derivatives of the loss (cost) function with respect to each of the parameters. So if we have 
$$ L(X, Y, W) = \frac{1}{m + 1} \big\|XW - Y\big\|^2_2 = \frac{1}{m + 1}\sum_{i=0}^{m}(x^i W - y^i)^2$$ in this case we have $$grad(W) = \frac{2}{m + 1}X^T(XW-Y)$$

In [10]:
import numpy as np


def compute_gradients(X, Y, W):
    '''
    X: input data
    Y: output data
    W: parameters (weights)
    return the gradient of parameters W
    '''
    m = len(Y)
    predictions = np.dot(X, W)
    errors = predictions - Y
    gradients = (2 / m) * np.dot(X.T, errors)
    return gradients


def lin_reg_l2(x, y, w):
    '''
    x: input data
    y: target data
    w: given parameters
    '''
    y_pred = np.dot(x, w)
    # mean square error
    return np.sqrt(np.mean((y_pred - y) ** 2))

Next, we define a gradient descent function, i.e. apply the gradients and update the parameter values iteratively

In [11]:
def gradient_descent(X, y, learn_rate, num_iters, theta=None):
    '''
    X: input data
    y: output data
    learn_rate: the learning rate for gradient descent 
    num_iterations
    theta: initial value for the given parameters 
    '''
    m = len(y)
    if not theta:
        theta = np.zeros((X.shape[1], 1))
    for i in range(num_iters):
        gradients = compute_gradients(X, y, theta)
        theta = theta - learn_rate * gradients
        loss = lin_reg_l2(X, y, theta)
        print(f'{i}) loss: {loss}')
    return theta

To show that the gradient descent converges nearly to the same parameters as the solution of the normal equation for linear regression, we can generate some random data and compare the results of gradient descent and the normal equation:

In [12]:
# Generate random data
np.random.seed(0)
X = np.random.rand(100, 2)
y = 2 + np.dot(X, np.array([[3, 4]]).T) + 0.1 * np.random.rand(100, 1)

# Add intercept column to X (for bias)
X = np.hstack((np.ones((X.shape[0], 1)), X))

# Calculate parameters using normal equation
theta_normal = np.dot(np.dot(np.linalg.inv(np.dot(X.T, X)), X.T), y)

# Calculate parameters using gradient descent
theta_gd = gradient_descent(X, y, 0.1, 400)

print("Parameters using normal equation: ", theta_normal.T)
print("Parameters using gradient descent: ", theta_gd.T)

0) loss: 3.9923695504740464
1) loss: 2.8107099480787543
2) loss: 2.0042900376700956
3) loss: 1.4626759300519554
4) loss: 1.1086673282532158
5) loss: 0.8864096485093699
6) loss: 0.7534131286648105
7) loss: 0.6770122454851051
8) loss: 0.6337548858181596
9) loss: 0.608609267768981
10) loss: 0.5929054970487517
11) loss: 0.5819996739708985
12) loss: 0.5735117725332467
13) loss: 0.5662508557444872
14) loss: 0.5596281604378662
15) loss: 0.5533542036243779
16) loss: 0.5472868472710333
17) loss: 0.5413561581757218
18) loss: 0.5355275074977497
19) loss: 0.5297835123040149
20) loss: 0.5241152132494787
21) loss: 0.5185177681287797
22) loss: 0.5129883505983024
23) loss: 0.5075251250380641
24) loss: 0.5021267464119357
25) loss: 0.49679211620931313
26) loss: 0.49152026328743853
27) loss: 0.48631028563072143
28) loss: 0.4811613218210305
29) loss: 0.47607253699977337
30) loss: 0.47104311589954373
31) loss: 0.46607225932565566
32) loss: 0.4611591823223173
33) loss: 0.4563031131625634
34) loss: 0.4515032

As we can see, the parameters calculated using gradient descent are close to the parameters calculated using the normal equation (if iterations are chosen enough). This demonstrates that gradient descent can be a useful alternative to the normal equation for linear regression, especially for larger datasets where the normal equation may become computationally expensive.

As a homework do the same experiment in case of Logistic regression

In [13]:
import numpy as np


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def compute_gradients_logistic(X, y, theta):
    '''
    X: input data
    y: output data
    theta: parameters (weights)
    return the gradient of parameters theta
    '''
    m = len(y)
    predictions = sigmoid(np.dot(X, theta))
    errors = predictions - y
    gradients = (1 / m) * np.dot(X.T, errors)
    return gradients

In [14]:
def logistic_loss(X, y, theta):
    '''
    X: input data
    y: target data
    theta: given parameters
    '''
    m = len(y)
    h = sigmoid(np.dot(X, theta))
    epsilon = 1e-8  # small constant to avoid division by zero
    loss = (-1 / m) * np.sum(y * np.log(h + epsilon) + (1 - y) * np.log(1 - h + epsilon))
    return loss


In [15]:
def logistic_regression_gradient_descent(X, y, learn_rate, num_iters, theta=None):
    '''
    X: input data
    y: output data
    learn_rate: the learning rate for gradient descent 
    num_iterations
    theta: initial value for the given parameters 
    '''
    m, n = X.shape
    if not theta:
        theta = np.zeros((n, 1))
    for i in range(num_iters):
        gradients = compute_gradients_logistic(X, y, theta)
        theta -= learn_rate * gradients
        loss = logistic_loss(X, y, theta)
        print(f'{i}) loss: {loss}')
    return theta

In [16]:
# X = np.random.rand(100, 2)
# y = 2 + np.dot(X, np.array([[3, 4]]).T) + 0.1 * np.random.rand(100, 1)

# Add intercept column to X (for bias)
X = np.hstack((np.ones((X.shape[0], 1)), X))
theta_logistic_gd = logistic_regression_gradient_descent(X, y, 0.1, 400)

print("Parameters using gradient descent for logistic regression: ", theta_logistic_gd.T)

0) loss: -5.745160598532947
1) loss: -11.568984190244349
2) loss: -17.12021239855039
3) loss: -22.575770977009217
4) loss: -27.999878253369058
5) loss: -33.41375154693934
6) loss: -38.824168066348506
7) loss: -44.23298592324469
8) loss: -49.63962099625799
9) loss: -55.03966061330185
10) loss: -60.416766637709564
11) loss: -65.71515932991608
12) loss: -70.7693141028946
13) loss: -75.23260291306256
14) loss: -78.71131900898042
15) loss: -81.04894649980339
16) loss: -82.40498614591733
17) loss: -83.09905612001305
18) loss: -83.41974379861682
19) loss: -83.55505257398475
20) loss: -83.60802406520511
21) loss: -83.62777665566122
22) loss: -83.63496656084543
23) loss: -83.63756050753479
24) loss: -83.63849491791231
25) loss: -83.63883203167744
26) loss: -83.6389539790995
27) loss: -83.63899823116446
28) loss: -83.63901432554003
29) loss: -83.6390202081573
30) loss: -83.6390223527912
31) loss: -83.63902314354007
32) loss: -83.63902343162827
33) loss: -83.63902353946669
34) loss: -83.639023577