[Reference](https://ai.plainenglish.io/momentum-based-gradient-descent-f96fc3c8f470)

In [1]:
import numpy as np

def gradient_descent_momentum(X, y, learning_rate=0.01, momentum=0.9, num_iterations=100, verbose=True):
    """Fit linear-model weights by gradient descent with momentum.

    Minimises the mean squared error between ``np.dot(X, theta)`` and ``y``.
    No intercept term is added: include a column of ones in ``X`` if the
    model needs a bias.

    Args:
        X: Array-like of shape (num_samples, num_features).
        y: Array-like of targets holding exactly ``num_samples`` values. A
            column vector of shape (num_samples, 1) is accepted and flattened.
        learning_rate: Step size applied to the gradient.
        momentum: Fraction of the previous velocity carried into each step.
        num_iterations: Number of update steps to run.
        verbose: When true (the default), print the MSE at every iteration.

    Returns:
        1-D ``numpy.ndarray`` of shape (num_features,) with the fitted weights.

    Raises:
        ValueError: If ``X`` is not 2-D, has no rows, or the number of target
            values in ``y`` differs from the number of rows in ``X``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (num_samples, num_features), got shape {X.shape}")

    # Flatten the targets: theta is 1-D, so np.dot(X, theta) is 1-D as well.
    # Subtracting an (n, 1) column from it would broadcast to an (n, n)
    # matrix and make the in-place parameter update fail.
    y = np.asarray(y, dtype=float).reshape(-1)

    num_samples, num_features = X.shape
    if num_samples == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != num_samples:
        raise ValueError(
            f"y has {y.shape[0]} values but X has {num_samples} samples"
        )

    # Initialize the parameters and the velocity vector
    theta = np.zeros(num_features)
    velocity = np.zeros_like(theta)

    # Loop-invariant scaling factor for the averaged gradient
    inv_num_samples = 1 / num_samples

    for iteration in range(num_iterations):
        # Compute the predictions and errors
        predicted = np.dot(X, theta)
        errors = predicted - y

        # Gradient of the squared-error objective, averaged over the samples
        gradients = inv_num_samples * np.dot(X.T, errors)

        # Blend the previous velocity with the new gradient step
        velocity = momentum * velocity + learning_rate * gradients

        # Update the parameters
        theta -= velocity

        if verbose:
            # MSE of the parameters used at the start of this iteration
            # (errors were computed before the update above).
            mse = np.mean(errors**2)
            print(f"Iteration {iteration+1}, MSE: {mse}")

    return theta

In [3]:
# Generate some random data: 100 samples, one feature, Gaussian noise
np.random.seed(42)
X = np.random.rand(100, 1)
y = 2 + 3 * X + np.random.randn(100, 1)

# Apply Gradient Descent with Momentum.
# y is a (100, 1) column, while gradient_descent_momentum keeps theta 1-D, so
# its predictions np.dot(X, theta) have shape (100,). Subtracting the column
# would broadcast the errors to (100, 100) and break the parameter update,
# so the targets are passed flattened.
# NOTE: the solver computes np.dot(X, theta) with no bias term, so with this
# single-column X the fit has no intercept even though the data has one (2).
theta_momentum = gradient_descent_momentum(X, y.ravel(), learning_rate=0.1, momentum=0.9, num_iterations=100)

# Apply Vanilla Gradient Descent
# (gradient_descent is not defined in this part of the file; it is called
# with the same arguments as before.)
theta_vanilla = gradient_descent(X, y, learning_rate=0.1, num_iterations=100)