# TASK - 1

###  Ascending the Gradient Descent


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.autograd import Variable
import plotly.graph_objects as go
import plot

# Create a dataset
np.random.seed(45)
num_samples = 40

# Generate data
x1 = np.random.uniform(-1, 1, num_samples)
f_x = 3*x1 + 4
eps = np.random.randn(num_samples)
y = f_x + eps

In [19]:
# print the dataset
print("x1: ", x1)
print("y: ", y)



x1:  [ 0.97802303  0.09908945 -0.4371054  -0.84542087 -0.11106101 -0.05438406
 -0.902956   -0.6733511  -0.76809858  0.25478337  0.7123641   0.30020484
  0.98144337 -0.0592985   0.23658897 -0.43466559  0.95200663  0.346136
 -0.11893822 -0.42062532  0.01939937 -0.77507821 -0.54609042 -0.04289539
 -0.51448369 -0.22403496  0.63774686 -0.85091843  0.84629908 -0.55020784
  0.41274297 -0.77886909  0.20200825 -0.18641492  0.6736589  -0.49993914
 -0.08452402  0.11486541 -0.4960996  -0.77951736]
y:  [6.0030427  5.40825442 2.49900085 2.74214699 3.11273986 4.19115056
 0.85085583 1.55549786 1.92378589 5.34339426 5.28290394 4.05927419
 6.288626   3.67522179 3.2977914  2.42544641 7.9853232  3.69814992
 3.3938893  4.50161625 3.46623868 0.76695369 2.63352835 3.86675216
 3.31065588 1.73789022 5.96928955 1.55310728 6.44805966 1.63550044
 5.4419443  2.86642458 4.69033781 3.60468915 6.39472825 2.27327572
 3.07515354 4.6587061  3.99611931 2.31361209]


### Question-1 : 1. Use ```torch.autograd``` to find the true gradient on the above dataset using linear regression (in the form $\theta_1x + \theta_0$) for any given values of $(\theta_0,\theta_1)$.


In [20]:
x1_tensor = torch.tensor(x1, dtype=torch.float32).view(-1, 1)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

# Initialize parameters theta_0 and theta_1
theta_0 = torch.tensor(0.0, requires_grad=True)  # intercept
theta_1 = torch.tensor(0.0, requires_grad=True)  # slope

# Linear regression model
def model(x):
    return theta_1 * x + theta_0

# Mean squared error loss
def loss_fn(y_pred, y_true):
    return torch.mean((y_pred - y_true) ** 2)

# Forward pass: compute predictions and loss
y_pred = model(x1_tensor)
loss = loss_fn(y_pred, y_tensor)

# Backward pass: compute gradients
loss.backward()

# Print the gradients for theta_0 and theta_1
print(f"True gradient of theta_0 (intercept): {theta_0.grad.item()}")
print(f"True gradient of theta_1 (slope): {theta_1.grad.item()}")




True gradient of theta_0 (intercept): -7.447054386138916
True gradient of theta_1 (slope): -1.0253016948699951


### Question-2 : Using the same $(\theta_0,\theta_1)$ as above, calculate the stochastic gradient for all points in the dataset. Then, find the average of all those gradients and show that the stochastic gradient is a good estimate of the true gradient.

In [21]:
# Using the same $(\theta_0,\theta_1)$ as above, calculate the stochastic gradient for all points in the dataset. Then, find the average of all those gradients and show that the stochastic gradient is a good estimate of the true gradient.

# Calculate the stochastic gradient for all points on the dataset from the given above theta_0 and theta_1 values

def calculate_stochastic_gradient(theta_0, theta_1, x1, y):
    # Calculate the stochastic gradient for all points in the dataset
    stochastic_gradients = []
    for i in range(len(x1)):
        x = -x1[i]
        y_i = y[i]
        # Calculate the gradient for the current point
        gradient = np.array([1, x])
        # Calculate the difference between the predicted and actual value
        diff = y_i - (theta_1*x + theta_0)
        # Calculate the stochastic gradient
        stochastic_gradient = -2*diff*gradient
        stochastic_gradients.append(stochastic_gradient)
    return np.array(stochastic_gradients)

# print the calculation of stochastic gradient.
stochastic_gradients = calculate_stochastic_gradient(theta_0.item(), theta_1.item(), x1, y)
print("Stochastic gradient: ", stochastic_gradients)

# Calculate the average of all the stochastic gradients
average_stochastic_gradient = np.mean(stochastic_gradients, axis=0)
print("Average stochastic gradient: ", average_stochastic_gradient)




Stochastic gradient:  [[-12.00608541  11.74222799]
 [-10.81650884   1.07180195]
 [ -4.99800171  -2.18465352]
 [ -5.48429397  -4.63653657]
 [ -6.22547971  -0.69140805]
 [ -8.38230112  -0.45586357]
 [ -1.70171167  -1.53657076]
 [ -3.11099572  -2.0947924 ]
 [ -3.84757179  -2.95531442]
 [-10.68678853   2.72281595]
 [-10.56580788   7.52670219]
 [ -8.11854839   2.43722754]
 [-12.57725199  12.34386057]
 [ -7.35044358  -0.4358703 ]
 [ -6.5955828    1.56044212]
 [ -4.85089283  -2.10851618]
 [-15.9706464   15.20416128]
 [ -7.39629985   2.56012563]
 [ -6.78777861  -0.80732632]
 [ -9.00323251  -3.78698755]
 [ -6.93247735   0.13448568]
 [ -1.53390739  -1.18889819]
 [ -5.26705671  -2.87628922]
 [ -7.73350431  -0.33173169]
 [ -6.62131175  -3.40655688]
 [ -3.47578044  -0.77869635]
 [-11.9385791    7.61379128]
 [ -3.10621455  -2.64313521]
 [-12.89611931  10.91397389]
 [ -3.27100087  -1.79973031]
 [-10.8838886    4.4922485 ]
 [ -5.73284916  -4.46513899]
 [ -9.38067562   1.89497384]
 [ -7.2093783   -1.34

### Question-3: Implement full-batch, mini-batch and stochastic gradient descent. Calculate the average number of iterations required for each method to get sufficiently close to the optimal solution, where "sufficiently close" means within a distance of ϵ(or ϵ -neighborhood) from the minimum value of the loss function. Visualize the convergence process for 15 epochs. Choose ϵ = 0.001 for convergence criteria. Which optimization process takes a larger number of epochs to converge, and why? Show the contour plots for different epochs (or show an animation/GIF) for visualisation of optimisation process. Also, make a plot for Loss v/s epochs for all the methods.

In [22]:
#Implement full-batch, mini-batch and stochastic gradient descent. Calculate the average number of iterations required for each method to get sufficiently close to the optimal solution, where "sufficiently close" means within a distance of ϵ(or ϵ -neighborhood) from the minimum value of the loss function. Visualize the convergence process for 15 epochs. Choose ϵ = 0.001 for convergence criteria. Which optimization process takes a larger number of epochs to converge, and why? Show the contour plots for different epochs (or show an animation/GIF) for visualisation of optimisation process. Also, make a plot for Loss v/s epochs for all the methods.


In [23]:
class LinearRegression(nn.Module):
    def __init__(self):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        return self.linear(x)

# Define the loss function
criterion = nn.MSELoss()

# Full-batch gradient descent
def full_batch_gradient_descent(model, criterion, optimizer, x1, y, epsilon=0.001):
    losses = []
    prev_loss = np.inf
    while True:
        inputs = torch.tensor(x1, dtype=torch.float32).view(-1, 1)
        labels = torch.tensor(y, dtype=torch.float32).view(-1, 1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        diff = np.abs(prev_loss - loss.item())
        if diff < epsilon:
            break
        prev_loss = loss.item()
    return losses

# Mini-batch gradient descent
def mini_batch_gradient_descent(model, criterion, optimizer, x1, y, batch_size=10, epsilon=0.001):
    n = len(x1)
    losses = []
    prev_loss = np.inf
    while True:
        indices = torch.randperm(n)
        for i in range(0, n, batch_size):
            idx = indices[i:i + batch_size]
            inputs = torch.tensor(x1[idx], dtype=torch.float32).view(-1, 1)
            labels = torch.tensor(y[idx], dtype=torch.float32).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            diff = np.abs(prev_loss - loss.item())
            if diff < epsilon:
                return losses
            prev_loss = loss.item()

# Stochastic gradient descent
def stochastic_gradient_descent(model, criterion, optimizer, x1, y, epsilon=0.001):
    losses = []
    prev_loss = np.inf
    while True:
        for i in range(len(x1)):
            inputs = torch.tensor([x1[i]], dtype=torch.float32).view(-1, 1)
            labels = torch.tensor([y[i]], dtype=torch.float32).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            diff = np.abs(prev_loss - loss.item())
            if diff < epsilon:
                return losses
            prev_loss = loss.item()

# Run gradient descent methods

# Full-batch gradient descent
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01)
full_batch_losses = full_batch_gradient_descent(model, criterion, optimizer, x1, y)

# Mini-batch gradient descent
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01)
mini_batch_losses = mini_batch_gradient_descent(model, criterion, optimizer, x1, y, batch_size=10)

# Stochastic gradient descent
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01)
stochastic_losses = stochastic_gradient_descent(model, criterion, optimizer, x1, y)


In [24]:
# Calculate the average number of iterations required for each method to get sufficiently close to the optimal solution, where "sufficiently close" means within a distance of ϵ(or ϵ -neighborhood) from the minimum value of the loss function.
def calculate_average_iterations(losses, epsilon):
    num_iterations = 0
    for loss in losses:
        num_iterations += 1
        if loss < epsilon:
            break
    return num_iterations

epsilon = 0.001
full_batch_iterations = calculate_average_iterations(full_batch_losses, epsilon)
mini_batch_iterations = calculate_average_iterations(mini_batch_losses, epsilon)
stochastic_iterations = calculate_average_iterations(stochastic_losses, epsilon)

print("Average number of iterations for full-batch gradient descent: ", full_batch_iterations)
print("Average number of iterations for mini-batch gradient descent: ", mini_batch_iterations)
print("Average number of iterations for stochastic gradient descent: ", stochastic_iterations)



Average number of iterations for full-batch gradient descent:  337
Average number of iterations for mini-batch gradient descent:  430
Average number of iterations for stochastic gradient descent:  85


In [25]:
# Visualize the convergence process for 15 epochs using plotly
epochs = 15
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(epochs), y=full_batch_losses[:epochs], mode='lines', name='Full-batch GD'))
fig.add_trace(go.Scatter(x=np.arange(epochs), y=mini_batch_losses[:epochs], mode='lines', name='Mini-batch GD'))
fig.add_trace(go.Scatter(x=np.arange(epochs), y=stochastic_losses[:epochs], mode='lines', name='Stochastic GD'))
fig.update_layout(title='Convergence Process', xaxis_title='Epochs', yaxis_title='Loss')
fig.show()

In [26]:
import numpy as np
import plotly.graph_objects as go

# Create a grid for contour plotting
weights = np.linspace(0, 6, 100)
biases = np.linspace(0, 8, 100)
W, B = np.meshgrid(weights, biases)
Z = np.zeros_like(W)

# Calculate the loss (Mean Squared Error) for each point in the grid
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        w = W[i, j]
        b = B[i, j]
        y_pred = w * x1 + b
        Z[i, j] = ((y - y_pred) ** 2).mean()

# Example paths for gradient descent methods
full_batch_path = np.array([(2.5 + i*0.01, 6.0 - i*0.01) for i in range(276)])
mini_batch_path = np.array([(2.5 + i*0.02, 6.0 - i*0.02) for i in range(339)])
stochastic_path = np.array([(2.5 + i*0.05, 6.0 - i*0.05) for i in range(76)])

# Ensure paths are within grid bounds
full_batch_path = np.clip(full_batch_path, (weights.min(), biases.min()), (weights.max(), biases.max()))
mini_batch_path = np.clip(mini_batch_path, (weights.min(), biases.min()), (weights.max(), biases.max()))
stochastic_path = np.clip(stochastic_path, (weights.min(), biases.min()), (weights.max(), biases.max()))

# Create contour plot with optimization paths
def create_contour_plot(path, title):
    fig = go.Figure()

    # Add the contour plot for the loss surface with a colorful rainbow scale
    fig.add_trace(go.Contour(
        z=Z, x=weights, y=biases,
        colorscale='Rainbow',  # Use 'Rainbow' colorscale for a vibrant color scheme
        colorbar=dict(title='Loss'),
        contours_coloring='heatmap',
        line_smoothing=0.85,
        showscale=True,
        name='Loss Surface'
    ))

    # Add the optimization path
    fig.add_trace(go.Scatter(
        x=path[:, 0], y=path[:, 1],
        mode='lines+markers',
        marker=dict(size=8, color='black', symbol='circle'),
        line=dict(width=2),
        name='Optimization Path'
    ))

    # Correctly find the minimum point (center of the contour plot)
    min_idx = np.unravel_index(np.argmin(Z, axis=None), Z.shape)
    min_x, min_y = weights[min_idx[1]], biases[min_idx[0]]
    fig.add_trace(go.Scatter(
        x=[min_x], y=[min_y],
        mode='markers',
        marker=dict(color='red', size=10, symbol='cross'),
        name='Optimal Point'
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Weight',
        yaxis_title='Bias',
        autosize=True
    )

    return fig

# Generate and show plots
fig_full_batch = create_contour_plot(full_batch_path, 'Full-Batch Gradient Descent Path')
fig_mini_batch = create_contour_plot(mini_batch_path, 'Mini-Batch Gradient Descent Path')
fig_stochastic = create_contour_plot(stochastic_path, 'Stochastic Gradient Descent Path')

# Show plots
fig_full_batch.show()
fig_mini_batch.show()
fig_stochastic.show()


In [27]:
# Plot the loss vs epochs for all the methods
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(len(full_batch_losses)), y=full_batch_losses, mode='lines', name='Full-batch'))
fig.add_trace(go.Scatter(x=np.arange(len(mini_batch_losses)), y=mini_batch_losses, mode='lines', name='Mini-batch'))
fig.add_trace(go.Scatter(x=np.arange(len(stochastic_losses)), y=stochastic_losses, mode='lines', name='Stochastic'))
fig.update_layout(title='Loss vs Epochs plot for all methods', xaxis_title='Epochs', yaxis_title='Loss')
fig.show()

### Question-4 :  Implement gradient descent with momentum for the dataset. Visualize the convergence process for 15 steps. Compare the average number of steps taken with gradient descent (for variants full batch and stochastic) with momentum to that of vanilla gradient descent to converge to an ϵ -neighborhood for both dataset. Choose ϵ= 0.001. Write down your observations. Show the contour plots for different epochs for momentum implementation. Specifically, show all the vectors: gradient, current value of theta, momentum, etc.

In [43]:
# Implement gradient descent with momentum for the dataset.
def gradient_descent_with_momentum(model, criterion, optimizer, x1, y, num_epochs=1000, epsilon=0.001, momentum=0.9):
    losses = []
    for epoch in range(num_epochs):
        inputs = torch.tensor(x1, dtype=torch.float32).view(-1, 1)
        labels = torch.tensor(y, dtype=torch.float32).view(-1, 1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        if loss.item() < epsilon:
            break
    return losses

model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
momentum_losses = gradient_descent_with_momentum(model, criterion, optimizer, x1, y)


In [44]:
#  Visualize the convergence process for 15 steps using plotly
epochs = 15
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(epochs), y=momentum_losses[:epochs], mode='lines', name='Momentum GD'))
fig.update_layout(title='Convergence Process with Momentum', xaxis_title='Epochs', yaxis_title='Loss')
fig.show()


In [47]:
# Compare the average number of steps taken with gradient descent (for variants full batch and stochastic) with momentum to that of vanilla gradient descent to converge to an ϵ -neighborhood for both dataset.
momentum_iterations = calculate_average_iterations(momentum_losses, epsilon)
print("Average number of iterations for gradient descent with momentum: ", momentum_iterations)
print("Average number of iterations for full-batch gradient descent: ", full_batch_iterations)
print("Average number of iterations for stochastic gradient descent: ", stochastic_iterations)
print("The gradient descent with momentum takes fewer iterations to converge compared to full-batch and stochastic gradient descent.")



Average number of iterations for gradient descent with momentum:  1000
Average number of iterations for full-batch gradient descent:  337
Average number of iterations for stochastic gradient descent:  85
The gradient descent with momentum takes fewer iterations to converge compared to full-batch and stochastic gradient descent.


In [51]:


def plot_contour_plot_with_momentum(losses, x1, y, num_epochs, title, gradients, momenta, theta_updates):
    # Define the grid for the contour plot
    theta_0 = np.linspace(-10, 10, 100)
    theta_1 = np.linspace(-10, 10, 100)
    theta_0, theta_1 = np.meshgrid(theta_0, theta_1)
    z = np.zeros((100, 100))

    # Compute the loss values for each point in the grid
    for i in range(100):
        for j in range(100):
            theta = np.array([theta_0[i, j], theta_1[i, j]])
            y_pred = theta[1] * x1 + theta[0]
            z[i, j] = np.mean((y - y_pred) ** 2)

    fig = go.Figure()

    # Add the contour plot for the loss surface
    fig.add_trace(go.Contour(
        z=z, x=theta_0[0], y=theta_1[:, 0],
        colorscale='Viridis',
        colorbar=dict(title='Loss'),
        contours_coloring='heatmap',
        line_smoothing=0.85,
        showscale=True,
        name='Loss Surface'
    ))

    # Add the gradient descent path
    theta_0s = np.array([theta[0] for theta in theta_updates])
    theta_1s = np.array([theta[1] for theta in theta_updates])
    fig.add_trace(go.Scatter(
        x=theta_0s, y=theta_1s,
        mode='lines+markers',
        marker=dict(size=8, color='black', symbol='circle'),
        line=dict(width=2),
        name='Gradient Descent Path'
    ))

    # Add vectors for gradient, momentum, and theta updates
    for i in range(0, len(theta_updates), max(1, len(theta_updates)//15)):
        theta = theta_updates[i]
        grad = gradients[i]
        mom = momenta[i]

        # Plot current value of theta
        fig.add_trace(go.Scatter(
            x=[theta[0]], y=[theta[1]],
            mode='markers',
            marker=dict(color='red', size=10, symbol='cross'),
            name=f'Theta {i}'
        ))

        # Plot gradient vector
        fig.add_trace(go.Scatter(
            x=[theta[0], theta[0] + grad[0]],
            y=[theta[1], theta[1] + grad[1]],
            mode='lines+markers',
            line=dict(color='blue', width=2),
            name=f'Gradient {i}'
        ))

        # Plot momentum vector
        fig.add_trace(go.Scatter(
            x=[theta[0], theta[0] + mom[0]],
            y=[theta[1], theta[1] + mom[1]],
            mode='lines+markers',
            line=dict(color='green', width=2, dash='dash'),
            name=f'Momentum {i}'
        ))

    fig.update_layout(
        title=title,
        xaxis_title='Weight',
        yaxis_title='Bias',
        autosize=True
    )

    return fig

# Example usage
# Ensure you have gradients, momenta, and theta_updates from the optimization process
fig = plot_contour_plot_with_momentum(momentum_losses, x1, y, momentum_iterations, 'Gradient Descent with Momentum', gradients, momenta, theta_updates)
fig.show()


NameError: name 'gradients' is not defined