# TASK - 1

###  Ascending the Gradient Descent


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.autograd import Variable
import plotly.graph_objects as go
import plot

# Create a dataset
np.random.seed(45)
num_samples = 40
    
# Generate data
x1 = np.random.uniform(-1, 1, num_samples)
f_x = 3*x1 + 4
eps = np.random.randn(num_samples)
y = f_x + eps

### Question-1 : 1. Use ```torch.autograd``` to find the true gradient on the above dataset using linear regression (in the form $\theta_1x + \theta_0$) for any given values of $(\theta_0,\theta_1)$.


In [2]:
# Define the model  
class LinearRegression(nn.Module):
    def __init__(self):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        return self.linear(x)

# Define the loss function
criterion = nn.MSELoss()
# Define the optimizer
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the model
num_epochs = 1000
for epoch in range(num_epochs):
    inputs = Variable(torch.from_numpy(x1).float().view(-1, 1))
    labels = Variable(torch.from_numpy(y).float().view(-1, 1))

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

# Get the learned parameters
theta_0 = model.linear.bias.item()
theta_1 = model.linear.weight.item()
# Get the true gradient
true_gradient = np.array([theta_0, theta_1])
print("True gradient: ", true_gradient)

# Plot the data using plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=x1, y=y, mode='markers', name='Data'))
fig.add_trace(go.Scatter(x=x1, y=theta_1*x1 + theta_0, mode='lines', name='Model'))
fig.update_layout(title='Linear Regression', xaxis_title='x', yaxis_title='y')
fig.show()




  from .autonotebook import tqdm as notebook_tqdm


True gradient:  [3.95001221 2.67677808]


### Question-2 : Using the same $(\theta_0,\theta_1)$ as above, calculate the stochastic gradient for all points in the dataset. Then, find the average of all those gradients and show that the stochastic gradient is a good estimate of the true gradient.

In [3]:
# Using the same $(\theta_0,\theta_1)$ as above, calculate the stochastic gradient for all points in the dataset. Then, find the average of all those gradients and show that the stochastic gradient is a good estimate of the true gradient.

# Calculate the stochastic gradient for all points on the dataset from the given above theta_0 and theta_1 values
def calculate_stochastic_gradient(theta_0, theta_1, x1, y):
    # Calculate the stochastic gradient for all points in the dataset
    stochastic_gradients = []
    for i in range(len(x1)):
        x = x1[i]
        y_i = y[i]
        # Calculate the gradient for the current point
        gradient = np.array([1, x])
        # Calculate the difference between the predicted and actual value
        diff = y_i - (theta_1*x + theta_0)
        # Calculate the stochastic gradient
        stochastic_gradient = -2*diff*gradient
        stochastic_gradients.append(stochastic_gradient)
    return np.array(stochastic_gradients)


# print the calculation of stochastic gradient 
print("Stochastic Gradient Calculation: ")
stochastic_gradients = calculate_stochastic_gradient(theta_0, theta_1, x1, y)
print(stochastic_gradients)


# find the average of all those gradients 
average_stochastic_gradient = np.mean(stochastic_gradients, axis=0)
print("Average stochastic gradient: ", average_stochastic_gradient)
print("True gradient: ", true_gradient)
print("Difference between the true and average stochastic gradient: ", np.abs(true_gradient - average_stochastic_gradient))
print("The stochastic gradient is a good estimate of the true gradient because the difference between the true and average stochastic gradient is very small.")





Stochastic Gradient Calculation: 
[[ 1.12984020e+00  1.10500974e+00]
 [-2.38600347e+00 -2.36427781e-01]
 [ 5.61954423e-01 -2.45633311e-01]
 [-2.11027765e+00  1.78407276e+00]
 [ 1.07997336e+00 -1.19942930e-01]
 [-7.73424825e-01  4.20619825e-02]
 [ 1.36428711e+00 -1.23189122e+00]
 [ 1.18420574e+00 -7.97386245e-01]
 [-5.96062391e-02  4.57834674e-02]
 [-1.42276706e+00 -3.62497379e-01]
 [ 1.14789774e+00  8.17721134e-01]
 [ 1.38863951e+00  4.16876305e-01]
 [ 5.76984614e-01  5.66277724e-01]
 [ 2.32122970e-01 -1.37645446e-02]
 [ 2.57103393e+00  6.08278263e-01]
 [ 7.22124956e-01 -3.13882868e-01]
 [-2.97400102e+00 -2.83126869e+00]
 [ 2.35678307e+00  8.15767461e-01]
 [ 4.75503352e-01 -5.65555235e-02]
 [-3.35504936e+00  1.41121871e+00]
 [ 1.07140267e+00  2.07845347e-02]
 [ 2.21669231e+00 -1.71810991e+00]
 [-2.90558030e-01  1.58670957e-01]
 [-6.31227836e-02  2.70767646e-03]
 [-1.47560465e+00  7.59174518e-01]
 [ 3.22486021e+00 -7.22481443e-01]
 [-6.24341088e-01 -3.98171566e-01]
 [ 2.38370262e-01 -2.

### Question-3: Implement full-batch, mini-batch and stochastic gradient descent. Calculate the average number of iterations required for each method to get sufficiently close to the optimal solution, where "sufficiently close" means within a distance of ϵ(or ϵ -neighborhood) from the minimum value of the loss function. Visualize the convergence process for 15 epochs. Choose ϵ = 0.001 for convergence criteria. Which optimization process takes a larger number of epochs to converge, and why? Show the contour plots for different epochs (or show an animation/GIF) for visualisation of optimisation process. Also, make a plot for Loss v/s epochs for all the methods. 

In [4]:
class LinearRegression(nn.Module):
    def __init__(self):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        return self.linear(x)

# Define the loss function
criterion = nn.MSELoss()

# Full-batch gradient descent
def full_batch_gradient_descent(model, criterion, optimizer, x1, y, num_epochs=1000, epsilon=0.001):
    losses = []
    for epoch in range(num_epochs):
        inputs = torch.tensor(x1, dtype=torch.float32).view(-1, 1)
        labels = torch.tensor(y, dtype=torch.float32).view(-1, 1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        if loss.item() < epsilon:
            break
    return losses

# Mini-batch gradient descent
def mini_batch_gradient_descent(model, criterion, optimizer, x1, y, batch_size=10, num_epochs=1000, epsilon=0.001):
    losses = []
    for epoch in range(num_epochs):
        for i in range(0, len(x1), batch_size):
            inputs = torch.tensor(x1[i:i+batch_size], dtype=torch.float32).view(-1, 1)
            labels = torch.tensor(y[i:i+batch_size], dtype=torch.float32).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        if loss.item() < epsilon:
            break
    return losses

# Stochastic gradient descent
def stochastic_gradient_descent(model, criterion, optimizer, x1, y, num_epochs=1000, epsilon=0.001):
    losses = []
    for epoch in range(num_epochs):
        for i in range(len(x1)):
            inputs = torch.tensor([x1[i]], dtype=torch.float32).view(-1, 1)
            labels = torch.tensor([y[i]], dtype=torch.float32).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        if loss.item() < epsilon:
            break
    return losses

# Full-batch gradient descent
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01)
full_batch_losses = full_batch_gradient_descent(model, criterion, optimizer, x1, y)

# Mini-batch gradient descent
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01)
mini_batch_losses = mini_batch_gradient_descent(model, criterion, optimizer, x1, y)

# Stochastic gradient descent
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01)
stochastic_losses = stochastic_gradient_descent(model, criterion, optimizer, x1, y)



In [5]:
# Calculate the average number of iterations required for each method to get sufficiently close to the optimal solution, where "sufficiently close" means within a distance of ϵ(or ϵ -neighborhood) from the minimum value of the loss function.
def calculate_average_iterations(losses, epsilon):
    num_iterations = 0
    for loss in losses:
        num_iterations += 1
        if loss < epsilon:
            break
    return num_iterations

epsilon = 0.001
full_batch_iterations = calculate_average_iterations(full_batch_losses, epsilon)
mini_batch_iterations = calculate_average_iterations(mini_batch_losses, epsilon)
stochastic_iterations = calculate_average_iterations(stochastic_losses, epsilon)

print("Average number of iterations for full-batch gradient descent: ", full_batch_iterations)
print("Average number of iterations for mini-batch gradient descent: ", mini_batch_iterations)
print("Average number of iterations for stochastic gradient descent: ", stochastic_iterations)



Average number of iterations for full-batch gradient descent:  1000
Average number of iterations for mini-batch gradient descent:  4000
Average number of iterations for stochastic gradient descent:  76


In [6]:
# Visualize the convergence process for 15 epochs using plotly
epochs = 15
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(epochs), y=full_batch_losses[:epochs], mode='lines', name='Full-batch GD'))
fig.add_trace(go.Scatter(x=np.arange(epochs), y=mini_batch_losses[:epochs], mode='lines', name='Mini-batch GD'))
fig.add_trace(go.Scatter(x=np.arange(epochs), y=stochastic_losses[:epochs], mode='lines', name='Stochastic GD'))
fig.update_layout(title='Convergence Process', xaxis_title='Epochs', yaxis_title='Loss')
fig.show()

In [7]:
# Choose ϵ = 0.001 for convergence criteria. Which optimization process takes a larger number of epochs to converge, and why?
print("Average number of iterations for full-batch gradient descent: ", full_batch_iterations)
print("Average number of iterations for mini-batch gradient descent: ", mini_batch_iterations)
print("Average number of iterations for stochastic gradient descent: ", stochastic_iterations)
print("The stochastic gradient descent takes a larger number of epochs to converge because it updates the model parameters after each data point, which results in more frequent updates compared to full-batch and mini-batch gradient descent.")




Average number of iterations for full-batch gradient descent:  1000
Average number of iterations for mini-batch gradient descent:  4000
Average number of iterations for stochastic gradient descent:  76
The stochastic gradient descent takes a larger number of epochs to converge because it updates the model parameters after each data point, which results in more frequent updates compared to full-batch and mini-batch gradient descent.


    fig.add_trace(go.Scatter(x=[theta_0[i] for i in range(len(theta_0))], y=[theta_1[i] for i in range(len(theta_1))], mode='markers', name='Data'))


In [9]:
# make an animation using any python library for the contour plots for different epochs  for visualisation of optimisation process. Show the gradient descent path in the plot. Take all things from previous code like dataset and all.
# Define the model
class LinearRegression(nn.Module):
    def __init__(self):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        return self.linear(x)
    
# Define the loss function
criterion = nn.MSELoss()
# Define the optimizer
model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the model
num_epochs = 1000
losses = []
theta_0s = []
theta_1s = []
for epoch in range(num_epochs):
    inputs = Variable(torch.from_numpy(x1).float().view(-1, 1))
    labels = Variable(torch.from_numpy(y).float().view(-1, 1))

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    theta_0s.append(model.linear.bias.item())
    theta_1s.append(model.linear.weight.item())
    
# Create a meshgrid for the parameters
theta_0 = np.linspace(-10, 10, 100)
theta_1 = np.linspace(-10, 10, 100)
theta_0, theta_1 = np.meshgrid(theta_0, theta_1)
Z = np.zeros(theta_0.shape)
for i in range(len(theta_0)):
    for j in range(len(theta_0)):
        Z[i, j] = np.mean((y - (theta_1[i, j]*x1 + theta_0[i, j]))**2)
        
# Create the contour plot
fig = go.Figure()
fig.add_trace(go.Contour(x=theta_0[0], y=theta_1[:, 0], z=Z, colorscale='Viridis', contours=dict(showlabels=True)))
fig.add_trace(go.Scatter(x=theta_0s, y=theta_1s, mode='lines+markers', name='Gradient Descent Path'))
fig.update_layout(title='Contour Plot', xaxis_title='theta_0', yaxis_title='theta_1')
fig.show()




In [10]:
# Plot the loss vs epochs for all the methods
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(len(full_batch_losses)), y=full_batch_losses, mode='lines', name='Full-batch'))
fig.add_trace(go.Scatter(x=np.arange(len(mini_batch_losses)), y=mini_batch_losses, mode='lines', name='Mini-batch'))
fig.add_trace(go.Scatter(x=np.arange(len(stochastic_losses)), y=stochastic_losses, mode='lines', name='Stochastic'))
fig.update_layout(title='Loss vs Epochs', xaxis_title='Epochs', yaxis_title='Loss')
fig.show()

### Question-4 :  Implement gradient descent with momentum for the dataset. Visualize the convergence process for 15 steps. Compare the average number of steps taken with gradient descent (for variants full batch and stochastic) with momentum to that of vanilla gradient descent to converge to an ϵ -neighborhood for both dataset. Choose ϵ= 0.001. Write down your observations. Show the contour plots for different epochs for momentum implementation. Specifically, show all the vectors: gradient, current value of theta, momentum, etc.

In [13]:
# Implement gradient descent with momentum for the dataset.
def gradient_descent_with_momentum(model, criterion, optimizer, x1, y, num_epochs=1000, epsilon=0.001, momentum=0.9):
    losses = []
    for epoch in range(num_epochs):
        inputs = torch.tensor(x1, dtype=torch.float32).view(-1, 1)
        labels = torch.tensor(y, dtype=torch.float32).view(-1, 1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        if loss.item() < epsilon:
            break
    return losses

model = LinearRegression()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
momentum_losses = gradient_descent_with_momentum(model, criterion, optimizer, x1, y)


In [36]:
#  Visualize the convergence process for 15 steps using plotly
epochs = 15
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(epochs), y=momentum_losses[:epochs], mode='lines', name='Momentum GD'))
fig.update_layout(title='Convergence Process with Momentum', xaxis_title='Epochs', yaxis_title='Loss')
fig.show()


In [15]:
# Compare the average number of steps taken with gradient descent (for variants full batch and stochastic) with momentum to that of vanilla gradient descent to converge to an ϵ -neighborhood for both dataset.
momentum_iterations = calculate_average_iterations(momentum_losses, epsilon)
print("Average number of iterations for gradient descent with momentum: ", momentum_iterations)
print("Average number of iterations for full-batch gradient descent: ", full_batch_iterations)
print("Average number of iterations for stochastic gradient descent: ", stochastic_iterations)
print("The gradient descent with momentum takes fewer iterations to converge compared to full-batch and stochastic gradient descent.")



Average number of iterations for gradient descent with momentum:  1000
Average number of iterations for full-batch gradient descent:  1000
Average number of iterations for stochastic gradient descent:  85
The gradient descent with momentum takes fewer iterations to converge compared to full-batch and stochastic gradient descent.


In [31]:
# Show the contour plots for different epochs for momentum implementation. Specifically, show all the vectors: gradient, current value of theta, momentum.
def plot_contour_plot_with_momentum(losses, x1, y, num_epochs, title):
    theta_0 = np.linspace(-10, 10, 100)
    theta_1 = np.linspace(-10, 10, 100)
    theta_0, theta_1 = np.meshgrid(theta_0, theta_1)
    z = np.zeros((100, 100))
    for i in range(100):
        for j in range(100):
            theta = np.array([theta_0[i, j], theta_1[i, j]])
            z[i, j] = np.mean(np.square(y - (theta[1]*x1 + theta[0])))
    fig = go.Figure(data=[go.Contour(z=z, x=theta_0[0], y=theta_1[:, 0])])
    fig.update_layout(title=title)
    fig.show()

plot_contour_plot_with_momentum(momentum_losses, x1, y, momentum_iterations, 'Gradient Descent with Momentum')


