In [None]:
'''
What is Autograd?

Autograd is PyTorch’s automatic differentiation engine.
It automatically computes gradients (derivatives) of tensors with respect to some scalar value (usually the loss in machine learning).

This is super useful for backpropagation in neural networks, where we need gradients to update weights.
PyTorch’s Autograd automates this for you — instead of applying the chain rule by hand, it traces the computation graph and computes every gradient automatically.

🔹 Key Concepts

1) Tensor with requires_grad=True

Tells PyTorch: "track all operations on this tensor, so I can compute gradients later."

import torch

x = torch.tensor([2.0], requires_grad=True)


2) Computation Graph

When you perform operations on tensors, PyTorch builds a graph of those operations.

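In autograd's graph the nodes are the operations (each result tensor records the operation that produced it in .grad_fn), and the edges are the tensors flowing between them. Input tensors such as x are the leaves of the graph.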

Example:

y = x**2 + 3*x + 1


Here, PyTorch builds a graph:

x ──┬──► (square) ─────────┐
    └──► (multiply by 3) ──┴──► (add) ──► (add 1) ──► y


3) Backward Pass (.backward())

If y is a scalar, calling y.backward() computes dy/dx.

The gradient is stored in x.grad.

y.backward()
print(x.grad)  # derivative of y wrt x: dy/dx = 2x + 3 = 7


4) Gradient Accumulation

By default, PyTorch accumulates gradients in .grad.

Before each training step, we clear them with:

optimizer.zero_grad()  # or x.grad.zero_()


5) Disabling Gradient Tracking (with torch.no_grad())

Used during inference to stop building the graph and save memory.

with torch.no_grad():
    y = x * 2


6) Detach Tensor (.detach())

Creates a new tensor that shares data but is not tracked for gradients.

z = x.detach()

'''

'\nWhat is Autograd?\n\nAutograd is PyTorch’s automatic differentiation engine.\nIt automatically computes gradients (derivatives) of tensors with respect to some scalar value (usually the loss in machine learning).\n\nThis is super useful for backpropagation in neural networks, where we need gradients to update weights.\n\n🔹 Key Concepts\n\n1) Tensor with requires_grad=True\n\nTells PyTorch: "track all operations on this tensor, so I can compute gradients later."\n\nimport torch\n\nx = torch.tensor([2.0], requires_grad=True)\n\n\n2)Computation Graph\n\nWhen you perform operations on tensors, PyTorch builds a graph of those operations.\n\nEach node in the graph is a tensor, and edges are functions (operations).\n\nExample:\n\ny = x**2 + 3*x + 1\n\n\nHere, PyTorch builds a graph:\n\nx → (square) → (multiply by 3) → (add) → y\n\n\n3)Backward Pass (.backward())\n\nIf y is a scalar, calling y.backward() computes dy/dx.\n\nThe gradient is stored in x.grad.\n\ny.backward()\nprint(x.grad)  # 

# Without autograd in PyTorch


In [3]:
import torch

# Single training example: one input feature and its true binary label
x, y = torch.tensor(6.7), torch.tensor(0.0)

# Starting weight and bias as plain tensors (no gradient tracking), since the
# gradients in this section are derived by hand
w, b = torch.tensor(1.0), torch.tensor(0.0)

In [4]:
# Binary Cross-Entropy Loss for scalar
def binary_cross_entropy_loss(prediction, target):
    """Binary cross-entropy between predicted probabilities and 0/1 targets.

    Args:
        prediction: Tensor of predicted probabilities (e.g. a sigmoid output).
        target: Tensor of true labels, combined element-wise with ``prediction``.

    Returns:
        Tensor holding ``-(target * log(p) + (1 - target) * log(1 - p))`` where
        ``p`` is ``prediction`` clamped slightly away from 0 and 1.
    """
    # The clamp margin has to survive rounding in the tensor's own dtype.
    # A fixed 1e-8 does not in float32: 1 - 1e-8 rounds to exactly 1.0, so a
    # saturated prediction of 1.0 slipped through and log(1 - p) became
    # log(0), giving inf (target 0) or nan (target 1). Using at least the
    # dtype's machine epsilon keeps both bounds effective; float64 still
    # gets the original 1e-8.
    float_dtype = prediction.dtype if prediction.is_floating_point() else torch.get_default_dtype()
    epsilon = max(1e-8, torch.finfo(float_dtype).eps)
    prediction = torch.clamp(prediction, epsilon, 1 - epsilon)
    return -(target * torch.log(prediction) + (1 - target) * torch.log(1 - prediction))

In [5]:
# Forward pass: linear score -> sigmoid probability -> loss.
# w and b are plain tensors here (created without requires_grad), so no graph
# is recorded; the gradients are worked out by hand from these values.
z = w * x + b  # Weighted sum (linear part)
y_pred = torch.sigmoid(z)  # Predicted probability

# Compute binary cross-entropy loss
loss = binary_cross_entropy_loss(y_pred, y)

In [6]:
# Scalar loss of the manual forward pass; its repr carries no grad_fn because
# w and b were created without requires_grad.
loss

tensor(6.7012)

In [7]:
# Chain rule by hand: dL/dw = dL/dy_pred * dy_pred/dz * dz/dw (and likewise for b).

# Link 1 - loss w.r.t. the prediction (derivative of binary cross-entropy)
dloss_dy_pred = (y_pred - y) / (y_pred * (1 - y_pred))

# Link 2 - prediction w.r.t. z (sigmoid derivative, written via its own output)
dy_pred_dz = y_pred * (1 - y_pred)

# Link 3 - z = w*x + b is linear, so dz/dw = x and dz/db = 1
dz_dw, dz_db = x, 1

# The first two links are shared by both parameters
dL_dz = dloss_dy_pred * dy_pred_dz
dL_dw = dL_dz * dz_dw
dL_db = dL_dz * dz_db

In [8]:
# Report the hand-derived gradients, one per line
print(
    f"Manual Gradient of loss w.r.t weight (dw): {dL_dw}",
    f"Manual Gradient of loss w.r.t bias (db): {dL_db}",
    sep="\n",
)

Manual Gradient of loss w.r.t weight (dw): 6.691762447357178
Manual Gradient of loss w.r.t bias (db): 0.998770534992218


# With autograd in PyTorch

In [9]:
# Same example as the manual walkthrough: feature 6.7 with true label 0
x, y = torch.tensor(6.7), torch.tensor(0.0)

In [10]:
# Weight (1.0) and bias (0.0) as leaf tensors with requires_grad=True, so
# autograd tracks every operation that uses them
w, b = (torch.tensor(value, requires_grad=True) for value in (1.0, 0.0))

In [11]:
# Inspect the weight: its repr includes requires_grad=True
w

tensor(1., requires_grad=True)

In [12]:
# Inspect the bias: its repr includes requires_grad=True
b

tensor(0., requires_grad=True)

In [13]:
# Linear part of the forward pass. Because w and b require gradients, z is
# linked into the graph: its repr shows the grad_fn of the last operation
# applied (the addition).
z = w*x + b
z

tensor(6.7000, grad_fn=<AddBackward0>)

In [14]:
# Squash the score into a probability; the result stays attached to the graph
# (its repr shows a sigmoid grad_fn).
y_pred = torch.sigmoid(z)
y_pred

tensor(0.9988, grad_fn=<SigmoidBackward0>)

In [15]:
# Same loss function as in the manual section; this time the scalar result
# remains connected to w and b through the graph (note the grad_fn in its repr).
loss = binary_cross_entropy_loss(y_pred, y)
loss

tensor(6.7012, grad_fn=<NegBackward0>)

In [16]:
# Backpropagate from the scalar loss: autograd applies the chain rule through
# the graph and stores dloss/dw in w.grad and dloss/db in b.grad.
loss.backward()

In [17]:
# Gradients computed by autograd for w and b, one per line; they agree with
# the manually derived dL_dw and dL_db.
print(w.grad, b.grad, sep="\n")

tensor(6.6918)
tensor(0.9988)


In [None]:
'''
Problem Statement

We want to train a simple linear model:

y = 2x + 1

Our goal: Given some training data, learn the parameters (weight w and bias b) using PyTorch autograd.
'''



In [18]:
import torch

# Training pairs generated from the target line y = 2x + 1
x = torch.arange(1.0, 5.0)  # tensor([1., 2., 3., 4.])
y = 2 * x + 1               # tensor([3., 5., 7., 9.])

In [19]:
# Zero initialization of parameters (both start at a fixed 0.0, not a random
# value); requires_grad=True asks autograd to track operations on them.
w = torch.tensor(0.0, requires_grad=True)  # weight
b = torch.tensor(0.0, requires_grad=True)  # bias

In [20]:
# Linear model
def model(x):
    """Linear prediction ``w * x + b`` using the notebook-level parameters ``w`` and ``b``."""
    weighted_input = w * x
    return weighted_input + b

# Mean Squared Error (MSE)
def mse(y_pred, y_true):
    """Mean of the squared differences between ``y_pred`` and ``y_true``."""
    residual = y_pred - y_true
    squared_error = residual.pow(2)
    return squared_error.mean()


In [21]:
learning_rate = 0.1

for epoch in range(10):
    # Forward: predictions for all samples and the scalar loss they produce
    y_pred = model(x)
    loss = mse(y_pred, y)

    # Backward: autograd fills w.grad and b.grad
    loss.backward()

    # Manual SGD step. The in-place update runs under no_grad so it is not
    # tracked; each gradient is then reset so the next backward() does not
    # add on top of this epoch's values.
    with torch.no_grad():
        for param in (w, b):
            param.sub_(learning_rate * param.grad)
            param.grad.zero_()

    # w and b are the updated values; loss is the one computed before the update
    print(f"Epoch {epoch+1}: w={w.item():.4f}, b={b.item():.4f}, loss={loss.item():.4f}")


Epoch 1: w=3.5000, b=1.2000, loss=41.0000
Epoch 2: w=1.1500, b=0.4100, loss=18.4150
Epoch 3: w=2.7200, b=0.9530, loss=8.2744
Epoch 4: w=1.6635, b=0.6024, loss=3.7210
Epoch 5: w=2.3671, b=0.8502, loss=1.6763
Epoch 6: w=1.8914, b=0.6966, loss=0.7579
Epoch 7: w=2.2060, b=0.8116, loss=0.3453
Epoch 8: w=1.9912, b=0.7463, loss=0.1597
Epoch 9: w=2.1313, b=0.8014, loss=0.0761
Epoch 10: w=2.0337, b=0.7755, loss=0.0383


In [None]:
'''
   (x=2)       (w)               (b)
      │         │                 │
      └────┬────┘                 │
           ▼                      │
         (mul)                    │
           │                      │
           ▼                      │
         (w·x)                    │
           │                      │
           └───────────┬──────────┘
                       ▼
                     (add) ──────► ŷ = w·x + b
                                      │
                                      ▼
                               (ŷ - y_true)
                                      │
                                      ▼
                               squared error
                                      │
                                      ▼
                                    loss


What happens during .backward()?

PyTorch applies chain rule through this graph:

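Compute the gradient of the loss with respect to the prediction
(for a single sample; with the mean over N samples each term is scaled by 1/N):

∂L/∂ŷ = 2(ŷ − y)

Flow the gradient to w and b:

∂L/∂w = ∂L/∂ŷ · x
∂L/∂b = ∂L/∂ŷ · 1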

PyTorch stores these in w.grad and b.grad.

🔹 Gradient Flow Summary

Forward pass: Build graph → compute outputs (ŷ) → compute loss.

Backward pass: Start from loss → apply chain rule backwards → fill w.grad and b.grad.

Update step: Adjust w and b using gradients.


'''