# PyTorch
## 1. Build the neural network
### 1) Build A NN Model


In [16]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [17]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [18]:
## Define Class
class NN(nn.Module):
    """Fully connected network producing 12 raw scores (logits) per sample.

    Each sample is flattened to 28 * 28 = 784 features and passed through
    Linear(784, 512) -> ReLU -> Linear(512, 512) -> ReLU -> Linear(512, 12).
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are created in forward order and handed to Sequential, which
        # numbers them 0..4.
        layers = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 12),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` and return the un-normalised class scores."""
        return self.linear_relu_stack(self.flatten(x))

In [19]:
# Instantiate the network and show its layer structure.
model = NN()
print(model)

# Push one random 28x28 sample through the model to get the raw scores.
X = torch.rand(1, 28, 28)
logits = model(X)
print("Result: ", logits)

# Normalise the scores across the class dimension, then take the most
# probable class index.
to_probabilities = nn.Softmax(dim=1)
pred_probab = to_probabilities(logits)
print("pred_probab: ", pred_probab)
y_pred = pred_probab.argmax(dim=1)
print(f"Predicited Class: {y_pred}")
del to_probabilities  # keep the notebook namespace unchanged

NN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=12, bias=True)
  )
)
Result:  tensor([[-0.0612,  0.0053, -0.0304,  0.0424, -0.0022, -0.0888,  0.0538,  0.1274,
         -0.0222, -0.0036, -0.0424,  0.0291]], grad_fn=<AddmmBackward0>)
pred_probab:  tensor([[0.0782, 0.0836, 0.0807, 0.0868, 0.0830, 0.0761, 0.0877, 0.0945, 0.0813,
         0.0829, 0.0797, 0.0856]], grad_fn=<SoftmaxBackward0>)
Predicited Class: tensor([7])


### 2) Model Layers

In [20]:
# A stand-in mini-batch: three random 28x28 tensors with values in [0, 1).
input_image = torch.rand(3, 28, 28)
print(input_image.shape)

torch.Size([3, 28, 28])


In [21]:
# nn.Flatten keeps the first (batch) dimension and collapses the remaining
# ones, so the (3, 28, 28) input becomes (3, 784).
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.size())

torch.Size([3, 784])


In [22]:
# A linear layer maps each 784-feature row to 20 features using its own
# weight matrix and bias.
layer1 = nn.Linear(in_features=28*28, out_features=20)
hidden1 = layer1(flat_image)
print(hidden1.size())

torch.Size([3, 20])


In [23]:
# Show the activations before and after ReLU: negative entries become 0,
# positive entries pass through unchanged.
# Note: `hidden1` is rebound to the ReLU output here.
print(f"Before ReLU: {hidden1}\n\n")
hidden1 = nn.ReLU()(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 0.2773, -0.0380,  0.1990,  0.0036,  0.0352, -0.6916, -0.2113,  0.0315,
          0.1830, -0.5613, -0.0925,  0.2545,  0.2812,  0.1356,  0.3041,  0.1392,
         -0.6648,  0.1825, -0.5288,  0.0067],
        [ 0.4752, -0.0291,  0.3626,  0.1304,  0.0532, -0.4519, -0.2813,  0.0238,
          0.2121, -0.5340, -0.1467,  0.4844,  0.0466,  0.1575,  0.5052,  0.3524,
         -0.6622, -0.0446, -0.3488,  0.3965],
        [ 0.1879,  0.2822,  0.3781,  0.1618, -0.1590, -0.3062, -0.3237,  0.1606,
          0.1825, -0.7992, -0.1025,  0.2283, -0.1023,  0.0246,  0.1677,  0.1427,
         -0.3895,  0.1979, -0.6698, -0.1093]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.2773, 0.0000, 0.1990, 0.0036, 0.0352, 0.0000, 0.0000, 0.0315, 0.1830,
         0.0000, 0.0000, 0.2545, 0.2812, 0.1356, 0.3041, 0.1392, 0.0000, 0.1825,
         0.0000, 0.0067],
        [0.4752, 0.0000, 0.3626, 0.1304, 0.0532, 0.0000, 0.0000, 0.0238, 0.2121,
         0.0000, 0.0000, 0.4844, 0.0466, 0.1575, 0.50

In [24]:
# nn.Sequential chains modules in the order given. This pipeline reuses the
# `flatten` and `layer1` objects created in the earlier cells and appends a
# ReLU and a fresh 20 -> 10 linear layer.
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(20, 10)
)

# Note: this rebinds `input_image` and `logits`, replacing the values the
# earlier cells assigned to those names.
input_image = torch.rand(3, 28, 28)
logits = seq_modules(input_image)

In [25]:
# Display the raw (un-normalised) outputs of seq_modules: one row per sample,
# one column per output unit.
logits

tensor([[-0.1401, -0.2220,  0.1288, -0.0182,  0.2030,  0.0044,  0.0046,  0.2070,
          0.1709,  0.1050],
        [-0.2172, -0.1837,  0.2571, -0.0160,  0.2552, -0.0059, -0.0054,  0.2456,
          0.0862,  0.1759],
        [-0.1179, -0.2739,  0.1828, -0.0356,  0.1859, -0.0231,  0.1972,  0.2984,
          0.1610,  0.1459]], grad_fn=<AddmmBackward0>)

In [26]:
# Softmax over dim=1 rescales each row of logits into values in [0, 1] that
# sum to 1, i.e. per-sample class probabilities.
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)
pred_probab

tensor([[0.0824, 0.0759, 0.1078, 0.0931, 0.1161, 0.0952, 0.0952, 0.1166, 0.1124,
         0.1053],
        [0.0748, 0.0774, 0.1202, 0.0915, 0.1200, 0.0924, 0.0925, 0.1189, 0.1014,
         0.1109],
        [0.0816, 0.0698, 0.1102, 0.0886, 0.1105, 0.0897, 0.1118, 0.1237, 0.1078,
         0.1062]], grad_fn=<SoftmaxBackward0>)

In [27]:
# Print the architecture of `model`, then list every learnable parameter.
print(f"Model structure:  {model}\n\n")

# named_parameters() yields (name, tensor) pairs; only the first two
# rows/entries of each tensor are printed to keep the output short.
for name, param in model.named_parameters():
    print(f"Layer: {name} | size: {param.size()} | Values: {param[:2]} \n")

Model structure:  NN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=12, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | size: torch.Size([512, 784]) | Values: tensor([[-0.0157, -0.0124,  0.0347,  ...,  0.0302,  0.0143, -0.0017],
        [ 0.0122,  0.0111,  0.0303,  ..., -0.0132,  0.0272,  0.0117]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | size: torch.Size([512]) | Values: tensor([0.0341, 0.0066], grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | size: torch.Size([512, 512]) | Values: tensor([[ 0.0337, -0.0393, -0.0407,  ...,  0.0294,  0.0197, -0.0048],
        [-0.0231,  0.0124,  0.0158,  ...,  0.0098, -0.0427, -0.0085]],
       grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.bias | size: torch.Siz

## 2. Automatic Differentiation with torch.autograd
When training neural networks, the most frequently used algorithm is backpropagation. In this algorithm, the parameters (model weights) are adjusted according to the gradient of the loss function with respect to each parameter.

PyTorch has a built-in differentiation engine, torch.autograd, to compute those gradients.

In [28]:
import torch

# Input and target are plain tensors; only the parameters w and b are
# created with requires_grad=True so autograd tracks operations on them.
x = torch.ones(5)
y = torch.zeros(3)
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

# One linear layer (z = xW + b) followed by binary cross-entropy computed
# directly on the logits.
z = x @ w + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

In [38]:
# Each tensor produced by a tracked operation stores, in `grad_fn`, a
# reference to the backward function that autograd will use for it.
print(f"Gradient function for z = {z.grad_fn}")
print(f"Gradient function for loss = {loss.grad_fn}")

Gradient function for z = <AddBackward0 object at 0x0000011ED76117C0>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x0000011ED7611820>


#### Computing Gradients
To optimize weights of parameters in the neural network, we need to compute the derivatives of our loss function with respect to parameters, namely, we need $\frac{\partial loss}{\partial w}$ and $\frac{\partial loss}{\partial b}$ under some fixed values of x and y. We call loss.backward() to compute those derivatives, and then retrieve the values from w.grad and b.grad.

* We can only obtain the grad properties for the leaf nodes of the computational graph, which have requires_grad property set to True. For all other nodes in our graph, gradients will not be available.
* We can only perform gradient calculations using backward once on a given graph, for performance reasons. If we need to do several backward calls on the same graph, we need to pass retain_graph=True to the backward call.

In [39]:
# Backpropagate from the scalar loss; this fills w.grad and b.grad.
# No retain_graph=True is passed, so this cell cannot simply be re-run
# without first rebuilding `loss`.
loss.backward()

In [40]:
# d(loss)/dw, same shape as w (5, 3).
print(w.grad)

tensor([[0.1949, 0.3180, 0.2546],
        [0.1949, 0.3180, 0.2546],
        [0.1949, 0.3180, 0.2546],
        [0.1949, 0.3180, 0.2546],
        [0.1949, 0.3180, 0.2546]])


In [41]:
# d(loss)/db, same shape as b (3,).
print(b.grad)

tensor([0.1949, 0.3180, 0.2546])


#### Disabling Gradient Tracking
By default, all tensors with requires_grad=True track their computational history and support gradient computation. However, there are some cases when we do not need to do that, for example, when we have trained the model and just want to apply it to some input data, i.e. we only want to do *forward* computations through the network. We can stop tracking computations by surrounding our computation code with a torch.no_grad() block:

In [43]:
# Computed normally, z depends on w and b (both require grad), so z is tracked.
z = torch.matmul(x, w) + b
print(z.requires_grad)

# The same expression inside torch.no_grad() is not recorded by autograd,
# so the result does not require grad.
with torch.no_grad():
    z = torch.matmul(x, w) + b
print(z.requires_grad)

True
False


Another way to achieve the same result is to call the detach() method on the tensor:

In [45]:
# detach() returns a tensor with the same values that is cut off from the
# computational graph; `z` itself stays tracked.
z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)

False


#### Tensor Gradients and Jacobian Products

In [62]:
# 4x5 matrix with ones on the main diagonal, created as a leaf tensor that
# tracks gradients.
inp = torch.eye(4, 5, requires_grad=True)
inp

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.]], requires_grad=True)

In [63]:
# Element-wise (inp + 1)^2, then transposed to shape (5, 4). `out` is not a
# scalar, so backward() on it needs an explicit gradient argument.
out = (inp + 1).pow(2).t()
out

tensor([[4., 1., 1., 1.],
        [1., 4., 1., 1.],
        [1., 1., 4., 1.],
        [1., 1., 1., 4.],
        [1., 1., 1., 1.]], grad_fn=<TBackward0>)

In [64]:
# For a non-scalar output, backward(v) computes the vector-Jacobian product
# with v; a tensor of ones sums the contributions of every output element.
# retain_graph=True keeps the graph so backward can be called again below.
out.backward(torch.ones_like(out), retain_graph=True)
print(f"First call\n{inp.grad}")

First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])


In [65]:
# Gradients accumulate: a second backward() adds to the existing inp.grad,
# so the printed values are double those of the first call.
out.backward(torch.ones_like(out), retain_graph=True)
print(f"Second call\n{inp.grad}")

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])


In [66]:
# Reset the accumulated gradient in place before calling backward again;
# the result then matches the first call.
inp.grad.zero_()
out.backward(torch.ones_like(out), retain_graph=True)
print(f"Call after zeroing gradients\n{inp.grad}")

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])


## 3. Optimizing Model Parameters
Training a model is an iterative process; in each iteration the model makes a guess about the output, calculates the error in its guess *(loss)*, collects the derivatives of the error with respect to its parameters, and optimizes these parameters using gradient descent. For a more detailed walkthrough of this process, check out the video: https://www.youtube.com/watch?v=tIeHLnjs5U8.


#### Prerequisite Code
We reuse the code from the previous sections on Datasets & DataLoaders and on building the model.

In [75]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# FashionMNIST training split, stored under ./data (downloaded when
# download=True and the files are missing); ToTensor() converts each image
# to a tensor.
training_data = datasets.FashionMNIST(
    root = "data",
    train = True,
    download = True,
    transform = ToTensor()
)

# Matching test split (train=False), same storage location and transform.
test_data = datasets.FashionMNIST(
    root = "data",
    train = False,
    download = True,
    transform = ToTensor()
)

# Wrap both datasets in DataLoaders that yield batches of 64 samples.
# No `shuffle` argument is passed, so the DataLoader default applies.
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

class NN(nn.Module):
    """Fully connected classifier producing 10 raw scores (logits) per sample.

    Input samples are flattened to 28 * 28 = 784 features, then passed through
    two hidden layers of width 512 with ReLU activations and a final linear
    layer with 10 outputs.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features=28 * 28, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=10),
        )

    def forward(self, x):
        """Return the un-normalised class scores for the batch ``x``."""
        flat = self.flatten(x)
        return self.linear_relu_stack(flat)

# Fresh, untrained instance of the 10-output NN defined in this cell. This
# rebinds `model`, so any optimizer built from an earlier `model` object
# does not update this one.
model = NN()



In [68]:
# Hyperparameters
learning_rate = 1e-3
batch_size = 64
epochs = 5

#### Optimization Loop
Each iteration of the optimization loop is called an **epoch**.
Each epoch consists of two main parts:
* **The Train Loop** - iterate over the training dataset and try to converge to optimal parameters.
* **The Validation/Test Loop** - iterate over the test dataset to check if model performance is improving.

#### Loss Function
* **nn.MSELoss** - Mean Square Error, for regression tasks.
* **nn.NLLLoss** - Negative Log Likelihood, for classification.
* **nn.CrossEntropyLoss** - combines nn.LogSoftmax and nn.NLLLoss.

In [69]:
# Cross-entropy loss takes the raw logits and integer class labels.
loss_fn = nn.CrossEntropyLoss()
# Plain stochastic gradient descent over all parameters of the current
# `model` object, using the learning rate from the hyperparameter cell.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

Inside the training loop, optimization happens in three steps:
* Call **optimizer.zero_grad()** to reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration.
* Backpropagate the prediction loss with a call to **loss.backward()**. PyTorch deposits the gradients of the loss w.r.t. each parameter.
* Once we have our gradients, we call **optimizer.step()** to adjust the parameters by the gradients collected in the backward pass.

#### Full Implementation

In [71]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    For each batch the function runs a forward pass, computes the loss,
    clears old gradients, backpropagates, and applies one optimizer step.
    Every 100th batch it prints the current loss and the number of samples
    processed so far.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing a ``dataset``
            attribute whose length is the total number of samples.
        model: The ``nn.Module`` being trained; updated in place.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer built over ``model``'s parameters.

    Returns:
        None. Progress is reported through ``print``.
    """
    size = len(dataloader.dataset)

    # Put the model in training mode explicitly. It matters for layers such
    # as dropout and batch norm, and guards against a preceding evaluation
    # pass having left the model in eval mode.
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation: gradients accumulate by default, so clear them
        # before computing the new ones.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # Use a separate name for the Python float so `loss` keeps
            # referring to the tensor.
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f} [{current: >5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n ACC: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}\n")

In [76]:
# Re-create the loss and optimizer so the optimizer is bound to the
# parameters of the current `model` object (same settings as the earlier
# loss/optimizer cell).
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# NOTE(review): this overrides the `epochs = 5` set in the hyperparameter
# cell; confirm that 100 epochs is intended.
epochs = 100

# Each epoch is one full training pass followed by one evaluation pass.
for t in range(epochs):
    print(f"Epoch {t+1}\n---------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
---------------------------------
loss: 2.307213 [   64/60000]
loss: 2.293616 [ 6464/60000]
loss: 2.280282 [12864/60000]
loss: 2.262713 [19264/60000]
loss: 2.244120 [25664/60000]
loss: 2.214276 [32064/60000]
loss: 2.224720 [38464/60000]
loss: 2.197532 [44864/60000]
loss: 2.186252 [51264/60000]
loss: 2.145954 [57664/60000]
Test Error: 
 ACC: 34.4%, Avg loss: 2.151113

Epoch 2
---------------------------------
loss: 2.165578 [   64/60000]
loss: 2.152183 [ 6464/60000]
loss: 2.101547 [12864/60000]
loss: 2.103894 [19264/60000]
loss: 2.049453 [25664/60000]
loss: 1.991416 [32064/60000]
loss: 2.016410 [38464/60000]
loss: 1.944982 [44864/60000]
loss: 1.941644 [51264/60000]
loss: 1.861235 [57664/60000]
Test Error: 
 ACC: 56.1%, Avg loss: 1.867334

Epoch 3
---------------------------------
loss: 1.905179 [   64/60000]
loss: 1.872437 [ 6464/60000]
loss: 1.764631 [12864/60000]
loss: 1.790527 [19264/60000]
loss: 1.675589 [25664/60000]
loss: 1.639269 [32064/60000]
loss: 1.654220 [38464/60000]