<a href="https://colab.research.google.com/github/ElFatemehHonarvar/ElFatemehHonarvar/blob/main/pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn

In [None]:
a = [[1, 2, 3], [4, 5, 6]]
b = [[1, 2, 3]]
print(torch.tensor(a).shape)
print(torch.tensor(b).shape)

print(torch.tensor(a) + torch.tensor(b))


torch.Size([2, 3])
torch.Size([1, 3])
tensor([[2, 4, 6],
        [5, 7, 9]])


In [None]:
a = [[1, 2, 3, -1], [4, 5, 6, 0]]
b = [[1, 2]]
print(torch.tensor(a).shape)
print(torch.tensor(b).shape)

print(torch.tensor(a) + torch.tensor(b))

torch.Size([2, 4])
torch.Size([1, 2])


RuntimeError: The size of tensor a (4) must match the size of tensor b (2) at non-singleton dimension 1

In [None]:
torch.tensor(a).view(8, 1)

tensor([[ 1],
        [ 2],
        [ 3],
        [-1],
        [ 4],
        [ 5],
        [ 6],
        [ 0]])

In [None]:
a = [[1, 2, 3], [4, 5, 6]]
b = [[1, 2, 3]]

tensor_a = torch.tensor(a)
tensor_b = torch.tensor(b)
print(f"shape a is {tensor_a.shape}")
print(f"shape b is {tensor_b.shape}")

tensor_b = tensor_b.view(3, 1)
print(tensor_a.matmul(tensor_b))

shape a is torch.Size([2, 3])
shape b is torch.Size([1, 3])
tensor([[14],
        [32]])


inter-convert tensors with numpy arrays

In [None]:
array_a = tensor_a.numpy()
print(f" array a is {array_a}")

tensor_a = torch.tensor(array_a)
print(f" tensor a is {tensor_a}")


 array a is [[1 2 3]
 [4 5 6]]
 tensor a is tensor([[1, 2, 3],
        [4, 5, 6]])


**Note** : One of the reasons why we use **tensors** is vectorized oprations, operations that be conducted in parallel over a particular dimension of tensor. Essentially we can parallelize a lot of different computations and do them, for instance, across a batch of data

In [None]:
tensor_a
tensor_b

tensor([[1],
        [2],
        [3]])

In [None]:
tensor_b.sum(dim=1)

tensor([1, 2, 3])

In [None]:
tensor_c = torch.tensor([[[1, 2, 3]]])
print(f"tensor is  {tensor_c}, shape is {tensor_c.shape}")
# the dimension that you specify in the sum is the dimension you're collapsing.

print(f"sum over dim = 0  is {tensor_c.sum(dim=0)}, shape is {tensor_c.sum(dim=0).shape} ")
print(f"sum over dim = 1  is {tensor_c.sum(dim=1)}, shape is {tensor_c.sum(dim=1).shape}")
print(f"sum over dim = 2  is {tensor_c.sum(dim=2)}, shape is {tensor_c.sum(dim=2).shape}")


tensor is  tensor([[[1, 2, 3]]]), shape is torch.Size([1, 1, 3])
sum over dim = 0  is tensor([[1, 2, 3]]), shape is torch.Size([1, 3]) 
sum over dim = 1  is tensor([[1, 2, 3]]), shape is torch.Size([1, 3])
sum over dim = 2  is tensor([[6]]), shape is torch.Size([1, 1])


**Note** : But it's not just sum. You can compute standard deviations. You can normalize your data. You can do other operations, which essentially batch across the entire set of data.

In [None]:
# if you don't specify any dimensions, then by default, the operation actually applies to the entire tensor.
tensor_c.sum()

tensor(6)

In [None]:
tensor_d = torch.tensor([[1, 2.2, 9.6], [4, -7.2, 6.3]])

print(f"tensor is  {tensor_d}, shape is {tensor_d.shape}")
print("-"*50)

col_avg = tensor_d.mean(dim=0)
print(f"mean over dim = 0  is {col_avg}, shape is {col_avg.shape}")

row_avg = tensor_d.mean(dim=1)
# If we're taking the average Over Rows, then an object that's 2x3, should just become an object that's 2.
print(f"mean over dim = 1  is {row_avg}, shape is {row_avg.shape}")



tensor is  tensor([[ 1.0000,  2.2000,  9.6000],
        [ 4.0000, -7.2000,  6.3000]]), shape is torch.Size([2, 3])
--------------------------------------------------
mean over dim = 0  is tensor([ 2.5000, -2.5000,  7.9500]), shape is torch.Size([3])
mean over dim = 1  is tensor([4.2667, 1.0333]), shape is torch.Size([2])



## **Indexing**

You can access arbitrary elements of a tensor using the ``[]`` operator




In [None]:
tensor_e = torch.tensor([
    [[1, 2, 3], [0, -1, -2]], [[4, 5, 6], [0, -3, -4]]
])
tensor_e.shape

torch.Size([2, 2, 3])

In [None]:
# Access the 0th element which is the first row
tensor_e[0] # Equivalent to tensor_e[0, :]

tensor([[ 1,  2,  3],
        [ 0, -1, -2]])

In [None]:
tensor_e[:, 0]

tensor([[1, 2, 3],
        [4, 5, 6]])

In [None]:
import numpy as np
x = np.array([[1, 2, 3], [4, 5, 6]])
x[0, :]

array([1, 2, 3])

In [None]:
tensor_f = torch.arange(1, 31, 2).view(5, 3)
tensor_f

tensor([[ 1,  3,  5],
        [ 7,  9, 11],
        [13, 15, 17],
        [19, 21, 23],
        [25, 27, 29]])

In [None]:
tensor_f[0, :]

tensor([1, 3, 5])

In [None]:
tensor_f[0, 2]

tensor(5)

In [None]:
tensor_f[0:2]


tensor([[ 1,  3,  5],
        [ 7,  9, 11]])

In [None]:
tensor_f[3, 1]

tensor(21)

In [None]:
tensor_f[:, 1:2]

tensor([[ 3],
        [ 9],
        [15],
        [21],
        [27]])

In [None]:
tensor_f[:, 1]

tensor([ 3,  9, 15, 21, 27])

In [None]:
tensor_f[0:3, 0:2]

tensor([[ 1,  3],
        [ 7,  9],
        [13, 15]])

In [None]:
tensor_f[0:3, 2]

tensor([ 5, 11, 17])

In [None]:
tensor_f[0:3][2]

tensor([13, 15, 17])

In [None]:
tensor_f[3, 2]

tensor(23)

In [None]:
tensor_f[3][2]

tensor(23)

In [None]:
# list indexing
tensor_f[[0, 2, 4]]

tensor([[ 1,  3,  5],
        [13, 15, 17],
        [25, 27, 29]])

In [None]:
tensor_f[:,[0, 2]]

tensor([[ 1,  5],
        [ 7, 11],
        [13, 17],
        [19, 23],
        [25, 29]])

In [None]:
tensor_g = torch.arange(1, 13, dtype=torch.float32).view(3, 2, 2)
tensor_g


tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

In [None]:
tensor_g[0, :, :] # Equivalent to tensor_g[0]

tensor([[1., 2.],
        [3., 4.]])

In [None]:
tensor_g[0, 1, 1]

tensor(4.)

**Note** : when we're writing code with neural networks,ultimately, we're going to process
some data through a network and we're going to get a loss. And that loss needs to be a scalar. And then we're going to compute gradients, with respect to that loss. So one thing to keep in mind is that sometimes you might have an operation and it fails because it was actually expecting a scalar value rather than a tensor.

We can get a ``Python`` scalar value from a tensor with ``item():``

In [None]:
tensor_g[0, 1, 1].item()

4.0

## Autograd
PyTorch is well-known for its automatic differentiation feature. We can call backward call the ``backward()`` method to ask ``PyTorch`` to calculate the gradients, which are then stored in the ``grad`` attribute.

Detailed explanation:

PyTorch essentially provides an automatic differentiation package where when you define your neural network, you're essentially defining many nodes that
compute some function. And in the forward pass, you're kind of running your data through those nodes. But what PyTorch is doing on the back end, is that at each of those points, it's going to actually store the gradients and accumulate them, so that every time you do your backwards pass, you apply the chain rule to be able to calculate all of these different gradients. And PyTorch caches those gradients. And then you will have access to all of those gradients to be able to actually then run your favorite optimizer and optimize with SGD, or with Adam, or whichever optimizer you choose.

summary : y.backward() will perform backprop to compute the gradients for all the leaf Tensors used to compute y.
The .grad attribute of leaf Tensors is where these computed gradients are stored.
In the backwards pass, what it's doing is you can imagine there's the x tensor
and then there's the ``.grad`` attribute, which is another separate tensor. It's going to be the same shape as x. And what that is storing, is it's storing the accumulated gradient from every single time that you've called ``.backward()`` on a quantity that essentially has some dependency on x, that will have a non-zero gradient.


More info :
https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html

In [None]:
x = torch.tensor([2.], requires_grad=True)
print(x.grad)

None


``requires_grad`` parameter tells PyTorch to store gradients

why do we have this when we always want to store the gradient?

the answer is, at train time, you need the gradients in order to actually train your network. But at inference time, you'd actually want to disable your gradients. And you can actually do that because it's a lot of extra computation that's not needed, since you're not making any updates to your network anymore.
you need to zero out the gradient because you don't want the previous gradients
from the last epoch where you iterated through all of your training data to mess with the current update that you're doing.


In [None]:
x = torch.tensor([2.], requires_grad=True)

# we've essentially made two different backwards passes. We've called it once
# on this function y, which is a function of x. And we've called it once on z,
# which is also a function of x.
y = x * x * 3
y.backward()
print(x.grad)

z = x * x * 3
z.backward()
print(x.grad)

tensor([12.])
tensor([24.])


#**bold text** Add description

## Neural Network Module

we're going to be defining neural networks in terms of existing building blocks, in terms of existing APIs, which will implement for instance linear layers or different activation functions that we need.

In [None]:
import torch.nn as nn

The way the linear layer works in PyTorch, is it takes in two arguments. It takes in the input dimension and then the output dimension. And so what it does, is it takes in some input, which has some arbitrary amount of dimensions,
and then finally, the input dimension. And it will essentially output it to that same set of dimensions, except the output dimension and the very last place.


Typically, we think of the first dimension as the batch dimension. So in this case, it said ``N`` this you can think of as if you had a batch of images,
it would be the number of images. If you had a training corpus of text,
it would be essentially the number of sentences or sequences.

The star indicates that there can be an arbitrary number of dimensions. So for instance, if we had images, this could be a 4-dimensional tensor object. It could be the batch size by the number of channels by the height, by the width.
But in general, there's no fixed number of dimensions.

Your input tensor can be any number of dimensions.The key is just that last dimension needs to match up with the input dimension of your linear layer.

So essentially, we're saying that we're going to map this last dimension, which
is 4-dimensional to now 2-dimensional. So in general, you can think of this as if we're stacking a neural network, 5 in nn.Linear(5, 1) is the kind of input dimension size.And 1 in nn.Linear(5, 1) would be like the hidden dimension size.

## Linear Layer

In [None]:
input = torch.ones(2, 3, 5)

linear = nn.Linear(5, 1)
output = linear(input)
print(output.shape)
print("-------------")
print(output)

torch.Size([2, 3, 1])
-------------
tensor([[[0.4913],
         [0.4913],
         [0.4913]],

        [[0.4913],
         [0.4913],
         [0.4913]]], grad_fn=<ViewBackward0>)


we have this grad function ``grad_fn``. And so that's because we're actually computing and storing the gradients here for our tensor.

In [None]:
# And so both of them store the gradients.And in this case, these are what the current values of these parameters are.And they'll change as we trained the network.
list(linear.parameters())   # Ax + b

[Parameter containing:
 tensor([[ 0.1817,  0.3444, -0.4287, -0.1835, -0.0045]], requires_grad=True),
 Parameter containing:
 tensor([0.1396], requires_grad=True)]

In [None]:
block = nn.Sequential(
    nn.Linear(5, 1),
    nn.Sigmoid()
)

input = torch.ones(2, 3, 5)
output = block(input)
output

tensor([[[0.5247],
         [0.5247],
         [0.5247]],

        [[0.5247],
         [0.5247],
         [0.5247]]], grad_fn=<SigmoidBackward0>)

In [None]:
class MultilayerPerceptron(nn.Module):
  def __init__(self, input_size , hidden_size):
    super(MultilayerPerceptron, self).__init__()   # super is an object from main class,nn.Modul. here we bound the main class to MultilayerPerceptron and its objest,self.

    self.input_size = input_size
    self.hidden_size = hidden_size

    self.model = nn.Sequential(
        nn.Linear(self.input_size, self.hidden_size),
        nn.ReLU(),
        nn.Linear(self.hidden_size, self.input_size),
        nn.Sigmoid()
    )

  def forward(self, x):
    output = self.model(x)
    return output

In [None]:
super(MultilayerPerceptron) # unbound

<super: __main__.MultilayerPerceptron, None>

In [None]:
input = torch.randn(2, 5)
model = MultilayerPerceptron(5, 3)
model(input)

tensor([[0.7216, 0.5806, 0.3648, 0.3676, 0.4173],
        [0.6102, 0.5158, 0.4536, 0.4386, 0.4934]], grad_fn=<SigmoidBackward0>)

In [None]:
super(MultilayerPerceptron, model) #bound

<super: __main__.MultilayerPerceptron,
        MultilayerPerceptron(
          (model): Sequential(
            (0): Linear(in_features=5, out_features=3, bias=True)
            (1): ReLU()
            (2): Linear(in_features=3, out_features=5, bias=True)
            (3): Sigmoid()
          )
        )>

In [None]:
model.forward(input)

tensor([[0.3328, 0.5894, 0.5448, 0.4074, 0.6427],
        [0.4010, 0.5857, 0.5591, 0.4121, 0.6242]], grad_fn=<SigmoidBackward0>)

The reason ``model(input)`` and ``model.forward(input)`` yield the same results is due to how PyTorch's nn.Module class is designed.

When you call ``model(input)``, it internally invokes the ``__call__ ``method of ``nn.Module``, which is built into every PyTorch model. The ``__call__`` method, in turn, calls the forward method you defined in your model. Therefore, ``model(input)`` is effectively the same as ``model.forward(input)``.

``model.forward(input)`` directly calls the forward method, bypassing any additional functionality provided by ``__call__``, such as hooks, which PyTorch uses for things like logging, profiling, or modifying inputs and outputs.

In [None]:
list(model.named_parameters())


[('model.0.weight',
  Parameter containing:
  tensor([[ 0.3469, -0.2862, -0.1382, -0.1603, -0.3757],
          [ 0.3582, -0.0254,  0.3075,  0.0823, -0.3236],
          [ 0.1594, -0.3484, -0.1925,  0.3264, -0.0264]], requires_grad=True)),
 ('model.0.bias',
  Parameter containing:
  tensor([-0.4370, -0.4294, -0.2770], requires_grad=True)),
 ('model.2.weight',
  Parameter containing:
  tensor([[-0.5521,  0.4971, -0.1844],
          [ 0.1914,  0.5041,  0.0373],
          [-0.1350,  0.1558,  0.3496],
          [-0.0565,  0.0993,  0.3199],
          [-0.0483, -0.2651, -0.1678]], requires_grad=True)),
 ('model.2.bias',
  Parameter containing:
  tensor([ 0.2900,  0.5341, -0.3321, -0.4505,  0.2494], requires_grad=True))]

## How to train a network?

In [None]:
# optimization
# opdate the parameters using those gradients
import torch.optim as optim

In [None]:
y = torch.ones(10, 5)
x = y + torch.randn_like(y)  # y + noise

In [None]:
model = MultilayerPerceptron(5, 3)

# optimizer
adam_optim = optim.Adam(model.parameters() ,lr=1e-1)

# loss function  -> here the loss is cross entropy loss
loss_function = nn.BCELoss()

y_pred = model(x)
loss_function(y_pred, y).item()  # .item()    to extract scalar

0.8946205973625183

In [None]:
n_epoch = 10
for epoch in torch.arange(n_epoch):

  adam_optim.zero_grad()
  y_pred = model(x)
  loss = loss_function(y_pred, y)
  print(type(loss))
  print(f"epoch : {epoch}, loss : {loss}")

  # computes all the gradients in the backward pass from our loss
  loss.backward()

  adam_optim.step()



<class 'torch.Tensor'>
epoch : 0, loss : 0.177326962351799
<class 'torch.Tensor'>
epoch : 1, loss : 0.1085415929555893
<class 'torch.Tensor'>
epoch : 2, loss : 0.061373766511678696
<class 'torch.Tensor'>
epoch : 3, loss : 0.032618504017591476
<class 'torch.Tensor'>
epoch : 4, loss : 0.016694240272045135
<class 'torch.Tensor'>
epoch : 5, loss : 0.008397682569921017
<class 'torch.Tensor'>
epoch : 6, loss : 0.004193254746496677
<class 'torch.Tensor'>
epoch : 7, loss : 0.002120769117027521
<class 'torch.Tensor'>
epoch : 8, loss : 0.0011030498426407576
<class 'torch.Tensor'>
epoch : 9, loss : 0.0005926700541749597


In the code snippet you provided, let's break down the key parts:

1. **`loss.backward()`**:
   - This line initiates the *backward pass* in the neural network, which calculates the gradients of the loss with respect to each of the model parameters.
   - Specifically, it leverages the chain rule to compute these gradients for all the parameters that contributed to the output.
   - When you call `loss.backward()`, it populates the `.grad` attributes of each parameter in your model with the gradient values.
   - These gradients are later used by the optimizer (here, `adam_optim`) to update the model's parameters, allowing the model to learn from the data.

2. **Why is `loss.item()` not used in the print statement?**
   - Normally, `loss.item()` is used to retrieve the raw Python float value of the loss tensor for display or logging. Without `.item()`, `loss` will still be a tensor, but this doesn’t cause an error with `print()` because the `print()` function can handle tensor objects and will automatically convert them to a readable format.
   - By omitting `.item()`, you keep `loss` as a tensor. In PyTorch, keeping loss as a tensor instead of converting it to a scalar doesn’t affect the code’s functionality here since it's only being printed, not used in any calculations.

When you see the loss as a number in the print statement, it’s because the `print()` function displays the tensor’s scalar value (like a number) for easy readability. However, even though `loss` appears as a number, it’s actually a **tensor object** with a lot of underlying information that PyTorch uses during backpropagation.

Here’s how it works in more detail:

1. **Loss as a Tensor with Gradient Tracking**:
   - In PyTorch, when you compute a loss (e.g., using a loss function like `MSELoss` or `CrossEntropyLoss`), the resulting `loss` is a tensor with a single scalar value.
   - However, it’s not just any number; it’s a tensor that keeps track of the computation history and the operations that led to its value. This is because PyTorch tracks these operations in a *computational graph* whenever `requires_grad=True` is set on any tensor involved in the calculations.

2. **Backward Propagation Using `loss.backward()`**:
   - When you call `loss.backward()`, PyTorch uses the computational graph stored with `loss` to compute gradients for each parameter in the model with respect to this loss. This graph tells PyTorch how each parameter contributed to the final loss value.
   - Since `loss` retains this graph, it can trace back through each operation, compute partial derivatives, and populate the `.grad` attributes of the model’s parameters.

3. **Printing vs. Using in Backward Pass**:
   - While `print(loss)` displays the scalar value for readability, `loss` as a tensor is much more complex. This underlying information (the computational graph) makes `loss` usable in `loss.backward()`.

In summary, although `print(loss)` shows a simple number, the tensor contains all the necessary information for `loss.backward()` to work properly by retaining the computation history for gradient calculation.

In [None]:
!pip install transformers
!pip install datasets   # this is correspond to Hugging Face transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

# Let's practise

In [1]:
import torch
import torch.nn as nn
from torchvision import datasets  # We will use datasets from torchvision to load the MNIST handwritten digits dataset
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import torch.optim as optim

In [2]:
train = datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=ToTensor(),
    )

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:03<00:00, 46.0MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data


In [3]:
test = datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=ToTensor()
)

Files already downloaded and verified


In [4]:
print(f"train_data: {train}")
print(f"test_data: {test}")

train_data: Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: ToTensor()
test_data: Dataset CIFAR10
    Number of datapoints: 10000
    Root location: ./data
    Split: Test
    StandardTransform
Transform: ToTensor()


In [5]:
batch_size = 64
train_data_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(test, batch_size=batch_size, shuffle=True)

for x,y in test_data_loader:
  print(f"shape of data : {x.shape}")
  print(f"shape of label : {y.shape}")
  break


shape of data : torch.Size([64, 3, 32, 32])
shape of label : torch.Size([64])


In [None]:
images_test ,label_test = next(iter(test_data_loader))  # test_data_loader -> iterable & iter(test_data_loader) -> iterator

print(f"data : {images_test}")
print(f"label : {label_test}")

print(f"shape of data : {images_test.shape}")
print(f"shape of label : {label_test.shape}")

print("--------------------"*10)
print(f"shape of data : {images_test[0]}")
print(f"shape of label : {label_test[0]}")
print(f"shape of data : {images_test[0].shape}")
print(f"shape of label : {label_test[0].shape}")

data : tensor([[[[0.6784, 0.6627, 0.6627,  ..., 0.5451, 0.5412, 0.5373],
          [0.6980, 0.6824, 0.6824,  ..., 0.5608, 0.5569, 0.5529],
          [0.7176, 0.7020, 0.6980,  ..., 0.5765, 0.5725, 0.5647],
          ...,
          [0.8980, 0.8980, 0.9020,  ..., 0.6510, 0.8235, 0.8392],
          [0.9137, 0.8980, 0.9098,  ..., 0.8667, 0.9137, 0.9137],
          [0.9137, 0.8980, 0.9020,  ..., 0.9098, 0.9059, 0.9098]],

         [[0.8275, 0.8078, 0.8078,  ..., 0.7059, 0.7020, 0.6980],
          [0.8471, 0.8275, 0.8275,  ..., 0.7176, 0.7137, 0.7098],
          [0.8588, 0.8392, 0.8392,  ..., 0.7255, 0.7176, 0.7137],
          ...,
          [0.8941, 0.8941, 0.8980,  ..., 0.6784, 0.8431, 0.8549],
          [0.9098, 0.8941, 0.9059,  ..., 0.8667, 0.9098, 0.9176],
          [0.9137, 0.8980, 0.9020,  ..., 0.9098, 0.9059, 0.9098]],

         [[1.0000, 0.9843, 0.9843,  ..., 0.9255, 0.9216, 0.9176],
          [1.0000, 0.9961, 1.0000,  ..., 0.9373, 0.9294, 0.9255],
          [1.0000, 1.0000, 0.9961, 

In [6]:
# To accelerate operations in the neural network, we move it to the GPU or MPS (for Apple silicon) if available.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(device)


# x= 4
# y=10s
# [x if x==y else y]  -> [10]
# (x if x==y else y)  -> 10

cpu


In [7]:
# Number of classes
classes = len(train.classes)  # Get the number of classes
classes

10

For now we only want to use ``Linear`` layers so we must flatten the inputs so that we can pass it to the linear layers. The ``nn.Flatten()`` module allows us to do this.

For a classification task with multiple classes, you should use ``nn.Softmax(dim=1)`` (for probabilities) or omit it if you're using ``CrossEntropyLoss``, which applies ``log_softmax`` internally.

In [8]:

class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()

    # self.input_size = input_size
    # self.hidden_size = hidden_size
    self.flatten =nn.Flatten()
    self.model = nn.Sequential(
      nn.Linear(32*32*3, 512),
      nn.ReLU(),
      nn.Linear(512, 512),
      nn.ReLU(),
      nn.Linear(512, classes),
    )

  def forward(self, x):
    x = self.flatten(x)
    output = self.model(x)
    return output

In [9]:
model = Model().to(device)
print(model)

Model(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (model): Sequential(
    (0): Linear(in_features=3072, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [10]:
loss_func = nn.CrossEntropyLoss()

we need to setup an optimizer for training our model. We use **stochastic gradient descent** so we must use the **``SGD``** module from torch.optim. We must pass the ``model.parameters()`` to the ``SGD`` optimizer and set its learning rate ``lr=1e-3``.

In [11]:
optim = optim.SGD(model.parameters(), lr=1e-3)

In [18]:
from tqdm import tqdm
n_epoch = 50
avg_loss = []
for epoch in (pbar:=tqdm(range(n_epoch))):
  for x, y in train_data_loader:
      x, y = x.to(device), y.to(device)
      optim.zero_grad()
      y_pred = model(x)
      loss = loss_func(y_pred, y)
      avg_loss.append(loss.item())

      loss.backward()
      optim.step()
  avg = sum(avg_loss) / len(avg_loss)
  pbar.write(f"Epoch {epoch + 1}, Loss: {avg:.4f}")


  2%|▏         | 1/50 [00:19<16:16, 19.92s/it]

Epoch 1, Loss: 1.6296


  4%|▍         | 2/50 [00:40<16:24, 20.52s/it]

Epoch 2, Loss: 1.6272


  6%|▌         | 3/50 [01:01<16:11, 20.67s/it]

Epoch 3, Loss: 1.6247


  8%|▊         | 4/50 [01:21<15:36, 20.36s/it]

Epoch 4, Loss: 1.6223


 10%|█         | 5/50 [01:42<15:24, 20.54s/it]

Epoch 5, Loss: 1.6200


 12%|█▏        | 6/50 [02:01<14:48, 20.19s/it]

Epoch 6, Loss: 1.6176


 14%|█▍        | 7/50 [02:22<14:36, 20.37s/it]

Epoch 7, Loss: 1.6153


 16%|█▌        | 8/50 [02:42<14:07, 20.17s/it]

Epoch 8, Loss: 1.6130


 18%|█▊        | 9/50 [03:03<13:57, 20.42s/it]

Epoch 9, Loss: 1.6106


 20%|██        | 10/50 [03:23<13:35, 20.39s/it]

Epoch 10, Loss: 1.6083


 22%|██▏       | 11/50 [03:44<13:17, 20.44s/it]

Epoch 11, Loss: 1.6061


 24%|██▍       | 12/50 [04:05<13:00, 20.53s/it]

Epoch 12, Loss: 1.6038


 26%|██▌       | 13/50 [04:25<12:39, 20.54s/it]

Epoch 13, Loss: 1.6016


 28%|██▊       | 14/50 [04:46<12:22, 20.63s/it]

Epoch 14, Loss: 1.5993


 30%|███       | 15/50 [05:06<11:54, 20.40s/it]

Epoch 15, Loss: 1.5971


 32%|███▏      | 16/50 [05:26<11:35, 20.45s/it]

Epoch 16, Loss: 1.5949


 34%|███▍      | 17/50 [05:47<11:14, 20.45s/it]

Epoch 17, Loss: 1.5927


 36%|███▌      | 18/50 [06:07<10:48, 20.28s/it]

Epoch 18, Loss: 1.5906


 38%|███▊      | 19/50 [06:27<10:28, 20.29s/it]

Epoch 19, Loss: 1.5885


 40%|████      | 20/50 [06:47<10:03, 20.11s/it]

Epoch 20, Loss: 1.5863


 42%|████▏     | 21/50 [07:08<09:51, 20.39s/it]

Epoch 21, Loss: 1.5842


 44%|████▍     | 22/50 [07:27<09:23, 20.13s/it]

Epoch 22, Loss: 1.5821


 46%|████▌     | 23/50 [07:48<09:08, 20.32s/it]

Epoch 23, Loss: 1.5800


 48%|████▊     | 24/50 [08:08<08:43, 20.13s/it]

Epoch 24, Loss: 1.5779


 50%|█████     | 25/50 [08:28<08:26, 20.27s/it]

Epoch 25, Loss: 1.5758


 52%|█████▏    | 26/50 [08:48<08:03, 20.14s/it]

Epoch 26, Loss: 1.5738


 54%|█████▍    | 27/50 [09:08<07:44, 20.18s/it]

Epoch 27, Loss: 1.5717


 56%|█████▌    | 28/50 [09:30<07:34, 20.64s/it]

Epoch 28, Loss: 1.5697


 58%|█████▊    | 29/50 [09:50<07:08, 20.39s/it]

Epoch 29, Loss: 1.5677


 60%|██████    | 30/50 [10:10<06:47, 20.39s/it]

Epoch 30, Loss: 1.5657


 62%|██████▏   | 31/50 [10:30<06:21, 20.06s/it]

Epoch 31, Loss: 1.5637


 64%|██████▍   | 32/50 [10:50<06:02, 20.14s/it]

Epoch 32, Loss: 1.5617


 66%|██████▌   | 33/50 [11:10<05:44, 20.26s/it]

Epoch 33, Loss: 1.5598


 68%|██████▊   | 34/50 [11:31<05:24, 20.31s/it]

Epoch 34, Loss: 1.5578


 70%|███████   | 35/50 [11:52<05:07, 20.49s/it]

Epoch 35, Loss: 1.5559


 72%|███████▏  | 36/50 [12:11<04:43, 20.23s/it]

Epoch 36, Loss: 1.5540


 74%|███████▍  | 37/50 [12:32<04:25, 20.40s/it]

Epoch 37, Loss: 1.5520


 76%|███████▌  | 38/50 [12:52<04:02, 20.22s/it]

Epoch 38, Loss: 1.5501


 78%|███████▊  | 39/50 [13:13<03:45, 20.47s/it]

Epoch 39, Loss: 1.5482


 80%|████████  | 40/50 [13:33<03:23, 20.31s/it]

Epoch 40, Loss: 1.5463


 82%|████████▏ | 41/50 [13:54<03:04, 20.45s/it]

Epoch 41, Loss: 1.5444


 84%|████████▍ | 42/50 [14:14<02:43, 20.49s/it]

Epoch 42, Loss: 1.5426


 86%|████████▌ | 43/50 [14:35<02:24, 20.63s/it]

Epoch 43, Loss: 1.5407


 88%|████████▊ | 44/50 [14:56<02:03, 20.66s/it]

Epoch 44, Loss: 1.5389


 90%|█████████ | 45/50 [15:16<01:42, 20.58s/it]

Epoch 45, Loss: 1.5370


 92%|█████████▏| 46/50 [15:37<01:22, 20.62s/it]

Epoch 46, Loss: 1.5352


 94%|█████████▍| 47/50 [15:57<01:00, 20.32s/it]

Epoch 47, Loss: 1.5334


 96%|█████████▌| 48/50 [16:17<00:40, 20.38s/it]

Epoch 48, Loss: 1.5315


 98%|█████████▊| 49/50 [16:38<00:20, 20.36s/it]

Epoch 49, Loss: 1.5298


100%|██████████| 50/50 [16:58<00:00, 20.37s/it]

Epoch 50, Loss: 1.5280





In [None]:
# n_epochs = 30

# for _ in (pbar := trange(n_epochs)):
#     # Iterate over the data
#     for x, y in train_dataloader:
#         # Move the datapoints to same device as the model
#         x, y = x.to(device), y.to(device)
#         # Predict the output and perform the forward pass
#         pred = model(x)
#         # Compute prediction error
#         loss = loss_fn(pred, y)
#         # Backpropagation
#         loss.backward()
#         # Update the model weights
#         optimizer.step()
#         # Clear the gradients
#         optimizer.zero_grad()
#         # Update the progress bar
#         pbar.set_description(f'Loss = {loss.item():.3f}')

``pbar := trange(n_epochs):``

The walrus operator assigns the ``trange`` object to the variable ``pbar`` while also using it in the loop.
This allows you to interact with the progress bar object (``pbar``) within the loop, such as updating its description with ``pbar.set_description``.

You could achieve the same effect without the walrus operator like this:

``
pbar = trange(n_epochs)``

``for _ in pbar:
    pbar.set_description(f'Loss = {loss.item():.3f}')``
    
However, using the walrus operator makes the code more concise.

In [14]:
# Store the number of correctly classified and total labels
correct, total = 0, 0

# Disable gradient calculation
with torch.no_grad():
    # Iterate over the test data
    for x, y in test_data_loader:
        # Move the datapoints to same device as the model
        x, y = x.to(device), y.to(device)
        # Predict the output
        logits = model(x)
        # Get the predicted label
        pred = torch.argmax(logits, axis=1)
        # Update the number of correclty classified labels
        correct += sum(pred == y).item()
        # Update the number of total labels
        total += pred.shape[0] # Adds the number of samples in the current batch (pred.shape[0]) to the total counter to keep track of the total number of samples evaluated.

print(f'Accuracy: {100 * correct / total:.2f}%')

Accuracy: 42.07%


### let's break down code step by step
``logits = model(x)`` :
Feeds the input batch (x) through the model to compute the raw output (logits), which is a tensor of shape ``[batch_size, num_classes]``.
Each value in ``logits`` represents the unnormalized prediction scores for a class.

``pred = torch.argmax(logits, axis=1)`` : Converts the raw output (logits) into predicted class labels (pred) by taking the index of the class with the highest score for each sample in the batch.
``torch.argmax(logits, axis=1)`` returns a tensor of size ``[batch_size]`` containing the predicted class indices.

example:



In [None]:
>>> A = torch.randn(1, 2) # tensor([[-0.1278, -0.3047]])
>>> print("Tensor-A:", A) # torch.Size([1, 2])
>>> print(torch.argmax(A, axis=1))  # tensor([0]) -> -0.1278

>>> a = torch.randn(4, 4)
>>> a
tensor([[ 1.3398,  0.2663, -0.2686,  0.2450],
        [-0.7401, -0.8805, -0.3402, -1.1936],
        [ 0.4907, -1.3948, -1.0691, -0.3132],
        [-1.6092,  0.5419, -0.2993,  0.3195]])
>>> torch.argmax(a)
tensor(0)


>>> a = torch.randn(4, 4)
>>> a
tensor([[ 1.3398,  0.2663, -0.2686,  0.2450],
        [-0.7401, -0.8805, -0.3402, -1.1936],
        [ 0.4907, -1.3948, -1.0691, -0.3132],
        [-1.6092,  0.5419, -0.2993,  0.3195]])
>>> torch.argmax(a, dim=1)
tensor([ 0,  2,  0,  1])


>>> A = torch.randn(1, 3)
 tensor([[1.5726, 0.7617, 0.1560]])
>>> print("Tensor-A:", A)
>>> print(torch.max(A, axis=0))
torch.return_types.max(
values=tensor([1.5726, 0.7617, 0.1560]),
indices=tensor([0, 0, 0]))

# Do some experiments to find the best optimizer for our task




In [None]:

class Model(nn.Module):
  def __init__(self):
    super(Model, self).__init__()

    # self.input_size = input_size
    # self.hidden_size = hidden_size
    self.flatten =nn.Flatten()
    self.model = nn.Sequential(
      nn.Linear(32*32*3, 512),
      nn.ReLU(),
      nn.Linear(512, 512),
      nn.ReLU(),
      nn.Linear(512, classes),
    )

  def forward(self, x):
    x = self.flatten(x)
    output = self.model(x)
    return output

In [39]:
model = Model().to(device)


In [None]:
loss_func = nn.CrossEntropyLoss()

In [37]:
import torch.optim as optim
optimizers =['SGD', 'AdaGrad', 'RMSProp', 'Adam']

def train_model(train_data, optimizer, n_epoch=5):
  losses, accuracies = [], []
  for epoch in  (pbar:=tqdm(range(n_epoch))):
    running_loss, acc , total, correct = 0, 0, 0, 0
    for x, y in train_data:
      x,y = x.to(device), y.to(device)  # shape x : (batch_size, ...)
      optimizer.zero_grad()
      y_pred = model(x)
      loss = loss_func(y_pred, y)
      loss.backward()
      optimizer.step()

      pred = torch.argmax(y_pred, axis=1)
      correct += sum(pred==y).item()
      total += pred.shape[0]
      running_loss += loss.item()
    print(total, len(train_data))

    avg_loss = running_loss / total
    acc = (correct / total) * 100
    accuracies.append(acc)
    losses.append(avg_loss)
    return losses, accuracies

def run(train_data_loder, optimizer_name):
  if optimizer_name == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
  elif optimizer_name == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
  elif optimizer_name == 'AdaGrad':
    optimizer = optim.AdaGrad(model.parameters(), lr=1e-3)
  elif optimizer_name == 'RMSProp':
    optimizer = optim.RMSProp(model.parameters(), lr=1e-3)
  else:
    raise ValueError(f"Invalid optimizer name: {optimizer_name}")

  loss, acc = train_model(train_data_loader, optimizer)
  print(f"{optimizer_name}, loss = {loss}, accuracy= {acc}")

In [38]:
train_data_loader = DataLoader(train, batch_size=128)
run(train_data_loader, "SGD")

  0%|          | 0/5 [00:19<?, ?it/s]

50000 391
SGD, loss = [0.011218184535503387], accuracy= [49.816]



