# Install packages|

In [1]:
import torch
import numpy as np
from torch import nn
from torch import optim
import pandas as pd

print(f"Torch version: {torch.__version__}")

# Tensors are just like Numpy ndarrays matrices, but with capability to run much faster on GPUs, TPUs and CPUs

Torch version: 2.5.1


In [2]:
cuda_available = torch.cuda.is_available()
print(f"Cude is available: {cuda_available}")

#If false, then PyTorch is running on the CPU. If true, then running on GPU, but we installed it without CUDA NVIDIA drivers

Cude is available: False


In [4]:
# Creating a simple tensor

x = torch.randn(4,7) # random but follows normal distribution
y = torch.rand(3,2) # random within 0 - 1 range continuous values
print(x, "\nTensor stored in:", x.device, "\nTensor shape:", x.shape, "\nTensor datatype:", x.dtype, type(x), "\n")

# By default the dtype is float32
type(y)
display(y)
print(f"Tensor stored in: {y.device}\nTensor shape: {y.shape}\nTensor datatype: {y.dtype} {type(y)}")

tensor([[ 0.1975,  2.3857,  2.0725,  0.2292,  1.4295,  0.2413,  1.5068],
        [ 0.8240, -0.3320,  0.6636,  1.4492,  0.9831, -0.5983, -1.3798],
        [ 2.1171, -0.6836,  0.9505,  1.0101, -0.0826,  1.4257,  0.1901],
        [-1.0944,  0.0055,  0.3359,  0.3584,  1.1735,  0.3462, -1.5338]]) 
Tensor stored in: cpu 
Tensor shape: torch.Size([4, 7]) 
Tensor datatype: torch.float32 <class 'torch.Tensor'> 



tensor([[0.8075, 0.5019],
        [0.4307, 0.6033],
        [0.4977, 0.8694]])

Tensor stored in: cpu
Tensor shape: torch.Size([3, 2])
Tensor datatype: torch.float32 <class 'torch.Tensor'>


In [146]:
# Creating tensors from python list

data_list = [[1,2], [3,4],[7,8]]
tensfor_from_list = torch.tensor(data_list, dtype = torch.float32)
display(tensfor_from_list)
print(f"Tensor info: {tensfor_from_list.dtype}, {tensfor_from_list.shape}, {tensfor_from_list.device}, {type(tensfor_from_list)}")

tensor([[1., 2.],
        [3., 4.],
        [7., 8.]])

Tensor info: torch.float32, torch.Size([3, 2]), cpu, <class 'torch.Tensor'>


# Working with Tensors

In [203]:
# [] - a scalar number, 0D array/tensor. rank-0
# [[]] - a vector, 1D array/tensor. rank-1
# [[],[],[]...] - a matrix, 2D array/tensor. rank-n

# [[1,2,3]] - row vector, shape(1,3), one row & three columns. rank-1
# [[1], [2], [3]] - column vector, shape three rows & one column. rank-1
# [[1,2,3], [3,4,5]] - matrix of shape(2,3), two rows and 3 columns. rank-2


data_list = [[1,2,3,4,5]] 
tensor_from_list = torch.tensor(data_list, dtype=torch.float32) # dtype explicitly can be changed
display(type(tensor_from_list))

print(tensor_from_list.T, tensor_from_list.T.shape, tensor_from_list.device, tensor_from_list.dtype, type(tensor_from_list))

torch.Tensor

tensor([[1.],
        [2.],
        [3.],
        [4.],
        [5.]]) torch.Size([5, 1]) cpu torch.float32 <class 'torch.Tensor'>


In [200]:
data_list = [[1,2,3], [3,4,5], [7,8,9], [11, 12, 13], [14, 15, 16]]
tensor_from_list = torch.tensor(data_list)

print(f"{tensor_from_list.T}, \n{tensor_from_list.dtype}, {tensor_from_list.device}, {tensor_from_list.T.shape}, {type(tensor_from_list)}")

tensor([[ 1,  3,  7, 11, 14],
        [ 2,  4,  8, 12, 15],
        [ 3,  5,  9, 13, 16]]), 
torch.int64, cpu, torch.Size([3, 5]), <class 'torch.Tensor'>


In [215]:
# Tensors from numpy

data_numpy = np.array([1, 2, 3, 4]) # a single [] represents a scalar number
tensor_from_np = torch.from_numpy(data_numpy)

display(tensor_from_np)
print(tensor_from_np.dtype, tensor_from_np.device, tensor_from_np.shape, type(tensor_from_np))

tensor([1, 2, 3, 4])

torch.int64 cpu torch.Size([4]) <class 'torch.Tensor'>


In [261]:
data_numpy = np.array([[1, 2, 3, 4]], dtype=np.float32) # [[]] represents row vector as of now
print("Numpy dtype:", data_numpy.dtype)
tensor_from_np = torch.from_numpy(data_numpy)

display(tensor_from_np.T) # after transposing, now it is column vector
print(f"Tensor shape: {tensor_from_np.T.shape}\nTensor datatype: {tensor_from_np.dtype, type(tensor_from_np)}\n \
Tensor stored in: {tensor_from_np.device}\n")


Numpy dtype: float32


tensor([[1.],
        [2.],
        [3.],
        [4.]])

Tensor shape: torch.Size([4, 1])
Tensor datatype: (torch.float32, <class 'torch.Tensor'>)
 Tensor stored in: cpu



In [293]:
data_numpy = np.array([[1,2,3],[4,5,6],[7,8,9],[11,12,13],[15,16,17]])
tensor_from_np = torch.from_numpy(data_numpy)

display(tensor_from_np)
print(f"Tensor shape: {tensor_from_np.shape}\nTensor datatype: {tensor_from_np.dtype, type(tensor_from_np)}\n \
Tensor stored in: {tensor_from_np.device}")

tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [11, 12, 13],
        [15, 16, 17]])

Tensor shape: torch.Size([5, 3])
Tensor datatype: (torch.int64, <class 'torch.Tensor'>)
 Tensor stored in: cpu


In [290]:
data_numpy = np.array([[[1,2,3], [3,4,5], [6,7,8]],
                      [[11,12,13], [14,15,16], [17,18,19]],
                      [[21,22,23], [23,24,25], [26,27,28]]])
tensor_from_np = torch.from_numpy(data_numpy)

display(tensor_from_np)

print(tensor_from_np.shape, tensor_from_np.dtype, tensor_from_np.device, type(tensor_from_np))

tensor([[[ 1,  2,  3],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[11, 12, 13],
         [14, 15, 16],
         [17, 18, 19]],

        [[21, 22, 23],
         [23, 24, 25],
         [26, 27, 28]]])

torch.Size([3, 3, 3]) torch.int64 cpu <class 'torch.Tensor'>


In [13]:
# the range is between 0 and 1, continuous values
data_rand = torch.rand(3,6,5,5,6)
display(data_rand)

print(type(data_rand), data_rand.device, data_rand.shape, data_rand.dtype)

tensor([[[[[0.5690, 0.0818, 0.7334, 0.7479, 0.7470, 0.7807],
           [0.3080, 0.5642, 0.0268, 0.6542, 0.9962, 0.1948],
           [0.8360, 0.5335, 0.7966, 0.2066, 0.3203, 0.6581],
           [0.0686, 0.6480, 0.4794, 0.2984, 0.3671, 0.0635],
           [0.5211, 0.2838, 0.5029, 0.7513, 0.0535, 0.7766]],

          [[0.1584, 0.7182, 0.8947, 0.7645, 0.1467, 0.5904],
           [0.3381, 0.3275, 0.8389, 0.6606, 0.8481, 0.7248],
           [0.1729, 0.2273, 0.8817, 0.0909, 0.9683, 0.8622],
           [0.2807, 0.7360, 0.4525, 0.0498, 0.7388, 0.5148],
           [0.7674, 0.5755, 0.8463, 0.8062, 0.0062, 0.4510]],

          [[0.8714, 0.3873, 0.7940, 0.5704, 0.6458, 0.0063],
           [0.2315, 0.4958, 0.8788, 0.1932, 0.2610, 0.6501],
           [0.2683, 0.1105, 0.7179, 0.8703, 0.5799, 0.4031],
           [0.3507, 0.7391, 0.1129, 0.3320, 0.5486, 0.8110],
           [0.1462, 0.1957, 0.6390, 0.2085, 0.7570, 0.1197]],

          [[0.5360, 0.2915, 0.2596, 0.4052, 0.4653, 0.5107],
           [0.4753

<class 'torch.Tensor'> cpu torch.Size([3, 6, 5, 5, 6]) torch.float32


In [139]:
# values follow a normal distribution
data_randn = torch.randn(2,3,2,2)
print("g:", data_randn)

g: tensor([[[[-1.1143, -0.2329],
          [-0.6670,  0.7410]],

         [[ 0.7010,  0.1869],
          [ 0.2297, -0.0074]],

         [[-0.3161,  0.0057],
          [ 1.3207, -0.9103]]],


        [[[ 0.0486, -0.4034],
          [ 0.3652,  0.5020]],

         [[ 0.9283,  1.0278],
          [ 0.7446, -0.0997]],

         [[ 0.7095,  1.4607],
          [-0.2492,  0.2420]]]])


In [142]:
data_random_zero_to_one = torch.rand(3,5)
data_random_normal = torch.randn(4,7)

display(data_random_zero_to_one, data_random_normal)
display(data_random_zero_to_one.T, data_random_normal.T)

tensor([[0.4904, 0.1041, 0.7446, 0.8541, 0.0787],
        [0.1401, 0.7797, 0.1442, 0.9158, 0.2678],
        [0.9923, 0.2882, 0.3806, 0.5698, 0.6849]])

tensor([[-0.6434,  1.4317, -0.4912,  1.7764,  1.0498, -0.3891, -0.6872],
        [ 0.4841,  2.1635,  0.7083, -0.2999, -0.1306,  1.3264,  0.1005],
        [ 0.4051,  0.8345,  0.2810, -0.6223,  0.8083,  0.0234, -0.2112],
        [ 0.1314, -0.8676, -0.5983, -1.3256,  0.4899, -0.1967, -1.0668]])

tensor([[0.4904, 0.1401, 0.9923],
        [0.1041, 0.7797, 0.2882],
        [0.7446, 0.1442, 0.3806],
        [0.8541, 0.9158, 0.5698],
        [0.0787, 0.2678, 0.6849]])

tensor([[-0.6434,  0.4841,  0.4051,  0.1314],
        [ 1.4317,  2.1635,  0.8345, -0.8676],
        [-0.4912,  0.7083,  0.2810, -0.5983],
        [ 1.7764, -0.2999, -0.6223, -1.3256],
        [ 1.0498, -0.1306,  0.8083,  0.4899],
        [-0.3891,  1.3264,  0.0234, -0.1967],
        [-0.6872,  0.1005, -0.2112, -1.0668]])

In [172]:
# shape(2,3) - 2 rows and 3 columns: or shortly called a matrix of 2 by 3
# shape(1,2,3) - a single matrix of 2 by 3 
# shape(2,2,3) - two matrixes of shape 2 by 3
# shape(3,2,3) - three matrixes of shape 2 by 3
# shape(5,2,4) - five matrixes of shape 2 by 4
# shape(1,5,2,4) - a single matrix that contains 5 sub-matrices with shape of 2 by 4
# shape(2,5,2,4) - two matrixes, each containing 5 sub-matrices of shape 2 by 4
# shape(4,3,4,4) - four matrices, each containing 3 sub-matrices of square shape 4 by 4
""" 
shape(n_1,n_2,n_3,...,n_k,row,column) - in high dimensional ndarrays/tensors the last two numbers represent the actual 
#s of rows and columns, while the numbers before/preceding them represent how many sub-matrices each matrix has but at its core
the last two numbers representing the actual shape of the base matrix, for which, its shape is being used multiple times

"""
# shape(1,1,1,1,3,4) - at its core it is 3 by 4 matrix, the output will just display many []s around [3,4], like [[[[[[3,4]]]]]]


data_zero = torch.zeros(2,3,4)
display(data_zero)

tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

In [175]:
data_ones = torch.ones(2,3,3)
display(data_ones)

tensor([[[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],

        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]])

In [227]:
data_zeros = torch.zeros(4,9)
data_ones = torch.ones(3,11)

display(data_zeros, data_ones)
print(f"data_zeros info: {data_zeros.shape}, {data_zeros.dtype}, {data_zeros.device}, {type(data_zeros)}\
\ndata_ones info: {data_ones.shape}, {data_ones.dtype}, {data_ones.device}, {type(data_ones)}\n\nTransposed:")

display(data_zeros.T, data_ones.T)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.]])

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

data_zeros info: torch.Size([4, 9]), torch.float32, cpu, <class 'torch.Tensor'>
data_ones info: torch.Size([3, 11]), torch.float32, cpu, <class 'torch.Tensor'>

Transposed:


tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [226]:
# to run cli scripts prefix it with "!", exclamation mark so that jupyter knows the specfied code is sli and not a python code
!jupyter kernelspec list --help-all

List installed kernel specifications.

Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--json
    output spec name and location as machine-readable json.
    Equivalent to: [--ListKernelSpecs.json_output=True]
--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--log-level=<Enum>
    Set the log level by value or name.
    Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
    Default: 30
    Equivalent to: [--Application.log_level]
--config=<Unicode>
    Full path of a config file.
    Default: ''
    Equivalent to: [--JupyterApp.config_file]

Class options
The command-line option below sets the respective configurable class-parameter:
    --Class.parameter=value
This line is evaluated in Python, so simple exp

# Indexing and Slicing

In [270]:
data = torch.tensor([[1,2,3],[3,4,5],[5,6,7]])
display(data)
print("Extract first row:",data[0])
print("Extract first column:", data[:,0])
print("Extract second row:", data[1])
print("Extract second column:", data[:,1])
print("Extract third row:", data[2])
print("Extract third column:", data[:,2],"\n")
print("Extract the first first row:", data[0,0])
print("Extract the first two rows:", data[0:2,0])
print("Extract the last last row:", data[2,0])
print("Extract the last two rows:", data[1:,0],"\n")
print("Extract the first second column:", data[0,1])
print("Extract the first two rows of the second column:", data[0:2,1])
print("Extract the last row of second column:", data[2,1])
print("Extract the last two rows of second column:", data[1:,1], "\n")
print("Extract the first first row of third column:", data[0,2])
print("Extract the first two rows of third column:", data[0:2,2])
print("Extract the last row of third column:", data[2,2])
print("Extract the last two rows of third column:", data[1:,2])

tensor([[1, 2, 3],
        [3, 4, 5],
        [5, 6, 7]])

Extract first row: tensor([1, 2, 3])
Extract first column: tensor([1, 3, 5])
Extract second row: tensor([3, 4, 5])
Extract second column: tensor([2, 4, 6])
Extract third row: tensor([5, 6, 7])
Extract third column: tensor([3, 5, 7]) 

Extract the first first row: tensor(1)
Extract the first two rows: tensor([1, 3])
Extract the last last row: tensor(5)
Extract the last two rows: tensor([3, 5]) 

Extract the first second column: tensor(2)
Extract the first two rows of the second column: tensor([2, 4])
Extract the last row of second column: tensor(6)
Extract the last two rows of second column: tensor([4, 6]) 

Extract the first first row of third column: tensor(3)
Extract the first two rows of third column: tensor([3, 5])
Extract the last row of third column: tensor(7)
Extract the last two rows of third column: tensor([5, 7])


In [276]:
data = torch.tensor([range(20)])
display(data.T)

tensor([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10],
        [11],
        [12],
        [13],
        [14],
        [15],
        [16],
        [17],
        [18],
        [19]])

In [284]:
data = torch.tensor([range(15)])
display(data)
display(data.T)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]])

tensor([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10],
        [11],
        [12],
        [13],
        [14]])

In [399]:
data = torch.tensor([[2,3,4],[5,6,7],[8,9,0],[11,14,16],[67,78,99], [44,33,22]])
display(data, data.shape)

print("Extract the middle two rows of first column:", data[2:4,0])
print("Extract the middle two rows:", data[2:4])
print("Extract the sub-matrix of 2 by 2:",data[3:5,1:], data[4:,0:2], data[0:2,1:]\
     , data[1:3,0:2], data[2:4,1:], data[2:4,0:2])
print("Extract the sub-matrix of 3 by 3:", data[0:3,:], data[3:,:])
print("Extract the sub-matrix of 3 by 2:", data[0:3,0:2], data[0:3, 1:], data[2:5,1:]\
     , data[3:,1:], data[2:5,0:2], data[3:,0:2], data[1:4,0:2], data[1:4,1:])
print("Extract the sub-matrix of 2 by 3:", data[0:2,:], data[2:4,:], data[3:5,:]\
     , data[4:,:])
print("Extract the column vector of 4 by 1:", data[0:4,0:1], data[2:,0:1]\
     , data[0:4,1:2], data[2:,1:2], data[0:4,2:], data[2:,2:])
print("Extract the row vector of 1 by 2:", data[0:1,0:2], data[0:1,1:]\
     , data[1:2,0:2], data[1:2,1:], data[2:3,0:2], data[2:3,1:], data[3:4,0:2]\
     , data[3:4,1:], data[4:5,:2], data[4:5,1:], data[5:,:2], data[5:,1:])
print("Extract column vector 2 by 1:", data[3:5,2:], data[3:5,1:2], data[1:3,2:])
print("Extract columns only:", data[:,0:1], data[:,1:2], data[:,2:])
print("Extract rows only:", data[0:1], data[1:2], data[2:3], data[3:4]\
     , data[4:5], data[5:])
print("Extract scalar/single numers:", data[0,0], data[3,2:], data[4:5,1], data[5,2])

tensor([[ 2,  3,  4],
        [ 5,  6,  7],
        [ 8,  9,  0],
        [11, 14, 16],
        [67, 78, 99],
        [44, 33, 22]])

torch.Size([6, 3])

Extract the middle two rows of first column: tensor([ 8, 11])
Extract the middle two rows: tensor([[ 8,  9,  0],
        [11, 14, 16]])
Extract the sub-matrix of 2 by 2: tensor([[14, 16],
        [78, 99]]) tensor([[67, 78],
        [44, 33]]) tensor([[3, 4],
        [6, 7]]) tensor([[5, 6],
        [8, 9]]) tensor([[ 9,  0],
        [14, 16]]) tensor([[ 8,  9],
        [11, 14]])
Extract the sub-matrix of 3 by 3: tensor([[2, 3, 4],
        [5, 6, 7],
        [8, 9, 0]]) tensor([[11, 14, 16],
        [67, 78, 99],
        [44, 33, 22]])
Extract the sub-matrix of 3 by 2: tensor([[2, 3],
        [5, 6],
        [8, 9]]) tensor([[3, 4],
        [6, 7],
        [9, 0]]) tensor([[ 9,  0],
        [14, 16],
        [78, 99]]) tensor([[14, 16],
        [78, 99],
        [33, 22]]) tensor([[ 8,  9],
        [11, 14],
        [67, 78]]) tensor([[11, 14],
        [67, 78],
        [44, 33]]) tensor([[ 5,  6],
        [ 8,  9],
        [11, 14]]) tensor([[ 6,  7],
        [ 9,  0],
        [14, 1

# Creating Tensors based on other Tensors

In [435]:
# zeros_like(input_tensor)
# ones_like(input_tensor)
# rand_like(input_tensor)
# randn_like(input_tensor)


base_foundation = torch.tensor([[1,2,3,10,11,12],[4,5,6,13,14,15],[7,8,9,16,17,18]], dtype=torch.float64)
display(base_foundation)
print(f"Base tensor shape: {base_foundation.shape}, and datatype {base_foundation.dtype}")

zero_like_base = torch.zeros_like(base_foundation)
one_like_base = torch.ones_like(base_foundation)
rand_like_base = torch.rand_like(base_foundation) # since rand & randn, can only have floating values, integer tensors will prompt error
randn_like_base = torch.randn_like(base_foundation)

display(zero_like_base)
print(f"Zero like base info: {zero_like_base.shape}, {zero_like_base.dtype}\n")

display(one_like_base)
print(f"One like base info: {one_like_base.shape}, {one_like_base.dtype}\n")

display(rand_like_base)
print(f"Rand like base info: {rand_like_base.shape}, {rand_like_base.dtype}\n")

display(randn_like_base)
print(f"Randn like base info: {randn_like_base.shape}, {randn_like_base.dtype}")

tensor([[ 1.,  2.,  3., 10., 11., 12.],
        [ 4.,  5.,  6., 13., 14., 15.],
        [ 7.,  8.,  9., 16., 17., 18.]], dtype=torch.float64)

Base tensor shape: torch.Size([3, 6]), and datatype torch.float64


tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]], dtype=torch.float64)

Zero like base info: torch.Size([3, 6]), torch.float64



tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]], dtype=torch.float64)

One like base info: torch.Size([3, 6]), torch.float64



tensor([[0.9731, 0.5757, 0.1599, 0.4065, 0.4899, 0.7840],
        [0.7871, 0.7655, 0.9428, 0.5875, 0.7318, 0.5175],
        [0.1560, 0.5264, 0.3200, 0.4035, 0.2370, 0.8763]], dtype=torch.float64)

Rand like base info: torch.Size([3, 6]), torch.float64



tensor([[-0.3289,  0.8927,  0.5578,  0.1772,  0.2331,  1.9367],
        [ 2.1001,  1.0563,  1.1401, -0.0840, -0.1294,  0.5280],
        [ 0.0841, -0.3956, -0.0523,  0.3307,  1.7072, -1.6295]],
       dtype=torch.float64)

Randn like base info: torch.Size([3, 6]), torch.float64


In [450]:
fnd = torch.rand(2,3,4, dtype=torch.float64)
display(fnd, fnd.shape, fnd.dtype, fnd.device)

subzero = torch.zeros_like(fnd)
oneone = torch.ones_like(fnd)
rand_zero_to_one = torch.rand_like(fnd)
randn_normal = torch.randn_like(fnd)

print(f"Zeros like: {subzero}, {subzero.shape, subzero.dtype}\n")
print(f"Ones like: {oneone}, {oneone.shape, oneone.dtype}\n")
print(f"Rand (0-1) like: {rand_zero_to_one}, {rand_zero_to_one.shape, rand_zero_to_one.dtype}\n")
print(f"Randn (-3-3) like: {randn_normal}, {randn_normal.shape, randn_normal.dtype}")

tensor([[[0.3841, 0.3180, 0.3793, 0.4173],
         [0.3500, 0.8248, 0.2829, 0.8171],
         [0.1338, 0.8299, 0.7436, 0.3687]],

        [[0.1049, 0.7300, 0.9700, 0.5374],
         [0.9372, 0.6310, 0.0248, 0.9433],
         [0.7177, 0.9117, 0.5864, 0.3479]]], dtype=torch.float64)

torch.Size([2, 3, 4])

torch.float64

device(type='cpu')

Zeros like: tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]], dtype=torch.float64), (torch.Size([2, 3, 4]), torch.float64)

Ones like: tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]], dtype=torch.float64), (torch.Size([2, 3, 4]), torch.float64)

Rand (0-1) like: tensor([[[0.1044, 0.4622, 0.6890, 0.5514],
         [0.1013, 0.1911, 0.5922, 0.5115],
         [0.4042, 0.8815, 0.0344, 0.7364]],

        [[0.7062, 0.3122, 0.3774, 0.6398],
         [0.2664, 0.9501, 0.8376, 0.2279],
         [0.1950, 0.4914, 0.4397, 0.2307]]], dtype=torch.float64), (torch.Size([2, 3, 4]), torch.float64)

Randn (-3-3) like: tensor([[[ 2.0311, -0.2349,  0.7264, -0.9681],
         [ 0.9670, -0.3477, -0.4224, -0.5017],
         [-2.5053, -1.9490,  1.3155,  0.3068]],

        [[ 0.1734,

# Basic operations with Tensors

In [392]:
# Addition torch.add()
# Subtraction torch.sub()
# Multiplication torch.mul()
# Division torch.div()
# Exponentiation torch.pow()

"""
    All these operations are done element-wise only, and tensors must have compatible shapes
    (m, n) * (m, n) - all entries must be the same number, basically only square and rectangular matrices can 
    perform element-wise operations!
    
    NOTE: that there it is not a matrix multiplication torch.matmul(), which is completely different from 
    element-wise multiplication torch.mul()
"""
a_1 = torch.tensor([[1,2],[3,4]])
b_1 = torch.tensor([[11,12],[13,14]])
display(a_1, b_1)
sum_a_b = a_1 + b_1
print("\nSum of tensors (a_1 + b_1):\n", sum_a_b)
print("\nSum of tensors torch.add(a,b):\n", torch.add(a_1, b_1))
diff_a_b = a_1 - b_1
print("\nSubtraction of tensors (a_1 - b_1):\n", diff_a_b)
print("\nSubtraction of tensors torch.sub(a_1,b_1):\n", torch.sub(a_1,b_1))
div_a_b = a_1/b_1
print("\nDivision of tensors (a_1/b_1):\n", div_a_b)
print("\nDivision of tensors torch.div(a_1,b_1):\n", torch.div(a_1,b_1))
mul_a_b = a_1*b_1
print("\nMultiplication of tensors (a_1*b_1):\n", mul_a_b)
print("\nMultiplication of tensors torch.mul(a_1,b_1):\n", torch.mul(a_1,b_1))
raise_a_to_pow_of_b = a_1 ** b_1
print("\nExponentiation of tensors (a_1**b_1):\n", raise_a_to_pow_of_b)
print("\nExponentiation of tensors torch.pow(a_1,b_1):\n", torch.pow(a_1,b_1))

tensor([[1, 2],
        [3, 4]])

tensor([[11, 12],
        [13, 14]])


Sum of tensors (a_1 + b_1):
 tensor([[12, 14],
        [16, 18]])

Sum of tensors torch.add(a,b):
 tensor([[12, 14],
        [16, 18]])

Subtraction of tensors (a_1 - b_1):
 tensor([[-10, -10],
        [-10, -10]])

Subtraction of tensors torch.sub(a_1,b_1):
 tensor([[-10, -10],
        [-10, -10]])

Division of tensors (a_1/b_1):
 tensor([[0.0909, 0.1667],
        [0.2308, 0.2857]])

Division of tensors torch.div(a_1,b_1):
 tensor([[0.0909, 0.1667],
        [0.2308, 0.2857]])

Multiplication of tensors (a_1*b_1):
 tensor([[11, 24],
        [39, 56]])

Multiplication of tensors torch.mul(a_1,b_1):
 tensor([[11, 24],
        [39, 56]])

Exponentiation of tensors (a_1**b_1):
 tensor([[        1,      4096],
        [  1594323, 268435456]])

Exponentiation of tensors torch.pow(a_1,b_1):
 tensor([[        1,      4096],
        [  1594323, 268435456]])


In [393]:
# For element-wise operations shapes, both rows and columns of two or more tensors must be the same
a_2 = torch.rand(4,4)
b_2 = torch.rand(4,4)

# Element-wise operations where shapes of both must be square and equal
print(torch.add(a_2,b_2))
print(torch.sub(a_2,b_2))
print(torch.sub(b_2,a_2))
print(torch.mul(a_2,b_2))
print(torch.div(a_2,b_2))
print(torch.div(b_2,a_2))
print(torch.pow(a_2,b_2))
print(torch.pow(b_2,a_2))

tensor([[1.0532, 0.8718, 1.6102, 1.5640],
        [1.6450, 1.4353, 0.5954, 0.9011],
        [1.5734, 1.4784, 0.8115, 1.0972],
        [0.7050, 0.6156, 1.5870, 1.1890]])
tensor([[ 0.2096, -0.3296, -0.2742,  0.0789],
        [ 0.3094,  0.3300,  0.0230,  0.7940],
        [ 0.2722, -0.1056,  0.1342,  0.1017],
        [-0.3155,  0.5696, -0.1861,  0.7369]])
tensor([[-0.2096,  0.3296,  0.2742, -0.0789],
        [-0.3094, -0.3300, -0.0230, -0.7940],
        [-0.2722,  0.1056, -0.1342, -0.1017],
        [ 0.3155, -0.5696,  0.1861, -0.7369]])
tensor([[0.2663, 0.1628, 0.6294, 0.6100],
        [0.6526, 0.4878, 0.0885, 0.0454],
        [0.6004, 0.5437, 0.1601, 0.2984],
        [0.0994, 0.0136, 0.6210, 0.2176]])
tensor([[ 1.4970,  0.4513,  0.7090,  1.1063],
        [ 1.4633,  1.5971,  1.0805, 15.8363],
        [ 1.4183,  0.8667,  1.3961,  1.2044],
        [ 0.3817, 25.7728,  0.7901,  4.2605]])
tensor([[0.6680, 2.2158, 1.4104, 0.9039],
        [0.6834, 0.6261, 0.9255, 0.0631],
        [0.7051, 1.1538

In [394]:
# Above given example creates a new tensor, say by a + b or torch.add(a,b)
# However the original tensors "a" and "b" remain unchanged
# We can do in-place operations where either a or b will be changed without creating a new tensor, thus saving memory

a_3 = torch.tensor([[3,4],[6,7]])
b_3 = torch.tensor([[5,1],[8,0]])
display(a_3, b_3)
print(torch.add(a_3, b_3), "\nTensor a_3 after basic operation:",a_3)
print("\nTensor a_3 after in-place operation:", a_3.add_(b_3), a_3, b_3)
print("\nTensor b_3 after in-place operation:", b_3.add_(a_3), b_3)

tensor([[3, 4],
        [6, 7]])

tensor([[5, 1],
        [8, 0]])

tensor([[ 8,  5],
        [14,  7]]) 
Tensor a_3 after basic operation: tensor([[3, 4],
        [6, 7]])

Tensor a_3 after in-place operation: tensor([[ 8,  5],
        [14,  7]]) tensor([[ 8,  5],
        [14,  7]]) tensor([[5, 1],
        [8, 0]])

Tensor b_3 after in-place operation: tensor([[13,  6],
        [22,  7]]) tensor([[13,  6],
        [22,  7]])


In [395]:
print(a_3.mul_(b_3))
print(b_3.sub_(a_3))
print(a_3.pow_(b_3))
print(a_3.add_(b_3))
#print(a_3.div_(b_3))

# In-place operations change the original tensors, thus saving memory, while basic operations create new tensors,
# therefore original tensors intact

tensor([[104,  30],
        [308,  49]])
tensor([[ -91,  -24],
        [-286,  -42]])
tensor([[0, 0],
        [0, 0]])
tensor([[ -91,  -24],
        [-286,  -42]])


In [396]:
# Element-wise operations with scalar(single) numbers

x = torch.ones(4,3, dtype=torch.int32)
y = torch.tensor([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])

scalar = 10

print("\nTensor + Scalar:\n", scalar+x,"\n",scalar+y)
print("\nTensor - Scalar:\n", scalar-x,"\n", scalar-y)
print("\nTensor * Scalar:\n", scalar * x,"\n", scalar * y)
print("\nPower:\n", y.pow_(scalar), y) #Inplace operations
print("\nMultiplication:\n", y.mul_(scalar), y)

# Again Inplace operations change the original tensors, thus saving memory without creating a new object/variable to store the result
# Be cautious to use it, in cases where original value of tensors are important


Tensor + Scalar:
 tensor([[11, 11, 11],
        [11, 11, 11],
        [11, 11, 11],
        [11, 11, 11]], dtype=torch.int32) 
 tensor([[11, 11, 11],
        [12, 12, 12],
        [13, 13, 13],
        [14, 14, 14],
        [15, 15, 15]])

Tensor - Scalar:
 tensor([[9, 9, 9],
        [9, 9, 9],
        [9, 9, 9],
        [9, 9, 9]], dtype=torch.int32) 
 tensor([[9, 9, 9],
        [8, 8, 8],
        [7, 7, 7],
        [6, 6, 6],
        [5, 5, 5]])

Tensor * Scalar:
 tensor([[10, 10, 10],
        [10, 10, 10],
        [10, 10, 10],
        [10, 10, 10]], dtype=torch.int32) 
 tensor([[10, 10, 10],
        [20, 20, 20],
        [30, 30, 30],
        [40, 40, 40],
        [50, 50, 50]])

Power:
 tensor([[      1,       1,       1],
        [   1024,    1024,    1024],
        [  59049,   59049,   59049],
        [1048576, 1048576, 1048576],
        [9765625, 9765625, 9765625]]) tensor([[      1,       1,       1],
        [   1024,    1024,    1024],
        [  59049,   59049,   59049],
 

In [397]:
# PyTorch also includes operations such as sqrt, log, abs, exp just like in Numpy

x = torch.rand(3,4)
y = torch.tensor([[100, 100], [25,25],[9,9]])
print("Square root:\n", torch.sqrt(x), "\n",torch.sqrt(y))

x_1 = torch.randn(3,5)
x_1_2 = torch.randn(2,3,3)
print("\nAbsolute value:\n", torch.abs(x_1), "\nBefore taking absolute:\n", x_1)
print("\nAbsolute value:\n", torch.abs(x_1_2), "\nBefore taking absolute:\n", x_1_2)


y_1 = torch.ones(2,3)
y_1_2 = torch.tensor([[2,2,2],[2,2,2],[2,2,2]])
y_1_3 = torch.zeros(3,2)
y_1_4 = torch.rand(2,2)
print("\nExponentiation e^(input_value):\n", torch.exp(y_1), "\n", torch.exp(y_1_2)\
     , "\n", torch.exp(y_1_3), "\n", torch.exp(y_1_4))

Square root:
 tensor([[0.7279, 0.9160, 0.3552, 0.5117],
        [0.9645, 0.9171, 0.7372, 0.4763],
        [0.4625, 0.6453, 0.6363, 0.6052]]) 
 tensor([[10., 10.],
        [ 5.,  5.],
        [ 3.,  3.]])

Absolute value:
 tensor([[0.3186, 0.3233, 0.2157, 0.0171, 0.1623],
        [0.8317, 0.9261, 0.0188, 1.0016, 0.3461],
        [0.0326, 0.8280, 1.1600, 0.4430, 0.5705]]) 
Before taking absolute:
 tensor([[ 0.3186,  0.3233,  0.2157, -0.0171,  0.1623],
        [-0.8317,  0.9261, -0.0188, -1.0016, -0.3461],
        [ 0.0326, -0.8280,  1.1600, -0.4430,  0.5705]])

Absolute value:
 tensor([[[0.4411, 0.9329, 0.4732],
         [0.3270, 2.2166, 0.4721],
         [1.3215, 0.3173, 0.8030]],

        [[1.2395, 0.0512, 1.4917],
         [0.4787, 0.3557, 0.5708],
         [0.9547, 1.5155, 0.7965]]]) 
Before taking absolute:
 tensor([[[-0.4411, -0.9329, -0.4732],
         [-0.3270,  2.2166,  0.4721],
         [-1.3215, -0.3173,  0.8030]],

        [[-1.2395, -0.0512,  1.4917],
         [-0.4787,  0.3

In [398]:
# For natural logarithm with the base of e = 2.718~, we must provide only positive numbers as input
# negative numbers will not work for any logarithms
# ln(e^x) = y(input_value), basically it will calculate such "x" so that e^x equals to "y" an input value
# Natural log is simple an inverse of e^x, in torch.exp(input_value), we are raising Eulers' number to given input_value, e^input_value
# Natural log, is log with base of e=2.718~, in torch.log(input_value), it will calculate an exponent for e such that
# the e^x = input_value, basically it is does the exact opposite of torch.exp()

x = torch.tensor([3,3,3])
e = torch.exp(x)
print("\n e^input_value:\n", e)

x = torch.abs(x)
print("\n Find such exponent so that e^x will equal to input_value:\n", torch.log(e))

# torch.exp() and torch.log() are inverses of each other


 e^input_value:
 tensor([20.0855, 20.0855, 20.0855])

 Find such exponent so that e^x will equal to input_value:
 tensor([3., 3., 3.])


In [399]:
x = torch.randn(2,3)
display(x)
e = torch.exp(x)
display(e) # raises Euler's number to the power of input_value
y = torch.log(e)
display(y)

tensor([[ 2.5460, -0.5250,  0.1986],
        [-1.7966, -2.4358, -0.0138]])

tensor([[12.7557,  0.5915,  1.2198],
        [ 0.1659,  0.0875,  0.9863]])

tensor([[ 2.5460, -0.5250,  0.1986],
        [-1.7966, -2.4358, -0.0138]])

In [400]:
x = torch.rand(2,3)
display(x)
y = torch.log(x) # remember it outputs the exponential values "x" such that e^x=input_value
display(y)
e = torch.exp(y) # if we e^torch.exp(x), then we will get back the original value of x
display(e)
# Natural log does not take negative values, because e^x = the output is never negative even if exponent "x" is negative
# negative exponents, will simply make the value smaller but never become negative
# ln(e^x) = y, basically it finds such exponent "x" so that the output is "y"

tensor([[0.5093, 0.2118, 0.2989],
        [0.8138, 0.8431, 0.9175]])

tensor([[-0.6748, -1.5523, -1.2076],
        [-0.2060, -0.1707, -0.0861]])

tensor([[0.5093, 0.2118, 0.2989],
        [0.8138, 0.8431, 0.9175]])

In [401]:
x = torch.randn(4,5)
x = torch.abs(x)
x_1 = torch.ones(3,5) * 2 # Scalar
e = torch.exp(x_1)
print(torch.sqrt(e))

tensor([[2.7183, 2.7183, 2.7183, 2.7183, 2.7183],
        [2.7183, 2.7183, 2.7183, 2.7183, 2.7183],
        [2.7183, 2.7183, 2.7183, 2.7183, 2.7183]])


In [402]:
# Reduction operations help to summarize vast amount of data within a single number, such as sum, mean, min, max
# Instead of looking at all the data, it is much easier to analyze the only few numbers which are representitives of the whole data
a = torch.rand(4,9)
display(a)

print("\nMean:\n", torch.mean(a))
print("\nSum:\n", torch.sum(a))
print("\nMin:\n", torch.min(a))
print("\nMax:\n", torch.max(a))

# Notice that the mean value stays more or less the same with few changes, which means that most of the data points
# are centered around that value of approximate 0.5, only when we are using rand with range (0,1) and not randn with normal distribution

tensor([[0.5340, 0.2789, 0.6121, 0.8012, 0.1938, 0.1412, 0.3877, 0.0363, 0.7764],
        [0.8699, 0.7946, 0.5591, 0.9067, 0.6184, 0.7033, 0.6262, 0.0195, 0.8188],
        [0.5028, 0.1628, 0.5443, 0.7226, 0.8835, 0.9063, 0.8497, 0.7745, 0.9600],
        [0.5109, 0.3464, 0.4243, 0.1182, 0.5919, 0.3182, 0.8460, 0.6174, 0.9228]])


Mean:
 tensor(0.5745)

Sum:
 tensor(20.6808)

Min:
 tensor(0.0195)

Max:
 tensor(0.9600)


In [403]:
a = torch.tensor([range(5)])
b = torch.randn(2,3)
display(a.float().T)
display(torch.abs(b.int()))

# The reason why we can keep adding methods in objects, because at its core they are torch.tensor() 
# which allows adding more methods to their assigned objects, if we are doing a.abs().int()
# effectively we are doing torch.tensor().abs().int(), because torch.tensor() is assigned to object "a"

display(torch.max(a))
display(a.max()) # the output is the same
display(torch.sum(a))
display(a.sum())
display(torch.mean(a.float()))
display(a.float().mean()) # remember that the mean requires input to be of floating type and not integer

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.]])

tensor([[1, 0, 0],
        [2, 0, 0]], dtype=torch.int32)

tensor(4)

tensor(4)

tensor(10)

tensor(10)

tensor(2.)

tensor(2.)

In [404]:
# We can use reduction operations along particular dimensions (rows and columns)
# if not dimension is specified the reduction will be made along all the dimensions by producing a single output for the entire tensor
# dim=0 represents rows, while dim=1 represents columns, dim=3,4...k represents sub-matrices any number preceding those
# standard (row, column) shape

a = torch.tensor([[1,1,1],[2,2,2],[4,4,4],[7,7,7]])
display(a.device, a.dtype, a.shape)
print("\nSumming Rows:\n", torch.sum(a, dim=0))
print("\nSumming columns:\n", torch.sum(a, dim=1))
print("\nMean rows:\n", torch.mean(a.float(), dim=0))
print("\nMean columns:\n", torch.mean(a.float(), dim=1))

device(type='cpu')

torch.int64

torch.Size([4, 3])


Summing Rows:
 tensor([14, 14, 14])

Summing columns:
 tensor([ 3,  6, 12, 21])

Mean rows:
 tensor([3.5000, 3.5000, 3.5000])

Mean columns:
 tensor([1., 2., 4., 7.])


In [405]:
a = torch.ones(2,5)
display(a, type(a))
display(torch.sum(a, dim=0))
display(torch.sum(a, dim=1))
display(torch.mean(a, dim=0))
display(torch.mean(a, dim=1))

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

torch.Tensor

tensor([2., 2., 2., 2., 2.])

tensor([5., 5.])

tensor([1., 1., 1., 1., 1.])

tensor([1., 1.])

In [406]:
# Comparison operations, shapes of both tensors must match, although does not have to be a square matrices like for element-wise operations
# <
# >
# <=
# >=
# == 
# !=
# the dtype of output is torch.bool, but it is still a torch.tensor at its core
# it is doing element-wise comparison, that is why shapes of both tensors must be equal, but not necessarily be square, can be rectangle

a = torch.rand(3,5)
b = torch.randn(3,5)
display(a,b)
c = a == b
print("\na == b:\n", a == b, type(c), c.dtype) # False if not equal
print("\na > b:\n", a > b)
print("\na < b:\n", a < b)
print("\na >= b:\n", a >= b)
print("\na <= b:\n", a <= b)
print("\na != b:\n", a != b) # True if not equal

tensor([[0.3696, 0.3075, 0.7802, 0.3711, 0.7749],
        [0.1043, 0.5252, 0.5201, 0.3494, 0.4971],
        [0.3683, 0.2233, 0.5532, 0.7388, 0.5115]])

tensor([[-0.9742,  1.3534,  0.4010,  1.7942, -0.1868],
        [ 0.1541,  1.2404, -1.8009,  0.3063, -0.1182],
        [ 0.9977,  1.9513, -0.2250, -1.2190, -1.1815]])


a == b:
 tensor([[False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False]]) <class 'torch.Tensor'> torch.bool

a > b:
 tensor([[ True, False,  True, False,  True],
        [False, False,  True,  True,  True],
        [False, False,  True,  True,  True]])

a < b:
 tensor([[False,  True, False,  True, False],
        [ True,  True, False, False, False],
        [ True,  True, False, False, False]])

a >= b:
 tensor([[ True, False,  True, False,  True],
        [False, False,  True,  True,  True],
        [False, False,  True,  True,  True]])

a <= b:
 tensor([[False,  True, False,  True, False],
        [ True,  True, False, False, False],
        [ True,  True, False, False, False]])

a != b:
 tensor([[True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True]])


In [436]:
# Updated info, element-wise operations can be performed by only square and rectangular matrices
# the shapes of both tensors must be the same, (mxn) for both A and B tensors

a = torch.rand(2,3)
b = torch.rand(2,3)

display(torch.mul(a, b))

x = torch.ones(2,3)
k = torch.ones(2,3)
display(x.add_(k), x, k)
display(k.add_(x), k)
display(k.pow_(x), k)
display(k.div_(x), k)
display(x.sub_(k), x, k)
display(k.add_(x), k)
k = k.int()
display(k)

tensor([[0.0556, 0.1531, 0.3779],
        [0.0325, 0.0393, 0.4640]])

tensor([[2., 2., 2.],
        [2., 2., 2.]])

tensor([[2., 2., 2.],
        [2., 2., 2.]])

tensor([[1., 1., 1.],
        [1., 1., 1.]])

tensor([[3., 3., 3.],
        [3., 3., 3.]])

tensor([[3., 3., 3.],
        [3., 3., 3.]])

tensor([[9., 9., 9.],
        [9., 9., 9.]])

tensor([[9., 9., 9.],
        [9., 9., 9.]])

tensor([[4.5000, 4.5000, 4.5000],
        [4.5000, 4.5000, 4.5000]])

tensor([[4.5000, 4.5000, 4.5000],
        [4.5000, 4.5000, 4.5000]])

tensor([[-2.5000, -2.5000, -2.5000],
        [-2.5000, -2.5000, -2.5000]])

tensor([[-2.5000, -2.5000, -2.5000],
        [-2.5000, -2.5000, -2.5000]])

tensor([[4.5000, 4.5000, 4.5000],
        [4.5000, 4.5000, 4.5000]])

tensor([[2., 2., 2.],
        [2., 2., 2.]])

tensor([[2., 2., 2.],
        [2., 2., 2.]])

tensor([[2, 2, 2],
        [2, 2, 2]], dtype=torch.int32)

In [82]:
# .ipynb jupyter notebook is a simple JSON file containing text, source code, rich media output, and metadata

!cat PyTorch.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2a7eff3e-a048-4829-8425-841989f1e275",
   "metadata": {},
   "source": [
    "# Install packages|"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "766ecba1-aed8-45f0-8c22-338731c752ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Torch version: 2.5.1\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "\n",
    "print(f\"Torch version: {torch.__version__}\")\n",
    "\n",
    "# Tensors are just like Numpy ndarrays matrices, but with capability to run much faster on GPUs, TPUs and CPUs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7c69fb56-74fe-4af5-9c35-550e150afd6d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cude is available: False\n"
     ]
    }
   ],
   "source": [
    "cuda_available = torc

In [546]:
# Logical operations, are effectively element-wise boolean operations on boolean tensors containing only "True" or "False"
# torch.logical_and(a,b) - it will output True only if both "a" AND "b" are the same, either True or False. 
# if at least one of them is different, then it will output False.
# torch.logical_or(a,b) - will output True of either of one boolean tensors have value True, and will output False only if
# both of the values of tensors are False.
# torch.logical_not(a) - takes only one tensor, and will output the exact opposite of the input_values, for instance,
# if input of given element is True, then it becomes False, and vice versa
# torch.logical_xor(a,b) - will output True only if both element values are not the same, otherwise if both values are the same
# it will output False, it is the reverse of what AND logical operation does which for statement to be True both must be equal and if not
# then will show False

x_1 = torch.ones(3,4)
x_2 = torch.zeros(3,4)
a = x_1 != x_2
b = x_1 == x_2
display(a,b)
print("\nLogical AND operation:\n", torch.logical_and(a, b))
print("\nLogical OR operation:\n", torch.logical_or(a, b))
print("\nLogical NOT operation of tensor A:\n", torch.logical_not(a))
print("\nLogical NOT operation of tensor B:\n", torch.logical_not(b))

tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])

tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])


Logical AND operation:
 tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])

Logical OR operation:
 tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])

Logical NOT operation of tensor A:
 tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])

Logical NOT operation of tensor B:
 tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])


In [547]:
import copy

#commented since the intention for boolean tensors not to change but rather stay the same with the help of copy.deepcopy()
#x_1 = torch.rand(3,4)
#x_2 = torch.rand(3,4)

x_1_1 = copy.deepcopy(x_1)
x_2_2 = copy.deepcopy(x_2)

a = x_1_1 >= x_2_2
b = x_1_1 <= x_2_2
display(a,b)

print("\nLogical AND operation:\n", torch.logical_and(a,b))
print("\nLogical OR operation:\n", torch.logical_or(a,b))
print("\nLogical NOT operation for tensor A:\n", torch.logical_not(a))
print(f"\nLogical NOT operation for tensor B:\n{torch.logical_not(b)}")
print(f"\nLogical XOR operation:\n{torch.logical_xor(a,b)}")

tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])

tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])


Logical AND operation:
 tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])

Logical OR operation:
 tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])

Logical NOT operation for tensor A:
 tensor([[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]])

Logical NOT operation for tensor B:
tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])

Logical XOR operation:
tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])


In [585]:
import copy

#x_1 = torch.randn(4,7)
#x_2 = torch.randn(4,7)
x_1 = copy.deepcopy(x_1)
x_2 = copy.deepcopy(x_2)
a = x_1 > x_2
b = x_1 < x_2
c = x_1 == x_2
d = x_1 != x_2
f = copy.deepcopy(a)
display(a,b,c,d)
print("\nLogical AND operator:\n", torch.logical_and(b,d))
print("\nLogical OR operator:\n", torch.logical_or(b,c))
print("\nLogical NOT operator:\n", torch.logical_not(torch.logical_or(b,c)))
print("\nLogical XOR operator:\n", torch.logical_xor(d,b))
display(a.dtype, b.dtype, c.dtype, d.dtype, type(a), type(b), type(c), type(d))
print(f, f.dtype)
display(f.int(), f.dtype)

tensor([[ True,  True,  True,  True,  True, False,  True],
        [False, False, False, False,  True,  True, False],
        [False,  True, False,  True,  True,  True, False],
        [False,  True,  True, False,  True,  True,  True]])

tensor([[False, False, False, False, False,  True, False],
        [ True,  True,  True,  True, False, False,  True],
        [ True, False,  True, False, False, False,  True],
        [ True, False, False,  True, False, False, False]])

tensor([[False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False]])

tensor([[True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True]])


Logical AND operator:
 tensor([[False, False, False, False, False,  True, False],
        [ True,  True,  True,  True, False, False,  True],
        [ True, False,  True, False, False, False,  True],
        [ True, False, False,  True, False, False, False]])

Logical OR operator:
 tensor([[False, False, False, False, False,  True, False],
        [ True,  True,  True,  True, False, False,  True],
        [ True, False,  True, False, False, False,  True],
        [ True, False, False,  True, False, False, False]])

Logical NOT operator:
 tensor([[ True,  True,  True,  True,  True, False,  True],
        [False, False, False, False,  True,  True, False],
        [False,  True, False,  True,  True,  True, False],
        [False,  True,  True, False,  True,  True,  True]])

Logical XOR operator:
 tensor([[ True,  True,  True,  True,  True, False,  True],
        [False, False, False, False,  True,  True, False],
        [False,  True, False,  True,  True,  True, False],
        [False,  

torch.bool

torch.bool

torch.bool

torch.bool

torch.Tensor

torch.Tensor

torch.Tensor

torch.Tensor

tensor([[ True,  True,  True,  True,  True, False,  True],
        [False, False, False, False,  True,  True, False],
        [False,  True, False,  True,  True,  True, False],
        [False,  True,  True, False,  True,  True,  True]]) torch.bool


tensor([[1, 1, 1, 1, 1, 0, 1],
        [0, 0, 0, 0, 1, 1, 0],
        [0, 1, 0, 1, 1, 1, 0],
        [0, 1, 1, 0, 1, 1, 1]], dtype=torch.int32)

torch.bool

In [62]:
# Again for element-wise operations the shape or order which represents the number of rows and columns 
# must be the same for both tensors if tensor A is of shape(4,9), then tensor B -which is being used
# for element-wise operations with tensor A - must also be of shape(4,9). shape can be both square and rectangular
import copy

a = torch.ones(3,4)
b = torch.randn(3,4)
x = a >= b
f = copy.deepcopy(x.int())
display(x, x.dtype)
display(x.int())
display(x.float())
display(f.bool())

# torch.logical_and(a,b) - True if both elements are the same 
# torch.logical_or(a,b) - True if at least one of the elements is True
# torch.logical_not(a) - returns the exact opposite, if it was True, then it turns to False
# torch.logical_xor(a,b) - True if both are not the same

tensor([[ True,  True,  True,  True],
        [ True,  True,  True, False],
        [ True, False, False,  True]])

torch.bool

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0],
        [1, 0, 0, 1]], dtype=torch.int32)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 0.],
        [1., 0., 0., 1.]])

tensor([[ True,  True,  True,  True],
        [ True,  True,  True, False],
        [ True, False, False,  True]])

# Relationship with Numpy

In [135]:
import numpy as np

data_np = np.array([[1,2,3],[4,5,6]], dtype=np.float32) # numpy.ndarray object type
data_tn = torch.from_numpy(data_np) # torch.Tensor object type

display(data_np, type(data_np), data_np.dtype, data_np.device)
display(data_tn, type(data_tn), data_tn.dtype, data_tn.device)

# One thing to note, if changes made to ndarray, it will reflect in tensor as well
# Because, in this case, since tensor is created from ndarray, they are both stored in the same CPU location
# If you do not want to create duplicates of ndarray from tensor, then do not use torch.from_numpy(), 
# but instead use simple torch.tensor(), because change in one will not reflect in another

a_np = np.array([[1,2,3],[4,5,6]])
a_tn_from_np = torch.from_numpy(a_np)
a_tn = torch.tensor(a_np)

display(a_np, a_tn_from_np)
a_np[1,2] = 99
display(a_np,a_tn_from_np)
a_tn_from_np[0,1] = 226
display(a_np, a_tn_from_np)
# Notice that change in ndarray, causes the same change in tensor
# However, when using simple torch.tensor() instead of torch.from_numpy(), does cause the tensor to change if ndarray is changed
# because now, they are stored in different CPU locations, but with from_numpy(), they are stored in the same CPU location

array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)

numpy.ndarray

dtype('float32')

'cpu'

tensor([[1., 2., 3.],
        [4., 5., 6.]])

torch.Tensor

torch.float32

device(type='cpu')

array([[1, 2, 3],
       [4, 5, 6]])

tensor([[1, 2, 3],
        [4, 5, 6]])

array([[ 1,  2,  3],
       [ 4,  5, 99]])

tensor([[ 1,  2,  3],
        [ 4,  5, 99]])

array([[  1, 226,   3],
       [  4,   5,  99]])

tensor([[  1, 226,   3],
        [  4,   5,  99]])

In [141]:
# If you do not want a change to reflect if either of one is modified, then do not use torch.from_numpy()
# use a simple torch.tensor() instead
# We can also convert PyTorch tensors into Numpy ndarrays using .numpy() method, but again in this way
# they both will point to the same memory location, cause change in one to be reflected in the other

cpu_tensor = torch.tensor([[4,5,6],[11,12,13]])
np_from_tn = cpu_tensor.numpy()
print(f"{cpu_tensor}\n{type(cpu_tensor)}\n")
print(f"{np_from_tn}\n{type(np_from_tn)}\n")

cpu_tensor[1,1] = 144
display(cpu_tensor, np_from_tn)
np_from_tn[1,2] = 169
display(cpu_tensor, np_from_tn)
# See, if ndarray is modified, so does tensor, and vice versa
# If you do not want this behaviour, then consider in this case using copy.deepcopy()

tensor([[ 4,  5,  6],
        [11, 12, 13]])
<class 'torch.Tensor'>

[[ 4  5  6]
 [11 12 13]]
<class 'numpy.ndarray'>



tensor([[  4,   5,   6],
        [ 11, 144,  13]])

array([[  4,   5,   6],
       [ 11, 144,  13]])

tensor([[  4,   5,   6],
        [ 11, 144, 169]])

array([[  4,   5,   6],
       [ 11, 144, 169]])

In [155]:
# .numpy() method only works for tensors stored on CPU, meaning torch.tensor().device must show CPU for
# conversion from tensor to numpy to actually work, if tensor is on GPU, then first move it to CPU
# by using method .cpu() and then use .numpy()

if torch.cuda.is_available():
    gpu_tensor_1 = torch.tensor([[4,5,6],[8,9,0]], device='cuda')
    gpu_tensor_2 = torch.randn(4,8)
    print(f"Tensors stored on GPU:\n{gpu_tensor_1}\n{gpu_tensor_2}")

    # Change the tensor memory location from GPU to CPU, in order for numpy conversion to work
    # because numpy ndarrays can only be stored and used on CPU only
    from_gpu_to_cpu_tensor_1 = gpu_tensor_1.cpu()
    from_gpu_to_cpu_tensor_2 = gpu_tensor_2.cpu()
    print(f"Tensors moved to CPU from GPU:\n{from_gpu_to_cpu_tensor_1}\n{from_gpu_to_cpu_tensor_2}")

    tn_to_np_1 = from_gpu_to_cpu_tensor_1.numpy()
    tn_to_np_2 = from_gpu_to_cpu_tensor_2.numpy()
    print(f"Successfuly conversion from tensor to numpy:\n{tn_to_np_1}\n{tn_to_np_2}")
else:
    print("\nCuda not available, conversion failed.")


Cuda not available, conversion failed.


In [158]:
# MPS GPU acceleration on Mac M-series is alternative for CUDA GPU acceleration
# MPS - Metal Performance Shaders
display(torch.backends.mps.is_available())
display(torch.backends.mps.is_built())

True

True

In [159]:
display(torch.cuda.is_available())

False

In [196]:
if torch.backends.mps.is_available():
    data_tn = torch.randn((4,6), device="mps")
    print(f"\nTensor stored in {data_tn.device} :\n{data_tn}\n")

    # Conversion to CPU from GPU
    tn_to_cpu = data_tn.cpu()
    print(f"Tensor stored in {tn_to_cpu.device} :\n{tn_to_cpu}")

    # From Tensor to Numpy ndarray
    tn_to_np = tn_to_cpu.numpy()
    print(f"From tensor to {type(tn_to_np)}:\n{tn_to_np}")
else:
    print("MPS is not available, because current PyTorch was not built with MPS enabled")

# Attempting to convert Tensors (stored on GPU) to Numpy ndarrays, will result in error


Tensor stored in mps:0 :
tensor([[ 0.0811,  0.1007, -1.5628,  0.8233,  2.2584,  1.0718],
        [ 0.1500,  0.8188, -1.5844,  0.8607,  0.5774,  0.9538],
        [-0.6257,  1.2679, -0.4052, -0.7668, -1.0988,  0.1977],
        [ 1.0082,  0.8887, -0.6935, -1.2598, -0.7664,  0.3295]],
       device='mps:0')

Tensor stored in cpu :
tensor([[ 0.0811,  0.1007, -1.5628,  0.8233,  2.2584,  1.0718],
        [ 0.1500,  0.8188, -1.5844,  0.8607,  0.5774,  0.9538],
        [-0.6257,  1.2679, -0.4052, -0.7668, -1.0988,  0.1977],
        [ 1.0082,  0.8887, -0.6935, -1.2598, -0.7664,  0.3295]])
From tensor to <class 'numpy.ndarray'>:
[[ 0.08106023  0.10071743 -1.5628318   0.8232865   2.2583804   1.071765  ]
 [ 0.15002526  0.81876004 -1.5844166   0.8607232   0.5773765   0.9538122 ]
 [-0.62565416  1.2679352  -0.40515414 -0.76682585 -1.0988053   0.19771942]
 [ 1.0082097   0.8886786  -0.69345546 -1.2597829  -0.7663796   0.32948285]]


In [186]:
print(torch.backends.mps.is_available()) # checks if MPS acceleration is available, since it can have limited memory size
print(torch.backends.mps.is_built()) # checks if current PyTorch install was built with MPS enabled 
# As of my knowledge, MPS may not be available if Mac is intel-based, or due to old MacOS version 
# or simply MPS is not enabled, hence need to enable it or install corresponding PyTorch with MPS-enabled

True
True


In [188]:
x = torch.rand(4,5)
display(x.device)
# Looks like as of now, by default configurations, variables are stored on CPU

device(type='cpu')

In [213]:
"""
    The ability to convert tensors to numpy and vice versa is critical, since at the initial step
    I might want to work with NumPy ndarrays for data loading and processing and leveraging libraries
    built using NumPy ndarrays. Then when it is time for building or training the neural networks
    I will need to convert NumPy ndarrays to PyTorch tensors in order to access GPU/MPS acceleration.
    Likewise, the outputs of the model (which are tensors), can be converted back to NumPy ndarrays for
    data analysis and visualization using matplotlib or seaborn.
    
"""

# Remember that torch.from_numpy(ndarray_as_input), results in both tensor and ndarray having the same CPU location in memory
# hence changing one, will modify the other as well. Likewise, converting torch.tensor(..., device="cpu") given that it is in CPU,
# converting tensor to numpy using torch.tensor().numpy() by using .numpy() will result in changes being reflected for both.
# Make sure that tensor's location is changed from GPU/MPS to CPU, before using .numpy() since ndarrays can be used on CPU only,
# otherwise the error will show up.

import copy

if torch.backends.mps.is_available():
    a = torch.randn((4,5), device="mps")
    b = torch.rand((3,4), device="mps")
    c = torch.ones((6,7), device="mps")
    d = torch.zeros((2,8), device="mps")
    print(f"\nTensors are stored in a: {a.device}, b: {b.device}, c: {c.device}, d: {d.device}\nObject type is {type(a)}")

    # with .cpu command which is effectively being used for torch.tensor().cpu()
    # changes the memory location of objects from GPU/MPS to CPU
    a_cpu = a.cpu()
    b_cpu = b.cpu()
    c_cpu = c.cpu()
    d_cpu = d.cpu()
    print(f"Tensors are stored in a: {a.device}, b: {b.device}, c: {c.device}, d: {d.device}")

    # If we want both ndarray and tensors be in the same CPU location, hence changes being reflect then do the following:
    a_np_ref = a_cpu.numpy()
    b_np_ref = b_cpu.numpy()
    c_np_ref = c_cpu.numpy()
    d_np_ref = d_cpu.numpy()
    print(f"Tensors are converted to {type(a_np_ref)}")

    c_cpu[2,3] = 789
    display(c_cpu, c_np_ref) # changes are reflect in both
    
    # If we do not want ndarray and tensors have the same CPU location and want to avoid change in one, cause change in another
    # , then do the following:
    a_np = copy.deepcopy(a_np_ref)
    b_np = copy.deepcopy(b_np_ref)
    c_np = copy.deepcopy(c_np_ref)
    d_np = copy.deepcopy(d_np_ref)

    d_cpu[1,4] = 567 # changing tensor
    display(d_cpu,d_np)
    d_np[1, 3] = 888 # changing ndarray
    display(d_cpu,d_np)

else:
    print("\nMPS is not available because the current PyTorch install was not built with MPS enabled or the MacOS version is old\
    or this Mac is intel-based.")


Tensors are stored in a: mps:0, b: mps:0, c: mps:0, d: mps:0
Object type is <class 'torch.Tensor'>
Tensors are stored in a: mps:0, b: mps:0, c: mps:0, d: mps:0
Tensors are converted to <class 'numpy.ndarray'>


tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [  1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [  1.,   1.,   1., 789.,   1.,   1.,   1.],
        [  1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [  1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [  1.,   1.,   1.,   1.,   1.,   1.,   1.]])

array([[  1.,   1.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   1.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   1.,   1., 789.,   1.,   1.,   1.],
       [  1.,   1.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   1.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   1.,   1.,   1.,   1.,   1.,   1.]], dtype=float32)

tensor([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0., 567.,   0.,   0.,   0.]])

array([[0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

tensor([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0., 567.,   0.,   0.,   0.]])

array([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0., 888.,   0.,   0.,   0.,   0.]], dtype=float32)

In [253]:
x = torch.randn(3,4)
display(x.device)

# use .to() method in order to move the variables/models from CPU to GPU and vice versa
device_cuda = torch.device("cuda" if torch.cuda.is_available() else "cpu") # since no cuda then replace it with mps
device_mps = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
x = x.to(device_mps)
display(x.device)

# Likewise we can also move models from CPU to GPU/MPS model.to(torch.device("cuda"))

x_1 = torch.rand(5,8)
display(x_1.device)

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
x_1 = x_1.to(device)
display(x_1.device)

x_1 = x_1.cpu()
display(x_1.device)
x_np = x_1.numpy()
display(type(x_np))

device(type='cpu')

device(type='mps', index=0)

device(type='cpu')

device(type='mps', index=0)

device(type='cpu')

numpy.ndarray

In [260]:
a = torch.rand(5,6)
display(a.device)
device_cuda = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device_mps = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
a_cuda = a.to(device_cuda)
a_mps = a.to(device_mps)
display(a_cuda.device, a_mps.device)

device(type='cpu')

device(type='cpu')

device(type='mps', index=0)

In [299]:
[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]

x = torch.randn(4,4)
print(x.device)
print(x.is_cpu)
print(x.is_cuda)

cpu
True
False


In [386]:
# list comprehension
a = [i**2 for i in range(5)]
b = [i+10 for i in range(9)]
c = [i+1e-10 for i in range(1,4,1)]
print(a, b, c)

display(1e-3)
display(9e-3)

[torch.cuda.get_device_name(k) for k in range(torch.cuda.device_count())]
device = torch.device("cuda" if torch.backends.mps.is_available() and torch.cuda.is_available() else "cpu")
display(device)

[0, 1, 4, 9, 16] [10, 11, 12, 13, 14, 15, 16, 17, 18] [1.0000000001, 2.0000000001, 3.0000000001]


0.001

0.009

device(type='cpu')

In [4]:
import timeit

timeit.default_timer() # shows in seconds the current time

x = torch.ones((4,5), dtype=torch.int32)
display(x, x.dtype)

tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]], dtype=torch.int32)

torch.int32

# Practice

In [14]:
# Below are various ways to check GPU availability, with list comprehension method being the shortets code-wise

import torch 
import numpy as np

print(f"PyTorch version: {torch.__version__}")

if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Devices name: {[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]}")

    device = torch.device("cuda")
else:
    print("GPU is not available")
    device = torch.device("cpu")

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(device)

PyTorch version: 2.5.1
GPU is not available
mps


In [10]:
import torch
import numpy as np

print(f"PyTorch version: {torch.__version__}")

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.mps.is_available():
    device = torch.device("mps")
else:
    print("CUDA/MPS is not available because it is either not enabled with PyTorch install or this machine does not have CUDA/MPS \
    compatible GPU. Therefore switching to CPU.")
    device = torch.device("cpu")

print(device)

PyTorch version: 2.5.1
mps


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [32]:
# Some tensors operations

data_list = [[3,4,5],[0,2,6]]
x = torch.tensor(data_list, device="mps:0")
display(x, x.dtype, x.device, x.shape)
x_1 = torch.tensor(data_list, dtype=torch.float32)
display(x_1, x_1.dtype, x_1.device, x_1.shape)
display(x_1.int(),x_1.bool(),x_1.float())

tensor([[3, 4, 5],
        [0, 2, 6]], device='mps:0')

torch.int64

device(type='mps', index=0)

torch.Size([2, 3])

tensor([[3., 4., 5.],
        [0., 2., 6.]])

torch.float32

device(type='cpu')

torch.Size([2, 3])

tensor([[3, 4, 5],
        [0, 2, 6]], dtype=torch.int32)

tensor([[ True,  True,  True],
        [False,  True,  True]])

tensor([[3., 4., 5.],
        [0., 2., 6.]])

In [76]:
a = torch.zeros(3,4)
b = torch.ones(3,5)
c = torch.rand(5,3)
d = torch.randn(3,4)
e = torch.arange(9)
display(a.T,b.T,c.T,d.T,e.T)

a_1 = torch.tensor([range(9)])
a_2 = torch.arange(8)
a_3 = torch.arange(0,8,2)
a_4 = torch.arange(-10,-20,-1)
display(a_1, a_2, a_3, a_4)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

tensor([[0.2732, 0.6586, 0.2809, 0.8131, 0.4121],
        [0.2538, 0.6386, 0.6992, 0.0259, 0.0512],
        [0.8165, 0.8876, 0.3466, 0.1153, 0.2816]])

tensor([[-0.5081,  0.7981, -0.3126],
        [-1.6333, -0.1380,  0.9548],
        [ 0.2522, -0.6939, -0.1397],
        [-2.1123,  1.0102,  0.6103]])

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8])

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8]])

tensor([0, 1, 2, 3, 4, 5, 6, 7])

tensor([0, 2, 4, 6])

tensor([-10, -11, -12, -13, -14, -15, -16, -17, -18, -19])

In [111]:
# For matrix multiplication we can use either A @ B or torch.matmul(A,B)
# althoug note that (x, m) @ (m, y) that the column of first matrix must match row of second matrix
a = torch.ones(3,4)
b = torch.rand(3,4)
#y = a @ b
print(y)
#print(torch.matmul(a,b))
print(torch.add(a,b), a+b)
print(torch.sub(a,b), a-b)
print(torch.div(a,b), a/b)
print(torch.mul(a,b), a*b)
print(torch.pow(a,b), a**b)
print(torch.mean(a.float()))
print(torch.mean(b.float()))
print(torch.sum(a))
print(torch.sum(b))
print(torch.min(a))
print(torch.min(b))
print(torch.max(a))
print(torch.max(b))
print(torch.sqrt(a))
print(torch.sqrt(b))
print(torch.abs(b))
print(torch.abs(a))
print(torch.exp(a))
print(torch.exp(b))
print(torch.log(a.abs()))
print(torch.log(b.abs()))
print(a.add_(b))
print(a.sub_(b))
print(a.div_(b))
print(a.mul_(b))
print(a.pow_(b))

tensor([[2.0890, 1.0459, 1.6776, 1.7605, 2.3827, 2.4006, 2.8182],
        [2.0890, 1.0459, 1.6776, 1.7605, 2.3827, 2.4006, 2.8182],
        [2.0890, 1.0459, 1.6776, 1.7605, 2.3827, 2.4006, 2.8182]])
tensor([[1.3166, 1.0175, 1.1785, 1.6299],
        [1.5109, 1.0669, 1.3003, 1.8897],
        [1.5190, 1.7838, 1.4329, 1.6846]]) tensor([[1.3166, 1.0175, 1.1785, 1.6299],
        [1.5109, 1.0669, 1.3003, 1.8897],
        [1.5190, 1.7838, 1.4329, 1.6846]])
tensor([[0.6834, 0.9825, 0.8215, 0.3701],
        [0.4891, 0.9331, 0.6997, 0.1103],
        [0.4810, 0.2162, 0.5671, 0.3154]]) tensor([[0.6834, 0.9825, 0.8215, 0.3701],
        [0.4891, 0.9331, 0.6997, 0.1103],
        [0.4810, 0.2162, 0.5671, 0.3154]])
tensor([[ 3.1585, 57.2129,  5.6023,  1.5875],
        [ 1.9574, 14.9368,  3.3298,  1.1240],
        [ 1.9269,  1.2759,  2.3102,  1.4608]]) tensor([[ 3.1585, 57.2129,  5.6023,  1.5875],
        [ 1.9574, 14.9368,  3.3298,  1.1240],
        [ 1.9269,  1.2759,  2.3102,  1.4608]])
tensor([[0.3166

In [125]:
# matrix multiplication
a = torch.randn(4,5)
b = torch.rand(5,3)
print(a @ b)
#print(b @ a) # matrix multiplication is not commutative
print(torch.matmul(a,b))
#print(torch.matmul(b,a))

a_1 = torch.rand(2,9)
b_1 = torch.randn(9,5)
print(a_1 @ b_1)
print(torch.matmul(a_1, b_1))
display(a_1 * 3)
print(b_1 + 1)
display(b_1 - 10)
display(b_1 / 4)
display(b_1 ** 2)
print(id(a))
print(id(b))

tensor([[-2.2348, -2.1300, -2.0124],
        [-0.8307, -0.6297, -0.5578],
        [ 1.0247,  0.6327,  0.4816],
        [ 2.6537,  2.9586,  1.0045]])
tensor([[-2.2348, -2.1300, -2.0124],
        [-0.8307, -0.6297, -0.5578],
        [ 1.0247,  0.6327,  0.4816],
        [ 2.6537,  2.9586,  1.0045]])
tensor([[-2.0059, -1.9786,  1.9418,  0.6486, -1.0208],
        [-0.7881, -0.6209,  1.2929,  1.9759, -1.2086]])
tensor([[-2.0059, -1.9786,  1.9418,  0.6486, -1.0208],
        [-0.7881, -0.6209,  1.2929,  1.9759, -1.2086]])


tensor([[1.1166, 1.0423, 1.8062, 1.7092, 2.0117, 1.3892, 0.4268, 0.7326, 1.8988],
        [2.8019, 2.1994, 1.0522, 0.6768, 2.5155, 2.3751, 2.3687, 0.5885, 0.2601]])

tensor([[ 2.5791,  2.2930,  0.0651,  1.4913,  0.4913],
        [ 0.1671, -0.4124,  0.2346,  1.3057,  1.3856],
        [ 0.6891,  2.0036,  1.6641, -0.4641,  0.8951],
        [ 0.6300, -0.7139,  1.3866,  1.5974,  0.3811],
        [-0.6538,  0.7905,  3.0068,  1.9610,  0.6015],
        [ 1.5509,  0.7372,  2.2493,  2.2552,  0.3759],
        [ 0.8132,  0.8284,  0.9675,  0.8721,  1.1839],
        [-0.3596,  0.6334, -0.4825,  1.0931,  0.0532],
        [-0.0962, -0.9288,  1.5972,  0.4794,  1.3342]])


tensor([[ -8.4209,  -8.7070, -10.9349,  -9.5087, -10.5087],
        [-10.8329, -11.4124, -10.7654,  -9.6943,  -9.6144],
        [-10.3109,  -8.9964,  -9.3359, -11.4641, -10.1049],
        [-10.3700, -11.7139,  -9.6134,  -9.4026, -10.6189],
        [-11.6538, -10.2095,  -7.9932,  -9.0390, -10.3985],
        [ -9.4491, -10.2628,  -8.7507,  -8.7448, -10.6241],
        [-10.1868, -10.1716, -10.0325, -10.1279,  -9.8161],
        [-11.3596, -10.3666, -11.4825,  -9.9069, -10.9468],
        [-11.0962, -11.9288,  -9.4028, -10.5206,  -9.6658]])

tensor([[ 0.3948,  0.3232, -0.2337,  0.1228, -0.1272],
        [-0.2082, -0.3531, -0.1914,  0.0764,  0.0964],
        [-0.0777,  0.2509,  0.1660, -0.3660, -0.0262],
        [-0.0925, -0.4285,  0.0967,  0.1493, -0.1547],
        [-0.4134, -0.0524,  0.5017,  0.2402, -0.0996],
        [ 0.1377, -0.0657,  0.3123,  0.3138, -0.1560],
        [-0.0467, -0.0429, -0.0081, -0.0320,  0.0460],
        [-0.3399, -0.0917, -0.3706,  0.0233, -0.2367],
        [-0.2741, -0.4822,  0.1493, -0.1302,  0.0835]])

tensor([[2.4935e+00, 1.6718e+00, 8.7412e-01, 2.4140e-01, 2.5880e-01],
        [6.9371e-01, 1.9947e+00, 5.8590e-01, 9.3476e-02, 1.4871e-01],
        [9.6665e-02, 1.0072e+00, 4.4102e-01, 2.1435e+00, 1.0995e-02],
        [1.3690e-01, 2.9375e+00, 1.4946e-01, 3.5683e-01, 3.8298e-01],
        [2.7349e+00, 4.3877e-02, 4.0272e+00, 9.2347e-01, 1.5878e-01],
        [3.0349e-01, 6.9080e-02, 1.5608e+00, 1.5755e+00, 3.8949e-01],
        [3.4878e-02, 2.9460e-02, 1.0565e-03, 1.6359e-02, 3.3828e-02],
        [1.8485e+00, 1.3440e-01, 2.1978e+00, 8.6640e-03, 8.9646e-01],
        [1.2017e+00, 3.7201e+00, 3.5668e-01, 2.7107e-01, 1.1168e-01]])

6356706384
6356649120


In [131]:
a = torch.rand(3,4)
b = torch.rand(3,4)
print(f"Memory address ID before in-place operation: {id(a)}")
print(a.add_(b))
print(f"After in-place operation: {id(a)}")
display(id(a) == id(a))

data_np = np.array([[1,2,3],[5,6,7]])
data_tn_np = torch.from_numpy(data_np)
data_tn = torch.tensor(data_np)
display(id(data_np))
display(id(data_tn_np))
display(id(data_tn))

print(f"Tensors from Numpy using torch.from_numpy(), have_same_memory_location: {id(data_np) == id(data_tn_np)}")
print(f"Tensors from Numpy using torch.tensor(), have_same_memory_location: {id(data_np) == id(data_tn)}")

Memory address ID before in-place operation: 6356530256
tensor([[0.7734, 1.9098, 1.1382, 0.9781],
        [0.7951, 0.9700, 1.2751, 0.8907],
        [0.7296, 0.6490, 0.6124, 0.9672]])
After in-place operation: 6356530256


True

6356641520

6356531456

6359090816

Tensors from Numpy using torch.from_numpy(), have_same_memory_location: False
Tensors from Numpy using torch.tensor(), have_same_memory_location: False


In [159]:
import numpy as np

data_np = np.array([[3,4,5],[1,2,3]])
data_tn_np = torch.from_numpy(data_np)
data_tn_np = data_tn_np.clone() 
# this helps to create a distinct copy of the tensor, so that any changes made to numpy will not reflect tensor
data_np[0,2] = 45

display(data_np, data_tn_np)

array([[ 3,  4, 45],
       [ 1,  2,  3]])

tensor([[3, 4, 5],
        [1, 2, 3]])

In [202]:
import copy

if torch.cuda.is_available() or torch.backends.mps.is_available():
    device = torch.device("cuda" if torch.cuda.is_available() else "mps")
    data_tn = torch.ones((3,4), device=device)
    display(data_tn.device, data_tn.is_cuda, data_tn.is_mps)

    data_tn = data_tn.cpu()
    display(data_tn.device)
    data_tn = data_tn.clone() # .clone() is available for tensors only, use copy.deepcopy() for numpy
    data_tn_np = data_tn.numpy()
    data_tn_np = copy.deepcopy(data_tn_np)
    data_tn[2,2] = 888
    display(data_tn, data_tn_np)
else:
    print("GPU/MPS acceleration is not available.")

device(type='mps', index=0)

False

True

device(type='cpu')

tensor([[  1.,   1.,   1.,   1.],
        [  1.,   1.,   1.,   1.],
        [  1.,   1., 888.,   1.]])

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]], dtype=float32)

In [201]:
import numpy as np

ndarray_np = np.array([[4,5,6],[3,4,5]])
tn = torch.from_numpy(ndarray_np)
tn = tn.clone()
ndarray_np[1,2] = 999
tn[1,2] = 123
display(ndarray_np, tn)

tns = torch.tensor(ndarray_np)
tns[0,0] = 48
ndarray_np[0,0] = 56
display(tns, ndarray_np)

array([[  4,   5,   6],
       [  3,   4, 999]])

tensor([[  4,   5,   6],
        [  3,   4, 123]])

tensor([[ 48,   5,   6],
        [  3,   4, 999]])

array([[ 56,   5,   6],
       [  3,   4, 999]])

# Tensor indexing and slicing

In [21]:
import torch

a = torch.tensor([12,3,4,5,98,56])
a_ = a[0].item() # If we use .item() then it will return a strandard Python dtype
print(a_, type(a_)) # Even though it returns a single value scalar, 0-dimensional, it still shows as torch.Tenso
a_1 = a[-1]
a[2] = 457
display(a_1, a)
b = torch.arange(2,13,3)
display(b[-1])

12 <class 'int'>


tensor(56)

tensor([ 12,   3, 457,   5,  98,  56])

tensor(11)

In [42]:
a = torch.arange(0,100,3)
a_last = a[-1]
a_mid = a[15]
a[4] = 144
display(a_last, a_mid, a)
a_1 = a[5]
a_2 = a[6].item()
display(type(a_1), type(a_2))

tensor(99)

tensor(45)

tensor([  0,   3,   6,   9, 144,  15,  18,  21,  24,  27,  30,  33,  36,  39,
         42,  45,  48,  51,  54,  57,  60,  63,  66,  69,  72,  75,  78,  81,
         84,  87,  90,  93,  96,  99])

torch.Tensor

int

In [41]:
a = torch.rand(4,5)
col_1 = a[:,0]
row_1 = a[0]
col_2 = a[:,1]
row_3 = a[2]
scalar = a[0,0].item() #.item works for 0-dimensional tensors/scalars

display(a, col_1, row_1, col_2, row_3)
display(type(row_3), type(scalar))

tensor([[0.4805, 0.3837, 0.9928, 0.0972, 0.7391],
        [0.9631, 0.6443, 0.5999, 0.4072, 0.7218],
        [0.4677, 0.3635, 0.2099, 0.8057, 0.3101],
        [0.4876, 0.5296, 0.4929, 0.5418, 0.2990]])

tensor([0.4805, 0.9631, 0.4677, 0.4876])

tensor([0.4805, 0.3837, 0.9928, 0.0972, 0.7391])

tensor([0.3837, 0.6443, 0.3635, 0.5296])

tensor([0.4677, 0.3635, 0.2099, 0.8057, 0.3101])

torch.Tensor

float

In [100]:
a = torch.arange(10.0)
slice_1 = a[2:8]
slice_2 = a[::2]
slice_3 = a[:5]
slice_4 = a[5::2]
slice_5 = a[5:]
slice_6 = a[::]
s_slice = a[6].item()
display(slice_1, slice_2, slice_3, slice_4, slice_5, slice_6)
display(type(s_slice))

tensor([2., 3., 4., 5., 6., 7.])

tensor([0., 2., 4., 6., 8.])

tensor([0., 1., 2., 3., 4.])

tensor([5., 7., 9.])

tensor([5., 6., 7., 8., 9.])

tensor([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

float

In [103]:
# We can perform indexing with boolean tensors, where only values correspondin to True will be returned
# This is called boolean masking where instead of integer values acting as index for capturing specific values within the tensor
# We are using boolean tensor containing only True and False, based on a given condition
# Boolean tensor can be derived from the original tensor, thus their shapes will be equal and must be equal for boolean masking to work
# Boolean masking/indexing, does not preserve the shape when values are returned unlike slicing which does preserve the shape
# of original tensor, Boolean masking/indexing will also make a distinct copy of original tensor, therefore no changes will be reflected

a = torch.tensor([[3,4,5],[6,7,8],[1,2,9]])
b = torch.ones(3,3)
c = a > b
display(c)
y = a[c]
display(y)

tensor([[ True,  True,  True],
        [ True,  True,  True],
        [False,  True,  True]])

tensor([3, 4, 5, 6, 7, 8, 2, 9])

In [107]:
a = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
mask = a > 4
print(mask.dtype)
output = a[mask] # Boolean tensor is provided as an index, and only values corresponding to True in index positions will be returned
print(output)

mask_2 = a < 5
print(mask_2) # Boolean Tensor
print(a[mask_2])

torch.bool
tensor([5, 6, 7, 8, 9])
tensor([[ True,  True,  True],
        [ True, False, False],
        [False, False, False]])
tensor([1, 2, 3, 4])


In [109]:
# We can also modify the values based on condition
a[a <= 4] = 0 # all the values below 4 are now 0, similar how ReLu works
print(a)

tensor([[0, 0, 0],
        [0, 5, 6],
        [7, 8, 9]])


In [230]:
x = torch.randn(5,7)
mask = x > 1
y = x[mask]
x[x >= 1] = 1
display(y)
display(x)

tensor([1.6515, 1.2300, 1.3310, 1.7905, 1.6108])

tensor([[-9.7886e-02, -8.6619e-01, -6.4920e-01,  4.9512e-01,  3.4835e-01,
          7.6029e-01,  1.0000e+00],
        [ 1.0000e+00, -1.0876e+00,  9.3101e-01, -2.0407e-01, -4.8372e-01,
          4.8101e-01,  6.9200e-01],
        [-1.4987e-01,  2.8305e-01, -3.4163e-01,  9.8276e-01,  1.0000e+00,
          6.2071e-01,  1.0000e+00],
        [-1.3958e+00, -1.2621e+00, -6.7389e-01,  5.3808e-01, -1.4386e+00,
          5.9616e-01, -6.5851e-01],
        [ 1.0000e+00, -1.8491e+00,  9.1861e-02, -1.9241e-01, -1.6943e-03,
          6.2034e-01,  6.4031e-01]])

In [278]:
x = torch.rand(3,7)
mask = x > 0.5
print(mask)
print(x[mask])
x[x <= 0.5 ] = 0
print(x)

tensor([[False, False, False,  True,  True, False,  True],
        [False, False,  True, False, False, False, False],
        [False,  True, False,  True,  True, False,  True]])
tensor([0.7758, 0.8148, 0.5171, 0.8012, 0.7501, 0.6935, 0.9376, 0.7177])
tensor([[0.0000, 0.0000, 0.0000, 0.7758, 0.8148, 0.0000, 0.5171],
        [0.0000, 0.0000, 0.8012, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.7501, 0.0000, 0.6935, 0.9376, 0.0000, 0.7177]])


In [290]:
x = torch.rand(8,5)
mask_1 = x < 0.2
#k = [i for i in x if i < 0.4]
mask_2 = x < 0.4
display(mask_2)

y_1 = x[mask_2]
y_2 = x[mask_1]
display(y_1, y_2)

tensor([[False, False, False,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False, False],
        [False,  True, False,  True,  True],
        [ True, False, False, False,  True],
        [ True,  True, False, False, False],
        [ True, False, False,  True,  True],
        [False, False,  True, False, False]])

tensor([0.1836, 0.3841, 0.2054, 0.3287, 0.0135, 0.2012, 0.0368, 0.2202, 0.0154,
        0.1289, 0.2149, 0.3630, 0.2507, 0.3311, 0.0946])

tensor([0.1836, 0.0135, 0.0368, 0.0154, 0.1289, 0.0946])

In [301]:
a = torch.arange(12).reshape(3,4)
print(a)

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])


In [309]:
# Just like boolean indexing, we can provide integer tensors for selecting an arbitrary order of elements from tensors
# Those integer tensors acting as indexes, must be 0-dimensional acting as a scalar, or simply a list of integers
a = torch.arange(20).reshape(5,4)
display(a)
print(a[[0,4],[3,0]])

ind = [0,1,3]
print(a[ind])

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19]])

tensor([ 3, 16])
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [12, 13, 14, 15]])


In [328]:
# See the tensor acting as an index, allows more flexibility to choose any values in arbitrary order
a = torch.arange(100, 200, 15)
ind = torch.tensor([0,4,5,2,2])
print(f"Indexes {ind} and corresponding values:\n{a[ind]}\n")

# row index
b = torch.arange(20).reshape(4,5)
display(b)
row_index = torch.tensor([1,3]) 
row_index_1 = torch.tensor([0,2])
print(f"Indexes {row_index} {row_index_1} and corresponding rows:\n{b[row_index]}\n{b[row_index_1]}")
# Since the first value corresponds to the # of rows, only given rows of 1 and 3 are sliced
# Also, just like for boolean masking, tensor indexing also create a new distinct copy of original tensor, without any reference
# Hence no updates in one will reflect the other

# column index
col_index_1 = torch.tensor([1,3])
col_index_2 = torch.tensor([0,2,4])
print(f"Column indexes {col_index_1} {col_index_2} and corresponding columns:\n{b[:,col_index_1]}\n{b[:,col_index_2]}")


# row and column index to select [3,17,12] and [7,14,16]
crow_index_1 = torch.tensor([0,3,2])
rcol_index_1 = torch.tensor([3,2,2])
crow_index_2 = torch.tensor([1,2,3])
rcol_index_2 = torch.tensor([2,4,1])

print(f"Row index {crow_index_1} column index {rcol_index_1}:\n{b[crow_index_1, rcol_index_1]}\n")
print(f"Row index {crow_index_2} column index {rcol_index_2}:\n{b[crow_index_2, rcol_index_2]}\n")

Indexes tensor([0, 4, 5, 2, 2]) and corresponding values:
tensor([100, 160, 175, 130, 130])



tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19]])

Indexes tensor([1, 3]) tensor([0, 2]) and corresponding rows:
tensor([[ 5,  6,  7,  8,  9],
        [15, 16, 17, 18, 19]])
tensor([[ 0,  1,  2,  3,  4],
        [10, 11, 12, 13, 14]])
Column indexes tensor([1, 3]) tensor([0, 2, 4]) and corresponding columns:
tensor([[ 1,  3],
        [ 6,  8],
        [11, 13],
        [16, 18]])
tensor([[ 0,  2,  4],
        [ 5,  7,  9],
        [10, 12, 14],
        [15, 17, 19]])
Row index tensor([0, 3, 2]) column index tensor([3, 2, 2]):
tensor([ 3, 17, 12])

Row index tensor([1, 2, 3]) column index tensor([2, 4, 1]):
tensor([ 7, 14, 16])



In [346]:
x = torch.arange(0,1,1e-6).reshape(1000, 1000)
display(x.shape)

torch.Size([1000, 1000])

In [388]:
# reshape(a,b) must so that a*b equal to given input of arange(c) a*b = c, 10*10=100
a = torch.arange(100).reshape(10, 10)
print(a)
row_index = [2,6,2,7,-1, -3]
col_index = [2,3,6,8,-1, -2]
out = a[row_index, col_index]
out_1 = a[:,col_index]
out_2 = a[row_index,:]
print(out, out.shape, out_1, out_1.shape, out_2, out_2.shape)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
        [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
        [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
        [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
        [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
        [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])
tensor([22, 63, 26, 78, 99, 78]) torch.Size([6]) tensor([[ 2,  3,  6,  8,  9,  8],
        [12, 13, 16, 18, 19, 18],
        [22, 23, 26, 28, 29, 28],
        [32, 33, 36, 38, 39, 38],
        [42, 43, 46, 48, 49, 48],
        [52, 53, 56, 58, 59, 58],
        [62, 63, 66, 68, 69, 68],
        [72, 73, 76, 78, 79, 78],
        [82, 83, 86, 88, 89, 88],
        [92, 93, 96, 98, 99, 98]]) torch.Size([10, 6]) tensor([[20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [60, 61, 62, 63, 64, 65, 66, 67

In [367]:
a = torch.arange(250).reshape(5,5,10)
print(a, a.dtype, type(a))
a[a > 125] = 125
a[a < 125] = 0
print(a)

tensor([[[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9],
         [ 10,  11,  12,  13,  14,  15,  16,  17,  18,  19],
         [ 20,  21,  22,  23,  24,  25,  26,  27,  28,  29],
         [ 30,  31,  32,  33,  34,  35,  36,  37,  38,  39],
         [ 40,  41,  42,  43,  44,  45,  46,  47,  48,  49]],

        [[ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59],
         [ 60,  61,  62,  63,  64,  65,  66,  67,  68,  69],
         [ 70,  71,  72,  73,  74,  75,  76,  77,  78,  79],
         [ 80,  81,  82,  83,  84,  85,  86,  87,  88,  89],
         [ 90,  91,  92,  93,  94,  95,  96,  97,  98,  99]],

        [[100, 101, 102, 103, 104, 105, 106, 107, 108, 109],
         [110, 111, 112, 113, 114, 115, 116, 117, 118, 119],
         [120, 121, 122, 123, 124, 125, 126, 127, 128, 129],
         [130, 131, 132, 133, 134, 135, 136, 137, 138, 139],
         [140, 141, 142, 143, 144, 145, 146, 147, 148, 149]],

        [[150, 151, 152, 153, 154, 155, 156, 157, 158, 159],
         [160, 161

In [387]:
a = torch.randn(5,5,10)
a[a <= 0] = 0
print(a)
k = a.bool()
print(k)
a = a[k]
print(a)
a = a.cpu()
a = a.to("cpu").numpy()
print(type(a))

tensor([[[1.0452, 2.1169, 0.0000, 0.0000, 0.0000, 0.0000, 0.1204, 0.2706,
          0.5941, 0.1667],
         [0.0000, 0.3531, 0.0000, 0.0000, 0.3922, 0.7303, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.0000, 1.7069, 0.0000, 0.0000, 0.9918, 0.0000, 1.5400, 0.0000,
          0.0000, 0.0000],
         [0.0000, 0.0000, 0.4139, 0.7814, 0.0000, 0.1294, 0.0000, 1.9712,
          0.0000, 0.0000],
         [0.0713, 0.0000, 0.6040, 0.0000, 0.9000, 0.9442, 0.0000, 0.0000,
          1.3106, 1.4688]],

        [[0.9692, 0.9037, 0.0000, 1.3800, 2.4983, 1.9402, 0.0000, 1.2193,
          0.7208, 0.6699],
         [0.0000, 0.9197, 1.2402, 0.1188, 0.0000, 0.0000, 1.3453, 0.8899,
          0.0000, 0.0000],
         [1.5770, 0.0000, 0.6776, 0.0000, 0.0000, 0.8369, 0.5640, 0.4449,
          0.0000, 0.8447],
         [0.3091, 1.5039, 0.0000, 1.6581, 0.0000, 0.4514, 0.0000, 0.3266,
          0.0000, 0.0766],
         [0.0000, 0.9519, 0.0000, 0.6262, 0.2252, 0.5458, 0.0000, 0.0000,
          0.000

In [436]:
data = torch.randn(5,9,6)
print(data)
data_ = data.clone()
mask = data_ >= 2.5
out = data_[mask]
print(out)

tensor([[[ 0.1541, -0.7640,  1.4316,  0.4440, -0.1638,  0.5339],
         [-1.3795,  1.6910, -0.2075,  0.0386, -0.4740,  1.4058],
         [-0.2388,  1.9894, -2.6206,  0.5318,  0.8574,  0.2210],
         [-0.9408, -0.2698,  2.3347,  0.1570,  1.0208,  1.9685],
         [ 1.9682, -0.7210, -0.5699, -0.3891, -0.6902, -0.1473],
         [ 1.0022, -1.1553, -0.6832, -0.7163,  1.1684, -1.0898],
         [-0.7565, -1.8426,  0.8631,  1.3623, -0.2753, -0.9114],
         [ 2.0284,  0.8553, -1.2546, -1.6465,  0.0274, -1.7507],
         [-1.9344, -0.6700, -1.3205,  0.2744,  0.0488, -0.7669]],

        [[ 0.2334, -0.3849, -0.0371, -2.1344, -1.5492, -1.4440],
         [ 0.0851, -0.4351,  0.5415,  1.0551, -0.5707, -0.7926],
         [ 0.4770, -1.0899,  0.3575, -1.0402,  1.0323,  0.5071],
         [ 0.8206,  0.1172,  0.7503, -0.8246,  0.1645,  2.7949],
         [-1.0634, -0.5220, -0.2783,  0.2780, -1.0693,  0.1411],
         [-0.0453,  0.1379, -1.2321,  0.1092, -0.7607, -0.9166],
         [-0.4697, -1.2

In [440]:
a = torch.rand(8,4,3)
print(a)
a = a.clone()
a[a <= 0.5] = 0
k = a.bool()
print(a)
print(a[k])

tensor([[[0.3295, 0.6129, 0.3455],
         [0.6025, 0.8643, 0.7655],
         [0.4261, 0.2768, 0.4233],
         [0.5592, 0.3922, 0.9427]],

        [[0.2687, 0.2633, 0.6238],
         [0.2583, 0.6621, 0.2675],
         [0.7784, 0.1841, 0.7578],
         [0.7109, 0.6096, 0.8750]],

        [[0.4524, 0.3265, 0.3561],
         [0.7671, 0.0372, 0.0468],
         [0.1766, 0.5985, 0.9410],
         [0.2882, 0.3704, 0.6663]],

        [[0.0797, 0.4322, 0.9529],
         [0.3096, 0.4884, 0.8075],
         [0.9607, 0.0531, 0.9894],
         [0.6854, 0.7408, 0.5432]],

        [[0.6648, 0.7839, 0.0036],
         [0.7835, 0.7145, 0.4826],
         [0.0291, 0.9633, 0.6656],
         [0.1704, 0.7170, 0.2273]],

        [[0.9336, 0.6461, 0.4245],
         [0.3716, 0.8536, 0.0368],
         [0.6886, 0.4181, 0.7093],
         [0.1621, 0.3817, 0.1797]],

        [[0.7348, 0.9000, 0.9823],
         [0.3002, 0.8980, 0.7955],
         [0.7578, 0.0923, 0.4431],
         [0.4791, 0.0858, 0.4528]],

      

# Reshaping and Rearranging Tensors

In [496]:
# Contiguous tensors are the ones which did not go through slicing or transposing, basically nothing is changed about them
# regarding their position of rows and columns or taking slices of the given tensor.

a = torch.arange(10).reshape(2,5) 
print(a.is_contiguous())
# since after that we did not make any changes to its shape by slicing or transposing, thus it is contiguous tensor
a = a.T
print(a[[1,3]])
# now it is no longer contiguous tensor, since it has been either sliced and transposed
print(a.is_contiguous())
x = a.reshape(5,2)
display(x, a)

True
tensor([[1, 6],
        [3, 8]])
False


tensor([[0, 5],
        [1, 6],
        [2, 7],
        [3, 8],
        [4, 9]])

tensor([[0, 5],
        [1, 6],
        [2, 7],
        [3, 8],
        [4, 9]])

In [497]:
a = torch.arange(18)
print(f"Tensor:\n{a}\nTensor shape: {a.shape}\nIs tensor contiguous?: {a.is_contiguous()}\n")

# both view and reshape seem to produce the same results
x = a.view(3,6)
print(f"Tensor:\n{x}\nTensor shape: {x.shape}\nIs tensor contiguous?: {x.is_contiguous()}")
print(f"Are both tensors stored in the same memory location?: {a.storage().data_ptr() == x.storage().data_ptr()}\n")

z = x.view(9,2)
print(f"Tensor:\n{z}\nTensor shape:\n{z.shape}\nIs tensor contiguous?: {z.is_contiguous()}")
print(f"Are both tensors are stored in the same memory location?: {x.storage().data_ptr() == z.storage().data_ptr()}\n")
print(f"Are all three tensors of different sizes created with .view() share the same memory location?:\
 {a.storage().data_ptr() == x.storage().data_ptr() == z.storage().data_ptr()}\n")

Tensor:
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])
Tensor shape: torch.Size([18])
Is tensor contiguous?: True

Tensor:
tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11],
        [12, 13, 14, 15, 16, 17]])
Tensor shape: torch.Size([3, 6])
Is tensor contiguous?: True
Are both tensors stored in the same memory location?: True

Tensor:
tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])
Tensor shape:
torch.Size([9, 2])
Is tensor contiguous?: True
Are both tensors are stored in the same memory location?: True

Are all three tensors of different sizes created with .view() share the same memory location?: True



In [530]:
# .view() method works for contiguous tensors, so before reshaping check whether a given tensor is_contiguous()
# or otherwise reshaping will not work

a = torch.arange(12)
x = a.view(3,-1,2)
print(x)

tensor([[[ 0,  1],
         [ 2,  3]],

        [[ 4,  5],
         [ 6,  7]],

        [[ 8,  9],
         [10, 11]]])


In [535]:
# With .view() we can infer only one dimension based on other dimensions
# if the total number of elements are 12, then we can use "-1" for PyTorch to infer the dimension of that last 
# element by itselfs so that the product of shape elements equal to the total number of elements, here being 12
a = torch.arange(12)
x_1 = a.view(2,2,-1) # "-1" here inferring, so that 2*2*x = 12, hence x=3
x_2 = a.view(-1,3,2) # "-1" here inferring, so that x*3*2 = 12, hence x=2
x_3 = a.view(3, -1, 4) # "-1" here inferring, so that 3*x*4 = 12, hence x=1
display(x_1, x_1.shape, x_2, x_2.shape, x_3, x_3.shape)
# Again only one dimension can be inferred, and only one.

print(f"\nAre these tensors share the same memory location?: \
{a.storage().data_ptr() == x_1.storage().data_ptr() == x_2.storage().data_ptr() == x_3.storage().data_ptr()}")

tensor([[[ 0,  1,  2],
         [ 3,  4,  5]],

        [[ 6,  7,  8],
         [ 9, 10, 11]]])

torch.Size([2, 2, 3])

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]]])

torch.Size([2, 3, 2])

tensor([[[ 0,  1,  2,  3]],

        [[ 4,  5,  6,  7]],

        [[ 8,  9, 10, 11]]])

torch.Size([3, 1, 4])


Are these tensors share the same memory location?: True


In [555]:
# If we try to reshape with .view() for non-contiguous tensors then nothing will change and error may pop up
a = torch.arange(15).view(3,5)
b = a.t()
print(b.is_contiguous())

try:
    x = b.view(15)
    display(x, x.shape)
except RuntimeError as e:
    print(f"\nTensor is non-congtiguous {e}")

False

Tensor is non-congtiguous view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.


In [568]:
# .reshape() behaves similarly to .view() but .reshape() is more flexible and versatile since it does not require
# tensors to be contiguous, because if the tensors are non-contiguous then .reshape() simply makes copies of original
# tensor and reshapes them as needed, potentially creating several copies of the original tensor, which is 
# unlike .view() the resulting tensors shares the same memory location hence any changes will reflect other tensors

a = torch.arange(12).reshape(3,4)
a = a.t()
print(f"\nIs contiguous?: {a.is_contiguous()}\nShape: {a.shape}\n")
x = a.reshape(2,6)
print(f"Is contiguous?: {x.is_contiguous()}\nShape: {x.shape}\n")
z = x.reshape(4,-1) # "-1" will be inferred as 3 since 4*3=12
print(f"Is contiguous?: {z.is_contiguous()}\nShape:{z.shape}\n")
flat = z.view(-1) # flattens the matrix into a list of scalar/singular values in 0-dimensions

# Now check whether reshape() has created a copy of original tensor, or simply took its view
# reshape() will create a copy of original tensor if only it is non-contiguous, otherwise no copy is created
print(f"Are these tensors share the same memory location?: {a.storage().data_ptr() == x.storage().data_ptr()}")
display(id(a), id(x)) # IDs do not represent memory location
display(flat, flat.shape)


Is contiguous?: False
Shape: torch.Size([4, 3])

Is contiguous?: True
Shape: torch.Size([2, 6])

Is contiguous?: True
Shape:torch.Size([4, 3])

Are these tensors share the same memory location?: False


5376074432

5389109344

tensor([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

torch.Size([12])

In [582]:
image_tensor = torch.rand(3, 128, 128) # C, H, W
print(image_tensor.shape)
permutated_tensor = image_tensor.permute(1,2,0)
print(permutated_tensor.shape)
original_again = permutated_tensor.permute(2,0,1)
print(original_again.shape)
print(f"Are permutated tensors are contiguous: {permutated_tensor.is_contiguous()} {original_again.is_contiguous()}\n\
Does original tensor and permutated back to original tensor share the same memory location?: \
{image_tensor.storage().data_ptr() == original_again.storage().data_ptr()}")

# permute() just like view() does not create a copy of original tensor, but rather changes the original tensor
# thus any reference to them, will cause change in other tensors as well
# permute() however, returns non-contiguous tensors, in order to fix that use .contiguous() method

print(f"\nIs permutated tensor contiguous?: {permutated_tensor.is_contiguous()}")
permutated_tensor_to_contiguous = permutated_tensor.contiguous()
print(f"\nIs permutated tensor after using .contiguous() method caused it to become contiguous?:\
 {permutated_tensor_to_contiguous.is_contiguous()}")

torch.Size([3, 128, 128])
torch.Size([128, 128, 3])
torch.Size([3, 128, 128])
Are permutated tensors are contiguous: False True
Does original tensor and permutated back to original tensor share the same memory location?: True

Is permutated tensor contiguous?: False

Is permutated tensor after using .contiguous() method caused it to become contiguous?: True


In [600]:
# .view() does the reshaping faster than .reshape(), because it does not copy
# but it assumges contiguity in order to work without RuntimeError while .reshape() is more flexible and can handle
# non-contiguous tensors by making copies, though taking time for this operation
# .permute() simply changes the order of given input shapes by using their indexes

big_data = torch.rand(3, 3, 64, 64)
display(big_data.shape)
permute_data = big_data.permute(2, 0, 3, 1)
display(permute_data.shape)
permute_data_2 = permute_data.permute(0, 2, 1, 3)
display(permute_data_2.shape)
permute_back_to_original = permute_data_2.permute(2,3,0,1)
display(permute_back_to_original.shape)

print(f"Are those tensors contiguous?: {permute_data.is_contiguous()}, {permute_data_2.is_contiguous()}, \
{permute_back_to_original.is_contiguous()}")
permute_data = permute_data.contiguous()
permute_data_2 = permute_data_2.contiguous()
print(f"Are those tensors contiguous after using .contiguous() method?: \
{permute_data.is_contiguous()}, {permute_data_2.is_contiguous()}, \
{permute_back_to_original.is_contiguous()}")

# .permute() simply changes the order of dimensions/shapes of the tensor by taking indexes of those dimensions as input

torch.Size([3, 3, 64, 64])

torch.Size([64, 3, 64, 3])

torch.Size([64, 64, 3, 3])

torch.Size([3, 3, 64, 64])

Are those tensors contiguous?: False, False, True
Are those tensors contiguous after using .contiguous() method?: True, True, True


In [643]:
# We can also flatten the matrix into a list of scalar/singular values
# by flatenning we can see how man elements are there in the tensor

x = torch.rand(64, 64, 3)
print(x.shape)
z = x.reshape(-1)
z = x.view(-1)
print(z.shape)

a = torch.rand(4,4,3,2)
full = a.reshape(-1)
print(f"Total number of elements: {full.shape}")
mask_1 =  a >= 0.5
mask_2 = a < 0.5
output_1 = a[mask_1]
output_2 = a[mask_2]
print(f"Total number of element above or equal to 0.5: {output_1.shape}")
print(f"Total number of element below 0.5: {output_2.shape}")

# Below approach uses conditions, while the above approach uses boolean masking which requires less code to implement
copy = a.clone()
copy[copy < 0.5] = 0
msk = copy.bool()
copy = a[msk]
print(copy.shape)

torch.Size([64, 64, 3])
torch.Size([12288])
Total number of elements: torch.Size([96])
Total number of element above or equal to 0.5: torch.Size([47])
Total number of element below 0.5: torch.Size([49])
torch.Size([47])


In [655]:
x = torch.randn(3,224,224)
per = x.permute(1,2,0)
display(per.shape)
print(x.storage().data_ptr() == per.storage().data_ptr())
display(x.shape)
x = x.view(-1)
x = x.reshape(-1)
print(x.shape)

torch.Size([224, 224, 3])

True


torch.Size([3, 224, 224])

torch.Size([150528])


In [657]:
x.is_same_size(z)
display(x.shape, z.shape)

torch.Size([150528])

torch.Size([12288])

In [666]:
a = torch.ones(3,4,2)
b = torch.zeros(3,4,2)
display(b.is_same_size(a))
display(a.is_same_size(b))

True

True

# Joining and Splitting Tensors

In [680]:
# we can join sequence of tensors as long as their shapes are equal (except the one we are trying to concatenate) or be empty

a = torch.rand(3,4)
b = torch.rand(3,4)
cat_dim0 = torch.cat((a,b), dim=0) # (3+3,4) = (6,4) 
# dim=#, the # is the index of the first value of the shape, hence not always rows when it comes to multidimensional tensors
print(f"Initial shape of tensors: {a.shape}, {b.shape}")
print(f"Joining/Concatenating tensors along dim=0, rows: {cat_dim0.shape}")
cat_dim1 = torch.cat((a,b), dim=1) # (3, 4+4) = (3, 8)
print(f"Joining/Concatenating tensors along dim=1, columns: {cat_dim1.shape}")

Initial shape of tensors: torch.Size([3, 4]), torch.Size([3, 4])
Joining/Concatenating tensors along dim=0, rows: torch.Size([6, 4])
Joining/Concatenating tensors along dim=1, columns: torch.Size([3, 8])


In [715]:
# Now we will concatenate tensors but the shape will be equal, except the one we are trying to concatenate

a = torch.rand(5, 12)
b = torch.rand(7, 12)
cat_dim0 = torch.cat((a,b), dim=0) # dim=0, refers to the first index in the shape order
display(cat_dim0.shape) # (5+7, 12) = (12,12)

c = torch.rand(3,12,45)
d = torch.rand(3,12,32)
cat_dim2 = torch.cat((c,d), dim=2) # dim=2, looks into the 2-index position of the shape, which is 45 and 32 in C and D
display(cat_dim2.shape) # (3,12,45+32) = (3,12,77)

e = torch.arange(3584).view(4, 56, 8, -1)
f = torch.arange(2048).view(4,32,8,-1)
display(e.shape, f.shape)
cat_dim1 = torch.cat((e,f), dim=1)
display(cat_dim1.shape) # (4, 56+32, 8, 2) = (4, 88, 8, 2)
print(cat_dim1)

torch.Size([12, 12])

torch.Size([3, 12, 77])

torch.Size([4, 56, 8, 2])

torch.Size([4, 32, 8, 2])

torch.Size([4, 88, 8, 2])

tensor([[[[   0,    1],
          [   2,    3],
          [   4,    5],
          ...,
          [  10,   11],
          [  12,   13],
          [  14,   15]],

         [[  16,   17],
          [  18,   19],
          [  20,   21],
          ...,
          [  26,   27],
          [  28,   29],
          [  30,   31]],

         [[  32,   33],
          [  34,   35],
          [  36,   37],
          ...,
          [  42,   43],
          [  44,   45],
          [  46,   47]],

         ...,

         [[ 464,  465],
          [ 466,  467],
          [ 468,  469],
          ...,
          [ 474,  475],
          [ 476,  477],
          [ 478,  479]],

         [[ 480,  481],
          [ 482,  483],
          [ 484,  485],
          ...,
          [ 490,  491],
          [ 492,  493],
          [ 494,  495]],

         [[ 496,  497],
          [ 498,  499],
          [ 500,  501],
          ...,
          [ 506,  507],
          [ 508,  509],
          [ 510,  511]]],


        [[[ 896, 

In [712]:
# For joining/concatenation to work, all the shapes/dimensions must be equal, except the one which we are concatenating
# or be empty such as torch.tensor([]) nothing inside
a = torch.rand(4,7,10)
b = torch.tensor([])
cat_dim2 = torch.cat((a,b), dim=2)
display(cat_dim2.shape)

torch.Size([4, 7, 10])

In [717]:
a = torch.rand(3, 32, 64)
b = torch.rand(5, 32, 64)
c = torch.rand(7, 32, 64)
d = torch.rand(2, 32, 64)
res = torch.cat((a,b,c,d), dim=0)
print(res.shape)
# .cat() does not create a new dimension, but adds tensors along the existing dimension

torch.Size([17, 32, 64])


In [752]:
# torch.stack() adds tensors along a new dimension, the one preceding the existing one
# if two tensors have shape(2,3), then torch.stack() will add them along a new dimension preceding them, resulting in (2,2,3)
# shapes of all tensors must the equal, unlike for torch.cat() for which the concatenating one need not be the same for both tensors

a = torch.rand(2,3)
b = torch.rand(2,3)
output = torch.stack((a,b), dim=0) # it stacks these 2by3 submatrcies next to each other.
display(output.shape)
print(output)

torch.Size([2, 2, 3])

tensor([[[0.0667, 0.0571, 0.7986],
         [0.0580, 0.3755, 0.8325]],

        [[0.5012, 0.1691, 0.9647],
         [0.2593, 0.5427, 0.3676]]])


In [768]:
a = torch.rand(4,7)
b = torch.rand(4,7)
c = torch.rand(4,7)
# Since here, are three individual tensors, a new dimension along 0-D will be 3
output_0 = torch.stack((a,b,c), dim=0)
output_1 = torch.stack((a,b,c), dim=1)
output_2 = torch.stack((a,b,c), dim=2)
# for torch.stack() the dim=#, will be equal to the total sequence/number of tensors being stacked
# so for each dim=#, the resulting dimension for that particular shape/dim index is the total number of tensors being stacked

print(a.shape)
print(output_0.shape)
print(output_1.shape)
print(output_2.shape)

torch.Size([4, 7])
torch.Size([3, 4, 7])
torch.Size([4, 3, 7])
torch.Size([4, 7, 3])


In [776]:
# for torch.stack(), the order of shape/dimensions and their number must be the same or otherwise error will popo up
# torch.stack() adds tensors along a new dimension preceding the existing dimension, for instance
# shape(4,3) for tensors becomes shape(2,4,3)
a = torch.rand(3,32,64)
b = torch.rand(3,32,64)
c = torch.rand(3,32,64)
d = torch.rand(3,32,64)


output_0 = torch.stack((a,b,c,d), dim=0)
output_1 = torch.stack((a,b,c,d), dim=1)
output_2 = torch.stack((a,b,c,d), dim=2)
output_3 = torch.stack((a,b,c,d), dim=3)

out_0 = torch.cat((a,b,c,d),dim=0)
out_1 = torch.cat((a,b,c,d),dim=1)
out_2 = torch.cat((a,b,c,d),dim=2)


print(f"Original shape of all three tensors: {a.shape}")
print("\nBelow are dimensions of torch.stack():")
display(output_0.shape, output_1.shape, output_2.shape, output_3.shape)
print("\nBelow are dimensions of torch.cat():")
display(out_0.shape, out_1.shape, out_2.shape)

Original shape of all three tensors: torch.Size([3, 32, 64])

Below are dimensions of torch.stack():


torch.Size([4, 3, 32, 64])

torch.Size([3, 4, 32, 64])

torch.Size([3, 32, 4, 64])

torch.Size([3, 32, 64, 4])


Below are dimensions of torch.cat():


torch.Size([12, 32, 64])

torch.Size([3, 128, 64])

torch.Size([3, 32, 256])

In [790]:
data = torch.arange(18).reshape(6,3)
data = torch.arange(18).view(6,3)
print(f"Original data:\n{data}\n")
equal_split_1 = torch.split(data, 2, dim=0) # dim=0 referring to the rows or the first index in the dim order of shapes
equal_split_2 = torch.split(data, 3, dim=0)
col_split = torch.split(data, 1, dim=1) # dim=1 referring to the columns or the second index inthe dim order of shapes

for i, chunk in enumerate(equal_split):
    print(f"Chunk #{i+1}, Shape {chunk.shape}:\n{chunk}\n")
# Since there are 6 rows, splitting it into 3 chunks with each one having two rows

for i, chunk in enumerate(equal_split_2):
    print(f"Chunk #{i+1}, Shape {chunk.shape}:\n{chunk}\n")

for i, cup in enumerate(col_split):
    print(f"Chunk #{i}, Shape {cup.shape}:\n{cup}\n")


# in torch.split(tensor, chunk_size, dim=#), you can specify the size of each chunk or provide a list of various sizes for each chunk
# but the total number of chunks must equal to the dimension we are splitting the tensor

Original data:
tensor([[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]])

Chunk #1, Shape torch.Size([2, 3]):
tensor([[0, 1, 2],
        [3, 4, 5]])

Chunk #2, Shape torch.Size([2, 3]):
tensor([[ 6,  7,  8],
        [ 9, 10, 11]])

Chunk #3, Shape torch.Size([2, 3]):
tensor([[12, 13, 14],
        [15, 16, 17]])

Chunk #1, Shape torch.Size([3, 3]):
tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]])

Chunk #2, Shape torch.Size([3, 3]):
tensor([[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]])

Chunk #0, Shape torch.Size([6, 1]):
tensor([[ 0],
        [ 3],
        [ 6],
        [ 9],
        [12],
        [15]])

Chunk #1, Shape torch.Size([6, 1]):
tensor([[ 1],
        [ 4],
        [ 7],
        [10],
        [13],
        [16]])

Chunk #2, Shape torch.Size([6, 1]):
tensor([[ 2],
        [ 5],
        [ 8],
        [11],
        [14],
        [17]])



In [791]:
data = torch.rand(10,25)
row_split_1 = torch.split(data, 2, dim=0)
row_split_2 = torch.split(data, [1,2,3,4], dim=0)
row_split_3 = torch.split(data, [3,7], dim=0)
row_split_4 = torch.split(data, 5, dim=0)

col_split_1 = torch.split(data, 5, dim=1)
col_split_2 = torch.split(data, [10,12,3], dim=1)
col_split_3 = torch.split(data, [14,11], dim=1)
col_split_4 = torch.split(data, [9,16], dim=1)

for i, cup in enumerate(row_split_1):
    print(f"Chunk #{i} Shape {cup.shape}:\n{cup}\n")


Chunk #0 Shape torch.Size([2, 25]):
tensor([[0.4032, 0.2665, 0.5689, 0.9558, 0.7335, 0.7716, 0.5554, 0.3178, 0.9897,
         0.5315, 0.0098, 0.2412, 0.0468, 0.8986, 0.6931, 0.8588, 0.5396, 0.2338,
         0.6108, 0.5326, 0.2469, 0.4281, 0.3185, 0.0626, 0.1491],
        [0.2311, 0.8909, 0.6141, 0.2340, 0.3224, 0.8999, 0.0766, 0.6744, 0.8163,
         0.0356, 0.6372, 0.3902, 0.7307, 0.6787, 0.0254, 0.8702, 0.3233, 0.8955,
         0.7436, 0.3276, 0.5380, 0.6280, 0.7185, 0.5924, 0.6137]])

Chunk #1 Shape torch.Size([2, 25]):
tensor([[0.8391, 0.8178, 0.6335, 0.4885, 0.2518, 0.1025, 0.8629, 0.9401, 0.0244,
         0.4913, 0.6978, 0.5261, 0.4289, 0.5088, 0.0057, 0.7855, 0.6461, 0.4029,
         0.9442, 0.4653, 0.4173, 0.3630, 0.7301, 0.1023, 0.8310],
        [0.0269, 0.1513, 0.8446, 0.8116, 0.1698, 0.6665, 0.3376, 0.8438, 0.6839,
         0.4616, 0.3571, 0.1008, 0.2831, 0.6438, 0.2404, 0.8769, 0.2470, 0.1786,
         0.6707, 0.2115, 0.4791, 0.6347, 0.0774, 0.6267, 0.7447]])

Chunk #2 Sha

In [792]:
for i, cup in enumerate(row_split_2):
    print(f"Chunk #{i} Shape {cup.shape}:\n{cup}\n")

Chunk #0 Shape torch.Size([1, 25]):
tensor([[0.4032, 0.2665, 0.5689, 0.9558, 0.7335, 0.7716, 0.5554, 0.3178, 0.9897,
         0.5315, 0.0098, 0.2412, 0.0468, 0.8986, 0.6931, 0.8588, 0.5396, 0.2338,
         0.6108, 0.5326, 0.2469, 0.4281, 0.3185, 0.0626, 0.1491]])

Chunk #1 Shape torch.Size([2, 25]):
tensor([[0.2311, 0.8909, 0.6141, 0.2340, 0.3224, 0.8999, 0.0766, 0.6744, 0.8163,
         0.0356, 0.6372, 0.3902, 0.7307, 0.6787, 0.0254, 0.8702, 0.3233, 0.8955,
         0.7436, 0.3276, 0.5380, 0.6280, 0.7185, 0.5924, 0.6137],
        [0.8391, 0.8178, 0.6335, 0.4885, 0.2518, 0.1025, 0.8629, 0.9401, 0.0244,
         0.4913, 0.6978, 0.5261, 0.4289, 0.5088, 0.0057, 0.7855, 0.6461, 0.4029,
         0.9442, 0.4653, 0.4173, 0.3630, 0.7301, 0.1023, 0.8310]])

Chunk #2 Shape torch.Size([3, 25]):
tensor([[0.0269, 0.1513, 0.8446, 0.8116, 0.1698, 0.6665, 0.3376, 0.8438, 0.6839,
         0.4616, 0.3571, 0.1008, 0.2831, 0.6438, 0.2404, 0.8769, 0.2470, 0.1786,
         0.6707, 0.2115, 0.4791, 0.6347, 0

In [812]:
for i, cup in enumerate(col_split_1):
    print(f"Chunk #{i} Shape {cup.shape}:\n{cup}\n")

Chunk #0 Shape torch.Size([10, 5]):
tensor([[0.4032, 0.2665, 0.5689, 0.9558, 0.7335],
        [0.2311, 0.8909, 0.6141, 0.2340, 0.3224],
        [0.8391, 0.8178, 0.6335, 0.4885, 0.2518],
        [0.0269, 0.1513, 0.8446, 0.8116, 0.1698],
        [0.8740, 0.5422, 0.0165, 0.8561, 0.7294],
        [0.9116, 0.1909, 0.5668, 0.0351, 0.1990],
        [0.2753, 0.0154, 0.6505, 0.4634, 0.1127],
        [0.4482, 0.4008, 0.1151, 0.1501, 0.7122],
        [0.1428, 0.4605, 0.8928, 0.5830, 0.9445],
        [0.8379, 0.5845, 0.3492, 0.4667, 0.0021]])

Chunk #1 Shape torch.Size([10, 5]):
tensor([[0.7716, 0.5554, 0.3178, 0.9897, 0.5315],
        [0.8999, 0.0766, 0.6744, 0.8163, 0.0356],
        [0.1025, 0.8629, 0.9401, 0.0244, 0.4913],
        [0.6665, 0.3376, 0.8438, 0.6839, 0.4616],
        [0.3002, 0.8806, 0.4817, 0.6837, 0.2224],
        [0.2575, 0.3091, 0.9088, 0.8170, 0.0043],
        [0.4970, 0.3040, 0.3445, 0.8309, 0.1767],
        [0.2017, 0.7427, 0.7841, 0.4604, 0.8754],
        [0.6096, 0.0723, 0

In [796]:
for i, cup in enumerate(col_split_2):
    print(f"Chunk #{i} Shape {cup.shape}:\n{cup}\n")

Chunk #0 Shape torch.Size([10, 10]):
tensor([[0.4032, 0.2665, 0.5689, 0.9558, 0.7335, 0.7716, 0.5554, 0.3178, 0.9897,
         0.5315],
        [0.2311, 0.8909, 0.6141, 0.2340, 0.3224, 0.8999, 0.0766, 0.6744, 0.8163,
         0.0356],
        [0.8391, 0.8178, 0.6335, 0.4885, 0.2518, 0.1025, 0.8629, 0.9401, 0.0244,
         0.4913],
        [0.0269, 0.1513, 0.8446, 0.8116, 0.1698, 0.6665, 0.3376, 0.8438, 0.6839,
         0.4616],
        [0.8740, 0.5422, 0.0165, 0.8561, 0.7294, 0.3002, 0.8806, 0.4817, 0.6837,
         0.2224],
        [0.9116, 0.1909, 0.5668, 0.0351, 0.1990, 0.2575, 0.3091, 0.9088, 0.8170,
         0.0043],
        [0.2753, 0.0154, 0.6505, 0.4634, 0.1127, 0.4970, 0.3040, 0.3445, 0.8309,
         0.1767],
        [0.4482, 0.4008, 0.1151, 0.1501, 0.7122, 0.2017, 0.7427, 0.7841, 0.4604,
         0.8754],
        [0.1428, 0.4605, 0.8928, 0.5830, 0.9445, 0.6096, 0.0723, 0.9198, 0.3845,
         0.9378],
        [0.8379, 0.5845, 0.3492, 0.4667, 0.0021, 0.7452, 0.9432, 0.5371,

In [799]:
display(col_split_2, type(col_split_2))

(tensor([[0.4032, 0.2665, 0.5689, 0.9558, 0.7335, 0.7716, 0.5554, 0.3178, 0.9897,
          0.5315],
         [0.2311, 0.8909, 0.6141, 0.2340, 0.3224, 0.8999, 0.0766, 0.6744, 0.8163,
          0.0356],
         [0.8391, 0.8178, 0.6335, 0.4885, 0.2518, 0.1025, 0.8629, 0.9401, 0.0244,
          0.4913],
         [0.0269, 0.1513, 0.8446, 0.8116, 0.1698, 0.6665, 0.3376, 0.8438, 0.6839,
          0.4616],
         [0.8740, 0.5422, 0.0165, 0.8561, 0.7294, 0.3002, 0.8806, 0.4817, 0.6837,
          0.2224],
         [0.9116, 0.1909, 0.5668, 0.0351, 0.1990, 0.2575, 0.3091, 0.9088, 0.8170,
          0.0043],
         [0.2753, 0.0154, 0.6505, 0.4634, 0.1127, 0.4970, 0.3040, 0.3445, 0.8309,
          0.1767],
         [0.4482, 0.4008, 0.1151, 0.1501, 0.7122, 0.2017, 0.7427, 0.7841, 0.4604,
          0.8754],
         [0.1428, 0.4605, 0.8928, 0.5830, 0.9445, 0.6096, 0.0723, 0.9198, 0.3845,
          0.9378],
         [0.8379, 0.5845, 0.3492, 0.4667, 0.0021, 0.7452, 0.9432, 0.5371, 0.2272,
         

tuple

In [800]:
display(row_split_2, type(row_split_2))

(tensor([[0.4032, 0.2665, 0.5689, 0.9558, 0.7335, 0.7716, 0.5554, 0.3178, 0.9897,
          0.5315, 0.0098, 0.2412, 0.0468, 0.8986, 0.6931, 0.8588, 0.5396, 0.2338,
          0.6108, 0.5326, 0.2469, 0.4281, 0.3185, 0.0626, 0.1491]]),
 tensor([[0.2311, 0.8909, 0.6141, 0.2340, 0.3224, 0.8999, 0.0766, 0.6744, 0.8163,
          0.0356, 0.6372, 0.3902, 0.7307, 0.6787, 0.0254, 0.8702, 0.3233, 0.8955,
          0.7436, 0.3276, 0.5380, 0.6280, 0.7185, 0.5924, 0.6137],
         [0.8391, 0.8178, 0.6335, 0.4885, 0.2518, 0.1025, 0.8629, 0.9401, 0.0244,
          0.4913, 0.6978, 0.5261, 0.4289, 0.5088, 0.0057, 0.7855, 0.6461, 0.4029,
          0.9442, 0.4653, 0.4173, 0.3630, 0.7301, 0.1023, 0.8310]]),
 tensor([[0.0269, 0.1513, 0.8446, 0.8116, 0.1698, 0.6665, 0.3376, 0.8438, 0.6839,
          0.4616, 0.3571, 0.1008, 0.2831, 0.6438, 0.2404, 0.8769, 0.2470, 0.1786,
          0.6707, 0.2115, 0.4791, 0.6347, 0.0774, 0.6267, 0.7447],
         [0.8740, 0.5422, 0.0165, 0.8561, 0.7294, 0.3002, 0.8806, 0.4817

tuple

In [839]:
# Since the splitted tensors along row or column dimensions are stored in tuples, they do not change
# even if the original tensor initializes randomly each time updated
# torch.split() requires chunk size as input while torch.chunk() requires the number of total chunks to be taken from the tensor
# and torch.chunk() specifies the size of each chunk depending on the total number of chunks the tensor must be splitted

data = torch.rand(6,7)
row_chunk = torch.chunk(data, 3, dim=0) # dim=0 along rows and in total there must be 3 chunks
print(f"Original tensor:\n{data}\n{data.shape}\n")
print("Tensor rows are splitted into 3 chunks:", end=" ")
for i, cup in enumerate(row_chunk):
    print(f"\nChunk shape{cup.shape}\n{cup}\n")

print("Tensor columns are splitted into 5 chunks:", end=" ")
col_chunk = torch.chunk(data, 5, dim=1)
for i, cup in enumerate(col_chunk):
    print(f"\nChunk shape{cup.shape}\n{cup}\n")

Original tensor:
tensor([[0.1045, 0.0863, 0.5751, 0.6409, 0.5114, 0.5787, 0.5644],
        [0.2856, 0.3925, 0.4852, 0.9777, 0.7159, 0.5573, 0.3067],
        [0.4334, 0.6696, 0.5189, 0.1549, 0.9224, 0.5600, 0.3944],
        [0.7698, 0.1562, 0.2757, 0.0176, 0.8966, 0.5232, 0.4803],
        [0.0809, 0.6718, 0.1052, 0.0651, 0.3589, 0.1446, 0.9719],
        [0.3556, 0.3636, 0.2126, 0.8544, 0.0205, 0.8106, 0.7103]])
torch.Size([6, 7])

Tensor rows are splitted into 3 chunks: 
Chunk shapetorch.Size([2, 7])
tensor([[0.1045, 0.0863, 0.5751, 0.6409, 0.5114, 0.5787, 0.5644],
        [0.2856, 0.3925, 0.4852, 0.9777, 0.7159, 0.5573, 0.3067]])


Chunk shapetorch.Size([2, 7])
tensor([[0.4334, 0.6696, 0.5189, 0.1549, 0.9224, 0.5600, 0.3944],
        [0.7698, 0.1562, 0.2757, 0.0176, 0.8966, 0.5232, 0.4803]])


Chunk shapetorch.Size([2, 7])
tensor([[0.0809, 0.6718, 0.1052, 0.0651, 0.3589, 0.1446, 0.9719],
        [0.3556, 0.3636, 0.2126, 0.8544, 0.0205, 0.8106, 0.7103]])

Tensor columns are splitted int

In [4]:
# While torch.split() takes chunk-size as input or list of chunk-sizes as input, the torch.chunk()
# takes the total number of desired chunks, and if it the number of chunks is not perfectly divisible by the given dimension
# then the last chunk's size will be smaller, overall we are not defining the chunk-size as it is being done for us.
# but rather we are defining the number of chunks we want to split the original tensor.
# for torch.chunk() we care about the number of chunks, their size is of low importance
# for torch.split() we care about the size of individual chunk, the total number of individual chunks is of low importance



# Broadcasting

In [20]:
# In order for two tensors be broadcastable, element-wise dimensions from right to left, must follow at least one of 3 rules
# 1. Each dimension must be equal
# 2. At least one dimension must be 1
# 3. Or one dimension should be empty
# In order to avoid manual change of each tensors, these statements must be met for tensors to be broadcastable

a = torch.arange(15).view(5,3)
b = torch.tensor(4)
c = a + b
c_ = torch.add(a,b)

print(f"Tensor A info: {a.shape, a.device, a.is_cpu, a.is_mps, a.dtype, type(a)}")
print(f"Scanar B info: {b.shape, b.device, b.is_cpu, b.is_mps, b.dtype, type(b)}")
print(f"Tensor C info: {c.shape, c.device, c.is_cpu, c.is_mps, c.dtype, type(c)}")

# In this case, the scalar/singular value is a tensor of empty dimension since it is 0-dimensional
# and because as it is stated above, scalar has empty dimension or 0-dimension, it is broadcastable
# and this singular value is expanded to match the shape of the tensor and element-wise opeation is performed for each
# element of the tensor

Tensor A info: (torch.Size([5, 3]), device(type='cpu'), True, False, torch.int64, <class 'torch.Tensor'>)
Scanar B info: (torch.Size([]), device(type='cpu'), True, False, torch.int64, <class 'torch.Tensor'>)
Tensor C info: (torch.Size([5, 3]), device(type='cpu'), True, False, torch.int64, <class 'torch.Tensor'>)


In [33]:
s = torch.tensor(5)
f = torch.tensor([5])

print(f"Tensor/Scalar dimension of torch.tensor(5), without []: {s.shape}")
print(f"Tensor/Scalar dimension of torch.tensor([5]), by using []: {f.shape}")

# Look, when no square brackets are used [], the dimension of tensor/scalar is zero
# while, by using square brackets [], the dimension is one
# In both cases these scalar/tensor values are broadcastable since they meet 2 of 3 at least needed requirements
# which are one of two tensors, either must have dimension of 1 or be empty(the same as having zero dimension)

# Therefore these scalar/tensor values are broadcastable to any tensors they are being used for element-wise operations
a = torch.rand(5,6)
print(torch.add(a,s,f))
print(torch.mul(a,s))
print(torch.sub(a,s,f))

Tensor/Scalar dimension of torch.tensor(5), without []: torch.Size([])
Tensor/Scalar dimension of torch.tensor([5]), by using []: torch.Size([1])
tensor([[25.9251, 25.1830, 25.6939, 25.0616, 25.7822, 25.4442],
        [25.2247, 25.8715, 25.1613, 25.6851, 25.8000, 25.2355],
        [25.1913, 25.9505, 25.8545, 25.3986, 25.5565, 25.5515],
        [25.5750, 25.2453, 25.2338, 25.6556, 25.9781, 25.5161],
        [25.2935, 25.6953, 25.1240, 25.8574, 25.9913, 25.9949]])
tensor([[4.6253, 0.9150, 3.4697, 0.3079, 3.9109, 2.2208],
        [1.1234, 4.3577, 0.8067, 3.4256, 3.9998, 1.1776],
        [0.9563, 4.7525, 4.2723, 1.9929, 2.7824, 2.7576],
        [2.8751, 1.2263, 1.1688, 3.2782, 4.8907, 2.5804],
        [1.4675, 3.4764, 0.6200, 4.2871, 4.9565, 4.9743]])
tensor([[-24.0749, -24.8170, -24.3061, -24.9384, -24.2178, -24.5558],
        [-24.7753, -24.1285, -24.8387, -24.3149, -24.2000, -24.7645],
        [-24.8087, -24.0495, -24.1455, -24.6014, -24.4435, -24.4485],
        [-24.4250, -24.7547, -24

In [8]:
# If dimension of one tensor is fewer than the other, in that case, the missing dimension of first tensor
# will be treated as having dimension of one, such as [2,3,3] and [3,3], here since [3,3] has fewer dimensions
# than [2,3,3], then [3,3] is treated as [1,3,3], as having one dimension in the place of missing dimension

# Always remember that for broadcasting the dimensions are compared against from the right to left
# starting comparison from the rightmost dimension to the left.

# If one of the dimension of the tensor is 1, then the dimension of the other(larger) tensor's dimension replaces that 1-d
# For instance for tensors of shapes [2,3] and [1,3], look we start comparing from the rigth to the left
# since 3 and 3 are equal we compare the next left dimension, and look just like mentioned above, one of the dimension
# of these two tensors is 1, and it is not possible to do element-wise operations without changing that 1
# therefore that 1 is changed to the dimension of the other(larger) tensor, which is 2 in this case
# hence automatically 1 becomes 2, since it meets one of three requirements, this tensor is broadcastable

# ROW VECTOR has dimension of [1, n] but in terms of tensors it is written as [n]
# COLUMN VECTOR has dimension of [n, 1], and shown as it is for tensors as well

a = torch.arange(8).reshape(2,4)
b = torch.tensor([3,5,7,1])

print(f"Shapes of matrix/tensor A: {a.shape}, and shape of row vector/tensor B: {b.shape}\
\n row vector/tensor of B dimensions is shown as [4], but in reality it is [1,4], we get the one since \n\
 tensor B has fewer dimension compared to tensor A, and in place of empty/missing dimension is filled with one.")

# These tensors meet the broadcastable requirements hence, we can perform element-wise operations with them
c = a + b
c = torch.add(a,b)
c_mul = torch.mul(a,b)
c_div = torch.div(a,b)
print(f"\nInitial values of tensors:\n{a}\n{b}\n")
print(f"Tensor C shape {c.shape}, \n{c}")

Shapes of matrix/tensor A: torch.Size([2, 4]), and shape of row vector/tensor B: torch.Size([4])
 row vector/tensor of B dimensions is shown as [4], but in reality it is [1,4], we get the one since 
 tensor B has fewer dimension compared to tensor A, and in place of empty/missing dimension is filled with one.

Initial values of tensors:
tensor([[0, 1, 2, 3],
        [4, 5, 6, 7]])
tensor([3, 5, 7, 1])

Tensor C shape torch.Size([2, 4]), 
tensor([[ 3,  6,  9,  4],
        [ 7, 10, 13,  8]])


In [19]:
a = torch.arange(20).reshape(2,2,5)
b = torch.arange(5).view(1,1,5) # [1,1,5], all the ones become twos 
print(f"Initial tensors A and B: \n{a}\n{b}\n")
print(f"Shape of A: {a.shape} of B: {b.shape}")

# Look two elements dimension of tensor is equal to 1, while others are equal
# Basically 1, changes to the value of other(larger) tensor for that particular dimension element

c_plus = torch.add(a,b)
print(f"{c_plus}\nShape of C: {c_plus.shape}") # [2,2,5]
c_diff = torch.sub(a,b)
c_pow = torch.pow(a,b)
c_mul = torch.mul(a,b)
c_div = torch.div(b,a)
display(c_diff, c_pow, c_mul, c_div)

Initial tensors A and B: 
tensor([[[ 0,  1,  2,  3,  4],
         [ 5,  6,  7,  8,  9]],

        [[10, 11, 12, 13, 14],
         [15, 16, 17, 18, 19]]])
tensor([[[0, 1, 2, 3, 4]]])

Shape of A: torch.Size([2, 2, 5]) of B: torch.Size([1, 1, 5])
tensor([[[ 0,  2,  4,  6,  8],
         [ 5,  7,  9, 11, 13]],

        [[10, 12, 14, 16, 18],
         [15, 17, 19, 21, 23]]])
Shape of C: torch.Size([2, 2, 5])


tensor([[[ 0,  0,  0,  0,  0],
         [ 5,  5,  5,  5,  5]],

        [[10, 10, 10, 10, 10],
         [15, 15, 15, 15, 15]]])

tensor([[[     1,      1,      4,     27,    256],
         [     1,      6,     49,    512,   6561]],

        [[     1,     11,    144,   2197,  38416],
         [     1,     16,    289,   5832, 130321]]])

tensor([[[ 0,  1,  4,  9, 16],
         [ 0,  6, 14, 24, 36]],

        [[ 0, 11, 24, 39, 56],
         [ 0, 16, 34, 54, 76]]])

tensor([[[   nan, 1.0000, 1.0000, 1.0000, 1.0000],
         [0.0000, 0.1667, 0.2857, 0.3750, 0.4444]],

        [[0.0000, 0.0909, 0.1667, 0.2308, 0.2857],
         [0.0000, 0.0625, 0.1176, 0.1667, 0.2105]]])

In [26]:
# Row Vector

a = torch.arange(24).view(4,6)
b = torch.tensor([2,3,4,5,6,1]) # torch.Size[4], a row vector, but it reality it is [1,4]
# broadcastable
c_plus = torch.add(a,b)
c_diff = torch.sub(a,b)
c_mul = torch.mul(a,b)
c_pow = torch.pow(b,a)
c_div = torch.div(a,b)

print(f"Initial Tensors:\n{a}\n{b}\n")
display(c_plus, c_diff, c_mul, c_pow, c_div)

Initial Tensors:
tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11],
        [12, 13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22, 23]])
tensor([2, 3, 4, 5, 6, 1])



tensor([[ 2,  4,  6,  8, 10,  6],
        [ 8, 10, 12, 14, 16, 12],
        [14, 16, 18, 20, 22, 18],
        [20, 22, 24, 26, 28, 24]])

tensor([[-2, -2, -2, -2, -2,  4],
        [ 4,  4,  4,  4,  4, 10],
        [10, 10, 10, 10, 10, 16],
        [16, 16, 16, 16, 16, 22]])

tensor([[  0,   3,   8,  15,  24,   5],
        [ 12,  21,  32,  45,  60,  11],
        [ 24,  39,  56,  75,  96,  17],
        [ 36,  57,  80, 105, 132,  23]])

tensor([[                 1,                  3,                 16,
                        125,               1296,                  1],
        [                64,               2187,              65536,
                    1953125,           60466176,                  1],
        [              4096,            1594323,          268435456,
                30517578125,      2821109907456,                  1],
        [            262144,         1162261467,      1099511627776,
            476837158203125, 131621703842267136,                  1]])

tensor([[ 0.0000,  0.3333,  0.5000,  0.6000,  0.6667,  5.0000],
        [ 3.0000,  2.3333,  2.0000,  1.8000,  1.6667, 11.0000],
        [ 6.0000,  4.3333,  3.5000,  3.0000,  2.6667, 17.0000],
        [ 9.0000,  6.3333,  5.0000,  4.2000,  3.6667, 23.0000]])

In [31]:
# Column vector

a = torch.arange(20,44).view(4,6)
b = torch.tensor([[1],[2],[3],[4]]) # column vector of shape 4 by 1
# We could also create a row vector and transpose it, to get a column vector, which is much easier and accurate than manually doing it
print(f"Initial tensors A and B: \n{a}\n{b}\nTheir shapes: {a.shape}, {b.shape}")
# Since 4 and 4 match, and one of them has 1, [4,1] will become [4,6] taking the value of the other(larger) dimension of tensor

c_plus = torch.add(a,b)
c_diff = torch.sub(a,b)
c_div = torch.div(a,b)
c_pow = torch.pow(a,b)
c_mul = torch.mul(a,b)
display(c_plus, c_diff, c_div, c_pow, c_mul)

Initial tensors A and B: 
tensor([[20, 21, 22, 23, 24, 25],
        [26, 27, 28, 29, 30, 31],
        [32, 33, 34, 35, 36, 37],
        [38, 39, 40, 41, 42, 43]])
tensor([[1],
        [2],
        [3],
        [4]])
Their shapes: torch.Size([4, 6]), torch.Size([4, 1])


tensor([[21, 22, 23, 24, 25, 26],
        [28, 29, 30, 31, 32, 33],
        [35, 36, 37, 38, 39, 40],
        [42, 43, 44, 45, 46, 47]])

tensor([[19, 20, 21, 22, 23, 24],
        [24, 25, 26, 27, 28, 29],
        [29, 30, 31, 32, 33, 34],
        [34, 35, 36, 37, 38, 39]])

tensor([[20.0000, 21.0000, 22.0000, 23.0000, 24.0000, 25.0000],
        [13.0000, 13.5000, 14.0000, 14.5000, 15.0000, 15.5000],
        [10.6667, 11.0000, 11.3333, 11.6667, 12.0000, 12.3333],
        [ 9.5000,  9.7500, 10.0000, 10.2500, 10.5000, 10.7500]])

tensor([[     20,      21,      22,      23,      24,      25],
        [    676,     729,     784,     841,     900,     961],
        [  32768,   35937,   39304,   42875,   46656,   50653],
        [2085136, 2313441, 2560000, 2825761, 3111696, 3418801]])

tensor([[ 20,  21,  22,  23,  24,  25],
        [ 52,  54,  56,  58,  60,  62],
        [ 96,  99, 102, 105, 108, 111],
        [152, 156, 160, 164, 168, 172]])

In [47]:
# row vector and column vector

r = torch.tensor([4,5,6,7])
c = torch.tensor([[4],[5],[6],[7]])
print(f"Shape of Row Vector {r.shape}\nShow of Column Vector{c.shape}")
# Even though for row vector it shows [4] it is actually [1,4] since it has a single row and 4 columns
# For column vector it shows fully

print(f"Row Vector before {r} {r.shape}")
print(f"Row Vector after transposing:\n{r.T} {r.T.shape}")

print(f"Column Vector before:\n{c} {c.T.shape}")
print(f"Column Vector after transposing: {c.t()} {c.T.shape}")

Shape of Row Vector torch.Size([4])
Show of Column Vectortorch.Size([4, 1])
Row Vector before tensor([4, 5, 6, 7]) torch.Size([4])
Row Vector after transposing:
tensor([4, 5, 6, 7]) torch.Size([4])
Column Vector before:
tensor([[4],
        [5],
        [6],
        [7]]) torch.Size([1, 4])
Column Vector after transposing: tensor([[4, 5, 6, 7]]) torch.Size([1, 4])


# Type Casting

In [56]:
a = torch.arange(20, dtype=torch.float32).reshape(4,5)
print(f"Data Type: {a.dtype}\n{a}")
a_ = a.to(torch.int32)
print(f"Data type after type casting: {a_.dtype}\n{a_}")

# Just like we can use .to() for changing the location of tensors from CPU to GPU and vice versa
# We can use to convert its data type, type casting

Data Type: torch.float32
tensor([[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.],
        [10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.]])
Data type after type casting: torch.int32
tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19]], dtype=torch.int32)


In [73]:
a = torch.randn(4,5)
display(a, a.shape)
a_ = a.to(dtype=torch.int8)
display(a_)

# Likewise we can use .float() and long() and bool()
a_long = a.long() # is the same as int64
a_fl = a.float() # is the same as float32
a_bool = a.bool() # acting as boolean masking, 0=False, else True
display(a_long.dtype, a_fl.dtype, a_bool.dtype)

display(a_long, a_fl, a_bool)

tensor([[-0.2596, -1.1229,  1.8660,  1.1105,  1.0734],
        [ 0.6009, -0.2864, -1.2107, -1.3312, -0.1966],
        [ 1.0371, -0.0436, -0.9606, -0.0787,  0.3518],
        [-0.4379,  0.1628, -1.5787,  0.4633,  0.2021]])

torch.Size([4, 5])

tensor([[ 0, -1,  1,  1,  1],
        [ 0,  0, -1, -1,  0],
        [ 1,  0,  0,  0,  0],
        [ 0,  0, -1,  0,  0]], dtype=torch.int8)

torch.int64

torch.float32

torch.bool

tensor([[ 0, -1,  1,  1,  1],
        [ 0,  0, -1, -1,  0],
        [ 1,  0,  0,  0,  0],
        [ 0,  0, -1,  0,  0]])

tensor([[-0.2596, -1.1229,  1.8660,  1.1105,  1.0734],
        [ 0.6009, -0.2864, -1.2107, -1.3312, -0.1966],
        [ 1.0371, -0.0436, -0.9606, -0.0787,  0.3518],
        [-0.4379,  0.1628, -1.5787,  0.4633,  0.2021]])

tensor([[True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True]])

In [77]:
# .long() is the same as int64
# .float() is the same as float32
# .bool() acts as boolean masking, if 0=False else True
# Data conversion creates a new copy of the original tensor, and will not have reference to original tensor

a = torch.randn(3,5)
a_ = a.to(dtype=torch.int32)
_a_ = a.long() # the same as int64
display(a.dtype, a_.dtype, _a_.dtype)
print(f"Do these tensors share the same memory: {a.storage().data_ptr() == a_.storage().data_ptr()}")
print(f"Do these tensors share the same memory: {a_.storage().data_ptr() == _a_.storage().data_ptr()}")
# Their location in the memory is different, so they do not have reference to each other

torch.float32

torch.int32

torch.int64

Do these tensors share the same memory: False
Do these tensors share the same memory: False


In [80]:
torch.get_default_device()

device(type='cpu')

In [88]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "mps:0" if torch.backends.mps.is_available() else "cpu")
a = torch.rand(4,5, device=device)
display(a.device)

ls = [torch.cuda.get_device_name(k) for k in range(torch.cuda.device_count())]
print(ls)

device(type='mps', index=0)

[]


In [93]:
try:
    a = torch.rand((3,4), device="cuda")
except RuntimeError as e:
    print(f"Could not create tensor directly on CUDA GPU: {e}")
except AssertionError as ae:
    print(f"CUDA is not enabled on this device: {ae}")

CUDA is not enabled on this device: Torch not compiled with CUDA enabled


In [101]:
if a.is_cpu:
    a.to("mps")
a.device
a = a.cpu()
a.device

device(type='cpu')

In [107]:
device.type

'mps'

In [109]:
# We will face a RuntimeError if we try to make computations on tensors stored in two different devices

a = torch.rand(4,5, device = "mps")
b = torch.rand(4,5, device = "cpu") # by default tensors are stored on the CPU

try:
    c = a + b
    print(f"Operations performed successfully, even if they are on the different devices\n{c}")
except RuntimeError as e:
    print(f"Operations of two or more tensors stored in different devices cannot be performed, you need to convert them\
    back to a single device\n{e}")

Operations of two or more tensors stored in different devices cannot be performed, you need to convert them    back to a single device
Expected all tensors to be on the same device, but found at least two devices, mps:0 and cpu!


In [114]:
x = torch.ones(4,3)
a = torch.tensor([2,3,4])
b = torch.tensor([[2,3,4]])
display(x.shape, a.shape, b.shape)
c = x + a
c_ = x + b
print(f"Shapes after addition: {c.shape} {c_.shape}")

torch.Size([4, 3])

torch.Size([3])

torch.Size([1, 3])

Shapes after addition: torch.Size([4, 3]) torch.Size([4, 3])


# AutoGrad

In [2]:
torch.get_default_device()

device(type='cpu')

In [3]:
torch.get_default_dtype()

torch.float32

In [15]:
img = torch.rand(16,32,64,128,3)
display(img.shape[1])
display(img.shape)
c = img.permute(4,0,1,2,3).contiguous().view(img.shape[0], -1)
print(c.shape)
f = img.reshape(-1)
print(f.shape)

32

torch.Size([16, 32, 64, 128, 3])

torch.Size([16, 786432])
torch.Size([12582912])


In [48]:
a = torch.rand(3,4,5,10)
extract = a[0,0,3:,4]


b = torch.rand(4,5,10)
et = a[0,:,-2:]
display(et.shape)

torch.Size([4, 2, 10])

In [56]:
import torch

w = torch.tensor(5.0, requires_grad=True)
x = torch.tensor(3.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)

y = w * x # or torch.mul(w, x)
z = y + b # or torch.add(y, b)
print(f"Result z: {z}")

Result z: 16.0


In [69]:
w = torch.rand(3,4, requires_grad=True)
x = torch.rand(3,4, requires_grad=True)
b = torch.arange(3.0, requires_grad=True).view(3,1)

y = torch.mul(w,x)
z = torch.add(y,b)
print(f"Result z:\n{z}\n{z.shape, z.device}")

Result z:
tensor([[0.0909, 0.1682, 0.1878, 0.3820],
        [1.1382, 1.4012, 1.3964, 1.1866],
        [2.0526, 2.0295, 2.1878, 2.0068]], grad_fn=<AddBackward0>)
(torch.Size([3, 4]), device(type='cpu'))


In [78]:
# So any tensors that have requires_grad=True, will be recorded by PyTorch in grad_fn
# and any operations performed on them and associated in the variable, which are called intermediate tensors
# and therefore any other tensors without requires_grad=True or an intermediate variable without those grad tensor operations
# will not be recorded or included in the comptionational graph on the PyTorch

w = torch.rand(5,6, requires_grad=True)
x = torch.rand(6,5, requires_grad=True)
b = torch.rand(5,1, requires_grad=True)

y = torch.matmul(w,x)
z = torch.add(y,b)
print(f"Result z:\n{z}\n{z.shape}")

# grad_fn=<AddBackward0> has a reference to a sequential order of all the operations/functions performed on tensors
# that have requires_grad=True enabled, even including intermediate variables which did not have that enabled
# but did have tensors which have it enabled and operations were performed on them
display(z.grad_fn)

Result z:
tensor([[2.5217, 2.6221, 2.4600, 2.6091, 1.4784],
        [1.5324, 1.9483, 1.7585, 1.9915, 1.1791],
        [2.2603, 2.3784, 2.3125, 2.4800, 1.3092],
        [1.1756, 1.3836, 1.3794, 1.4984, 0.6779],
        [2.5021, 2.7108, 2.4398, 2.4713, 1.7434]], grad_fn=<AddBackward0>)
torch.Size([5, 5])


<AddBackward0 at 0x11f65d880>

In [80]:
# we use .backward() for the final output (final a single loss value) and it uses
# grad_fn=<AddBackward0> in order to use chain rule to traverse back to the initial input value
# by following all the operations/functions on tensors in the reverse-mode order in order to calculate 
# how much the final loss is affected by a single change in input and intermediate tensors

# All the PyTorch tensors have boolean attribute of requires_grad, which is set to False by default
# it is used in order for PyTorch know which tensors should be recorded for the potential computations of gradients
# since not all tensors require gradients, makes sence by default to be as False or otherwise it would consume
# extra memory and computation

a = torch.rand(3,5).view(5,3)
print(f"Does this tensor have requires_grad enabled by default: {a.requires_grad}")
b = torch.rand(6,7, requires_grad=False) # it is False by default anyways 
print(f"Does this tensor have requires_grad enabled by default: {b.requires_grad}")

# ONLY those tensors that have requires_grad, will be recorded by AutoGrad for 
# computing the gradient, including the intermediate variables that simply store and provide reference
# to the results of operations performed on grad enabled tensors

Does this tensor have requires_grad enabled by default: False
Does this tensor have requires_grad enabled by default: False


In [82]:
# There are two ways to have tell the AutoGrad of PyTorch to track the tensors for potential gradient computation

# enable at creation time, by passing as an argument
a = torch.rand(4,5, requires_grad=True)
print(f"Is grad is enabled: {a.requires_grad}")

# or enable it after tensor creation
b = torch.rand(5,6)
print(f"Before: {b.requires_grad}")
b.requires_grad_(True) # This function enabled the requires_grad after creation of the tensor
print(f"After: {b.requires_grad}")

Is grad is enabled: True
Before: False
After: True


In [85]:
a = torch.rand(9,4).view(3,3,2,2)
print(f"Before: {a.requires_grad}")
a.requires_grad_(True)
print(f"After: {a.requires_grad}")

Before: False
After: True


In [88]:
a = torch.arange(10,100, 0.1, requires_grad=True).view(10,90)
display(a.requires_grad) # check if it is being tracked by AutoGrad for computations of gradients

True

In [100]:
# since gradients involve continuous values, trying to have requires_grad=True on integer values will show RuntimeError
# because gradients represented by discrete numbers will produce a huge round-off errors
# mathematically gradients will be inaccurate

a = torch.rand(6,7,8)
a = a.permute(2,1,0)
display(a.is_contiguous())
display(a.shape)
display(a.requires_grad)
a.requires_grad_(False)
display(a.requires_grad)
a.requires_grad_(True)
display(a.requires_grad)

False

torch.Size([8, 7, 6])

False

False

True

In [103]:
try:
    a = torch.arange(50).view(5,5,2)
    a.requires_grad_(True)
    display(a.requires_grad)
except RuntimeError as e:
    print(f"Integer dtype tensors cannot have requires_grad enabled, since gradients by nature are continuous values: \n{e}")
    print(a.requires_grad)

Integer dtype tensors cannot have requires_grad enabled, since gradients by nature are continuous values: 
only Tensors of floating point dtype can require gradients
False


In [106]:
# If any tensor, if at least one tensor has gradients enabled, then the resulting output will also have
# requires_grad=True enabled, because not all input tensors will requires operations such as 
# feature tensor which stores independent variables, but we do not need gradients for them, since we do not aim to change them

x = torch.arange(10,20,0.1).view(4,5,5) # requires_grad=False is not enabled
w = torch.rand(5,5, requires_grad=True)
b = torch.randn(4,5,1)
b.requires_grad_(True)

print(f"x: {x.requires_grad}, w: {w.requires_grad}, b: {b.requires_grad}\n")

intermediate = w * x # or torch.mul(w,x)
print(f"Does Intermediate variable's output has requires_grad=True when at least one tensors has it enabled for operations: \
{intermediate.requires_grad}")

y = intermediate + b # or torch.add(intermediate, b)
print(f"Does the final output has requires_grad=True enabled if at least one of its tensors has it enabled for operation:\
{y.requires_grad}")

x: False, w: True, b: True

Does Intermediate variable's output has requires_grad=True when at least one tensors has it enabled for operations: True
Does the final output has requires_grad=True enabled if at least one of its tensors has it enabled for operation:True


In [125]:
a = torch.rand(5,6, requires_grad=True)
b = torch.arange(30).view(1,5,6)
c = torch.randn(4,5,6)
d = torch.rand(6)
e = torch.ones(1,5,1)
f = torch.zeros(4,1,6)
output = a + b + c + d + e + f + 4 * torch.tensor(6) + torch.tensor([456])
print(f"Does output have requires_grad=True, even if only one of its tensor had it enabled for operations: \
{output.requires_grad}\nNotice all these tensors are broadcastable and their resulting shape is: {output.shape}") # 4,5,6

Does output have requires_grad=True, even if only one of its tensor had it enabled for operations: True
Notice all these tensors are broadcastable and their resulting shape is: torch.Size([4, 5, 6])


In [150]:
# grad_fn attribute is attached to all tensors that have gradients required enabled, or resulting output tensors
# grad_fn will be None for manually created tensors which did not result from any operations of grad tensors
# grad_fn will have the name of the function for the tensors that were resulted from the operation of grad tensors

x = torch.rand(4,5)
w = torch.randn(5,4, requires_grad=True)
b = torch.ones(5,1, requires_grad=True) # or b.requires_grad_(True)
display(x.requires_grad, w.requires_grad, b.requires_grad)
print(f"grad_fn of leaf tensors, x: {x.grad_fn}, w: {w.grad_fn}, b: {b.grad_fn}\n")
intermediate = torch.matmul(w,x) 
print(f"Intermediate requires_grad={intermediate.requires_grad}\nIntermediate variable grand_fn: {intermediate.grad_fn}")
output = intermediate + b
print(f"Output requires_grad={output.requires_grad}\nOutput variable grand_fn: {output.grad_fn}")

False

True

True

grad_fn of leaf tensors, x: None, w: None, b: None

Intermediate requires_grad=True
Intermediate variable grand_fn: <MmBackward0 object at 0x11f894580>
Output requires_grad=True
Output variable grand_fn: <AddBackward0 object at 0x11f894580>


In [5]:
# grad_fn is basically a reference to what function was applied for the tensors with gradients required
# then .backward() will use grad_fn references for computing the gradients from output to input by following
# what functions were used in the reverse-mode order

# AGAIN for leaf tensors, leaf meaning they are outside, manually created have grad_fn=None
# since they were not created through operations or passing through a function.
# non-leaf tensors represent tensors which are created through operations or passing through function
# the tensors with gradients required, non-leaf meaning they did not come from outside but rather created
# from the those that came outside with gradients enabled

a = torch.rand(4,5)
b = torch.rand(4,5)
print(a.grad_fn, b.grad_fn, a.requires_grad, b.requires_grad)
a.requires_grad_(True)
b.requires_grad_(True)
print(a.grad_fn, b.grad_fn, a.requires_grad, b.requires_grad)
c_plus = torch.add(a,b)
c_div = torch.div(a,b)
c_sub = torch.sub(a,b)
c_pow = torch.pow(a,b)
c_mul = torch.mul(a,b)
c_exp = torch.exp(a)
c_log = torch.log(b)
c_sqrt = torch.sqrt(a)
c_abs = torch.abs(b)
c_matmul = a @ b.T # or torch.matmul(a,b)

print(f"grad_fn list:\nc_plus: {c_plus.grad_fn}\nc_div: {c_div.grad_fn}\nc_sub: {c_sub.grad_fn}\nc_pow: {c_pow.grad_fn}\n\
c_mul: {c_mul.grad_fn}\nc_exp: {c_exp.grad_fn}\nc_log: {c_log.grad_fn}\nc_sqrt: {c_sqrt.grad_fn}\nc_abs: {c_abs.grad_fn}\n\
c_matmul: {c_matmul.grad_fn}")

None None False False
None None True True
grad_fn list:
c_plus: <AddBackward0 object at 0x105e58b20>
c_div: <DivBackward0 object at 0x105e58b20>
c_sub: <SubBackward0 object at 0x105e58b20>
c_pow: <PowBackward1 object at 0x105e58b20>
c_mul: <MulBackward0 object at 0x105e58b20>
c_exp: <ExpBackward0 object at 0x105e58b20>
c_log: <LogBackward0 object at 0x105e58b20>
c_sqrt: <SqrtBackward0 object at 0x105e58b20>
c_abs: <AbsBackward0 object at 0x105e58b20>
c_matmul: <MmBackward0 object at 0x105e58b20>


In [60]:
a = torch.rand(4,9)
b = torch.rand(4)
a.requires_grad_(True)

c = torch.matmul(b,a) # b is treated as shape[1,4] row vector
display(c.requires_grad, c.grad_fn, c)
# in this case we used inner matmul which effectively Squeezed the original two grad tensors into a flattened tensor

True

<SqueezeBackward4 at 0x13d75e280>

tensor([1.3965, 0.7085, 1.4566, 0.6300, 0.3112, 0.8569, 1.5620, 1.1497, 1.3617],
       grad_fn=<SqueezeBackward4>)

In [70]:
a = torch.randn(3,98,6, requires_grad=True)
b = torch.randn(3,6,12)
c = torch.matmul(a,b)
display(c.requires_grad, c.grad_fn)

True

<UnsafeViewBackward0 at 0x13d6c61c0>

In [66]:
a = torch.rand(4,9)
b = torch.rand(4)
b.requires_grad_(True)
display(a.requires_grad, b.requires_grad)
c_sq = b @ a
display(c.grad_fn)

False

True

<SqueezeBackward4 at 0x13d7ad880>

In [106]:
x = torch.rand(34,7)
w = torch.rand(3,34,1, requires_grad=True)
b = torch.rand(34,1, requires_grad=True)
print(f"Requires_grad status x: {x.requires_grad}, w: {w.requires_grad}, b: {b.requires_grad}")

intermediate = x * w
print(f"Intermediate(non-leaf) tensor: is grad enabled even if \"x\" does not have grad enabled: {intermediate.requires_grad}\n\
Gradient function of intermediate(non-leaf) tensor {intermediate.grad_fn}")

output = intermediate + b
print(f"Output(final loss) tensor: is grad enabled: {output.requires_grad}\n\
Gradient function of output(final loss) tensor: {output.grad_fn}")

a = torch.rand(2,5,6)
b = torch.ones(2,6,5)
c = a @ b # torch.matmul(a,b)
display(c.grad_fn) 
# It will show non, because tensors used for operations, none of them, or at least one of them, do not have
# gradients required so PyTorch is not recording it for potential gradient computation
a.requires_grad_(True)
display(c.requires_grad, c.grad_fn) # so it will still show False and None because
# even though A was enabled with gradients requiring, it was done after all the operations were completed
# that is why, always set the requires_grad=True before doing any operations
e = a @ b
display(e.requires_grad, e.grad_fn) # now it works because gradient required was enabled before doing any operations
# hence creating a new non-leaf/intermediate tensor
display(e.grad)

Requires_grad status x: False, w: True, b: True
Intermediate(non-leaf) tensor: is grad enabled even if "x" does not have grad enabled: True
Gradient function of intermediate(non-leaf) tensor <MulBackward0 object at 0x13d6c61c0>
Output(final loss) tensor: is grad enabled: True
Gradient function of output(final loss) tensor: <AddBackward0 object at 0x13d6c61c0>


None

False

None

True

<UnsafeViewBackward0 at 0x13d6c61c0>

  display(e.grad)


None

In [108]:
a = torch.tensor(5)
b = torch.tensor([5])
c = torch.tensor([[5]])
e = torch.tensor([[[5]]])
display(a.shape,b.shape,c.shape,e.shape)

torch.Size([])

torch.Size([1])

torch.Size([1, 1])

torch.Size([1, 1, 1])

In [116]:
# the real work or engine of AutoGrad starts when we use .backward() on the scalar tensor
# which is usually final output or loss function value

x = torch.tensor(4.0, requires_grad=True)
w = torch.tensor(4.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)

y = w * x + b # or torch.add(torch.mul(w,x),b)
loss = y * y

# Gradients of tensors before backward pass
print(f"Gradients of X before backward pass: {x.grad}")
print(f"Gradients of W before backward pass: {w.grad}")
print(f"Gradients of B before backward pass: {b.grad}\n")

loss.backward() 
# now the real work begins with .backward()
# now AutoGrad goes backwards, by tracking the tensors which have gradients required and any non-leaf tensors
# and corresponding functions performed on them in the sequential reverse order


print(f"Gradients of X after backward pass: {x.grad}")
print(f"Gradients of W before backward pass: {w.grad}")
print(f"Gradients of B before backward pass: {b.grad}\n")

Gradients of X before backward pass: None
Gradients of W before backward pass: None
Gradients of B before backward pass: None

Gradients of X after backward pass: 136.0
Gradients of W before backward pass: 136.0
Gradients of B before backward pass: 34.0



In [141]:
x = torch.tensor(43, dtype=torch.float64, requires_grad=True)
w = torch.tensor(67, dtype=torch.float32, requires_grad=True)
b = torch.tensor(18)
b = b.to(dtype=torch.float64)
b.requires_grad_(True)

y = w * x + b
loss = torch.mul(y,y)
display(loss.requires_grad)
display(y.grad_fn)
display(loss.grad_fn)
print(f"Gradients before backward pass: {x.grad, w.grad, b.grad}\n")
loss.backward()
print(f"Gradients after backward pass: {x.grad, w.grad, b.grad}")
print(f"Gradients after backward pass: {x.grad}, {w.grad}, {b.grad}")

True

<AddBackward0 at 0x10834a8b0>

<MulBackward0 at 0x10834a0a0>

Gradients before backward pass: (None, None, None)

Gradients after backward pass: (tensor(388466., dtype=torch.float64), tensor(249314.), tensor(5798., dtype=torch.float64))
Gradients after backward pass: 388466.0, 249314.0, 5798.0


In [30]:
x = torch.rand(4,5, requires_grad=True)
w = torch.randn(4,5, requires_grad=True)
b = torch.rand(4,1, requires_grad=True)

y = w * x + b
loss = torch.mul(y,y)
display(y.requires_grad, y.grad_fn)

grad_like = torch.ones_like(loss)
loss.backward(gradient=grad_like)
print(f"Gradients after backward pass: {x.grad}, {w.grad}, {b.grad}")
display(grad_like.shape, loss.shape)

True

<AddBackward0 at 0x13095f400>

Gradients after backward pass: tensor([[ 4.6386e+00,  1.4188e-01,  1.7259e+00,  1.6847e+00, -1.1178e+00],
        [ 3.2277e-01, -2.0231e-01,  5.1504e+00, -1.3771e-01,  1.3713e+00],
        [-1.0984e+00,  2.3315e-01, -4.0364e-01, -4.7043e-01,  1.8674e+00],
        [ 1.7413e+00,  3.9444e+00,  5.6293e+00,  6.9434e-01, -2.9229e-03]]), tensor([[ 3.8198,  1.2594,  0.4467,  1.7931,  0.0947],
        [ 0.0274,  0.2519,  1.5673,  0.4631,  0.0220],
        [ 0.2447,  1.4471,  0.7935,  0.3999,  0.3566],
        [ 0.6189, -1.6583,  2.0312,  0.1783,  0.0046]]), tensor([[12.1273],
        [ 6.0584],
        [ 7.6945],
        [ 3.9756]])


torch.Size([4, 5])

torch.Size([4, 5])

In [51]:
x = torch.tensor(5.0, requires_grad=True)
w = torch.tensor(6.4, requires_grad=True)
b = torch.tensor(3.2, requires_grad=True)

print(f"Gradients before backward pass: {x.grad}, {w.grad}, {b.grad}")
y = w * x + b
y.backward()
print(f"Gradients after backward pass: {x.grad}, {w.grad}, {b.grad}") # these are the derivates
# basically these corresponding derivates, represent how does a unit change in those parameters effect
# and influence the value fo the range/dependent value Y

loss = torch.pow(x,b)
print(f"Leaf tensor: X: {x.is_leaf}, W: {w.is_leaf}, B: {b.is_leaf}")
print(f"Non-leaf tensor: Y: {y.is_leaf}, LOSS: {loss.is_leaf}")
# Leaf tensors are tensors which were not created through operations with gradient tensors
# but they were created by explicitly requires_grad=True, and manually created or taken from outside
# while the non-leaf tensors are the ones which were neither created manually or taken from outside
# and did not have requires_grad=True enabled explicitly, so what are non-leaf tensors then?
# non-leaf tensors are tensors which were created through the operations or functions where
# at least of the tensors used in that operation or function is gradient tensor,
# even if there are 1000 tensors and only one gradient tensor, the resulting output
# will be automatically become or so to speak implicitly become requires_grad=True

Gradients before backward pass: None, None, None
Gradients after backward pass: 6.400000095367432, 5.0, 1.0
Leaf tensor: X: True, W: True, B: True
Non-leaf tensor: Y: False, LOSS: False


In [68]:
a = torch.rand(6,7,13, requires_grad=True)
b = torch.rand(13)
c = torch.rand(4,1,1,13)
e = torch.ones(1,13)
f = torch.ones(6,7,1)
g = torch.zeros(1,7,1)
k = torch.zeros(1,1,7,1)
l = torch.randn(4,6,7,13)
h = torch.randn(3,1,6,1,13)
a1 = torch.rand_like(a) # produces tensors with the same shape and dtype of input tensor
b1 = torch.rand_like(b)
c1 = torch.rand_like(c)
e1 = torch.ones_like(e)
f1 = torch.ones_like(f)
g1 = torch.zeros_like(g)
k1 = torch.zeros_like(k)
l1 = torch.randn_like(l)
h1 = torch.randn_like(h)
# even though the dimensions are different, and from linear algebra perspective element-wise operations
# cannot be performed, but thanks to broadcasting, the element-wise operations are possible
# for two tensors to be broadcastable, the # of dimensions must be equal, or one must have 1 or 0 dimension
# and in case of 0 or 1, the other(larger) tensors dimension will change to that, but not stay as 0 and 1.

print(f"Addition: {(a + b + c + e + f + g + k + l + h + a1 + b1 + c1 + e1 + f1 + g1 + k1 + l1 + h1).shape}")
print(f"Addition: {(a * b * c * e * f * g * k * l * h * a1 * b1 * c1 * e1 * f1 * g1 * k1 * l1 * h1).shape}")
print(f"Addition: {(a - b - c - e - f - g - k - l - h - a1 - b1 - c1 - e1 - f1 - g1 - k1 - l1 - h1).shape}")
print(f"Addition: {(a - b * c + e - f * g - k + l - h * a1 + b1 - c1 * e1 * f1 + g1 + k1 - l1 ** h1).shape}")
print(f"Addition: {(a - b * c + e - f * g - k + l - h * a1 + b1 - c1 * e1 * f1 + g1 + k1 - l1 ** h1).requires_grad}")
print(f"Addition: {(a - b * c + e - f * g - k + l - h * a1 + b1 - c1 * e1 * f1 + g1 + k1 - l1 ** h1).is_leaf}")
print(f"Leaf tensor: {a.is_leaf}")
print(f"Non Leaf tensor: {a1.is_leaf}")

Addition: torch.Size([3, 4, 6, 7, 13])
Addition: torch.Size([3, 4, 6, 7, 13])
Addition: torch.Size([3, 4, 6, 7, 13])
Addition: torch.Size([3, 4, 6, 7, 13])
Addition: True
Addition: False
Leaf tensor: True
Non Leaf tensor: True


In [76]:
import copy

a = torch.rand(56,78)
a.requires_grad_(True)
b = a + 2
f = torch.tensor(5)
b1 = torch.ones_like(b)
a1 = torch.ones_like(a1)
a2 = a.clone()
print(f"Is leaf tensor: a:{a.is_leaf}, a1:{a1.is_leaf}, a2:{a2.is_leaf}, b:{b.is_leaf}, b1:{b1.is_leaf}, f:{f.is_leaf}")
# So new tensors created with _like from leaf tensors, the resulting new _like tensor with the same shape and dtype of leaf tensor
# will also be a leaf tensor
# looks like any tensors manually created or taken from outside are considered leaf tensors

Is leaf tensor: a:True, a1:True, a2:False, b:False, b1:True, f:True


In [117]:
# When we use .backward() the gradients start the computations,
# but this method does not return the gradients
# but rather this gradients are stored in the individual tensors themselves, 
# in the tensors' attribute called .grad, and to access gradients
# we use .grad on the desired tensor, to get their corresponding gradient/derivate
# this .grad attribute even though it does exist for any gradient tensors
# PyTorch by default only stores the gradients to .grad attribute of only Leaf Tensors
# meaning, Non-Leaf/Intermediate tensors gradients are not stored in their .grad attribute
# if we want to explicitly store the gradients of non-leaf tensors, then we use retain_graph=True to the .backward() method as argument

x = torch.tensor([[[4.0]]], requires_grad=True)
w = torch.tensor([[[3.0]]], requires_grad=True)
b = torch.tensor([[[2.0]]], requires_grad=True)
print(f"Gradient attributes before backward pass: {x.grad}, {w.grad}, {b.grad}")
y = w * x + b
y.backward() # important to note that each time we call .backward() the gradients are accumulated
# meaning with each run of .backward(), the gradient values will be added to the previous ones and so forth
# it is useful for gradient accumulation across mini-batches
# but if we do not need this gradient accumulation, then we need to empty the gradients attribute values
# before calling .backward() with the optimizer.zero_grad()

print(f"Gradient of Y with respect to X: dy/dx: {x.grad}")
print(f"Gradient of Y with respect to W: dy/dw: {w.grad}")
print(f"Gradient of Y with respect to B: dy/db: {b.grad}")
print("These values represent how much the value of Y is changed/influenced/effected for every unit change in {x,w,b} respectively")
# These values represent how much the value of Y is changed/influenced/effected for every unit change in {x,w,b}

# fi we run this .backward() several times, the value of gradients will add up

Gradient attributes before backward pass: None, None, None
Gradient of Y with respect to X: dy/dx: tensor([[[3.]]])
Gradient of Y with respect to W: dy/dw: tensor([[[4.]]])
Gradient of Y with respect to B: dy/db: tensor([[[1.]]])
These values represent how much the value of Y is changed/influenced/effected for every unit change in {x,w,b} respectively


In [100]:
y.backward(retain_graph=True) # important to note that each time we call .backward() the gradients are accumulated
# meaning with each run of .backward(), the gradient values will be added to the previous ones and so forth
# it is useful for gradient accumulation across mini-batches
# but if we do not need this gradient accumulation, then we need to empty the gradients attribute values
# before calling .backward() with the optimizer.zero_grad()

print(f"Gradient of Y with respect to X: dy/dx: {x.grad}")
print(f"Gradient of Y with respect to W: dy/dw: {w.grad}")
print(f"Gradient of Y with respect to B: dy/db: {b.grad}")
print("These values represent how much the value of Y is changed/influenced/effected for every unit change in {x,w,b} respectively")
# These values represent how much the value of Y is changed/influenced/effected for every unit change in {x,w,b}

# fi we run this .backward() several times, the value of gradients will add up

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [135]:
# requires_grad=True or requires_grad_(True), makes sure that given (leaf)tensors and intermediate(non-leaf) tensors
# which performed operations with gradient tensors, will be recorded by PyTorch for potential gradient calculation
# grad_fn is attribute is attached for every gradient tensor, and they tell us which what operations
# a given intermediate(non-leaf) tensors was created, for leaf tensors that were manually created or taken from outside
# will show None since there was no operation involved in order to create these tensors in the first place
# .grad attribute stores the calculated gradients of the leaf tensors only by default
# .backward() does the gradient computation and draws the computational graph, it does not return gradients
# to access those gradients, we need to call .grad attribute for the desired tensor
# also, .backward() can be used only for the scalar tensors, meaning 
# it does not matter how many dimensions it has, as long as those tensors have a single/singular/scalar value
# then .backward() will work without RuntimeError
# also important note, is that when we call .backward() several times, within each time
# gradients are accumulated, meaning the new calculated gradient is added to the previous ones that used .backward() before it
# if we want to empty gradients value attribute, as to avoid gradient accumulation, the use optimizer.zero_grad()
# if we want to store the gradients for non-leaf tensors, then we have to explicitly call 
# retain_graph=True as an argument for the .backward() method

a = torch.rand(5,6, requires_grad=True)
b = torch.rand(3,6,9)
b.requires_grad_(True)
c = a @ b
e = c + 129
display(c.shape)
print(f"Is leaf: a:{a.is_leaf}, b:{b.is_leaf}, c:{c.is_leaf}, e:{e.is_leaf}")
print(f"Grad_fn: a:{a.grad_fn}, b:{b.grad_fn}, c:{c.grad_fn}, e:{e.grad_fn}")

torch.Size([3, 5, 9])

Is leaf: a:True, b:True, c:False, e:False
Grad_fn: a:None, b:None, c:<CloneBackward0 object at 0x156743be0>, e:<AddBackward0 object at 0x156743be0>


In [187]:
x = torch.rand(1,5, requires_grad=True)
w = torch.randn(5,1, requires_grad=True)
b = torch.ones(5,1, requires_grad=True)
y = w * x + b
y_ones = torch.ones_like(y)
y.backward(gradient=y_ones)
print(f"Gradient of Y with respect to X: {x.grad}")
print(f"Gradient of Y with respect to W: {w.grad}")
print(f"Gradient of Y with respect to B: {b.grad}")

Gradient of Y with respect to X: tensor([[-0.2350, -0.2350, -0.2350, -0.2350, -0.2350]])
Gradient of Y with respect to W: tensor([[2.2113],
        [2.2113],
        [2.2113],
        [2.2113],
        [2.2113]])
Gradient of Y with respect to B: tensor([[5.],
        [5.],
        [5.],
        [5.],
        [5.]])


In [206]:
x = torch.rand(1,5, requires_grad=True)
w = torch.randn(5,1, requires_grad=True)
b = torch.ones(5,1, requires_grad=True)
y = w * x + b
y = torch.mean(y)
y.backward() # has to scalar tensor, does not matter if it is N-dimensional, as long as it has a singular/single/scalar tensor
print(f"Gradient of Y with respect to X: {x.grad}")
print(f"Gradient of Y with respect to W: {w.grad}")
print(f"Gradient of Y with respect to B: {b.grad}")

Gradient of Y with respect to X: tensor([[-0.0686, -0.0686, -0.0686, -0.0686, -0.0686]])
Gradient of Y with respect to W: tensor([[0.1441],
        [0.1441],
        [0.1441],
        [0.1441],
        [0.1441]])
Gradient of Y with respect to B: tensor([[0.2000],
        [0.2000],
        [0.2000],
        [0.2000],
        [0.2000]])


In [234]:
x = torch.tensor(4.0, requires_grad=True)
w = torch.tensor(3.0, requires_grad=True)
b = torch.tensor(1.3, requires_grad=True)
y = torch.add(torch.mul(w,x),b)
f = 2**y
f.retain_grad()
y.backward(gradient=torch.tensor(1.0)) # it needs to take a gradient of the same shape as the final output, here Y
display(x.grad,w.grad,b.grad, f.grad)
# so by default, .backward() works for scalar tensors with gradient=torch.tensor(1.0), 
# but for matrices or vectors, we need to explicitly specify the gradient, and it has to have the same shape
# as the final output tensor, Y

tensor(3.)

tensor(4.)

tensor(1.)

None

In [261]:
x = torch.rand(3,4,18, requires_grad=True)
w = torch.randn(4,18, requires_grad=True)
b = torch.rand(3,4,1, requires_grad=True)
y = w * x + b
y.retain_grad()
loss = y*y

loss_ones = torch.ones_like(loss)
loss.backward(gradient=torch.ones_like(loss))
print(x.is_leaf,w.is_leaf,b.is_leaf,y.is_leaf,loss.is_leaf, loss.shape)
print(f"Gradient of Loss with respect to X:\n{x.grad}")
print(f"Gradient of Loss with respect to W:\n{w.grad}")
print(f"Gradient of Loss with respect to B:\n{b.grad}")
print(f"Gradient of Loss with respect to Y:\n{y.grad}") # since Y is non-leaf, gradients ARE NOT stored in .grad attribute
# in order for this non-leaf/intermediate tensor to explicitly store the gradients in their .grad attribute
# use tensor.retain_grad() method

True True True False False torch.Size([3, 4, 18])
Gradient of Loss with respect to X:
tensor([[[ 7.6566e-03,  4.6448e+00,  8.7880e-02,  9.9911e-01,  1.2427e-02,
           5.0039e+00,  3.7647e-01,  3.6018e-02,  1.7974e+00,  2.5786e-02,
           1.1600e+00,  2.4531e-03,  8.9086e-03, -2.7080e-02,  4.0341e-01,
           1.5871e+00, -3.4288e-03,  1.3255e+00],
         [ 5.6679e-02,  8.3234e-01,  2.8144e-01,  1.1311e+00,  7.6381e-01,
           5.6489e+00, -1.3147e-01,  2.2801e+00,  1.3443e+00, -2.0175e-01,
          -3.2055e-02, -3.9420e-02,  1.0131e+00,  9.4108e-01,  6.1843e-01,
           7.7810e-01,  1.0663e+00,  4.8585e-01],
         [-2.3978e-02,  2.6324e+00,  1.7885e+00,  1.4892e-01,  1.0352e-02,
           2.5893e-01,  2.2468e+00,  2.3662e-02,  4.9263e-02, -3.4814e-03,
          -5.5805e-02, -6.5501e-03, -1.0111e-02, -2.7019e-01, -6.9381e-02,
           3.8361e-01,  1.1225e+00, -5.1395e-01],
         [-8.2629e-01,  1.3847e+00, -1.3318e+00, -3.6174e-01, -2.2580e+00,
          -2.0

In [264]:
"""
    There are cases when computing gradients is pointless. 
    
    First: for model evaluation, all we do here is simply forward pass the data and provide predictions
    at this evaluation point we do not make any changes/updates to parameters, hence computing gradients
    for those parameters is useless, and makes the evaluation process much slower, without any benefit

    Second: for fine-tuning, we may need to update only certain parameters of the model, mostly pre-trained models
    and fine-tune them to our needs. With transfer learning, we do not need to train the model from scratch,
    but rather fine-tune some of its parameters to our needs, by freezing other parameters as to not change them
    and rather keep them.

    Third: In both above two cases, calculations of gradients unnecessarily takes memory for drawing computational graph,
    operations of chain rule, and storing those gradients in .grad attribute, causes the model take more time and memory
    without any benefit. 
"""

'\n    There are cases when computing gradients is pointless. \n    \n    First: for model evaluation, all we do here is simply forward pass the data and provide predictions\n    at this evaluation point we do not make any changes/updates to parameters, hence computing gradients\n    for those parameters is useless, and makes the evaluation process much slower, without any benefit\n\n    Second: for fine-tuning, we may need to update only certain parameters of the model, mostly pre-trained models\n    and fine-tune them to our needs. With transfer learning, we do not need to train the model from scratch,\n    but rather fine-tune some of its parameters to our needs, by freezing other parameters as to not change them\n    and rather keep them.\n\n    Third: In both above two cases, calculations of gradients unnecessarily takes memory for drawing computational graph,\n    operations of chain rule, and storing those gradients in .grad attribute, causes the model take more time and memory\

In [279]:
# by using "with torch.no_grad():" any tensors that are used inside of this statement
# will not have gradients required, even if they were explicitly set as requires_grad=True
x = torch.rand(4,5, requires_grad=True)
w = torch.rand(2,5,4, requires_grad=True)
b = torch.rand(2,1,5, requires_grad=True)
y = w @ x + b
print(f"Is gradient required: y:{y.requires_grad}")
print(f"Is leaf: y:{y.is_leaf}, x:{x.is_leaf}, w:{w.is_leaf}, b:{b.is_leaf}")
print(f"Grad_fn: y:{y.grad_fn}, x:{x.grad_fn}, w:{w.grad_fn}, b:{b.grad_fn}")
print("\nFollowing is written inside of torch.no_grad()")
with torch.no_grad():
    z = w @ x + b
    print(f"Is gradient required: z:{z.requires_grad}, x:{x.requires_grad}, w:{w.requires_grad}, b:{b.requires_grad}")
    print(f"Grad_fn: z:{z.grad_fn}")
    # Look it does acknowledge that the gradient tensors have grad required, but operations performed with them
    # will not have gradients required because it is inside of with torch.no_grad()
    e = x @ w * 56
    print(f"Is gradient required e: {e.requires_grad}")
print("\nOutside the torch.no_grad()")
h = w @ x + b
print(f"Is gradient enabled: h:{h.requires_grad}\nGrad_fn h:{h.grad_fn}")

Is gradient required: y:True
Is leaf: y:False, x:True, w:True, b:True
Grad_fn: y:<AddBackward0 object at 0x155b621f0>, x:None, w:None, b:None

Following is written inside of torch.no_grad()
Is gradient required: z:False, x:True, w:True, b:True
Grad_fn: z:None
Is gradient required e: False

Outside the torch.no_grad()
Is gradient enabled: h:True
Grad_fn h:<AddBackward0 object at 0x15663dd90>


In [340]:
# Another method is by using .detach() which creates a new tensor that has reference to original gradient tensor
# meaning if change is done for either detached or gradient tensor, the changes will be reflected in both
a = torch.rand(4,5, requires_grad=True)
b = a**2
print(f"Tensor B: b.requires_grad: {b.requires_grad}, b.grad_fn: {b.grad_fn}")

# C is detached from being tracked for gradient computation, but keeps the reference to original gradient tensor
c = b.detach()
print(f"Detached Tensor C: c.requires_grad: {c.requires_grad}")
c.contiguous()
c = c.view(5,2,2)
# so any operations done with C will not be tracked
f = c**5
print(f.requires_grad, f.grad_fn)
display(c.shape,b.shape)

Tensor B: b.requires_grad: True, b.grad_fn: <PowBackward0 object at 0x148181e50>
Detached Tensor C: c.requires_grad: False
False None


torch.Size([5, 2, 2])

torch.Size([4, 5])

In [366]:
# torch.no_grad() is standar practice for evaluation and inferencing code block
# while .detach() offer more control as to which gradient tensor to disregard from being tracked for gradient computation

w = torch.ones(3,4,8, requires_grad=True)
x = torch.arange(32, dtype=torch.float32).view(8,-1) # -1 for inferencing the other dimension so that the resulting dimensions are 32
x.requires_grad_(True)
b = torch.arange(12, dtype=torch.float64).reshape(3,4,1)
b.requires_grad_(True)

Y = torch.matmul(w,x) + b
mask = Y > 132.0
loss = Y[mask]
loss.backward(gradient=torch.ones_like(loss))
# gradients of Loss with respect to each parameter is now stored in the leaf-tensors .grad attribute


with torch.no_grad():
    # we can either permute order of dimensions
    x_ = x.clone()
    x_ = x_.permute(1,0)
    
    # or use .view() or .reshape(), for .view() make sure that tensors are contiguous
    print(x.is_contiguous())
    x.view(4,8)
    
    y_ = w * x_ + b
    print(x.shape, w.shape, b.shape)
    
    loss_ = y_[y_> 15.0]
    #eg = torch.sum(loss_) 
    #eg.backward() # will show error since gradients are not required inside of torch.no_grad()

    # any operations performed with gradient tensors inside of torch.no_grad() will not have gradients required
    # hence grad_fn=None and no computations will be done for them, usually used on code block
    # performing evaluation and inference 


_x = x.clone()
print("_x CLONE",_x.grad_fn) # so cloning is also tracked into grad_fn as CloneBackward0
_x = _x.detach()
# now _x is no longer being tracked 

print(f"Initial tensors Y: {Y.requires_grad}, {Y.grad_fn}")
print(f"Gradients of Loss with respect to parameters:\nw:{w.grad}\nx:{x.grad}\nb:{b.grad}")
print(f"\nTensors with torch.no_grad(): x_:{x_.requires_grad}, loss_:{loss_.requires_grad}, loss_.grad_fn:{loss_.grad_fn}")
print(f"\nDetached tensor with .detach(): _x:{_x.requires_grad}, _x.grad_fn:{_x.grad_fn}")

True
torch.Size([8, 4]) torch.Size([3, 4, 8]) torch.Size([3, 4, 1])
_x CLONE <CloneBackward0 object at 0x156a37340>
Initial tensors Y: True, <AddBackward0 object at 0x156a37340>
Gradients of Loss with respect to parameters:
w:tensor([[[ 3.,  7., 11., 15., 19., 23., 27., 31.],
         [ 3.,  7., 11., 15., 19., 23., 27., 31.],
         [ 3.,  7., 11., 15., 19., 23., 27., 31.],
         [ 3.,  7., 11., 15., 19., 23., 27., 31.]],

        [[ 3.,  7., 11., 15., 19., 23., 27., 31.],
         [ 5., 13., 21., 29., 37., 45., 53., 61.],
         [ 5., 13., 21., 29., 37., 45., 53., 61.],
         [ 5., 13., 21., 29., 37., 45., 53., 61.]],

        [[ 5., 13., 21., 29., 37., 45., 53., 61.],
         [ 5., 13., 21., 29., 37., 45., 53., 61.],
         [ 5., 13., 21., 29., 37., 45., 53., 61.],
         [ 5., 13., 21., 29., 37., 45., 53., 61.]]])
x:tensor([[ 0.,  0.,  7., 12.],
        [ 0.,  0.,  7., 12.],
        [ 0.,  0.,  7., 12.],
        [ 0.,  0.,  7., 12.],
        [ 0.,  0.,  7., 12.],
    

In [356]:
x4 = torch.ones(3,4, requires_grad=True)
x1 = x4.clone()
display(x4.requires_grad)
display(x4.storage().data_ptr() == x1.storage().data_ptr())
# so cloning gradient tensor, the cloned one will also be gradient required tensor, but will not have reference to each other

True

False

In [364]:
# Again note that torch.no_grad() and .detach() help to NOT to compute the gradients for tensors with grad enabled
# it is used in cases of fine-tuning pre-trained model, evaluation and inferencing

In [403]:
# Gradient accumulation is basically about using .backward() method several times on the same loss value scalar tensor
# basically if we run .backward() times at first it will show RuntimeError, but in order to avoid it
# and allow to run .backward() more than one time, we need to pass retrain_graph=True as argument to .backward()
# in this way the new calculated gradients will be added to the previous ones in .grad attribute

a = torch.tensor([4.5], requires_grad=True)
y = torch.mul(a,a)
z = torch.exp(y)

# if we want to run .backward() on the same tensor multiple times, then in the first .backward() call 
# we need to pass retain_grap=True
z.backward(retain_graph=True)
print(f"Gradients of Z with respect to A dz/da: {a.grad}") # only leaf tensors gradients are stored in .grad attribute by default

#f = torch.sqrt(z)

z.backward(retain_graph=True)
print(f"Gradients of Z with respect to A df/da: {a.grad}") # only leaf tensors gradients are stored in .grad attribute by default
z.backward()
print(f"Gradients of Z with respect to A df/da: {a.grad}") # only leaf tensors gradients are stored in .grad attribute by default
a.grad.zero_()
print(f"Gradients of Z with respect to A df/da: {a.grad}") # only leaf tensors gradients are stored in .grad attribute by default

Gradients of Z with respect to A dz/da: tensor([5.6067e+09])
Gradients of Z with respect to A df/da: tensor([1.1213e+10])
Gradients of Z with respect to A df/da: tensor([1.6820e+10])
Gradients of Z with respect to A df/da: tensor([0.])


In [404]:
"""
    Basically if we want to run .backward() two times, then in the first .backward() call we need to
    pass retain_graph=True as argument, and so forth
"""

'\n    Basically if we want to run .backward() two times, then in the first .backward() call we need to\n    pass retain_graph=True as argument, and so forth\n'

In [424]:
a = torch.tensor([5.0], requires_grad=True)
y = a * a
z = y * 2
print(a.is_leaf, y.is_leaf, z.is_leaf)
z.backward(gradient=torch.tensor([1.0]),retain_graph=True)
display(a.grad)

# IMPORTANT, we can call .backward() only a single time after passing 
# for the first .backward() retain_graph=True, and if we want to use .backward() the third time for the same tensor
# then pass retain_graph=True for the .backward() above it, meaning after passing retain_graph=True
# we can only call .backward() only one more time
z.backward(retain_graph=True)
display(a.grad)
z.backward(retain_graph=True)
display(a.grad)
z.backward()
display(a.grad)

# Look the gradients are being accumulated, so a new calculated one and the previous existing ones are simply added by default
# if we want to empty .grad attribute of tensor, then use tensor.grad.zero_()
a.grad.zero_()
display(a.grad)

True False False


tensor([20.])

tensor([40.])

tensor([60.])

tensor([80.])

tensor([0.])

In [6]:
x = torch.arange(10, dtype=torch.float32, requires_grad=True).view(2,5)
w = torch.arange(10.0, requires_grad=True).reshape(2,5)
x_ = x.clone()
b = x_.reshape(5,2)
b = b.permute(1,0)
b.requires_grad_(True)
x.retain_grad()
w.retain_grad()
b.retain_grad()
y = w * x + b
z = y**2

z_ones = torch.ones_like(z)

z.backward(gradient=torch.ones(2,5), retain_graph=True)
print(x.grad)
z.backward(gradient=z_ones, retain_graph=True)
print(x.grad)
z.backward(gradient=z_ones, retain_graph=True)
print(x.grad)
z.backward(gradient=z_ones, retain_graph=True)
print(x.grad)
z.backward(gradient=z_ones, retain_graph=True)
print(x.grad)
# gradients are simply being accumulated and added, and we can empty it
x.grad.zero_()
print(x.grad)
z.backward(gradient=z_ones, retain_graph=True)
print(x.grad)
z.backward(gradient=z_ones)
print(x.grad)

# after tensor.grad.zero_() the gradients stored in .grad attribute of a given leaf tensor are emptied, 
# and then we can start adding again

tensor([[   0.,   58.,   38.,  168.,  208.],
        [ 368.,  498.,  898., 1184., 1800.]])
tensor([[   0.,  116.,   76.,  336.,  416.],
        [ 736.,  996., 1796., 2368., 3600.]])
tensor([[   0.,  174.,  114.,  504.,  624.],
        [1104., 1494., 2694., 3552., 5400.]])
tensor([[   0.,  232.,  152.,  672.,  832.],
        [1472., 1992., 3592., 4736., 7200.]])
tensor([[   0.,  290.,  190.,  840., 1040.],
        [1840., 2490., 4490., 5920., 9000.]])
tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])
tensor([[   0.,   58.,   38.,  168.,  208.],
        [ 368.,  498.,  898., 1184., 1800.]])
tensor([[   0.,  116.,   76.,  336.,  416.],
        [ 736.,  996., 1796., 2368., 3600.]])


In [46]:
x = torch.arange(40.0, requires_grad=True).view(2,5,4)
w = torch.randn(5,4, requires_grad=True)
b = torch.ones(1,5,1,requires_grad=True)

truth = torch.randn(2,5,4)
y = w * x + b
y.retain_grad()
x.retain_grad()
loss = torch.abs(truth - y)
loss_ones = torch.ones_like(loss)
print(f"Gradients after forward pass: x={x.grad}, w={w.grad}, b={b.grad}, y={y.grad}")
loss.backward(gradient=loss_ones, retain_graph=True)
print(f"Is leaf: x={x.is_leaf}, w={w.is_leaf}, b={b.is_leaf}, y={y.is_leaf}")
print(f"Gradient after FIRST backward pass:\nx={x.grad},\nw={w.grad},\nb={b.grad}\n")
loss.backward(gradient=loss_ones, retain_graph=True)
print(f"Gradient after SECOND backward pass:\nx={x.grad},\nw={w.grad},\nb={b.grad}\n")
loss.backward(gradient=loss_ones, retain_graph=True)
print(f"Gradient after THIRD backward pass:\nx={x.grad},\nw={w.grad},\nb={b.grad}\n")
# Gradients are being accumulated, in their corresponding .grad attribute
# but we can manually empty each of them
x.grad.zero_()
w.grad.zero_()
b.grad.zero_()
print("Gradients after manually empting them\n",x.grad, w.grad, b.grad)

loss.backward(gradient=loss_ones)
print(f"\nGradient after GRAD.ZERO_() backward pass:\nx={x.grad},\nw={w.grad},\nb={b.grad}\n")
print(f"Gradients for non-leaf/intermediate tensors: y={y.grad_fn}")

Gradients after forward pass: x=None, w=None, b=None, y=None
Is leaf: x=False, w=True, b=True, y=False
Gradient after FIRST backward pass:
x=tensor([[[-2.1362,  0.0395,  1.4157, -0.0429],
         [ 0.8641,  0.4581,  0.3930, -0.1724],
         [ 0.8493,  0.6583,  1.8618,  0.6285],
         [ 0.5152,  0.9438,  0.3897,  1.8575],
         [ 0.3382,  1.2126,  1.5895,  1.2673]],

        [[ 2.1362, -0.0395,  1.4157, -0.0429],
         [ 0.8641,  0.4581,  0.3930,  0.1724],
         [ 0.8493,  0.6583,  1.8618,  0.6285],
         [ 0.5152,  0.9438,  0.3897,  1.8575],
         [ 0.3382,  1.2126,  1.5895,  1.2673]]]),
w=tensor([[-20.,  20.,  24.,  26.],
        [ 28., -30.,  32., -20.],
        [ 36., -38., -40.,  42.],
        [-44., -46.,  48., -50.],
        [ 52., -54.,  56., -58.]]),
b=tensor([[[ 4.],
         [ 2.],
         [ 0.],
         [-4.],
         [ 0.]]])

Gradient after SECOND backward pass:
x=tensor([[[-4.2725,  0.0791,  2.8315, -0.0858],
         [ 1.7283,  0.9161,  0.7860, -0

In [2]:
# optimizer.zero_grad() is used to empty gradient attributes of all parameters handling by optimizer in the mode
# optimizer.step() updates the weights after calculating loss_fn, gradients
# typically use optimizer.zero_grad() at the top of training loop
# and use optimizer.step() at the last so only after relevant computations are done, then update weights based on gradients



In [3]:
# when we call some_tensor.backward(), the gradients of some_tensor with respect to all the previous
# connected tensors will be computed and stored on the leaf tensors
# meaning any tensors that come after some_tensor and used for operations will not be included
# or computed since in this case we are calling .backward() from the intermediate tensor
# say a -> b -> c -> d -> e -> f
# if we call c.backward() then gradients of C with respect to A and B will be computed and stored in A 
# or B, depending which one is leaf tensor, if both then gradients will be stored in both
# if we call e.backward() then gradients of E with respect to D, C, B and A will be computed and 
# stored in leaf tensors, the same f.backward() will be compute gradients of F with respect to D, E, C, B and A

# gradient attributes are stored only on the leaf tensors by default
# but if we want to retain gradients for intermediate tensors then we use .retain_grad() method
# it will store all the gradients computed before it, and not after it
# because with .retain_grad() we are computing the gradient of intermediate_tensor with respect 
# to all other tensors that comes before it and which are connected
# also note that if we use retain_grad() on the final output
# and then try to access .grad attribute we will simply get a tensor filled with 1.0s
# because a gradient of G with respect to G, meaning to itself, will always be 1

# note when computing the gradient of say Y with respect to X, 
# firstly all the pair-wise node derivatives of all the unique paths connected from Y to X
# will be multiplied, and then those multiplied derivates of all possible paths from Y to X will be summed
# and then resulting value will be an actual and accurate derivate of Y with respect to X
# it is accurate compared to only multipliying derivates of unique paths and summing them over for
# only directly connected nodes coming from leaf node(tensor), because those leaf-nodes
# can have indirect impact on the final loss value as well, even if indirectly connected
# through other intermediate tensors

In [50]:
# leaf tensors
a = torch.rand(4,5, requires_grad=True)
b = torch.rand(4,1, requires_grad=True)
c = torch.rand(1,5, requires_grad=True)
print(a.is_leaf,b.is_leaf,c.is_leaf)

# intermediate tensors, even though gradients are calculated for them, they are not explicitly stored in their .grad attribute
inter_1 = a @ c.T + b
inter_2 = a * c + b
inter_1.retain_grad()
inter_2.retain_grad()
# leaf tensors, but since no gradients required, no derivates will be computed for them with respect to previous tensors
e = torch.arange(180,200).view(4,5)  
f = torch.ones(4,1)

inter_3 = inter_1.T @ f
inter_4 = torch.mul(inter_2, e)

output = inter_3 + inter_4 # predicted values
labels = torch.randn(4,5) # ground truth
# sum of squared errors
loss = (output - labels) ** 2
loss.retain_grad()
# now the gradients of LOSS with respect to every preceding tensors connected to it, directly and indirectly will have computed
# and stored in their corresponding leaf tensors
# leaf tensors will contain gradients that are both directly and indirectly come from the input values
loss.backward(gradient=torch.ones(4,5))
print(f"grad_fn functions involed in intermediate(non-lefa) tensors: \ninter_1={inter_1.grad_fn}, inter_2={inter_2.grad_fn},\
inter_3={inter_3.grad_fn},\ninter_4={inter_4.grad_fn}\n")

# for now we can only access gradients computed for leaf tensors
print(f"Gradients of leaf tensors:\na={a.grad}\nb={b.grad}\nc={c.grad}")
# again, the gradients stored in .grad attribute are simply the sum of all possible paths of multiplied pair-wise derivatives
# which are directly and indirectly connected from output to input
print(f"\nGradients for leaf tensors but without gradients required, no gradients will be calculated for them,\n\
hence nothing will be in their corresponding .grad attribute: e={e.grad}, f={f.grad}")
print(f"\nIf we try to access .grad for non-leaf(intermediate) but gradients required tensors, .grad attribute\n\
will be empty, because by default, only for leaf-tensors gradients are stored.\n")
# if we want to compute the gradients for those intermediate/non-leaf tensors
# we need to use retain_grad() method and it will store gradients of that non-leaf tensor
# and basically derivates of non-leaf tensor with respect to all previous tensors, and not after it.
# autograd does calculate the gradients of those non-leaf tensors, but they are not stored in their .grad attribute
print(f"Gradients of non-leaf tensors now are stored in their .grad attribute:\ninter_1={inter_1.grad}\n\
inter_2={inter_2.grad},\nloss={loss.grad}")

True True True
grad_fn functions involed in intermediate(non-lefa) tensors: 
inter_1=<AddBackward0 object at 0x11d227af0>, inter_2=<AddBackward0 object at 0x11d227af0>,inter_3=<MmBackward0 object at 0x11d227af0>,
inter_4=<MulBackward0 object at 0x11d227af0>

Gradients of leaf tensors:
a=tensor([[31687.2227, 41356.3164, 29994.5762, 41956.3008, 22144.4102],
        [50141.6562, 57926.3008, 30274.7070, 51902.4336, 23156.7715],
        [19674.8945, 21032.0840, 20240.4805, 17646.8984,  8854.6533],
        [ 8443.3594, 12747.5840,  4610.7793, 26268.8164,  8273.5068]])
b=tensor([[342821.6875],
        [430090.1562],
        [167753.9375],
        [105263.8125]])
c=tensor([[125020.1406, 107215.4141, 103531.3672, 119119.1875, 193669.6250]])

Gradients for leaf tensors but without gradients required, no gradients will be calculated for them,
hence nothing will be in their corresponding .grad attribute: e=None, f=None

If we try to access .grad for non-leaf(intermediate) but gradients required te

In [2]:
# note that the loss.grad, are all ones, in fact the gradient of the A with respect to A, is one
# derivate of the value with respect to itself, is always one

In [18]:
def f(x):
    return 3 * x ** 2 - 4 * x

def numerical_lim(f,x,h):
    return (f(x + h) - f(x)) / h

h = 0.1 # step size
for i in range(7):
    print(f'h={h:.7f}, numerical limit ={numerical_lim(f,1,h):.7f}')
    h *= 0.1

h=0.1000000, numerical limit =2.3000000
h=0.0100000, numerical limit =2.0300000
h=0.0010000, numerical limit =2.0030000
h=0.0001000, numerical limit =2.0003000
h=0.0000100, numerical limit =2.0000300
h=0.0000010, numerical limit =2.0000030
h=0.0000001, numerical limit =2.0000003


In [32]:
def f(x):
    return 4 * x ** 2 - 3 * x
    
def numerical_diff(f,x,h):
    return (f(x + h) - f(x))/h

h = 1e-2
for i in range(10):
    print(f"h={h:.10f}, numerical diff={numerical_diff(f,1,h):.10f}")
    h *=0.1

h=0.0100000000, numerical diff=5.0400000000
h=0.0010000000, numerical diff=5.0040000000
h=0.0001000000, numerical diff=5.0004000000
h=0.0000100000, numerical diff=5.0000400000
h=0.0000010000, numerical diff=5.0000040002
h=0.0000001000, numerical diff=5.0000004048
h=0.0000000100, numerical diff=4.9999999696
h=0.0000000010, numerical diff=5.0000004137
h=0.0000000001, numerical diff=5.0000004137
h=0.0000000000, numerical diff=5.0000004137


In [36]:
# Basically when we call .backward() for scalar tensor
# by default the vector v value in Jacobian-gradient product is 1.0 with 0-D dimension
# hence if we are using .backward() on non-scalar tensor
# we must provide the vector v value that matches with the shape of that output tensor
# v vector is simply vector that matches with the output tensor, and has only 1.0 in of all its entries
# simply filled with 1.0s

# torch.nn

In [2]:
!conda config --show channels

channels:
  - conda-forge
  - defaults
  - https://repo.anaconda.com/pkgs/main
  - https://repo.anaconda.com/pkgs/r


In [3]:
!conda config --describe channel_priority

# # channel_priority (ChannelPriority)
# #   Accepts values of 'strict', 'flexible', and 'disabled'. The default
# #   value is 'flexible'. With strict channel priority, packages in lower
# #   priority channels are not considered if a package with the same name
# #   appears in a higher priority channel. With flexible channel priority,
# #   the solver may reach into lower priority channels to fulfill
# #   dependencies, rather than raising an unsatisfiable error. With channel
# #   priority disabled, package version takes precedence, and the
# #   configured priority of channels is used only to break ties. In
# #   previous versions of conda, this parameter was configured as either
# #   True or False. True is now an alias to 'flexible'.
# # 
# channel_priority: flexible



FOR ADDING

conda config --add channels conda-forge

In [11]:
# If we do not know what arguments a given can accept, then call that function
# but instead of closing it with (), simply type ? twice
torch.tensor??

[0;31mDocstring:[0m
tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor

Constructs a tensor with no autograd history (also known as a "leaf tensor", see :doc:`/notes/autograd`) by copying :attr:`data`.


    When working with tensors prefer using :func:`torch.Tensor.clone`,
    :func:`torch.Tensor.detach`, and :func:`torch.Tensor.requires_grad_` for
    readability. Letting `t` be a tensor, ``torch.tensor(t)`` is equivalent to
    ``t.clone().detach()``, and ``torch.tensor(t, requires_grad=True)``
    is equivalent to ``t.clone().detach().requires_grad_(True)``.

.. seealso::

    :func:`torch.as_tensor` preserves autograd history and avoids copies where possible.
    :func:`torch.from_numpy` creates a tensor that shares storage with a NumPy array.

Args:
    data (array_like): Initial data for the tensor. Can be a list, tuple,
        NumPy ``ndarray``, scalar, and other types.

Keyword args:
    dtype (:class:`torch.dtype`, optional): the desi

In [15]:
torch.logical_and??

[0;31mDocstring:[0m
logical_and(input, other, *, out=None) -> Tensor

Computes the element-wise logical AND of the given input tensors. Zeros are treated as ``False`` and nonzeros are
treated as ``True``.

Args:
    input (Tensor): the input tensor.
    other (Tensor): the tensor to compute AND with

Keyword args:
    out (Tensor, optional): the output tensor.

Example::

    >>> torch.logical_and(torch.tensor([True, False, True]), torch.tensor([True, False, False]))
    tensor([ True, False, False])
    >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8)
    >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8)
    >>> torch.logical_and(a, b)
    tensor([False, False,  True, False])
    >>> torch.logical_and(a.double(), b.double())
    tensor([False, False,  True, False])
    >>> torch.logical_and(a.double(), b)
    tensor([False, False,  True, False])
    >>> torch.logical_and(a, b, out=torch.empty(4, dtype=torch.bool))
    tensor([False, False,  True, False])
[0;31mType:[0m    

In [16]:
torch.autograd??

[0;31mType:[0m        module
[0;31mString form:[0m <module 'torch.autograd' from '/opt/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/autograd/__init__.py'>
[0;31mFile:[0m        /opt/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/autograd/__init__.py
[0;31mSource:[0m     
[0;31m# mypy: allow-untyped-defs[0m[0;34m[0m
[0;34m[0m[0;34m"""[0m
[0;34m``torch.autograd`` provides classes and functions implementing automatic differentiation of arbitrary scalar valued functions.[0m
[0;34m[0m
[0;34mIt requires minimal changes to the existing code - you only need to declare :class:`Tensor` s[0m
[0;34mfor which gradients should be computed with the ``requires_grad=True`` keyword.[0m
[0;34mAs of now, we only support autograd for floating point :class:`Tensor` types ([0m
[0;34mhalf, float, double and bfloat16) and complex :class:`Tensor` types (cfloat, cdouble).[0m
[0;34m"""[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0mt

In [17]:
# So in case if you do not understand what arguments we can pass, call ?? without brackets for a given method

In [4]:
import torch
import torch.nn as nn

class CustomModuleWithParameter(nn.Module):
    def __init__(self):
        super().__init__()
    
        # learnable parameter tensor, that has required_grad=True by default
        self.param_weight = nn.Parameter(torch.randn(3,4))
    
        # a simple tensor that does not have required required by default, so will not be recorded 
        # for computing the gradients and included into the computational graph
        self.tensor_weight = torch.ones(6,4)

    def forward(self, x):
        # goes through components
        return torch.dot(x, self.param_weight)

model = CustomModuleWithParameter()

for name, parameters in module.named_parameters():
    print(f"Parameter name: {name}, shape: {parameters.shape}, is gradients required?: {parameters.requires_grad}")

# look a simple tensor is not included as the part of parameters of the model, since it does not have
# gradients required by default

for name in module.parameters():
    print(f"Parameter shape: {name.shape}, is gradients required?: {name.requires_grad}")

Parameter name: param_weight, shape: torch.Size([3, 4]), is gradients required?: True
Parameter shape: torch.Size([3, 4]), is gradients required?: True


In [5]:
import torch
import torch.nn as nn

class ParamNet(nn.Module):
    def __init__(self):
        super().__init__()
        # included into the list of parameters of the module and required gradient by default
        self.weight_param1 = nn.Parameter(torch.rand(3,5))
        self.weight_param2 = nn.Parameter(torch.ones(7,9))
        
        # not included into the list of parameters of the moduel and does not require gradients
        self.tensor_no_param_no_grad = torch.tensor([3,4,5], dtype=torch.float32)
        
        # even with gradients enabled, it will still not be included into the list of model parameters
        self.tensor_no_param_yes_grad = torch.rand(2,3, requires_grad=True)

    def forward(self, data_input):
        # define the data flow going through layers/activation_functions
        return torch.matmul(data_input, self.weight_param)

# create an instance of the model
model = ParamNet()

# now iterate throught all the parameters of the model
for name, param in model.named_parameters():
    print(f"Parameter name: {name}, type: {type(param)}, shape: {param.shape}, is gradient required?: {param.requires_grad}")

# this code also iterates through all the parameters of the model, but does not return the name of the parameters
for param in model.parameters():
   print(f"\nParameter shape: {param.shape}, class type: {type(param)}, is gradients required?: {param.requires_grad}\n{param}\n")



Parameter name: weight_param1, type: <class 'torch.nn.parameter.Parameter'>, shape: torch.Size([3, 5]), is gradient required?: True
Parameter name: weight_param2, type: <class 'torch.nn.parameter.Parameter'>, shape: torch.Size([7, 9]), is gradient required?: True

Parameter shape: torch.Size([3, 5]), class type: <class 'torch.nn.parameter.Parameter'>, is gradients required?: True
Parameter containing:
tensor([[0.6613, 0.7870, 0.2244, 0.7261, 0.3084],
        [0.9800, 0.7766, 0.2283, 0.8950, 0.4856],
        [0.2311, 0.5942, 0.1179, 0.7616, 0.3801]], requires_grad=True)


Parameter shape: torch.Size([7, 9]), class type: <class 'torch.nn.parameter.Parameter'>, is gradients required?: True
Parameter containing:
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.],
      

In [68]:
import torch
import torch.nn as nn

# Can we perform operations on variables on two classes torch.Tensor and torch.nn.parameter.Parameter
a = nn.Parameter(torch.rand(3,2))
a_ = a.clone()
b = torch.rand(2,5)
c = a @ b
display(c.shape, c.dtype, type(c), c.device, c.requires_grad, c.grad_fn, a.requires_grad, b.requires_grad, a.grad, b.grad, c.grad)
print(type(a))

e = a + a_
# So Operations of torch.nn.parameter.Parameter results in torch.Tensor regardless
print("The operations of only torch.nn.parameter.Parameter object types, results in object type of ", type(e))
# so if there is operations done with tensors of type torch.Tensor and torch.nn.parameter.Parameter (or shortly torch.nn.Parameter)
# then the resulting tensor will be of type torch.Tensor, at in this case!

  display(c.shape, c.dtype, type(c), c.device, c.requires_grad, c.grad_fn, a.requires_grad, b.requires_grad, a.grad, b.grad, c.grad)


torch.Size([3, 5])

torch.float32

torch.Tensor

device(type='cpu')

True

<MmBackward0 at 0x304cf3880>

True

False

None

None

None

<class 'torch.nn.parameter.Parameter'>
The operations of only torch.nn.parameter.Parameter object types, results in object type of  <class 'torch.Tensor'>


In [77]:
import torch
import torch.nn as nn

class ParamExperimentNet(nn.Module):
    def __init__(self, 
                 weight_param1: torch.nn.parameter.Parameter, 
                 weight_param2: torch.nn.parameter.Parameter, 
                 weight_param3: torch.nn.parameter.Parameter,
                 tn_nograd1: torch.Tensor,
                 tn_nograd2: torch.Tensor):
        super().__init__()
        # torch.nn.parameter.Parameter objects: requires_grad=True by default
        self.weight_param1 = weight_param1
        self.weight_param2 = weight_param2
        self.weight_param3 = weight_param3
        
        # torch.Tensor objects: requires_grad=False by default
        self.tn_nograd1 = tn_nograd1
        self.tn_nograd2 = tn_nograd2

    # forward-pass, compute loss, compute gradients
    def forward(self, data_input: torch.Tensor):
        # define the flow of data through layers/activation_function/components initialized in __init__
        op1 = torch.add(data_input, self.weight_param1)
        op2 = torch.add(self.weight_param2, self.weight_param3)
        op3 = torch.add(self.tn_nograd1, self.tn_nograd2) 
        return op1 + op2 + op3

# create an instance of the ParParamExperimentNet Class and provide 5 positional arguments
x1 = nn.Parameter(torch.rand(4,5))
x2 = nn.Parameter(torch.arange(5.0).view(1,5))
x3 = nn.Parameter(torch.ones(4,1))
k1 = torch.zeros(2,4,1)
k2 = torch.randn(1,4,5)
# these shapes are broadcastable, numbers along each dimensions, either match, 1 or 0.
model_heh = ParamExperimentNet(x1,x2,x3,k1,k2)

# iterate through all the parameters, only objects of type torch.nn.parameter.Parameter (or torch.nn.Parameter)
# will be captured by .named_parameters() and parameters()

# Since there is only tensors of type torch.nn.parameter.Parameter, only those are listed as parameters of the model
# and torch.Tensor object types are not listed, nn.Parameter have requires_grad=True by default and are recorded
# into computational graph
for name, param in model_heh.named_parameters():
    print(f"Parameter name: {name}, shape: {param.shape}, type: {type(param)}, requires_grad={param.requires_grad}")

# Does the same thing as named_parameters(), with a single difference being, it does capture the name 
# of the individual nn.Parameter, also torch.Tensor are not listed here as well
print("")
for param in model_heh.parameters():
    print(f"Parameter shape: {param.shape}, type: {type(param)}, requires_grad={param.requires_grad}")
print("")
forward_pass = model_heh.forward(torch.ones(4,5))
# see the resulting tensor object type is torch.Tensor, even though operations were performed between torch.nn.parameter.Parater
# and torch.Tensor
print(forward_pass.shape, type(forward_pass), forward_pass.requires_grad, forward_pass.is_leaf)



Parameter name: weight_param1, shape: torch.Size([4, 5]), type: <class 'torch.nn.parameter.Parameter'>, requires_grad=True
Parameter name: weight_param2, shape: torch.Size([1, 5]), type: <class 'torch.nn.parameter.Parameter'>, requires_grad=True
Parameter name: weight_param3, shape: torch.Size([4, 1]), type: <class 'torch.nn.parameter.Parameter'>, requires_grad=True

Parameter shape: torch.Size([4, 5]), type: <class 'torch.nn.parameter.Parameter'>, requires_grad=True
Parameter shape: torch.Size([1, 5]), type: <class 'torch.nn.parameter.Parameter'>, requires_grad=True
Parameter shape: torch.Size([4, 1]), type: <class 'torch.nn.parameter.Parameter'>, requires_grad=True

torch.Size([2, 4, 5]) <class 'torch.Tensor'> True False


In [3]:
model_heh.forward??

[0;31mSignature:[0m [0mmodel_heh[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0mdata_input[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Define the computation performed at every call.

Should be overridden by all subclasses.

.. note::
    Although the recipe for forward pass needs to be defined within
    this function, one should call the :class:`Module` instance afterwards
    instead of this since the former takes care of running the
    registered hooks while the latter silently ignores them.
[0;31mSource:[0m   
    [0;32mdef[0m [0mforward[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mdata_input[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;31m# define the flow of data through layers/activation_function/components initialized in __init__[0m[0;34m[0m
[0;34m[0m        [0mop1[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0madd[0m[0;34m(

In [6]:
model_heh.state_dict??

[0;31mSignature:[0m [0mmodel_heh[0m[0;34m.[0m[0mstate_dict[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0mdestination[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mprefix[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m [0mkeep_vars[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mstate_dict[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0mdestination[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mprefix[0m[0;34m=[0m[0;34m""[0m[0;34m,[0m [0mkeep_vars[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34mr"""Return a dictionary containing references to the whole state of the module.[0m
[0;34m[0m
[0;34m        Both parameters and persistent buffers (e.g. running averages) are[0m
[0;34m        included. Keys are corresponding parameter and buffer names.[0m
[0;34m        Parameters and buffers set to ``None`` are not included.[0

model_heh.load_state_dict??

In [11]:
model_heh.train??

[0;31mSignature:[0m [0mmodel_heh[0m[0;34m.[0m[0mtrain[0m[0;34m([0m[0mmode[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m)[0m [0;34m->[0m [0;34m~[0m[0mT[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mtrain[0m[0;34m([0m[0mself[0m[0;34m:[0m [0mT[0m[0;34m,[0m [0mmode[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m)[0m [0;34m->[0m [0mT[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34mr"""Set the module in training mode.[0m
[0;34m[0m
[0;34m        This has any effect only on certain modules. See documentations of[0m
[0;34m        particular modules for details of their behaviors in training/evaluation[0m
[0;34m        mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,[0m
[0;34m        etc.[0m
[0;34m[0m
[0;34m        Args:[0m
[0;34m            mode (bool): whether to set training mode (``True``) or evaluation[0m
[0;34m                         mode (``False`

In [12]:
model_heh.eval??

[0;31mSignature:[0m [0mmodel_heh[0m[0;34m.[0m[0meval[0m[0;34m([0m[0;34m)[0m [0;34m->[0m [0;34m~[0m[0mT[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0meval[0m[0;34m([0m[0mself[0m[0;34m:[0m [0mT[0m[0;34m)[0m [0;34m->[0m [0mT[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34mr"""Set the module in evaluation mode.[0m
[0;34m[0m
[0;34m        This has any effect only on certain modules. See documentations of[0m
[0;34m        particular modules for details of their behaviors in training/evaluation[0m
[0;34m        mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,[0m
[0;34m        etc.[0m
[0;34m[0m
[0;34m        This is equivalent with :meth:`self.train(False) <torch.nn.Module.train>`.[0m
[0;34m[0m
[0;34m        See :ref:`locally-disable-grad-doc` for a comparison between[0m
[0;34m        `.eval()` and several similar mechanisms that may be confused with it.[0m
[0;34m[0m
[0;34m        Ret

In [13]:
model_heh.modules??

[0;31mSignature:[0m [0mmodel_heh[0m[0;34m.[0m[0mmodules[0m[0;34m([0m[0;34m)[0m [0;34m->[0m [0mIterator[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'Module'[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mmodules[0m[0;34m([0m[0mself[0m[0;34m)[0m [0;34m->[0m [0mIterator[0m[0;34m[[0m[0;34m"Module"[0m[0;34m][0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34mr"""Return an iterator over all modules in the network.[0m
[0;34m[0m
[0;34m        Yields:[0m
[0;34m            Module: a module in the network[0m
[0;34m[0m
[0;34m        Note:[0m
[0;34m            Duplicate modules are returned only once. In the following[0m
[0;34m            example, ``l`` will be returned only once.[0m
[0;34m[0m
[0;34m        Example::[0m
[0;34m[0m
[0;34m            >>> l = nn.Linear(2, 2)[0m
[0;34m            >>> net = nn.Sequential(l, l)[0m
[0;34m            >>> for idx, m in enumerate(net.modules(

In [15]:
model_heh.children??

[0;31mSignature:[0m [0mmodel_heh[0m[0;34m.[0m[0mchildren[0m[0;34m([0m[0;34m)[0m [0;34m->[0m [0mIterator[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'Module'[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mchildren[0m[0;34m([0m[0mself[0m[0;34m)[0m [0;34m->[0m [0mIterator[0m[0;34m[[0m[0;34m"Module"[0m[0;34m][0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34mr"""Return an iterator over immediate children modules.[0m
[0;34m[0m
[0;34m        Yields:[0m
[0;34m            Module: a child module[0m
[0;34m        """[0m[0;34m[0m
[0;34m[0m        [0;32mfor[0m [0mname[0m[0;34m,[0m [0mmodule[0m [0;32min[0m [0mself[0m[0;34m.[0m[0mnamed_children[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0;32myield[0m [0mmodule[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      /opt/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/module.p

In [3]:
import torch
import torch.nn as nn

class AgainParamNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight1 = nn.Parameter(torch.rand(3,4))
        self.weight2 = nn.Parameter(torch.rand(5,6))
        self.weight3 = torch.rand(9,8)

    def forward(self, data_input):
        return data_input + self.weight3

model = AgainParamNet()

for name, param in model.named_parameters():
    print(f"Parameter name: {name}, type: {type(param)}, requires_grad={param.requires_grad}")

for param in model.parameters():
    print(param)

Parameter name: weight1, type: <class 'torch.nn.parameter.Parameter'>, requires_grad=True
Parameter name: weight2, type: <class 'torch.nn.parameter.Parameter'>, requires_grad=True
Parameter containing:
tensor([[0.8615, 0.3522, 0.1782, 0.1892],
        [0.5046, 0.6216, 0.1529, 0.2444],
        [0.9990, 0.7839, 0.2083, 0.6377]], requires_grad=True)
Parameter containing:
tensor([[0.2053, 0.5451, 0.6087, 0.9635, 0.8662, 0.0589],
        [0.5184, 0.1772, 0.0550, 0.4578, 0.6219, 0.0965],
        [0.7546, 0.7754, 0.9498, 0.5987, 0.0716, 0.8863],
        [0.9625, 0.3226, 0.1820, 0.8026, 0.3104, 0.3272],
        [0.5890, 0.1193, 0.4736, 0.4637, 0.5690, 0.9625]], requires_grad=True)


In [48]:
# torch.nn.parameter.Parameter will still have its required_grad=True
# even inside of context manager "with torch.no_grad(): "
# no_grad() does not effect nn.Parameter

import torch
from torch import nn
from torch import optim

class SimpleNet(nn.Module):
    def __init__(self,):
        super(SimpleNet, self).__init__()
        # Define learnable parameters of the model that will be updated
        self.weight = nn.Parameter(torch.rand(1))

    def forward(self, x):
        return torch.mul(x, self.weight)

model = SimpleNet()
print(f"Initial model parameters: {list(model.parameters())}")

#define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

#sampe input and target
input_tensor = torch.tensor([2.0])
target_tensor = torch.tensor([4.0])

for epoch in range(100):
    optimizer.zero_grad() # avoid gradient accumulation of other batches effecting the updates of parameters
    output = model(input_tensor) # forward pass
    loss = criterion(output, target_tensor) # calculate loss
    loss.backward()
    optimizer.step() # update parameters

    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, Loss: {loss.item()}, weight: {model.weight.item()}")

print(f"Final Model parameters: {list(model.parameters())}")

Initial model parameters: [Parameter containing:
tensor([0.6953], requires_grad=True)]
Epoch: 0, Loss: 6.809258460998535, weight: 0.7996504902839661
Epoch: 10, Loss: 1.284861445426941, weight: 1.4785820245742798
Epoch: 20, Loss: 0.24244500696659088, weight: 1.773501992225647
Epoch: 30, Loss: 0.04574774205684662, weight: 1.9016119241714478
Epoch: 40, Loss: 0.00863232184201479, weight: 1.957261323928833
Epoch: 50, Loss: 0.0016288697952404618, weight: 1.98143470287323
Epoch: 60, Loss: 0.00030735816108062863, weight: 1.9919354915618896
Epoch: 70, Loss: 5.799321661470458e-05, weight: 1.9964969158172607
Epoch: 80, Loss: 1.0943246707029175e-05, weight: 1.9984782934188843
Epoch: 90, Loss: 2.064821728708921e-06, weight: 1.999338984489441
Final Model parameters: [Parameter containing:
tensor([1.9997], requires_grad=True)]


In [57]:
criterion??

[0;31mSignature:[0m      [0mcriterion[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           MSELoss
[0;31mString form:[0m    MSELoss()
[0;31mFile:[0m           /opt/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/torch/nn/modules/loss.py
[0;31mSource:[0m        
[0;32mclass[0m [0mMSELoss[0m[0;34m([0m[0m_Loss[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Creates a criterion that measures the mean squared error (squared L2 norm) between[0m
[0;34m    each element in the input :math:`x` and target :math:`y`.[0m
[0;34m[0m
[0;34m    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:[0m
[0;34m[0m
[0;34m    .. math::[0m
[0;34m        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad[0m
[0;34m        l_n = \left( x_n - y_n \right)^2,[0m
[0;34m[0m
[0;34m    where :math:`N` is the batch size. If :attr:`reduction` i

In [4]:
import torch
from torch import nn
from torch import optim

class SimNet(nn.Module):
    def __init__(self):
        super(SimNet, self).__init__()
        self.weight = nn.Parameter(torch.rand(2,3))
        self.bias = nn.Parameter(torch.rand(2,1))

    def forward(self, data: torch.Size([2,3])):
        return data * self.weight + self.bias


modnet = SimNet()
print(f"Initial Model learnable parameters: {list(modnet.named_parameters())}") # or simply .parameters()

# input and target values
input_value = torch.rand(2,3)
target_value = torch.rand(2,3)

# define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(modnet.parameters(), lr=0.01)

for epoch in range(100):
    # clear all the gradients accumulation before processing a new batch
    optimizer.zero_grad()
    output = modnet(input_value) # forward pass
    loss = criterion(output, target_value) # compute loss
    loss.backward(gradient=torch.tensor(1.0)) # since loss is non-scalar, we need to provide tensor of ones with the same shape as loss
    optimizer.step() # update weights and biases

    if epoch % 10 == 0:
        print(f"\nEpoch: {epoch}, Weight: {modnet.weight.reshape(-1)}, Bias: {modnet.bias.reshape(-1)}, Loss: {loss:.4f}")

Initial Model learnable parameters: [('weight', Parameter containing:
tensor([[0.7893, 0.9549, 0.1121],
        [0.0870, 0.9928, 0.4731]], requires_grad=True)), ('bias', Parameter containing:
tensor([[0.0925],
        [0.0916]], requires_grad=True))]

Epoch: 0, Weight: tensor([0.7883, 0.9553, 0.1137, 0.0868, 0.9924, 0.4732],
       grad_fn=<ViewBackward0>), Bias: tensor([0.0958, 0.0910], grad_fn=<ViewBackward0>), Loss: 0.2101

Epoch: 10, Weight: tensor([0.7779, 0.9595, 0.1294, 0.0853, 0.9889, 0.4740],
       grad_fn=<ViewBackward0>), Bias: tensor([0.1269, 0.0855], grad_fn=<ViewBackward0>), Loss: 0.1960

Epoch: 20, Weight: tensor([0.7669, 0.9635, 0.1443, 0.0838, 0.9856, 0.4749],
       grad_fn=<ViewBackward0>), Bias: tensor([0.1549, 0.0806], grad_fn=<ViewBackward0>), Loss: 0.1840

Epoch: 30, Weight: tensor([0.7555, 0.9672, 0.1586, 0.0825, 0.9825, 0.4759],
       grad_fn=<ViewBackward0>), Bias: tensor([0.1802, 0.0763], grad_fn=<ViewBackward0>), Loss: 0.1737

Epoch: 40, Weight: tensor([0.

In [82]:
# So main difference between torch.Tensor and torch.nn.parameter.Parameter is that 
# nn.Parameter wrapped around torch.Tensor makes it explicit that these tensors are learnable parameters
# of the model, and have requires_grad=True by default, and they are included into the list
# of model parameters, in model.parameters(), unlike just torch.Tensor is not included into the
# list of model parameters even with gradients required, and even after that torch.Tensor
# is not involed in the optimization and updating of the weights, but only nn.Parameter are

In [5]:
nn.Linear??

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0min_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mLinear[0m[0;34m([0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.[0m
[0;34m[0m
[0;34m    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.[0m
[0;34m[0m
[0;34m    On certain ROCm devices, when using float16 inputs this modul

In [25]:
# nn.Linear takes number of columns as input features, because
# each column in matrix corresponds to a unique feature while the rows
# corresponding to individual records of data, as long as number of columns equal to in_features, 
# linear model should work

model = nn.Linear(in_features=30, out_features=10)
data = torch.rand(256, 30) # 256 individual records each with 30 unique features
print(f"Input feature shape: {data.shape}")
output = model(data)
print(f"Ouput feature shape: {output.shape}, dtype: {output.dtype}, type: {type(output)}, requires_grad={output.requires_grad}")
display(output, output.grad_fn, output.is_leaf, output.requires_grad, output.size(), output.shape)

Input feature shape: torch.Size([256, 30])
Ouput feature shape: torch.Size([256, 10]), dtype: torch.float32, type: <class 'torch.Tensor'>, requires_grad=True


tensor([[ 0.0902, -0.2231, -0.0956,  ...,  0.0401, -0.5219,  0.2464],
        [ 0.1397, -0.0361,  0.2733,  ...,  0.0066, -0.7993, -0.0205],
        [-0.0953, -0.1675,  0.1679,  ...,  0.0898, -0.3153, -0.2594],
        ...,
        [ 0.2344, -0.1465,  0.2307,  ..., -0.0650, -0.6518, -0.0357],
        [ 0.2038, -0.2387, -0.0833,  ...,  0.0567, -0.4366,  0.1892],
        [ 0.1586, -0.2073, -0.1245,  ..., -0.1068, -0.5967,  0.0478]],
       grad_fn=<AddmmBackward0>)

<AddmmBackward0 at 0x16bf4e490>

False

True

torch.Size([256, 10])

torch.Size([256, 10])

In [2]:
import torch
from torch import nn

class SimpleLinearModel(nn.Module):
    def __init__(self, in_features, out_features):
        super(SimpleLinearModel, self).__init__()
        # define a linear layer
        self.linear_layer = nn.Linear(in_features, out_features)
        # model initilized with in_features and out_features
        print(f"Model initialized with in_features={in_features} and out_features={out_features}")
        print(f"Defined a linear layer {self.linear_layer}")

    def forward(self, data_input):
        # input data shape/size
        print(f"Input data shape: {data_input.size()}") # also can do data_input.shape, no difference with .size()
        output = self.linear_layer(data_input)
        print(f"Input data after linear layer: {output.shape}")
        return output

in_dim = 30
out_dim = 14
linmod = SimpleLinearModel(in_features=in_dim, out_features=out_dim)

# random input data
data = torch.arange(1290, dtype=torch.float32).view(43, 30)

# forward pass
output = linmod(data)
print(f"Model output shape: {output.shape}\n")

# Model parameter
for name, param in linmod.named_parameters():
    print(f"Parameter name: {name}, shape: {param.size()}")

Model initialized with in_features=30 and out_features=14
Defined a linear layer Linear(in_features=30, out_features=14, bias=True)
Input data shape: torch.Size([43, 30])
Input data after linear layer: torch.Size([43, 14])
Model output shape: torch.Size([43, 14])

Parameter name: linear_layer.weight, shape: torch.Size([14, 30])
Parameter name: linear_layer.bias, shape: torch.Size([14])


In [4]:
nn.Linear??

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0min_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mLinear[0m[0;34m([0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.[0m
[0;34m[0m
[0;34m    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.[0m
[0;34m[0m
[0;34m    On certain ROCm devices, when using float16 inputs this modul

In [26]:
import torch
from torch import nn
from torch import optim

class SimpleLinearModel(nn.Module):
    def __init__(self, in_feature, out_feature):
        super().__init__()
        #define the layer
        self.linear_layer = nn.Linear(in_feature, out_feature)

    def forward(self, data):
        output = self.linear_layer(data)
        return output

# create an instance of the class
inputs = 45
outputs = 12
model = SimpleLinearModel(in_feature=inputs, out_feature=outputs)

# loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# input data
dm = torch.rand(32, 45) # 45 is number of features, while 32 is the number of individual records
output = model(dm) # forward pass

for param in model.parameters():
    print(param)

display(output.requires_grad, output.grad_fn, output.shape, output)

Parameter containing:
tensor([[ 0.0039, -0.0131, -0.0172, -0.0807, -0.0624, -0.1078, -0.0286, -0.1331,
         -0.0718, -0.0279, -0.0602, -0.0578,  0.0239,  0.0658, -0.1053,  0.1004,
         -0.0655,  0.0263,  0.0576,  0.0890, -0.1239, -0.1270, -0.0558, -0.1327,
          0.1340, -0.1274,  0.0080, -0.0992, -0.0706, -0.0779,  0.0381,  0.0431,
         -0.0101, -0.1080,  0.1249, -0.0319, -0.0694,  0.1283,  0.0793, -0.0864,
          0.0463,  0.0050,  0.1030, -0.0348, -0.0298],
        [ 0.0958, -0.0722, -0.0986,  0.1186, -0.1114, -0.1209, -0.0830,  0.0182,
          0.1394,  0.1088, -0.1145, -0.0831,  0.0150, -0.1429, -0.1408,  0.0304,
          0.0432,  0.0073, -0.1487, -0.0418, -0.0709,  0.0682, -0.0925,  0.0834,
          0.0844,  0.0950, -0.0365, -0.1371,  0.1028,  0.0794,  0.0188,  0.0218,
         -0.0943, -0.1338,  0.1062, -0.0656,  0.1050,  0.0091,  0.1473,  0.0441,
          0.0917,  0.0200, -0.0119, -0.0814,  0.0880],
        [-0.0232, -0.1239, -0.1010, -0.0473,  0.1402,  0.1

True

<AddmmBackward0 at 0x302625c40>

torch.Size([32, 12])

tensor([[-2.2470e-01, -5.8710e-02, -2.5785e-01,  3.5865e-01, -1.0950e-02,
         -1.9977e-01,  9.3109e-03,  3.0657e-02, -4.3339e-01, -2.5841e-02,
         -6.4272e-01,  1.5443e-01],
        [-3.3919e-01, -1.2645e-01,  8.0899e-02, -1.1143e-01, -3.9810e-01,
         -9.6678e-02, -6.4685e-02, -2.2056e-01, -2.0543e-01, -4.2925e-01,
         -6.0277e-01, -1.5761e-02],
        [-4.9108e-01, -2.3577e-01,  9.6117e-03,  1.2700e-01, -1.7415e-01,
         -3.8185e-01, -2.3065e-01, -3.3726e-01, -6.2643e-01, -2.1114e-01,
         -5.9623e-01,  2.7459e-01],
        [-2.7542e-01, -1.9220e-01, -2.5051e-01, -2.8725e-01, -1.2958e-01,
         -5.5331e-02, -2.4683e-01, -1.7822e-01, -6.0195e-01, -3.8581e-01,
         -8.9296e-01,  4.5097e-02],
        [-3.8682e-01, -2.3957e-01, -1.7267e-01,  1.0346e-01, -2.7054e-01,
          4.0088e-03, -1.3918e-01, -1.3058e-01, -2.8258e-01, -1.6801e-01,
         -6.6350e-01,  8.4326e-02],
        [-2.7883e-01, -1.7829e-01, -2.6589e-01,  4.6228e-02, -3.9745e-01,
      

In [71]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F # Often used for functional APIs like activation functions
import math

class SimpleMultiLayerPerceptron(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # defining layers
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.activation = nn.ReLU() 
        self.layer2 = nn.Linear(hidden_size, output_size)
        print(f"Model parameters initialized with: {input_size}, {hidden_size}, {output_size}")
        print(f"Layer 1: {self.layer1}")
        print(f"Activation layer: {self.activation}")
        print(f"Layer 2: {self.layer2}")

    def forward(self, data):
        print(f"Forward pass input shape: {data.shape}")
        data = self.layer1(data)
        print(f"Input data after first layer: {data.shape}")
        data = self.activation(data) # or we can also define it as F.relu(data), without having to define nn.ReLU()
        print(f"Input data after activation layer: {data.shape}")
        data = self.layer2(data)
        print(f"Input data after second layer: {data.shape}")
        return data

a = torch.rand(224,224) # image size of 224,224
input_features = a.reshape(-1).clone() # flaten the image into a single row vector
input_features = math.floor(input_features.sum().tolist()) # summed and turned into int, since nn.Linear expects a scalar int value
hidden_layer = 128 # number of hidden layers
output_layer = 10 # spits out 10 classifications

# create an instance of the model class
model = SimpleMultiLayerPerceptron(input_size=input_features, hidden_size=hidden_layer, output_size=output_layer)

# define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# input_data
data_data = torch.randn(28, input_features)
target_data = torch.randn(28, 10) # since after given layers, the resulting shape is (28,10)

for name, param in model.named_parameters():
    print(f"Parameter name: {name}, shape: {param.shape}")

for epoch in range(100):
    optimizer.zero_grad()
    output = model(data_data) # forward pass
    loss = criterion(output, target_data) # outputs a scalar value
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        for name, param in model.named_parameters():         
            print(f"Epoch: {epoch}, parameter name: {name}, shape: {param.shape}, loss: {loss:.4f}")
        break

Model parameters initialized with: 24975, 128, 10
Layer 1: Linear(in_features=24975, out_features=128, bias=True)
Activation layer: ReLU()
Layer 2: Linear(in_features=128, out_features=10, bias=True)
Parameter name: layer1.weight, shape: torch.Size([128, 24975])
Parameter name: layer1.bias, shape: torch.Size([128])
Parameter name: layer2.weight, shape: torch.Size([10, 128])
Parameter name: layer2.bias, shape: torch.Size([10])
Forward pass input shape: torch.Size([28, 24975])
Input data after first layer: torch.Size([28, 128])
Input data after activation layer: torch.Size([28, 128])
Input data after second layer: torch.Size([28, 10])
Epoch: 0, parameter name: layer1.weight, shape: torch.Size([128, 24975]), loss: 0.9810
Epoch: 0, parameter name: layer1.bias, shape: torch.Size([128]), loss: 0.9810
Epoch: 0, parameter name: layer2.weight, shape: torch.Size([10, 128]), loss: 0.9810
Epoch: 0, parameter name: layer2.bias, shape: torch.Size([10]), loss: 0.9810


In [63]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import time

class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleMLP, self).__init__()
        # define layers
        self.layer1 = nn.Linear(input_size, hidden_size)
        #self.activativation = nn.ReLU() # commented out since for this example we will use directly F.relu() in the def forward
        self.layer2 = nn.Linear(hidden_size, output_size)

    # receive input data, and define its flow inside of forward method
    def forward(self, x):
        #op1 = self.layer1(x)
        #op2 = F.relu(op1)
        #op3 = self.layer2(op2)
        #return op3
        x = self.layer1(x)
        x = F.relu(x)
        x = self.layer2(x)
        return x

# here we are not defining input and label datas, but we are simply defining the 
# SIZES: input sample size and output sample size
# input referring to the number of unique characteristics, in the case of matrix, refers to the number of rows, not columns
# output defines the total number of classification outputs, such as from given input
# we are trying to classify it into to distinct classes
# again when creating an instance of the model class, we are passing only the input/hidden/output sample sizes
# only sizes when passing an argument to the instance of the class
# we will pass an actual input features when doing forward pass
in_size = 784 # flattened version of 28x28 image
hi_size = 128 # number of hidden layers
ou_size = 10 # aim is to define 10 distinct classes from the given image
model = SimpleMLP(in_size, hi_size, ou_size)

name_ = []
for name, param in model.named_parameters():
    print(f"Parameter name: {name}, shape: {param.shape}, requires_grad={param.requires_grad}")
    name_.append(name)

# define loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# input data and target label
dum = torch.randn(64,784)
label = torch.randn(64,10) # since for element-wise oprations the dimension must match

# iterate and update parameters
start_time = time.time()
for epoch in range(100):
    optimizer.zero_grad() # to avoid gradient accumulation 
    output = model(dum) # forward pass
    loss = loss_fn(output, label)
    loss.backward() # scalar value
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, loss: {loss:.5f}, ")
end_time = time.time() - start_time
print(f"{end_time/60:.4f} minutes")
print(f"{end_time:.4f} seconds")

Parameter name: layer1.weight, shape: torch.Size([128, 784]), requires_grad=True
Parameter name: layer1.bias, shape: torch.Size([128]), requires_grad=True
Parameter name: layer2.weight, shape: torch.Size([10, 128]), requires_grad=True
Parameter name: layer2.bias, shape: torch.Size([10]), requires_grad=True
Epoch: 0, loss: 1.08200, 
Epoch: 10, loss: 1.07211, 
Epoch: 20, loss: 1.06247, 
Epoch: 30, loss: 1.05300, 
Epoch: 40, loss: 1.04370, 
Epoch: 50, loss: 1.03455, 
Epoch: 60, loss: 1.02553, 
Epoch: 70, loss: 1.01668, 
Epoch: 80, loss: 1.00799, 
Epoch: 90, loss: 0.99943, 
0.0023 minutes
0.1381 seconds


In [59]:
nn.Linear??

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0min_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mLinear[0m[0;34m([0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.[0m
[0;34m[0m
[0;34m    This module supports :ref:`TensorFloat32<tf32_on_ampere>`.[0m
[0;34m[0m
[0;34m    On certain ROCm devices, when using float16 inputs this modul

In [60]:
time.time??

[0;31mDocstring:[0m
time() -> floating point number

Return the current time in seconds since the Epoch.
Fractions of a second may be present if the system clock provides them.
[0;31mType:[0m      builtin_function_or_method

In [86]:
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from time import time

class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.activation = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.layer1(x)
        x = self.activation(x) # x = F.relu(x)
        x = self.layer2(x)
        return x

# we define here the sample size
model = SimpleMLP(input_size=1156, hidden_size=64, output_size=5)

# define loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# define input features and labels
input_feature = torch.randn(89,1156)
target_label = torch.randn(89,5)
start_time = time()
for epoch in range(100000):
    optimizer.zero_grad() # to avoid gradient accumulation
    output = model(input_feature)
    loss = loss_fn(output, target_label)
    loss.backward()
    optimizer.step() # update weights after gradient calculation

    if epoch % 10000 == 0:
        print(f"Epoch: {epoch}, loss: {loss}")
        
    if loss <= 1e-13:
        print(f"Learning stopped with {loss:.6f}.")
        break
        
end_time = time() - start_time
print(f"{end_time:.13f} seconds")


Epoch: 0, loss: 1.1556340456008911
Epoch: 10000, loss: 1.4406993393417666e-12
Epoch: 20000, loss: 4.2720227312262327e-13
Epoch: 30000, loss: 2.315888095443669e-13
Epoch: 40000, loss: 1.5679954079930725e-13
Epoch: 50000, loss: 1.1333288636988098e-13
Learning stopped with 0.000000.
51.1217720508575 seconds


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class SimpleMLP(nn.Module):
    def __init__(self, ints, hids, outs):
        super(SimpleMLP, self).__init__()
        self.layer1 = nn.Linear(ints, hids)
        self.layer2 = nn.Linear(hids, outs)

    def forward(self, x):
        x = self.layer1(x)
        x = F.relu(x)
        x = self.layer2(x)
        return x

modelNet = SimpleMLP(ints=456, hids=256, outs=15)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(modelNet.parameters(), lr=0.001)
data = torch.arange(44688, 89376, dtype=torch.float32).reshape(98,456)
labels = torch.ones(98,15)

for epoch in range(10000):
    optimizer.zero_grad()
    output = modelNet(data)
    loss = loss_fn(output, labels)
    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print(f"Epoch: {epoch}, loss: {loss.item()}")

Epoch: 0, loss: 123039976.0
Epoch: 1000, loss: 3.6826285920888816e+27
Epoch: 2000, loss: 2.82058066069858e+27
Epoch: 3000, loss: 2.1603244666356866e+27
Epoch: 4000, loss: 1.6546243915813448e+27
Epoch: 5000, loss: 1.2673013528926223e+27
Epoch: 6000, loss: 9.706442826871111e+26
Epoch: 7000, loss: 7.434306446298663e+26
Epoch: 8000, loss: 5.694045775473244e+26
Epoch: 9000, loss: 4.3611515637903016e+26


In [9]:
import torch
from torch import nn

# also called as fully connected layer or dense layer which flatens all entires into a single row or column vector
linear_layer = nn.Linear(in_features=30, out_features=20) # features representing rows
input_feature = torch.randn(54, 30)
output_feature = linear_layer(input_feature)
print(f"input_feature shape: {input_feature.shape}")
print(f"output_feature shape: {output_feature.shape}")
print(f"linear_layer weight: {linear_layer.weight.shape}, bias: {linear_layer.bias.shape}")

input_feature shape: torch.Size([54, 30])
output_feature shape: torch.Size([54, 20])
linear_layer weight: torch.Size([20, 30]), bias: torch.Size([20])


In [14]:
lin = nn.Linear(45, 78) # the first number is the number of input features, second is number of output features
ins = torch.arange(1530.0).view(34,45)
ous = lin(ins)
print(ins.shape, ous.size())
print(lin.weight.shape, lin.bias.shape)

torch.Size([34, 45]) torch.Size([34, 78])
torch.Size([78, 45]) torch.Size([78])


In [23]:
import torch
import torch.nn as nn

linear_layer = nn.Linear(60, 14)
input_feature = torch.randn(45, 60)
output_feature = linear_layer(input_feature)
print(input_feature.shape, output_feature.size())
print(linear_layer.weight.shape, linear_layer.bias.size())
print(linear_layer.weight) # both weight and bias after linear layer, will have gradients required
# nn.Linear flattens given tensor, also called as fully connected layer or dense layer
# this often serves as the final classification or regression layer in complex models of CNN/RNN

torch.Size([45, 60]) torch.Size([45, 14])
torch.Size([14, 60]) torch.Size([14])
Parameter containing:
tensor([[-0.0061,  0.0606, -0.0500,  0.0603,  0.1140, -0.0347, -0.0051, -0.1037,
          0.0037, -0.0633,  0.0330, -0.0456, -0.0558,  0.0357, -0.1094, -0.0602,
          0.0158,  0.1178,  0.0089, -0.0164, -0.0570, -0.0210,  0.0202,  0.0972,
         -0.1023,  0.0030,  0.1271,  0.0175,  0.0117, -0.0627,  0.0944,  0.1082,
         -0.0351,  0.0133,  0.0246, -0.1029,  0.0806, -0.0058, -0.0594,  0.1200,
         -0.1040,  0.0264, -0.0815, -0.0827, -0.0368,  0.0113,  0.0162,  0.0106,
          0.0103, -0.0860,  0.0624,  0.1177,  0.0793,  0.0866,  0.1153, -0.1158,
         -0.0847, -0.0106,  0.0095, -0.1252],
        [-0.0206, -0.1165, -0.0636,  0.0647,  0.1050,  0.0027, -0.0200,  0.0205,
          0.0084, -0.0773,  0.0278,  0.0681,  0.0221, -0.0159,  0.1042, -0.0104,
         -0.0341,  0.0785,  0.0771, -0.0292, -0.0799, -0.0347, -0.1158, -0.1174,
          0.1006, -0.0149,  0.0528,  0.087

In [46]:
fc_layer = nn.Linear(in_features=10, out_features=3)
features = torch.randn(5,10)
outputs = fc_layer(features)
print(f"Features: {features.shape}, Outputs: {outputs.size()}")
print(f"Parameters of fc_layer: weight={fc_layer.weight.shape}, bias={fc_layer.bias.shape}")
display(fc_layer.weight, fc_layer.bias)
print(f"Gradients required?: features={features.requires_grad}, outputs={outputs.requires_grad}")
print(f"Gradients required?: weight={fc_layer.weight.requires_grad}, bias={fc_layer.bias.requires_grad}")
print(f"Is leaf: features={features.is_leaf}, outputs={outputs.is_leaf}")
print(f"Grad_fn: features={features.grad_fn}, outputs={outputs.grad_fn}, weight={fc_layer.weight.grad_fn},\
 bias={fc_layer.bias.grad_fn}")


Features: torch.Size([5, 10]), Outputs: torch.Size([5, 3])
Parameters of fc_layer: weight=torch.Size([3, 10]), bias=torch.Size([3])


Parameter containing:
tensor([[-0.0205,  0.0809,  0.1949,  0.2649,  0.3076,  0.0447,  0.2620, -0.2888,
          0.2190, -0.0489],
        [-0.1672,  0.1811,  0.1750, -0.1704,  0.2198, -0.3022, -0.2651,  0.2293,
         -0.1227, -0.0345],
        [ 0.2470,  0.1746, -0.1729,  0.1715,  0.1494,  0.2128, -0.1646,  0.2819,
          0.1728,  0.2751]], requires_grad=True)

Parameter containing:
tensor([-0.1094,  0.2055,  0.1367], requires_grad=True)

Gradients required?: features=False, outputs=True
Gradients required?: weight=True, bias=True
Is leaf: features=True, outputs=False
Grad_fn: features=None, outputs=<AddmmBackward0 object at 0x16b4fd0d0>, weight=None, bias=None


In [60]:
nn.Conv2d?
# there are other variants of Conv1d and Conv3d are also available

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mConv2d[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0min_channels[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_channels[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkernel_size[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mint[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstride[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mint[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpadding[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mint[0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mint[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdilation[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m

In [87]:
# in_channels, in the case of RGB is 3, since there are Red, Green and Blue color channels
# out_channels, represent the number of outputted feature maps
# meaning we can produce as many output feature maps as we want
# kernel_size is the dimension of filters (kernels), we can either provide a single int
# or tuple, in case of int will be square, such (H,W) where both are equal
# or tuple if the filter(kernel) size is not square
# padding helps to preserve the initial spatial dimensions, but by default is 0,
# so effectively convol_layer with padding=0 will also act as pooling_layer for downsizing the spatial dimensions
convol_layer = nn.Conv2d(in_channels=3, out_channels=15, kernel_size=5,device="mps")

# PyTorch expects channel-first format for images:
# [batch_size, input_channels, height, width]
input_image_batch = torch.randn(25, 3, 32, 32, device="mps")
# batch_size is basically the number of total images being passed as inputs for convol_layer
output_feature_maps = convol_layer(input_image_batch)

print(f"Input image shape: {input_image_batch.shape}")
print(f"Output feature map shape: {output_feature_maps.shape}")
# for input and output image/feature_maps [batch_size, input_color_channels, height, width]
print(f"\nConvoluation layer weight: {convol_layer.weight.shape}")
# [number_of_feature_maps, input_color_channels, kernel_height, kernel_width]
print(f"Convoluation layer weight: {convol_layer.bias.shape}")

Input image shape: torch.Size([25, 3, 32, 32])
Output feature map shape: torch.Size([25, 15, 28, 28])

Convoluation layer weight: torch.Size([15, 3, 5, 5])
Convoluation layer weight: torch.Size([15])


In [113]:
device = torch.device("mps" if torch.backends.mps.is_available else "gpu" if torch.cuda.is_available else "cpu")
convol_layer = nn.Conv2d(in_channels=3, out_channels=30, kernel_size=5, stride=1, padding=2, device=device)
input_image_batches = torch.randn(120, 3, 224, 224, device=device)
output_feature_maps = convol_layer(input_image_batches)
print(f"Input image: {input_image_batches.shape}")
print(f"Output feature maps: {output_feature_maps.size()}")
print(f"\nConvol weight: {convol_layer.weight.shape}\nConvol bias: {convol_layer.bias.shape}")

Input image: torch.Size([120, 3, 224, 224])
Output feature maps: torch.Size([120, 30, 224, 224])

Convol weight: torch.Size([30, 3, 5, 5])
Convol bias: torch.Size([30])


In [120]:
convolutional_layer = nn.Conv2d(in_channels=3, out_channels=120, kernel_size=5, padding=0)
input_image = torch.arange(19660800, dtype=torch.float32).reshape(400, 3, 128, 128)
# [batch_size, input_color_channels, height, width]
output_feature_maps = convolutional_layer(input_image)
# produces features maps where each one is learning specific patterns from the given input images
print(f"Input image shape: {input_image.shape}")
print(f"Output feature maps shape: {output_feature_maps.shape}")
print(f"\nConvol weight shape: {convolutional_layer.weight.shape}")
print(f"Convol bias shape: {convolutional_layer.bias.shape}")

Input image shape: torch.Size([400, 3, 128, 128])
Output feature maps shape: torch.Size([400, 120, 124, 124])

Convol weight shape: torch.Size([120, 3, 5, 5])
Convol bias shape: torch.Size([120])


In [122]:
nn.RNN?

[0;31mInit signature:[0m [0mnn[0m[0;34m.[0m[0mRNN[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
__init__(input_size,hidden_size,num_layers=1,nonlinearity='tanh',bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None)

Apply a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}`
non-linearity to an input sequence. For each element in the input sequence,
each layer computes the following function:

.. math::
    h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})

where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
previous layer at time `t-1` or the initial hidden state at time `0`.
If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`.

.. code-block:: python

    # Efficient implementation equivalent to th

wrap LaTex equation inside of double $$ signs

$$h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})$$


In [150]:
from IPython.display import display, Math, Latex
display(Math(r'h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})'))

<IPython.core.display.Math object>

In [163]:
import torch
from torch import nn
recurrent_layer = nn.RNN(input_size=5, hidden_size=30, batch_first=True)

# input sequence (batch_size, sequence_length, input_features_size)
input_sequence_batch = torch.randn(10, 20, 5)

# hidden state (number_of_stacked_layers, batch_size, hidden_size)
# by default the num_layer=1
# if hidden state is not provided by default it will be zero
hidden_state_size = torch.randn(1, 10, 30)

output_sequence, ouput_hidden_state = recurrent_layer(input_sequence_batch, hidden_state_size)

print(f"Input sequence shape: {input_sequence_batch.shape}")
print(f"Initial hidden state shape: {hidden_state_size.shape}")
print(f"\nOutput sequence shape: {output_sequence.shape}")
print(f"Output hidden state shape: {output_sequence.shape}")
print(f"\nRecurrent weight shape: {recurrent_layer.all_weights}")
print(f"Recurrent Layer: {recurrent_layer}")

Input sequence shape: torch.Size([10, 20, 5])
Initial hidden state shape: torch.Size([1, 10, 30])

Output sequence shape: torch.Size([10, 20, 30])
Output hidden state shape: torch.Size([10, 20, 30])

Recurrent weight shape: [[Parameter containing:
tensor([[ 0.0559, -0.1811,  0.1618,  0.0336,  0.0444],
        [-0.1492,  0.1553, -0.1643,  0.0388, -0.1554],
        [ 0.1153, -0.0060, -0.1494, -0.1464, -0.0359],
        [ 0.0321, -0.0835, -0.0366, -0.1035, -0.1246],
        [-0.1563, -0.1467,  0.0580,  0.0204,  0.0921],
        [-0.1378,  0.1149, -0.1678, -0.0393, -0.1366],
        [ 0.0694,  0.1011,  0.1518,  0.0730, -0.1385],
        [ 0.0221,  0.1031, -0.1167, -0.0681,  0.0298],
        [-0.0408, -0.1622, -0.1772, -0.0034, -0.0027],
        [ 0.0559,  0.0495, -0.1254,  0.1144,  0.0998],
        [-0.0197,  0.0708, -0.1566, -0.1436, -0.1746],
        [-0.1671,  0.0665, -0.1535,  0.1005, -0.1689],
        [ 0.0774, -0.1448, -0.0972,  0.0486, -0.0556],
        [-0.0423,  0.0505,  0.1191,  

In [168]:
nn.Linear?

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0min_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.

This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

Args:
    in_features: size of each input sample
    out_features: size of each output sample
    bias:

In [187]:
from IPython.display import display, Math, Latex
display(Math(r'y = xW^T + b'))

<IPython.core.display.Math object>

In [188]:
recurrent_layer = nn.RNN(input_size=15, hidden_size=50, batch_first=True, device="mps")

# [batch_size, sequence_length, input_feature_size]
input_sequence = torch.randn(120, 60, 15, device="mps")
# [num_layers, batch_size, hidden_size]
initial_hidden_state = torch.randn(1, 120, 50, device="mps")

output_sequence, final_hidden_state = recurrent_layer(input_sequence, initial_hidden_state)

print(f"Input sequence shape: {input_sequence.shape}")
print(f"Initial hidden state shape: {initial_hidden_state.shape}")
print(f"\nOutput sequence shape: {output_sequence.shape}")
print(f"Final hidden state shape: {final_hidden_state.shape}")

Input sequence shape: torch.Size([120, 60, 15])
Initial hidden state shape: torch.Size([1, 120, 50])

Output sequence shape: torch.Size([120, 60, 50])
Final hidden state shape: torch.Size([1, 120, 50])


In [189]:
rnn_layer = nn.RNN(input_size=78, hidden_size=243, batch_first=True)
# [batch_size, sequence_length, input_featuer_size]
input_sequence = torch.randn(400, 123, 78)
# [num_layer, batch_size, hidden_size]
initial_hidden_state = torch.randn(1, 400, 243)

output_sequence, final_hidden_state = rnn_layer(input_sequence, initial_hidden_state)

print(f"Input sequence shape: {input_sequence.shape}")
print(f"Initial hidden state shape: {initial_hidden_state.shape}")
print(f"\nOutput sequence shape: {output_sequence.shape}")
print(f"Final hidden state shape: {final_hidden_state.shape}")

Input sequence shape: torch.Size([400, 123, 78])
Initial hidden state shape: torch.Size([1, 400, 243])

Output sequence shape: torch.Size([400, 123, 243])
Final hidden state shape: torch.Size([1, 400, 243])


In [195]:
device = torch.device("mps" if torch.backends.mps.is_available else "cuda" if torch.cuda.is_available else "cpu")
recurrent_layer = nn.RNN(input_size=32, hidden_size=98, batch_first=True, device=device)

# [batch_size, sequence_length, input_feature_size]
input_sequence = torch.rand(456, 103, 32, device=device)

# [num_layer, batch_size, hidden_size]
initial_hidden_state = torch.randn(1, 456, 98, device=device)

output_sequence, final_hidden_state = recurrent_layer(input_sequence, initial_hidden_state)

print(f"Input sequence shape: {input_sequence.shape}")
print(f"Initial hidden state shape: {initial_hidden_state.shape}")
print(f"\nOutput sequence shape: {output_sequence.shape}")
print(f"Final hidden state shape: {final_hidden_state.shape}")

Input sequence shape: torch.Size([456, 103, 32])
Initial hidden state shape: torch.Size([1, 456, 98])

Output sequence shape: torch.Size([456, 103, 98])
Final hidden state shape: torch.Size([1, 456, 98])


In [199]:
# activation(non-linear) function
import torch.nn.functional as F
F.relu??

[0;31mSignature:[0m [0mF[0m[0;34m.[0m[0mrelu[0m[0;34m([0m[0minput[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m [0minplace[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m)[0m [0;34m->[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mrelu[0m[0;34m([0m[0minput[0m[0;34m:[0m [0mTensor[0m[0;34m,[0m [0minplace[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m)[0m [0;34m->[0m [0mTensor[0m[0;34m:[0m  [0;31m# noqa: D400,D402[0m[0;34m[0m
[0;34m[0m    [0;34mr"""relu(input, inplace=False) -> Tensor[0m
[0;34m[0m
[0;34m    Applies the rectified linear unit function element-wise. See[0m
[0;34m    :class:`~torch.nn.ReLU` for more details.[0m
[0;34m    """[0m[0;34m[0m
[0;34m[0m    [0;32mif[0m [0mhas_torch_function_unary[0m[0;34m([0m[0minput[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mhandle_t

In [200]:
nn.ReLU??

[0;31mInit signature:[0m [0mnn[0m[0;34m.[0m[0mReLU[0m[0;34m([0m[0minplace[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mReLU[0m[0;34m([0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Applies the rectified linear unit function element-wise.[0m
[0;34m[0m
[0;34m    :math:`\text{ReLU}(x) = (x)^+ = \max(0, x)`[0m
[0;34m[0m
[0;34m    Args:[0m
[0;34m        inplace: can optionally do the operation in-place. Default: ``False``[0m
[0;34m[0m
[0;34m    Shape:[0m
[0;34m        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.[0m
[0;34m        - Output: :math:`(*)`, same shape as the input.[0m
[0;34m[0m
[0;34m    .. image:: ../scripts/activation_images/ReLU.png[0m
[0;34m[0m
[0;34m    Examples::[0m
[0;34m[0m
[0;34m        >>> m = nn.ReLU()[0m
[0;34m        >>> input = torch.randn(2)[0m
[0;34m        >>> ou

$$\text{ReLU}(x) = (x)^+ = \max(0, x)$$

In [214]:
from IPython.display import display, Math, Latex
display(Math(r'\text{ReLU}(x) = (x)^+ = \max(0, x)'))

<IPython.core.display.Math object>

In [330]:
import torch
from torch import nn
from IPython.display import display, Math, Latex
from torch.nn import functional as F

relu = nn.ReLU()
a = torch.tensor([-10])
out = relu(a)
print(f"Input: {a}")
print(f"After ReLU: {out}\n")
# simple any negatives values are changed to 0, while positives values stay as they are

display(Math(r'\text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x)'))
m = nn.LeakyReLU()
inputs = torch.tensor(-4)
output = m(inputs.double())
print(output)

class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        # defining layers, expects 456 input_feature_size, and outputs it into 233 dimensions
        self.fc1 = nn.Linear(456, 233)
        self.relu_layer = nn.ReLU()
        self.fc2 = nn.Linear(233, 30)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu_layer(x) # or F.relu(x)
        x = self.fc2(x)
        return x

model = SimpleNet()
# batch size of 45 and number of features are 456, dimensions only specificed at this point
data = torch.arange(456*45, dtype=torch.float32).view(45,456)
output = model(data) # forward pass
print(f"\nInput feature shape: {data.shape}")
print(f"Output feature shape: {output.shape}")

Input: tensor([-10])
After ReLU: tensor([0])



<IPython.core.display.Math object>

tensor(-0.0400, dtype=torch.float64)

Input feature shape: torch.Size([45, 456])
Output feature shape: torch.Size([45, 30])


In [153]:
import torch
from torch import nn

class SimpleNett(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu_layer = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu_layer(x)
        x = self.fc2(x)
        return x

model = SimpleNett(input_size=678, hidden_size=99, output_size=10)
data = torch.randn(101, 678)
output = model(data)
print(f"Simplnet here: {SimpleNett}")
print(f"Input shape: {data.shape}")
print(f"Output shape: {output.shape}")
for name, param in model.named_parameters():
    print(f"\n{name}\n{param}")

Simplnet here: <class '__main__.SimpleNett'>
Input shape: torch.Size([101, 678])
Output shape: torch.Size([101, 10])

fc1.weight
Parameter containing:
tensor([[ 0.0019,  0.0027, -0.0225,  ...,  0.0321, -0.0231,  0.0119],
        [ 0.0206,  0.0009,  0.0196,  ..., -0.0176,  0.0227, -0.0317],
        [ 0.0295,  0.0151, -0.0250,  ..., -0.0037, -0.0282,  0.0188],
        ...,
        [ 0.0181,  0.0057, -0.0311,  ..., -0.0270, -0.0292, -0.0313],
        [-0.0020, -0.0078,  0.0020,  ..., -0.0127,  0.0217, -0.0142],
        [-0.0274, -0.0236,  0.0315,  ..., -0.0353, -0.0285,  0.0055]],
       requires_grad=True)

fc1.bias
Parameter containing:
tensor([-0.0059,  0.0095,  0.0254,  0.0094,  0.0174,  0.0240, -0.0250,  0.0104,
         0.0306, -0.0013,  0.0266,  0.0342,  0.0234, -0.0381, -0.0038, -0.0235,
        -0.0313,  0.0297, -0.0225,  0.0318, -0.0125,  0.0376,  0.0013,  0.0094,
         0.0350, -0.0180, -0.0234,  0.0136,  0.0367, -0.0055,  0.0210, -0.0281,
        -0.0132, -0.0231, -0.0024,  

In [31]:
nn.Sigmoid??

[0;31mInit signature:[0m [0mnn[0m[0;34m.[0m[0mSigmoid[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mSigmoid[0m[0;34m([0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Applies the Sigmoid function element-wise.[0m
[0;34m[0m
[0;34m    .. math::[0m
[0;34m        \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)}[0m
[0;34m[0m
[0;34m[0m
[0;34m    Shape:[0m
[0;34m        - Input: :math:`(*)`, where :math:`*` means any number of dimensions.[0m
[0;34m        - Output: :math:`(*)`, same shape as the input.[0m
[0;34m[0m
[0;34m    .. image:: ../scripts/activation_images/Sigmoid.png[0m
[0;34m[0m
[0;34m    Examples::[0m
[0;34m[0m
[0;34m        >>> m = nn.Sigmoid()[0m
[0;34m        >>> input = torch.randn(2)[0m
[0;34m        >>> output = m(input)[0m
[0;34m    """[0m[0;34m[0

$$\text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)}$$

In [50]:
sigmoid_fn = nn.Sigmoid()
a = torch.randn(4,5)
out = sigmoid_fn(a)
print(out)

tensor([[0.8413, 0.2666, 0.2209, 0.4478, 0.2555],
        [0.3689, 0.5505, 0.6077, 0.3919, 0.6345],
        [0.8020, 0.8913, 0.8339, 0.1325, 0.5005],
        [0.6600, 0.3008, 0.6081, 0.3239, 0.4794]])


In [83]:
sig = nn.Sigmoid()
x = torch.arange(1,11, dtype=torch.float32).view(2,-1)
out = sig(x)
print(out)

tensor([[0.0117, 0.0317, 0.0861, 0.2341, 0.6364],
        [0.0117, 0.0317, 0.0861, 0.2341, 0.6364]])


In [105]:
sig = nn.Softmax()
x = torch.arange(100,200).reshape(10,-1)
out = sig(x.double())
print(out)

tensor([[7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
         3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01],
        [7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
         3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01],
        [7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
         3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01],
        [7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
         3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01],
        [7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
         3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01],
        [7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
         3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01],
        [7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
         3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01],
        [7.8013e-05, 2.1206

In [135]:
tanh_fn = nn.Tanh()
a = torch.randn(5,3)
out = tanh_fn(a)
print(a, out)

tensor([[-1.2553, -1.3892, -0.3123],
        [ 1.0924, -0.1668, -0.4186],
        [ 0.5846,  0.6687,  0.1507],
        [-0.5314, -0.2875,  0.4560],
        [ 1.2602,  1.0357, -0.1451]]) tensor([[-0.8498, -0.8830, -0.3025],
        [ 0.7977, -0.1653, -0.3957],
        [ 0.5260,  0.5842,  0.1495],
        [-0.4864, -0.2798,  0.4268],
        [ 0.8511,  0.7762, -0.1441]])


In [151]:
tanh = nn.Tanh()
data = torch.randn(4,5)
output = tanh(data)
display(data, output)

tensor([[-0.8751,  0.4436,  0.2700,  0.6443,  1.4003],
        [ 0.3759, -0.3399, -0.2924,  0.6470,  1.0304],
        [-1.6272, -1.5808,  0.1687,  1.2274, -0.2017],
        [-0.2560,  0.9555, -1.1243,  1.6118,  0.4275]])

tensor([[-0.7040,  0.4166,  0.2636,  0.5678,  0.8854],
        [ 0.3592, -0.3274, -0.2844,  0.5696,  0.7741],
        [-0.9257, -0.9187,  0.1671,  0.8418, -0.1990],
        [-0.2505,  0.7423, -0.8091,  0.9234,  0.4032]])

In [161]:
import torch
from torch import nn
from collections import OrderedDict
from torch.nn import functional as F

input_size = 567
hidden_size = 128
output_size = 10

model = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, output_size)
)
# the problem is that weights and biases and each layers can be accessed with int indexes, which can be 
# hard for debugging, better assign names for each layers with OrderedDict
data = torch.randn(45, 567)
output = model(data)
print(f"Input: {data.shape}")
print(f"Output: {output.shape}")
for name, param in model.named_parameters():
    print(name, param.shape)

Input: torch.Size([45, 567])
Output: torch.Size([45, 10])
0.weight torch.Size([128, 567])
0.bias torch.Size([128])
2.weight torch.Size([10, 128])
2.bias torch.Size([10])


In [162]:
OrderedDict??

[0;31mInit signature:[0m [0mOrderedDict[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m/[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mOrderedDict[0m[0;34m([0m[0mdict[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m'Dictionary that remembers insertion order'[0m[0;34m[0m
[0;34m[0m    [0;31m# An inherited dict maps keys to values.[0m[0;34m[0m
[0;34m[0m    [0;31m# The inherited dict provides __getitem__, __len__, __contains__, and get.[0m[0;34m[0m
[0;34m[0m    [0;31m# The remaining methods are order-aware.[0m[0;34m[0m
[0;34m[0m    [0;31m# Big-O running times for all methods are the same as regular dictionaries.[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;31m# The internal self.__map dict maps keys to links in a doubly linked list.[0m[0;34m[0m
[0;34m[0m    [0;31m# The circular doubly linked list starts and en

In [170]:
from collections import OrderedDict

input_size = 782
hidden_size = 64
output_size= 13

model_v2 = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(input_size, hidden_size)),
    ('relu', nn.ReLU()),
    ('fc2', nn.Linear(hidden_size, output_size))
]))
# OrderedDict from python collections, we can access given layers and corresponding weights and biases
# without indexes but rather with intuitive names
data = torch.randn(56, input_size)
output = model_v2(data)
print(f"Model parameters shape: {model_v2.fc1.weight.shape}, {model_v2.fc2.bias.shape}")

Model parameters shape: torch.Size([64, 782]), torch.Size([13])


In [184]:
import torch
from torch import nn
from collections import OrderedDict

# unlike nn.Module where when defining the layers/activation functions in the constructor
# assigning as the attributes of the model, the order did not matter, because we were specifying the order
# inside of the forward function, but for models where output of one function directly goes to the 
# input of another, without having any skips or changing directions, then we can use
# nn.Sequential without having to define forward function
# we simply pass as arguments the submodules (layers/activation functions), and their order matters
# we are both defining the layers and how the data will flow through the layers, sequentially
# one after another

model_v4 = nn.Sequential(OrderedDict([
    ("layer1", nn.Linear(600, 300)),
    ("relu", nn.ReLU()),
    ("layer2", nn.Linear(300, 10))
]))

data = torch.randn(56, 600)
output =  model_v4(data)
print(f"Model parameters: {model_v4.layer1}, {model_v4.relu}\n,{model_v4.layer2}")
display(model_v4.layer1.weight, model_v4.layer1.weight.shape, model_v4.layer1.bias, model_v4.layer1.bias.shape)
display(model_v4.relu, model_v4.layer2, model_v4.layer2.weight, model_v4.layer2.weight.shape)
display(model_v4.layer2.bias, model_v4.layer2.bias.shape)

Model parameters: Linear(in_features=600, out_features=300, bias=True), ReLU()
,Linear(in_features=300, out_features=10, bias=True)


Parameter containing:
tensor([[-0.0175,  0.0137, -0.0230,  ...,  0.0202, -0.0355, -0.0203],
        [-0.0282,  0.0266, -0.0140,  ...,  0.0200, -0.0021,  0.0129],
        [ 0.0206, -0.0360, -0.0055,  ...,  0.0149,  0.0388,  0.0267],
        ...,
        [-0.0320,  0.0116,  0.0202,  ..., -0.0082,  0.0236, -0.0013],
        [ 0.0399, -0.0038,  0.0194,  ..., -0.0357,  0.0204,  0.0200],
        [ 0.0293, -0.0382,  0.0241,  ..., -0.0038,  0.0085,  0.0268]],
       requires_grad=True)

torch.Size([300, 600])

Parameter containing:
tensor([ 0.0407,  0.0016,  0.0061,  0.0033,  0.0151, -0.0386,  0.0227,  0.0004,
        -0.0276, -0.0212, -0.0076, -0.0070, -0.0091, -0.0305,  0.0292, -0.0238,
         0.0175, -0.0004,  0.0272, -0.0396,  0.0341, -0.0369,  0.0021, -0.0032,
         0.0231,  0.0405,  0.0187,  0.0010, -0.0161,  0.0270,  0.0404,  0.0300,
         0.0334, -0.0144,  0.0341, -0.0218, -0.0230, -0.0213, -0.0049,  0.0231,
         0.0051,  0.0066,  0.0106,  0.0304,  0.0389,  0.0275, -0.0275,  0.0369,
         0.0250,  0.0208,  0.0209,  0.0171,  0.0242,  0.0399,  0.0315, -0.0403,
        -0.0076,  0.0277,  0.0279, -0.0378,  0.0012,  0.0350,  0.0200, -0.0236,
        -0.0115,  0.0201, -0.0264,  0.0037,  0.0399, -0.0333, -0.0042,  0.0247,
        -0.0237, -0.0106,  0.0349,  0.0341,  0.0198, -0.0122, -0.0057,  0.0101,
        -0.0169,  0.0056,  0.0339, -0.0279,  0.0330, -0.0162,  0.0041, -0.0198,
        -0.0162,  0.0022, -0.0197,  0.0035,  0.0359,  0.0387, -0.0316,  0.0015,
        -0.0108, -

torch.Size([300])

ReLU()

Linear(in_features=300, out_features=10, bias=True)

Parameter containing:
tensor([[ 0.0412,  0.0139, -0.0110,  ...,  0.0027, -0.0018,  0.0428],
        [ 0.0538,  0.0253, -0.0577,  ..., -0.0107, -0.0512, -0.0273],
        [ 0.0269,  0.0497,  0.0056,  ..., -0.0018,  0.0068, -0.0007],
        ...,
        [ 0.0564,  0.0095, -0.0552,  ...,  0.0101, -0.0048,  0.0051],
        [-0.0217,  0.0099,  0.0156,  ..., -0.0221,  0.0190,  0.0185],
        [ 0.0248, -0.0280, -0.0562,  ..., -0.0362,  0.0295, -0.0189]],
       requires_grad=True)

torch.Size([10, 300])

Parameter containing:
tensor([ 0.0325,  0.0295, -0.0543,  0.0170,  0.0010,  0.0235, -0.0474,  0.0253,
         0.0096,  0.0366], requires_grad=True)

torch.Size([10])

In [2]:
import torch
from torch import nn
from collections import OrderedDict

# one of the applications of the nn.Sequential is creating a reusable blocks of code
# for data to go through given functions defined in nn.Sequential in the sequential order
# without skipping or branching, and reuse this block of code inside of the more complex
# model inside of custom nn.Module structure

# Also used for feed-forward networks, where output of one functions directly goes to the
# input of the following functions, without skipping or branching and the order is important
# nn.Sequential has a linear nature, meaning functions are defined one next to the other
# in the linear fashion and must go through each of them sequentially or linearly
# Also, this nn.Sequential only assumed that there is a single input and a single output
# meaning more than one input tensor will not fit here

model = nn.Sequential(OrderedDict([
    ("fc1", nn.Linear(748, 345)),
    ("relu", nn.ReLU()),
    ("fc2", nn.Linear(345, 12))
]))
data = torch.randn(34,748)
output = model(data)
display(model.fc1.weight.shape, model.fc1.bias.shape, model.relu, model.fc2.weight.shape, model.fc2.bias.shape)

torch.Size([345, 748])

torch.Size([345])

ReLU()

torch.Size([12, 345])

torch.Size([12])

In [188]:
# nn.Sequential layers and containing weights and biases can be accessed with indexing
# but this approach is not practical, therefore it is better to assign names
# for each layer with OrderedDict, then we can get respective weights and biases on need

$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = \left( x_n - y_n \right)^2$$

In [65]:
import torch
from torch import nn

loss_fn = nn.MSELoss() # or also referred as L2Loss

predictions = torch.randn(3,4, requires_grad=True)
labels = torch.randn(3,4)

loss = loss_fn(predictions, labels)
print(f"Loss value: {loss.item()}")

Loss value: 2.819516181945801


$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
    l_n = \left| x_n - y_n \right|$$

In [60]:
# Also there is Mean Absolute Error, also known as L1Loss
loss_fn = nn.L1Loss()

prediction = torch.rand(3,5, requires_grad=True)
target = torch.rand(3,5)
loss = loss_fn(prediction, target)
print(f"Loss value: {loss.item()}")

Loss value: 0.38298141956329346


In [3]:
# Mean Squared Error (L1Loss) and Mean Absolute Error (L2Loss), are mostly used
# for regression tasks where the objective is to output a continuous value
# for Classification tasks we use classification loss such as Cross Entropy Loss

criterion_L1 = nn.MSELoss()
criterion_L2 = nn.L1Loss()

outputs = nn.Parameter(torch.randn(4,5, requires_grad=True))
labels = torch.randn(4,5)
l2_loss = criterion_L2(outputs, labels)
l1_loss = criterion_L1(outputs, labels)
print(f"MSELoss(L2 loss): {l1_loss.item()}, L1Loss: {l2_loss.item()}")

MSELoss(L2 loss): 1.6080474853515625, L1Loss: 0.9996082186698914


In [4]:
# Mean Squared Error penilizes errors more harshly by giving much higher losses compared to Mean Absolute Error
# hence MSELoss is very sensitive to outliers, while L1Loss is not so much
# MSELoss will try to move a line closer to outlier data point even if it means to shift away from the 
# most of the data points, while L1Loss will generally stay where the most of the data points are
# and only sligthly move towards outlier, but not as much as L2loss
# again MSE and MAE are used for regression tasks where we want to predict a continuous value

# Cross Entropy Loss is for multi-class classifications where in each sample out of all classess
# there is only one correct class, and it cannot belong to several classes at the same time
# For sign language, in our case there are 100 classes and each signed word can belong to only a single
# class word, even though it is true that one sign can mean two or more different things
# based on context, for simplicity we can relax this assumption, and say that each signed word
# belongs to only a single class word

$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
          l_n = - w_{y_n} \log \frac{\exp(x_{n,y_n})}{\sum_{c=1}^C \exp(x_{n,c})}
          \cdot \mathbb{1}\{y_n \not= \text{ignore\_index}\}$$

In [33]:
cross_loss = nn.CrossEntropyLoss()
output = torch.randn(3,5, requires_grad=True)
target = torch.tensor([1,4,2])
loss = cross_loss(output, target)
print(f"Cross Entropy Loss (Multi-class) classification loss: {loss.item()}")

Cross Entropy Loss (Multi-class) classification loss: 1.587830662727356


In [41]:
categorical_loss = nn.CrossEntropyLoss()
output = torch.randn(5, 10, requires_grad=True)
label = torch.arange(5)
loss = categorical_loss(output, label)
print(f"Categorical CrossEntropyLoss: {loss.item()}")

Categorical CrossEntropyLoss: 2.5165951251983643


In [53]:
categorical_loss = nn.CrossEntropyLoss()
# batch size of 7(rows), and 10(columns) input features
output = torch.randn(7, 10, requires_grad=True)
label = torch.tensor([1,0,5,6,5,8,9])
loss = categorical_loss(output, label)
print(f"Categorical Cross Entropy Loss: {loss.item()}")

Categorical Cross Entropy Loss: 2.789682388305664


In [54]:
# the total number of values in the label/target output must match
# with the batch size, and the number of classes must range between 0-(C-1) 
# where C is the total number of classes
# nn.CrossEntropyLoss() is for multi-class classification where 
# each object/data_point belongs to only a single class out of N-classes
# if each data_point/object can belong to more than one class at the same time
# then use Binary Cross Entropy Loss
# for CrossEntropyLoss since there are many classes, we assign integer values


In [70]:
categorical_cross_loss = nn.CrossEntropyLoss()
output = torch.randn(5,10, requires_grad=True)
# the number of values inside of the target label, must match the batch size
# and the individual number values can range between 0-(C-1)
label = torch.tensor([4,1,1,3,7])
loss = categorical_cross_loss(output, label)
print(f"Categorical Loss: {loss.item()}")

Categorical Loss: 3.179534912109375


In [2]:
# nn.BCELoss expects the model outputs to be already probabilites
# meaning before passing outputs to nn.BCELoss, we have to apply sigmoid function
# to squeeze the output ranges from 0 to 1, and then pass these outputs to nn.BCELoss
# however, for nn.BCEWithLogitsLoss, the sigmoid is done automatically
# meaning we can pass the raw model outputs(logits) to the nn.BCEWithLogitsLoss()
# and it will provide loss, and one more thing, the target labels
# individual value range must be either 0 or 1, and nothing more
# because a given data points either belongs to certain class, as 1 if ti does or not which is defined as 0
# and the shape of target labels must be the same as the model outputs

bce_loss = nn.BCELoss()
output = torch.randn(3,4, requires_grad=True)
output = torch.sigmoid(output) # squeezes output in the range of 0 and 1
target_label = torch.tensor([[1.0,1.0,0.0,1.0],[1.0,1.0,0.0,0.0],[1.0,1.0,1.0,0.0]])
loss = bce_loss(output, target_label)
print(f"Binary Cross Entropy Loss: {loss.item()}")

Binary Cross Entropy Loss: 0.7762537002563477


In [15]:
# BCELoss expects the output to be in probability format in the range of 0-1
# hence need to use Sigmoid in order to squeeze the output into range of 0-1
binary_loss = nn.BCELoss()

output = torch.randn(3,4, requires_grad=True)
output = torch.sigmoid(output)
label = torch.tensor([[1,0,0,1],[1,0,1,1],[0,1,1,0]], dtype=torch.float32)

loss = binary_loss(output, label)
print(f"Binary Cross Entropy Loss: {loss}")

Binary Cross Entropy Loss: 0.8383407592773438


In [20]:
# nn.BCEWithLogitsLoss() automatically applies sigmoid function
# to the raw output(logits) of the model, so no need to use sigmoid explicitly
binary_loss = nn.BCEWithLogitsLoss()

output = torch.randn(4,5, requires_grad=True)
label = torch.ones(4,5)

loss = binary_loss(output, label)
print(f"Binary Cross Entropy Loss with Logits: {loss.item()}")

Binary Cross Entropy Loss with Logits: 0.5449265241622925


In [21]:
nn.BCEWithLogitsLoss??

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mBCEWithLogitsLoss[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mweight[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msize_average[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreduce[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreduction[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'mean'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpos_weight[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mBCEWithLogitsLoss[0m[0;34m([0m[0m_Loss[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""This loss combines a `Sigmoid` layer 

In [53]:
# The target label must have the same shape as the output of the mode
# nn.BCELoss() requires the output to be in the range of 0-1, hence use torch.sigmoid before passing
# output to BCELoss()
# nn.BCEWithLogitsLoss() applies sigmoid automatically to the raw output logits
binary_loss = nn.BCEWithLogitsLoss()

output = torch.randn(4,6, requires_grad=True)
# the target label must be either 0 or 1
label = torch.zeros(4,6)

loss = binary_loss(output, label)
print(f"Binary Cross Entropy Loss: {loss.item()}")

Binary Cross Entropy Loss: 1.0016568899154663


In [54]:
from torch import optim

optim.SGD??

[0;31mInit signature:[0m
[0moptim[0m[0;34m.[0m[0mSGD[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mparams[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmomentum[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdampening[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweight_decay[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnesterov[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmaximize[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mforeach[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m 

In [11]:
from torch import optim

optimizer = optim.SGD(model.parameters(), lr=0.01)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# They are the variant of gradient descent
# they update the actual weights and biases based on the calculated gradients/derivatives
# of the loss with respect to all parameters(weights, biases)

In [12]:
optim.SGD?

[0;31mInit signature:[0m
[0moptim[0m[0;34m.[0m[0mSGD[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mparams[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmomentum[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdampening[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweight_decay[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnesterov[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmaximize[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mforeach[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m 

In [14]:
optim.Adam?

[0;31mInit signature:[0m
[0moptim[0m[0;34m.[0m[0mAdam[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mparams[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mIterable[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m[0;34m,[0m [0mIterable[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlr[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbetas[0m[0;34m:[0m [0mTuple[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m][0m [0;34m=[0m [0;34m([0m[0;36m0.9[0m[0;34m,[0m [0;36m0.999[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meps[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m1e-08[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweight_decay[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0

In [22]:
a = torch.randint(0,100, (5,4)).float()
print(a)

tensor([[ 1., 12., 47., 35.],
        [49., 11., 62., 26.],
        [89.,  3., 69., 35.],
        [56., 64., 15., 51.],
        [62., 33., 46., 53.]])


In [43]:
b = torch.randint(6,16, (3,4)).float()
print(b)

tensor([[11., 10., 10.,  6.],
        [11.,  8., 11.,  7.],
        [15., 10.,  7.,  7.]])


In [61]:
import torch
from torch import nn
from torch import optim

class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNet, self).__init__() # initilizes parent class
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.sigmoid(x)
        return x

model = SimpleNet(400, 235, 13)

data = torch.randint(100,400, (56, 400)).float() # the lowest number is 100, and highest is 400, and shape is (56,400)
# will output only integers between these range
label = torch.randint(0,1,(56,13)).float()

optimizer = optim.SGD(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss() # expects output to be in range 0-1, while BCEWithLogitsLoss() converts it automatically

for epoch in range(1000):
    optimizer.zero_grad()
    output = model(data)
    loss = loss_fn(output, label)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, Loss: {loss.item()}")


Epoch: 0, Loss: 48.84001159667969
Epoch: 100, Loss: 8.658561512220331e-08
Epoch: 200, Loss: 8.269275042493973e-08
Epoch: 300, Loss: 7.920038314068734e-08
Epoch: 400, Loss: 7.601553164704455e-08
Epoch: 500, Loss: 7.309704130875616e-08
Epoch: 600, Loss: 7.042604011076037e-08
Epoch: 700, Loss: 6.797645113465478e-08
Epoch: 800, Loss: 6.571800525989602e-08
Epoch: 900, Loss: 6.363453053381818e-08


In [62]:
import os

os.path.join??

[0;31mSignature:[0m [0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0ma[0m[0;34m,[0m [0;34m*[0m[0mp[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mjoin[0m[0;34m([0m[0ma[0m[0;34m,[0m [0;34m*[0m[0mp[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Join two or more pathname components, inserting '/' as needed.[0m
[0;34m    If any component is an absolute path, all previous path components[0m
[0;34m    will be discarded.  An empty last part will result in a path that[0m
[0;34m    ends with a separator."""[0m[0;34m[0m
[0;34m[0m    [0ma[0m [0;34m=[0m [0mos[0m[0;34m.[0m[0mfspath[0m[0;34m([0m[0ma[0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0msep[0m [0;34m=[0m [0m_get_sep[0m[0;34m([0m[0ma[0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0mpath[0m [0;34m=[0m [0ma[0m[0;34m[0m
[0;34m[0m    [0;32mtry[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;32mif[0m [0;32mnot

# Data Handling

In [3]:
import numpy as np

a = np.random.randint(5,10,size=10)
print(a)
b = np.random.randn(2,5)
print(b)

features = np.random.rand(100,10)
labels = np.random.randint(0,5,size=100)
display(features.shape, labels.shape, labels)
assert len(features) == len(labels), "shape no MATCH!" # used for debugging and returns AssertionError if statement is False
display(features.shape, labels.shape, labels)

[9 9 6 7 6 9 6 8 9 5]
[[ 0.27489797  0.79296663 -1.53291725  1.449387    1.31492947]
 [-1.28704841 -1.3266105  -0.67658688 -0.38525619 -1.35918974]]


(100, 10)

(100,)

array([2, 3, 2, 3, 1, 1, 4, 0, 0, 3, 0, 2, 0, 2, 1, 3, 3, 3, 2, 2, 4, 2,
       4, 0, 1, 0, 0, 2, 1, 4, 2, 3, 4, 3, 4, 1, 3, 4, 2, 2, 4, 1, 2, 4,
       2, 4, 0, 1, 4, 4, 3, 3, 2, 3, 3, 4, 0, 4, 3, 4, 4, 1, 1, 0, 1, 3,
       1, 3, 4, 1, 0, 3, 0, 1, 1, 2, 4, 2, 3, 2, 2, 4, 3, 2, 3, 3, 0, 2,
       2, 3, 3, 3, 3, 1, 2, 4, 4, 1, 3, 0])

(100, 10)

(100,)

array([2, 3, 2, 3, 1, 1, 4, 0, 0, 3, 0, 2, 0, 2, 1, 3, 3, 3, 2, 2, 4, 2,
       4, 0, 1, 0, 0, 2, 1, 4, 2, 3, 4, 3, 4, 1, 3, 4, 2, 2, 4, 1, 2, 4,
       2, 4, 0, 1, 4, 4, 3, 3, 2, 3, 3, 4, 0, 4, 3, 4, 4, 1, 1, 0, 1, 3,
       1, 3, 4, 1, 0, 3, 0, 1, 1, 2, 4, 2, 3, 2, 2, 4, 3, 2, 3, 3, 0, 2,
       2, 3, 3, 3, 3, 1, 2, 4, 4, 1, 3, 0])

In [110]:
import torch
from torch.utils.data import Dataset
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        assert len(features) == len(labels)
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(features) # given that both features and labels size must be equal, returning one's size is enough

    def __getitem__(self, idx):
        # get a single data from the sample by index
        feature = self.features[idx]
        label = self.labels[idx]

        # we can take as input list or numpy ndarrays, but we must convert them to tensors and return
        # features and labels as tuple
        sample = (torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.int64))
        return sample

batch_size = 100
input_size = 10
features_data = np.random.rand(batch_size, input_size) # 100 by 10
labels_data = np.random.randint(5,10, size=batch_size)
# look 10 features, corresponding to having only one label
# that is why the shape of labels is (100) basically a row vector, not a matrix
# again even with millions of features, it would have only one single corresponding label

my_dataset = CustomDataset(features_data, labels_data)
print(f"My dataset size: {len(my_dataset)}\n")

first_sample = my_dataset[0]
first_feature, first_label = first_sample

print(f"First sample size: {first_sample}")
print(f"First feature size: {first_feature.shape}")
print(f"First label size: {first_label.shape}")

My dataset size: 100

First sample size: (tensor([0.8712, 0.6464, 0.4607, 0.8589, 0.3803, 0.7976, 0.2634, 0.7944, 0.8359,
        0.8287]), tensor(8))
First feature size: torch.Size([10])
First label size: torch.Size([])


In [125]:
import torch
from torch.utils.data import Dataset
import numpy as np

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        assert len(features) == len(labels) # if AssertionError pops up then we will know that it is because of this
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # for a given index, provide both feature and label of that given index from the sample
        feature = self.features[idx]
        label = self.labels[idx]

        # we have to convert to tensor and return as a tuple of feature and label for given index
        sample = (torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.float32))
        return sample

features_d = np.random.randn(400, 50) # bath size 400, and number of features is 50
labels_d = np.random.randn(400)

# remember, dataset returns a tuple containing feature and label
custom_dataset = CustomDataset(features_d, labels_d)
print(f"Custom dataset size: {len(custom_dataset)}")

feature_1, label_1 = custom_dataset[0]
print(f"First feature shape: {feature_1.shape}")
print(f"First label shape: {label_1.shape}")

for idx, sample in enumerate(custom_dataset):
    print(idx, sample)

Custom dataset size: 400
First feature shape: torch.Size([50])
First label shape: torch.Size([])
0 (tensor([-0.7775,  0.5033,  0.0716,  0.8516,  1.5418, -0.6231, -0.4957,  0.6414,
         0.2658,  0.9195,  0.9075, -2.5162,  1.5511,  0.4663, -0.4269,  0.7292,
         1.2050, -0.8770,  0.5460,  0.5613,  1.1625, -0.1521, -1.1106, -0.4349,
         0.6739,  0.3471,  0.0924, -0.5720, -0.6781, -0.2323,  0.5186, -0.0785,
         0.4389,  0.5458, -1.0737,  0.1319,  0.2427, -2.0302,  0.3010,  1.4198,
         0.9126,  0.2522, -0.5937,  0.5791,  0.0549, -0.2832, -0.5893,  0.1626,
        -0.4605, -0.7168]), tensor(0.8611))
1 (tensor([ 0.4048,  0.5614,  0.5157,  0.3312, -1.2971,  2.5868, -1.7598, -1.3626,
         2.2573, -0.6847, -0.0719,  0.2926,  0.9393,  0.4447,  0.3880, -0.6699,
         0.9073,  0.3172,  1.6562, -0.1892,  1.0904,  1.0268, -2.0419,  0.3942,
        -0.3816, -0.2055,  0.0864, -0.7578, -0.2828, -1.7723,  0.9638,  0.4744,
         0.0063, -1.8630,  0.6693,  0.8392, -1.8720, 

In [20]:
import torch
from torch.utils.data import Dataset
import numpy as np

class CustomData(Dataset):
    def __init__(self, features, labels):
        assert len(features) == len(labels)
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        sample = (torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.float32))
        return sample

inputs = np.random.randint(100,500, size=(560, 43))
labels = np.random.randn(560)

datac = CustomData(inputs, labels)
print(f"Dataset size: {len(datac)}")

input_1, label_1 = datac[0]
print(f"First feature:\n{input_1}\nFirst label: {label_1}")

Dataset size: 560
First feature:
tensor([154., 209., 209., 347., 190., 495., 379., 248., 218., 352., 301., 177.,
        370., 485., 423., 270., 499., 306., 115., 467., 325., 221., 205., 474.,
        308., 102., 298., 332., 455., 466., 199., 357., 178., 310., 306., 315.,
        222., 271., 490., 235., 356., 442., 480.])
First label: 0.6686887145042419


In [2]:
a = torch.nn.parameter.Parameter(torch.tensor(2.0))
print(isinstance(a, torch.nn.parameter.Parameter))
print(isinstance(a, torch.Tensor))

True
True


In [4]:
import torchvision
from torchvision import transforms

# transforms functions, converts PIL image format into PyTorch Tensors
# torchvision datasets usually have images in PIL image format
transformation = transforms.Compose([transforms.ToTensor()])

# Loading train/test dataset
# root defines the root directory where dataset will be saved
# download specifies whether given dataset will be dowloaded
# train=True specifiecs whether it is training set
# transform= specifies if we want to convert PIL image format into PyTorch Tensors

training_dataset = torchvision.datasets.CIFAR10(
                                                root="./dataset_CIFAR10",
                                                train=True,
                                                download=True,
                                                transform=transformation
                                                )

testing_dataset = torchvision.datasets.CIFAR10(
    root="./dataset_CIFAR10",
    train=False,
    transform=transformation,
    download=True
)

# Note that any time you run this code, it will firstly look whether a given dataset
# is present in the root directory, and if not then it will download
# if already present then it will not download the dataset again, avoiding duplication
print(f"\nCIFAR10 training dataset size: {len(training_dataset)}")
print(f"CIFAR10 testing dataset size: {len(testing_dataset)}")

image, label = training_dataset[0] # remember Dataset Class returns a tuple of data point and corresponding label
print(f"\nFirst Image size: {image.size()}")
print(f"First Label size: {label}")

Files already downloaded and verified
Files already downloaded and verified

CIFAR10 training dataset size: 50000
CIFAR10 testing dataset size: 10000

First Image size: torch.Size([3, 32, 32])
First Label size: 6


In [20]:
import torchvision
from torchvision import transforms

# convert PIL images format into PyTorch Tensors
transformation = transforms.Compose([transforms.ToTensor()])

training_set = torchvision.datasets.MNIST(
    root="./data_MNIST",
    train=True,
    transform=transformation,
    download=True
)

testing_set = torchvision.datasets.MNIST(
    root="./data_MNIST",
    train=False,
    transform=transformation,
    download=True
)
# Subsequent runs, will not download the same dataset again,
# it will first check whether a given dataset is present at the root dir
# if and only if not present, then downloads it

print(f"\nMNIST training set size: {len(training_set)}")
print(f"MNIST testing set size: {len(testing_set)}")

# remember that dataset class returns the tuple of data point and respective label
image, label = training_set[0]
print(f"\nFirst image size: {image.shape}")
print(f"First label: {label}")


MNIST training set size: 60000
MNIST testing set size: 10000

First image size: torch.Size([1, 28, 28])
First label: 5


In [23]:
transforms.ToTensor??

[0;31mInit signature:[0m [0mtransforms[0m[0;34m.[0m[0mToTensor[0m[0;34m([0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mToTensor[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Convert a PIL Image or ndarray to tensor and scale the values accordingly.[0m
[0;34m[0m
[0;34m    This transform does not support torchscript.[0m
[0;34m[0m
[0;34m    Converts a PIL Image or numpy.ndarray (H x W x C) in the range[0m
[0;34m    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0][0m
[0;34m    if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)[0m
[0;34m    or if the numpy.ndarray has dtype = np.uint8[0m
[0;34m[0m
[0;34m    In the other cases, tensors are returned without scaling.[0m
[0;34m[0m
[0;34m    .. note::[0m
[0;34m        Because the input image is scaled to [0.0, 1.0], this transformation should not be used when[0m
[0;34

In [24]:
transforms.CenterCrop??

[0;31mInit signature:[0m [0mtransforms[0m[0;34m.[0m[0mCenterCrop[0m[0;34m([0m[0msize[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mCenterCrop[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Crops the given image at the center.[0m
[0;34m    If the image is torch Tensor, it is expected[0m
[0;34m    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.[0m
[0;34m    If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.[0m
[0;34m[0m
[0;34m    Args:[0m
[0;34m        size (sequence or int): Desired output size of the crop. If size is an[0m
[0;34m            int instead of sequence like (h, w), a square crop (size, size) is[0m
[0;34m            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).[0m
[0;34m    """[0m

In [25]:
transforms.Resize??

[0;31mInit signature:[0m
[0mtransforms[0m[0;34m.[0m[0mResize[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msize[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minterpolation[0m[0;34m=[0m[0;34m<[0m[0mInterpolationMode[0m[0;34m.[0m[0mBILINEAR[0m[0;34m:[0m [0;34m'bilinear'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_size[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mantialias[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mResize[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Resize the input image to the given size.[0m
[0;34m    If the image is torch Tensor, it is expected[0m
[0;34m    to have [..., H, W] shape, where ... means a maximum of two leading dimensions[0m
[0;34m[0m
[0;34m    Args:[0m
[0;34m        size (sequence o

In [26]:
transforms.Normalize??

[0;31mInit signature:[0m [0mtransforms[0m[0;34m.[0m[0mNormalize[0m[0;34m([0m[0mmean[0m[0;34m,[0m [0mstd[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mNormalize[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Normalize a tensor image with mean and standard deviation.[0m
[0;34m    This transform does not support PIL Image.[0m
[0;34m    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``[0m
[0;34m    channels, this transform will normalize each channel of the input[0m
[0;34m    ``torch.*Tensor`` i.e.,[0m
[0;34m    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``[0m
[0;34m[0m
[0;34m    .. note::[0m
[0;34m        This transform acts out of place, i.e., it does not mutate the input tensor.[0m
[0;34m[0m
[0;34m    Args:[0m


In [27]:
transforms.Compose??

[0;31mInit signature:[0m [0mtransforms[0m[0;34m.[0m[0mCompose[0m[0;34m([0m[0mtransforms[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mCompose[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Composes several transforms together. This transform does not support torchscript.[0m
[0;34m    Please, see the note below.[0m
[0;34m[0m
[0;34m    Args:[0m
[0;34m        transforms (list of ``Transform`` objects): list of transforms to compose.[0m
[0;34m[0m
[0;34m    Example:[0m
[0;34m        >>> transforms.Compose([[0m
[0;34m        >>>     transforms.CenterCrop(10),[0m
[0;34m        >>>     transforms.PILToTensor(),[0m
[0;34m        >>>     transforms.ConvertImageDtype(torch.float),[0m
[0;34m        >>> ])[0m
[0;34m[0m
[0;34m    .. note::[0m
[0;34m        In order to script the transformations, please use ``torch.nn.Sequential`` as below.[0m
[0;34m[0m
[0;34m        >>> transforms = torch.nn.Sequential([0m


In [28]:
transforms.RandomHorizontalFlip??

[0;31mInit signature:[0m [0mtransforms[0m[0;34m.[0m[0mRandomHorizontalFlip[0m[0;34m([0m[0mp[0m[0;34m=[0m[0;36m0.5[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mRandomHorizontalFlip[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Horizontally flip the given image randomly with a given probability.[0m
[0;34m    If the image is torch Tensor, it is expected[0m
[0;34m    to have [..., H, W] shape, where ... means an arbitrary number of leading[0m
[0;34m    dimensions[0m
[0;34m[0m
[0;34m    Args:[0m
[0;34m        p (float): probability of the image being flipped. Default value is 0.5[0m
[0;34m    """[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m__init__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mp[0m[0;34m=[0m[0;36m0.5[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0msuper[0m[0;34m([0m

In [30]:
transforms.ColorJitter?

[0;31mInit signature:[0m
[0mtransforms[0m[0;34m.[0m[0mColorJitter[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mbrightness[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcontrast[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msaturation[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhue[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mfloat[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;3

In [31]:
transforms.RandomResizedCrop?

[0;31mInit signature:[0m
[0mtransforms[0m[0;34m.[0m[0mRandomResizedCrop[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msize[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscale[0m[0;34m=[0m[0;34m([0m[0;36m0.08[0m[0;34m,[0m [0;36m1.0[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mratio[0m[0;34m=[0m[0;34m([0m[0;36m0.75[0m[0;34m,[0m [0;36m1.3333333333333333[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minterpolation[0m[0;34m=[0m[0;34m<[0m[0mInterpolationMode[0m[0;34m.[0m[0mBILINEAR[0m[0;34m:[0m [0;34m'bilinear'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mantialias[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Crop a random portion of image and resize it to a given size.

If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbi

In [32]:
transforms.RandomRotation?

[0;31mInit signature:[0m
[0mtransforms[0m[0;34m.[0m[0mRandomRotation[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdegrees[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minterpolation[0m[0;34m=[0m[0;34m<[0m[0mInterpolationMode[0m[0;34m.[0m[0mNEAREST[0m[0;34m:[0m [0;34m'nearest'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mexpand[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcenter[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfill[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Rotate the image by angle.
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.

Args:
    degrees (sequence or number): Range of degrees to select from.
        If degrees is a number instead of sequence like (min, max), the range of degrees
        will be (-degrees, +

In [7]:
from torchvision import transforms
import torchvision

# Remember, that any data preprocessing is applied for all data splits train/val/test
# However, data augmentation is only applied to training split, to avoid overfitting and achieve better generalization
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(244),
    transforms.RandomHorizontalFlip(), # data augmentation
    transforms.ToTensor(),
    transforms.Normalize(mean=[2.0,3.9,4.2], std = [1.2,1.4,1.6])
])

test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(244),
    transforms.ToTensor(), # Converts PIL image format or Numpy Arrays to PyTorch Tensors
    transforms.Normalize(mean=[2.0,3.9,4.2], std=[1.2,1.4,1.6])
])

print(train_transform)
print(test_transform)

train_set = torchvision.datasets.MNIST(
    root = "./data_MNIST",
    train=True,
    download=True,
    transform=train_transform
)
test_set = torchvision.datasets.MNIST(
    root = "./data_MNIST",
    train=False,
    download=True,
    transform=test_transform
)

Compose(
    Resize(size=256, interpolation=bilinear, max_size=None, antialias=True)
    RandomCrop(size=(244, 244), padding=None)
    RandomHorizontalFlip(p=0.5)
    ToTensor()
    Normalize(mean=[2.0, 3.9, 4.2], std=[1.2, 1.4, 1.6])
)
Compose(
    Resize(size=256, interpolation=bilinear, max_size=None, antialias=True)
    RandomCrop(size=(244, 244), padding=None)
    ToTensor()
    Normalize(mean=[2.0, 3.9, 4.2], std=[1.2, 1.4, 1.6])
)


In [14]:
from pathlib import Path
from torchvision.datasets import ImageFolder
from torchvision import transforms

train_transform = transforms.Compose([
    transforms.Resize(255),
    transforms.RandomCrop(144),
    transforms.RandomHorizontalFlip(), # data augmentation
    transforms.AutoAugment(), # data augmentation
    transforms.Normalize(mean=[1.2,3.4,4.5], std=[1.4,5.6,7.8])
])

# No data augmentation is applied here
test_transform = transforms.Compose([
    transforms.Resize(255),
    transforms.RandomCrop(144),
    transforms.Normalize(mean=[1.2,3.4,4.5], std=[1.4,5.6,7.8])
])

train_set_path = Path("./data_MNIST")
test_set_path = Path("./data_MNIST")

train_set = ImageFolder(
    root = train_set_path,
    transform=train_transform
)

test_set = ImageFolder(
    root=test_set_path,
    transform=test_transform
)

sample_image, sample_label = train_set[0] # returns a tuple of data and label

FileNotFoundError: Found no valid file for the classes MNIST. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp

In [16]:
from torch.utils.data import Dataset
from PIL import Image

class CustomDataset(Dataset):
    def __init__(self, image_paths, label_paths, transform=None): # by default no transformation is applied
        self.image_paths = image_paths
        self.label_paths =  label_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.label_paths[idx]

        image = Image.open(image_path).convert("RGB") # load the image and convert to RGB 
        # apply preprocessing steps before returning the image and label
        if self.transform:
            image = self.transform(image)

        return tuple(image, label)

custom_train_dataset = CustomDataset(train_path, train_label, transform=train_transform)
custom_test_dataste = CustomDataset(test_path, test_label, transform=test_transform)

NameError: name 'train_path' is not defined

In [None]:
from torchvision.datasets import ImageFolder
from pathlib import Path
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image

train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.ToTensor(),
    transforms.RandomVerticalFlip()
])

test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.ToTensor()
])

train_path = Path("path/to/training/set")
test_path = Path("path/to/testing/set")

train_set = ImageFolder(root=train_path, transform=train_transform)
test_set = ImageFolder(root=test_path, transform=test_transform)

class CustomDataset(Dataset):
    def __init__(self, image_path, label_path, transform=None):
        self.image_path = image_path
        self.label_path = label_path
        self.transform = transform

    def __len__(self):
        return len(self.image_path)

    def __getitem__(self, idx):
        image_path = self.image_path[idx]
        label = self.label_path[idx]

        # preprocessing is applied AFTER capturing individual data points and BEFORE returning it as a tuple
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # after preprocessing is applied, now we can return the organized, normalized dataset
        return tuple(image, label)

custom_train_dataset = CustomDataset(train_path, train_label, transform=train_transform)
custom_test_dataset = CustomDataset(test_path, test_label, transform=test_transform)

In [22]:
from torchvision.datasets import ImageFolder
from pathlib import Path
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image

train_path = Path("path/to/train/set")
test_path = Path("path/to/test/set")

# remember any data preprocessing is applied to all data splits train/val/test
# but data augmentation is applied only to training set
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(244),
    transforms.RandomHorizontalFlip(p=0.6) # data augmentation
])

test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(244)
])

train_set = ImageFolder(root=train_path, transform=train_transform)
test_set = ImageFolder(root=test_path, transform=test_transform)

class CustomDataset(Dataset):
    def __init__(self, image_path, label_path, transform=None):
        self.image_path = image_path
        self.label_path = label_path
        self.transform = transform

    def __len__(self):
        return len(self.image_path)

    def __getitem__(self, idx):
        image_path = self.image_path[idx]
        label = self.label_path[idx]

         # before returning, we need to apply transformations here
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        return tuple(torch.tensor(image), torch.tensor(label))

custom_train_dataset = CustomDataset(train_path, train_label, transform=train_transform)
custom_test_dataset = CustomDataset(test_path, test_label, transform=test_transform)

[0;31mInit signature:[0m [0mPath[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mPath[0m[0;34m([0m[0mPurePath[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""PurePath subclass that can make system calls.[0m
[0;34m[0m
[0;34m    Path represents a filesystem path but unlike PurePath, also offers[0m
[0;34m    methods to do system calls on path objects. Depending on your system,[0m
[0;34m    instantiating a Path will return either a PosixPath or a WindowsPath[0m
[0;34m    object. You can also instantiate a PosixPath or WindowsPath directly,[0m
[0;34m    but cannot instantiate a WindowsPath on a POSIX system or vice versa.[0m
[0;34m    """[0m[0;34m[0m
[0;34m[0m    [0m__slots__[0m [0;34m=[0m [0;34m([0m[0;34m[0m
[0;34m[0m        [0;34m'_accessor'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m)[0m[0;34m[0m
[0;34m[0m[0

In [2]:
from pathlib import Path
p = Path("/")
for subdir in p.iterdir():
    if subdir.is_dir():
        print(subdir)

/home
/usr
/.resolve
/bin
/sbin
/etc
/var
/Library
/System
/private
/.vol
/Users
/Applications
/opt
/dev
/Volumes
/.nofollow
/tmp
/cores


In [5]:
from torchvision.datasets import ImageFolder
ImageFolder??

[0;31mInit signature:[0m
[0mImageFolder[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mroot[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mpathlib[0m[0;34m.[0m[0mPath[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtransform[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mCallable[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtarget_transform[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mCallable[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mloader[0m[0;34m:[0m [0mCallable[0m[0;34m[[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mAny[0m[0;34m][0m [0;34m=[0m [0;34m<[0m[0mfunction[0m [0mdefault_loader[0m [0mat[0m [0;36m0x13ffc7ee0[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mis_valid_file[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mCallable[0m[0;34m[[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mbool[0m[0;34m][

In [16]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, num_samples=200, transform=None): # set default as 200 if not provided
        self.num_samples = num_samples
        self.features = torch.randn(num_samples, 10) #  features
        self.labels = torch.randint(0,2,(num_samples,))
        self.transform = transform

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# create an instance of the Dataset Class
dataset_inst = CustomDataset(num_samples=150)

# now create an instance of the DataLoader class
train_loader = DataLoader(dataset=dataset_inst, batch_size=32, shuffle=True)

print(f"Dataset size: {len(dataset_inst)}")
print(f"DataLoader batch size: {train_loader.batch_size}")

for epoch in range(1):
    print(f"Epoch:{epoch+1}")
    for i, batch in enumerate(train_loader):
        feature, label = batch
        print(f"Batch: {i+1}, Feature:{feature.shape}, Label: {label.shape}")

# Notice that while each batch have the same batch-size, this is not always the case for the last batch
# because total number of samples is not always perfectly divisible by the batch-size
# when this is the case, the last batch will have the remaining samples that will not necessarily have the same batch-size
# If out intention is to keep the uniform distribution of batch-size for each batch, then we can set
# drop_last argument in DataLoader as True which will get rid of the last batch containing the leftover
# sample size which is not equal to the actual batch-size, but if it is equal then the last
# batch will not be dropped, in our case last batch has a size of 22 samples, while the rest 32.

Dataset size: 150
DataLoader batch size: 32
Epoch:1
Batch: 1, Feature:torch.Size([32, 10]), Label: torch.Size([32])
Batch: 2, Feature:torch.Size([32, 10]), Label: torch.Size([32])
Batch: 3, Feature:torch.Size([32, 10]), Label: torch.Size([32])
Batch: 4, Feature:torch.Size([32, 10]), Label: torch.Size([32])
Batch: 5, Feature:torch.Size([22, 10]), Label: torch.Size([22])


In [15]:
print(torch.randint(0,2,(100,))) # row vector

tensor([1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
        0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0])


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, num_samples=0, transform=None): # by default set to 0 and None
        self.num_samples = num_samples
        self.transform = transform
        self.features = torch.arange(num_samples*25).view(num_samples, 25) # 25 features
        self.labels = torch.randint(0,5,(num_samples,)) # a row vector

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

custom_dataset = CustomDataset(num_samples=245)
data_loader = DataLoader(dataset=custom_dataset, batch_size=16, shuffle=True, drop_last=True, num_workers=0)

print(f"Dataset size: {len(custom_dataset)}")
print(f"Batch size: {data_loader.batch_size}\n")

for epoch in range(1):
    print(f"Epoch={epoch+1}")
    for i, batch in enumerate(data_loader):
        feature, label = batch # each batch returns a tuple of feature and label
        print(f"Batch:{i+1}, feature shape:{feature.size()}, label shape:{label.shape}")

# total number of samples divided by the batch size, and the resulting number if the number of iterations

Dataset size: 245
Batch size: 16

Epoch=1
Batch:1, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:2, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:3, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:4, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:5, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:6, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:7, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:8, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:9, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:10, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:11, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:12, feature shape:torch.Size([16, 25]), label shape:torch.Size([16])
Batch:13, feature shape:torch.Size([16, 25]), label shape:torch.Siz

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomData(Dataset):
    def __init__(self, samples_size=0, transform=None):
        self.samples_size = samples_size
        self.transform = transform
        self.features = torch.arange(samples_size*13).reshape(samples_size, 13) # 13 features
        self.labels = torch.randint(0,5,(samples_size,))

    def __len__(self):
        return self.samples_size

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# create instance of the Dataset and DataLoader classes
custom = CustomData(samples_size=256)
# shuffle=True, shuffles the indicies of the data points for each epoch
# hence avoiding overfitting for model to learn the sequences of the example samples
loader = DataLoader(dataset=custom, batch_size=36, shuffle=True, drop_last=False, num_workers=0, pin_memory=True)
# DataLoader provides an iterable interface/abstraction of Dataset Class

print(f"Dataset size: {len(custom)}")
print(f"Loader batch size: {loader.batch_size}\n")

# now iterate through the epoch
for epoch in range(10):
    print("="*20, epoch+1, "="*20)
    for i, batch in enumerate(loader):
        # each batch represent either tuple or list of tuples 
        # in each tuple, there is feature and label
        feature, label = batch
        print(f"Batch:{i+1}, Feature shape: {feature.shape}, Label shape: {label.size()}")
    print()


Dataset size: 256
Loader batch size: 36

Batch:1, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:2, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:3, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:4, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:5, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:6, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:7, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:8, Feature shape: torch.Size([4, 13]), Label shape: torch.Size([4])

Batch:1, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:2, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:3, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:4, Feature shape: torch.Size([36, 13]), Label shape: torch.Size([36])
Batch:5, Feature shape: torch.Size([36, 13]), La

In [27]:
DataLoader?

[0;31mInit signature:[0m
[0mDataLoader[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdataset[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mdata[0m[0;34m.[0m[0mdataset[0m[0;34m.[0m[0mDataset[0m[0;34m[[0m[0;34m+[0m[0m_T_co[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshuffle[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msampler[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mdata[0m[0;34m.[0m[0msampler[0m[0;34m.[0m[0mSampler[0m[0;34m,[0m [0mIterable[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_sampler[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtorch[0m[0;34m.[

In [5]:
from torchvision import transforms
transforms.Normalize?

[0;31mInit signature:[0m [0mtransforms[0m[0;34m.[0m[0mNormalize[0m[0;34m([0m[0mmean[0m[0;34m,[0m [0mstd[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Normalize a tensor image with mean and standard deviation.
This transform does not support PIL Image.
Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
channels, this transform will normalize each channel of the input
``torch.*Tensor`` i.e.,
``output[channel] = (input[channel] - mean[channel]) / std[channel]``

.. note::
    This transform acts out of place, i.e., it does not mutate the input tensor.

Args:
    mean (sequence): Sequence of means for each channel.
    std (sequence): Sequence of standard deviations for each channel.
    inplace(bool,optional): Bool to make this operation in-place.
[0;31mInit docstring:[0m Initialize internal Module state, shared by both nn.Module and ScriptModule.
[0;31mFile:[0m  

In [41]:
a = torch.randint(0,3,(4,5))
print(a)
print(torch.bincount(a.view(-1))) # bincount counts the number of items per each class
# it can only accept a flattened row vector

tensor([[2, 2, 0, 2, 2],
        [1, 2, 1, 1, 2],
        [0, 1, 2, 0, 0],
        [2, 0, 2, 2, 2]])
tensor([ 5,  4, 11])


In [90]:
import numpy as np

b = torch.randint(0,3,(5,5))
c = np.random.randint(0,2,(5,5))
display(b,c)

b,c = b.reshape(-1), c.reshape(-1)
class_b_count = torch.bincount(b)
class_c_count = torch.bincount(torch.tensor(c))
display(class_b_count, class_c_count)

tensor([[0, 1, 2, 1, 0],
        [1, 2, 2, 1, 2],
        [0, 1, 0, 1, 1],
        [1, 2, 1, 0, 2],
        [1, 2, 1, 2, 2]])

array([[0, 0, 0, 1, 0],
       [0, 1, 0, 1, 1],
       [1, 1, 0, 1, 1],
       [0, 1, 1, 1, 1],
       [0, 0, 1, 0, 1]])

tensor([ 5, 11,  9])

tensor([11, 14])

In [30]:
import torch
import numpy as np

a = torch.randint(0,2,(5,5))
b = np.random.randint(0,3,(5,5))

a_ = torch.randint(0,2, (5,))
b_ = np.random.randint(0,3,(5,))

display(a,b,a_,b_)

# torch.bincount() shows the total number of each unique class
a, b, a_, b_ = a.reshape(-1), b.reshape(-1), a_.reshape(-1), b_.reshape(-1)
count_a = torch.bincount(a)
count_a_ = torch.bincount(a_)
count_b = torch.bincount(torch.tensor(b))
count_b_ = torch.bincount(torch.tensor(b_))
display(count_a,count_a_,count_b, count_b_)

tensor([[0, 0, 0, 1, 0],
        [1, 0, 1, 0, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1, 1, 0],
        [1, 0, 0, 0, 0]])

array([[2, 0, 0, 1, 2],
       [1, 1, 1, 1, 1],
       [2, 0, 1, 0, 0],
       [1, 2, 0, 1, 1],
       [2, 2, 0, 2, 2]])

tensor([0, 1, 0, 0, 1])

array([2, 1, 1, 1, 2])

tensor([13, 12])

tensor([3, 2])

tensor([ 7, 10,  8])

tensor([0, 3, 2])

In [45]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms

num_samples = 130
num_features = 15

features = torch.randn(num_samples, num_features) # 15 features and 130 example samples
labels = torch.randint(0,2,(num_samples,)) # size (100) or (100,), in both cases it has a single bracket, hence flattened row vector

class SyntheticDataset(Dataset):
    def __init__(self, features, labels, transform=None):
        # number of samples for features and labels must be equal
        assert features.shape[0] == labels.shape[0] #[0] represents the number of samples while [1] is for number of features
        self.features = features
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return self.features.shape[0] # or self.labels.shape[0], they both equal or at least must be equal

    def __getitem__(self, idx):
        feature_sample = self.features[idx]
        label_sample = self.labels[idx]

        sample = {"feature": feature_sample, "label": label_sample}
        
        if self.transform:
            sample = self.transform(sample)
            
        # implicitly Python treats returning a sequence of values separated by comma as tuples
        return sample["feature"], sample["label"]


feature_mean = features.mean(dim=0)
feature_std = features.std(dim=0)

# In order to avoid division by zero, we replace all 0 to 1
feature_std[feature_std==0] = 1.0

# we are defining a transformation 
class ToTensorType(object):
    # Converts Features to FloatTensor and labels to LongTensor
    def __call__(self, sample):
        feature, label = sample["feature"], sample["label"]
        return {"feature": feature.float(), "label": label.long()}

class NormalizedFeatureVector(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, sample):
        feature, label = sample["feature"], sample["label"]
        normalized_feature = (feature - self.mean) / self.std
        return {"feature": normalized_feature, "label": label}

data_transform = transforms.Compose([
    ToTensorType(),
    NormalizedFeatureVector(mean=feature_mean, std=feature_std)
])


custom_dataset = SyntheticDataset(features, labels, transform=data_transform)

feature, label = custom_dataset[3]
print(f"Sample feature shape: {feature.shape}") 
print(f"Sample label shape: {label.shape}, {label.item()}")
print(f"Dataset length: {len(custom_dataset)}\n")

# look since we are capturing individual data point, its preceding sample number is 1
# and virtually it is the same as not including it all, so shape will represent the number of features
# a single label represents a scalar number, that is why its shape is empty, scalar numbers are 0-Dimensional
# while features shape shows 15, which means 15 numbers arranged in the sequence order just like row vector but with single brackets
# scalar numbers always have the 0-Dimensional shape, so empty shape does not represent an empty value for the label

# DataLoader provides an iterable interface for the Dataset 
loader = DataLoader(dataset=custom_dataset, batch_size=16, shuffle=True, num_workers=0, drop_last=False, pin_memory=True)

feature, label = next(iter(loader))
print(f"Feature shape:{feature.shape}, Label shape:{label.size()}")

# for epoch in range(5):
#     print("="*20, epoch+1, "="*20)
#     for i, batch in enumerate(loader):
#         feature, label = batch
#         print(f"Batch={batch+1}, Feature shape:{feature.shape}, Label shape:{label.size()}")

Sample feature shape: torch.Size([15])
Sample label shape: torch.Size([]), 0
Dataset length: 130

Feature shape:torch.Size([16, 15]), Label shape:torch.Size([16])


# Training Loop

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
a = torch.randn(3,4, device=device)
print(a.device)
a = a.to("cpu")
print(a.device)


mps:0
cpu


In [28]:
!conda list | grep -E "matplotlib"

matplotlib                3.9.4                    pypi_0    pypi
matplotlib-inline         0.1.7            py39hca03da5_0  


In [17]:
# First define packages and basic configurations: imports and hyperparameters
import torch
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader

learning_rate = 0.01 # how much of a step does each gradient take to update weights and biases
num_epoch = 100 # how many times should the model see the entire datasets
batch_size = 16 # how many individual examples of entire dataset will the model process at a time before updating the weights and biases

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Current device: {device}")

Current device: mps


In [18]:
# the actual equation at the end should look like this y = 2*X + 1 + (noise)
# where weight=2 and bias=1, input features=X, noise is a small number added

true_weight = torch.tensor([[2]], dtype=torch.float32)
true_bias = torch.tensor([1], dtype=torch.float32)

# creating a training set
X_train = torch.randn(100,1) * 5 # 100 examples with 1 feature
y_train = true_weight * X_train + true_bias + torch.randn(100,1) * 0.5 # Adding small noise

# creating a separate validation set
X_val = torch.randn(20,1) * 5
y_val = true_weight * X_val + true_bias + torch.randn(20,1) * 0.5

# Creating dataset
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

print(len(train_dataset))
print(len(val_dataset)) 

# Create an iterable interface for the dataset
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True) # only for training the shuffle is True
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False) 
# to keep consistent metrics comparison Shuffle is set to False

100
20


In [19]:
# now define a model, loss function and optimizer, and make sure to move the model to a desired device (CPU or GPU)
# a single layer for this example is enough
model = nn.Linear(1,1)

# now move the model to the desired device
model = model.to(device)
# below checks in what device, parameters reside
print(next(model.parameters()).device)

# define the loss function, since it is regressions problem MSE or MAE will do
loss_fn = nn.MSELoss()

# define how the computed gradients will be used to update weights and biases, provide parameters and learning rate
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# now simply iterate throught the model parameters
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Name: {name}, Parameters: {param.data.squeeze()}, Device:{param.data.device}")

mps:0
Name: weight, Parameters: 0.8020482063293457, Device:mps:0
Name: bias, Parameters: -0.05859243869781494, Device:mps:0


In [20]:
for param in model.parameters():
    print(param.data.device)

mps:0
mps:0


In [21]:
print(next(model.parameters()).data.device)

mps:0


In [22]:
for param in model.parameters():
    print(param.data.squeeze()) # squeeze produces only the actual numerical values of the weights and biases
    # while .device produces the exact device where the parameters reside

tensor(0.8020, device='mps:0')
tensor(-0.0586, device='mps:0')


In [23]:
print("weight device:", model.weight.device)
print("bias device:", model.bias.device)

weight device: mps:0
bias device: mps:0


In [24]:
import time
start_time = time.time()
for epoch in range(num_epoch):
    # set the model to training mode, because some layers behave differently during training and validation modes
    model.train()
    running_loss = 0.0
    num_batch = 0.0
    for idx, (feature, label) in enumerate(train_loader):
        # move input features and target labels to the same device where model parameters reside
        feature = feature.to(device)
        label = label.to(device)

        # now when features/labels and weights/biases are on the same device we can start without RuntimeError
        # run forward pass in order to build up the computational graph and record sequences of operations leading to the final value
        # implicitly passing features into model() activates forward method of the model
        outputs = model(feature)

        # now calculate the loss, of predicted labels against the ground truth labels, the output is a scalar value
        # repeseting the average loss of all the parameters
        # the loss is calculated only for the current batch and not the entire dataset
        loss = loss_fn(outputs, label)

        # before calculating gradients make sure to clear out .grad attribute as to avoid gradient accumulation
        # zero_grad() must be called before or after the loss.backward()
        optimizer.zero_grad()

        # now calculate the gradients, here is the intensive work is done
        loss.backward()

        # now after gradients are calculated, they are stored in .grad attribute of leaf weights/biases with requires_grad=True
        # and we need to use them to update the weights and biases for the current batch
        optimizer.step()

        # calculate the total loss and total number of batches
        running_loss += loss.item() # since it is a scalar value
        num_batch += 1
        # now we can exit the inner batch loop

    average_loss = running_loss / num_batch
    if (epoch+1) % 10 == 0:
        print(f"Epoch: {epoch+1}/{num_epoch}, Loss: {average_loss:.4f}")

# after the two inner loops are finished, the below code will be executed to check speed
end_time = time.time() - start_time
print(f"{end_time:.3f} milli-seconds")

# note that MPS does not support float64 version of tensors
# on MPS 0.766 milli-seconds and 0.129 milli-seconds on CPU, Looks like CPU runs faster than MPS

Epoch: 10/100, Loss: 0.2947
Epoch: 20/100, Loss: 0.2474
Epoch: 30/100, Loss: 0.2299
Epoch: 40/100, Loss: 0.2413
Epoch: 50/100, Loss: 0.2278
Epoch: 60/100, Loss: 0.2352
Epoch: 70/100, Loss: 0.2587
Epoch: 80/100, Loss: 0.2212
Epoch: 90/100, Loss: 0.2251
Epoch: 100/100, Loss: 0.2060
0.776 milli-seconds


In [31]:
# now we do the validation

# since some layers behave different for training and validation sets
# we must explicitly state that we are right now in the validation mode in order to avoid unexpected behaviors of those layers
model.eval()
val_loss = 0.0
num_batch = 0.0

# with no_grad() we make sure that gradients are not computed even if we accidentally call .backward() inside of validation set
with torch.no_grad():
    for feature, label in val_loader:
        # move input features and labels to the same device as the model parameters were defined
        feature = feature.to(device)
        label = label.to(device)

        # make the forward pass
        outputs = model(feature)

        # compute the loss
        loss = loss_fn(outputs, label)

        # that is it pretty much, and no .backward() and no .step() are used since we are not updating the weights/biases
        # we are only checking the accuracy
        val_loss += loss.item() # retuns a scalar loss value 0-Dimensional
        num_batch += 1

average_loss = val_loss / num_batch
print(f"Average validation loss: {average_loss:.4f}")

for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Name: {name}, Parameters: {param.data.squeeze():.4f}")



Average validation loss: 0.4323
Name: weight, Parameters: 1.9747
Name: bias, Parameters: 0.9868


In [35]:
# Saving model checkpoints

# save in .pth extension
path = "checkpoints.pth"
checkpoints = {
    "learning_rate": learning_rate,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict()
}
torch.save(checkpoints, path)

In [49]:
# In order to load the checkpoints we must first create an instance of the model and then call load_state_dict()
# because this function will not recreate an object itself

model = nn.Linear(1,1).to(device)

checkpoints_load = torch.load("checkpoints.pth", weights_only=True, map_location="mps") # we can define where to load if needed

model.load_state_dict(checkpoints_load["model_state_dict"])

# set to evaluation mode, and for this mode we do not need to load the optimizer.load_state_dict()
model.eval()

with torch.no_grad():
    sample_input = torch.tensor([[20.0]])
    # move to the same device as model
    sample_input = sample_input.to(device)
    prediction = model(sample_input)
    print(f"Prediction for input=10: {prediction}") # pretty close to 41

# y = 2*x + 1 , if X=20 then prediction should be close to 41

Prediction for input=10: tensor([[40.4800]], device='mps:0')


In [15]:
for param in model.parameters():
    print(param.grad)

# as of now grad attributes are empty
optimizer.step() # cannot do any updates of the parameters since it does not have gradients
# which tell the sensitivity of the loss value to a small/unit change in the weights/biases

tensor([[-4.3435]], device='mps:0')
tensor([-0.6435], device='mps:0')


In [4]:
# Source - https://stackoverflow.com/questions/76898415/cpu-computation-faster-than-mps-on-pytorch-tensors
# Posted by Kadir
# Retrieved 05/11/2025, License - CC-BY-SA 4.0

import time
import torch

device = "mps"

torch.manual_seed(1234)
TENSOR_A_CPU = torch.rand(50, 50)
TENSOR_B_CPU = torch.rand(50, 50)

torch.manual_seed(1234)
TENSOR_A_MPS = torch.rand(50, 50).to(device)
TENSOR_B_MPS = torch.rand(50, 50).to(device)

start_time = time.time()
torch.matmul(TENSOR_A_CPU, TENSOR_B_CPU)
print("CPU : --- %s seconds ---" % (time.time() - start_time))
cpu_time = time.time() - start_time

start_time = time.time()
torch.matmul(TENSOR_A_MPS, TENSOR_B_MPS)
print("MPS : --- %s seconds ---" % (time.time() - start_time))
mps_time = time.time() - start_time

speed = mps_time / cpu_time
percentage = (1 - (cpu_time/mps_time)) * 100
print(f"CPU performs {speed:.4f} times faster compared to MPS on small operations.")
print(f"CPU performs {percentage:.4f}% faster compared to MPS on small operations.")

CPU : --- 0.00013518333435058594 seconds ---
MPS : --- 0.000560760498046875 seconds ---
CPU performs 2.0913 times faster compared to MPS on small operations.
CPU performs 52.1822% faster compared to MPS on small operations.


In [5]:
# Source - https://stackoverflow.com/questions/76898415/cpu-computation-faster-than-mps-on-pytorch-tensors
# Posted by Kadir
# Retrieved 05/11/2025, License - CC-BY-SA 4.0

import time
import torch

device = "mps"

torch.manual_seed(1234)
TENSOR_A_CPU = torch.rand(5000, 5000)
TENSOR_B_CPU = torch.rand(5000, 5000)

torch.manual_seed(1234)
TENSOR_A_MPS = torch.rand(5000, 5000).to(device)
TENSOR_B_MPS = torch.rand(5000, 5000).to(device)

# Warm-up
for _ in range(100):
    torch.matmul(torch.rand(500,500).to(device), torch.rand(500,500).to(device))
    
start_time = time.time()
torch.matmul(TENSOR_A_CPU, TENSOR_B_CPU)
print("CPU : --- %s seconds ---" % (time.time() - start_time))
cpu_time = time.time() - start_time

start_time = time.time()
torch.matmul(TENSOR_A_MPS, TENSOR_B_MPS)
print("MPS : --- %s seconds ---" % (time.time() - start_time))
mps_time = time.time() - start_time

speed = cpu_time / mps_time
percentage = (1 - (mps_time/cpu_time)) * 100
print(f"MPS performs {speed:.4f} times faster compared to CPU on very complex operations.")
print(f"MPS performs {percentage:.4f}% faster compared to CPU on very complex operations.")

# NOW as the complexity of the model increases, the MPS starts to perform much, much faster compared to CPU on Mac
# as of now MPS performs 1168 times faster than CPU, in the subsequent runs MPS performs 3217 times faster than CPU
# from Percentage perspective MPS performs 99.96% faster compared to CPU on Mac M2 as of now

CPU : --- 0.7261269092559814 seconds ---
MPS : --- 0.00047779083251953125 seconds ---
MPS performs 1435.6715 times faster compared to CPU on very complex operations.
MPS performs 99.9303% faster compared to CPU on very complex operations.


In [90]:
# Training Loop

# setup of imports and hyperparameters
import torch
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader, Dataset

learning_rate = 0.001
num_epoch = 1000
batch_size = 16 

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [94]:
# define the ground truth weights and biases and create synthetic dataset
# y = 5*X+3 + (noise)
true_weight = torch.tensor([[5.0]]) # by default it is float32, and keep it that way since MPS cannot work with float64
true_bias = torch.tensor([3.0])

# create input features and labels for training set
X_train = torch.randn(250, 1) # 250 examples with 1 feature
y_train = true_weight * X_train + true_bias + torch.randn(250,1) * 0.513

# create input features and labels for validation set
X_val = torch.randn(50, 1) # 50 examples with 1 feature
y_val = true_weight * X_val + true_bias + torch.randn(50,1) * 0.634

# we can either use TensorDataset or Dataset as a wrapper for our train/val dataset
class Simpledata(Dataset):
    def __init__(self, features, labels):
        assert features.shape[0] == labels.shape[0] # must have the same number of examples/batches
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset = Simpledata(X_train, y_train)
val_dataset = Simpledata(X_val, y_val)

# now wrap around DataLoader for batching and shuffling
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False) # because we want to keep consistent metrics comparison


In [95]:
# Now we can define model, optimizer and loss(criterion, objective, cost) function

model = nn.Linear(in_features=1, out_features=1)# weights and biases are randomly initiliazed in the model
# move model to the given device for faster computations
model = model.to(device)

criterion = nn.MSELoss()

optimizer = optim.SGD(model.parameters(), lr=learning_rate)

for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Name:{name}, Parameters:{param.data.squeeze():.5f}, Device:{param.data.device}")

Name:weight, Parameters:-0.22189, Device:mps:0
Name:bias, Parameters:-0.12961, Device:mps:0


In [98]:
import time

start_time = time.time()
for epoch in range(num_epoch):
    # set the mode to training
    model.train()
    running_loss = 0.0
    num_batch = 0.0

    for idx, batch in enumerate(train_loader):
        # empty gradients from the previous batch to avoid gradient accumulation
        optimizer.zero_grad()
        
        # unpack the current batch
        feature, label = batch
        # move them to the same device as model parameters
        feature = feature.to(device)
        label = label.to(device)
        
        outputs = model(feature)
        
        loss = criterion(outputs, label) # loss is computed only for the current batch

        loss.backward()

        optimizer.step() # now using gradients update weights and biases for the current batch

        running_loss += loss
        num_batch += 1

    average_loss = running_loss / num_batch
    if (epoch+1) % 20 == 0:
        print(f"Epoch:{epoch+1}/{num_epoch}, Average Loss for the current batch: {average_loss:.4f}")

end_time = time.time() - start_time
print(f"Finished in {end_time:.4f} seconds.")

Epoch:20/1000, Average Loss for the current batch: 0.2853
Epoch:40/1000, Average Loss for the current batch: 0.2771
Epoch:60/1000, Average Loss for the current batch: 0.2872
Epoch:80/1000, Average Loss for the current batch: 0.2754
Epoch:100/1000, Average Loss for the current batch: 0.2784
Epoch:120/1000, Average Loss for the current batch: 0.2804
Epoch:140/1000, Average Loss for the current batch: 0.2831
Epoch:160/1000, Average Loss for the current batch: 0.2879
Epoch:180/1000, Average Loss for the current batch: 0.2760
Epoch:200/1000, Average Loss for the current batch: 0.2747
Epoch:220/1000, Average Loss for the current batch: 0.2773
Epoch:240/1000, Average Loss for the current batch: 0.2776
Epoch:260/1000, Average Loss for the current batch: 0.2815
Epoch:280/1000, Average Loss for the current batch: 0.2772
Epoch:300/1000, Average Loss for the current batch: 0.2790
Epoch:320/1000, Average Loss for the current batch: 0.2765
Epoch:340/1000, Average Loss for the current batch: 0.2808
E

In [107]:
# now performs the validation on the validation set
# since some layers behave differently in training and validation mode set mode to eval
model.eval()
val_loss = 0.0
num_batch = 0.0

with torch.no_grad(): # even if we accidentally call .backward() then no gradients will be computed, hence much faster performance
    for feature, label in val_loader:
        feature = feature.to(device)
        label = label.to(device)

        outputs = model(feature)
        loss = criterion(outputs, label) # predicted vs ground truth
        # that is it
        val_loss += loss
        num_batch += 1

average_loss = val_loss / num_batch
print(f"Validation loss: {average_loss:.4f}")

for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Name: {name}, Calculated Gradients in .grad attribute: {param.grad.squeeze()}")

Validation loss: 0.5019
Name: weight, Calculated Gradients in .grad attribute: -1.1735316514968872
Name: bias, Calculated Gradients in .grad attribute: 0.6433367133140564


In [119]:
# saving checkpoints

checkpoints = {
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "model_state_dict": model.state_dict(), # contains model parameters (weights and biases)
    "optimizer_state_dict": optimizer.state_dict() # contains details related to Optimization algorithms
}

torch.save(checkpoints, "checkpoints1.pth") # define what you want to save and where 
# note that since we performed operations on MPS, so if we try to load checkpoints
# with torch.load then it will try to load them into MPS, but if device is not available then will throw error
# if we do not have that device then we can use map_location and set the "cpu" if needed

print(model.state_dict())
print(optimizer.state_dict())

OrderedDict([('weight', tensor([[5.0208]], device='mps:0')), ('bias', tensor([2.9789], device='mps:0'))])
{'state': {}, 'param_groups': [{'lr': 0.001, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}


In [133]:
# loading the checkpoints, but first we must create an instance of the model itself
# because load_state_dict() will not recreate an instance/object of the model

model = nn.Linear(1,1).to(device) #must be in the same device

checkpoints = torch.load("checkpoints1.pth", weights_only=True)
# it is just a dictionary
print(checkpoints)

model.load_state_dict(checkpoints["model_state_dict"])
optimizer.load_state_dict(checkpoints["optimizer_state_dict"])
# since we are now using it for evaluation and not continuining training, then we do not need to load optimizer, 
# but the sake of example it is included here

# again some layers behave differently in training and validation modes so set explicitly to .eval during validation
model.eval()

# y = 5*X + 3, with X=15 then prediction should 78
with torch.no_grad():
    inputs = torch.tensor([[15.0]])
    inputs = inputs.to(device)
    pred = model(inputs)
    print(f"\nPrediction on input= 15.0, Result:{pred}") # pretty close to 78

{'learning_rate': 0.001, 'batch_size': 16, 'model_state_dict': OrderedDict([('weight', tensor([[5.0208]], device='mps:0')), ('bias', tensor([2.9789], device='mps:0'))]), 'optimizer_state_dict': {'state': {}, 'param_groups': [{'lr': 0.001, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'differentiable': False, 'fused': None, 'params': [0, 1]}]}}

Prediction on input= 15.0, Result:tensor([[78.2916]], device='mps:0')


In [1]:
import pandas as pd

In [2]:
DEFAULT_SIGNS = [
    'uy', 'maktab', "ko'cha", 'eshik', 'stol', 'stul', 'karavot', 'mashina', 'poezd',
    'metro', 'kema', "ko'prik", "yomg'ir", 'qor', 'shamol', 'qish', 'bahor', 'kun',
    'iltimos', 'assalomu_alaykum', 'kechirasiz', 'yaxshi', 'tez', 'birga', 'katta',
    'kichik', 'yangi', 'mehribon', 'ovqat_tayyorlash', 'yozish', 'yordam_berish', "o'ynash", 'sayr_qilish',
    'qidirish', "yo'qotish", 'jismoniy_tarbiya', 'turish', 'ketish', 'olib_kelish', 'ochish',
    'yopish', "yorug'", "qorong'i", 'toza', 'televizor', "to'la", "bo'sh", 'oxiri',
    'boshlanishi', 'kech', 'tez_orada', 'restoran', 'zavod', 'cherkov', 'bozor', 'shokolad',
    'mehmonxona', 'kitob', "qog'oz", 'parda', 'kiyim', 'oyoq_kiyim', 'paypoq', "qo'lqop",
    'bosh_kiyim', 'bank_kartasi', 'non', 'likopcha', 'muzlatkich', 'internet', 'musiqa',
    'javob', 'yer', "o't", 'tosh', 'it', 'mushuk', 'sigir', 'ot', "qo'y",
    "cho'chqa", 'kartoshka', 'sabzi', 'karam', 'pomidor', 'bodring', 'sarimsoq', "qo'ziqorin",
    "sariyog'", 'stakan', 'futbol', "o'qiyman", "o'simlik_yog'i", 'sovun', 'yostiq',
    'quyon', 'tozalayman', 'topaman', 'kir', 'ryukzak'
]

In [3]:
top_100_words = ['дом', 'школа', 'улица', 'дверь', 'стол', 'стул', 'кровать', 'машина', 'поезд', 
                 'метро', 'корабль', 'мост', 'дождь', 'снег', 'ветер', 'зима', 'весна', 'день', 
                 'пожалуйста', 'здравствуйте', 'извините', 'нормально', 'быстро', 'вместе', 'большой', 
                 'маленький', 'новый', 'добрый', 'готовить', 'писать', 'помогать', 'играть', 'гулять', 
                 'искать', 'терять', 'физкультура', 'вставать', 'уходить', 'приносить', 'открывать', 
                 'закрывать', 'светлый', 'тёмный', 'чистый', 'телевизор', 'полный', 'пустой', 'конец', 
                 'начало', 'поздно', 'скоро', 'ресторан', 'завод', 'церковь', 'базар', 'шоколад', 
                 'гостиница', 'книга', 'бумага', 'занавеска', 'одежда', 'обувь', 'носки', 'перчатки', 
                 'шапка', 'банковская_карта', 'хлеб', 'тарелка', 'холодильник', 'интернет', 'музыка', 
                 'ответ', 'земля', 'трава', 'камень', 'собака', 'кошка', 'корова', 'лошадь', 'овца', 
                 'свинья', 'картошка', 'морковь', 'капуста', 'помидор', 'огурец', 'чеснок', 'грибы', 
                 'сливочное_масло', 'стакан', 'футбол', 'читаю', 'подсолнечное_масло', 'мыло', 'подушка', 
                 'кролик', 'убираю', 'нахожу', 'грязный', 'рюкзак']


In [5]:
mat= []
for uzb_idx, uzb_word in enumerate(DEFAULT_SIGNS, start=1):
    for rus_idx, rus_word in enumerate(top_100_words, start=1):
        if uzb_idx == rus_idx:
            print(f"{uzb_idx}. {uzb_word} ({rus_word})");\
            mat.append(f"{uzb_idx}. {uzb_word} ({rus_word})")

1. uy (дом)
2. maktab (школа)
3. ko'cha (улица)
4. eshik (дверь)
5. stol (стол)
6. stul (стул)
7. karavot (кровать)
8. mashina (машина)
9. poezd (поезд)
10. metro (метро)
11. kema (корабль)
12. ko'prik (мост)
13. yomg'ir (дождь)
14. qor (снег)
15. shamol (ветер)
16. qish (зима)
17. bahor (весна)
18. kun (день)
19. iltimos (пожалуйста)
20. assalomu_alaykum (здравствуйте)
21. kechirasiz (извините)
22. yaxshi (нормально)
23. tez (быстро)
24. birga (вместе)
25. katta (большой)
26. kichik (маленький)
27. yangi (новый)
28. mehribon (добрый)
29. ovqat_tayyorlash (готовить)
30. yozish (писать)
31. yordam_berish (помогать)
32. o'ynash (играть)
33. sayr_qilish (гулять)
34. qidirish (искать)
35. yo'qotish (терять)
36. jismoniy_tarbiya (физкультура)
37. turish (вставать)
38. ketish (уходить)
39. olib_kelish (приносить)
40. ochish (открывать)
41. yopish (закрывать)
42. yorug' (светлый)
43. qorong'i (тёмный)
44. toza (чистый)
45. televizor (телевизор)
46. to'la (полный)
47. bo'sh (пустой)
48. oxiri 

In [7]:
for word in mat:
    print(word)

1. uy (дом)
2. maktab (школа)
3. ko'cha (улица)
4. eshik (дверь)
5. stol (стол)
6. stul (стул)
7. karavot (кровать)
8. mashina (машина)
9. poezd (поезд)
10. metro (метро)
11. kema (корабль)
12. ko'prik (мост)
13. yomg'ir (дождь)
14. qor (снег)
15. shamol (ветер)
16. qish (зима)
17. bahor (весна)
18. kun (день)
19. iltimos (пожалуйста)
20. assalomu_alaykum (здравствуйте)
21. kechirasiz (извините)
22. yaxshi (нормально)
23. tez (быстро)
24. birga (вместе)
25. katta (большой)
26. kichik (маленький)
27. yangi (новый)
28. mehribon (добрый)
29. ovqat_tayyorlash (готовить)
30. yozish (писать)
31. yordam_berish (помогать)
32. o'ynash (играть)
33. sayr_qilish (гулять)
34. qidirish (искать)
35. yo'qotish (терять)
36. jismoniy_tarbiya (физкультура)
37. turish (вставать)
38. ketish (уходить)
39. olib_kelish (приносить)
40. ochish (открывать)
41. yopish (закрывать)
42. yorug' (светлый)
43. qorong'i (тёмный)
44. toza (чистый)
45. televizor (телевизор)
46. to'la (полный)
47. bo'sh (пустой)
48. oxiri 

In [15]:
empty_column = [k*0*"" for k in range(1,101)]
empty_column

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [9]:
pd.DataFrame??

[0;31mInit signature:[0m
[0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0;34m'Axes | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0;34m'Axes | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0;34m'Dtype | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy[0m[0;34m:[0m [0;34m'bool | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'None'[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mDataFrame[0m[0;34m([0m[0mNDFrame[0m[0;34m,[0m [0mOpsMixin[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""[0m
[0;34m    Two-dimensional, size-mutable, potentially heterogeneous tabular data.[0m
[0;34m

In [52]:


df = pd.DataFrame({"So'zlar": mat, "Signer02": empty_column, "Signer03": empty_column, "Signer04": empty_column, "Signer05": empty_column\
                  , "Signer06": empty_column, "Signer07": empty_column, "Signer08": empty_column, "Signer09": empty_column, \
                   "Signer10": empty_column
                  })

#df = df.set_index("So'zlar")

In [53]:
df

Unnamed: 0,So'zlar,Signer02,Signer03,Signer04,Signer05,Signer06,Signer07,Signer08,Signer09,Signer10
0,1. uy (дом),,,,,,,,,
1,2. maktab (школа),,,,,,,,,
2,3. ko'cha (улица),,,,,,,,,
3,4. eshik (дверь),,,,,,,,,
4,5. stol (стол),,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
95,96. quyon (кролик),,,,,,,,,
96,97. tozalayman (убираю),,,,,,,,,
97,98. topaman (нахожу),,,,,,,,,
98,99. kir (грязный),,,,,,,,,


In [54]:
df.to_excel("output.xlsx", engine='xlsxwriter')

In [47]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.9
