In [2]:
import torch
torch.__version__

'2.4.0+cu121'

In [3]:
import torch
torch.cuda.is_available()

True

In [5]:
import torch
tensor0d = torch.tensor(1)  # 0-D tensor (scalar) built from a Python integer
tensor1d = torch.tensor([1, 2, 3])  # 1-D tensor (vector) built from a flat list
tensor2d = torch.tensor([[1, 2], [3, 4]])  # 2-D tensor (matrix) built from a nested list
tensor3d = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # 3-D tensor built from a doubly nested list

In [11]:
# tensor types
# a tensor built from Python integers prints dtype torch.int64
tensor1d = torch.tensor([1, 2, 3])
print(tensor1d.dtype)

# a tensor built from Python floats prints dtype torch.float32
float_tensor = torch.tensor([1.0, 2.0, 3.0])
print(float_tensor.dtype)

# .to() converts an existing tensor to the requested dtype (here int64 -> float32)
float_tensor = tensor1d.to(torch.float32)
print(float_tensor.dtype)

torch.int64
torch.float32
torch.float32


In [19]:
# a 2x3 integer matrix used to demonstrate shape manipulation
tensor2d = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(tensor2d)
print(tensor2d.shape)
# reshape and view both print the same 3x2 arrangement of the six values here
print(tensor2d.reshape(3, 2))
print(tensor2d.view(3, 2))
# .T is the transpose (rows and columns swapped)
print(tensor2d.T)
# matrix multiply: .matmul() and the @ operator print the same 2x2 product
print(tensor2d.matmul(tensor2d.T))
print(tensor2d @ tensor2d.T)


tensor([[1, 2, 3],
        [4, 5, 6]])
torch.Size([2, 3])
tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 4],
        [2, 5],
        [3, 6]])
tensor([[14, 32],
        [32, 77]])
tensor([[14, 32],
        [32, 77]])


In [21]:
import torch.nn.functional as F

# Forward pass of a single-unit logistic-regression classifier on one example.
y = torch.tensor([1.0])  # true label
x1 = torch.tensor([1.1]) # input feature
w1 = torch.tensor([2.2]) # weight parameter
b = torch.tensor([0.0])  # bias unit

z = x1 * w1 + b          # net input
a = torch.sigmoid(z)     # activation & output

# binary cross-entropy between the predicted probability a and the label y
loss = F.binary_cross_entropy(a, y)
print(loss)

tensor(0.0852)


In [27]:
import torch.nn.functional as F
from torch.autograd import grad

# Same forward pass as in the previous cell, but the parameters are created
# with requires_grad=True so that gradients can be computed with respect to them.
y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = x1 * w1 + b 
a = torch.sigmoid(z)

loss = F.binary_cross_entropy(a, y)

# grad() returns a tuple of gradients (see the printed output);
# retain_graph=True keeps the graph so that loss can be differentiated again.
grad_L_w1 = grad(loss, w1, retain_graph=True)
grad_L_b = grad(loss, b, retain_graph=True)

print(grad_L_w1)
print(grad_L_b)

(tensor([-0.0898]),)
(tensor([-0.0817]),)


In [28]:
# PyTorch takes care of the calculus for us via the .backward method:
# after the call, the gradients are available in the .grad attribute of w1 and b.
# NOTE: this reuses `loss` from the previous cell, which was differentiated
# there with retain_graph=True.
loss.backward()

print(w1.grad)  # same value as the grad(loss, w1) result printed earlier
print(b.grad)   # same value as the grad(loss, b) result printed earlier

tensor([-0.0898])
tensor([-0.0817])


In [33]:
class NeuralNetwork(torch.nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    Architecture: num_inputs -> 30 -> 20 -> num_outputs. The forward pass
    returns the raw outputs of the last linear layer (logits); no softmax
    is applied inside the model.
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        nn = torch.nn

        # The linear layers are created in network order (input side first).
        first_hidden = nn.Linear(num_inputs, 30)    # n -> 30 nodes
        second_hidden = nn.Linear(30, 20)           # 30 -> 20 nodes
        output_layer = nn.Linear(20, num_outputs)   # 20 -> output nodes

        # Sequential positions: 0 Linear, 1 ReLU, 2 Linear, 3 ReLU, 4 Linear
        self.layers = nn.Sequential(
            first_hidden,
            nn.ReLU(),
            second_hidden,
            nn.ReLU(),
            output_layer,
        )

    def forward(self, x):
        # run the input through the whole stack and hand back the logits
        return self.layers(x)
    
# instantiate the network with 50 inputs and 3 outputs, then show its layer structure
model = NeuralNetwork(50, 3)
print(model)


NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)


In [46]:
# count only the parameters that are marked as trainable (requires_grad=True)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of trainable model parameters:", num_params)
# first layer: its weight matrix has shape (out_features, in_features) = (30, 50)
print(model.layers[0].weight)
print(model.layers[0].weight.shape)
print(model.layers[0].bias) # bias vector: one bias value per output node (30 values)

Total number of trainable model parameters: 2213
Parameter containing:
tensor([[ 0.1081,  0.1174, -0.0331,  ...,  0.0253,  0.0718, -0.0862],
        [-0.1400, -0.0546, -0.1085,  ..., -0.0477, -0.0501, -0.1368],
        [-0.0810,  0.0353, -0.0187,  ...,  0.1142,  0.1288, -0.1121],
        ...,
        [-0.0031, -0.0573,  0.0515,  ...,  0.0271, -0.0928, -0.1175],
        [-0.0444, -0.1318, -0.0660,  ...,  0.0647, -0.1230, -0.0531],
        [ 0.0023, -0.1223,  0.0797,  ...,  0.0369,  0.0862,  0.1328]],
       requires_grad=True)
torch.Size([30, 50])
Parameter containing:
tensor([ 0.1205, -0.0081,  0.0274, -0.0846,  0.0547, -0.1356,  0.0667,  0.0259,
         0.0435,  0.0518, -0.0649,  0.1214,  0.0675,  0.0929,  0.0187,  0.0888,
         0.0952,  0.0641,  0.1077, -0.0257,  0.0930,  0.0326,  0.0851,  0.1377,
         0.0320, -0.0525, -0.0141,  0.0398,  0.0238, -0.0386],
       requires_grad=True)


In [47]:
# In deep learning, initializing model weights with small random numbers is
# desired to break symmetry during training -- otherwise, the nodes would just
# be performing the same operations and updates during backpropagation, which
# would not allow the network to learn complex mappings from inputs to outputs.
# Seeding the random number generator first makes the random initial weights
# reproducible from run to run.
torch.manual_seed(42)

model = NeuralNetwork(50, 3)
print(model.layers[0].weight)

Parameter containing:
tensor([[ 0.1081,  0.1174, -0.0331,  ...,  0.0253,  0.0718, -0.0862],
        [-0.1400, -0.0546, -0.1085,  ..., -0.0477, -0.0501, -0.1368],
        [-0.0810,  0.0353, -0.0187,  ...,  0.1142,  0.1288, -0.1121],
        ...,
        [-0.0031, -0.0573,  0.0515,  ...,  0.0271, -0.0928, -0.1175],
        [-0.0444, -0.1318, -0.0660,  ...,  0.0647, -0.1230, -0.0531],
        [ 0.0023, -0.1223,  0.0797,  ...,  0.0369,  0.0862,  0.1328]],
       requires_grad=True)


In [57]:
# grad_fn=<AddmmBackward0> in the first printout represents the last-used
# function to compute a variable in the computational graph
torch.manual_seed(123)
X = torch.rand((1, 50))  # one random example with 50 features
out = model(X)
print(out)

# inside torch.no_grad() the output carries no grad_fn (see the second printout)
with torch.no_grad():
    out = model(X)
print(out)

# turn the outputs into a probability vector (softmax along dim=1, the class dimension)
with torch.no_grad():
    out = torch.softmax(model(X), dim=1)
print(out)



tensor([[ 0.1019, -0.0396, -0.1432]], grad_fn=<AddmmBackward0>)
tensor([[ 0.1019, -0.0396, -0.1432]])
tensor([[0.3772, 0.3275, 0.2953]])


### Implementing a data loader

In [60]:
# create two toy datasets: five training examples and two test examples,
# each with two features and a class label (0 or 1)

X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])

# one label per row of X_train
y_train = torch.tensor([0, 0, 0, 1, 1])

X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],
])

# one label per row of X_test
y_test = torch.tensor([0, 1])

In [62]:
from torch.utils.data import Dataset


class ToyDataset(Dataset):
    """Dataset that pairs each feature row with the label at the same index.

    Provides the three methods used by this notebook's datasets:
    ``__init__``, ``__getitem__`` and ``__len__``.
    """

    def __init__(self, X, y):
        # X: our data; y: the label for each data vector
        # (e.g. features=[-1.2, 3.1] -> label=0)
        self.features, self.labels = X, y

    def __getitem__(self, index):
        # hand back the (features, label) pair stored at position `index`
        return self.features[index], self.labels[index]

    def __len__(self):
        # the dataset length is the size of the labels' first dimension
        num_examples = self.labels.shape[0]
        return num_examples

# wrap the toy tensors in ToyDataset objects and check the training set size
train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)
len(train_ds)

5

In [69]:
from torch.utils.data import DataLoader

# seed so that the shuffled batch order below is reproducible
torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2, # group the 5 training examples into batches of 2
    shuffle=True,
    # for Jupyter notebooks, setting num_workers to greater than 0
    # can sometimes lead to issues related to the sharing of resources between
    # different processes, resulting in errors or notebook crashes
    num_workers=0,
    drop_last=True # drop the last batch because it has only 1 example instead of 2
)


In [70]:
# iterate through the data loader: each step yields one batch of features and labels
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y) # prints the batch's inputs and labels

Batch 1: tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2: tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])


### Training loop demonstration

In [79]:
import torch.nn.functional as F


# seed so that the model's random initial weights are reproducible
torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)
# stochastic gradient descent optimizer with learning rate 0.5
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epochs = 3

for epoch in range(num_epochs):
    
    # TRAINING MODE
    model.train() 
    for batch_idx, (features, labels) in enumerate(train_loader):

        # forward pass: raw outputs (logits) for the current batch
        logits = model(features)
        
        loss = F.cross_entropy(logits, labels) # Loss function
        
        # clear the old gradients, backpropagate, then update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        ### LOGGING
        # the value printed is the loss of the current training batch;
        # batch_idx is zero-based
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
              f" | Train/Val Loss: {loss:.2f}")
    
    # EVALUATION MODE
    model.eval()
    # Optional model evaluation

Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75
Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65
Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44
Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13
Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03
Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00


In [80]:
# Exercise A.3
# How many parameters does the neural network introduced at the beginning of
# this section have?

# a fresh network with 2 inputs and 2 outputs
model_test = NeuralNetwork(2, 2)

# (2*30 + 30) + (30*20 + 20) + (20*2 + 2) = 752 trainable parameters
num_params = sum(p.numel() for p in model_test.parameters() if p.requires_grad)
print("Total number of trainable model parameters:", num_params)

Total number of trainable model parameters: 752


In [82]:
# make predictions
model.eval()

with torch.no_grad():
    outputs = model(X_train)

# output tensors: the raw values at the two outputs of the neural network,
# one row per training example
print(outputs)

# normalize each row into class probabilities
torch.set_printoptions(sci_mode=False)  # print plain decimals instead of scientific notation
probabilities = torch.softmax(outputs, dim=1)
print(probabilities)

# Here, the first row means that the first training example has a 99.91%
# probability of belonging to class 0 and a 0.09% probability of belonging
# to class 1

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])
tensor([[    0.9991,     0.0009],
        [    0.9982,     0.0018],
        [    0.9949,     0.0051],
        [    0.0491,     0.9509],
        [    0.0307,     0.9693]])


In [86]:
# now, return the index of the highest probability in each row (the predicted class)
predictions = torch.argmax(probabilities, dim=1)
print(predictions)

# argmax can also be applied directly to the non-normalized outputs;
# the printed predictions are the same
predictions = torch.argmax(outputs, dim=1)
print(predictions)


tensor([0, 0, 0, 1, 1])
tensor([0, 0, 0, 1, 1])


In [87]:
# check, element by element, whether our predictions match the training labels
# (a bare expression is only displayed when it is the last line of a cell,
# so both results are printed explicitly)
matches = predictions == y_train
print(matches)
# number of correct predictions
print(torch.sum(matches))

tensor([True, True, True, True, True])

In [89]:
# function to compute the accuracy on whichever dataloader is passed in (training or test data)
def compute_accuracy(model, dataloader):
    """Return the fraction of examples in ``dataloader`` that ``model`` classifies correctly.

    Works for any loader (training or test data). The model is switched to
    evaluation mode and left in that mode. Each batch is moved to the device
    of the model's parameters before the forward pass, so the function also
    works when the model lives on a GPU while the loader yields CPU tensors.

    Args:
        model: a ``torch.nn.Module`` whose forward pass returns one row of
            class scores (logits) per example.
        dataloader: an iterable of ``(features, labels)`` tensor batches.

    Returns:
        The accuracy as a Python float between 0.0 and 1.0, or ``nan`` if the
        dataloader yields no examples.
    """
    model = model.eval()

    # A model without parameters has no device of its own; in that case the
    # batches are used where they already are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    correct = 0
    total_examples = 0

    with torch.no_grad():
        for features, labels in dataloader:
            if device is not None:
                features, labels = features.to(device), labels.to(device)

            logits = model(features)
            # the predicted class is the index of the largest score in each row
            predictions = torch.argmax(logits, dim=1)
            compare = labels == predictions
            correct += int(compare.sum().item())
            total_examples += len(compare)

    if total_examples == 0:
        # nothing was evaluated, so the accuracy is undefined
        return float("nan")

    return correct / total_examples

# NOTE: train_loader was created with drop_last=True, so the incomplete final
# batch is skipped and not every training example is evaluated here
print(compute_accuracy(model, train_loader))


1.0


In [92]:
# saving the model: the model's state_dict is written to the file "model.pth"
torch.save(model.state_dict(), "model.pth")


In [91]:
# loading the model in the future
model = NeuralNetwork(2, 2) # needs to match the original model's architecture exactly
# read the saved state_dict back from disk and copy it into the fresh model
model.load_state_dict(torch.load("model.pth", weights_only=True))

<All keys matched successfully>

End of PyTorch basics.

Rest of this notebook:
## Using GPUs to optimize training

In [95]:
import torch

print(torch.__version__)
print(torch.cuda.is_available())


2.4.0+cu121
True


In [97]:
# two small float tensors; no device is specified when they are created
tensor_1 = torch.tensor([1., 2., 3.])
tensor_2 = torch.tensor([4., 5., 6.])

# adding is carried out on the CPU by default:
print(tensor_1 + tensor_2)

tensor([5., 7., 9.])


In [99]:
# adding on cuda: move both tensors to the GPU first; the printed result
# then reports device='cuda:0'
tensor_1 = tensor_1.to("cuda")
tensor_2 = tensor_2.to("cuda")

print(tensor_1 + tensor_2)

tensor([5., 7., 9.], device='cuda:0')


In [103]:
# trying to add tensors that live on different devices raises a RuntimeError;
# the error is caught and its message printed so that this demonstration does
# not stop a "Restart & Run All" of the notebook
tensor_1 = tensor_1.to("cpu")
try:
    print(tensor_1 + tensor_2)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

Now train the previous model on the GPU instead of the CPU.

In [105]:
# toy dataset: five training examples and two test examples,
# each with two features and a class label (0 or 1)
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])

# one label per row of X_train
y_train = torch.tensor([0, 0, 0, 1, 1])

X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],
])

# one label per row of X_test
y_test = torch.tensor([0, 1])

In [107]:
from torch.utils.data import Dataset

# dataset module:
class ToyDataset(Dataset):
    """Dataset over a features container and a labels container indexed in parallel."""

    def __init__(self, X, y):
        # keep references to the features and to their labels
        self.features, self.labels = X, y

    def __getitem__(self, index):
        # one example = the features and the label found at the same index
        example = (self.features[index], self.labels[index])
        return example

    def __len__(self):
        # size of the labels' first dimension
        label_shape = self.labels.shape
        return label_shape[0]

# wrap the toy tensors in ToyDataset objects for the data loaders
train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)

In [109]:
# make dataloader:
from torch.utils.data import DataLoader

# seed so that the shuffled batch order is reproducible
torch.manual_seed(123)

# NOTE(review): both loaders use num_workers=1, while the earlier loader in this
# notebook used num_workers=0 and warned that values greater than 0 can cause
# problems in Jupyter -- confirm that this is intended
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=1,
    drop_last=True
)

# the test loader keeps the examples in their original order (shuffle=False)
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False,
    num_workers=1
)

In [111]:
# neural network:
class NeuralNetwork(torch.nn.Module):
    """Multilayer perceptron: num_inputs -> 30 -> 20 -> num_outputs with ReLU in between.

    ``forward`` returns the raw outputs of the final linear layer (logits).
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        # modules listed in the order in which the input flows through them
        stack = [
            torch.nn.Linear(num_inputs, 30),    # 1st hidden layer
            torch.nn.ReLU(),
            torch.nn.Linear(30, 20),            # 2nd hidden layer
            torch.nn.ReLU(),
            torch.nn.Linear(20, num_outputs),   # output layer
        ]
        self.layers = torch.nn.Sequential(*stack)

    def forward(self, x):
        # apply every layer in order and return the resulting logits
        return self.layers(x)

In [114]:
# finally, train the neural network on the GPU (falls back to the CPU when
# CUDA is not available)
import torch.nn.functional as F


# seed so that the model's random initial weights are reproducible
torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # NEW: use cuda
model = model.to(device) # NEW: move the model to the chosen device

# the optimizer is created after the model has been moved to the device
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epochs = 3

for epoch in range(num_epochs):

    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):

        features, labels = features.to(device), labels.to(device) # NEW: switch to correct device for tensors
        logits = model(features)
        loss = F.cross_entropy(logits, labels) # Loss function

        # clear the old gradients, backpropagate, then update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### LOGGING
        # the value printed is the loss of the current training batch;
        # batch_idx is zero-based
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
              f" | Train/Val Loss: {loss:.2f}")

    model.eval()

# done!!

Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75
Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65
Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44
Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13
Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03
Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00


In [141]:
# Exercise A.4
# Compare the runtime of matrix multiplication on a CPU to a GPU. At what
# matrix size do you begin to see the matrix multiplication on the GPU being
# faster than on the CPU? Hint: I recommend using the %timeit command in
# Jupyter to compare the runtime. For example, given matrices a and b, run the
# command %timeit a @ b in a new notebook cell.

import torch

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
print(device)

# a is 400x800 and b is 800x1200, so a @ b is a 400x1200 product;
# both matrices are created directly on `device`
a = torch.rand(400, 800, device=device)
b = torch.rand(800, 1200, device=device)


cuda


In [142]:
print(a.get_device(), b.get_device) # -1 means cpu
%timeit a @ b

0 <built-in method get_device of Tensor object at 0x7f82e910c710>
2.78 ms ± 70.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [145]:

a, b = a.to("cpu"), b.to("cpu")
print(a.get_device(), b.get_device) # -1 means cpu
%timeit a @ b

-1 <built-in method get_device of Tensor object at 0x7f82e910d0d0>
2.36 ms ± 47.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [147]:
!nvidia-smi

Wed Sep 18 15:21:07 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.52.01              Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...    On  |   00000000:01:00.0  On |                  N/A |
| N/A   55C    P8             17W /   40W |    1251MiB /   8192MiB |     40%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                