In [129]:
import math
import time
import torch.nn.functional as F
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
import pytorch_lightning as pl

from IPython.display import set_matplotlib_formats
from torch import Tensor
from tqdm.notebook import tqdm  # Progress bar

In [130]:
x = torch.ones((3,))
print(x.requires_grad)

False


In [131]:
x = torch.arange(3, dtype=torch.float32, requires_grad=True)  # Only float tensors can have gradients
print("X", x)

X tensor([0., 1., 2.], requires_grad=True)


In [132]:
a = x + 2
b = a**2
c = b + 3
y = c.mean()
print("Y", y)

Y tensor(12.6667, grad_fn=<MeanBackward0>)


In [133]:
y.backward()

In [134]:
x.grad

tensor([1.3333, 2.0000, 2.6667])

In [135]:
gpu_avail = torch.cuda.is_available()
print(f"Is the GPU available? {gpu_avail}")

Is the GPU available? True


In [136]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device", device)

Device cuda


In [137]:
x = torch.randn(5000, 5000)

# CPU version
start_time = time.time()
_ = torch.matmul(x, x)
end_time = time.time()
print(f"CPU time: {(end_time - start_time):6.5f}s")
# GPU version
if torch.cuda.is_available():
    x = x.to(device)
    # CUDA is asynchronous, so we need to use different timing functions
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    _ = torch.matmul(x, x)
    end.record()
    torch.cuda.synchronize()  # Waits for everything to finish running on the GPU
    print(f"GPU time: {0.001 * start.elapsed_time(end):6.5f}s")  # Milliseconds to seconds

CPU time: 0.61136s
GPU time: 0.19415s


In [138]:
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

In [139]:
class SimpleClassifier(nn.Module):
    def __init__(self, num_inputs, num_hidden, num_outputs):
        super().__init__()
        # Initialize the modules we need to build the network
        self.linear1 = nn.Linear(num_inputs, num_hidden)
        self.act_fn = nn.Tanh()
        self.linear2 = nn.Linear(num_hidden, num_outputs)

    def forward(self, x):
        # Perform the calculation of the model to determine the prediction
        x = self.linear1(x)
        x = self.act_fn(x)
        x = self.linear2(x)
        return x

In [140]:
model = SimpleClassifier(num_inputs=2, num_hidden=4, num_outputs=1)
# Printing a module shows all its submodules
print(model)
for name, param in model.named_parameters():
    print(f"Parameter {name}, shape {param.shape}")

SimpleClassifier(
  (linear1): Linear(in_features=2, out_features=4, bias=True)
  (act_fn): Tanh()
  (linear2): Linear(in_features=4, out_features=1, bias=True)
)
Parameter linear1.weight, shape torch.Size([4, 2])
Parameter linear1.bias, shape torch.Size([4])
Parameter linear2.weight, shape torch.Size([1, 4])
Parameter linear2.bias, shape torch.Size([1])


In [141]:


class XORDataset(data.Dataset):
    def __init__(self, size, std=0.1):
        """
        Inputs:
            size - Number of data points we want to generate
            std - Standard deviation of the noise (see generate_continuous_xor function)
        """
        super().__init__()
        self.size = size
        self.std = std
        self.generate_continuous_xor()

    def generate_continuous_xor(self):
        # Each data point in the XOR dataset has two variables, x and y, that can be either 0 or 1
        # The label is their XOR combination, i.e. 1 if only x or only y is 1 while the other is 0.
        # If x=y, the label is 0.
        data = torch.randint(low=0, high=2, size=(self.size, 2), dtype=torch.float32)
        label = (data.sum(dim=1) == 1).to(torch.long)
        # To make it slightly more challenging, we add a bit of gaussian noise to the data points.
        data += self.std * torch.randn(data.shape)

        self.data = data
        self.label = label

    def __len__(self):
        # Number of data point we have. Alternatively self.data.shape[0], or self.label.shape[0]
        return self.size

    def __getitem__(self, idx):
        # Return the idx-th data point of the dataset
        # If we have multiple things to return (data point and label), we can return them as tuple
        data_point = self.data[idx]
        data_label = self.label[idx]
        return data_point, data_label

In [142]:
dataset = XORDataset(size=200)
dataset[0]

(tensor([-0.1551,  0.9780]), tensor(1))

In [143]:
data_loader = data.DataLoader(dataset, batch_size=8
                              , shuffle=True)
data_inputs, data_labels = next(iter(data_loader))

# The shape of the outputs are [batch_size, d_1,...,d_N] where d_1,...,d_N are the
# dimensions of the data point returned from the dataset class
print("Data inputs", data_inputs.shape, "\n", data_inputs)
print("Data labels", data_labels.shape, "\n", data_labels)
for x,y in data_loader:
    print("x",x)
    print("y",y)
# for i,(batch_x,batch_y) in enumerate(data_loader):
#     print(i,batch_x)
#     print(batch_y)

Data inputs torch.Size([8, 2]) 
 tensor([[ 0.0652,  1.0126],
        [-0.0993,  1.0482],
        [ 1.0389,  0.1050],
        [ 0.9692,  0.0072],
        [ 0.9528,  0.1234],
        [ 0.9717,  0.9721],
        [ 1.0728,  1.0110],
        [ 1.1344,  0.1440]])
Data labels torch.Size([8]) 
 tensor([1, 1, 1, 1, 1, 0, 0, 1])
x tensor([[ 0.9236,  1.2389],
        [-0.1895,  0.0415],
        [ 1.0006, -0.0328],
        [ 1.1356,  0.9897],
        [ 0.0238,  0.1245],
        [ 0.0253,  0.9259],
        [ 1.0245, -0.0026],
        [ 0.8484,  0.0956]])
y tensor([0, 0, 1, 0, 0, 1, 1, 1])
x tensor([[ 0.0483,  0.9763],
        [ 0.0181,  0.0113],
        [-0.1895,  0.0491],
        [-0.1033, -0.0035],
        [ 1.0602, -0.0724],
        [ 0.9495, -0.0157],
        [ 0.1236,  0.1485],
        [-0.0378,  1.0500]])
y tensor([1, 0, 0, 0, 1, 1, 0, 1])
x tensor([[ 1.1615,  0.0324],
        [ 0.8822,  0.7730],
        [ 0.9787, -0.0072],
        [ 0.0976,  0.8198],
        [ 0.8450, -0.1084],
        [ 1.0

In [144]:
loss_module = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)


In [145]:
train_dataset = XORDataset(size=1000)
train_data_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)
model.to(device)

SimpleClassifier(
  (linear1): Linear(in_features=2, out_features=4, bias=True)
  (act_fn): Tanh()
  (linear2): Linear(in_features=4, out_features=1, bias=True)
)

In [146]:
def train_model(model, optimizer, data_loader, loss_module, num_epochs=100):
    # Set model to train mode
    model.train()
    # Training loop
    for epoch in tqdm(range(num_epochs)):
        for data_inputs,data_labels in data_loader:
            # Step 1: Move input data to device (only strictly necessary if we use GPU)
            data_inputs = data_inputs.to(device)
            data_labels = data_labels.to(device)

            # Step 2: Run the model on the input data
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1)
            # Step 3: Calculate the loss
            loss = loss_module(preds,data_labels.float())
            # Step 4: Perform backpropagation
            # Before calculating the gradients, we need to ensure that they are all zero.
            # The gradients would not be overwritten, but actually added to the existing ones.
            optimizer.zero_grad()
            # Perform backpropagation
            loss.backward()

            # Step 5: Update the parameters
            optimizer.step()


train_model(model, optimizer, train_data_loader, loss_module)

  0%|          | 0/100 [00:00<?, ?it/s]

In [147]:
state_dict = model.state_dict()
print(state_dict)

OrderedDict([('linear1.weight', tensor([[ 2.5314, -2.3760],
        [-0.4756, -0.5388],
        [-1.2652, -0.3048],
        [-1.0653,  1.7114]], device='cuda:0')), ('linear1.bias', tensor([ 1.3886, -0.7650,  0.2168,  0.3253], device='cuda:0')), ('linear2.weight', tensor([[-2.6771, -0.7409, -1.0515, -2.0297]], device='cuda:0')), ('linear2.bias', tensor([0.9799], device='cuda:0'))])


In [148]:
torch.save(state_dict, "our_model.tar")



In [149]:
# Load state dict from the disk (make sure it is the same name as above)
state_dict = torch.load("our_model.tar")

# Create a new model and load the state
new_model = SimpleClassifier(num_inputs=2, num_hidden=4, num_outputs=1)
new_model.load_state_dict(state_dict)

# Verify that the parameters are the same
print("Original model\n", model.state_dict())
print("\nLoaded model\n", new_model.state_dict())

Original model
 OrderedDict([('linear1.weight', tensor([[ 2.5314, -2.3760],
        [-0.4756, -0.5388],
        [-1.2652, -0.3048],
        [-1.0653,  1.7114]], device='cuda:0')), ('linear1.bias', tensor([ 1.3886, -0.7650,  0.2168,  0.3253], device='cuda:0')), ('linear2.weight', tensor([[-2.6771, -0.7409, -1.0515, -2.0297]], device='cuda:0')), ('linear2.bias', tensor([0.9799], device='cuda:0'))])

Loaded model
 OrderedDict([('linear1.weight', tensor([[ 2.5314, -2.3760],
        [-0.4756, -0.5388],
        [-1.2652, -0.3048],
        [-1.0653,  1.7114]])), ('linear1.bias', tensor([ 1.3886, -0.7650,  0.2168,  0.3253])), ('linear2.weight', tensor([[-2.6771, -0.7409, -1.0515, -2.0297]])), ('linear2.bias', tensor([0.9799]))])


In [150]:
test_dataset = XORDataset(size=500)
test_data_loader = data.DataLoader(test_dataset,batch_size=128,shuffle=False,drop_last=False)

In [151]:
def eval_model(model,data_loader):
    model.eval()
    true_preds, num_preds = 0.0,0.0
    with torch.no_grad():
        for data_inputs,data_labels in data_loader:
            data_inputs,data_labels = data_inputs.to(device),data_labels.to(device)
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1)
            preds = torch.sigmoid(preds)
            pred_labels = (preds >= 0.5).long()  # Binarize predictions to 0 and 1

            true_preds+=(pred_labels==data_labels).sum()
            num_preds+=data_labels.shape[0]

    acc = true_preds/num_preds
    print(f"Accuracy of the model:{100.0*acc:4.2f}%")

In [152]:
eval_model(model, test_data_loader)

Accuracy of the model:99.80%


In [153]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention


In [154]:
seq_len, d_k = 3, 2
pl.seed_everything(42)
q = torch.randn(seq_len, d_k)
k = torch.randn(seq_len, d_k)
v = torch.randn(seq_len, d_k)
values, attention = scaled_dot_product(q, k, v)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("Values\n", values)
print("Attention\n", attention)


Global seed set to 42


Q
 tensor([[ 0.3367,  0.1288],
        [ 0.2345,  0.2303],
        [-1.1229, -0.1863]])
K
 tensor([[ 2.2082, -0.6380],
        [ 0.4617,  0.2674],
        [ 0.5349,  0.8094]])
V
 tensor([[ 1.1103, -1.6898],
        [-0.9890,  0.9580],
        [ 1.3221,  0.8172]])
Values
 tensor([[ 0.5698, -0.1520],
        [ 0.5379, -0.0265],
        [ 0.2246,  0.5556]])
Attention
 tensor([[0.4028, 0.2886, 0.3086],
        [0.3538, 0.3069, 0.3393],
        [0.1303, 0.4630, 0.4067]])
