In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import time
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [5]:
#use %%time to record the time taken for the cell to execute
start_time = time.time()
#matrix operations here
zeros = torch.zeros(1,1)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.8f}")

0.06374192


In [4]:
#Generates 2 random matrices using pytorch and then numpy
#Measures time taken to multiply by matrices
#This is so we understand which library to use for the LLM
torch_rand1 = torch.rand(10000, 10000).to(device)
torch_rand2 = torch.rand(10000, 10000).to(device)
np_rand1 = torch.rand(10000, 10000)
np_rand2 = torch.rand(10000, 10000)

start_time = time.time()

rand = (torch_rand1 @ torch_rand2)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.8f}")

start_time = time.time()

rand = np.multiply(np_rand1, np_rand2)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"{elapsed_time:.8f}")

75.07070255
1.27283192


In [6]:
# torch.stack, torch.multinomial, torch.tril, torch.triu, input.T / input.transpose, nn.linear, torch.cat, F.softmax

#Define a probability tensor
probabilities = torch.tensor([0.1, 0.9])
#10% or 0.1 that we get a 0, 90% or 0.9 chance that we get a 1. each probability points to the index of the probability in the tensor
#Draw 5 samples from the multinomial distribution
samples = torch.multinomial(probabilities, num_samples=10, replacement=True)
print(samples)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [7]:
#creates a tensor with values [1, 2, 3, 4]
tensor = torch.tensor([1,2,3,4])
#concatenates the tensor [1, 2, 3, 4] with the tensor [5] along the 0th dimension
out = torch.cat((tensor, torch.tensor([5])), dim= 0)
out

tensor([1, 2, 3, 4, 5])

In [8]:
#Creates a matrix of 1s only, then returns the lower triangular part 
#tril = triangle lower
#1 models predicted events, and 0 models events that havent been predicted yet
out = torch.tril(torch.ones(5,5))
out

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [9]:
#Same as above but triangle upper
out = torch.triu(torch.ones(5,5))
out

tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])

In [9]:
#Exponentiate (2.71) every element, e.g. 2.71^0 = 1, 2..71^-inf = 0
out = torch.zeros(5, 5).masked_fill(torch.tril(torch.ones(5,5)) == 0, float('-inf'))
out

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])

In [12]:
torch.exp(out)

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [10]:
input = torch.zeros(2,3,4)
#Swap dimensions with transpose()
out = input.transpose(0,2)
out.shape

torch.Size([4, 3, 2])

In [15]:
tensor1 = torch.tensor([1,2,3])
tensor2 = torch.tensor([4,5,6])
tensor3 = torch.tensor([7,8,9])

#Stack tensors along a new dimension to make a batch
stacked_tensor = torch.stack([tensor1, tensor2, tensor3])
stacked_tensor

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [11]:
#nn (neural network) module contains learnable parameters
#applies a linear transformation to the incoming data, which is defined as y = xA^T + b
#where x is the input tensor, A is the weight tensor, b is the bias tensor, and y is the output tensor
import torch.nn as nn
sample = torch.tensor([10.,10.,10.])
linear = nn.Linear(3,3, bias=False)
print(linear(sample))

tensor([-3.6842,  3.2855,  0.1867], grad_fn=<SqueezeBackward4>)


In [12]:
import torch.nn.functional as F

#Create a tensor
tensor1 = torch.tensor([1.0,2.0,3.0])

#Apply softmax using torch.nn.functional.softmax()
#softmax(x_i) = exp(x_i) / sum(exp(x_j))
# exponentiate the input vector and then divide by the sum of the exponentiated values to normalize the output
#so that it can be interpreted as a probability distribution
softmax_output = F.softmax(tensor1, dim=0)

print(softmax_output)

tensor([0.0900, 0.2447, 0.6652])


In [13]:
#Initialise an embedding layer
#vocab_size is the size of the dictionary of embeddings (number of rows)
vocab_size = 80
#embedding_dim is the size of each embedding vector (number of columns)
embedding_dim = 6
embedding = nn.Embedding(vocab_size, embedding_dim)

#Create some input indices
#list of indices that correspond to the words in your vocabulary
input_indices = torch.LongTensor([1, 5, 3, 2])

#Apply the embedding layer
embedded_output = embedding(input_indices)

#The output will be a tensor of shape (4, 100), where 4 is number of inputs
#and 100 is dimensionality of embedding vectors
#Each row in the output tensor represents an embedding vector for a word in your vocabulary
print(embedded_output.shape)
print(embedded_output)

torch.Size([4, 6])
tensor([[-0.4971,  0.4729,  0.0194, -0.8119, -1.6965,  0.6117],
        [-2.3138, -0.2311, -1.0784, -0.8175,  1.1843,  1.1019],
        [ 0.5366, -0.4469, -0.0517,  2.8779, -0.7432, -0.4882],
        [-0.0876, -0.0231, -0.2826, -0.1637, -0.5108,  0.3409]],
       grad_fn=<EmbeddingBackward0>)


In [15]:
a = torch.tensor([[1,2],[3,4],[5,6]])
b = torch.tensor([[7,8,9],[10,11,12]])
# print(a @ b)
print(torch.matmul(a, b))

tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])


In [16]:
int_64 = torch.randint(1, (3, 2)).float()
#type int64
float_32 = torch.rand(2,3)
#type float32
# print(int_64.dtype, float_32.dtype)
result = torch.matmul(int_64, float_32)
print(result)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


In [17]:
#creates a tensor of shape (2, 3, 5) with random values between 0 and 1
a = torch.rand(2, 3, 5)
print(a.shape)
#unpacks the shape of the tensor into three variables
x, y, z = a.shape
a = a.view(x,y,z)
# print(x, y, z)
print(a.shape)

torch.Size([2, 3, 5])
torch.Size([2, 3, 5])


In [18]:
input = torch.rand((4, 8, 10))
B, T, C = input.shape
output = input.view(B*T, C)
print(output)
# print(input)
print(output[:, -1, :])

tensor([[0.4394, 0.1717, 0.7571, 0.6782, 0.2371, 0.0914, 0.9774, 0.7899, 0.1303,
         0.3030],
        [0.2766, 0.7958, 0.8020, 0.6955, 0.6326, 0.4468, 0.1796, 0.9522, 0.7304,
         0.7200],
        [0.7972, 0.2609, 0.2898, 0.8218, 0.3028, 0.2769, 0.6641, 0.0464, 0.1306,
         0.8955],
        [0.3694, 0.3220, 0.2227, 0.5814, 0.7105, 0.0363, 0.3237, 0.4678, 0.1850,
         0.7279],
        [0.0757, 0.3911, 0.0380, 0.0210, 0.0857, 0.5724, 0.5446, 0.5311, 0.8942,
         0.9466],
        [0.3389, 0.6812, 0.6447, 0.3236, 0.1608, 0.4118, 0.9511, 0.8297, 0.0216,
         0.5838],
        [0.1455, 0.7162, 0.4766, 0.5543, 0.9786, 0.5666, 0.6339, 0.7358, 0.3760,
         0.1205],
        [0.4084, 0.8237, 0.9453, 0.7386, 0.2686, 0.1598, 0.5625, 0.5251, 0.5881,
         0.5821],
        [0.0825, 0.8532, 0.1607, 0.7779, 0.5082, 0.7726, 0.0669, 0.4325, 0.9921,
         0.7020],
        [0.1126, 0.6090, 0.6649, 0.3436, 0.1271, 0.6355, 0.7352, 0.2743, 0.8078,
         0.9145],
        [0

IndexError: too many indices for tensor of dimension 2

In [19]:
x = torch.tensor([10], dtype=torch.float32)
y = F.tanh(x)
print(y)

tensor([1.])


In [None]:
#Activation functions
#ReLu works in forward passes - if a number is below 0, number will be turned into 0
#Sigmoid maps any input value to a value between 0 and 1- 1/(1+ exp(-x)) = 2.71^-x
#Tanh = hyperboliic tangent, outputs values between -1 and 1