In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import numpy as np
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print (device)

mps


In [2]:
randint = torch.randint(-100,100,(6,))
randint

tensor([ 35, -30,  13, -42,  23,  82])

In [3]:
zeros = torch.zeros(3,6)
zeros

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [4]:
tensor = torch.tensor([[1.5, 2.8],[0.7, 0.1], [1.8, 1.3]])
tensor

tensor([[1.5000, 2.8000],
        [0.7000, 0.1000],
        [1.8000, 1.3000]])

In [5]:
ones = torch.ones(3,5)
ones

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

In [6]:
input = torch.empty(2,3)
input

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [7]:
arange = torch.arange(10)
arange

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
linspace = torch.linspace(2,10, 8)
linspace

tensor([ 2.0000,  3.1429,  4.2857,  5.4286,  6.5714,  7.7143,  8.8571, 10.0000])

In [9]:
logspace = torch.logspace(start = -10, end=10, steps = 5)
logspace

tensor([1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10])

In [10]:
eye = torch.eye(5)
eye

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])

In [11]:
a = torch.empty((2,4), dtype = torch.int64)
empty_like = torch.empty_like(a)
empty_like

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0]])

## CPU vs GPU performance

In [12]:
torch_rand1 = torch.rand(100,100,100,100).to(device)
torch_rand2 = torch.rand(100,100,100,100).to(device)
np_rand1 = torch.rand(100,100,100,100)
np_rand2 = torch.rand(100,100,100,100)

start_time = time.time()
rand = (torch_rand1 @ torch_rand2)
end_time = time.time()

elapsed_time = end_time-start_time

print(f"{elapsed_time:.8f}")
start_time = time.time()

rand = np.multiply(np_rand1, np_rand2)
end_time = time.time()
elapsed_time = end_time -start_time
print(f"{elapsed_time:.8f}")


0.63568592
0.76226306


In [13]:
torch.mps.current_allocated_memory()

800000000

In [14]:
%%time
start_time = time.time()
# Matrix operations here
zeros = torch.zeros(1,1)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.8f}") # wall time is real time, cpu time is just the execution time

0.01087284
CPU times: user 621 μs, sys: 1.19 ms, total: 1.81 ms
Wall time: 11.5 ms


## torch.stack, torch.multinomial, torch.tril, torch, triu, input.T/ input.transpose, nn.linear, torch.cat, F.softmax

In [15]:
# multinomial: used to predict what comes next in the model
# Define a probability tensor
probabilities = torch.tensor([0.1, 0.9])
# 10% or 0.1 =>0, 90% or 0.9 => 1, each probability points to the index of the probability in the tensor
# Draw 5 samples from the multinomial distribution
samples = torch.multinomial(probabilities, num_samples =10, replacement =True)
print(samples)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [16]:
# concat: will be used to concatenate the generated letters together
tensor = torch.tensor([1,2,3,4,5])
out = torch.cat((tensor, torch.tensor([28])), dim = 0)
out

tensor([ 1,  2,  3,  4,  5, 28])

In [17]:
# tril: triangle lower, using it to predict without relying on the answer
out = torch.tril(torch.ones(5,5))
out

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [18]:
# tril: triangle upper, using it to predict without relying on the answer
out = torch.triu(torch.ones(5,5))
out

tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])

In [19]:
out = torch.zeros(5,5).masked_fill(torch.tril(torch.ones(5,5))==0, float('-inf'))
out

tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])

In [20]:
torch.exp(out)
# e^0 = 1, and e^-inf = 0

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])

In [21]:
# transpose, swaps the 0th and 2nd tensor
input = torch.zeros(2,3,4)
out = input.transpose(0,2)
out.shape

torch.Size([4, 3, 2])

In [22]:
# torch.stack (used at the start)
tensor1 = torch.tensor([1,2,3])
tensor2 = torch.tensor([4,5,6])
tensor3 = torch.tensor([7,8,9])

#stack the tensors along a new dimension, used in batch size
stacked_tensor = torch.stack([tensor1, tensor2,tensor3])
stacked_tensor

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [23]:
# nn.linear
import torch.nn as nn
sample = torch.tensor([10.,10.,10.])
linear = nn.Linear(3,3,bias=False)
print(linear(sample))

tensor([ 7.3845,  5.1792, -2.5351], grad_fn=<SqueezeBackward4>)


In [24]:
import torch.nn.functional as F

tensor1 = torch.tensor([1.0,2.0,3.0])

softmax_output = F.softmax(tensor1, dim=0)

print(softmax_output)

tensor([0.0900, 0.2447, 0.6652])


 ## nn.Embedding

In [25]:
# Define an embedding layer
embedding = nn.Embedding(num_embeddings=1000, embedding_dim=50)

# Input: list of word indices
word_indices = torch.tensor([2, 5, 7])

# Get the corresponding word embeddings
word_embeddings = embedding(word_indices)

print(word_embeddings.shape)  # Output: torch.Size([3, 50])

torch.Size([3, 50])


## Matrix Multiplication

In [26]:
a = torch.tensor([[1,2],[3,4],[5,6]])
b = torch.tensor([[11,45,6],[32,41,33]])

print(a@b)

tensor([[ 75, 127,  72],
        [161, 299, 150],
        [247, 471, 228]])


In [27]:
# in pytorch you can not multiply int and float matrices together, showcased in this cell
#type int64
int_64 = torch.randint(1,(3,2))
# type float32
float_32 =  torch.rand(2,3)
print(int_64.dtype,float_32.dtype)
result = torch.matmul(int_64, float_32)
print(result)

torch.int64 torch.float32


RuntimeError: expected m1 and m2 to have the same dtype, but got: long long != float

In [28]:
#type int64 typecast into float for it to work
int_64 = torch.randint(5,(3,2)).float()
# type float32
float_32 =  torch.rand(2,3)
print(int_64.dtype,float_32.dtype)
result = torch.matmul(int_64, float_32)
print(result)

torch.float32 torch.float32
tensor([[0.3484, 0.6913, 0.7214],
        [0.6026, 1.0999, 0.3931],
        [2.0621, 3.7084, 0.8510]])


## logits: a bunch of floating point numbers which are normalized

In [29]:
a = torch.rand(2,3,5)
x, y, z = a.shape # x = 2, y = 3, z = 5
a = a.view(x,y,z) # makes the tensor again with the same dimensions
print(a.shape)

torch.Size([2, 3, 5])


In [33]:
# Activation function ReLU: if a number is less than zero it makes it zero, positive numbers unchanged
# it adds non linearity to our function: so if you are a 100 layers deep, and every second layer you have a relu, that network will learn alot more than without it
x = torch.tensor([-0.05], dtype = torch.float32)
y = nn.ReLU(x)
print(y)

ReLU(inplace=True)


In [35]:
# Activation function Sigmoid: plugs x into the equation in the documentation
# The sigmoid function outputs values in the range [0, 1]
x = torch.tensor([-0.05], dtype = torch.float32)
y = F.sigmoid(x)
print(y)

tensor([0.4875])


In [38]:
# Activation function Sigmoid: plugs x into the equation in the documentation
# The tanh function outputs values in the range [-1, 1]
x = torch.tensor([20], dtype = torch.float32)
y = F.tanh(x)
print(y)

tensor([1.])
