In [3]:
from torch import manual_seed
# Seed torch's global random number generator so the randomly initialised
# embedding weights shown below are reproducible from run to run.
manual_seed(seed=42)

<torch._C.Generator at 0x2157fb906b0>

In [8]:
# Creating Embeddings
from torch.nn import Embedding
from torch import tensor

In [5]:
# Toy sizes for the demo lookup table.
vocab_size, embed_size = (
    6,  # number of distinct words/tokens in the vocabulary
    3,  # length of the vector that represents each token, e.g. [0.523, 0.1442, 0.5234]
)

In [6]:
# Lookup table holding one row of length embed_size for each of the vocab_size indices.
embedding_layer = Embedding(vocab_size, embed_size)

In [30]:
# Inspect the layer's weight matrix: one row per vocabulary index,
# one column per embedding dimension.
embedding_layer.weight

Parameter containing:
tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)

In [31]:
# An embedding lookup is equivalent to a one-hot row vector times the weight matrix:
# (1x6) x (6x3) = (1x3), i.e. it simply returns row 1 of embedding_layer.weight.
embedding_layer(tensor([1]))

tensor([[ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)

In [29]:
# Repeating the lookup for index 1 returns the same row again.
embedding_layer(tensor([1]))

tensor([[ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)

In [21]:
# Index 0 selects the first row of the weight matrix.
embedding_layer(tensor([0]))

tensor([[ 1.9269,  1.4873, -0.4974]], grad_fn=<EmbeddingBackward0>)

In [None]:
# Index 4 selects the fifth row of the weight matrix.
embedding_layer(tensor([4]))

tensor([[-0.2316,  0.0418, -0.2516]], grad_fn=<EmbeddingBackward0>)

In [23]:
# Index 5 (= vocab_size - 1) is the largest valid index and selects the last row.
embedding_layer(tensor([5]))

tensor([[ 0.8599, -0.3097, -0.3957]], grad_fn=<EmbeddingBackward0>)

In [None]:
# Important observation:
# The input index must lie within the range [0, vocab_size - 1].
# Index 6 equals vocab_size, i.e. one past the last row, so the lookup raises IndexError.
# The error is caught and printed so that "Restart & Run All" does not stop at this cell.
try:
    embedding_layer(tensor([6]))
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: index out of range in self

In [None]:
# Important observation:
# Every index >= vocab_size is rejected, not only the first one past the end.
# The error is caught and printed so that "Restart & Run All" does not stop at this cell.
try:
    embedding_layer(tensor([7]))
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: index out of range in self

In [None]:
# Several indices can be looked up at once; the result stacks one row per index, in order.
embedding_layer(tensor([1, 2]))

tensor([[ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559]], grad_fn=<EmbeddingBackward0>)

In [16]:
# Three indices give a (3, 3) result: rows 1, 2 and 3 of the weight matrix.
embedding_layer(tensor([1, 2, 3]))

tensor([[ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347]], grad_fn=<EmbeddingBackward0>)

In [32]:
# Starting:
# Full-size table: 50257 token IDs (0..50256), each mapped to a 256-dimensional vector.
vocab_size, output_dim = 50257, 256
token_embedding_layer = Embedding(vocab_size, output_dim)

In [35]:
# Inspect the (vocab_size, output_dim) weight matrix; torch abbreviates the printout with "...".
token_embedding_layer.weight

Parameter containing:
tensor([[-0.2234,  1.7174,  0.3189,  ..., -0.0366, -0.4808,  0.3163],
        [-0.5419, -0.4410, -0.3136,  ..., -0.6076, -0.0453, -0.3573],
        [-1.3658,  1.1117, -0.6228,  ..., -0.1383,  0.9864, -0.3893],
        ...,
        [ 1.7149,  0.5462, -2.2715,  ...,  0.5018, -0.5849, -0.4181],
        [-0.3580, -0.5171,  0.4712,  ..., -0.1611,  0.1126,  0.6221],
        [-1.4073, -1.0815,  0.3490,  ...,  0.3407, -1.6303, -0.0582]],
       requires_grad=True)

In [42]:
# token_embedding_layer(tensor([0, 1]))
# token_embedding_layer(tensor([50256]))
# token_embedding_layer(tensor([50257]))  # ❌ Error: index 50257 is out of range (valid indices are 0..50256)

In [93]:
from torch.utils.data import Dataset, DataLoader
import tiktoken

# Load tiktoken's 'gpt2' encoding; create_dataloader uses it to turn raw text into integer token IDs.
tokenizer = tiktoken.get_encoding('gpt2')

class GPTStyleDataset_V1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    A window of ``max_length`` tokens is taken every ``stride`` positions.
    The target of each window is the same window shifted one token to the
    right, so position ``k`` of the target is the token that follows
    position ``k`` of the input.

    Args:
        tokens: Sequence of integer token IDs (must support ``len`` and slicing).
        max_length: Number of tokens in every input/target window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, tokens, max_length = 4, stride = 1):
        # Stop at len(tokens) - max_length so every window still has a
        # following token available for its shifted target.
        window_starts = range(0, len(tokens) - max_length, stride)
        self.input_tokens = [
            tensor(tokens[start: start + max_length]) for start in window_starts
        ]
        self.target_tokens = [
            tensor(tokens[start + 1: start + max_length + 1]) for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_tokens)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair stored at ``index``."""
        pair = self.input_tokens[index], self.target_tokens[index]
        return pair

def create_dataloader(txt, batch_size=8, max_length=4, stride=4,
                      shuffle=False, drop_last=False):
    """Tokenise ``txt`` and wrap it in a DataLoader of sliding-window batches.

    Args:
        txt: Raw text, encoded with the module-level ``tokenizer``.
        batch_size: Number of (input, target) windows per batch.
        max_length: Number of tokens in each window.
        stride: Step between the starts of consecutive windows; with
            ``stride == max_length`` the input windows do not overlap.
        shuffle: Passed to ``DataLoader``; whether to reshuffle the windows.
        drop_last: Passed to ``DataLoader``; whether to drop a final batch
            that is smaller than ``batch_size``.

    Returns:
        A ``DataLoader`` over a ``GPTStyleDataset_V1`` built from the encoded
        text. The defaults equal the values that used to be hard-coded, so
        ``create_dataloader(txt)`` behaves exactly as before.
    """
    encodings = tokenizer.encode(txt)
    dataset = GPTStyleDataset_V1(encodings, max_length=max_length, stride=stride)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

# Read the whole training text into memory. The encoding is passed explicitly because
# open() otherwise falls back to the platform's locale encoding, which differs between machines.
# NOTE(review): assumes the file is UTF-8 (plain ASCII also decodes fine) — confirm.
with open('../01_main-chapter-code/the-verdict.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Batches of token-ID windows built from the text.
dataloader = create_dataloader(raw_text)

In [94]:
# Pull the first batch from the loader; it is a pair of tensors [inputs, targets],
# where each target row is the matching input row shifted one token to the right.
data_iterator = iter(dataloader)
data_item = next(data_iterator)
data_item

[tensor([[   40,   367,  2885,  1464],
         [ 1807,  3619,   402,   271],
         [10899,  2138,   257,  7026],
         [15632,   438,  2016,   257],
         [  922,  5891,  1576,   438],
         [  568,   340,   373,   645],
         [ 1049,  5975,   284,   502],
         [  284,  3285,   326,    11]]),
 tensor([[  367,  2885,  1464,  1807],
         [ 3619,   402,   271, 10899],
         [ 2138,   257,  7026, 15632],
         [  438,  2016,   257,   922],
         [ 5891,  1576,   438,   568],
         [  340,   373,   645,  1049],
         [ 5975,   284,   502,   284],
         [ 3285,   326,    11,   287]])]

In [95]:
from torch import arange

# One embedding vector per position inside a context window of max_length tokens.
context_length = max_length = 4
pos_embedding_layer = Embedding(num_embeddings=context_length, embedding_dim=output_dim)
# Look up positions 0 .. context_length - 1 to obtain the full table of positional vectors.
pos_embeddings = pos_embedding_layer(arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [100]:
# Starting:
# Rebuild the loader and unpack its first batch into the input and target tensors.
dataloader = create_dataloader(raw_text)
data_iter = iter(dataloader)
data_item = next(data_iter)
inputs, targets = data_item

In [103]:
# Both tensors have shape (batch_size, max_length) = (8, 4).
inputs.shape, targets.shape

(torch.Size([8, 4]), torch.Size([8, 4]))

In [105]:
# Each row is one window of 4 consecutive token IDs.
inputs

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

In [116]:
# Map every token ID in the batch to its 256-dimensional vector: (8, 4) -> (8, 4, 256).
# NOTE: this constructs a fresh Embedding, replacing any token_embedding_layer created earlier.
vocab_size, output_dim = 50257, 256
token_embedding_layer = Embedding(vocab_size, output_dim)
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])