In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

In [2]:
# Byte-pair-encoding tokenizer registered under the name "gpt2" in tiktoken.
tokenizer = tiktoken.get_encoding(encoding_name="gpt2")

In [3]:
'''
The token embedding matrix is a lookup table.
Now we need to create input IDs so that we can generate vector embeddings (token embeddings).
To create the input IDs we use a data loader; we then embed each token in each batch into a 256-dimensional
vector. If we have a batch size of 8 with four tokens each, the result will be an 8 x 4 x 256 tensor.
'''

'\nThe token embedding matrix is a lookup table.\nNow we need to create input IDs so that we can generate vector embeddings (token embeddings).\nTo create the input IDs we use a data loader; we then embed each token in each batch into a 256-dimensional\nvector. If we have a batch size of 8 with four tokens each, the result will be an 8 x 4 x 256 tensor.\n'

In [4]:
# Let's instantiate the data loader first (data sampling with a sliding window).

In [5]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID chunks.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token IDs in steps of ``stride``; each window is stored as an input
    chunk, and the same window shifted one token to the right is stored as the
    matching target chunk.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``; it is
            expected to return a sliceable sequence of integer token IDs.
        max_length: Number of tokens in every input/target chunk.
        stride: Number of tokens the window advances between chunks.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        A text that tokenizes to ``max_length`` tokens or fewer yields an empty
        dataset, because no complete target chunk exists.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []
        # Backward-compatible alias for the original (misspelled) attribute
        # name; both names refer to the same list object.
        self.terget_ids = self.target_ids

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop at len(token_ids) - max_length so the target window, which ends
        # one token later than the input window, never runs past the end.
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[start : start + max_length]
            target_chunk = token_ids[start + 1 : start + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
            self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

    def __len__(self):
        """Return the number of stored (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



def create_dataloader_v1(
    text,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a ``DataLoader`` over sliding-window token chunks of ``text``.

    The text is tokenized with tiktoken's "gpt2" encoding, wrapped in a
    ``GPTDatasetV1`` and handed to a ``DataLoader``.

    Args:
        text: Raw text to tokenize and chunk.
        batch_size: Number of (input, target) pairs per batch.
        max_length: Tokens per chunk; forwarded to ``GPTDatasetV1``.
        stride: Window step in tokens; forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(text, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

In [6]:
# GPT-2 vocabulary size and the embedding width used in this walkthrough.
vocab_size = 50257
output_dim = 256

# Seed the global RNG so the randomly initialized embedding weights (and every
# later random draw) are identical after Restart Kernel -> Run All.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [7]:
# Read the whole of "the-verdict.txt" into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

In [8]:
# Non-overlapping windows (stride == max_length) in their original order
# (shuffle=False), grouped into batches of 8.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Pull only the first batch of (inputs, targets) for inspection.
data_itr = iter(dataloader)
inputs, targets = next(data_itr)

In [9]:
# Show the token IDs of the first batch and the batch's shape
# (batch_size x max_length, per the dataloader arguments above).
print("Token IDs:\n", inputs)
print("\n Inputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

 Inputs shape:
 torch.Size([8, 4])


In [10]:
# Look up one embedding vector per token ID in the batch; the embedding
# dimension becomes a new trailing axis of the result.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [11]:
'''As we can tell from the 8x4x256-dimensional tensor output, each token ID is now
embedded as a 256-dimensional vector.

For a GPT model's absolute positional embedding approach, we just need to create another embedding layer
that has the same embedding dimension as the token embedding layer.

'''

"As we can tell from the 8x4x256-dimensional tensor output, each token ID is now\nembedded as a 256-dimensional vector.\n\nFor a GPT model's absolute positional embedding approach, we just need to create another embedding layer\nthat has the same embedding dimension as the token embedding layer.\n\n"

In [12]:
# The context length is the number of token IDs in one input sequence, which
# here equals max_length. The positional table therefore needs one row per
# position, and each row has output_dim entries so it matches the width of
# the token embeddings.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

In [13]:
# Embed the position indices 0 .. max_length - 1, giving one vector per position.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [14]:
# The positional embeddings have shape [4, 256]; adding them to the
# [8, 4, 256] token embeddings broadcasts them across the batch dimension.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
