In [1]:
! pip3 install tiktoken



In [2]:
# `import importlib` alone does not load the `metadata` submodule; import it
# explicitly so the version lookup does not depend on another package having
# imported it first.
import importlib.metadata
import tiktoken
print('tiktoken version:', importlib.metadata.version('tiktoken'))

tiktoken version: 0.11.0


In [3]:
# Load tiktoken's "gpt2" encoding. It is used below through encode()/decode(),
# in the same role as the SimpleTokenizerV2 class built in the previous file.
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Sample text that contains the <|endoftext|> marker.
text = ("Hello, dou you like tea? <|endoftext|> in the sunlit terraces" 
        " of someunknownPlace")

# <|endoftext|> is explicitly listed as an allowed special token for this call.
integers = tokenizer.encode(text, allowed_special= {"<|endoftext|>"})
print(integers)

[15496, 11, 2255, 345, 588, 8887, 30, 220, 50256, 287, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271]


In [5]:
# Decode the token ids back into a string (round trip of the encode step).
text_f = tokenizer.decode(integers)

In [6]:
# Display the decoded string; it should read the same as the original `text`.
text_f

'Hello, dou you like tea? <|endoftext|> in the sunlit terraces of someunknownPlace'

In [7]:
# Creating input-target pairs

In [8]:
# In this section we implement a data loader that fetches the input-target pairs using a sliding-window approach.
# To get started, we first tokenize the whole "The Verdict" short story with the BPE tokenizer we worked with earlier.

In [9]:
# Read the short story from disk and tokenize the whole text.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
enc_text = tokenizer.encode(raw_text)  # token ids for the whole text
print(len(enc_text))  # total number of tokens

5145


In [10]:
# Remove the first 50 tokens from the dataset for demonstration purposes;
# this gives a slightly more interesting text passage for the next steps.
# enc_text is the encoded (token-id) version of raw_text.
enc_sample = enc_text[50:]

In [11]:
# The context size determines how many tokens are included in the input.
context_size = 4  # length of the input
# A context size of 4 means the model is trained to look at a sequence of
# 4 tokens to predict the next token in the sequence.
x = enc_sample[:context_size]
y = enc_sample[1: context_size+1]  # the same window shifted right by one token

print(f"x: {x}")
print(f"y: {y}")

x: [290, 4920, 2241, 287]
y: [4920, 2241, 287, 257]


In [12]:
# Grow the context one token at a time; the token that follows it is the target.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "------>", desired)

[290] ------> 4920
[290, 4920] ------> 2241
[290, 4920, 2241] ------> 287
[290, 4920, 2241, 287] ------> 257


In [13]:
# Same pairs as token ids, but decoded back to text for readability.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(tokenizer.decode(context), "----->", tokenizer.decode([desired]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


In [14]:
#Implement a data Loader

'''
What is a Tensor?
A tensor is just a container for numbers (like arrays or matrices), but it can have any number of dimensions.
1. 0D tensor (scalar): just one number : 7
2. 1D tensor (vector): a list of numbers: [1, 2, 3]
3. 2D tensor (matrix): rows and columns:
                                        [[1, 2, 3],
                                         [4, 5, 6]]


4. 3D tensor: like a cube (stack of matrices):

                        [[[1, 2], [3, 4]],
                        
                         [[5, 6], [7, 8]]]

5. ND tensor: can go to higher dimensions (for images, videos, etc.)


Why use tensors?

They run fast on CPU and GPU.

They support autograd (automatic differentiation for training).

They are the universal data format for deep learning models.

'''

In [15]:
#for the efficient data loader implementation, we will use pytorch's built-in Dataset and DataLoader classes
'''
Step-1: Tokenize the entire text
Step-2: Use a sliding window to chunk the book into overlapping sequences of max_length
Step-3: Return the total number of rows in the dataset
Step-4: Return a single row from the dataset



'''

'\nStep-1: Tokenize the entire text\nStep-2: Use a sliding window to chunk the book into overlapping sequences of max_length\nStep-3: Return the total number of rows in the dataset\nStep-4: Return a single row from the dataset\n\n\n\n'

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token ids in steps of ``stride``; every input window is paired
    with the same window shifted one position to the right (next-token targets).

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.
        max_length: Number of tokens per window; must be positive.
        stride: Step between the starts of consecutive windows; must be positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail fast: non-positive values would otherwise silently produce
        # empty or meaningless windows.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # The loop stops at len - max_length so the target window, which ends
        # one token after the input window, always stays inside token_ids.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1 : i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [17]:
#The following code will use the GPTDatasetV1 to load the inputs in batches via a pytorch DataLoader

In [18]:
'''
Step 1: Initialize the tokenizer

Step-2: Create the Dataset

Step-3: drop_last = True drops the last batch if it is shorter than the specified batch_size to prevent loss spikes during training

Step-4: The number of cpu process to use for preprocessing

'''

'\nStep 1: Initialize the tokenizer\n\nStep-2: Create the Dataset\n\nStep-3: drop_last = True drops the last batch if it is shorter than the specified batch_size to prevent loss spikes during training\n\nStep-4: The number of cpu process to use for preprocessing\n\n'

In [19]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Raw text to tokenize and chunk.
        batch_size: Number of samples per batch.
        max_length: Window length in tokens, passed to GPTDatasetV1.
        stride: Step between consecutive windows, passed to GPTDatasetV1.
        shuffle: Forwarded to DataLoader.
        drop_last: Forwarded to DataLoader.
        num_workers: Forwarded to DataLoader.

    Returns:
        A DataLoader over a GPTDatasetV1 built from ``txt``.
    """
    # Tokenizer: tiktoken's "gpt2" encoding.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Dataset: sliding windows over the tokenized text.
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    # DataLoader: fetches samples from the dataset through its __getitem__.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)





In [20]:
''' 
Let's test the dataloader with a batch size of 1 for an LLM with a context size of 4
This will develop an intution of how the GPTDatasetV1 class and the create_dataloader_v1 work together

'''

" \nLet's test the dataloader with a batch size of 1 for an LLM with a context size of 4\nThis will develop an intution of how the GPTDatasetV1 class and the create_dataloader_v1 work together\n\n"

In [21]:
# Read the story text again (the same file that was loaded earlier in this notebook).
with open("the-verdict.txt", "r",encoding= "utf-8") as f:
    raw_text = f.read()

In [22]:
#Convert dataloader into a python iterator to fetch the next entry via Python's built-in next() function

In [23]:
import torch
print("PyTorch version:", torch.__version__)
# batch_size=1 and max_length=4: each batch holds one input/target pair of 4 token ids.
# stride=4 equals max_length, so consecutive input windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=4, shuffle=False)
# Turn the DataLoader into an iterator so batches can be pulled with next().
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version: 2.5.1
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [24]:
# Fetch the following batch from the same iterator.
second_batch = next(data_iter)
print(second_batch)

[tensor([[1807, 3619,  402,  271]]), tensor([[ 3619,   402,   271, 10899]])]


In [25]:
# Same window settings, but with batch_size=8: each batch stacks 8 windows of 4 token ids.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle= False)

data_iter = iter(dataloader)
# NOTE: `tergets` (sic) holds the target ids; the spelling is kept because it is a notebook-level name.
inputs, tergets = next(data_iter)
print("Inputs:\n",inputs)
print("\nTergets:\n", tergets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Tergets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
