In [1]:
with open('the-verdict.txt', mode='r') as f:
    raw_data = f.read()

print(len(raw_data))

20479


## Creating Input-Target Pairs
- we will implement a data loader that fetches the Input-Target pairs using a sliding approach

In [2]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2") 

In [3]:
enc_text = tokenizer.encode(raw_data)
len(enc_text)

5145

In [9]:
enc_sample = enc_text[50:]
len(enc_sample)

5095

In [14]:
print(enc_sample[:4])
print("    ",enc_sample[1:5])

[290, 4920, 2241, 287]
     [4920, 2241, 287, 257]


In [15]:
# the context size determines how many tokens are included in the input
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f'x:{x}')
print(f'y:     {y}')

x:[290, 4920, 2241, 287]
y:     [4920, 2241, 287, 257]


In [16]:
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, '---->' ,desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [17]:
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), '---->' ,tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


### Implementing Data Loader
- is a task before go to embeddings, implementing an efficient data loader that iterates over the input dataset and return the inputs and targets as PyTorch tensors

In [23]:
len(enc_sample) - context_size

5091

In [24]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f'x:{x}')
print(f'y:     {y}')

x:[290, 4920, 2241, 287]
y:     [4920, 2241, 287, 257]


In [40]:
for i in range(0, len(enc_sample)-context_size):
    x = enc_sample[i : context_size+i]
    y = enc_sample[i+1 : context_size+i+1]
    print('\n')
    print(f'x:{x}')
    print(f'y:     {y}')



x:[290, 4920, 2241, 287]
y:     [4920, 2241, 287, 257]


x:[4920, 2241, 287, 257]
y:     [2241, 287, 257, 4489]


x:[2241, 287, 257, 4489]
y:     [287, 257, 4489, 64]


x:[287, 257, 4489, 64]
y:     [257, 4489, 64, 319]


x:[257, 4489, 64, 319]
y:     [4489, 64, 319, 262]


x:[4489, 64, 319, 262]
y:     [64, 319, 262, 34686]


x:[64, 319, 262, 34686]
y:     [319, 262, 34686, 41976]


x:[319, 262, 34686, 41976]
y:     [262, 34686, 41976, 13]


x:[262, 34686, 41976, 13]
y:     [34686, 41976, 13, 357]


x:[34686, 41976, 13, 357]
y:     [41976, 13, 357, 10915]


x:[41976, 13, 357, 10915]
y:     [13, 357, 10915, 314]


x:[13, 357, 10915, 314]
y:     [357, 10915, 314, 2138]


x:[357, 10915, 314, 2138]
y:     [10915, 314, 2138, 1807]


x:[10915, 314, 2138, 1807]
y:     [314, 2138, 1807, 340]


x:[314, 2138, 1807, 340]
y:     [2138, 1807, 340, 561]


x:[2138, 1807, 340, 561]
y:     [1807, 340, 561, 423]


x:[1807, 340, 561, 423]
y:     [340, 561, 423, 587]


x:[340, 561, 423, 587]
y:     [56

In [73]:
from torch.utils.data import Dataset, DataLoader
import torch

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk = token_ids[i : max_length+i]
            target_chunk = token_ids[i+1 : max_length+i+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids ) 

    def __getitem__(self, idx):
        # return self.input_ids[idx], self.target_ids[idx]
        return torch.tensor(self.input_ids[idx]) , torch.tensor(self.target_ids[idx])

- Now after we implemented the Data Loarder, we will use it to load the inputs in batches via PyTorch DataLoader

In [74]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                      stride=256//2, shuflle=True, drop_last=True, num_workers=0):
    """ batch_size : number of CPUs """
    tokenizer = tiktoken.get_encoding('gpt2')
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuflle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

In [None]:
dataloader = create_dataloader_v1(
    raw_data, batch_size=1, max_length=4,stride=1, shuflle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

<torch.utils.data.dataloader.DataLoader object at 0x0000021D514B9990>


  return torch.tensor(self.input_ids[idx]) , torch.tensor(self.target_ids[idx])


[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [76]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


  return torch.tensor(self.input_ids[idx]) , torch.tensor(self.target_ids[idx])


In [83]:
dataloader = create_dataloader_v1(
    raw_data, batch_size=8, max_length=4,stride=4, shuflle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n",inputs)
print("\nTargets:\n",targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


  return torch.tensor(self.input_ids[idx]) , torch.tensor(self.target_ids[idx])
