In [None]:
# Read the whole text file into memory as a single string; the `with` block closes the file.
# NOTE(review): the path is absolute ("/content/...") - confirm it exists in the runtime in use.
with open("/content/the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

# Report the size of the text and preview its first 99 characters.
print("Total number of characters: ", len(raw_text))
print(raw_text[:99])

Total number of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
import re

text = "Hello, world. This, is a test."
# Split on every whitespace character; the capturing group makes re.split
# keep each separator as its own entry in the result.
res = re.split(r'(\s)', text)

print(res)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [None]:
# Split on commas, periods, and whitespace, keeping the separators (capturing group).
# The alternation must not contain literal spaces: in a pattern such as `[,.] | \s`
# the spaces are matched literally, so a lone space never splits and punctuation is
# only matched together with a following space.
res = re.split(r'([,.]|\s)', text)
print(res)

['Hello', ', ', 'world', '. ', 'This', ', ', 'is a test.']


In [None]:
# Drop entries that are empty or whitespace-only. Entries that are kept are
# not stripped themselves, so any surrounding whitespace they carry remains.
res = [item for item in res if item.strip()]
print(res)

['Hello', ', ', 'world', '. ', 'This', ', ', 'is a test.']


In [None]:
text = "Hello, world. Is this-- a test?"
# Split on punctuation, the double dash, and whitespace. No literal spaces are
# allowed around the `|` alternatives: they would become part of the pattern and
# stop "--" and single spaces from matching.
res = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# Strip each piece and drop the ones that are empty after stripping.
res = [item.strip() for item in res if item.strip()]
print(res)

['Hello', ',', 'world', '.', 'Is this-- a test?']


In [None]:
# Tokenize the full text: split on punctuation, "--", and whitespace (the capturing
# group keeps the separators), then strip each piece and drop the empty ones.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
# Preview the first 30 tokens.
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [None]:
# Total number of tokens in `preprocessed` (duplicates included).
print(len(preprocessed))

4690


### Assigning token IDs

In [None]:
# Build the vocabulary: the unique tokens in sorted order.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)
# NOTE(review): this prints the entire vocabulary; consider printing a slice instead.
print(all_words)

1130
['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon-dancers', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There', 'They', 'This', 'Those', 'Though', 'Thwing', 'Thwings', 'To', 'Usually', 'Venetian', 'Victor', 'Was', 'We', 'Well', 'What', 'When', 'Why', 'Yes', 'You', '_', 'a', 'abdication', 'able', 'about', 'ab

In [None]:
# Map each token to an integer ID equal to its position in the sorted vocabulary.
vocab = {token:integer for integer, token in enumerate(all_words)}
# Preview the first 51 (token, id) pairs: indices 0 through 50.
for i, item in enumerate(vocab.items()):
  print(item)
  if i >= 50:
    break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


### Encode and Decode function

In [None]:
class SimpleTokenizerV1:
  """Word-level tokenizer backed by a fixed token-to-ID vocabulary.

  Text is split on whitespace, on "--", and on the punctuation characters
  , . : ; ? _ ! " ( ) ' so that each piece becomes one token. A token that
  is missing from the vocabulary makes encode() raise KeyError.
  """

  def __init__(self, vocab):
    """Keep `vocab` (token -> ID) and build the reverse ID -> token mapping."""
    self.str_to_int = vocab
    self.int_to_str = {token_id: token for token, token_id in vocab.items()}

  def encode(self, text):
    """Return the list of token IDs for `text`.

    Raises KeyError if a token is not present in the vocabulary.
    """
    ids = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
      token = piece.strip()
      # Whitespace separators and empty split results carry no token.
      if token:
        ids.append(self.str_to_int[token])
    return ids

  def decode(self, ids):
    """Turn token IDs back into text.

    Tokens are joined with single spaces, then the whitespace in front of
    the punctuation characters , . : ; ? _ ! " ( ) ' is removed.
    """
    joined = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', joined)


In [None]:
tokenizer = SimpleTokenizerV1(vocab)

# Sample passage. Every token in it must already be a key of `vocab`, because
# SimpleTokenizerV1.encode looks each token up directly in the vocabulary.
text = """"It's the last he painted, you know,"
          Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [None]:
# Decode the IDs back to text; spacing around some punctuation differs from the input.
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

### Adding special context tokens (`<|unk|>` and `<|endoftext|>`)

In [None]:
# Vocabulary = sorted unique tokens, followed by the two special tokens
# (end-of-text marker and unknown-token placeholder).
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

# Assign each token the ID equal to its position in `all_tokens`.
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

1132


In [None]:
# Show the last five vocabulary entries, which include the two special tokens.
# NOTE(review): the enumerate index `i` is not used in the loop body.
for i, item in enumerate(list(vocab.items())[-5:]):
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [None]:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

  Text is split on whitespace, on "--", and on the punctuation characters
  , . : ; ? _ ! " ( ) ' only. A special token such as "<|endoftext|>" is
  therefore recognised only when it is separated from neighbouring words by
  one of those separators; otherwise it is fused with the adjacent text and
  treated as an unknown token.
  """

  def __init__(self, vocab):
    """Keep `vocab` (token -> ID) and build the reverse ID -> token mapping."""
    self.str_to_int = vocab
    self.int_to_str = {token_id: token for token, token_id in vocab.items()}

  def encode(self, text):
    """Return the list of token IDs for `text`.

    Tokens that are not in the vocabulary are encoded as "<|unk|>"; the
    vocabulary must contain that entry for such inputs, otherwise the
    lookup raises KeyError.
    """
    ids = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
      token = piece.strip()
      if not token:
        # Whitespace separators and empty split results carry no token.
        continue
      if token not in self.str_to_int:
        token = "<|unk|>"
      ids.append(self.str_to_int[token])
    return ids

  def decode(self, ids):
    """Turn token IDs back into text.

    Tokens are joined with single spaces, then the whitespace in front of
    the punctuation characters , . : ; ? _ ! " ( ) ' is removed.
    """
    joined = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', joined)


In [None]:
# Rebind `tokenizer` to a SimpleTokenizerV2 built on the current `vocab`.
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join the two passages with the end-of-text marker. The marker is surrounded by
# spaces because the simple tokenizers split only on whitespace and punctuation:
# without the spaces, "<|endoftext|>In" would be a single (unknown) token.
text = " <|endoftext|> ".join((text1,text2))

print(text)

Hello, do you like tea?<|endoftext|>In the sunlit terraces of the palace.


In [None]:
# Encode the joined sample; tokens missing from the vocabulary get the "<|unk|>" ID.
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1131, 988, 956, 984, 722, 988, 1131, 7]

In [None]:
# Round trip: encode then decode to see which tokens were replaced by "<|unk|>".
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|unk|> the sunlit terraces of the <|unk|>.'

### Byte Pair Encoding

In [None]:
!pip install tiktoken



In [None]:
import tiktoken
# Load the encoding named "gpt2". This rebinds `tokenizer`, replacing the
# SimpleTokenizerV2 instance used in the earlier cells.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
)
# allowed_special permits "<|endoftext|>" in the input to be encoded as a special
# token (it shows up as the single ID 50256 in the printed result).
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [None]:
# decode_batch is given a list of ID sequences and the result is a list of strings,
# hence the extra list around `integers`.
strings = tokenizer.decode_batch([integers])
print(strings)

['Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.']


#### The `<|endoftext|>` token is assigned a relatively large token ID, namely 50256. <br>
#### In fact, the BPE tokenizer that was used to train models such as GPT-2, GPT-3, and the original model used in ChatGPT has a total vocabulary size of 50,257, with `<|endoftext|>` being assigned the largest token ID. <br>
#### The BPE tokenizer above encodes and decodes unknown words, such as "someunknownPlace", correctly. The BPE tokenizer can handle any unknown word. How does it achieve this without using `<|unk|>` tokens? <br>
#### The algorithm underlying BPE breaks down words that aren't in its predefined vocabulary into smaller subword units or even individual characters. This enables it to handle out-of-vocabulary words. <br>


In [None]:
# Encode a made-up string: it is broken into several smaller pieces
# instead of being mapped to an unknown-token placeholder.
integers = tokenizer.encode("Akwirw ier")
print(integers)

# Decoding the IDs reproduces the original string.
strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier


# **Lecture 9: Creating Input-Target data pairs using Python Dataloader**

In [None]:
# Load the text again and encode all of it with the current `tokenizer`.
# NOTE(review): the path is absolute ("/content/...") - confirm it exists in the runtime in use.
with open("/content/the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
# Number of token IDs in the encoded text.
print(len(enc_text))

5145


In [None]:
# Skip the first 50 token IDs and work with the remainder.
enc_sample = enc_text[50:]

The context size determines how many tokens are included in the input.

In [None]:
context_size = 10
#A context_size of 10 means that the model is trained to look at a sequence of
#up to 10 tokens to predict the next token in the sequence.

# x holds the first context_size tokens; y is the same window shifted one position to the right.
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x = {x}")
print(f"y = {y}")

x = [290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686]
y = [4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976]


In [None]:
# Grow the context one token at a time; the target is the token right after the context.
for i in range(1, context_size+1):
  context = enc_sample[:i]
  desired = enc_sample[i]

  print(context, "--->", desired)

[290] ---> 4920
[290, 4920] ---> 2241
[290, 4920, 2241] ---> 287
[290, 4920, 2241, 287] ---> 257
[290, 4920, 2241, 287, 257] ---> 4489
[290, 4920, 2241, 287, 257, 4489] ---> 64
[290, 4920, 2241, 287, 257, 4489, 64] ---> 319
[290, 4920, 2241, 287, 257, 4489, 64, 319] ---> 262
[290, 4920, 2241, 287, 257, 4489, 64, 319, 262] ---> 34686
[290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686] ---> 41976


Everything to the left of the arrow (`--->`) refers to the input an LLM would receive, and the token ID on the right side of the arrow represents the target token ID that the LLM is supposed to predict.

In [None]:
# Same context/target pairs as token IDs, but decoded back to text for readability.
for i in range(1, context_size+1):
  context = enc_sample[:i]
  desired = enc_sample[i]

  # decode expects a sequence of IDs, so the single target ID is wrapped in a list.
  print(tokenizer.decode(context), "--->", tokenizer.decode([desired]))

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a
 and established himself in a --->  vill
 and established himself in a vill ---> a
 and established himself in a villa --->  on
 and established himself in a villa on --->  the
 and established himself in a villa on the --->  Riv
 and established himself in a villa on the Riv ---> iera


### We will implement a data loader that fetches input-output target pairs using a **sliding window** approach.

Step 1: Tokenize the entire text <br>
Step 2: Use a sliding window to chunk the book into overlapping sequences of max_length <br>
Step 3: Return the total number of rows in the dataset
<br>
Step 4: Return a single row from the dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is encoded once with `tokenizer`. Windows of `max_length` token
  IDs are then taken every `stride` positions; each window's target is the
  same window shifted one token to the right. If the encoded text has no
  more than `max_length` tokens, the dataset is empty.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    """Encode `txt` and precompute every input/target window as a tensor."""
    # "<|endoftext|>" is allowed to appear in the text as a special token.
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # Start offsets stop early enough that every target window is full length.
    window_starts = range(0, len(token_ids) - max_length, stride)

    self.input_ids = [
        torch.tensor(token_ids[start:start + max_length])
        for start in window_starts
    ]
    self.target_ids = [
        torch.tensor(token_ids[start + 1:start + max_length + 1])
        for start in window_starts
    ]

  def __len__(self):
    """Number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input, target) tensor pair at position `idx`."""
    return self.input_ids[idx], self.target_ids[idx]

The following code will use the GPTDatasetV1 to load the inputs in batches via a Pytorch DataLoader.
<br>
<br>
Step 1: Initialize the tokenizer. <br>
Step 2: Create dataset <br>
Step 3: drop_last = True drops the last batch if it is shorter than the specified batch_size, to prevent loss spikes during training <br>
Step 4: The number of CPU processes to use for the preprocessing

We are using the DataLoader because it can load data in parallel worker processes and it groups samples into batches for us.

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
  """Build a DataLoader over sliding-window samples of `txt`.

  The text is encoded with the tiktoken "gpt2" encoding and wrapped in a
  GPTDatasetV1 using `max_length` and `stride`. The remaining arguments
  (`batch_size`, `shuffle`, `drop_last`, `num_workers`) are passed straight
  to torch's DataLoader.
  """
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

  loader_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
  return DataLoader(windows, **loader_options)

In [None]:
# batch_size=1 with max_length=4 and stride=1: each batch holds one 4-token window,
# and consecutive windows start one token apart. shuffle=False keeps them in text order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# Pull the first (inputs, targets) batch from the loader.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
# The next batch from the same iterator: with stride=1 its window starts one token later.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In deep learning, small batch sizes require less memory during training but lead to noisier model updates. <br>
Just like in regular deep learning, the batch size is a trade-off and a hyperparameter to experiment with when training LLMs.

In [None]:
# stride equal to max_length (4): consecutive input windows do not overlap.
# batch_size=8 stacks eight windows into each batch.
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

# Pull the first (inputs, targets) batch from the loader.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


More overlap between consecutive windows could lead to more overfitting, so avoiding that overlap is one advantage of using a larger stride.