In [2]:
with open("The_Verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of chars :" ,len(raw_text))
print(raw_text[:99])

Total number of chars : 20482
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [7]:
import re 
text = "Hello, world.This , is a test"
result = re.split(r'([,.]|\s)',text)
print(result)

['Hello', ',', '', ' ', 'world', '.', 'This', ' ', '', ',', '', ' ', 'is', ' ', 'a', ' ', 'test']


In [8]:
result  = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test']


In [10]:
text = "Hello, world.This -- ? is a test"
result = re.split(r'([?/_!()`,.]|--|\s)',text)
print(result)

['Hello', ',', '', ' ', 'world', '.', 'This', ' ', '', '--', '', ' ', '', '?', '', ' ', 'is', ' ', 'a', ' ', 'test']


In [13]:
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', '--', '?', 'is', 'a', 'test']


## Applying to the whole text

In [16]:
pre_processed = re.split(r'([?/_!()`,\".]|--|\s)',raw_text)
pre_processed = [item.strip() for item in pre_processed if item.strip()]
print(len(pre_processed))

4478


In [17]:
print(pre_processed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## Creating the vocab

In [18]:
all_words = sorted(list(set(pre_processed)))
vocab_size = len(all_words)
print(vocab_size)

1183


In [22]:
vocab = { token : integer for integer,token in enumerate(all_words)}
for i , item in enumerate(vocab.items()):
    print(item)
    if i>50:
        break

('!', 0)
('"', 1)
("'", 2)
("'Are", 3)
("'It's", 4)
("'coming'", 5)
("'done'", 6)
("'subject", 7)
("'technique'", 8)
("'way", 9)
('(', 10)
(')', 11)
(',', 12)
('--', 13)
('.', 14)
(':', 15)
(';', 16)
('?', 17)
('A', 18)
('Ah', 19)
('Among', 20)
('And', 21)
('Arrt', 22)
('As', 23)
('At', 24)
('Be', 25)
('Begin', 26)
('Burlington', 27)
('But', 28)
('By', 29)
('Carlo', 30)
('Carlo;', 31)
('Chicago', 32)
('Claude', 33)
('Come', 34)
('Croft', 35)
('Destroyed', 36)
('Devonshire', 37)
("Don't", 38)
('Dubarry', 39)
('Emperors', 40)
('Florence', 41)
('For', 42)
('Gallery', 43)
('Gideon', 44)
('Gisburn', 45)
("Gisburn's", 46)
('Gisburns', 47)
('Grafton', 48)
('Greek', 49)
('Grindle', 50)
("Grindle's", 51)


In [40]:
class SimpleTokenizer:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encoder(self,text):
        pre_processed = re.split(r'([,.?_!"()\']|--|\s)',text)
        pre_processed = [item.strip() for item in pre_processed if item.strip()]
        
        ids = [self.str_to_int[s] for s in pre_processed]
        return ids
    
    def decoder(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])',r'\1',text)
        return text

In [43]:
tokenizer = SimpleTokenizer(vocab)

text = ("He painted?")
ids = tokenizer.encoder(text)
print(ids)

[58, 785, 17]


In [44]:
tokenizer.decoder(ids)

'He painted?'

In [53]:
all_tokens = sorted(list(set(pre_processed)))
all_tokens.extend(["<|endoftext|>","<|unk|>"])
vocab = {tokens:integer for integer,tokens in enumerate(all_tokens)}


In [55]:
for i,item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1180)
('your', 1181)
('yourself', 1182)
('<|endoftext|>', 1183)
('<|unk|>', 1184)


In [56]:
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encoder(self,text):
        pre_processed = re.split(r'([,.?_!"()\']|--|\s)',text)
        pre_processed = [item.strip() for item in pre_processed if item.strip()]
        pre_processed = [item if item in self.str_to_int else "<|unk|>" for item in pre_processed]
        
        ids = [self.str_to_int[s] for s in pre_processed]
        return ids
    
    def decoder(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])',r'\1',text)
        return text

In [60]:
text_1 = "Hello, do you lke tea ?"
text_2 = "My name is cata"
text = "<|endoftext|> ".join((text_1,text_2))
print(text)

Hello, do you lke tea ?<|endoftext|> My name is cata


In [61]:
tokenizer = SimpleTokenizerV2(vocab)
print(text)
print(tokenizer.encoder(text))
print(tokenizer.decoder(tokenizer.encoder(text)))


Hello, do you lke tea ?<|endoftext|> My name is cata
[1184, 12, 379, 1177, 1184, 1020, 17, 1183, 84, 1184, 616, 1184]
<|unk|>, do you <|unk|> tea? <|endoftext|> My <|unk|> is <|unk|>


In [62]:
!pip install tiktoken

Collecting tiktoken
  Obtaining dependency information for tiktoken from https://files.pythonhosted.org/packages/69/ca/0a71c1cdbf36da977bd306d295042087187954c32bfa259fa7afede0608b/tiktoken-0.6.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tiktoken-0.6.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Downloading tiktoken-0.6.0-cp311-cp311-win_amd64.whl (798 kB)
   ---------------------------------------- 0.0/798.7 kB ? eta -:--:--
   -------------- ------------------------- 286.7/798.7 kB 5.9 MB/s eta 0:00:01
   ---------------------------------------- 798.7/798.7 kB 8.4 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


DEPRECATION: Loading egg at c:\users\catal\anaconda3\lib\site-packages\dlib-19.24.2-py3.11-win-amd64.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..


In [68]:
import importlib_metadata
import tiktoken

print("tiktoken version : ", importlib_metadata.version("tiktoken"))

tiktoken version :  0.6.0


In [71]:
tokenizer = tiktoken.get_encoding("gpt2")

In [75]:
text = "Hello, how are you ? <|endoftext|> I am in RomaniaFOOT"
integers = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 703, 389, 345, 5633, 220, 50256, 314, 716, 287, 23356, 6080, 2394]


In [76]:
strings = tokenizer.decode(integers)
print(strings)

Hello, how are you ? <|endoftext|> I am in RomaniaFOOT


## Data sampling

In [78]:
with open("The_Verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5147


In [80]:
enc_sample = enc_text[:50]
print(enc_sample )

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11]


In [82]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size]
print(f"x: {x}")
print(f"y:     {y}")

x: [40, 367, 2885, 1464]
y:     [367, 2885, 1464]


In [89]:
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context,"--->",desired)
    

[40] ---> 367
[40, 367] ---> 2885
[40, 367, 2885] ---> 1464
[40, 367, 2885, 1464] ---> 1807


In [90]:
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context),"--->",tokenizer.decode([desired]))
    

I --->  H
I H ---> AD
I HAD --->  always
I HAD always --->  thought


In [91]:
import torch
from torch.utils.data import Dataset

In [108]:
class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.tokenizer = tokenizer
        self.inputs_ids = []
        self.target_ids = []
        
        token_ids = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
        
        for i in range(0,len(token_ids) - max_length,stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.inputs_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    def __len__(self):
        return len(self.inputs_ids)
    def __getitem__(self, idx):
        return self.inputs_ids[idx],self.target_ids[idx]

In [110]:
from torch.utils.data import DataLoader
def create_dataloader(txt,batch_size=4,max_length=256,stride=128,shuffle=True):
    toke = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt,toke,max_length,stride)
    dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=shuffle)
    
    return dataloader

In [115]:
with open("The_Verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()
dataloader = create_dataloader(raw_text,1,4,1,False)
data_iter = iter(dataloader)
first_batch = next(data_iter)

print(first_batch)

[tensor([[15496,    11,   703,   389]]), tensor([[ 11, 703, 389, 345]])]
