#### Reading raw text

In [2]:
# Load the whole source text into memory as one UTF-8 decoded string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as file:
    raw_text = file.read()

#### Creating vocab

In [3]:
import re
# Split on punctuation, double dashes and whitespace; the capture group keeps
# the delimiters themselves as separate pieces.
preprocessed = re.split(r'([,.:?_!"()\']|--|\s)', raw_text)

# Trim every piece and discard the ones that were only whitespace (or empty).
result = [piece.strip() for piece in preprocessed if piece.strip()]

# Sorted unique tokens, followed by the two special tokens at the end.
allwords = sorted(set(result)) + ["<|unk|>", "<|endoftext|>"]

# Token -> integer id, where the id is the token's position in `allwords`.
vocab = dict(zip(allwords, range(len(allwords))))

##### Implementation of a simple tokenizer (for understanding)

In [4]:
class simpleTokenizer1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace. Every
    resulting token must be a key of the vocabulary; there is no fallback
    for unknown tokens, so encoding one raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and build the id -> token inverse."""
        self.str_to_int = vocab
        self.int_to_str = dict(
            (token_id, token) for token, token_id in vocab.items()
        )

    def encode(self, text):
        """Return the list of vocabulary ids for the tokens found in ``text``."""
        token_ids = []
        for fragment in re.split(r'([,.:?_!"()\']|--|\s)', text):
            token = fragment.strip()
            if not token:
                # Whitespace-only and empty fragments are not tokens.
                continue
            token_ids.append(self.str_to_int[token])
        return token_ids

    def decode(self, tokens):
        """Join the tokens for ``tokens`` (ids) into a single string."""
        spaced = " ".join(self.int_to_str[token_id] for token_id in tokens)
        # Drop the joining space that precedes punctuation, "--" or whitespace.
        return re.sub(r'\s([,.:?_!"()\']|--|\s)', r'\1', spaced)

In [5]:
# Round-trip a short phrase through the basic tokenizer: text -> ids -> text.
# simpleTokenizer1.encode indexes the vocabulary directly, so every word used
# here has to be present in `vocab` already.
tokenizer= simpleTokenizer1(vocab)
text="thought Jack Gisburn"
encoded= tokenizer.encode(text)
print(encoded)
# decode() joins the looked-up tokens back into a single string.
decode= tokenizer.decode(encoded)
print(decode)

[1014, 58, 39]
['thought', 'Jack', 'Gisburn']


In [6]:
class simpleTokenizer2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on punctuation, double dashes and whitespace. The split
    pattern includes ":" so that encoding tokenizes text with the same
    delimiter set that ``decode`` re-attaches; otherwise a word followed by
    a colon stays glued to it and is always treated as unknown.

    Args:
        vocab: Mapping from token string to integer id. It must contain the
            key ``"<|unk|>"`` for unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and build the id -> token inverse."""
        # vocab.items() yields (token, id) pairs; invert them for decoding.
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}
        self.str_to_int = vocab

    def encode(self, text):
        """Return the vocabulary ids for ``text``.

        Tokens that are not in the vocabulary are replaced by ``"<|unk|>"``.

        Raises:
            KeyError: If an unknown token is met and ``"<|unk|>"`` is not in
                the vocabulary.
        """
        pieces = re.split(r'([,.:?_!"()\']|--|\s)', text)
        # Trim each piece and drop whitespace-only or empty ones.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        known_or_unk = [
            token if token in self.str_to_int else "<|unk|>"
            for token in tokens
        ]
        return [self.str_to_int[token] for token in known_or_unk]

    def decode(self, tokens):
        """Join the tokens for ``tokens`` (ids) into a single string.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in tokens)
        # Drop the joining space that precedes punctuation, "--" or whitespace.
        return re.sub(r'\s([,.:?_!"()\']|--|\s)', r'\1', text)
        

In [7]:
# Same round trip with the <|unk|>-aware tokenizer: words that are missing from
# `vocab` are encoded as the id of "<|unk|>", so decoding cannot recover them.
tokenizer2= simpleTokenizer2(vocab)
text="Hello my name is Anant"
encoded= tokenizer2.encode(text)
print(encoded)
# decode() joins the looked-up tokens back into a single string.
decode= tokenizer2.decode(encoded)
print(decode)

[1143, 705, 1143, 590, 1143]
['<|unk|>', 'my', '<|unk|>', 'is', '<|unk|>']


### Byte-Pair Encoding (BPE): a subword tokenization technique used in GPT

In [3]:
import tiktoken
# Load tiktoken's "gpt2" encoding.
# NOTE: this rebinds `tokenizer`, which an earlier cell bound to a
# simpleTokenizer1 instance.
tokenizer= tiktoken.get_encoding("gpt2")

### Implementing the Dataset and DataLoader

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

In [6]:
class GPTDataset_V1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once, then cut into windows of ``maximum_length``
    token ids taken every ``stride`` tokens. Each sample is a pair
    ``(input_ids, target_ids)`` where the target is the input window shifted
    one token to the right.

    Windows are stored as ``torch.Tensor`` objects rather than Python lists so
    that ``DataLoader``'s default collate function stacks a batch into tensors
    of shape ``(batch_size, maximum_length)``. With plain lists the default
    collate transposes the batch into a list of per-position tensors instead.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            that returns a list of integer token ids.
        maximum_length: Number of tokens in every input/target window.
        stride: Step, in tokens, between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, maximum_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window never runs past the
        # end of the token sequence.
        for start in range(0, len(token_ids) - maximum_length, stride):
            stop = start + maximum_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]
            
            

In [7]:
def Create_Dataloader_V1(text, batch_size=4, maximum_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding token windows of ``text``.

    Args:
        text: Raw text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Number of samples per batch.
        maximum_length: Number of tokens in each window.
        stride: Step, in tokens, between consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataset_V1`` built from ``text``.
    """
    windows = GPTDataset_V1(
        text,
        tiktoken.get_encoding("gpt2"),
        maximum_length,
        stride,
    )
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)