In [69]:
import re
from importlib.metadata import version
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

# print("tiktoken version:", version("tiktoken"))

# with open("the-verdict.txt", "r", encoding="utf-8") as f:
#     raw_text = f.read()

# pattern = r"[\s.,!?;:\"“”‘’()\[\]{}—–\-\/\\@#$%^&*_+=<>|~`]+"
# tokens = re.split(pattern, raw_text)

# print(tokens)
# print(f"Total tokens: {len(tokens)}")
# print(tokens[:30])


# unique_sorted_tokens= sorted(set(tokens))

# print(unique_sorted_tokens)

# vocab_size = len(unique_sorted_tokens)
# print(f"Vocabulary size: {vocab_size}")

# vocabulary = {token:integer for integer,token in enumerate(unique_sorted_tokens)}

# for  i ,token in enumerate(vocabulary.items()):
#     print(token)

class SimpleTokenizer:
    """Word-level tokenizer whose vocabulary is built from a training text.

    Text is split on runs of whitespace and punctuation (the delimiters are
    discarded), and every distinct remaining word gets an integer ID.  Two
    special tokens are always present at fixed positions:

    * ``"<unk>"`` (ID 0) stands in for any word not seen at construction time.
    * ``"<endoftext>"`` (ID 1) marks a document boundary.

    Attributes:
        pattern: Regex (as a string) matching one run of delimiter characters.
        vocab: List of tokens; the index of a token is its ID.
        token_to_id: Mapping from token string to ID.
        id_to_token: Mapping from ID to token string.
    """

    def __init__(self, text):
        """Build the vocabulary from *text*.

        Args:
            text: Training corpus as a single string.
        """
        self.pattern = r"[\s.,!?;:\"“”‘’()\[\]{}—–\-\/\\@#$%^&*_+=<>|~`]+"

        special_tokens = ["<unk>", "<endoftext>"]
        tokens = self._tokenize(text)
        # Special tokens already occupy the first slots; drop them from the
        # corpus words so a corpus containing "<endoftext>" does not create a
        # duplicate vocab entry (which would shift its ID away from 1 and make
        # len(vocab) disagree with the lookup tables).
        self.vocab = special_tokens + sorted(set(tokens) - set(special_tokens))
        self.token_to_id = {tok: i for i, tok in enumerate(self.vocab)}
        self.id_to_token = {i: tok for tok, i in self.token_to_id.items()}

    def _tokenize(self, text):
        """Split *text* into word tokens, keeping ``"<endoftext>"`` intact.

        The delimiter pattern contains ``<`` and ``>``, so the special token
        would be torn apart by a plain regex split.  The text is therefore
        first cut at every literal ``"<endoftext>"`` and only the pieces in
        between are regex-split; this also guarantees the marker is its own
        token even when it touches a neighbouring word without a delimiter.

        Args:
            text: String to tokenize.

        Returns:
            List of non-empty token strings in order of appearance.
        """
        tokens = []
        for index, segment in enumerate(text.split("<endoftext>")):
            if index:
                # Every boundary between two segments was one marker.
                tokens.append("<endoftext>")
            tokens.extend(t for t in re.split(self.pattern, segment) if t)
        return tokens

    def encode(self, text):
        """Convert *text* to a list of token IDs.

        Unknown words map to the ``"<unk>"`` ID.  An ``"<endoftext>"`` ID is
        always appended after the last token.

        Args:
            text: String to encode.

        Returns:
            List of integer IDs, ending with the ``"<endoftext>"`` ID.
        """
        tokens = self._tokenize(text)
        unk_id = self.token_to_id["<unk>"]
        ids = [self.token_to_id.get(t, unk_id) for t in tokens]
        ids.append(self.token_to_id["<endoftext>"])
        return ids

    def decode(self, ids):
        """Convert token IDs back to a space-separated string.

        The original delimiters are not stored, so the result is the tokens
        joined by single spaces rather than the exact input text.

        Args:
            ids: Iterable of integer token IDs.

        Returns:
            The decoded string.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        return " ".join(self.id_to_token[i] for i in ids)



# Load the training corpus; the tokenizer vocabulary below is built from
# exactly this text.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()


# Word-level tokenizer whose vocabulary is derived from raw_text.
tokenizer = SimpleTokenizer(raw_text)

# text1 = "forced hello to myself"
# text2 = "forced hello to myself2 Akwirw ier"

# text = " <endoftext> ".join([text1, text2])
# print(text)

# encoded = tokenizer.encode(text)
# decoded = tokenizer.decode(encoded)

# print("Encoded:", encoded)
# print("Decoded:", decoded)

# GPT-2 byte-pair-encoding tokenizer from tiktoken.
tiktoken_tokenizer = tiktoken.get_encoding("gpt2")

# Encode the whole corpus once.  This assignment must stay active: the
# sliding-window demo below reads tiktoken_encoded, and leaving it commented
# out raises NameError when the cell is run in a fresh kernel.
tiktoken_encoded = tiktoken_tokenizer.encode(raw_text)
# tiktoken_decoded = tiktoken_tokenizer.decode(tiktoken_encoded)

# print(len(tiktoken_encoded))
# print("Encoded:", tiktoken_encoded)
# print("Decoded:", tiktoken_decoded)

# Skip the first 50 token IDs and use the remainder as the demo sample.
encoding_sample = tiktoken_encoded[50:]

# Number of tokens in one input window.
context_size = 4
# Input window and the same window shifted right by one token (the targets).
x = encoding_sample[:context_size]
y = encoding_sample[1:context_size + 1]

# print("x:", x)
# print("y:", y)

# Next-token prediction pairs as IDs: a growing prefix and the token after it.
for i in range(1, context_size+1):
    context = encoding_sample[:i]
    target = encoding_sample[i]
    # print(f"Context: {context} -> Target: {target}")

# Same pairs again; the (disabled) print shows them decoded back to text.
for i in range(1, context_size+1):
    context = encoding_sample[:i]
    target = encoding_sample[i]
    # print(tiktoken_tokenizer.decode(context) ," -> Target: ", tiktoken_tokenizer.decode([target]))

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) pairs for next-token prediction.

    Sample ``idx`` starts at token ``idx * stride``.  Its input is the next
    ``max_length`` tokens and its target is the same window shifted right by
    one token, so a sample needs ``max_length + 1`` tokens in total.  Only
    windows for which the full target exists are counted, which keeps every
    sample the same length and therefore batchable.

    Args:
        tokens: Sequence of integer token IDs (must support ``len`` and
            slicing).
        max_length: Number of tokens in each input/target window.
        stride: Distance in tokens between the starts of consecutive windows.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, tokens, max_length, stride=1):
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        self.tokens = tokens
        self.max_length = max_length
        self.stride = stride

    def __len__(self):
        """Return the number of complete (input, target) windows."""
        # The last usable start index is len(tokens) - max_length - 1: the
        # target slice ends one token past the input slice.
        last_start = len(self.tokens) - self.max_length - 1
        if last_start < 0:
            # Too few tokens for even one window; len() must not be negative.
            return 0
        return last_start // self.stride + 1

    def __getitem__(self, idx):
        """Return the ``idx``-th window as two ``torch.long`` tensors.

        Args:
            idx: Sample index; negative values count from the end.

        Returns:
            Tuple ``(x, y)`` of 1-D tensors of length ``max_length`` where
            ``y`` is ``x`` shifted right by one token.

        Raises:
            IndexError: If ``idx`` is outside the dataset.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} out of range for dataset of size {size}")
        start_idx = idx * self.stride
        x = self.tokens[start_idx:start_idx + self.max_length]
        y = self.tokens[start_idx + 1:start_idx + self.max_length + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

def create_dataloader(tokenizer, text, max_length=256, stride=2, batch_size=1, drop_last=True, num_workers=0, shuffle=False):
    """Tokenize *text* and wrap the resulting windows in a ``DataLoader``.

    Args:
        tokenizer: Any object with an ``encode(text)`` method returning a
            sequence of token IDs.
        text: Corpus to encode.
        max_length: Window length handed to ``GPTDataset``.
        stride: Step between window starts handed to ``GPTDataset``.
        batch_size: Samples per batch.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.
        shuffle: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the ``GPTDataset`` built from the encoded text.
    """
    token_ids = tokenizer.encode(text)
    windows = GPTDataset(token_ids, max_length, stride=stride)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

# dataset = GPTDataset(tiktoken_encoded[50:], context_size, stride=2)
# dataloader = DataLoader(dataset, batch_size=8, shuffle=False, drop_last=True, num_workers=0)

# stride == max_length, so consecutive input windows do not overlap.
dataloader = create_dataloader(tiktoken_tokenizer, raw_text, max_length=context_size, stride=context_size, batch_size=1)
# Vocabulary size as reported by the tiktoken encoding.
vocab_size = tiktoken_tokenizer.n_vocab


# for batch in dataloader:
#     inputs, targets = batch
#     print("Inputs:", inputs)
#     print("Targets:", targets)
#     break


    
# torch.manual_seed(123)
# embedding_layer = torch.nn.Embedding(vocab_size, context_size)
# print(embedding_layer.weight)
# print(embedding_layer(torch.tensor([40,  367, 2885, 1464])))


# output_dimensions = 256 

# token_embedding_layer = torch.nn.Embedding(vocab_size, output_dimensions)

# dataloader = create_dataloader(tiktoken_tokenizer, raw_text, max_length=context_size, stride=context_size, batch_size=8)

# data_iter = iter(dataloader)
# inputs, targets = next(data_iter)

# print("Token IDs:", inputs)
# print("Input shape:", inputs.shape)


# token_embeddings = token_embedding_layer(inputs)
# print("Token Embeddings shape:", token_embeddings.shape)


# positional_embedding_layer = torch.nn.Embedding(context_size, output_dimensions)
# positional_embeddings = positional_embedding_layer(torch.arange(context_size))
# print("Positional Embeddings shape:", positional_embeddings.shape)


# input_embeddings = token_embeddings + positional_embeddings
# print("Input Embeddings shape:", input_embeddings.shape)





# Using SimpleTokenizer
# Build a loader over the corpus encoded with the word-level SimpleTokenizer.
dataloader_simple = create_dataloader(tokenizer, raw_text, max_length=context_size, stride=context_size, batch_size=1)
# Includes the two special tokens stored at the front of tokenizer.vocab.
vocab_size_simple = len(tokenizer.vocab)

print("SimpleTokenizer vocab_size:", vocab_size_simple)

# Take only the first batch; the loop exits immediately after unpacking it.
for batch in dataloader_simple:
    inputs_simple, targets_simple = batch
    # print("Simple Inputs:", inputs_simple)
    # print("Simple Targets:", targets_simple)
    break

# Using tiktoken
# Same corpus and window settings, encoded with the tiktoken tokenizer instead.
dataloader_tiktoken = create_dataloader(tiktoken_tokenizer,raw_text, max_length=context_size, stride=context_size, batch_size=1)
vocab_size_tiktoken = tiktoken_tokenizer.n_vocab

# print("Tiktoken vocab_size:", vocab_size_tiktoken)

# Take only the first batch; the loop exits immediately after unpacking it.
for batch in dataloader_tiktoken:
    inputs_tiktoken, targets_tiktoken = batch
    # print("Tiktoken Inputs:", inputs_tiktoken)
    # print("Tiktoken Targets:", targets_tiktoken)
    break


    
# Seed once; every embedding layer created after this point draws its initial
# weights from the same random stream, so reordering or removing a layer
# changes the weights of the layers that follow it.
torch.manual_seed(123)

# Embeddings for SimpleTokenizer
# NOTE: context_size is passed here as the embedding dimension (the second
# argument of torch.nn.Embedding), giving a small 4-dimensional demo layer.
embedding_layer_simple = torch.nn.Embedding(vocab_size_simple, context_size)
# print("Simple Embedding weight shape:", embedding_layer_simple.weight.shape)
# print(embedding_layer_simple(inputs_simple[0]))

# Embeddings for tiktoken
# Same small demo layer, sized for the tiktoken vocabulary.
embedding_layer_tiktoken = torch.nn.Embedding(vocab_size_tiktoken, context_size)
# print("Tiktoken Embedding weight shape:", embedding_layer_tiktoken.weight.shape)
# print(embedding_layer_tiktoken(inputs_tiktoken[0]))



# Embedding width shared by the token and positional embedding layers below.
output_dimensions = 256 

# For SimpleTokenizer
# One embedding row per vocabulary entry.
token_embedding_layer_simple = torch.nn.Embedding(vocab_size_simple, output_dimensions)

# Batches of 8 windows, each context_size tokens long.
dataloader_simple_batch = create_dataloader(tokenizer, raw_text, max_length=context_size, stride=context_size, batch_size=8)

# Pull just the first batch from the loader.
data_iter_simple = iter(dataloader_simple_batch)
inputs_simple_batch, targets_simple_batch = next(data_iter_simple)

print("Simple Token IDs:", inputs_simple_batch)
print("Simple Input shape:", inputs_simple_batch.shape)

# Look up one embedding vector per token ID in the batch.
token_embeddings_simple = token_embedding_layer_simple(inputs_simple_batch)
print("Simple Token Embeddings shape:", token_embeddings_simple.shape)

# One learned vector per position 0..context_size-1 within a window.
positional_embedding_layer_simple = torch.nn.Embedding(context_size, output_dimensions)
positional_embeddings_simple = positional_embedding_layer_simple(torch.arange(context_size))
print("Simple Positional Embeddings shape:", positional_embeddings_simple.shape)

# The positional embeddings have no batch dimension; the addition broadcasts
# them across every window in the batch.
input_embeddings_simple = token_embeddings_simple + positional_embeddings_simple
print("Simple Input Embeddings shape:", input_embeddings_simple.shape)

# For tiktoken
# One embedding row per entry of the tiktoken vocabulary.
token_embedding_layer_tiktoken = torch.nn.Embedding(vocab_size_tiktoken, output_dimensions)

# Batches of 8 windows, each context_size tokens long.
dataloader_tiktoken_batch = create_dataloader(tiktoken_tokenizer, raw_text, max_length=context_size, stride=context_size, batch_size=8)

# Pull just the first batch from the loader.
data_iter_tiktoken = iter(dataloader_tiktoken_batch)
inputs_tiktoken_batch, targets_tiktoken_batch = next(data_iter_tiktoken)

print("Tiktoken Token IDs:", inputs_tiktoken_batch)
print("Tiktoken Input shape:", inputs_tiktoken_batch.shape)

# Look up one embedding vector per token ID in the batch.
token_embeddings_tiktoken = token_embedding_layer_tiktoken(inputs_tiktoken_batch)
print("Tiktoken Token Embeddings shape:", token_embeddings_tiktoken.shape)

# One learned vector per position 0..context_size-1 within a window.
positional_embedding_layer_tiktoken = torch.nn.Embedding(context_size, output_dimensions)
positional_embeddings_tiktoken = positional_embedding_layer_tiktoken(torch.arange(context_size))
print("Tiktoken Positional Embeddings shape:", positional_embeddings_tiktoken.shape)

# The positional embeddings have no batch dimension; the addition broadcasts
# them across every window in the batch.
input_embeddings_tiktoken = token_embeddings_tiktoken + positional_embeddings_tiktoken
print("Tiktoken Input Embeddings shape:", input_embeddings_tiktoken.shape)

SimpleTokenizer vocab_size: 1154
Simple Token IDs: tensor([[  54,   44,  158, 1024],
        [  62,   36,  838,  124],
        [ 267,  500, 1023,  124],
        [ 514,  448,  405,  928],
        [ 600, 1098,  726,  520],
        [ 982, 1037,  679, 1037],
        [ 547, 1006,  583, 1007],
        [ 550,  739,  561,  510]])
Simple Input shape: torch.Size([8, 4])
Simple Token Embeddings shape: torch.Size([8, 4, 256])
Simple Positional Embeddings shape: torch.Size([4, 256])
Simple Input Embeddings shape: torch.Size([8, 4, 256])
Tiktoken Token IDs: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Tiktoken Input shape: torch.Size([8, 4])
Tiktoken Token Embeddings shape: torch.Size([8, 4, 256])
Tiktoken Positional Embeddings shape: torch.Size([4, 256])


## Visualizing tokenization differences: SimpleTokenizer vs. tiktoken


In [75]:
# Combine token and positional embeddings for the tiktoken batch and report
# the resulting shape.
input_embeddings_tiktoken = token_embeddings_tiktoken + positional_embeddings_tiktoken
print("Tiktoken Input Embeddings shape:", input_embeddings_tiktoken.shape)

# Side-by-side look at how the two tokenizers split the same sentence.
sample_text = "Hello, how are you doing today?"

print("\n=== Tokenization Visualization ===")
print(f"Sample Text: '{sample_text}'")

# Word-level tokenizer: encode() always appends an <endoftext> ID, which is
# dropped here so the IDs line up one-to-one with the tokens.
simple_tokens = tokenizer._tokenize(sample_text)
simple_ids = tokenizer.encode(sample_text)
del simple_ids[-1]

print("\nSimpleTokenizer:")
print("Tokens:", simple_tokens)
print("IDs:", simple_ids)
print("Decoded:", tokenizer.decode(simple_ids))

# BPE tokenizer: encode, then decode back to check the round trip.
tiktoken_tokens = tiktoken_tokenizer.encode(sample_text)
tiktoken_decoded = tiktoken_tokenizer.decode(tiktoken_tokens)

print("\nTiktoken:")
print("Tokens (as IDs):", tiktoken_tokens)
print("Decoded:", tiktoken_decoded)

# One line per token, pairing each token string with its ID.
print("\nToken Boundaries (SimpleTokenizer):")
for i, (token, token_id) in enumerate(zip(simple_tokens, simple_ids)):
    print(f"Token {i}: '{token}' (ID: {token_id})")

# tiktoken only yields IDs, so each one is decoded on its own to show the
# text piece it covers.
print("\nToken Boundaries (Tiktoken):")
for i, token_id in enumerate(tiktoken_tokens):
    token = tiktoken_tokenizer.decode([token_id])
    print(f"Token {i}: '{token}' (ID: {token_id})")

Tiktoken Input Embeddings shape: torch.Size([8, 4, 256])

=== Tokenization Visualization ===
Sample Text: 'Hello, how are you doing today?'

SimpleTokenizer:
Tokens: ['Hello', 'how', 'are', 'you', 'doing', 'today']
IDs: [0, 572, 178, 1148, 371, 0]
Decoded: <unk> how are you doing <unk>

Tiktoken:
Tokens (as IDs): [15496, 11, 703, 389, 345, 1804, 1909, 30]
Decoded: Hello, how are you doing today?

Token Boundaries (SimpleTokenizer):
Token 0: 'Hello' (ID: 0)
Token 1: 'how' (ID: 572)
Token 2: 'are' (ID: 178)
Token 3: 'you' (ID: 1148)
Token 4: 'doing' (ID: 371)
Token 5: 'today' (ID: 0)

Token Boundaries (Tiktoken):
Token 0: 'Hello' (ID: 15496)
Token 1: ',' (ID: 11)
Token 2: ' how' (ID: 703)
Token 3: ' are' (ID: 389)
Token 4: ' you' (ID: 345)
Token 5: ' doing' (ID: 1804)
Token 6: ' today' (ID: 1909)
Token 7: '?' (ID: 30)
