In [11]:
import re
from importlib.metadata import version
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

# print("tiktoken version:", version("tiktoken"))

# with open("the-verdict.txt", "r", encoding="utf-8") as f:
#     raw_text = f.read()

# pattern = r"[\s.,!?;:\"“”‘’()\[\]{}—–\-\/\\@#$%^&*_+=<>|~`]+"
# tokens = re.split(pattern, raw_text)

# print(tokens)
# print(f"Total tokens: {len(tokens)}")
# print(tokens[:30])


# unique_sorted_tokens= sorted(set(tokens))

# print(unique_sorted_tokens)

# vocab_size = len(unique_sorted_tokens)
# print(f"Vocabulary size: {vocab_size}")

# vocabulary = {token:integer for integer,token in enumerate(unique_sorted_tokens)}

# for  i ,token in enumerate(vocabulary.items()):
#     print(token)

class SimpleTokenizer:
    """Word-level tokenizer whose vocabulary is built from a training text.

    Text is split on runs of whitespace/punctuation (``self.pattern``); the
    separator characters themselves are discarded, so decoding is lossy.
    Two special tokens always occupy fixed ids: ``<unk>`` (id 0) stands in
    for out-of-vocabulary words and ``<endoftext>`` (id 1) marks a document
    boundary.

    Attributes:
        pattern: Regex string matching one run of separator characters.
        vocab: The special tokens followed by the sorted unique tokens.
        token_to_id: Mapping from token string to integer id.
        id_to_token: Inverse of ``token_to_id``.
    """

    def __init__(self, text):
        """Build the vocabulary from ``text``.

        Args:
            text: Training corpus as a single string. It may contain
                ``<endoftext>`` markers between documents.
        """
        self.pattern = r"[\s.,!?;:\"“”‘’()\[\]{}—–\-\/\\@#$%^&*_+=<>|~`]+"

        special_tokens = ["<unk>", "<endoftext>"]
        tokens = self._tokenize(text)
        # Exclude the specials from the learned set: if the corpus itself
        # contains "<endoftext>" it would otherwise appear twice in the
        # vocabulary, leaving id 1 without a reverse mapping.
        learned = sorted(set(tokens) - set(special_tokens))
        self.vocab = special_tokens + learned
        self.token_to_id = {tok: i for i, tok in enumerate(self.vocab)}
        self.id_to_token = {i: tok for tok, i in self.token_to_id.items()}

    def _tokenize(self, text):
        """Split ``text`` into tokens, keeping ``<endoftext>`` intact.

        The separator pattern includes ``<`` and ``>``, so the special token
        must be handled before the regex split. Splitting on the marker
        first also separates it from words it directly touches
        (``"a<endoftext>b"`` -> ``["a", "<endoftext>", "b"]``).

        Args:
            text: String to tokenize.

        Returns:
            List of non-empty token strings in order of appearance.
        """
        tokens = []
        for position, segment in enumerate(text.split("<endoftext>")):
            # Every segment after the first was preceded by a marker.
            if position:
                tokens.append("<endoftext>")
            tokens.extend(t for t in re.split(self.pattern, segment) if t)
        return tokens

    def encode(self, text):
        """Convert ``text`` to token ids.

        Unknown tokens map to the ``<unk>`` id, and an ``<endoftext>`` id is
        always appended to the result.

        Args:
            text: String to encode.

        Returns:
            List of integer token ids.
        """
        tokens = self._tokenize(text)
        unk_id = self.token_to_id["<unk>"]
        ids = [self.token_to_id.get(t, unk_id) for t in tokens]
        ids.append(self.token_to_id["<endoftext>"])
        return ids

    def decode(self, ids):
        """Convert token ids back to a space-joined string.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The tokens joined by single spaces.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        return " ".join(self.id_to_token[i] for i in ids)



# Read the whole corpus file into memory as one UTF-8 decoded string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()


# Build the word-level tokenizer; its vocabulary is derived from this text.
tokenizer = SimpleTokenizer(raw_text)

# text1 = "forced hello to myself"
# text2 = "forced hello to myself2 Akwirw ier"

# text = " <endoftext> ".join([text1, text2])
# print(text)

# encoded = tokenizer.encode(text)
# decoded = tokenizer.decode(encoded)

# print("Encoded:", encoded)
# print("Decoded:", decoded)

# Load the encoding registered under the name "gpt2" in tiktoken.
tiktoken_tokenizer = tiktoken.get_encoding("gpt2")

# Encode the full corpus to token ids, then decode those ids back to text.
tiktoken_encoded = tiktoken_tokenizer.encode(raw_text)
tiktoken_decoded = tiktoken_tokenizer.decode(tiktoken_encoded)

# print(len(tiktoken_encoded))
# print("Encoded:", tiktoken_encoded)
# print("Decoded:", tiktoken_decoded)

# Work on the token ids after skipping the first 50.
encoding_sample = tiktoken_encoded[50:]

# Number of tokens in one input window.
context_size = 4
# x is the input window; y is the same window shifted right by one token,
# so y[k] is the token that follows x[k].
x = encoding_sample[:context_size]
y = encoding_sample[1:context_size + 1]

# print("x:", x)
# print("y:", y)

# Show each growing prefix of the window with the token id that follows it.
for i in range(1, context_size+1):
    context = encoding_sample[:i]
    target = encoding_sample[i]
    print(f"Context: {context} -> Target: {target}")

# Same prefix/target pairs, decoded back to text for readability.
for i in range(1, context_size+1):
    context = encoding_sample[:i]
    target = encoding_sample[i]
    print(tiktoken_tokenizer.decode(context) ," -> Target: ", tiktoken_tokenizer.decode([target]))

class GPTDataset(Dataset):
    """Sliding-window next-token dataset over a flat sequence of token ids.

    Sample ``i`` is the pair ``(tokens[i:i + max_length],
    tokens[i + 1:i + max_length + 1])``: an input window and the same
    window shifted right by one token, both returned as ``torch.long``
    tensors. Consecutive samples overlap (the window advances by one).
    """

    def __init__(self, tokens, max_length):
        """Store the token sequence and the window length.

        Args:
            tokens: Sequence of integer token ids (must support ``len`` and
                slicing).
            max_length: Number of tokens in each input/target window.
        """
        self.tokens = tokens
        self.max_length = max_length

    def __len__(self):
        """Return the number of complete (input, target) windows."""
        # Each sample needs max_length + 1 tokens. Clamp at zero so a
        # too-short sequence gives an empty dataset instead of a negative
        # length, which would make len() raise ValueError.
        return max(0, len(self.tokens) - self.max_length)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input, target)`` tensor pair.

        Args:
            idx: Integer sample index; negative values count from the end.

        Returns:
            Tuple of two ``torch.long`` tensors of length ``max_length``.

        Raises:
            IndexError: If ``idx`` is outside the dataset. This also lets
                plain iteration over the dataset terminate.
        """
        size = len(self)
        position = idx + size if idx < 0 else idx
        if not 0 <= position < size:
            raise IndexError(
                f"index {idx} is out of range for dataset of size {size}"
            )
        x = self.tokens[position:position + self.max_length]
        y = self.tokens[position + 1:position + self.max_length + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

# Sliding-window dataset over the token ids after the first 50, using
# windows of context_size tokens.
dataset = GPTDataset(tiktoken_encoded[50:], context_size)
# Two samples per batch, served in dataset order (no shuffling).
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

# Print the first batch of inputs and targets, then stop.
for batch in dataloader:
    inputs, targets = batch
    print("Inputs:", inputs)
    print("Targets:", targets)
    break  # Just show one batch



Context: [290] -> Target: 4920
Context: [290, 4920] -> Target: 2241
Context: [290, 4920, 2241] -> Target: 287
Context: [290, 4920, 2241, 287] -> Target: 257
 and  -> Target:   established
 and established  -> Target:   himself
 and established himself  -> Target:   in
 and established himself in  -> Target:   a
Inputs: tensor([[ 290, 4920, 2241,  287],
        [4920, 2241,  287,  257]])
Targets: tensor([[4920, 2241,  287,  257],
        [2241,  287,  257, 4489]])
