In [None]:
%pip install transformers torch tiktoken datasets matplotlib -Uq

![LLMTokenizer](Data/LLMTokenizer.png)

In [4]:
text1 = "the cat chased the dog"
text2 = "the dog chased the cat"

text = "the capital of French is" 

In [None]:
# basic tokenization
def tokenize(text):
  return text.split()

tokenize(text)

In [None]:
vocab = {
    "the":0,
    "cat":1,
    "dog":2,
    "chased":3,
    "capital":4,
    "of":5,
    "french":6,
    "is":7,
    "<unk>":8
}

def tokenize2(text):
    parts = text.split()
    ids = []
    for part in parts:
        if part in vocab:
            value = vocab[part]
        else:
            value = vocab["<unk>"]
        ids.append(value)
    return ids

token_ids1 = tokenize2(text1)
token_ids1

In [None]:
reverse_vocab = {id: part for part, id in vocab.items()}
reverse_vocab

In [None]:
def detokenize(ids):
    text = ""
    for id in ids:
        part = reverse_vocab[id]
        text += part+" "
    text = text.strip()
    return text

detokenize(token_ids1)

In [None]:
import tiktoken

enc = tiktoken.get_encoding("gpt2")

gpt2_ids = enc.encode(text1)
gpt2_ids

In [None]:
enc.decode(gpt2_ids), enc.decode(token_ids1)

In [None]:
enc.decode([25])

In [None]:
enc = tiktoken.get_encoding("o200k_base")

enc.n_vocab

In [None]:
o200_ids = enc.encode(text1)
o200_ids

In [None]:
# Load model directly
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("google/gemma-3-27b-it")

In [None]:
gemma_ids = processor.tokenizer.encode(text1)
gemma_ids

In [None]:
processor.tokenizer.vocab_size

In [None]:
processor.tokenizer.decode(gemma_ids), processor.tokenizer.decode(gemma_ids)[5:]

In [None]:
import json

with open("tokenizer_gemma.json", "w", encoding="utf-8") as f:
  json.dump(processor.tokenizer.get_vocab(), f, ensure_ascii=False)

In [1]:
from tokenizer import Tokenizer
tokenizer = Tokenizer("tokenizer.json")

tokenizer.encode("states"), tokenizer.decode([4,58])

([4, 58], 'states')

In [2]:
with open("text.txt","r") as f:
    text = f.read()

text

'the capital of the united states is not london. the capital of france is paris, and berlin is the capital of germany. rome is in italy, \n\nmadrid is in spain, and lisbon is in portugal. the capital of the united kingdom is not paris, and the capital of the united states is not berlin. \nalthough these places are often mentioned together, although these capitals are often mentioned together, although these are often mentioned together, \neach country has its own capital, and each country has its own city, and each capital has its own identity, and each capital has its own history. washington \nis the capital of the united states, and london is the capital of the united kingdom. paris is known for art and fashion, and berlin is known for art and \nhistory, and rome is known for art and history, and madrid is known for culture and history, and lisbon is known for culture and art. rome is rich with culture, \nrome is rich with history, rome is rich with art, and madrid is rich with art a

In [3]:
tokens = tokenizer.encode(text)

tokens

[0,
 61,
 1,
 61,
 2,
 61,
 0,
 61,
 3,
 61,
 4,
 58,
 61,
 5,
 61,
 6,
 61,
 7,
 59,
 61,
 0,
 61,
 1,
 61,
 2,
 61,
 8,
 61,
 5,
 61,
 9,
 60,
 61,
 10,
 61,
 11,
 61,
 5,
 61,
 0,
 61,
 1,
 61,
 2,
 61,
 12,
 59,
 61,
 13,
 61,
 5,
 61,
 14,
 61,
 15,
 60,
 61,
 16,
 61,
 5,
 61,
 14,
 61,
 17,
 60,
 61,
 10,
 61,
 18,
 61,
 5,
 61,
 14,
 61,
 19,
 59,
 61,
 0,
 61,
 1,
 61,
 2,
 61,
 0,
 61,
 3,
 61,
 20,
 61,
 5,
 61,
 6,
 61,
 9,
 60,
 61,
 10,
 61,
 0,
 61,
 1,
 61,
 2,
 61,
 0,
 61,
 3,
 61,
 4,
 58,
 61,
 5,
 61,
 6,
 61,
 11,
 59,
 61,
 22,
 61,
 23,
 61,
 24,
 58,
 61,
 25,
 61,
 26,
 61,
 27,
 57,
 61,
 28,
 60,
 61,
 22,
 61,
 23,
 61,
 1,
 58,
 61,
 25,
 61,
 26,
 61,
 27,
 57,
 61,
 28,
 60,
 61,
 22,
 61,
 23,
 61,
 25,
 61,
 26,
 61,
 27,
 57,
 61,
 28,
 60,
 61,
 29,
 61,
 30,
 61,
 31,
 61,
 32,
 61,
 33,
 61,
 1,
 60,
 61,
 10,
 61,
 29,
 61,
 30,
 61,
 31,
 61,
 32,
 61,
 33,
 61,
 37,
 60,
 61,
 10,
 61,
 29,
 61,
 1,
 61,
 31,
 61,
 32,
 61,
 33,
 61,
 34,
 60,
 

In [None]:
%pip install sentencepiece -qU

In [2]:
import sentencepiece as spm

spm.SentencePieceTrainer.Train(
  input="text.txt",
  model_prefix="spm_tokenizer",
  vocab_size=64,
  model_type="bpe",
)


In [5]:
spm_tokenizer = spm.SentencePieceProcessor(model_file="spm_tokenizer.model")

spm_ids = spm_tokenizer.Encode(text1)
spm_tokens = spm_tokenizer.Encode(text1, out_type=str)
spm_ids, spm_tokens

([9, 7, 41, 40, 7, 48, 41, 46, 30, 9, 39, 51, 45, 60],
 ['▁the',
  '▁c',
  'a',
  't',
  '▁c',
  'h',
  'a',
  's',
  'ed',
  '▁the',
  '▁',
  'd',
  'o',
  'g'])

In [6]:
%pip install tokenizer -qU

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from tokenizers import Tokenizer #tokenizer kütüphanesi
from tokenizers.models import BPE #algoritmanın eklenemesi (sözlük oluşturma için)
from tokenizers.trainers import BpeTrainer #eğtim eklenmesi
from tokenizers.pre_tokenizers import Whitespace #beyaz alanın eklenmesi

In [8]:
#toekenizer oluşturulması
hf_tokenizer = Tokenizer(BPE())

In [None]:
#ön bir kelme varsa bunların eklenmesi ile ilgili aksiyonun alınması 
hf_tokenizer.pre_tokenizer = Whitespace()

In [None]:
trainer = BpeTrainer(vocab_size=64, special_tokens=["<unk>"]) #burada tranier için bir kaç kelime ve bilinmeyen kelime tokeni tanımlandı

hf_tokenizer.train(["text.txt"], trainer) #bu kısımda traniner içine istenilir ise birden fazla dosya liste ile verilebilir 

In [None]:
hf_tokenizer.get_vocab_size(), hf_tokenizer.encode(text1).ids #kaç adet kelime var, var olan kelimelere göre text1 içerisndeki idler 

(64, [28, 5, 3, 21, 5, 10, 49, 44, 28, 6, 16, 9])

In [12]:
hf_tokenizer.save("hf_tokenizer.json") #kayıt etme 

In [13]:
from transformers import PreTrainedTokenizerFast

fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="hf_tokenizer.json")

fast_tokenizer.encode(text1)

[28, 5, 3, 21, 5, 10, 49, 44, 28, 6, 16, 9]