In [1]:
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping from a word, written as whitespace-separated symbols,
            to that word's frequency.

    Returns:
        A dict mapping ``(left, right)`` tuples of neighbouring symbols to
        the summed frequency of the words containing them. A pair that
        occurs several times inside one word is counted once per occurrence.
    """
    pair_counts = {}
    for entry, count in vocab.items():
        pieces = entry.split()
        # Zipping the list with its one-step shift yields every adjacent pair;
        # words with fewer than two symbols contribute nothing.
        for neighbours in zip(pieces, pieces[1:]):
            pair_counts[neighbours] = pair_counts.get(neighbours, 0) + count
    return pair_counts

def merge_vocab(pair, v_in):
    """Fuse every occurrence of a symbol pair into one symbol.

    Matching is done on whole symbols, not on raw substrings: merging
    ``('a', 'b')`` must not touch ``'xa b'``, because there the left symbol
    is ``'xa'`` rather than ``'a'``. A plain ``str.replace`` on the joined
    pair would merge across such symbol boundaries.

    Args:
        pair: Two-element sequence ``(left, right)`` of symbols to fuse.
        v_in: Mapping from a word, written as space-separated symbols, to
            its frequency.

    Returns:
        A new dict keyed by the re-segmented words. ``v_in`` is not modified.
        If two input words end up with the same key, their frequencies are
        added together.
    """
    left, right = pair
    merged = left + right
    v_out = {}
    for word, freq in v_in.items():
        symbols = word.split()
        rebuilt = []
        i = 0
        # Scan left to right and consume two symbols on a match, so
        # overlapping candidates ('a a a') are merged non-overlappingly.
        while i < len(symbols):
            if (
                i + 1 < len(symbols)
                and symbols[i] == left
                and symbols[i + 1] == right
            ):
                rebuilt.append(merged)
                i += 2
            else:
                rebuilt.append(symbols[i])
                i += 1
        w_out = ' '.join(rebuilt)
        # Accumulate rather than overwrite so no frequency mass is lost.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
    
def byte_pair_encode_bangla(text, num_merges):
    """Learn up to ``num_merges`` BPE merges on ``text`` and return the symbols.

    The text is split on whitespace. Each word is broken into its individual
    characters (one symbol per Python string element) followed by the
    end-of-word marker ``</w>``. The most frequent adjacent pair is then
    merged repeatedly.

    Args:
        text: Input string to learn merges from.
        num_merges: Maximum number of merge steps. The loop stops early once
            no adjacent pairs remain.

    Returns:
        The set of distinct symbols present in the vocabulary after merging.
    """
    vocab = {}
    for raw_word in text.split():
        spaced = ' '.join(raw_word) + ' </w>'
        vocab[spaced] = vocab.get(spaced, 0) + 1

    for _ in range(num_merges):
        pair_counts = get_stats(vocab)
        if not pair_counts:
            break
        # On ties, max() keeps the first pair encountered in dict order.
        best = max(pair_counts, key=pair_counts.get)
        vocab = merge_vocab(best, vocab)

    return {symbol for entry in vocab for symbol in entry.split()}

# Demo: collect every symbol seen after 0, 1, ..., 9 merge steps, keeping
# only the first appearance of each symbol in the order it was encountered.
bangla_text = "আমি ভাত খাই। সে বাজারে যায়।"
tokens = []
for num_merges in range(10):
    for symbol in byte_pair_encode_bangla(bangla_text, num_merges):
        if symbol in tokens:
            continue
        tokens.append(symbol)
print(tokens)

['ি', 'য', 'র', '</w>', 'ব', 'ে', 'খ', 'ম', 'স', 'া', 'ত', 'জ', 'য়', 'আ', '।', 'ভ', 'ই', '।</w>', 'ে</w>', 'আম', 'আমি', 'আমি</w>', 'ভা', 'ভাত', 'ভাত</w>', 'খা']


In [1]:
from bnlp import SentencepieceTokenizer

# Build the tokenizer with no constructor arguments (no model path is given).
bsp = SentencepieceTokenizer()
input_text = "আমি ভাত খাই। সে বাজারে যায়।"
# Split the sentence into subword pieces and show them.
# NOTE: this rebinds `tokens`, which the BPE demo cell also assigns.
tokens = bsp.tokenize(input_text)
print(tokens)
# Convert the same text into a list of ids (presumably vocabulary indices of
# the pieces above — confirm against the bnlp documentation).
text2id = bsp.text2id(input_text)
print(text2id)
# Convert the ids back to text; expected to reproduce `input_text`.
id2text = bsp.id2text(text2id)
print(id2text)

['▁আমি', '▁ভাত', '▁খাই', '।', '▁সে', '▁বাজারে', '▁যায়', '।']
[914, 5265, 24224, 3, 124, 2244, 41, 3]
আমি ভাত খাই। সে বাজারে যায়।


In [10]:
from transformers import GPT2Tokenizer,AutoTokenizer, AutoModel

def is_tokenizer_compatible_with_gpt2(tokenizer):
    """Return True when ``tokenizer`` is one of the GPT-2 tokenizer classes.

    Both the slow (``GPT2Tokenizer``) and the fast (``GPT2TokenizerFast``)
    implementations are accepted. ``AutoTokenizer.from_pretrained`` returns
    the fast class by default, and that class does not inherit from
    ``GPT2Tokenizer``, so checking the slow class alone would reject a
    genuine GPT-2 tokenizer.

    Args:
        tokenizer: Any object, typically a Hugging Face tokenizer instance.

    Returns:
        bool: True if ``tokenizer`` is an instance of either GPT-2 tokenizer
        class, False otherwise.
    """
    # Imported here so the function carries its own dependency on the fast
    # tokenizer class.
    from transformers import GPT2TokenizerFast

    return isinstance(tokenizer, (GPT2Tokenizer, GPT2TokenizerFast))

# Load the tokenizer published with the "sagorsarker/bangla-bert-base"
# checkpoint. Despite the variable name, this is not a GPT-2 checkpoint;
# AutoTokenizer picks the tokenizer class from the checkpoint's own files.
gpt2_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

# Report whether the loaded tokenizer passes the GPT-2 type check
if is_tokenizer_compatible_with_gpt2(gpt2_tokenizer):
    print("The tokenizer is compatible with GPT2.")
else:
    print("The tokenizer is NOT compatible with GPT2.")


Downloading (…)lve/main/config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

The tokenizer is NOT compatible with GPT2.
