In [3]:
# ... [same functions as before: get_stats and merge_vocab]

def get_stats(vocab):
    """Tally the weighted frequency of every adjacent symbol pair.

    Args:
        vocab: Dict mapping a whitespace-separated symbol sequence
            (e.g. ``'l o w </w>'``) to the frequency of that sequence.

    Returns:
        Dict mapping ``(left_symbol, right_symbol)`` tuples to the sum of the
        frequencies of all entries in which the two symbols appear side by side.
        Entries with fewer than two symbols contribute nothing.
    """
    pair_counts = {}
    for entry, count in vocab.items():
        pieces = entry.split()
        # Pair each symbol with its right-hand neighbour.
        for neighbours in zip(pieces, pieces[1:]):
            if neighbours not in pair_counts:
                pair_counts[neighbours] = count
            else:
                pair_counts[neighbours] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """Fuse every occurrence of a symbol pair in the vocabulary keys.

    Args:
        pair: Two-element tuple of adjacent symbols to merge, e.g. ``('a', 'b')``.
        v_in: Dict mapping whitespace-separated symbol sequences to frequencies.

    Returns:
        A new dict whose keys have each whole-symbol occurrence of ``pair``
        rewritten as the single concatenated symbol. ``v_in`` is not modified.
        If two input keys become identical after merging, their frequencies
        are summed rather than one overwriting the other.
    """
    import re  # local import so the function does not rely on another cell

    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # Anchor on symbol boundaries: a plain str.replace would also match inside
    # longer symbols (pair ('a', 'b') would turn 'xa b' into 'xab').
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # Callable replacement keeps backslashes in symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
def byte_pair_encode_bangla(text, num_merges):
    """Run byte-pair-encoding merges over ``text`` and return the symbol set.

    Each whitespace-delimited word is split into its individual code points
    (so Bengali vowel signs become separate starting symbols) and terminated
    with the end-of-word marker ``</w>``. The most frequent adjacent pair is
    then merged repeatedly.

    Args:
        text: Input string; words are separated by whitespace.
        num_merges: Maximum number of merge rounds. Merging stops earlier if
            no adjacent pairs remain.

    Returns:
        Set of the distinct symbols left in the vocabulary after merging.
    """
    # Seed vocabulary: space-separated code points plus the end-of-word marker.
    vocab = {}
    for raw_word in text.split():
        entry = ' '.join(raw_word) + ' </w>'
        if entry not in vocab:
            vocab[entry] = 1
        else:
            vocab[entry] += 1

    for _ in range(num_merges):
        pair_freqs = get_stats(vocab)
        if not pair_freqs:
            # Every word has collapsed to a single symbol; nothing to merge.
            break
        top_pair = max(pair_freqs, key=pair_freqs.get)
        vocab = merge_vocab(top_pair, vocab)

    # Collect the surviving symbols from every vocabulary entry.
    return {symbol for entry in vocab for symbol in entry.split()}

# Sample input for the BPE demo.
# NOTE(review): intended to mean "I love the Bengali language", but the literal
# below appears to be missing several vowel signs compared with the usual
# spelling "আমি বাংলা ভাষা ভালোবাসি" — confirm whether that is intentional.
bangla_text = "মি বল ভষ ভলোবসি"
# Run at most 10 merge rounds and print the resulting set of symbols.
tokens = byte_pair_encode_bangla(bangla_text, 10)
print(tokens)

{'ভষ</w>', 'ি</w>', 'বল</w>', 'ভলোবস', 'মি</w>'}


In [1]:
from bnlp import SentencepieceTokenizer

# Build the BNLP SentencePiece tokenizer with its default constructor arguments
# (presumably this loads a bundled pretrained Bengali model — TODO confirm).
bsp = SentencepieceTokenizer()
input_text = "আমি ভাত খাই। সে বাজারে যায়।"
# Split the text into subword pieces.
tokens = bsp.tokenize(input_text)
print(tokens)
# Encode the same text as a list of integer ids.
text2id = bsp.text2id(input_text)
print(text2id)
# Decode the ids back to text to check the round trip.
id2text = bsp.id2text(text2id)
print(id2text)

['▁আমি', '▁ভাত', '▁খাই', '।', '▁সে', '▁বাজারে', '▁যায়', '।']
[914, 5265, 24224, 3, 124, 2244, 41, 3]
আমি ভাত খাই। সে বাজারে যায়।


In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.optim as optim
import torch.nn.functional as F

# Load the pre-trained tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the text as PyTorch tensors.
input_text = "Hello, how are you?"
tokens = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")

# Load the pre-trained encoder with a sequence-classification head.
# NOTE: 'bert-base-uncased' ships no classifier weights, so the head is randomly
# initialised; the probabilities printed below are not meaningful until the
# model has been fine-tuned on a downstream task.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.eval()  # make inference mode explicit (disables dropout)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(**tokens)

# Logits (unnormalised classification scores), one row per input sequence.
logits = outputs.logits

# Convert logits to probabilities across the class dimension.
probabilities = F.softmax(logits, dim=1)

# Print the classification results.
print(probabilities)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[0.2729, 0.7271]], grad_fn=<SoftmaxBackward0>)
