In [6]:
import os
from collections import Counter
from itertools import compress

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_from_disk

# Location of the dataset saved on disk. The default is the original
# dev-container path; set the DATA_SOURCE_DIR environment variable to run the
# notebook from a different checkout without editing this cell.
dataset_path = os.environ.get(
    "DATA_SOURCE_DIR",
    "/workspaces/Implementation-of-Neural-Machine-Translation-of-Rare-Words-with-Subword-Units-on-Azerbaijani/data_source",
)
ds = load_from_disk(dataset_path=dataset_path)

In [7]:
from sklearn.model_selection import train_test_split

# Azerbaijani distinguishes dotted and dotless i, which the locale-independent
# str.lower() gets wrong: "I".lower() gives "i" (should be "ı") and
# "İ".lower() gives "i" followed by a combining dot U+0307 (should be "i").
# Map the two upper-case letters explicitly before lowercasing the rest.
_AZ_UPPER_I_TO_LOWER = str.maketrans({"I": "ı", "İ": "i"})


def az_lower(text: str) -> str:
    """Lowercase ``text`` using Azerbaijani casing rules for I/ı and İ/i.

    Note: the rule is applied to every "I", so Latin-script foreign words are
    lowercased the Azerbaijani way as well (e.g. "IBM" -> "ıbm").
    """
    return text.translate(_AZ_UPPER_I_TO_LOWER).lower()


raw_data = ds["train"].to_pandas()
X = raw_data["partial_text"].map(az_lower)
y = raw_data["gold_ending"].map(az_lower)

# First hold out 10% of all rows as the test set, then take 0.1/0.9 of the
# remaining 90% as validation, i.e. another 10% of the full data (80/10/10).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1 / 0.9, random_state=42
)

# Replace the shuffled original row labels with a fresh 0..n-1 index.
# Reassignment (rather than inplace=True) keeps every statement explicit about
# which object it produces.
X_train_val = X_train_val.reset_index(drop=True)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)

y_train_val = y_train_val.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

In [8]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Build the tokenizer around a BPE model that has not been trained yet.
bpe_model = models.BPE()
tokenizer = Tokenizer(bpe_model)

# Attach the ByteLevel component for each pipeline stage; the three
# assignments are independent of one another.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

In [11]:
# Configure the BPE trainer.
#
# The tokenizer's pre-tokenizer is ByteLevel, so the trainer never sees raw
# characters: every UTF-8 byte is first mapped to one of 256 byte-level
# symbols. Seeding the alphabet with raw corpus characters therefore adds
# entries (e.g. ' ' or 'ə') that cannot occur after pre-tokenization, and does
# not guarantee that every byte symbol is in the vocabulary. Since the BPE
# model is created without an unk_token, a byte unseen during training would
# have no token to map to. ByteLevel.alphabet() supplies all 256 symbols.
# (The variable keeps its original name so later cells that use it still run.)
initial_aze_alphabet = pre_tokenizers.ByteLevel.alphabet()

trainer = trainers.BpeTrainer(
    vocab_size=20000, min_frequency=2,
    initial_alphabet=initial_aze_alphabet,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])

In [12]:
# Train on the train+validation inputs only, so the test split stays unseen.
# Passing `length` gives the trainer the total number of sequences, which it
# uses for progress tracking; it does not change the trained result.
tokenizer.train_from_iterator(X_train_val, trainer=trainer, length=len(X_train_val))






In [20]:
# Round-trip check: lowercase a sample sentence, encode it, and decode the
# resulting ids back to text. The decoded string is the cell's output.
sample_sentence = " Bürc Xəlifə hal-hazırda 111 dünyada ən hündür göydələndir "
sample_encoding = tokenizer.encode(sample_sentence.lower())
tokenizer.decode(sample_encoding.ids)

' bürc xəlifə hal-hazırda 111 dünyada ən hündür göydələndir '

In [None]:
# Persist the trained tokenizer as pretty-printed JSON in the working directory.
tokenizer_file = "byte-level-bpe.tokenizer.json"
tokenizer.save(tokenizer_file, pretty=True)