In [4]:
!pip install datasets --quiet

In [5]:
# Step 1: Read the dataset
from datasets import load_dataset

# NOTE: authentication uses the Hugging Face token (HF_Token) stored in
# Google Colab's Secrets; the token-handling code is intentionally not shown here.

# Fetch the "train" split of the Hindi Wikipedia configuration ("20231101.hi")
# from the "wikimedia/wikipedia" dataset.
dataset = load_dataset(
    path="wikimedia/wikipedia",
    name="20231101.hi",
    split="train",
)
# Report how many records were loaded, before any subsetting or deduplication.
print("Original number of samples:", len(dataset))

README.md:   0%|          | 0.00/131k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/163093 [00:00<?, ? examples/s]

Original number of samples: 163093


In [6]:
# For demonstration, we select a small subset of the dataset.
# Set SUBSET_SIZE to None to process the full dataset.
SUBSET_SIZE = 1000
if SUBSET_SIZE is not None:
    # Clamp to the dataset length so select() never receives out-of-range
    # indices when the dataset holds fewer than SUBSET_SIZE rows.
    dataset = dataset.select(range(min(SUBSET_SIZE, len(dataset))))


In [8]:
# Step 2: Run Data Deduplication
# deduplicate by removing duplicate text entries
def deduplicate_examples(examples):
    """Return the distinct items of ``examples`` as a list.

    Only the first occurrence of each item is kept, and the surviving items
    appear in the same relative order as in ``examples``. Items must be
    hashable.
    """
    # dict keys are unique and remember insertion order, so building a dict
    # keyed by the items keeps exactly one copy of each, in first-seen order.
    return list(dict.fromkeys(examples))

In [9]:
# Extract texts and deduplicate
# Pull the "text" field out of the dataset; its values are the deduplication input.
texts = dataset["text"]
# Drop exact repeats, keeping the first occurrence of each value in order.
unique_texts = deduplicate_examples(texts)
# Compare this count with the number of selected samples to see how many were dropped.
print("Number of unique samples:", len(unique_texts))

Number of unique samples: 1000


In [10]:
# Create a new dataset from deduplicated texts
from datasets import Dataset
# Wrap the deduplicated list in a Dataset with a single "text" field.
# NOTE(review): `dedup_dataset` is not referenced by any later cell visible here;
# the tokenizer training below reads `unique_texts` directly.
dedup_dataset = Dataset.from_dict({"text": unique_texts})

In [11]:
# Step 3: Train your own BPE tokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

In [12]:
# Initialize a BPE tokenizer with an unknown token
# "[UNK]" is passed to the BPE model as its unknown-token string.
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# Use the library's Whitespace pre-tokenizer (you can customize this).
# NOTE(review): despite its name it appears to split punctuation off as well
# (the sample encoding in the last cell shows '।' as a separate token) — confirm
# against the tokenizers documentation.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [13]:
# Configure BPE training: the target vocabulary size and the special tokens
# to register (the "[UNK]" entry is the same string given to the BPE model).
trainer = trainers.BpeTrainer(
    vocab_size=5000,  # adjust as needed
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

In [14]:
# Train the tokenizer using an iterator over deduplicated texts
def batch_iterator(batch_size=100, texts=None):
    """Yield consecutive slices of at most ``batch_size`` texts.

    Args:
        batch_size: Maximum number of texts per yielded batch; must be > 0.
        texts: Sequence to slice. Defaults to the notebook-level
            ``unique_texts`` so existing ``batch_iterator()`` calls behave
            exactly as before.

    Yields:
        ``texts[i:i + batch_size]`` for successive offsets ``i``; only the
        final batch may be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if batch_size <= 0:
        # A negative step would make range() empty, which would silently
        # train the tokenizer on no data at all.
        raise ValueError("batch_size must be a positive integer")
    if texts is None:
        # Fall back to the deduplicated texts built earlier in the notebook.
        texts = unique_texts
    for start in range(0, len(texts), batch_size):
        yield texts[start:start + batch_size]

# Train on the batches produced by batch_iterator() (default batch size: 100),
# using the `trainer` object defined in the earlier cell.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
print("Tokenizer training complete.")

Tokenizer training complete.


In [15]:
# Save the trained tokenizer to a file (optional)
# The path is relative, so the file is written to the current working directory.
tokenizer.save("bpe_tokenizer.json")


In [16]:
# Test the trained tokenizer
# Encode one sample Hindi sentence and print the resulting token strings.
output = tokenizer.encode("यह एक उदाहरण वाक्य है।")
print("Encoded tokens:", output.tokens)

Encoded tokens: ['यह', 'एक', 'उदाहरण', 'वा', 'क्य', 'है', '।']
