In [1]:
!pip install tokenizers sentencepiece

Collecting tokenizers
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl (2.5 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp39-cp39-win_amd64.whl (991 kB)
Collecting huggingface-hub<1.0,>=0.16.4
  Downloading huggingface_hub-0.33.2-py3-none-any.whl (515 kB)
Collecting fsspec>=2023.5.0
  Downloading fsspec-2025.5.1-py3-none-any.whl (199 kB)
Installing collected packages: fsspec, huggingface-hub, tokenizers, sentencepiece
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2021.10.1
    Uninstalling fsspec-2021.10.1:
      Successfully uninstalled fsspec-2021.10.1
Successfully installed fsspec-2025.5.1 huggingface-hub-0.33.2 sentencepiece-0.2.0 tokenizers-0.21.2


In [3]:
import time
import psutil
import os

In [None]:
import os
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
import sentencepiece as spm

def monitor_resource_usage(func):
    """Decorator that reports wall-clock time and RSS change for each call of ``func``.

    Before the call it prints a "Starting" line with ``func.__name__``; after the
    call returns it prints the elapsed seconds and the difference in the current
    process's resident set size (end minus start, divided by 1e6).

    The memory figure is a before/after delta of the whole process, not a peak:
    it can be small or negative when memory is released during the call.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same call signature that returns whatever ``func``
        returns. If ``func`` raises, the exception propagates and no
        "Completed" line is printed.
    """
    import functools
    import os
    import time

    import psutil

    @functools.wraps(func)  # keep __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        process = psutil.Process(os.getpid())
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments during long training runs.
        start_time = time.perf_counter()
        start_mem = process.memory_info().rss

        print(f"🚀 Starting: {func.__name__}")
        result = func(*args, **kwargs)

        elapsed = time.perf_counter() - start_time
        end_mem = process.memory_info().rss
        print(f"✅ Completed in {elapsed:.2f}s, RAM used: {(end_mem - start_mem)/1e6:.2f} MB\n")
        return result
    return wrapper

# Training functions
@monitor_resource_usage
def train_hf_bpe(file_path, folder_path, vocab_size):
    """Train a Hugging Face BPE tokenizer on one text file and save it as ``hf_bpe.json``.

    Args:
        file_path: Path of the training corpus (passed to ``tokenizer.train`` as a
            one-element list).
        folder_path: Existing directory in which ``hf_bpe.json`` is written.
        vocab_size: Target vocabulary size given to ``BpeTrainer``.

    Returns:
        None. The trained tokenizer is only persisted to disk.
    """
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

    # The BPE model takes its unknown token via `unk_token`; listing "[UNK]" as a
    # special token alone does not register it as the fallback. Passing it here
    # makes input that the learned vocabulary cannot cover map to "[UNK]",
    # consistent with the WordPiece tokenizer trained in this notebook.
    # NOTE(review): tokenizers saved before this change must be retrained for
    # the fallback to take effect.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
    tokenizer.train([file_path], trainer)
    tokenizer.save(os.path.join(folder_path, "hf_bpe.json"))

@monitor_resource_usage
def train_hf_wordpiece(file_path, folder_path, vocab_size):
    """Train a Hugging Face WordPiece tokenizer and write it to ``hf_wordpiece.json``.

    Args:
        file_path: Path of the training corpus (wrapped in a one-element list
            for ``Tokenizer.train``).
        folder_path: Directory that receives ``hf_wordpiece.json``.
        vocab_size: Target vocabulary size handed to ``WordPieceTrainer``.

    Returns:
        None. The result is the JSON file on disk.
    """
    reserved = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    output_file = os.path.join(folder_path, "hf_wordpiece.json")

    wp_tokenizer = Tokenizer(models.WordPiece())
    # Split on whitespace/punctuation before the WordPiece model sees the text.
    wp_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    wp_trainer = trainers.WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=reserved,
    )
    wp_tokenizer.train([file_path], wp_trainer)
    wp_tokenizer.save(output_file)

@monitor_resource_usage
def train_sp_bpe(file_path, folder_path, vocab_size):
    """Train a SentencePiece model of type ``bpe`` with prefix ``<folder_path>/sp_bpe``.

    Args:
        file_path: Training corpus passed as SentencePiece ``input``.
        folder_path: Directory used to build the ``model_prefix``.
        vocab_size: Target vocabulary size.

    Returns:
        None. SentencePiece writes its artifacts under the model prefix
        (the loading cell of this notebook reads ``sp_bpe.model``).
    """
    train_options = {
        "input": file_path,
        "model_prefix": os.path.join(folder_path, "sp_bpe"),
        "vocab_size": vocab_size,
        "model_type": 'bpe',
        # 1.0 asks the trainer to cover every character seen in the corpus.
        "character_coverage": 1.0,
    }
    spm.SentencePieceTrainer.train(**train_options)

@monitor_resource_usage
def train_sp_unigram(file_path, folder_path, vocab_size):
    """Train a SentencePiece model of type ``unigram`` with prefix ``<folder_path>/sp_unigram``.

    Args:
        file_path: Training corpus passed as SentencePiece ``input``.
        folder_path: Directory used to build the ``model_prefix``.
        vocab_size: Target vocabulary size.

    Returns:
        None. SentencePiece writes its artifacts under the model prefix
        (the loading cell of this notebook reads ``sp_unigram.model``).
    """
    train_options = {
        "input": file_path,
        "model_prefix": os.path.join(folder_path, "sp_unigram"),
        "vocab_size": vocab_size,
        "model_type": 'unigram',
        # 1.0 asks the trainer to cover every character seen in the corpus.
        "character_coverage": 1.0,
    }
    spm.SentencePieceTrainer.train(**train_options)

# Master training function
def train_all_tokenizers(file_path, folder_path, vocab_size, tokenizer_types=("hf_wordpiece",)):
    """Create the output folder and run the selected tokenizer trainers in order.

    Selection is made through ``tokenizer_types`` instead of commenting calls in
    and out. The default reproduces the selection that was enabled in this cell
    (WordPiece only); pass all four names to retrain everything.

    Args:
        file_path: Training corpus forwarded to every trainer.
        folder_path: Output directory; created if it does not exist.
        vocab_size: Target vocabulary size forwarded to every trainer.
        tokenizer_types: Iterable of trainer names (or a single name) drawn from
            "hf_bpe", "hf_wordpiece", "sp_bpe", "sp_unigram".

    Raises:
        ValueError: If ``tokenizer_types`` contains a name that is not one of
            the four known trainers.
    """
    available = {
        "hf_bpe": train_hf_bpe,
        "hf_wordpiece": train_hf_wordpiece,
        "sp_bpe": train_sp_bpe,
        "sp_unigram": train_sp_unigram,
    }

    # A bare string would otherwise be iterated character by character.
    if isinstance(tokenizer_types, str):
        tokenizer_types = (tokenizer_types,)
    selected = list(tokenizer_types)

    # Validate up front so a typo does not surface only after a long training run.
    unknown = [name for name in selected if name not in available]
    if unknown:
        raise ValueError(
            f"Unknown tokenizer type(s): {unknown}; expected a subset of {list(available)}"
        )

    os.makedirs(folder_path, exist_ok=True)
    for name in selected:
        available[name](file_path, folder_path, vocab_size)

# Train for every corpus size enabled in `settings` (currently only "small";
# "medium" and "large" are commented out).
# NOTE(review): base_input lives under "Users\User" while base_output lives under
# "Users\Aish\OneDrive" — confirm both locations are intended.
base_input = r"C:\Users\User\Desktop\tokenizer\balanced_normalized\final_balanced_"
base_output = r"C:\Users\Aish\OneDrive\Desktop\tokenizer\vocab_final"

# corpus size label -> target vocabulary size
settings = dict(
    small=15000,
    # medium=30000,
    # large=50000,
)

for size, vocab_size in settings.items():
    print(f"\n📦 Training tokenizers for: {size.upper()} (Vocab size: {vocab_size})")
    # The size label is appended directly (no path separator), so the corpus is
    # "...final_balanced_<size>.txt" and the output folder is "...vocab_final<size>".
    file_path = f"{base_input}{size}.txt"
    folder_path = f"{base_output}{size}"
    train_all_tokenizers(file_path, folder_path, vocab_size)



📦 Training tokenizers for: SMALL (Vocab size: 15000)
🚀 Starting: train_hf_bpe
✅ Completed in 23.17s, RAM used: 63.04 MB

🚀 Starting: train_hf_wordpiece
✅ Completed in 19.76s, RAM used: 13.04 MB

🚀 Starting: train_sp_bpe
✅ Completed in 94.74s, RAM used: 8.25 MB

🚀 Starting: train_sp_unigram
✅ Completed in 166.59s, RAM used: -3.27 MB


📦 Training tokenizers for: MEDIUM (Vocab size: 30000)
🚀 Starting: train_hf_bpe
✅ Completed in 34.73s, RAM used: 12.43 MB

🚀 Starting: train_hf_wordpiece
✅ Completed in 33.75s, RAM used: 3.64 MB

🚀 Starting: train_sp_bpe
✅ Completed in 344.54s, RAM used: 3.60 MB

🚀 Starting: train_sp_unigram
✅ Completed in 160.53s, RAM used: -4.06 MB


📦 Training tokenizers for: LARGE (Vocab size: 50000)
🚀 Starting: train_hf_bpe
✅ Completed in 43.07s, RAM used: 4.40 MB

🚀 Starting: train_hf_wordpiece
✅ Completed in 43.72s, RAM used: 4.79 MB

🚀 Starting: train_sp_bpe
✅ Completed in 820.05s, RAM used: -20.76 MB

🚀 Starting: train_sp_unigram
✅ Completed in 158.73s, RAM used: 

In [6]:
import json

# Location of the multilingual sentence file used for evaluation.
json_path = r"C:\Users\Aish\OneDrive\Desktop\tokenizer\final_multilang_sentences.json"

# Read the file as UTF-8 text and parse it; later cells iterate the result
# with .items() as language -> sentences.
with open(json_path, mode='r', encoding='utf-8') as f:
    language_data = json.loads(f.read())


In [17]:
import os
from tokenizers import Tokenizer
import sentencepiece as spm

# Folder prefix of the trained models and the size/type grid to load.
base_path = r"C:\Users\Aish\OneDrive\Desktop\tokenizer\vocab_final"
sizes = {"small": 15000, "medium": 30000, "large": 50000}
types = ["hf_bpe", "hf_wordpiece", "sp_bpe", "sp_unigram"]

# "<type>_<size>" -> loaded tokenizer object
tokenizers = {}

for size in sizes:
    # The size label is appended without a separator (e.g. "...vocab_finalsmall"),
    # matching how the training cell names its output folders.
    folder = f"{base_path}{size}"
    print(folder)
    for ttype in types:
        if "hf" not in ttype:
            # SentencePiece models are stored as "<type>.model"
            tokenizer = spm.SentencePieceProcessor(
                model_file=os.path.join(folder, f"{ttype}.model")
            )
        else:
            # Hugging Face tokenizers are stored as "<type>.json"
            tokenizer = Tokenizer.from_file(os.path.join(folder, f"{ttype}.json"))
        name = f"{ttype}_{size}"
        tokenizers[name] = tokenizer


C:\Users\Aish\OneDrive\Desktop\tokenizer\vocab_finalsmall
C:\Users\Aish\OneDrive\Desktop\tokenizer\vocab_finalmedium
C:\Users\Aish\OneDrive\Desktop\tokenizer\vocab_finallarge


In [19]:
# Sanity check: 3 sizes x 4 tokenizer types should give 12 loaded tokenizers.
len(tokenizers)

12

In [20]:
def _count_tokens(sequence, tokenizer):
    """Return how many tokens ``tokenizer.encode(sequence)`` produces.

    If the encode result exposes a ``tokens`` attribute, the length of that
    attribute is used; otherwise the length of the result itself is used.
    Objects without an ``encode`` method count as producing zero tokens.
    """
    if not hasattr(tokenizer, "encode"):
        return 0
    encoded = tokenizer.encode(sequence)
    if hasattr(encoded, "tokens"):
        encoded = encoded.tokens
    return len(encoded)


def compute_nsl(sequence, tokenizer):
    """Return tokens per character for ``sequence``.

    Args:
        sequence: Text to encode; its length is measured with ``len``.
        tokenizer: Object with an ``encode`` method (see ``_count_tokens``).

    Returns:
        ``token_count / len(sequence)``, or 0.0 for an empty sequence.
    """
    num_tokens = _count_tokens(sequence, tokenizer)
    num_chars = len(sequence)
    return num_tokens / num_chars if num_chars > 0 else 0.0


def compute_fertility(sequence, tokenizer):
    """Return tokens per whitespace-separated word for ``sequence``.

    Args:
        sequence: Text to encode; words are obtained with ``str.split()``.
        tokenizer: Object with an ``encode`` method (see ``_count_tokens``).

    Returns:
        ``token_count / word_count``, or 0.0 when the sequence has no words
        (empty or whitespace only).
    """
    num_words = len(sequence.split())
    num_tokens = _count_tokens(sequence, tokenizer)
    return num_tokens / num_words if num_words > 0 else 0.0


In [21]:
import pandas as pd

# One summary record per (language, tokenizer) pair.
results = []

for lang, sentences in language_data.items():
    for tok_name, tok_obj in tokenizers.items():
        nsl_scores, fert_scores = [], []

        for sentence in sentences:
            try:
                nsl = compute_nsl(sentence, tok_obj)
                fert = compute_fertility(sentence, tok_obj)
            except Exception as e:
                # A failing sentence is reported and left out of both averages.
                print(f" Error with {lang}, {tok_name}: {e}")
            else:
                nsl_scores.append(nsl)
                fert_scores.append(fert)

        # Skip pairs for which no sentence could be scored.
        if not (nsl_scores and fert_scores):
            continue

        mean_nsl = sum(nsl_scores) / len(nsl_scores)
        mean_fert = sum(fert_scores) / len(fert_scores)
        results.append({
            "Language": lang,
            "Tokenizer": tok_name,
            # The last "_" segment of the tokenizer name is the size label
            # (e.g. "small"), not a numeric vocabulary size.
            "Vocab Size": tok_name.split("_")[-1],
            "NSL": round(mean_nsl, 4),
            "Fertility": round(mean_fert, 4)
        })


In [22]:
# Build the summary table ordered by language, then tokenizer, with a fresh 0..n-1 index.
df = (
    pd.DataFrame(results)
    .sort_values(["Language", "Tokenizer"])
    .reset_index(drop=True)
)

# Preview the first rows
df.head()


Unnamed: 0,Language,Tokenizer,Vocab Size,NSL,Fertility
0,Arabic,hf_bpe_large,large,0.3316,1.9056
1,Arabic,hf_bpe_medium,medium,0.3726,2.1427
2,Arabic,hf_bpe_small,small,0.4908,2.8232
3,Arabic,hf_wordpiece_large,large,0.3567,2.0482
4,Arabic,hf_wordpiece_medium,medium,0.4485,2.5773


In [24]:
# Write the summary table to the current working directory, without the row index.
report_csv = "tokenizer_nsl_fertility_report.csv"
df.to_csv(report_csv, index=False)
print("✅ File saved: " + report_csv)

# Print the absolute path of the report (the file itself is not opened).
import os
print("📁 File location: " + os.path.abspath(report_csv))


✅ File saved: tokenizer_nsl_fertility_report.csv
📁 File location: C:\Users\Aish\OneDrive\Desktop\tokenizer\tokenizer_nsl_fertility_report.csv
