# Importing Libraries

In [13]:

import os
from clean import (
    cleanup_bnc_spoken,
    cleanup_childes,
    cleanup_gutenberg,
    cleanup_open_subtitles,
    cleanup_simple_wiki,
    cleanup_switchboard
)

from tokenizers import (
    Tokenizer , decoders , models , pre_tokenizers , trainers , processors
)

from tokenizers.normalizers import NFKC

# Checking Datasets

In [5]:
# Corpus categories; each one corresponds to a "<category>.<split extension>" file per split.
categories = [
    'bnc_spoken',
    'childes',
    'gutenberg',
    'open_subtitles',
    'simple_wiki',
    'switchboard',
]

In [6]:
# Running word total; each word-counting cell resets it to 0 before accumulating.
words = 0 


In [7]:
# Count whitespace-delimited words in every raw train_10M file and print the
# per-category counts followed by the overall total (in millions).
words = 0
for cat in categories:
    data_path = os.path.join('datasets/train_10M', cat + ".train")

    with open(data_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split once and reuse the count; splitting the whole file a second time
    # just for the print would double the work per file.
    n_words = len(text.split())
    words += n_words
    print(f'{cat}: {n_words:,} words')
print(f'Total: {words/1e6:.2f}M words')
    

bnc_spoken: 932,497 words
childes: 2,839,591 words
gutenberg: 2,539,489 words
open_subtitles: 2,041,868 words
simple_wiki: 1,453,539 words
switchboard: 146,789 words
Total: 9.95M words


In [8]:
# Count whitespace-delimited words in every raw train_100M file and print the
# per-category counts followed by the overall total (in millions).
words = 0
for cat in categories:
    data_path = os.path.join('datasets/train_100M', cat + ".train")

    with open(data_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split once and reuse the count; these are the largest files, so splitting
    # a second time just for the print would be especially wasteful.
    n_words = len(text.split())
    words += n_words
    print(f'{cat}: {n_words:,} words')
print(f'Total: {words/1e6:.2f}M words')
    

bnc_spoken: 7,760,721 words
childes: 28,903,287 words
gutenberg: 26,371,234 words
open_subtitles: 19,963,117 words
simple_wiki: 14,674,311 words
switchboard: 1,342,029 words
Total: 99.01M words


In [9]:
# Count whitespace-delimited words in every raw dev file and print the
# per-category counts followed by the overall total (in millions).
words = 0
for cat in categories:
    data_path = os.path.join('datasets/dev', cat + ".dev")

    with open(data_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split once and reuse the count; splitting the whole file a second time
    # just for the print would double the work per file.
    n_words = len(text.split())
    words += n_words
    print(f'{cat}: {n_words:,} words')
print(f'Total: {words/1e6:.2f}M words')
    

bnc_spoken: 1,252,593 words
childes: 2,716,591 words
gutenberg: 2,819,070 words
open_subtitles: 2,077,019 words
simple_wiki: 1,405,366 words
switchboard: 148,340 words
Total: 10.42M words


In [10]:
# Count whitespace-delimited words in every raw test file and print the
# per-category counts followed by the overall total (in millions).
words = 0
for cat in categories:
    data_path = os.path.join('datasets/test', cat + ".test")

    with open(data_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split once and reuse the count; splitting the whole file a second time
    # just for the print would double the work per file.
    n_words = len(text.split())
    words += n_words
    print(f'{cat}: {n_words:,} words')
print(f'Total: {words/1e6:.2f}M words')
    

bnc_spoken: 932,334 words
childes: 2,700,128 words
gutenberg: 2,404,516 words
open_subtitles: 1,949,898 words
simple_wiki: 1,300,077 words
switchboard: 167,133 words
Total: 9.45M words


# Cleaning Datasets


In [20]:
# Clean every raw corpus file of every split and write the result to
# cleaned_datasets/<split>/<category><extension>, printing the word count
# before and after cleaning.
DATA_SPLIT = ['train_10M', 'train_100M', 'dev', 'test']

# File extension used by each split; both training splits share ".train".
# A split that is missing here raises KeyError in the loop below instead of
# being skipped silently.
SPLIT_EXTENSION = {
    'train_10M': '.train',
    'train_100M': '.train',
    'dev': '.dev',
    'test': '.test',
}

# Category name -> cleanup function imported from the project's `clean` module.
categories_and_function = {
    'bnc_spoken': cleanup_bnc_spoken,
    'childes': cleanup_childes,
    'gutenberg': cleanup_gutenberg,
    'open_subtitles': cleanup_open_subtitles,
    'simple_wiki': cleanup_simple_wiki,
    'switchboard': cleanup_switchboard
}

# Passed as the second argument to every cleanup function.
# NOTE(review): its exact effect is defined in clean.py — confirm there.
seq_length = 128

for split in DATA_SPLIT:
    print("--" * 40)
    extension = SPLIT_EXTENSION[split]

    # The output directory only depends on the split, so create it once per
    # split rather than once per category.
    cleaned_data_dir = os.path.join("cleaned_datasets", split)
    os.makedirs(cleaned_data_dir, exist_ok=True)

    for cat, func in categories_and_function.items():
        data_path = os.path.join("datasets", split, cat + extension)
        cleaned_data_path = os.path.join(cleaned_data_dir, cat + extension)

        with open(data_path, 'r', encoding='utf-8') as f:
            text = f.read()
        normal_words = len(text.split())

        cleaned_text = func(text, seq_length)
        cleaned_words = len(cleaned_text.split())
        print(f'{cat} {split}: {normal_words} -> {cleaned_words} words')

        with open(cleaned_data_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
                

--------------------------------------------------------------------------------
bnc_spoken train_10M: 932497 -> 927102 words
childes train_10M: 2839591 -> 2839397 words
gutenberg train_10M: 2539489 -> 2539489 words
open_subtitles train_10M: 2041868 -> 2041523 words
simple_wiki train_10M: 1453539 -> 1441982 words
switchboard train_10M: 146789 -> 146789 words
--------------------------------------------------------------------------------
bnc_spoken train_100M: 7760721 -> 7713524 words
childes train_100M: 28903287 -> 28901073 words
gutenberg train_100M: 26371234 -> 26371234 words
open_subtitles train_100M: 19963117 -> 19960443 words
simple_wiki train_100M: 14674311 -> 14564235 words
switchboard train_100M: 1342029 -> 1342029 words
--------------------------------------------------------------------------------
bnc_spoken dev: 1252593 -> 1245432 words
childes dev: 2716591 -> 2716381 words
gutenberg dev: 2819070 -> 2819070 words
open_subtitles dev: 2077019 -> 2076847 words
simple_wiki dev

# Building Tokenizers 

## train_10M tokenizer

In [21]:
# Cleaned train_10M files, one per corpus category.
paths = [
    os.path.join('cleaned_datasets/train_10M', cat + ".train")
    for cat in categories
]
     

In [22]:
# Display the collected train_10M file paths as the cell output.
paths

['cleaned_datasets/train_10M/bnc_spoken.train',
 'cleaned_datasets/train_10M/childes.train',
 'cleaned_datasets/train_10M/gutenberg.train',
 'cleaned_datasets/train_10M/open_subtitles.train',
 'cleaned_datasets/train_10M/simple_wiki.train',
 'cleaned_datasets/train_10M/switchboard.train']

In [23]:
# BPE tokenizer with byte-level components. The attribute assignments are
# independent of each other and are listed in pipeline order:
# normalise -> pre-tokenise -> (BPE model) -> post-process, then the decoder.
tokenizer = Tokenizer(models.BPE())

# NFKC Unicode normalisation is applied to the text first.
tokenizer.normalizer = NFKC()
# Byte-level pre-tokenisation; add_prefix_space=True puts a space in front of
# the first word so it is treated like a word in the middle of a sentence.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
# Byte-level decoder, the counterpart of the byte-level pre-tokenizer.
tokenizer.decoder = decoders.ByteLevel()



In [24]:
# Special tokens handed to the trainer. Per the tokenizers documentation the
# order matters: they receive the first ids in the order listed.
special_tokens = ["<s>", "<pad>", "</s>"]

# NOTE(review): no initial_alphabet is passed, so the base vocabulary
# presumably only covers byte symbols that occur in the training files.
# Consider initial_alphabet=pre_tokenizers.ByteLevel.alphabet() if characters
# unseen during training must remain encodable (this would change token ids).
trainer = trainers.BpeTrainer(
    vocab_size=32000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Train the BPE model on the files listed in `paths`.
tokenizer.train(paths, trainer=trainer)







In [25]:
# Write the trained tokenizer to tokenizer/tokenizer_train10M.json,
# creating the output directory if it does not exist yet.
tokenizer_dir = 'tokenizer'
os.makedirs(tokenizer_dir, exist_ok=True)

tokenizer_path = os.path.join(tokenizer_dir, 'tokenizer_train10M.json')
tokenizer.save(tokenizer_path)
print(f"Tokenizer saved to {tokenizer_path}")

Tokenizer saved to tokenizer/tokenizer_train10M.json


## train_100M tokenizer

In [28]:
# Cleaned train_100M files, one per corpus category.
paths = [
    os.path.join('cleaned_datasets/train_100M', cat + ".train")
    for cat in categories
]
     

In [29]:
# Display the collected train_100M file paths as the cell output.
paths

['cleaned_datasets/train_100M/bnc_spoken.train',
 'cleaned_datasets/train_100M/childes.train',
 'cleaned_datasets/train_100M/gutenberg.train',
 'cleaned_datasets/train_100M/open_subtitles.train',
 'cleaned_datasets/train_100M/simple_wiki.train',
 'cleaned_datasets/train_100M/switchboard.train']

In [30]:
# Fresh BPE tokenizer with byte-level components for the train_100M run. The
# attribute assignments are independent of each other and are listed in
# pipeline order: normalise -> pre-tokenise -> (BPE model) -> post-process,
# then the decoder.
tokenizer = Tokenizer(models.BPE())

# NFKC Unicode normalisation is applied to the text first.
tokenizer.normalizer = NFKC()
# Byte-level pre-tokenisation; add_prefix_space=True puts a space in front of
# the first word so it is treated like a word in the middle of a sentence.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
# Byte-level decoder, the counterpart of the byte-level pre-tokenizer.
tokenizer.decoder = decoders.ByteLevel()



In [31]:
# Special tokens handed to the trainer. Per the tokenizers documentation the
# order matters: they receive the first ids in the order listed.
special_tokens = ["<s>", "<pad>", "</s>"]

# NOTE(review): no initial_alphabet is passed, so the base vocabulary
# presumably only covers byte symbols that occur in the training files.
# Consider initial_alphabet=pre_tokenizers.ByteLevel.alphabet() if characters
# unseen during training must remain encodable (this would change token ids).
trainer = trainers.BpeTrainer(
    vocab_size=32000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Train the BPE model on the files listed in `paths`.
tokenizer.train(paths, trainer=trainer)







In [32]:

# Write the trained tokenizer to tokenizer/tokenizer_train100M.json.
# The output directory is created here so that this cell works even when no
# other cell has created it beforehand.
os.makedirs('tokenizer', exist_ok=True)
tokenizer_path = os.path.join('tokenizer', 'tokenizer_train100M.json')
tokenizer.save(tokenizer_path)
print(f"Tokenizer saved to {tokenizer_path}")

Tokenizer saved to tokenizer/tokenizer_train100M.json


In [33]:
# Reload the tokenizer from the file at `tokenizer_path`, so the check below
# runs against the saved file rather than the in-memory object.
tokenizer = Tokenizer.from_file(tokenizer_path)
print(f"Tokenizer loaded from {tokenizer_path}")

# Round-trip a short sample sentence: encode it, then decode the ids again.
text = "Hello world! This is a test."
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded.ids)

print(f"Encoded text: {encoded.tokens}")
print(f"Encoded IDs: {encoded.ids}")
print(f"Decoded text: {decoded}")

Tokenizer loaded from tokenizer/tokenizer_train100M.json
Encoded text: ['ĠHello', 'Ġworld', '!', 'ĠThis', 'Ġis', 'Ġa', 'Ġtest', '.']
Encoded IDs: [2556, 1229, 3, 757, 260, 192, 2775, 16]
Decoded text:  Hello world! This is a test.
