In [None]:
from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.pre_tokenizers import Metaspace 
from tokenizers.normalizers import NFD, Lowercase, StripAccents, Sequence
from tokenizers.processors import TemplateProcessing
from tokenizers.decoders import Metaspace as MetaspaceDecoder
from tokenizers import normalizers
import itertools
from datasets import load_dataset
import time
import re
import numpy as np
import random
from datasets import Dataset, load_dataset, interleave_datasets

In [None]:
# Similar to the comprehensive block in model.ipynb.
# Load the existing tokenizer from disk; tokenizer_test() below reads this
# module-level `tokenizer` as a global when encoding batches.
tokenizer = Tokenizer.from_file("tokenizer.json")
# Special-token ids. Only UNK_TOKEN_ID is referenced by the code in this notebook.
# NOTE(review): these presumably mirror the position of each token in the
# special_tokens list used when tokenizer.json was trained - confirm against the file.
PAD_TOKEN_ID = 0
UNK_TOKEN_ID = 1
EN_TOKEN_ID = 2
DE_TOKEN_ID = 3
EOS_TOKEN_ID = 4
MASK_TOKEN_ID = 5

# Collects the raw texts accepted by tokenizer_test(). It is filled as a side
# effect while the streaming dataset is iterated, not when the pipeline is built.
clean_texts = {"tokenizer_approved": []}

# Shuffle settings shared by both language streams and by the interleave step
buffer_size = 10000
seed = 42


def _labelled_shuffled_stream(config_name, lang):
    """Stream one OSCAR config, tag every example with `lang`, then shuffle it.

    Reads the module-level `seed` and `buffer_size` for the shuffle.
    """
    stream = load_dataset(
        'oscar',
        name=config_name,
        split='train',
        streaming=True
    )
    # Keep the text and add a language label that filter_texts() reads later
    stream = stream.map(lambda ex: {"text": ex["text"], "lang": lang})
    return stream.shuffle(seed=seed, buffer_size=buffer_size)


# English and German versions of OSCAR, labelled and shuffled
en = _labelled_shuffled_stream('unshuffled_deduplicated_en', "en")
de = _labelled_shuffled_stream('unshuffled_deduplicated_de', "de")

# Interleave both languages into one dataset, sampling each with probability 0.5
streaming_dataset = interleave_datasets(
    [en, de],
    probabilities=[0.5, 0.5],
    stopping_strategy="first_exhausted",
    seed=seed
)

# Stop words and patterns.
# Matches (case-insensitively) any of:
#   * a spam / boilerplate keyword or phrase as a whole word,
#   * a run of three or more '-', '=' or '*' characters,
#   * a comma-separated list of at least eleven words (keyword stuffing).
# The pattern is compiled with re.VERBOSE, where literal whitespace inside the
# pattern is ignored. Multi-word phrases therefore spell their gaps as \s+;
# written with plain spaces they would only match the concatenated form
# (e.g. "privacypolicy") and never the real phrase.
GENERAL_BAD_PATTERNS = re.compile(
    r'''
    \b(
        casino|gambling|poker|betting|slots?|roulette|blackjack|baccarat|craps|freespins|bonus|jackpot|wager|no\s+deposit|ohne\s+einzahlung|kostenlos\s+spielen|echtes\s+geld|spielautomaten|spielhalle|spielbank|willkommensbonus|startguthaben|casinospiele|
        porn|porno|escort|erotic|hookup|onlyfans|nudes?|camgirls?|sexkontakte|erotik|sexchat|live\s+sex|stripchat|webcamsex|geschlechtsverkehr|selbstbefriedigung|masturbation|pornos|pornhub|xvideos|xnxx|vibrators?|dicks?|cums?|
        fast\s+cash|bad\s+credits?|zinsfrei|geld\s+leihen|kredit\s+aufnehmen|ratenzahlung|schnellkredit|binary\s+options?|payday\s+loans?|payday\s+advance|cash\s+advance|short-term\s+loans?|no\s+credit\s+check|guaranteed\s+loan|Kurzzeitkredite?|Minikredite?|Sofortkredite?|Kredit\s+ohne\s+Schufa|schnelles\s+Geld|Geld\s+sofort|
        tinder|badoo|parship|elitepartner|lovoo|flirt|verlieben|
        test\s+answers?|cheat\s+sheet|homework\s+help|buy\s+answers?|buy\s+exam|abitur\s+lösung|prüfung\s+antworten|examen\s+lösung|
        bitcoin|ethereum|blockchain|nft|ico|airdrop|pump\s+and\s+dump|binance|coinbase|kraken|crypto\s+trading|krypto|kryptowährung|
        privacy\s+policy|terms\s+of\s+use|terms\s+and\s+conditions|all\s+rights\s+reserved|copyright|impressum|datenschutz|nutzungsbedingungen|alle\s+rechte\s+vorbehalten|cookie\s+policy|agb|rechtliche\s+hinweise|haftungsausschluss|
        viagra|levitra|cialis|penis|enlargement|erection|erektionsstörung|potenzmittel|libido|sexualstörung|
        weight\s+loss|fat\s+burning|diet\s+pills|appetite\s+suppressant|abnehmen|diätpillen|fettverbrennung|schnell\s+abnehmen|
        make\s+money\s+online|side\s+hustle|get\s+rich\s+quick|passives\s+einkommen|geld\s+verdienen|heimarbeit|schnell\s+reich\s+werden|
        click\s+here|buy\s+now|order\s+now|free\s+trial|limited\s+offer|jetzt\s+kaufen|hier\s+klicken|jetzt\s+abonnieren|kostenlos\s+testen|nur\s+heute
    )\b
    |
    -{3,}|={3,}|\*{3,}|
    (?:(?:\w+\s*,\s*){10,}\w+)
    ''',
    re.IGNORECASE | re.VERBOSE
)

# Header stop words.
# Matches when a line starts (after optional whitespace) with one of the
# boilerplate / navigation / marketing keywords below, in English or German.
# - re.MULTILINE makes ^ match at the start of every line, not just the text.
# - There is no trailing word boundary, so keywords match as prefixes
#   (e.g. "help" also matches a line starting with "Helpful").
# - "register", "cookies?" and "einfach" are listed twice; the repeats are
#   redundant but harmless.
# filter_texts() applies this pattern only to the first `header_check_length`
# characters of a text.
BOILERPLATE_HEADER_PATTERNS = re.compile(
    r'^(?:\s*)'
    r'(you are not logged in|you do not have permission|access this page|'
    r'terms of use|privacy policy|cookies?|all rights reserved|'
    r'sign in|log in|register|create an account|register|'
    r'skip to content|main navigation|toggle navigation|'
    r'select language|choose your region|'
    r'cheap|discounts?|easy|billige|rabatte|einfach|'
    r'sie sind nicht angemeldet|kein zugriff|'
    r'zur hauptnavigation|navigation überspringen|'
    r'anmelden|einloggen|registrieren|konto erstellen|'
    r'nutzungsbedingungen|datenschutz|cookies?|'
    r'alle rechte vorbehalten|sprache auswählen|region wählen|'
    r'günstig|rabatt|einfach|schnell|kostenlos|angebot|aktionen|'
    r'jetzt anmelden|mehr erfahren|hier klicken|'
    r'help|hilfe|assist|unterstützen|call|anrufen|send|senden|respond|antworten|fill|ausfüllen)',

    re.IGNORECASE | re.MULTILINE
)

# One of the many tested filters. No useful patterns were found.
def entropy(text):
    """Return the Shannon entropy of `text` in bits per character.

    The entropy is computed over the character frequency distribution of
    `text`. An empty string yields 0.

    Args:
        text: the string (or any iterable of hashable items) to measure.

    Returns:
        The entropy as a number >= 0.
    """
    # Imported locally because the notebook's import cell does not import
    # `math`; without this the function raised NameError on first use.
    import math

    freqs = {}
    for char in text:
        freqs[char] = freqs.get(char, 0) + 1

    total = sum(freqs.values())
    # Every stored count is >= 1, so each probability is strictly positive
    # and log2 is always defined.
    return -sum((count / total) * math.log2(count / total) for count in freqs.values())


def filter_texts(example: dict,
                 min_num_of_words = 100,
                 max_digit_ratio=0.18,
                 min_alpha_word_ratio=0.75,
                 max_symbol_ratio=0.1,
                 header_check_length=200,
                 allowed_uppercase_ratio=0.07,
                 logging=False) -> bool:
    """Decide whether a raw text is clean enough to keep.

    The checks run in order and the first failing one rejects the text:
    minimum length (384 characters), GENERAL_BAD_PATTERNS anywhere in the
    text, BOILERPLATE_HEADER_PATTERNS in the header, minimum word count,
    type-token ratio, digit ratio, alphabetic-word ratio, mean word length
    and uppercase ratio.

    Args:
        example: mapping with a "text" string and a "lang" code; "de" gets a
            higher mean-word-length ceiling than any other value.
        min_num_of_words: minimum number of whitespace-separated words.
        max_digit_ratio: maximum share of digit characters in the text.
        min_alpha_word_ratio: minimum share of words that are purely
            alphabetic once backticks and apostrophes are removed.
        max_symbol_ratio: accepted for interface compatibility; no check in
            this function uses it.
        header_check_length: number of leading characters scanned for
            boilerplate header patterns.
        allowed_uppercase_ratio: maximum share of uppercase characters.
        logging: when True, print the name of the check that rejected the
            text (the base length check is never logged).

    Returns:
        True if the text passes every check, False otherwise.
    """
    def reject(reason: str) -> bool:
        # Shared rejection path: optionally report which filter fired.
        if logging:
            print(reason)
        return False

    text = example["text"]
    lang = example["lang"]

    # Base length filter
    if not text or len(text) < 384:
        return False

    # Stop patterns filter
    if GENERAL_BAD_PATTERNS.search(text):
        return reject("general bad pattern")

    # Header stop words filter
    if BOILERPLATE_HEADER_PATTERNS.search(text[:header_check_length]):
        return reject("header bad pattern")

    # Statistical filters
    words = text.lower().split()
    num_words = len(words)

    # Filter by total number of words. The explicit zero check keeps the
    # ratios below safe even when min_num_of_words is set to 0.
    if num_words == 0 or num_words < min_num_of_words:
        return reject("num words")

    # Type-token ratio: share of DISTINCT words. Counting the list itself
    # always gives 1.0, which disabled this filter, so a set is required.
    cleaned_words = [word.strip(".,!?;:`'\"") for word in words]
    ttr = len(set(cleaned_words)) / num_words
    # Longer texts naturally repeat more words, so the threshold relaxes.
    if num_words < 500:
        ttr_threshold = 0.4
    elif num_words < 2000:
        ttr_threshold = 0.35
    else:
        ttr_threshold = 0.3

    if ttr < ttr_threshold:
        return reject("unique words")

    # Filter by numerical digits and alphabetic words
    num_digits = 0
    num_alpha_words = 0
    for word in words:
        num_digits += sum(c.isdigit() for c in word)
        if word.replace("`", "").replace("'", "").isalpha():
            num_alpha_words += 1

    if (num_digits / len(text)) > max_digit_ratio:
        return reject("digit words")

    if (num_alpha_words / num_words) < min_alpha_word_ratio:
        return reject("alpha ratio")

    # Filter by mean word length; German gets a higher ceiling than English
    mean_word_len = sum(len(w) for w in words) / num_words
    max_mean_word_len = 15 if lang == 'de' else 12
    if not (3 < mean_word_len < max_mean_word_len):
        return reject("word len")

    # Filter by uppercase ratio
    uppercase_chars_ratio = sum(1 for c in text if c.isupper()) / len(text)
    if uppercase_chars_ratio > allowed_uppercase_ratio:
        return reject("uppercase ratio")

    return True

def tokenizer_test(batch, max_len=320, min_len=256, max_avg_token_id=7000, min_avg_token_len=3.2, max_avg_token_len=5, max_unk_count=3):
    """Tokenize a batch of texts and flag which ones the tokenizer handles well.

    Each text is standardized (quotes, dashes, currency symbols, newlines ->
    "[NL]") and encoded with the module-level `tokenizer`. A text is accepted
    only if, after truncation to `max_len - 1` tokens, it:
      * has at least `min_len` tokens,
      * contains fewer than `max_unk_count` unknown tokens,
      * has a mean token id of at most `max_avg_token_id`,
      * has a mean token string length within
        [`min_avg_token_len`, `max_avg_token_len`].

    Side effect: the ORIGINAL (non-standardized) text of every accepted
    example is appended to `clean_texts["tokenizer_approved"]`.

    Args:
        batch: batched examples; only `batch["text"]` (a list of strings) is read.
        max_len: sequence budget; ids are truncated to `max_len - 1`.
            NOTE(review): the spare slot is presumably reserved for a special
            token added later - confirm against the model notebook.
        min_len: minimum number of token ids required after truncation.
        max_avg_token_id: upper bound for the mean token id.
        min_avg_token_len: lower bound for the mean token string length.
        max_avg_token_len: upper bound for the mean token string length.
        max_unk_count: a text with this many or more unknown tokens is rejected.

    Returns:
        dict with "tokenizer_test" (list of bool, one per input text) and
        "token_ids" (list holding the truncated ids for accepted texts and
        None for rejected ones).
    """
    # Text standardization. Insertion order matters: "`" is mapped to "'"
    # before "''" is mapped to '"', so a doubled backtick ends up as '"'.
    replacements = {
        "\n": "[NL]",
        "“": '"',
        "”": '"',
        "„": '"',
        "’": "'",
        "—": "-",
        "…": "...",
        "`": "'",
        "''": '"',
        "$": "dollars",
        "€": "euros",
        "½": "1/2"
    }
    texts = []
    for raw_text in batch["text"]:
        text = raw_text
        for old, new in replacements.items():
            text = text.replace(old, new)
        texts.append(text)

    # Get encodings
    encodings = tokenizer.encode_batch(texts, add_special_tokens=False)
    batch_test = []
    batch_token_ids = [None] * len(batch["text"])
    max_tokens_per_text = max_len - 1

    for i, enc in enumerate(encodings):
        # Trim to the token budget, then filter by min_len
        token_ids = enc.ids
        if len(token_ids) > max_tokens_per_text:
            token_ids = token_ids[:max_tokens_per_text]
        if len(token_ids) < min_len:
            batch_test.append(False)
            continue

        # Filter by unknown tokens
        unk_count = sum(1 for t in token_ids if t == UNK_TOKEN_ID)
        if unk_count >= max_unk_count:
            batch_test.append(False)
            continue

        # Filter by average token id
        avg_token_id = np.mean(token_ids)
        if avg_token_id > max_avg_token_id:
            batch_test.append(False)
            continue

        # Guard against an empty token list before averaging lengths
        tokens_as_strings = enc.tokens
        if not tokens_as_strings:
            batch_test.append(False)
            continue

        # Filter by average token length, measured over all tokens of the
        # encoding (not only the truncated prefix). Reject values OUTSIDE
        # [min, max]; the chained comparison `max < avg < min` could never be
        # true, which left this filter disabled.
        avg_token_len = sum(len(t) for t in tokens_as_strings) / len(tokens_as_strings)
        if not (min_avg_token_len <= avg_token_len <= max_avg_token_len):
            batch_test.append(False)
            continue

        # Mark the text as accepted, keep its ids and remember the raw text
        batch_test.append(True)
        batch_token_ids[i] = token_ids
        clean_texts["tokenizer_approved"].append(batch["text"][i])

    return {
        "tokenizer_test": batch_test,
        "token_ids": batch_token_ids
    }

def filter_by_tokenizer_test(example):
    """Filter predicate: return the example's "tokenizer_test" flag unchanged."""
    verdict = example["tokenizer_test"]
    return verdict

# Wire up the pipeline: heuristic text filter -> batched tokenizer check ->
# keep only the examples the tokenizer check flagged as good.
streaming_dataset = (
    streaming_dataset
    .filter(filter_texts)
    .map(tokenizer_test, batched=True, batch_size=128)
    .filter(filter_by_tokenizer_test)
)

In [None]:
# Create the iterator in its own cell: re-running the loop cell below keeps
# consuming this same iterator instead of restarting the stream.
iterator = iter(streaming_dataset)

In [None]:
# Pull 20000 accepted examples through the pipeline. The examples themselves
# are discarded; the point is the side effect in tokenizer_test(), which
# appends accepted texts to clean_texts["tokenizer_approved"].
for i in range(20000):
    # Progress report every 50 examples
    if i % 50 == 0:
        print(f"{i}) steps")
    # Raises StopIteration if the stream runs out before 20000 examples
    next(iterator)

In [None]:
# Number of texts collected so far (shown as the cell output)
len(clean_texts["tokenizer_approved"])

In [None]:
# Used for creating the first tokenizer. Kept for reference only: the code below is disabled by wrapping it in a string literal.
"""
text_tokenizer_en = ""
i = 0
for example in itertools.islice(dataset_en_stream, 8000):
    i+=1
    text_tokenizer_en += example['text']
    time.sleep(0.008)
    if i%100==0:
        print(f"added {i} en texts")
        
i = 0
text_tokenizer_de = ""
for example in itertools.islice(dataset_de_stream, 12000):
    i+=1
    text_tokenizer_de += example['text']
    time.sleep(0.008)
    if i%100==0:
        print(f"added {i} de texts")
"""

In [None]:
#text_tokenizer_de[:3000]

In [None]:
def clean_text_for_tokenizer(text: str) -> str:
    """Strip characters outside the tokenizer whitelist and tidy whitespace.

    Kept characters: ASCII letters, German umlauts and eszett, digits,
    whitespace and the punctuation . , ' ! ? " - ( ) : ; &. Everything else
    is deleted, then each whitespace run becomes a single space and the
    result is trimmed at both ends.
    """
    whitelist = r"a-zA-ZäöüßÄÖÜ0-9\s.,'!?\"\-():;&"
    # Step 1: delete every character that is not whitelisted
    without_disallowed = re.compile(f'[^{whitelist}]').sub('', text)
    # Step 2: squeeze whitespace runs and trim the ends
    return re.compile(r'\s+').sub(' ', without_disallowed).strip()

In [None]:
# Clean every approved text; slice assignment keeps the same list object,
# so the list is still updated in place.
clean_texts["tokenizer_approved"][:] = [
    clean_text_for_tokenizer(approved_text)
    for approved_text in clean_texts["tokenizer_approved"]
]

In [None]:
#text_tokenizer_de = text_tokenizer_de.replace('\n', ' [NL]')
#text_tokenizer_en = text_tokenizer_en.replace('\n', ' [NL]')

#text_tokenizer_de = clean_text_for_tokenizer(text_tokenizer_de)
#text_tokenizer_en = clean_text_for_tokenizer(text_tokenizer_en)

In [None]:
#lines = (text_tokenizer_en+text_tokenizer_de).splitlines()

In [None]:
# Start a fresh, untrained Unigram tokenizer.
# NOTE: this rebinds the global `tokenizer` that tokenizer_test() reads, so
# iterating streaming_dataset after running this cell would encode with this
# new object instead of the one loaded from tokenizer.json.
tokenizer = Tokenizer(Unigram())

In [None]:
# Pair the Metaspace pre-tokenizer with its matching decoder (imported as
# MetaspaceDecoder). No normalizer is assigned in the visible cells, although
# several normalizers are imported at the top.
tokenizer.pre_tokenizer = Metaspace()
tokenizer.decoder = MetaspaceDecoder()

In [None]:
# Trainer configuration for the new Unigram vocabulary.
trainer = UnigramTrainer(
    vocab_size=30000,
    # [SOSE] = start of sequence (English), [SOSD] = start of sequence (German / Deutsch).
    # NOTE(review): the list order presumably determines the ids 0-6 that the
    # *_TOKEN_ID constants at the top of the notebook assume - confirm.
    special_tokens=["[PAD]", "[UNK]", "[SOSE]", "[SOSD]", "[EOS]", "[MASK]", "[NL]"],
    unk_token="[UNK]",
    max_piece_length=16
)

In [None]:
# Train on the cleaned, tokenizer-approved texts collected earlier in the notebook
tokenizer.train_from_iterator(clean_texts["tokenizer_approved"], trainer=trainer)

In [None]:
# Smoke test: print the token split of a few German and English sentences,
# including rare and very long words.
for sample_sentence in (
    "Übersetzung ist ein entscheidender Bestandteil des maschinellen lernens.",
    "Translation is very important in machine learning.",
    "Das hierarchische Bayes-Modell wurde modifiziert.",
    "This is uncharacteristically difficult.",
    "Let's test antidisestablishmentarianism.",
):
    print(tokenizer.encode(sample_sentence).tokens)

In [None]:
# Save under a new filename; the tokenizer.json loaded at the top of the
# notebook is not overwritten.
tokenizer.save("tokenizer_clean.json")