# Notebook for training (generating the vocabulary) and tokenizing a text

In [1]:
# Imports
from tokenization_methods.tokenization_method import TokenizationMethod
from tokenization_methods.byte_pair_encoding import BytePairEncoding
from tokenization_methods.unigram_language_model import UnigramLanguageModel
from tokenization_methods.wordpiece import WordPiece

import pickle
import os

In [2]:
# Function to generate clean text
def clean_text(raw_text):
    """Normalise a raw string into lower-case, accent-free, digit-free text.

    The steps run in this order:
      1. newlines, tabs and a fixed set of punctuation marks become spaces;
      2. the text is lower-cased;
      3. vowels carrying an acute, diaeresis or grave accent lose the accent
         and every digit 0-9 becomes a space;
      4. runs of spaces are collapsed to a single space.

    Periods and "ñ" are not in any of the replacement sets, so they are kept.
    Leading/trailing spaces are not stripped.
    """
    # Step 1: every separator/punctuation character maps to one space.
    separators = ("\n", "»", ":", "'", '"', ";", ",", ")", "(", "[", "]", "?", "¿", "!", "¡", "-", "«", "\t")
    text = raw_text.translate(str.maketrans(dict.fromkeys(separators, " ")))

    # Step 2: lower-case before un-accenting so upper-case accented vowels are covered too.
    text = text.lower()

    # Step 3: one table handles both the accent stripping and the digit removal.
    replacements = {
        "á": "a", "ä": "a", "à": "a",
        "é": "e", "ë": "e", "è": "e",
        "í": "i", "ï": "i", "ì": "i",
        "ó": "o", "ö": "o", "ò": "o",
        "ú": "u", "ü": "u", "ù": "u",
    }
    replacements.update(dict.fromkeys("0123456789", " "))
    text = text.translate(str.maketrans(replacements))

    # Step 4: keep halving runs of spaces until none is left.
    while "  " in text:
        text = text.replace("  ", " ")

    return text

## Training

### Load data

In [3]:
# Read all the lines (training corpus is Don Quijote).
# The encoding is passed explicitly: without it open() falls back to the
# platform locale, which can mangle the accented Spanish characters.
# NOTE(review): assumes quijote.txt is stored as UTF-8 -- confirm.
with open("quijote.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# All the lines in the same string
raw_text = " ".join(lines)

# Clean the text and split it into sentences; clean_text leaves periods in
# place, so "." is used as the sentence delimiter.
sentences = clean_text(raw_text).split(".")

# Make the corpus be made up from each of the sentences in the text:
# stripped, non-empty, and not starting with "capitulo" (presumably the
# chapter headings of the book).
corpus = [sentence.strip() for sentence in sentences]
corpus = [sentence for sentence in corpus if sentence]
corpus = [sentence for sentence in corpus if not sentence.startswith("capitulo")]

### Train Tokenizers

In [4]:
# Folder where the trained vocabularies are cached as pickle files.
RESULTS_FOLDER = "results"
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(RESULTS_FOLDER, exist_ok=True)

In [5]:
# Requested vocabulary size for each named configuration, smallest first.
VOCAB_SIZES = dict(
    tiny=250,
    small=1_000,
    medium=4_000,
    large=16_000,
)

In [6]:
def get_bpe_vocab(vocab_size, file_name):
    """Return the BPE vocabulary stored in ``file_name``, training it first if absent.

    When ``file_name`` does not exist, a vocabulary of ``vocab_size`` is
    created from the module-level ``corpus`` and pickled to that path. In
    every case the value returned is the one read back from the pickle file.

    Note: ``pickle.load`` can execute arbitrary code; only point this at
    cache files produced by this notebook.
    """
    cache_missing = not os.path.exists(file_name)
    if cache_missing:
        trained_vocab = BytePairEncoding().create_vocabulary(corpus=corpus, vocab_size=vocab_size)
        with open(file_name, "wb") as cache_out:
            pickle.dump(trained_vocab, cache_out)

    with open(file_name, "rb") as cache_in:
        cached_vocab = pickle.load(cache_in)
    return cached_vocab

def get_bpe_tiny_vocab():
    """Return the "tiny" BPE vocabulary, cached in ``vocab_bpe_tiny.pkl`` under RESULTS_FOLDER."""
    size = VOCAB_SIZES["tiny"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_bpe_tiny.pkl")
    return get_bpe_vocab(size, cache_path)

def get_bpe_small_vocab():
    """Return the "small" BPE vocabulary, cached in ``vocab_bpe_small.pkl`` under RESULTS_FOLDER."""
    size = VOCAB_SIZES["small"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_bpe_small.pkl")
    return get_bpe_vocab(size, cache_path)

def get_bpe_medium_vocab():
    """Return the "medium" BPE vocabulary, cached in ``vocab_bpe_medium.pkl`` under RESULTS_FOLDER."""
    size = VOCAB_SIZES["medium"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_bpe_medium.pkl")
    return get_bpe_vocab(size, cache_path)

def get_bpe_large_vocab():
    """Return the "large" BPE vocabulary, cached in ``vocab_bpe_large.pkl`` under RESULTS_FOLDER."""
    size = VOCAB_SIZES["large"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_bpe_large.pkl")
    return get_bpe_vocab(size, cache_path)

def get_bpe_vocabs():
    """Return the four BPE vocabularies as a tuple ordered (tiny, small, medium, large).

    Each one is loaded from its pickle cache or trained on first use; they are
    obtained in that same order, smallest first.
    """
    return (
        get_bpe_tiny_vocab(),
        get_bpe_small_vocab(),
        get_bpe_medium_vocab(),
        get_bpe_large_vocab(),
    )

In [7]:
def get_unigram_vocab(vocab_size, file_name, initial_vocab, max_sentences=500):
    """Return the Unigram LM vocabulary stored in ``file_name``, training it first if absent.

    Parameters
    ----------
    vocab_size:
        Size forwarded to ``UnigramLanguageModel().create_vocabulary``.
    file_name:
        Path of the pickle cache. If it exists, no training happens.
    initial_vocab:
        Forwarded as ``starting_vocabulary`` to ``create_vocabulary``.
    max_sentences:
        How many leading sentences of the module-level ``corpus`` are used for
        training. Defaults to 500, the value that used to be hard-coded;
        ``None`` uses the full corpus.

    Returns
    -------
    The object read back from the pickle file.

    Notes
    -----
    The cache is keyed by ``file_name`` only: changing ``vocab_size``,
    ``initial_vocab`` or ``max_sentences`` has no effect while the file exists.
    ``pickle.load`` can execute arbitrary code; only point this at cache files
    produced by this notebook.
    """
    if not os.path.exists(file_name):
        print("Training file: ", file_name)
        vocab = UnigramLanguageModel().create_vocabulary(
            corpus=corpus[:max_sentences],  # NOTE: by default only a prefix of the corpus is used
            vocab_size=vocab_size,
            starting_vocabulary=initial_vocab,
        )
        with open(file_name, "wb") as f:
            pickle.dump(vocab, f)

    with open(file_name, "rb") as f:
        return pickle.load(f)

def get_unigram_tiny_vocab():
    """Return the "tiny" Unigram vocabulary, cached in ``vocab_unigram_tiny.pkl`` under RESULTS_FOLDER.

    The "small" vocabulary is passed as the initial vocabulary for training.
    It is fetched on every call, even when the tiny cache file already
    exists, because the argument is evaluated before the cache check.
    """
    return get_unigram_vocab(
        VOCAB_SIZES["tiny"],
        # Plain literal: the previous f-string had no placeholders.
        os.path.join(RESULTS_FOLDER, "vocab_unigram_tiny.pkl"),
        initial_vocab=get_unigram_small_vocab(),
    )

def get_unigram_small_vocab():
    """Return the "small" Unigram vocabulary, cached in ``vocab_unigram_small.pkl`` under RESULTS_FOLDER.

    The "medium" vocabulary is passed as the initial vocabulary for training.
    It is fetched on every call, even when the small cache file already
    exists, because the argument is evaluated before the cache check.
    """
    return get_unigram_vocab(
        VOCAB_SIZES["small"],
        # Plain literal: the previous f-string had no placeholders.
        os.path.join(RESULTS_FOLDER, "vocab_unigram_small.pkl"),
        initial_vocab=get_unigram_medium_vocab(),
    )

def get_unigram_medium_vocab():
    """Return the "medium" Unigram vocabulary, cached in ``vocab_unigram_medium.pkl`` under RESULTS_FOLDER.

    No initial vocabulary is supplied (``initial_vocab=None``).
    """
    return get_unigram_vocab(
        VOCAB_SIZES["medium"],
        # Plain literal: the previous f-string had no placeholders.
        os.path.join(RESULTS_FOLDER, "vocab_unigram_medium.pkl"),
        initial_vocab=None,
    )

def get_unigram_large_vocab():
    """Return the "large" Unigram vocabulary, cached in ``vocab_unigram_large.pkl`` under RESULTS_FOLDER.

    No initial vocabulary is supplied (``initial_vocab=None``).
    """
    size = VOCAB_SIZES["large"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_unigram_large.pkl")
    return get_unigram_vocab(size, cache_path, initial_vocab=None)

def get_unigram_vocabs():
    """Return the Unigram vocabularies as a tuple ordered (tiny, small, medium).

    The "large" vocabulary is not built by this helper. The vocabularies are
    obtained from the largest to the smallest, since each smaller one is
    trained starting from the next larger one.
    """
    medium_vocab = get_unigram_medium_vocab()
    small_vocab = get_unigram_small_vocab()
    tiny_vocab = get_unigram_tiny_vocab()

    return (tiny_vocab, small_vocab, medium_vocab)

In [8]:
def get_wp_vocab(vocab_size, file_name):
    """Return the WordPiece vocabulary stored in ``file_name``, training it first if absent.

    When ``file_name`` does not exist, a message is printed, a vocabulary of
    ``vocab_size`` is created from the module-level ``corpus`` and pickled to
    that path. In every case the value returned is the one read back from the
    pickle file.

    Note: ``pickle.load`` can execute arbitrary code; only point this at
    cache files produced by this notebook.
    """
    needs_training = not os.path.exists(file_name)
    if needs_training:
        print("Training file: ", file_name)
        trained_vocab = WordPiece().create_vocabulary(corpus=corpus, vocab_size=vocab_size)
        with open(file_name, "wb") as cache_out:
            pickle.dump(trained_vocab, cache_out)

    with open(file_name, "rb") as cache_in:
        cached_vocab = pickle.load(cache_in)
    return cached_vocab

def get_wp_tiny_vocab():
    """Return the "tiny" WordPiece vocabulary, cached in ``vocab_wp_tiny.pkl`` under RESULTS_FOLDER."""
    size = VOCAB_SIZES["tiny"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_wp_tiny.pkl")
    return get_wp_vocab(size, cache_path)

def get_wp_small_vocab():
    """Return the "small" WordPiece vocabulary, cached in ``vocab_wp_small.pkl`` under RESULTS_FOLDER."""
    size = VOCAB_SIZES["small"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_wp_small.pkl")
    return get_wp_vocab(size, cache_path)

def get_wp_medium_vocab():
    """Return the "medium" WordPiece vocabulary, cached in ``vocab_wp_medium.pkl`` under RESULTS_FOLDER."""
    size = VOCAB_SIZES["medium"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_wp_medium.pkl")
    return get_wp_vocab(size, cache_path)

def get_wp_large_vocab():
    """Return the "large" WordPiece vocabulary, cached in ``vocab_wp_large.pkl`` under RESULTS_FOLDER."""
    size = VOCAB_SIZES["large"]
    cache_path = os.path.join(RESULTS_FOLDER, "vocab_wp_large.pkl")
    return get_wp_vocab(size, cache_path)

def get_wp_vocabs():
    """Return the four WordPiece vocabularies as a tuple ordered (tiny, small, medium, large).

    Each one is loaded from its pickle cache or trained on first use; they are
    obtained in that same order, smallest first.
    """
    return (
        get_wp_tiny_vocab(),
        get_wp_small_vocab(),
        get_wp_medium_vocab(),
        get_wp_large_vocab(),
    )

In [9]:
# Load (or train and cache) the BPE vocabularies: a tuple ordered (tiny, small, medium, large).
bpe_vocabs = get_bpe_vocabs()

In [10]:
# Sanity check: each BPE vocabulary must be an exact prefix of the next larger one.
for vocab_smaller, vocab_larger in zip(bpe_vocabs[:-1], bpe_vocabs[1:]):
    prefix_of_larger = vocab_larger[:len(vocab_smaller)]
    assert vocab_smaller == prefix_of_larger

In [11]:
# Load (or train and cache) the Unigram vocabularies: a tuple ordered (tiny, small, medium).
# The "large" size is not included by get_unigram_vocabs.
unigram_vocabs = get_unigram_vocabs()

In [12]:
# Sanity check: every token of a Unigram vocabulary must also appear in the next larger one
# (membership only; unlike the BPE/WordPiece checks, order is not compared).
for vocab_smaller, vocab_larger in zip(unigram_vocabs[:-1], unigram_vocabs[1:]):
    assert not any(tk not in vocab_larger for tk in vocab_smaller)

In [13]:
# Load (or train and cache) the WordPiece vocabularies: a tuple ordered (tiny, small, medium, large).
wp_vocabs = get_wp_vocabs()

In [14]:
# Sanity check: each WordPiece vocabulary must be an exact prefix of the next larger one.
for vocab_smaller, vocab_larger in zip(wp_vocabs[:-1], wp_vocabs[1:]):
    prefix_of_larger = vocab_larger[:len(vocab_smaller)]
    assert vocab_smaller == prefix_of_larger

## Tokenize

### Load data

In [15]:
# Sample sentences in Spanish (Asimov's laws of robotics, numbered 0-3) used as text to tokenize.
asimov_law_0 = "Un robot no puede dañar a la humanidad o, por inacción, permitir que la humanidad sufra daños."
asimov_law_1 = "Un robot no hará daño a un ser humano, ni por inacción permitirá que un ser humano sufra daño."
asimov_law_2 = "Un robot debe cumplir las órdenes dadas por los seres humanos, a excepción de aquellas que entren en conflicto con la primera ley."
asimov_law_3 = "Un robot debe proteger su propia existencia en la medida en que esta protección no entre en conflicto con la primera o con la segunda ley."

In [16]:
# Normalise the sample with the same cleaning used for the training corpus, then drop
# the periods, which clean_text leaves in place.
text_to_tokenize = clean_text(asimov_law_0).replace(".", "")

In [17]:
# Words to inspect individually. They are written in their cleaned form (lower-case,
# accents removed, e.g. "inaccion") so that they match the cleaned text.
concrete_words_to_analyze = ["dañar", "inaccion", "humanidad", "robot"]
# Guard against typos: each word must occur in the text (substring check).
assert all(w in text_to_tokenize for w in concrete_words_to_analyze)

### Tokenize text

In [18]:
# Tokenize the sample text and each word of interest with every BPE vocabulary,
# going from the smallest vocabulary to the largest.
tokenizer: TokenizationMethod = BytePairEncoding()

for vocab in bpe_vocabs:
    print(f"Size: {len(vocab)}")

    text_tokens = tokenizer.tokenize_text(vocab, text_to_tokenize)
    print("Text: ", text_tokens)

    for word_to_tokenize in concrete_words_to_analyze:
        word_tokens = tokenizer.tokenize_text(vocab, word_to_tokenize)
        print(word_to_tokenize, word_tokens)

    # Blank line between vocabulary sizes.
    print()

Size: 250
Text:  [107, 137, 'b', 'o', 't', '_', 57, 96, 'e', 38, 'd', 214, 78, 154, 'h', 'u', 155, 'i', 202, 28, 86, 102, 'a', 'c', 54, 51, 198, 60, 79, 152, 34, 46, 'h', 'u', 155, 'i', 202, 64, 'f', 119, 'd', 214, 90]
dañar ['d', 214, 42]
inaccion [102, 'a', 'c', 54, 37]
humanidad ['h', 'u', 155, 'i', 'd', 49]
robot [137, 'b', 'o', 't']

Size: 1000
Text:  [107, 137, 620, 't', '_', 57, 586, 'd', 214, 78, 154, 292, 155, 'i', 202, 28, 86, 102, 638, 339, 198, 60, 79, 152, 731, 292, 155, 'i', 202, 64, 'f', 119, 'd', 214, 90]
dañar ['d', 214, 42]
inaccion [102, 638, 54, 37]
humanidad [292, 155, 'i', 314]
robot [137, 620, 't']

Size: 4000
Text:  [107, 137, 620, 't', '_', 57, 586, 2400, 78, 154, 2337, 'i', 202, 1160, 102, 638, 339, 3769, 79, 152, 731, 2337, 'i', 202, 64, 'f', 119, 2400, 90]
dañar [2400, 42]
inaccion [102, 638, 1770]
humanidad [2337, 'i', 314]
robot [137, 620, 't']

Size: 16000
Text:  [107, 137, 620, 5161, 5918, 2400, 6382, 2337, 10829, 1160, 102, 638, 339, 5881, 152, 731, 233

In [19]:
# Tokenize the sample text and each word of interest with every Unigram vocabulary,
# going from the smallest vocabulary to the largest.
tokenizer: TokenizationMethod = UnigramLanguageModel()

for vocab in unigram_vocabs:
    print(f"Size: {len(vocab)}")

    text_tokens = tokenizer.tokenize_text(vocab, text_to_tokenize)
    print("Text: ", text_tokens)

    for word_to_tokenize in concrete_words_to_analyze:
        word_tokens = tokenizer.tokenize_text(vocab, word_to_tokenize)
        print(word_to_tokenize, word_tokens)

    # Blank line between vocabulary sizes.
    print()

Size: 250
Text:  [('u', 'n'), ('r', 'o', 'b', 'o', 't'), ('n', 'o'), ('p', 'u', 'e', 'de'), ('d', 'a', 'ñ', 'a', 'r'), ('a',), ('l', 'a'), ('h', 'u', 'm', 'a', 'n', 'idad'), ('o',), ('por',), ('i', 'n', 'a', 'c', 'cion'), ('per', 'mi', 't', 'i', 'r'), ('que',), ('l', 'a'), ('h', 'u', 'm', 'a', 'n', 'idad'), ('s', 'u', 'f', 'r', 'a'), ('d', 'a', 'ñ', 'o', 's')]
dañar [('d', 'a', 'ñ', 'a', 'r')]
inaccion [('i', 'n', 'a', 'c', 'cion')]
humanidad [('h', 'u', 'm', 'a', 'n', 'idad')]
robot [('r', 'o', 'b', 'o', 't')]

Size: 1000
Text:  [('u', 'n'), ('r', 'o', 'b', 'o', 't'), ('n', 'o'), ('p', 'u', 'e', 'de'), ('d', 'a', 'ñ', 'a', 'r'), ('a',), ('l', 'a'), ('h', 'u', 'm', 'a', 'n', 'idad'), ('o',), ('por',), ('i', 'n', 'a', 'c', 'cion'), ('per', 'mi', 't', 'i', 'r'), ('que',), ('l', 'a'), ('h', 'u', 'm', 'a', 'n', 'idad'), ('s', 'u', 'f', 'r', 'a'), ('d', 'a', 'ñ', 'o', 's')]
dañar [('d', 'a', 'ñ', 'a', 'r')]
inaccion [('i', 'n', 'a', 'c', 'cion')]
humanidad [('h', 'u', 'm', 'a', 'n', 'idad')

In [20]:
# Tokenize the sample text and each word of interest with every WordPiece vocabulary,
# going from the smallest vocabulary to the largest.
tokenizer: TokenizationMethod = WordPiece()

for vocab in wp_vocabs:
    print(f"Size: {len(vocab)}")

    text_tokens = tokenizer.tokenize_text(vocab, text_to_tokenize)
    print("Text: ", text_tokens)

    for word_to_tokenize in concrete_words_to_analyze:
        word_tokens = tokenizer.tokenize_text(vocab, word_to_tokenize)
        print(word_to_tokenize, word_tokens)

    # Blank line between vocabulary sizes.
    print()

Size: 250
Text:  ['un', ' ', 'r', '##o', '##b', '##o', '##t', ' ', 'n', '##o', ' ', 'p', '##u', '##e', '##d', '##e', ' ', 'd', '##a', '##ñ', '##a', '##r', ' ', 'a', ' ', 'l', '##a', ' ', 'h', '##u', '##m', '##a', '##n', '##i', '##d', '##a', '##d', ' ', 'o', ' ', 'p', '##o', '##r', ' ', 'in', '##a', '##c', '##c', '##i', '##o', '##n', ' ', 'p', '##e', '##r', '##m', '##i', '##t', '##i', '##r', ' ', 'qu', '##e', ' ', 'l', '##a', ' ', 'h', '##u', '##m', '##a', '##n', '##i', '##d', '##a', '##d', ' ', 's', '##u', '##f', '##r', '##a', ' ', 'd', '##a', '##ñ', '##o', '##s']
dañar ['d', '##a', '##ñ', '##a', '##r']
inaccion ['in', '##a', '##c', '##c', '##i', '##o', '##n']
humanidad ['h', '##u', '##m', '##a', '##n', '##i', '##d', '##a', '##d']
robot ['r', '##o', '##b', '##o', '##t']

Size: 1000
Text:  ['un', ' ', 'r', '##o', '##b', '##o', '##t', ' ', 'n', '##o', ' ', 'pu', '##e', '##d', '##e', ' ', 'd', '##a', '##ñ', '##a', '##r', ' ', 'a', ' ', 'l', '##a', ' ', 'h', '##u', '##m', '##a', '##n', '##