In [41]:
import torch
import random
import pickle
import os
from tqdm import tqdm

from Tokenizer import RegexTokenizer

In [42]:
# Load the tab-separated parallel corpus (one sentence pair per line).
# Iterate over the file object instead of using str.splitlines(): splitlines()
# also breaks on characters such as \x0b, \x0c, \x1c-\x1e, \x85, U+2028 and
# U+2029, which would cut a sentence in half if it happened to contain one.
# File iteration in text mode only splits on real newlines.
with open("data/translation.txt", "r", encoding="utf-8") as f:
    data = [line.rstrip("\n") for line in f]

# data preprocessing
# Keep the first two tab-separated fields as the (english, german) pair.
# Any further fields are ignored; the third one was previously unpacked into
# an unused variable.
translations = []
for line_no, sample in enumerate(data, start=1):
    if not sample.strip():
        # Tolerate blank lines instead of failing on the unpack below.
        continue
    fields = sample.split("\t")
    if len(fields) < 2:
        # Fail loudly, and say where, rather than with a bare unpacking error.
        raise ValueError(
            f"data/translation.txt line {line_no}: expected at least 2 "
            f"tab-separated fields, got {len(fields)}"
        )
    english, german = fields[0], fields[1]
    translations.append((english, german))

In [43]:
# Hyperparameters

# --- Vocabulary / tokenizer -------------------------------------------------
SPECIAL_TOKENS = [b"<|ENDOFTEXT|>", b"<PAD>"]
NUM_MERGES = 10000
# 256 base entries (presumably one per byte value -- confirm against
# RegexTokenizer) + learned merges + special tokens.
VOCAB_SIZE = 256 + NUM_MERGES + len(SPECIAL_TOKENS)

# Pre-tokenization split pattern, assembled from its alternatives.
# NOTE(review): \p{...} classes and possessive quantifiers are not supported by
# every regex engine -- presumably RegexTokenizer compiles this with one that is.
GPT4_SPLIT_PATTERN = "|".join([
    r"'(?i:[sdmt]|ll|ve|re)",        # apostrophe + s/d/m/t/ll/ve/re, case-insensitive
    r"[^\r\n\p{L}\p{N}]?+\p{L}+",    # run of letters, optionally led by one other non-newline char
    r"\p{N}{1,3}",                   # digits, at most three at a time
    r" ?[^\s\p{L}\p{N}]++[\r\n]*",   # run of punctuation/symbols, optional leading space, trailing newlines
    r"\s*[\r\n]",                    # whitespace ending in a line break
    r"\s+(?!\S)",                    # whitespace not followed by a non-space char
    r"\s+",                          # any remaining whitespace
])

# --- Model architecture -----------------------------------------------------
EMBEDDING_SIZE = 32
CONTEXT_SIZE = 8
BLOCK_COUNT = 2
NUM_HEADS = 2
HEAD_SIZE = 16  # How big Query, Key and Value matrices are
DROPOUT = 0.1

# --- Optimisation -----------------------------------------------------------
BATCH_SIZE = 32
MAX_STEPS = 5000
LEARNING_RATE = 3e-4

# --- Evaluation -------------------------------------------------------------
EVAL_INTERVAL = 500
EVAL_LOSS_BATCHES = 200

# --- Runtime ----------------------------------------------------------------
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [44]:
# Tokenization
# Build the tokenizer, reusing the cached vocab/merges pickle when it exists and
# training (then caching) a new one otherwise.
tokenizer = RegexTokenizer(GPT4_SPLIT_PATTERN)
tokenizer_file_path = "models/tokenizer.pkl"
if os.path.exists(tokenizer_file_path):
    print("Load Tokenizer...")
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load a cache that this notebook wrote itself.
    with open(tokenizer_file_path, "rb") as f:
        combined_dict = pickle.load(f)

    # Extract vocab and merges dictionaries from the combined dictionary
    tokenizer.vocab = combined_dict["vocab"]
    tokenizer.merges = combined_dict["merges"]

else:
    print("Train Tokenizer...")
    # Create the cache directory *before* training so that a missing folder
    # fails (or is fixed) immediately instead of after the training run.
    cache_dir = os.path.dirname(tokenizer_file_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # only use a subset of the original dataset for tokenizer training
    # NOTE(review): the sample is unseeded, so a retrain after deleting the
    # cache will generally produce a different vocabulary.
    translations_subset = random.sample(translations, int(len(translations) * 0.05))
    # NOTE(review): VOCAB_SIZE already counts the special tokens, which are
    # appended again below -- confirm how RegexTokenizer.train interprets
    # vocab_size.
    tokenizer.train(translations_subset, vocab_size=VOCAB_SIZE)

    # Save vocab and merges together in one pickle file
    with open(tokenizer_file_path, "wb") as f:
        pickle.dump({"vocab": tokenizer.vocab, "merges": tokenizer.merges}, f)

# Special tokens are appended after the cache is written, so the pickle never
# contains them and both branches above pass through this same loop. Each token
# gets the next free id after the current highest one.
for st in SPECIAL_TOKENS:
    tokenizer.vocab[max(tokenizer.vocab)+1] = st

Load Tokenizer...


In [45]:
# Sanity check: tokenize one sentence pair and print the tokens joined by "_"
# so the token boundaries are visible.
english, german = translations[-100]
eng_enc, ger_enc = tokenizer.encode(english), tokenizer.encode(german)

# Decode with errors="replace": each vocab entry is a bytes object, and a single
# token's bytes may not form a complete UTF-8 sequence (e.g. a multi-byte
# character such as "ö" split across two tokens). A strict decode would raise
# UnicodeDecodeError in that case; here the partial bytes show up as U+FFFD.
for token_ids in (eng_enc, ger_enc):
    print("_".join(
        tokenizer.vocab[idx].decode("utf-8", errors="replace") for idx in token_ids
    ))

You_ always_ have_ the_ right_ to_ refuse_ treat_ment_,_ how_ever_,_ I_ must_ explain_ the_ pot_ential_ con_se_qu_en_ces_ if_ that_ will_ be_ your_ choice_.
Sie_ können_ die_ B_eh_and_lung_ jeder_zeit_ ab_l_ehnen_;_ aller_d_ings_ muss_ ich_ Sie_ in_ diesem_ Fall_ über_ die_ möglich_en_ Aus_w_ir_kungen_ auf_kl_ären_.
