In [None]:
!pip install datasets transformers[sentencepiece]
!apt install git-lfs

In [16]:
from datasets import load_dataset
from transformers import AutoTokenizer
from huggingface_hub import notebook_login, Repository
from google.colab import drive
import json

In [17]:
# HYPERPARAMS AND GLOBAL VARS
# Defaults for the whole notebook. When config_file is not None, parse_config()
# overwrites these values (except config_file itself) from that JSON file.
vocab_size                     = 30000                             # vocabulary size of the tokenizer
base_model                     = "gpt2"                            # base model Hugging Face name
batch_size                     = 20                                # number of samples yielded per batch by get_training_corpus()

drive_mounted                  = False                             # set to True to mount Google Drive at drive_mounted_path
drive_mounted_path             = "/content/gdrive/"                # only needed if drive_mounted == True
train_data_archived            = True                              # set to True if you need to extract the data from archive
raw_train_archive              = "./dataset_train.zip"             # .zip or .gz archive; only needed if train_data_archived == True
raw_train_json                 = "./dataset_train.json"            # path to the train dataset (after extracting it will be in the current working directory)

config_file                    = None                              # optional path to a JSON config file; None keeps the defaults above

push_to_hub                    = False                             # set to True if you need to commit changes
user_email                     = "user_email"                      # only needed if push_to_hub == True
user_name                      = "user_name"                       # only needed if push_to_hub == True
tokenizer_repo_name            = "Andrusyshyn/gpt2-coq-tokenizer"  # only needed if push_to_hub == True
tokenizer_output_dir           = "gpt2-coq-tokenizer_local"        # local dir to save the tokenizer
run_name                       = "experimental"                    # branch name (only needed if push_to_hub == True)

In [18]:
def parse_config(config_file: str) -> None:
    """
    Parses config_file and sets global variables.

    The file must contain a single JSON object that provides every key read
    below; the module-level settings of the same names are overwritten.

    Parameters
    ----------
    config_file : str
        path to config file.

    Raises
    ------
    OSError
        if the file cannot be opened.
    json.JSONDecodeError
        if the file is not valid JSON.
    KeyError
        if a required key is missing from the config.
    """
    global vocab_size, base_model, batch_size, raw_train_json, push_to_hub,\
        tokenizer_repo_name, tokenizer_output_dir, run_name, drive_mounted,\
        drive_mounted_path, train_data_archived, raw_train_archive, user_email, user_name

    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(config_file, mode='r', encoding='utf-8') as conf_file:
        conf_data = json.load(conf_file)

    vocab_size                     = conf_data["vocab_size"]
    base_model                     = conf_data["base_model"]
    batch_size                     = conf_data["batch_size"]

    raw_train_json                 = conf_data["raw_train_json"]

    push_to_hub                    = conf_data["push_to_hub"]
    tokenizer_repo_name            = conf_data["tokenizer_repo_name"]
    tokenizer_output_dir           = conf_data["tokenizer_output_dir"]
    # Branch name comes from its own key (previously read "push_to_hub" by
    # mistake, which turned the branch name into a boolean).
    run_name                       = conf_data["run_name"]

    # Colab-only vars:
    drive_mounted                  = conf_data["drive_mounted"]
    drive_mounted_path             = conf_data["drive_mounted_path"]
    train_data_archived            = conf_data["train_data_archived"]
    raw_train_archive              = conf_data["raw_train_archive"]
    user_email                     = conf_data["user_email"]
    user_name                      = conf_data["user_name"]

In [19]:
# MOUNTING DRIVE
# Google Drive is attached only when explicitly requested in the settings cell.
if drive_mounted:
    drive.mount(mountpoint=drive_mounted_path)

In [20]:
# PARSING CONFIG
# Override the default settings from the JSON config, if one was given.
# NOTE(review): the drive-mounting cell runs before this one, so the
# drive_mounted / drive_mounted_path values loaded here do not appear to
# influence mounting - confirm this ordering is intended.
if config_file is not None:
    parse_config(config_file)

In [21]:
# CONFIGURING GIT CREDENTIALS
# Sets the global git identity used for commits; only done when pushing to the Hub.
# IPython substitutes {user_email} / {user_name} with the Python variables' values.
if push_to_hub:
    !git config --global user.email "{user_email}"
    !git config --global user.name "{user_name}"

# To set the Hugging Face token (for write access), create an HF_TOKEN secret in Google Colab or use notebook_login()

In [22]:
# CONFIGURING GIT DIRECTORIES
# When pushing is enabled, clone the tokenizer repo into the local output
# directory and switch to the run's branch (created if it does not exist yet).
if push_to_hub:
    repo = Repository(
        local_dir=tokenizer_output_dir,
        clone_from=tokenizer_repo_name,
    )
    repo.git_checkout(revision=run_name, create_branch_ok=True)

In [None]:
# UNPACK DATASET
if train_data_archived:
    if raw_train_archive.endswith(".gz"):
        !gzip -dkv "{raw_train_archive}"
    if raw_train_archive.endswith(".zip"):
        !unzip "{raw_train_archive}"

In [None]:
# LOAD DATASET
# Read the training JSON with the "json" loader; the records are taken from
# the file's "data" field.
tokenizer_dataset = load_dataset(
    "json",
    data_files=raw_train_json,
    field="data",
)
print(tokenizer_dataset)

In [25]:
# LOADING BASE TOKENIZER
# The pretrained tokenizer of base_model is the starting point for training the new one.
base_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=base_model,
)

In [26]:
# GET TRAINING CORPUS
def get_training_corpus():
    """
    Generator over the "train" split of tokenizer_dataset.

    Each step yields the "content" column of the next slice of at most
    batch_size samples; the final slice may be shorter.
    """
    corpus = tokenizer_dataset["train"]
    batch_starts = range(0, len(corpus), batch_size)
    yield from (
        corpus[start : start + batch_size]["content"]
        for start in batch_starts
    )

In [27]:
# TRAINING TOKENIZER
# Train a new tokenizer of the requested vocabulary size from the batched
# corpus, starting from the base tokenizer.
training_corpus = get_training_corpus()
tokenizer = base_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=vocab_size,
)
print("Tokenizer Vocab Size: ", len(tokenizer))

Tokenizer Vocab Size:  30000


In [28]:
# SAVING TOKENIZER
# The tokenizer files are always written to the local output directory. When
# pushing is enabled, that directory is the cloned Hub repository, so its
# contents are pushed from there.
tokenizer.save_pretrained(tokenizer_output_dir)
if push_to_hub:
    # blocking=False: the call is asked not to wait for the push to finish.
    repo.push_to_hub(commit_message="experimental commit", blocking=False)