In [None]:
!pip install datasets transformers[sentencepiece]
!apt install git-lfs

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer
from huggingface_hub import notebook_login, Repository
from google.colab import drive

In [6]:
# HYPERPARAMS AND GLOBAL VARS

# --- Tokenizer training ---
# Vocabulary size requested for the newly trained tokenizer.
vocab_size = 30_000
# Hugging Face name of the base model whose tokenizer is retrained.
base_model = "gpt2"
# Number of dataset rows handed to the tokenizer trainer per batch.
batch_size = 20

# --- Input data ---
# Whether to mount Google Drive, and where (the path is only needed when drive_mounted is True).
drive_mounted = True
drive_mounted_path = "/content/gdrive/"
# Set to True if the train data has to be extracted from an archive first.
train_data_archived = True
# Location of the archive (only needed when train_data_archived is True).
raw_train_archive = f"{drive_mounted_path}My Drive/UCU/diploma/datasets/dataset_train.zip"
# Path to the train dataset (after extraction it is expected in the current working directory).
raw_train_json = "./dataset_train.json"

# --- Publishing (everything except the flag is only needed when push_to_hub is True) ---
# Set to True to commit and push the trained tokenizer.
push_to_hub = True
# Git identity used for the commit.
user_email = "orest.andrusyshyn@ucu.edu.ua"
user_name = "Orest Andrusyshyn"
# Hub repository that receives the tokenizer.
tokenizer_repo_name = "Andrusyshyn/gpt2-coq-tokenizer"
# Local directory the tokenizer is saved to.
tokenizer_output_dir = "gpt2-coq-tokenizer_local"
# Branch name used for this run.
run_name = "vocab_30k"

In [7]:
# MOUNTING DRIVE
# Mount Google Drive at `drive_mounted_path` when the `drive_mounted` flag is set.
# `raw_train_archive` is built from the same path, so the archive is read from the mounted drive.
if drive_mounted:
    drive.mount(drive_mounted_path)

Mounted at /content/gdrive/


In [8]:
# CONFIGURING GIT CREDENTIALS
# Set the global git identity, but only when the run is going to push to the Hub.
if push_to_hub:
    # {user_email} and {user_name} are filled in by IPython from the Python variables of the same name.
    !git config --global user.email "{user_email}"
    !git config --global user.name "{user_name}"

# To set the Hugging Face token (for write access), create an HF_TOKEN secret in Google Colab or call notebook_login().

In [None]:
# CONFIGURING GIT DIRECTORIES
# When pushing is enabled, clone the Hub repository into the local output directory
# and switch to the branch of this run (the branch may be created if it is missing).
# NOTE(review): `Repository` is deprecated in recent huggingface_hub releases — confirm the installed version still ships it.
if push_to_hub:
    repo = Repository(
        local_dir=tokenizer_output_dir,
        clone_from=tokenizer_repo_name,
    )
    repo.git_checkout(revision=run_name, create_branch_ok=True)

In [10]:
# UNPACK DATASET
if train_data_archived:
    if raw_train_archive.endswith(".gz"):
        !gzip -dkv "{raw_train_archive}"
    if raw_train_archive.endswith(".zip"):
        !unzip "{raw_train_archive}"

Archive:  /content/gdrive/My Drive/UCU/diploma/datasets/dataset_train.zip
  inflating: dataset_train.json      


In [11]:
# LOAD DATASET
# Load the extracted JSON file with the "json" builder; `field="data"` points the loader at the
# "data" key of the file. The result is a DatasetDict with a single "train" split.
tokenizer_dataset = load_dataset("json", data_files=raw_train_json, field="data")
# Show the splits, column names and row counts as a quick sanity check.
print(tokenizer_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content', 'filepath'],
        num_rows: 4972
    })
})


In [None]:
# LOADING BASE TOKENIZER
# Load the pretrained tokenizer named by `base_model`; it is the object that
# `train_new_from_iterator` is called on further down to train the new tokenizer.
base_tokenizer = AutoTokenizer.from_pretrained(base_model)

In [13]:
# GET TRAINING CORPUS
def get_training_corpus(dataset=None, rows_per_batch=None):
    """Yield the training texts batch by batch.

    Args:
        dataset: Object that supports ``len()`` and slice indexing, where a
            slice returns a mapping with a ``"content"`` entry. Defaults to
            ``tokenizer_dataset["train"]``.
        rows_per_batch: Number of rows per yielded batch. Defaults to the
            notebook-level ``batch_size``.

    Yields:
        The ``"content"`` value of each consecutive slice of ``dataset``;
        the last batch may be shorter than ``rows_per_batch``.

    Raises:
        ValueError: If the batch size is not a positive number (raised when
            the generator is first advanced).
    """
    # Fall back to the notebook globals so that a bare call behaves as before.
    if dataset is None:
        dataset = tokenizer_dataset["train"]
    if rows_per_batch is None:
        rows_per_batch = batch_size
    # A negative step would silently produce an empty corpus; reject it explicitly.
    if rows_per_batch <= 0:
        raise ValueError(f"batch size must be positive, got {rows_per_batch}")

    for start in range(0, len(dataset), rows_per_batch):
        yield dataset[start : start + rows_per_batch]["content"]

In [14]:
# Build the batch generator and train a new tokenizer from the base tokenizer
# on it, with the configured vocabulary size.
training_corpus = get_training_corpus()
tokenizer = base_tokenizer.train_new_from_iterator(
    text_iterator=training_corpus,
    vocab_size=vocab_size,
)
# Report the size of the trained tokenizer.
print("Tokenizer Vocab Size: ", len(tokenizer))

Tokenizer Vocab Size:  30000


In [None]:
# Always write the trained tokenizer to the local output directory.
tokenizer.save_pretrained(tokenizer_output_dir)

# `repo` is only created when push_to_hub is True, so the push must be guarded by the same
# flag; otherwise this cell fails with a NameError on runs that do not push.
if push_to_hub:
    # blocking=False asks for the push to run without waiting for it to finish.
    # NOTE(review): the commit message hard-codes "30k" — keep it in sync with `vocab_size`.
    repo.push_to_hub(
        commit_message="vocab_size 30k train dataset", blocking=False
    )