In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!apt install git-lfs

In [3]:
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

from huggingface_hub import notebook_login, Repository
from datasets import load_dataset, DatasetDict, Dataset
from accelerate import Accelerator
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, pipeline, get_scheduler
from transformers.keras_callbacks import PushToHubCallback

import re
import json
import os
from tqdm import tqdm
from tqdm.notebook import tqdm
from google.colab import drive

### TRAINING


#### HYPER PARAMS, GLOBAL VARS, FUNCTIONS

In [18]:
# HYPER PARAMETERS
context_length                 = 1024
train_batch_size               = 4
eval_batch_size                = 4
weight_decay                   = 0.1
lr                             = 5e-4
lr_scheduler_type              = "linear"    # "linear" | "cosine" | "constant"
num_warmup_steps               = 50
gradient_accumulation_steps    = 8
gradient_checkpointing         = False
eval_steps                     = 50 // gradient_accumulation_steps
num_train_epochs               = 5
mixed_precision                = "fp16"

#GLOBAL VARS
have_git_write_access          = True                              # set to True if you need to commit changes
user_email                     = "orest.andrusyshyn@ucu.edu.ua"    # only needed if above have_git_write_access == True
user_name                      = "Orest Andrusyshyn"               # only needed if above have_git_write_access == True

drive_mounted                  = True
drive_mounted_path             = '/content/gdrive/'    # only needed if drive_mounted == True
save_json_logs                 = True                  # set to True to save training results into json file (drive_train_res_path)
drive_train_res_path           = drive_mounted_path + 'My Drive/UCU/diploma/progress_notes/training_results/second_training/testing_algo1.json'    # only needed if save_json_logs == True
save_tensorboard_logs          = True                  # set to True to save training results into tensorboard run (tensorboard_run_path)
tensorboard_run_path           = drive_mounted_path + 'My Drive/UCU/diploma/progress_notes/training_results/tensorboard_runs/testing_algo'                   # only needed if save_tensorboard_logs == True

data_archived                  = True    # set to True if you need to unarchive the data
raw_train_archive              = "./pretrain_dataset_train.json.gz"         # only needed if data_archived == True
raw_valid_archive              = "./pretrain_dataset_validation.json.gz"    # only needed if data_archived == True
raw_train_json                 = "./pretrain_dataset_train.json"            # only needed if data_archived == False
raw_valid_json                 = "./pretrain_dataset_validation.json"       # only needed if data_archived == False

tokenizer_repo_name            = "Andrusyshyn/gpt2-coq-tokenizer"
tokenizer_commit_hash          = "90c3858771e6120dfc8eab7a6754295964e8d8c3"

init_model                     = "gpt2"                                                   # base model Hugging Face name
model_repo_name                = "Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train"
model_output_dir               = "./gpt2-pretrained-for-coq-pt-custom-train-local"        # local dir to save the model

push_to_hub                    = True                 # set to True to push the model
run_name                       = "testing_algo"    # branch name (only needed if push_to_hub == True)

partially_trained              = True                                         # set to True to continue model training from specific commit hash (model_commit_hash)
model_commit_hash              = "91aac830c5ff8417d8bf389eea271a9d3dabab9c"    # only needed if partially_trained == True
previously_completed_steps     = 18                                          # only needed if partially_trained == True
previous_global_steps          = 144                                         # only needed if partially_trained == True
previous_step                  = 1                                          # only needed if partially_trained == True
stopped_epoch                  = 1                                             # only needed if partially_trained == True

torch.manual_seed(7)

<torch._C.Generator at 0x7e9cfc976250>

In [5]:
# CUSTOM LOSS FUNCTION
def custom_loss(inputs, logits):
    # inputs [batch_size X cl]
    # logits [btach_size X cl X vocab_size]
    # Our labels start from second sequence token because first one does not have preceding token.
    # We drop last logit because last sequence token does not have subsequent token, so no label to compare
    shifted_labels = inputs[..., 1:].contiguous()
    shifted_logits = logits[..., :-1, :].contiguous()

    loss_func = CrossEntropyLoss(reduce=False)
    # loss [batch_size * (cl-1)] = loss_fct([batch_size * (cl-1) X vocab_size], [batch_size * (cl-1)])
    loss = loss_func(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1))
    # loss_per_sample [batch_size]
    loss_per_sequence = loss.view(shifted_logits.size(0), shifted_logits.size(1)).mean(axis=1)
    return loss_per_sequence.mean()

In [6]:
# PARAMETERS FOR WEIGHT DECAY
def get_wd_parameters(model, no_decay=["bias", r"ln_.{1,2}\.weight"]):
    wd_params = []
    nwd_params = []
    for name, params in model.named_parameters():
        if any(re.search(nd_reg, name) for nd_reg in no_decay):
            nwd_params.append(params)
        else:
            wd_params.append(params)
    return [
        {"params": wd_params,  "weight_decay": weight_decay},
        {"params": nwd_params, "weight_decay": 0.0},
    ]

In [7]:
# TOKENIZE RAW DATASETS
def get_tokenized_dataset(p_raw_dataset, p_context_length, p_tokenizer):
    concatenated_tokenized_samples = []
    for sample in p_raw_dataset:
        tokenized_sample = p_tokenizer(sample["content"], truncation=False)["input_ids"]
        concatenated_tokenized_samples.extend(tokenized_sample + [p_tokenizer.eos_token_id])
    tokenized_dataset_list = []
    for i in range(0, len(concatenated_tokenized_samples), p_context_length):
        input_ids = concatenated_tokenized_samples[i : i + p_context_length]
        if len(input_ids) == p_context_length:
            tokenized_dataset_list.append(torch.tensor(input_ids))

    return Dataset.from_dict({"input_ids": tokenized_dataset_list})

In [8]:
# SAVING TRAINING RESULTS TO THE JSON FILE
def save_results(filepath:str, split: str, results: dict):
    """
    {
        "hyperparams": {},
        "train": [
            results1,
            results2,
            ...
        ],
        "valid": [
            results1,
            results2,
            ...
        ]
    }
    """
    if not save_json_logs:
        return
    if split not in {"train", "valid"}:
        print("ERROR: INVALID SPLIT")
        return
    _run_name = run_name
    if not push_to_hub:
        _run_name = ""
    _tensorboard_run_path = tensorboard_run_path
    if not save_tensorboard_logs:
        _tensorboard_run_path = ""
    hyperparams_dict = {
        "context_length"                 : context_length,
        "train_batch_size"               : train_batch_size,
        "eval_batch_size"                : eval_batch_size,
        "weight_decay"                   : weight_decay,
        "lr"                             : lr,
        "lr_scheduler_type"              : lr_scheduler_type,
        "num_warmup_steps"               : num_warmup_steps,
        "gradient_accumulation_steps"    : gradient_accumulation_steps,
        "gradient_checkpointing"         : gradient_checkpointing,
        "eval_steps"                     : eval_steps,
        "num_train_epochs"               : num_train_epochs,
        "mixed_precision"                : mixed_precision,
        "tokenizer_repo_name"            : tokenizer_repo_name,
        "tokenizer_commit_hash"          : tokenizer_commit_hash,
        "init_model"                     : init_model,
        "model_repo_name"                : model_repo_name,
        "run_name"                       : _run_name,
        "drive_train_res_path"           : drive_train_res_path,
        "tensorboard_run_path"           : _tensorboard_run_path,
    }
    json_data = {"train": [], "valid": []}
    if os.path.exists(filepath):
        with open(filepath, 'r') as json_file:
            json_data = json.load(json_file)
    json_data["hyperparams"] = hyperparams_dict
    json_data[split].append(results)
    with open(filepath, 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

In [9]:
# TENSORBOARD VISUALISATION
def add_to_tensorboard(json_filepath, tensorboard_run_path):
    if not save_tensorboard_logs:
        return
    if not os.path.exists(json_filepath):
        print("ERROR: json_filepath DOES NOT EXIST")
        return
    with open(json_filepath, mode="r") as json_file:
        json_data = json.load(json_file)
        one_step_tokens = json_data["hyperparams"]["train_batch_size"] * json_data["hyperparams"]["gradient_accumulation_steps"] * json_data["hyperparams"]["context_length"]
        writer = SummaryWriter(tensorboard_run_path)
        prev_comleted_steps = json_data["train"][0]["competed_steps"]
        prev_lr = json_data["train"][0]["lr"][0]
        train_losses = []
        cs = 0
        for entry in json_data["train"]:
            cs = entry["competed_steps"]
            if cs == prev_comleted_steps:
                train_losses.append(entry["loss/train"])
                continue
            else:
                writer.add_scalar("Loss/Train", sum(train_losses)/len(train_losses), prev_comleted_steps * one_step_tokens)
                writer.add_scalar("Learning Rate", prev_lr, prev_comleted_steps * one_step_tokens)
                train_losses = [entry["loss/train"]]
                prev_comleted_steps = cs
                prev_lr = entry["lr"][0]
        writer.add_scalar("Loss/Train", sum(train_losses)/len(train_losses), cs * one_step_tokens)
        writer.add_scalar("Learning Rate", prev_lr, cs * one_step_tokens)

        for entry in json_data["valid"]:
            cs = entry["competed_steps"]
            writer.add_scalar("Loss/Eval", entry["loss/eval"], cs * one_step_tokens)
            writer.add_scalar("Perplexity/Eval", entry["perplexity"], cs * one_step_tokens)
        writer.close()

#### CONFIGURING

In [10]:
# MOUNTING DRIVE FOR SAVING LOGS
if drive_mounted:
    drive.mount(drive_mounted_path)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [11]:
# CONFIGURING GIT CREDENTIALS
if have_git_write_access:
    !git config --global user.email "{user_email}"
    !git config --global user.name "{user_name}"

# To set Hugging Face token (for writing access) create HF_TOKEN secret in Google Collab or use notebook_login()

In [12]:
# CONFIGURING GIT DIRECTORIES
if push_to_hub:
    repo = Repository(model_output_dir, clone_from=model_repo_name)
    repo.git_checkout(run_name, create_branch_ok=True)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train into local empty directory.


Download file model.safetensors:   0%|          | 8.00k/480M [00:00<?, ?B/s]

Clean file model.safetensors:   0%|          | 1.00k/480M [00:00<?, ?B/s]

Checked out testing_algo from testing_algo.
Branch 'testing_algo' set up to track remote branch 'testing_algo' from 'origin'.




#### DATASETS

In [13]:
# UNPACK DATASETS
if data_archived:
    !gzip -dkv "{raw_train_archive}"
    !gzip -dkv "{raw_valid_archive}"
    raw_train_json = raw_train_archive[:-3]
    raw_valid_json = raw_valid_archive[:-3]

gzip: ./pretrain_dataset_train.json already exists; do you wish to overwrite (y or n)? ^C
gzip: ./pretrain_dataset_validation.json already exists; do you wish to overwrite (y or n)? ^C


In [14]:
# LOAD DATASETS
data_files = {"train": raw_train_json, "validation": raw_valid_json}
raw_datasets = load_dataset("json", data_files=data_files, field="data")
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['filepath', 'content'],
        num_rows: 5969
    })
    validation: Dataset({
        features: ['filepath', 'content'],
        num_rows: 663
    })
})


In [15]:
# DEBUG CODE (taking few samples from whole DatasetDict)
train_debug = raw_datasets["train"][:200]
valid_debug = raw_datasets["validation"][:50]
train_debug_dataset = Dataset.from_dict(train_debug)
valid_debug_dataset = Dataset.from_dict(valid_debug)
raw_datasets = DatasetDict({"train": train_debug_dataset, "validation": valid_debug_dataset})
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['filepath', 'content'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['filepath', 'content'],
        num_rows: 50
    })
})


In [16]:
# LOADING TOKENIZER
tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo_name, revision=tokenizer_commit_hash)
print("tokenizer vocab size: ", len(tokenizer))
# TOKENIZE RAW DATASETS
train_dataset = get_tokenized_dataset(raw_datasets["train"],      context_length, tokenizer)
valid_dataset = get_tokenized_dataset(raw_datasets["validation"], context_length, tokenizer)
train_dataset.set_format("torch")
valid_dataset.set_format("torch")

# CREATE DATALOADERS
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=eval_batch_size)
print(train_dataset)
print(valid_dataset)
print("len(train_dataloader): ", len(train_dataloader))
print("len(valid_dataloader): ", len(valid_dataloader))

Token indices sequence length is longer than the specified maximum sequence length for this model (5858 > 1024). Running this sequence through the model will result in indexing errors


tokenizer vocab size:  50000
Dataset({
    features: ['input_ids'],
    num_rows: 568
})
Dataset({
    features: ['input_ids'],
    num_rows: 151
})
len(train_dataloader):  142
len(valid_dataloader):  38


#### CONFIGURING MODEL AND TRAINING

In [19]:
# CONFIGURING MODEL
model = None
if partially_trained:
    print("Loading partially trained model")
    model = GPT2LMHeadModel.from_pretrained(model_repo_name, revision=model_commit_hash)
else:
    print("Training from scratch")
    config = AutoConfig.from_pretrained(
        init_model,
        vocab_size=len(tokenizer),
        n_ctx=context_length,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    model = GPT2LMHeadModel(config)
print()
if gradient_checkpointing:
    model.gradient_checkpointing_enable()

optimizer = AdamW(get_wd_parameters(model), lr=lr)
accelerator = Accelerator(mixed_precision=mixed_precision)
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, valid_dataloader
)

num_steps_per_epoch = len(train_dataloader)
print("Num steps per epoch: ", num_steps_per_epoch)
num_training_completed_steps = (num_train_epochs * num_steps_per_epoch) // gradient_accumulation_steps
if ((num_train_epochs * num_steps_per_epoch) % gradient_accumulation_steps != 0):
    num_training_completed_steps += 1
print("Num optimizer steps: ", num_training_completed_steps)


lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_completed_steps
)

if partially_trained:
    with torch.no_grad():
        for _ in range(previously_completed_steps):
            lr_scheduler.step()

print()
print(f"Model size:                                        {model.num_parameters()}")
print(f"Model size (only trainable params):                {model.num_parameters(only_trainable=True)}")
print(f"Model size (only trainable non-embeddings params): {model.num_parameters(only_trainable=True, exclude_embeddings=True)}")

In [22]:
# EVALUATION FUNCTION
def evaluate():
    torch.cuda.empty_cache()

    model.eval()
    losses = []
    with torch.no_grad():
        for step, batch in enumerate(valid_dataloader):
            with torch.no_grad():
                logits = model(batch["input_ids"]).logits
                loss = custom_loss(batch["input_ids"], logits)
                losses.append(loss.item())
    loss = torch.mean(torch.Tensor(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")

    torch.cuda.empty_cache()
    return loss.item(), perplexity.item()

In [23]:
# TRAINING LOOP
model.train()
completed_steps = 0
global_steps = 0
if partially_trained:
    completed_steps = previously_completed_steps
    global_steps = previous_global_steps
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=0), total=num_steps_per_epoch
    ):
        if partially_trained and ((epoch<stopped_epoch) or ((epoch==stopped_epoch) and (step <= previous_step))):
            continue

        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
################################################################################
        log_train = {
                "loss/train": loss.item(),
                "completed_steps": completed_steps,
                "lr": lr_scheduler.get_lr(),
                "global_steps" : global_steps,
                "epoch": epoch,
                "steps": step,
        }
        accelerator.print(log_train)
        save_results(drive_train_res_path, "train", log_train)
################################################################################
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        global_steps += 1
################################################################################
        if global_steps % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (global_steps % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            log_eval = {
                    "loss/eval": eval_loss,
                    "perplexity": perplexity,
                    "completed_steps": completed_steps,
                    "lr": lr_scheduler.get_lr(),
                    "global_steps" : global_steps,
                    "epoch": epoch,
                    "steps": step,
                    "loss/train": loss.item() * gradient_accumulation_steps,
            }
            accelerator.print(log_eval)
            save_results(drive_train_res_path, "valid", log_eval)
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(model_output_dir, save_function=accelerator.save)
            tokenizer.save_pretrained(model_output_dir)
            if accelerator.is_main_process:
                if push_to_hub:
                    repo.push_to_hub(
                        commit_message=f"Training in progress: completed_steps {completed_steps}; global_steps {global_steps};\
                                         epoch {epoch}; steps {step}; lr {lr_scheduler.get_lr()};\
                                         loss/eval {eval_loss}; perplexity {perplexity}; loss/train {loss.item() * gradient_accumulation_steps}",
                        blocking=False
                    )
            model.train()

################################################################################
################################################################################
################################################################################

#GRADIENT UPDATE (In case (global_steps % gradient_accumulation_steps != 0)
last_eval_log_train_loss = 0
if (global_steps % gradient_accumulation_steps != 0):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=0), total=(gradient_accumulation_steps - (global_steps % gradient_accumulation_steps)) - 1    # -1 here is purely for better visualisation of tqdm progress bar
    ):
        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
        last_eval_log_train_loss = loss.item()
        log_train = {
                "loss/train": loss.item(),
                "completed_steps": completed_steps,
                "lr": lr_scheduler.get_lr(),
                "global_steps" : global_steps,
                "epoch": epoch,
                "steps": step + (num_train_epochs * num_steps_per_epoch),
        }
        accelerator.print(log_train)
        save_results(drive_train_res_path, "train", log_train)
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        global_steps += 1
        if global_steps % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
            break

################################################################################
################################################################################
################################################################################

# FINAL EVALUATE AND SAVE
additional_steps = 0
if (global_steps % gradient_accumulation_steps != 0):
    additional_steps = gradient_accumulation_steps - (global_steps % gradient_accumulation_steps)
with torch.no_grad():
    last_train_loss = 0
    for batch in train_dataloader:
        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
        last_train_loss = loss.item()
        break
eval_loss, perplexity = evaluate()
log_eval = {
        "loss/eval": eval_loss,
        "perplexity": perplexity,
        "completed_steps": completed_steps,
        "lr": lr_scheduler.get_lr(),
        "global_steps" : global_steps,
        "epoch": epoch,
        "steps": (num_train_epochs * num_steps_per_epoch) + additional_steps - 1,
        "loss/train": last_eval_log_train_loss,
}
accelerator.print(log_eval)
log_train = {
        "loss/train": last_train_loss,
        "completed_steps": completed_steps,
        "lr": lr_scheduler.get_lr(),
        "global_steps" : global_steps,
        "epoch": epoch,
        "steps": (num_train_epochs * num_steps_per_epoch) + additional_steps,
}
save_results(drive_train_res_path, "train", log_train)
save_results(drive_train_res_path, "valid", log_eval)
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(model_output_dir, save_function=accelerator.save)
tokenizer.save_pretrained(model_output_dir)
if accelerator.is_main_process:
    if push_to_hub:
        repo.push_to_hub(
            commit_message=f"Final model: completed_steps {completed_steps}; global_steps {global_steps};\
                             epoch {epoch}; steps {(num_train_epochs * num_steps_per_epoch) + additional_steps - 1}; lr {lr_scheduler.get_lr()};\
                             loss/eval {eval_loss}; perplexity {perplexity}; loss/train {last_eval_log_train_loss}",
            blocking=False
        )
model.train()

torch.cuda.empty_cache()

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]



{'loss/train': 1.0958545207977295, 'completed_steps': 18, 'lr': [0.00017999999999999998, 0.00017999999999999998], 'global_steps': 144, 'epoch': 1, 'steps': 2}
{'loss/train': 1.430997610092163, 'completed_steps': 18, 'lr': [0.00017999999999999998, 0.00017999999999999998], 'global_steps': 145, 'epoch': 1, 'steps': 3}
{'loss/train': 1.0095056295394897, 'completed_steps': 18, 'lr': [0.00017999999999999998, 0.00017999999999999998], 'global_steps': 146, 'epoch': 1, 'steps': 4}
{'loss/train': 1.1447408199310303, 'completed_steps': 18, 'lr': [0.00017999999999999998, 0.00017999999999999998], 'global_steps': 147, 'epoch': 1, 'steps': 5}
{'loss/train': 1.4378679990768433, 'completed_steps': 18, 'lr': [0.00017999999999999998, 0.00017999999999999998], 'global_steps': 148, 'epoch': 1, 'steps': 6}
{'loss/train': 1.367504358291626, 'completed_steps': 18, 'lr': [0.00017999999999999998, 0.00017999999999999998], 'global_steps': 149, 'epoch': 1, 'steps': 7}
{'loss/train': 1.5254594087600708, 'completed_st

Several commits (4) will be pushed upstream.


{'loss/train': 1.1094627380371094, 'completed_steps': 24, 'lr': [0.00016478873239436618, 0.00016478873239436618], 'global_steps': 192, 'epoch': 1, 'steps': 50}
{'loss/train': 1.1844112873077393, 'completed_steps': 24, 'lr': [0.00016478873239436618, 0.00016478873239436618], 'global_steps': 193, 'epoch': 1, 'steps': 51}
{'loss/train': 1.0569342374801636, 'completed_steps': 24, 'lr': [0.00016478873239436618, 0.00016478873239436618], 'global_steps': 194, 'epoch': 1, 'steps': 52}
{'loss/train': 1.0430165529251099, 'completed_steps': 24, 'lr': [0.00016478873239436618, 0.00016478873239436618], 'global_steps': 195, 'epoch': 1, 'steps': 53}
{'loss/train': 1.265427589416504, 'completed_steps': 24, 'lr': [0.00016478873239436618, 0.00016478873239436618], 'global_steps': 196, 'epoch': 1, 'steps': 54}
{'loss/train': 1.712423324584961, 'completed_steps': 24, 'lr': [0.00016478873239436618, 0.00016478873239436618], 'global_steps': 197, 'epoch': 1, 'steps': 55}
{'loss/train': 1.3028028011322021, 'comple

Several commits (5) will be pushed upstream.


{'loss/train': 0.9177290201187134, 'completed_steps': 30, 'lr': [0.00014957746478873237, 0.00014957746478873237], 'global_steps': 240, 'epoch': 1, 'steps': 98}
{'loss/train': 1.2045683860778809, 'completed_steps': 30, 'lr': [0.00014957746478873237, 0.00014957746478873237], 'global_steps': 241, 'epoch': 1, 'steps': 99}
{'loss/train': 1.3974401950836182, 'completed_steps': 30, 'lr': [0.00014957746478873237, 0.00014957746478873237], 'global_steps': 242, 'epoch': 1, 'steps': 100}
{'loss/train': 0.9510871171951294, 'completed_steps': 30, 'lr': [0.00014957746478873237, 0.00014957746478873237], 'global_steps': 243, 'epoch': 1, 'steps': 101}
{'loss/train': 1.313844919204712, 'completed_steps': 30, 'lr': [0.00014957746478873237, 0.00014957746478873237], 'global_steps': 244, 'epoch': 1, 'steps': 102}
{'loss/train': 0.9914324879646301, 'completed_steps': 30, 'lr': [0.00014957746478873237, 0.00014957746478873237], 'global_steps': 245, 'epoch': 1, 'steps': 103}
{'loss/train': 1.2167444229125977, 'c

  0%|          | 0/142 [00:00<?, ?it/s]

{'loss/train': 1.2128849029541016, 'completed_steps': 35, 'lr': [0.0001369014084507042, 0.0001369014084507042], 'global_steps': 284, 'epoch': 2, 'steps': 0}
{'loss/train': 1.123058557510376, 'completed_steps': 35, 'lr': [0.0001369014084507042, 0.0001369014084507042], 'global_steps': 285, 'epoch': 2, 'steps': 1}
{'loss/train': 0.9079849720001221, 'completed_steps': 35, 'lr': [0.0001369014084507042, 0.0001369014084507042], 'global_steps': 286, 'epoch': 2, 'steps': 2}
{'loss/train': 1.1306798458099365, 'completed_steps': 35, 'lr': [0.0001369014084507042, 0.0001369014084507042], 'global_steps': 287, 'epoch': 2, 'steps': 3}
{'loss/eval': 1.5977909564971924, 'perplexity': 4.942102909088135, 'completed_steps': 36, 'lr': [0.0001343661971830986, 0.0001343661971830986], 'global_steps': 288, 'epoch': 2, 'steps': 3, 'loss/train': 1.1306798458099365}


Several commits (6) will be pushed upstream.


{'loss/train': 0.9049314856529236, 'completed_steps': 36, 'lr': [0.0001343661971830986, 0.0001343661971830986], 'global_steps': 288, 'epoch': 2, 'steps': 4}
{'loss/train': 0.870507538318634, 'completed_steps': 36, 'lr': [0.0001343661971830986, 0.0001343661971830986], 'global_steps': 289, 'epoch': 2, 'steps': 5}
{'loss/train': 1.0990574359893799, 'completed_steps': 36, 'lr': [0.0001343661971830986, 0.0001343661971830986], 'global_steps': 290, 'epoch': 2, 'steps': 6}
{'loss/train': 1.1245577335357666, 'completed_steps': 36, 'lr': [0.0001343661971830986, 0.0001343661971830986], 'global_steps': 291, 'epoch': 2, 'steps': 7}
{'loss/train': 1.3111896514892578, 'completed_steps': 36, 'lr': [0.0001343661971830986, 0.0001343661971830986], 'global_steps': 292, 'epoch': 2, 'steps': 8}
{'loss/train': 0.7982566952705383, 'completed_steps': 36, 'lr': [0.0001343661971830986, 0.0001343661971830986], 'global_steps': 293, 'epoch': 2, 'steps': 9}
{'loss/train': 1.2051491737365723, 'completed_steps': 36, '

KeyboardInterrupt: 

In [None]:
torch.cuda.empty_cache()
add_to_tensorboard(drive_train_res_path, tensorboard_run_path)

In [None]:
%tensorboard --logdir "{tensorboard_run_path}"

### TESTING PRETRAINED MODEL

In [None]:
coq_model = GPT2LMHeadModel.from_pretrained("Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train")
coq_tokenizer = AutoTokenizer.from_pretrained("Andrusyshyn/gpt2-coq-tokenizer")
pipe = pipeline(
    "text-generation", model=coq_model, tokenizer=coq_tokenizer#, device=0
)

KeyboardInterrupt: 

In [None]:
model = coq_model
evaluate()

KeyboardInterrupt: 

In [None]:
txt = """Theorem identity : forall (n : nat), n = n."""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Theorem identity : forall (n : nat), n = n.
intros; red; intros; destruct n; simpl; auto.
Qed.

Theorem eq_sym : forall X : Ens, IN X X H.
intros;


### SANDBOX

In [None]:
class CustomStoppingCriteria(StoppingCriteria):
    def __init__(self, stop_sequences):
        super().__init__()
        self.stop_sequences = stop_sequences

    def __call__(self, input_ids, scores, **kwargs):
        # print(input_ids)
        # Check if the generated text ends with any of the stop sequences
        for sequence in self.stop_sequences:
            subsequence = input_ids[0][-len(sequence):]
            # print("###############################################################")
            # print(sequence, subsequence)
            if (sequence == subsequence.tolist()):
                # print("RETURN TRUEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE")
                return True  # Stop criteria met
        return False  # Stop criteria not met

In [None]:
# stop_sequences = [['ĠQed', '.'], ['Qed', '.']]  # Example stop sequences
stop_sequences = [[372, 14]]
custom_stopping_criteria = CustomStoppingCriteria(stop_sequences)
stopping_criteria = StoppingCriteriaList([custom_stopping_criteria])

In [None]:
for stop_seq in stop_sequences:
  print(tokenizer.encode(stop_seq))

In [None]:
print(tokenizer.encode('ĠQed'))

[129, 255, 372]


In [None]:
tokenizer = AutoTokenizer.from_pretrained("Andrusyshyn/gpt2-coq-tokenizer")
model = GPT2LMHeadModel.from_pretrained("Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train")
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, stopping_criteria=stopping_criteria#, device=0
)

In [None]:
# Generate text
generated_text = pipe(["Theorem theorem1 : forall (n : nat), n + 0 = n."], num_return_sequences=50)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nauto with arith.\nQed.'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nHint Immediate plus_reg_l: arith.'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  induction n as IHn as [ |'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; case n; auto'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* QuickChick plus_n_O.'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; rewrite mult_comm.\nrewrite'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; auto.\nQed'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintros. omega.\nQed.'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro n; case n; auto.\nsimpl'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n 

In [None]:
for _ in range(50):
  generated_text = pipe(["Theorem theorem1 : forall (n : nat), n + 0 = n."], num_return_sequences=1)
  print(generated_text)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; unfold O in |- *; auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nsimpl in |- *; auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; case n; simpl in |- *; auto.\nintros n1 H'; right;\nrewrite H'; auto.\nunfold nat_of_P"}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted.\n\nTheorem G1 : forall n m, n = m + m.\nAdmitted. (* Leo: OUT-OFne *)\n\nTheorem G'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAbort.\n\nGoal forall (n m : nat), n + m + m = n + m.\nintros; omega.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* Higher Order *)\n\nDefinition O_total_order_T : forall n, {n=O}+{n<>O}.\nShow. (*'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  induction n; auto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; rewrite (plus_comm n 0); apply mult_n_Sm.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n.\nsimpl in |- *; auto with arith.\nsimpl in |- *.\nintro; apply le_O_n.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; now rewrite one_succ, IHn, add_0_l.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\n  intros; pattern n at 1 in |- *; rewrite mult_1_r; ring.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro.\nring.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintros n; unfold le.\napply plus_O.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro; case n; auto with arith.\nintros n0; case n0; auto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\ninduction n.\nnow simpl.\nintros ; rewrite IHn ; clear IHn ; easy.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof (fun n => n + n <= n).\nintro; rewrite (plus_comm n (0 + n)); rewrite (plus_assoc n ( n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nsimpl; auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nunfold plus; auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\n  simpl; intuition;\n  rewrite (expnat_1 n); rewrite expnat_0;\n  auto.\n  f_equal; omega.\n  eapply IHn.\n  omega'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; simpl in |- *; auto.\nintros n' H'1; apply le_plus_minus; auto.\nQed."}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintros n; apply eq_nat_dec.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  eexists.\n  trivial.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* Higher Order *)\n\nTheorem mult_0_l : forall n, n * n = 0.\nAdmitted. (* QuickChick mult_0_'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; auto with arith.\nintros n0 H'.\ncut (n0 <= n + m0); auto with arith; intros n"}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* Higher Order *)\nTheorem mult_comm : forall n m, n * m = m * m.\nAdmitted. (* Higher Order *)\n\n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* Higher Order *)\n\nTheorem mult_plus_distr_l : forall (x y z : nat), x * (y * z) ='}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; generalize dependent n.\nelim (mult_sym n); simpl in |- *; auto with arith.\nintros n1 H'; rewrite H'.\n"}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; elim n; simpl in |- *; auto.\nintros n1 IH; case (Lt.le_or_lt n1 n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; rewrite (mult_comm n 0); apply (p (S n)); auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  (* We could use [group_zero], instead of building an\n   equality, but is too to\n   replace that n with (m + n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro; apply (mult_eq (S n) (0 + n)).\nrewrite plus_comm; rewrite H.\nrepeat rewrite (plus_comm n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n   intros n; rewrite even_mul; apply even_div_even.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro.\ninduction n as [| n Hrecn].\nsimpl in |- *.\ntrivial.\nsimpl in |- *.\nrewrite Hrecn0.\napply'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; auto with arith.\nsimpl in |- *; auto with arith.\nintros n0 H'; left; auto with arith.\napply"}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\ntauto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintuition.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  intro n; apply n_Sn.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro; rewrite <- Nat.add_plus_distr_r.\nsymmetry.\napply mult_le_compat;Qed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted.\n\nLemma foo : forall (n : nat), n = n.\nAdmitted.\n\nGoal exists (A : nat -> Set), A ->'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\ninduction n with\n| 0 => idtac\n| false => auto with arith\nend.\nintros n v; case v; simpl.\nintros'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  induction n; simpl; trivial.\n  destruct 1; trivial.\n  rewrite IHn.\n  rewrite IHn.\n  trivial.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro n. \nsymmetry  in |- *. \napply mult_le_compat_l.\napply le_plus_minus.\ninstantiate (1'}]]
[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  intro n.\n  rewrite (plus_comm n).\n  apply plus_le_compat.\n  cset (plus (fact n) n); rewrite'}]]


In [None]:
text = """Theorem theorem1 : forall (n : nat), n + 0 = n.

    intuition.
    eapply H.
    trivial.
    Qed."""

tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# text = """Theorem theorem1 : forall (n : nat), n + 0 = n.
# Proof.
#    apply eq_refl.
# Qed."""
# #p_tokenizer(sample["content"], truncation=False)["input_ids"]
# tokenized_text = tokenizer.tokenize(text)
# print(tokenized_text)

['Theorem', 'Ġtheorem', '1', 'Ġ:', 'Ġforall', 'Ġ(', 'n', 'Ġ:', 'Ġnat', '),', 'Ġn', 'Ġ+', 'Ġ0', 'Ġ=', 'Ġn', '.', 'ĊĠĠĠĠĊĠĠĠ', 'Ġintuition', '.', 'ĊĠĠĠ', 'Ġeapply', 'ĠH', '.', 'ĊĠĠĠ', 'Ġtrivial', '.', 'ĊĠĠĠ', 'ĠQed', '.']


In [None]:
text = """Theorem theorem1 : forall (n : nat), n + 0 = n.

    intuition.
    eapply H.
    trivial.
    Qed."""
input_ids = tokenizer(text, truncation=False)["input_ids"][-2:]
print(input_ids)
print(tokenizer.decode(input_ids, skip_special_tokens=True))

[560, 14]
 Qed.


In [None]:
print(tokenizer.decode(199) == '\n')

True
