In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!apt install git-lfs

In [1]:
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

from huggingface_hub import notebook_login, Repository
from datasets import load_dataset, DatasetDict, Dataset
from accelerate import Accelerator
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, pipeline, get_scheduler
from transformers.keras_callbacks import PushToHubCallback

import re
import json
import os
from tqdm import tqdm
from tqdm.notebook import tqdm
from google.colab import drive

### TRAINING


#### HYPER PARAMS, GLOBAL VARS, FUNCTIONS

In [2]:
# HYPER PARAMETERS
context_length                 = 1024
train_batch_size               = 4
eval_batch_size                = 4
weight_decay                   = 0.1
lr                             = 5e-4
num_warmup_steps               = 50
gradient_accumulation_steps    = 8
gradient_checkpointing         = False
eval_steps                     = 1000 // gradient_accumulation_steps
num_train_epochs               = 5
mixed_precision                = "fp16"

#GLOBAL VARS
have_git_write_access          = True                              # set to True of you need to commit changes
user_email                     = "orest.andrusyshyn@ucu.edu.ua"    # only needed if above have_git_write_access == True
user_name                      = "Orest Andrusyshyn"               # only needed if above have_git_write_access == True

drive_mounted                  = True
drive_mounted_path             = '/content/gdrive/'    # only needed if drive_mounted == True
save_json_logs                 = True                  # set to True to save training results into json file (drive_train_res_path)
drive_train_res_path           = drive_mounted_path + 'My Drive/UCU/diploma/progress_notes/training_results/second_training/theorems_cl_1024_voc_30k_train_res1.json'    # only needed if save_json_logs == True
save_tensorboard_logs          = True                  # set to True to save training results into tensorboard run (tensorboard_run_path)
tensorboard_run_path           = drive_mounted_path + 'My Drive/UCU/diploma/progress_notes/training_results/tensorboard_runs/theorems_cl_1024_voc_30k'                   # only needed if save_tensorboard_logs == True

data_archived                  = True    # set to True if you need to unarchive the data
raw_train_archive              = "./pretrain_dataset_train.json.gz"         # only needed if data_archived == True
raw_valid_archive              = "./pretrain_dataset_validation.json.gz"    # only needed if data_archived == True
raw_train_json                 = "./pretrain_dataset_train.json"            # only needed if data_archived == False
raw_valid_json                 = "./pretrain_dataset_validation.json"       # only needed if data_archived == False

tokenizer_repo_name            = "Andrusyshyn/gpt2-coq-tokenizer"
tokenizer_commit_hash          = "687b6ed5c0eb5b4016323e7798c03af749fdeb31"

init_model                     = "gpt2"                                                   # base model Hugging Face name
model_repo_name                = "Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train"
model_output_dir               = "./gpt2-pretrained-for-coq-pt-custom-train-local"        # local dir to save the model

push_to_hub                    = True                 # set to True to push the model
run_name                       = "cl_1024_tok_30k"    # branch name (only needed if push_to_hub == True)

partially_trained              = False                                         # set to True to continue model training from specific commit hash (model_commit_hash)
model_commit_hash              = "91aac830c5ff8417d8bf389eea271a9d3dabab9c"    # only needed if partially_trained == True
previously_completed_steps     = 2295                                          # only needed if partially_trained == True
previous_global_steps          = 18360                                         # only needed if partially_trained == True
previous_step                  = 1836                                          # only needed if partially_trained == True
stopped_lr                     = lr                                            # only needed if partially_trained == True
stopped_epoch                  = 9                                             # only needed if partially_trained == True

torch.manual_seed(7)

<torch._C.Generator at 0x7d3a9c209cf0>

In [3]:
# CUSTOM LOSS FUNCTION
def custom_loss(inputs, logits):
    # inputs [batch_size X cl]
    # logits [btach_size X cl X vocab_size]
    # Our labels start from second sequence token because first one does not have preceding token.
    # We drop last logit because last sequence token does not have subsequent token, so no label to compare
    shifted_labels = inputs[..., 1:].contiguous()
    shifted_logits = logits[..., :-1, :].contiguous()

    loss_func = CrossEntropyLoss(reduce=False)
    # loss [batch_size * (cl-1)] = loss_fct([batch_size * (cl-1) X vocab_size], [batch_size * (cl-1)])
    loss = loss_func(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1))
    # loss_per_sample [batch_size]
    loss_per_sequence = loss.view(shifted_logits.size(0), shifted_logits.size(1)).mean(axis=1)
    return loss_per_sequence.mean()

In [4]:
# PARAMETERS FOR WEIGHT DECAY
def get_wd_parameters(model, no_decay=["bias", r"ln_.{1,2}\.weight"]):
    wd_params = []
    nwd_params = []
    for name, params in model.named_parameters():
        if any(re.search(nd_reg, name) for nd_reg in no_decay):
            nwd_params.append(params)
        else:
            wd_params.append(params)
    return [
        {"params": wd_params,  "weight_decay": weight_decay},
        {"params": nwd_params, "weight_decay": 0.0},
    ]

In [5]:
# TOKENIZE RAW DATASETS
def get_tokenized_dataset(p_raw_dataset, p_context_length, p_tokenizer):
    concatenated_tokenized_samples = []
    for sample in p_raw_dataset:
        tokenized_sample = p_tokenizer(sample["content"], truncation=False)["input_ids"]
        concatenated_tokenized_samples.extend(tokenized_sample + [p_tokenizer.eos_token_id])
    tokenized_dataset_list = []
    for i in range(0, len(concatenated_tokenized_samples), p_context_length):
        input_ids = concatenated_tokenized_samples[i : i + p_context_length]
        if len(input_ids) == p_context_length:
            tokenized_dataset_list.append(torch.tensor(input_ids))

    return Dataset.from_dict({"input_ids": tokenized_dataset_list})

In [6]:
# SAVING TRAINING RESULTS TO THE JSON FILE
def save_results(filepath:str, split: str, results: dict):
    """
    {
        "train": [
            results1,
            results2,
            ...
        ],
        "valid": [
            results1,
            results2,
            ...
        ]
    }
    """
    if not save_json_logs:
        return
    if not os.path.exists(tensorboard_run_path)
        print("ERROR: filepath DOES NOT EXIST")
        return
    if split not in {"train", "valid"}:
        print("ERROR: INVALID SPLIT")
        return
    json_data = {"train": [], "valid": []}
    if os.path.exists(filepath):
        with open(filepath, 'r') as json_file:
            json_data = json.load(json_file)
    json_data[split].append(results)
    with open(filepath, 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

In [None]:
# TENSORBOARD VISUALISATION
def add_to_tensorboard(json_filepath, tensorboard_run_path):
    if not save_tensorboard_logs:
        return
    if not os.path.exists(json_filepath)
        print("ERROR: json_filepath DOES NOT EXIST")
        return
    if not os.path.exists(tensorboard_run_path)
        print("ERROR: tensorboard_run_path DOES NOT EXIST")
        return
    with open(json_filepath, mode="r") as json_file:
        one_step_tokens = entry["hyperparams"]["train_batch_size"] * entry["hyperparams"]["gradient_accumulation_steps"] * entry["hyperparams"]["context_length"]
        json_data = json.load(json_file)
        writer = SummaryWriter(tensorboard_run_path)
        for entry in json_data["train"]:
            writer.add_scalar("Loss/Train", entry["loss/train"], entry["competed_steps"] * one_step_tokens)
            writer.add_scalar("Learning Rate", entry["lr"][0], entry["competed_steps"] * one_step_tokens)
        for entry in json_data["valid"]:
            writer.add_scalar("Loss/Eval", entry["loss/eval"], entry["competed_steps"] * one_step_tokens)
            writer.add_scalar("Perplexity/Eval", entry["perplexity"], entry["competed_steps"] * one_step_tokens)
      writer.close()

#### CONFIGURING

In [7]:
# MOUNTING DRIVE FOR SAVING LOGS
if drive_mounted:
    drive.mount(drive_mounted_path)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [8]:
# CONFIGURING GIT CREDENTIALS
if have_git_write_access:
    !git config --global user.email "{user_email}"
    !git config --global user.name "{user_name}"

# To set Hugging Face token (for writing access) create HF_TOKEN secret in Google Collab or use notebook_login()

In [9]:
# CONFIGURING GIT DIRECTORIES
if push_to_hub:
    repo = Repository(model_output_dir, clone_from=model_repo_name)
    repo.git_checkout(run_name, create_branch_ok=True)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/./gpt2-pretrained-for-coq-pt-custom-train-local is already a clone of https://huggingface.co/Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train. Make sure you pull the latest changes with `repo.git_pull()`.
Checked out cl_1024_tok_30k from cl_1024_tok_30k.
M	config.json
M	generation_config.json
M	merges.txt
M	model.safetensors
M	tokenizer.json
M	vocab.json

M	generation_config.json
M	merges.txt
M	model.safetensors
M	tokenizer.json
M	vocab.json



#### DATASETS

In [11]:
# UNPACK DATASETS
if data_archived:
    !gzip -dkv "{raw_train_archive}"
    !gzip -dkv "{raw_valid_archive}"
    raw_train_json = raw_train_archive[:-3]
    raw_valid_json = raw_valid_archive[:-3]

./pretrain_dataset_train.json.gz:	 82.6% -- created ./pretrain_dataset_train.json
./pretrain_dataset_validation.json.gz:	 80.5% -- created ./pretrain_dataset_validation.json


In [10]:
# LOAD DATASETS
data_files = {"train": raw_train_json, "validation": raw_valid_json}
raw_datasets = load_dataset("json", data_files=data_files, field="data")
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['filepath', 'content'],
        num_rows: 5969
    })
    validation: Dataset({
        features: ['filepath', 'content'],
        num_rows: 663
    })
})


In [None]:
# DEBUG CODE (taking few samples from whole DatasetDict)
train_debug = raw_datasets["train"][:100]
valid_debug = raw_datasets["validation"][:3]
train_debug_dataset = Dataset.from_dict(train_debug)
valid_debug_dataset = Dataset.from_dict(valid_debug)
raw_datasets = DatasetDict({"train": train_debug_dataset, "validation": raw_datasets["validation"]})
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['content', 'filepath'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['content', 'filepath'],
        num_rows: 12401
    })
})

In [11]:
# LOADING TOKENIZER
tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo_name, revision=tokenizer_commit_hash)
print("tokenizer vocab size: ", len(tokenizer))
# TOKENIZE RAW DATASETS
train_dataset = get_tokenized_dataset(raw_datasets["train"],      context_length, tokenizer)
valid_dataset = get_tokenized_dataset(raw_datasets["validation"], context_length, tokenizer)
train_dataset.set_format("torch")
valid_dataset.set_format("torch")

# CREATE DATALOADERS
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=eval_batch_size)
print(train_dataset)
print(valid_dataset)
print("len(train_dataloader): ", len(train_dataloader))
print("len(valid_dataloader): ", len(valid_dataloader))

Token indices sequence length is longer than the specified maximum sequence length for this model (5887 > 1024). Running this sequence through the model will result in indexing errors


tokenizer vocab size:  30000
Dataset({
    features: ['input_ids'],
    num_rows: 22643
})
Dataset({
    features: ['input_ids'],
    num_rows: 2784
})
len(train_dataloader):  5661
len(valid_dataloader):  696


#### CONFIGURING MODEL AND TRAINING

In [12]:
# PRE TRAINING MODEL FROM SCRATCH!
if not partially_trained:
    config = AutoConfig.from_pretrained(
        init_model,
        vocab_size=len(tokenizer),
        n_ctx=context_length,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    model = GPT2LMHeadModel(config)
    if gradient_checkpointing:
        model.gradient_checkpointing_enable()
    optimizer = AdamW(get_wd_parameters(model), lr=lr)

    accelerator = Accelerator(mixed_precision=mixed_precision)

    model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, valid_dataloader
    )

    print(f"Model size:                                        {model.num_parameters()}")
    print(f"Model size (only trainable params):                {model.num_parameters(only_trainable=True)}")
    print(f"Model size (only trainable non-embeddings params): {model.num_parameters(only_trainable=True, exclude_embeddings=True)}")

Model size:                                        108882432
Model size (only trainable params):                108882432
Model size (only trainable non-embeddings params): 85056000


In [13]:
# PRE TRAINING MODEL FROM SCRATCH!
if not partially_trained:
    num_steps_per_epoch = len(train_dataloader)
    print("Num steps per epoch: ", num_steps_per_epoch)
    num_training_completed_steps = (num_train_epochs * num_steps_per_epoch) // gradient_accumulation_steps
    if ((num_train_epochs * num_steps_per_epoch) % gradient_accumulation_steps != 0):
        num_training_completed_steps += 1
    print("Num optimizer steps: ", num_training_completed_steps)

    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_completed_steps
    )

Num steps per epoch:  5661
Num optimizer steps:  3539


In [14]:
# USING ALREADY TRAINED MODEL!!!
if partially_trained:
    num_steps_per_epoch = len(train_dataloader)
    num_training_completed_steps = (num_train_epochs * num_steps_per_epoch) // gradient_accumulation_steps
    if ((num_train_epochs * num_steps_per_epoch) % gradient_accumulation_steps != 0):
        num_training_completed_steps += 1

    lr_sch_warmup_steps = 0
    num_training_completed_steps -= previously_completed_steps
    lr_sch_lr = stopped_lr

    print("Num optimizer steps: ", num_training_completed_steps)
    print("Start Learning rate: ", lr_sch_lr)
    print("Warmup steps: ", lr_sch_warmup_steps)
    ################################################################################
    model = GPT2LMHeadModel.from_pretrained(model_repo_name, revision=model_commit_hash)
    if gradient_checkpointing:
        model.gradient_checkpointing_enable()
    optimizer = AdamW(get_wd_parameters(model), lr=lr_sch_lr)
    accelerator = Accelerator(mixed_precision=mixed_precision)
    model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, valid_dataloader
    )

    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=lr_sch_warmup_steps,
        num_training_steps=num_training_completed_steps
    )

    print()
    print(f"Model size:                                        {model.num_parameters()}")
    print(f"Model size (only trainable params):                {model.num_parameters(only_trainable=True)}")
    print(f"Model size (only trainable non-embeddings params): {model.num_parameters(only_trainable=True, exclude_embeddings=True)}")

In [15]:
# EVALUATION FUNCTION
def evaluate():
    torch.cuda.empty_cache()

    model.eval()
    losses = []
    with torch.no_grad():
        for step, batch in enumerate(valid_dataloader):
            with torch.no_grad():
                logits = model(batch["input_ids"]).logits
                loss = custom_loss(batch["input_ids"], logits)
                losses.append(loss.item())
    loss = torch.mean(torch.Tensor(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")

    torch.cuda.empty_cache()
    return loss.item(), perplexity.item()

In [None]:
# TRAINING LOOP
model.train()
completed_steps = 0
global_steps = 0
if partially_trained:
    completed_steps = previously_completed_steps
    global_steps = previous_global_steps
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=0), total=num_steps_per_epoch
    ):
        if partially_trained and ((epoch<stopped_epoch) or ((epoch==stopped_epoch) and (step <= previous_step))):
            continue

        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
################################################################################
        if (global_steps % ((int(0.1 * num_steps_per_epoch) // gradient_accumulation_steps) * gradient_accumulation_steps) == 0):
            log_train = {
                    "epoch": epoch,
                    "lr": lr_scheduler.get_lr(),
                    "steps": step,
                    "global_steps" : global_steps,
                    "competed_steps": completed_steps,
                    "loss/train": loss.item(),
            }
            accelerator.print(log_train)
            save_results(drive_train_res_path, "train", log_train)
################################################################################
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        global_steps += 1
################################################################################
        if global_steps % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (global_steps % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            log_eval = {
                    "loss/eval": eval_loss,
                    "perplexity": perplexity,
                    "epoch": epoch,
                    "lr": lr_scheduler.get_lr(),
                    "steps": step,
                    "global_steps" : global_steps,
                    "competed_steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
            }
            accelerator.print(log_eval)
            log_train = {
                    "epoch": epoch,
                    "lr": lr_scheduler.get_lr(),
                    "steps": step,
                    "global_steps" : global_steps-1,
                    "competed_steps": completed_steps-1,
                    "loss/train": loss.item() * gradient_accumulation_steps,
            }
            save_results(drive_train_res_path, "train", log_train)
            save_results(drive_train_res_path, "valid", log_eval)
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(model_output_dir, save_function=accelerator.save)
            tokenizer.save_pretrained(model_output_dir)
            if accelerator.is_main_process:
                if push_to_hub:
                    repo.push_to_hub(
                        commit_message=f"Training in progress epoch {epoch}; steps {step}; global_steps {global_steps}; completed_steps {completed_steps}; lr {lr_scheduler.get_lr()}; loss/train {loss.item() * gradient_accumulation_steps}; loss/eval {eval_loss}; perplexity {perplexity}", blocking=False    # train loss is for the next step
                    )
            model.train()

################################################################################
################################################################################
################################################################################

#GRADIENT UPDATE (In case (global_steps % gradient_accumulation_steps != 0)
if (global_steps % gradient_accumulation_steps != 0):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=0), total=(gradient_accumulation_steps - (global_steps % gradient_accumulation_steps)) - 1
    ):
        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        global_steps += 1
        if global_steps % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
            break

# FINAL EVALUATE AND SAVE
additional_steps = 0
if (global_steps % gradient_accumulation_steps != 0):
    additional_steps = gradient_accumulation_steps - (global_steps % gradient_accumulation_steps)
with torch.no_grad():
    last_train_loss = 0
    for batch in train_dataloader:
        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
        last_train_loss = loss.item()
        break
eval_loss, perplexity = evaluate()
log_eval = {
        "loss/eval": eval_loss,
        "perplexity": perplexity,
        "epoch": epoch,
        "lr": lr_scheduler.get_lr(),
        "steps": num_steps_per_epoch + additional_steps,
        "global_steps" : global_steps,
        "competed_steps": completed_steps,
        "loss/train": last_train_loss,
}
accelerator.print(log_eval)
log_train = {
        "epoch": epoch,
        "lr": lr_scheduler.get_lr(),
        "steps": num_steps_per_epoch + additional_steps,
        "global_steps" : global_steps,
        "competed_steps": completed_steps,
        "loss/train": last_train_loss,
}
save_results(drive_train_res_path, "train", log_train)
save_results(drive_train_res_path, "valid", log_eval)
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(model_output_dir, save_function=accelerator.save)
tokenizer.save_pretrained(model_output_dir)
if accelerator.is_main_process:
    if push_to_hub:
        repo.push_to_hub(
            commit_message=f"Final model: epoch {epoch}; steps {num_steps_per_epoch+additional_steps}; global_steps {global_steps}; completed_steps {completed_steps}; lr {lr_scheduler.get_lr()}; loss/train {last_train_loss}; loss/eval {eval_loss}; perplexity {perplexity}", blocking=False
        )
model.train()

torch.cuda.empty_cache()

  0%|          | 0/5661 [00:00<?, ?it/s]



{'epoch': 0, 'lr': [0.0, 0.0], 'steps': 0, 'global_steps': 0, 'competed_steps': 0, 'loss/train': 10.455289840698242}
{'epoch': 0, 'lr': [0.00049713384924047, 0.00049713384924047], 'steps': 560, 'global_steps': 560, 'competed_steps': 70, 'loss/train': 4.529292106628418}
{'loss/eval': 4.685021877288818, 'perplexity': 108.3126449584961, 'epoch': 0, 'lr': [0.0004892519346517627, 0.0004892519346517627], 'steps': 999, 'global_steps': 1000, 'competed_steps': 125, 'loss/train': 5.145664215087891}




{'epoch': 0, 'lr': [0.0004871023215821152, 0.0004871023215821152], 'steps': 1120, 'global_steps': 1120, 'competed_steps': 140, 'loss/train': 4.042849063873291}
{'epoch': 0, 'lr': [0.0004770707939237604, 0.0004770707939237604], 'steps': 1680, 'global_steps': 1680, 'competed_steps': 210, 'loss/train': 4.144962310791016}
{'loss/eval': 4.058959484100342, 'perplexity': 57.91402053833008, 'epoch': 0, 'lr': [0.00047133849240470053, 0.00047133849240470053], 'steps': 1999, 'global_steps': 2000, 'competed_steps': 250, 'loss/train': 2.824413299560547}


Several commits (2) will be pushed upstream.


{'epoch': 0, 'lr': [0.00046703926626540554, 0.00046703926626540554], 'steps': 2240, 'global_steps': 2240, 'competed_steps': 280, 'loss/train': 3.851320266723633}
{'epoch': 0, 'lr': [0.00045700773860705073, 0.00045700773860705073], 'steps': 2800, 'global_steps': 2800, 'competed_steps': 350, 'loss/train': 3.7508034706115723}
{'loss/eval': 3.7102177143096924, 'perplexity': 40.862701416015625, 'epoch': 0, 'lr': [0.00045342505015763827, 0.00045342505015763827], 'steps': 2999, 'global_steps': 3000, 'competed_steps': 375, 'loss/train': 2.9665074348449707}


Several commits (3) will be pushed upstream.


{'epoch': 0, 'lr': [0.0004469762109486959, 0.0004469762109486959], 'steps': 3360, 'global_steps': 3360, 'competed_steps': 420, 'loss/train': 3.7038979530334473}
{'epoch': 0, 'lr': [0.0004369446832903411, 0.0004369446832903411], 'steps': 3920, 'global_steps': 3920, 'competed_steps': 490, 'loss/train': 2.532991647720337}
{'loss/eval': 3.3967089653015137, 'perplexity': 29.86564826965332, 'epoch': 0, 'lr': [0.00043551160791057607, 0.00043551160791057607], 'steps': 3999, 'global_steps': 4000, 'competed_steps': 500, 'loss/train': 2.9878158569335938}


Several commits (4) will be pushed upstream.


{'epoch': 0, 'lr': [0.00042691315563198626, 0.00042691315563198626], 'steps': 4480, 'global_steps': 4480, 'competed_steps': 560, 'loss/train': 2.4860339164733887}
{'loss/eval': 3.169886827468872, 'perplexity': 23.804790496826172, 'epoch': 0, 'lr': [0.00041759816566351387, 0.00041759816566351387], 'steps': 4999, 'global_steps': 5000, 'competed_steps': 625, 'loss/train': 3.2624502182006836}


Several commits (5) will be pushed upstream.


{'epoch': 0, 'lr': [0.00041688162797363145, 0.00041688162797363145], 'steps': 5040, 'global_steps': 5040, 'competed_steps': 630, 'loss/train': 2.639338493347168}
{'epoch': 0, 'lr': [0.0004068501003152766, 0.0004068501003152766], 'steps': 5600, 'global_steps': 5600, 'competed_steps': 700, 'loss/train': 2.9396209716796875}


  0%|          | 0/5661 [00:00<?, ?it/s]

{'loss/eval': 2.9767768383026123, 'perplexity': 19.624462127685547, 'epoch': 1, 'lr': [0.0003996847234164517, 0.0003996847234164517], 'steps': 338, 'global_steps': 6000, 'competed_steps': 750, 'loss/train': 2.6845688819885254}


Several commits (6) will be pushed upstream.


{'epoch': 1, 'lr': [0.0003968185726569218, 0.0003968185726569218], 'steps': 499, 'global_steps': 6160, 'competed_steps': 770, 'loss/train': 2.786147117614746}
{'epoch': 1, 'lr': [0.0003867870449985669, 0.0003867870449985669], 'steps': 1059, 'global_steps': 6720, 'competed_steps': 840, 'loss/train': 2.0801854133605957}
{'loss/eval': 2.820486068725586, 'perplexity': 16.78500747680664, 'epoch': 1, 'lr': [0.0003817712811693895, 0.0003817712811693895], 'steps': 1338, 'global_steps': 7000, 'competed_steps': 875, 'loss/train': 1.8183856010437012}


Several commits (7) will be pushed upstream.


{'epoch': 1, 'lr': [0.0003767555173402121, 0.0003767555173402121], 'steps': 1619, 'global_steps': 7280, 'competed_steps': 910, 'loss/train': 1.999011754989624}
{'epoch': 1, 'lr': [0.00036672398968185725, 0.00036672398968185725], 'steps': 2179, 'global_steps': 7840, 'competed_steps': 980, 'loss/train': 1.9623982906341553}
{'loss/eval': 2.6545791625976562, 'perplexity': 14.219000816345215, 'epoch': 1, 'lr': [0.0003638578389223273, 0.0003638578389223273], 'steps': 2338, 'global_steps': 8000, 'competed_steps': 1000, 'loss/train': 1.5572048425674438}


Several commits (8) will be pushed upstream.


{'epoch': 1, 'lr': [0.00035669246202350244, 0.00035669246202350244], 'steps': 2739, 'global_steps': 8400, 'competed_steps': 1050, 'loss/train': 1.7646706104278564}
{'epoch': 1, 'lr': [0.00034666093436514764, 0.00034666093436514764], 'steps': 3299, 'global_steps': 8960, 'competed_steps': 1120, 'loss/train': 1.536243200302124}
{'loss/eval': 2.5040295124053955, 'perplexity': 12.231682777404785, 'epoch': 1, 'lr': [0.0003459443966752651, 0.0003459443966752651], 'steps': 3338, 'global_steps': 9000, 'competed_steps': 1125, 'loss/train': 1.5786935091018677}


Several commits (9) will be pushed upstream.


{'epoch': 1, 'lr': [0.0003366294067067928, 0.0003366294067067928], 'steps': 3859, 'global_steps': 9520, 'competed_steps': 1190, 'loss/train': 2.210226535797119}
{'loss/eval': 2.3795740604400635, 'perplexity': 10.800301551818848, 'epoch': 1, 'lr': [0.0003280309544282029, 0.0003280309544282029], 'steps': 4338, 'global_steps': 10000, 'competed_steps': 1250, 'loss/train': 2.3846771717071533}


Several commits (10) will be pushed upstream.


{'epoch': 1, 'lr': [0.00032659787904843797, 0.00032659787904843797], 'steps': 4419, 'global_steps': 10080, 'competed_steps': 1260, 'loss/train': 2.0058906078338623}
{'epoch': 1, 'lr': [0.0003165663513900831, 0.0003165663513900831], 'steps': 4979, 'global_steps': 10640, 'competed_steps': 1330, 'loss/train': 1.5822627544403076}
{'loss/eval': 2.214085817337036, 'perplexity': 9.153038024902344, 'epoch': 1, 'lr': [0.00031011751218114076, 0.00031011751218114076], 'steps': 5338, 'global_steps': 11000, 'competed_steps': 1375, 'loss/train': 1.9863076210021973}


Several commits (11) will be pushed upstream.


{'epoch': 1, 'lr': [0.0003065348237317283, 0.0003065348237317283], 'steps': 5539, 'global_steps': 11200, 'competed_steps': 1400, 'loss/train': 2.1521151065826416}


  0%|          | 0/5661 [00:00<?, ?it/s]

{'epoch': 2, 'lr': [0.00029650329607337344, 0.00029650329607337344], 'steps': 438, 'global_steps': 11760, 'competed_steps': 1470, 'loss/train': 2.1484994888305664}
{'loss/eval': 2.1116554737091064, 'perplexity': 8.261907577514648, 'epoch': 2, 'lr': [0.00029220406993407856, 0.00029220406993407856], 'steps': 677, 'global_steps': 12000, 'competed_steps': 1500, 'loss/train': 1.9952863454818726}


Several commits (12) will be pushed upstream.


{'epoch': 2, 'lr': [0.00028647176841501863, 0.00028647176841501863], 'steps': 998, 'global_steps': 12320, 'competed_steps': 1540, 'loss/train': 2.0682852268218994}
{'epoch': 2, 'lr': [0.0002764402407566638, 0.0002764402407566638], 'steps': 1558, 'global_steps': 12880, 'competed_steps': 1610, 'loss/train': 1.5739586353302002}
{'loss/eval': 2.0176873207092285, 'perplexity': 7.52091121673584, 'epoch': 2, 'lr': [0.00027429062768701635, 0.00027429062768701635], 'steps': 1677, 'global_steps': 13000, 'competed_steps': 1625, 'loss/train': 1.5805190801620483}


Several commits (13) will be pushed upstream.


{'epoch': 2, 'lr': [0.000266408713098309, 0.000266408713098309], 'steps': 2118, 'global_steps': 13440, 'competed_steps': 1680, 'loss/train': 1.3008208274841309}
{'loss/eval': 1.9573551416397095, 'perplexity': 7.080574989318848, 'epoch': 2, 'lr': [0.00025637718543995415, 0.00025637718543995415], 'steps': 2677, 'global_steps': 14000, 'competed_steps': 1750, 'loss/train': 2.3755359649658203}


Several commits (14) will be pushed upstream.


{'epoch': 2, 'lr': [0.00025637718543995415, 0.00025637718543995415], 'steps': 2678, 'global_steps': 14000, 'competed_steps': 1750, 'loss/train': 1.3428977727890015}


In [None]:
torch.cuda.empty_cache()
add_to_tensorboard(drive_train_res_path, tensorboard_run_path)

In [None]:
%tensorboard --logdir "{tensorboard_run_path}"

### TESTING PRETRAINED MODEL

In [None]:
coq_model = GPT2LMHeadModel.from_pretrained("Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train")
coq_tokenizer = AutoTokenizer.from_pretrained("Andrusyshyn/gpt2-coq-tokenizer")
pipe = pipeline(
    "text-generation", model=coq_model, tokenizer=coq_tokenizer#, device=0
)

KeyboardInterrupt: 

In [None]:
model = coq_model
evaluate()

KeyboardInterrupt: 

In [None]:
txt = """Theorem identity : forall (n : nat), n = n."""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Theorem identity : forall (n : nat), n = n.
intros; red; intros; destruct n; simpl; auto.
Qed.

Theorem eq_sym : forall X : Ens, IN X X H.
intros;


### SANDBOX

In [None]:
class CustomStoppingCriteria(StoppingCriteria):
    def __init__(self, stop_sequences):
        super().__init__()
        self.stop_sequences = stop_sequences

    def __call__(self, input_ids, scores, **kwargs):
        # print(input_ids)
        # Check if the generated text ends with any of the stop sequences
        for sequence in self.stop_sequences:
            subsequence = input_ids[0][-len(sequence):]
            # print("###############################################################")
            # print(sequence, subsequence)
            if (sequence == subsequence.tolist()):
                # print("RETURN TRUEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE")
                return True  # Stop criteria met
        return False  # Stop criteria not met

In [None]:
# stop_sequences = [['ĠQed', '.'], ['Qed', '.']]  # Example stop sequences
stop_sequences = [[372, 14]]
custom_stopping_criteria = CustomStoppingCriteria(stop_sequences)
stopping_criteria = StoppingCriteriaList([custom_stopping_criteria])

In [None]:
for stop_seq in stop_sequences:
  print(tokenizer.encode(stop_seq))

In [None]:
print(tokenizer.encode('ĠQed'))

[129, 255, 372]


In [None]:
tokenizer = AutoTokenizer.from_pretrained("Andrusyshyn/gpt2-coq-tokenizer")
model = GPT2LMHeadModel.from_pretrained("Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train")
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, stopping_criteria=stopping_criteria#, device=0
)

In [None]:
# Generate text
generated_text = pipe(["Theorem theorem1 : forall (n : nat), n + 0 = n."], num_return_sequences=50)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nauto with arith.\nQed.'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nHint Immediate plus_reg_l: arith.'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  induction n as IHn as [ |'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; case n; auto'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* QuickChick plus_n_O.'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; rewrite mult_comm.\nrewrite'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; auto.\nQed'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintros. omega.\nQed.'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro n; case n; auto.\nsimpl'}, {'generated_text': 'Theorem theorem1 : forall (n : nat), n 

In [None]:
for _ in range(50):
  generated_text = pipe(["Theorem theorem1 : forall (n : nat), n + 0 = n."], num_return_sequences=1)
  print(generated_text)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; unfold O in |- *; auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nsimpl in |- *; auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; case n; simpl in |- *; auto.\nintros n1 H'; right;\nrewrite H'; auto.\nunfold nat_of_P"}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted.\n\nTheorem G1 : forall n m, n = m + m.\nAdmitted. (* Leo: OUT-OFne *)\n\nTheorem G'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAbort.\n\nGoal forall (n m : nat), n + m + m = n + m.\nintros; omega.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* Higher Order *)\n\nDefinition O_total_order_T : forall n, {n=O}+{n<>O}.\nShow. (*'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  induction n; auto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; rewrite (plus_comm n 0); apply mult_n_Sm.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n.\nsimpl in |- *; auto with arith.\nsimpl in |- *.\nintro; apply le_O_n.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; now rewrite one_succ, IHn, add_0_l.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\n  intros; pattern n at 1 in |- *; rewrite mult_1_r; ring.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro.\nring.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintros n; unfold le.\napply plus_O.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro; case n; auto with arith.\nintros n0; case n0; auto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\ninduction n.\nnow simpl.\nintros ; rewrite IHn ; clear IHn ; easy.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof (fun n => n + n <= n).\nintro; rewrite (plus_comm n (0 + n)); rewrite (plus_assoc n ( n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nsimpl; auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nunfold plus; auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\n  simpl; intuition;\n  rewrite (expnat_1 n); rewrite expnat_0;\n  auto.\n  f_equal; omega.\n  eapply IHn.\n  omega'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; simpl in |- *; auto.\nintros n' H'1; apply le_plus_minus; auto.\nQed."}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintros n; apply eq_nat_dec.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  eexists.\n  trivial.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* Higher Order *)\n\nTheorem mult_0_l : forall n, n * n = 0.\nAdmitted. (* QuickChick mult_0_'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; auto with arith.\nintros n0 H'.\ncut (n0 <= n + m0); auto with arith; intros n"}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* Higher Order *)\nTheorem mult_comm : forall n m, n * m = m * m.\nAdmitted. (* Higher Order *)\n\n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted. (* Higher Order *)\n\nTheorem mult_plus_distr_l : forall (x y z : nat), x * (y * z) ='}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; generalize dependent n.\nelim (mult_sym n); simpl in |- *; auto with arith.\nintros n1 H'; rewrite H'.\n"}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; elim n; simpl in |- *; auto.\nintros n1 IH; case (Lt.le_or_lt n1 n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro n; rewrite (mult_comm n 0); apply (p (S n)); auto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  (* We could use [group_zero], instead of building an\n   equality, but is too to\n   replace that n with (m + n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro; apply (mult_eq (S n) (0 + n)).\nrewrite plus_comm; rewrite H.\nrepeat rewrite (plus_comm n'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n   intros n; rewrite even_mul; apply even_div_even.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro.\ninduction n as [| n Hrecn].\nsimpl in |- *.\ntrivial.\nsimpl in |- *.\nrewrite Hrecn0.\napply'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto with arith.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': "Theorem theorem1 : forall (n : nat), n + 0 = n.\nintros n; elim n; auto with arith.\nsimpl in |- *; auto with arith.\nintros n0 H'; left; auto with arith.\napply"}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\ntauto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintuition.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  intro n; apply n_Sn.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\nintro; rewrite <- Nat.add_plus_distr_r.\nsymmetry.\napply mult_le_compat;Qed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nAdmitted.\n\nLemma foo : forall (n : nat), n = n.\nAdmitted.\n\nGoal exists (A : nat -> Set), A ->'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\ninduction n with\n| 0 => idtac\n| false => auto with arith\nend.\nintros n v; case v; simpl.\nintros'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nauto.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  induction n; simpl; trivial.\n  destruct 1; trivial.\n  rewrite IHn.\n  rewrite IHn.\n  trivial.\nQed.'}]]


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nintro n. \nsymmetry  in |- *. \napply mult_le_compat_l.\napply le_plus_minus.\ninstantiate (1'}]]
[[{'generated_text': 'Theorem theorem1 : forall (n : nat), n + 0 = n.\nProof.\n  intro n.\n  rewrite (plus_comm n).\n  apply plus_le_compat.\n  cset (plus (fact n) n); rewrite'}]]


In [None]:
text = """Theorem theorem1 : forall (n : nat), n + 0 = n.

    intuition.
    eapply H.
    trivial.
    Qed."""

tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# text = """Theorem theorem1 : forall (n : nat), n + 0 = n.
# Proof.
#    apply eq_refl.
# Qed."""
# #p_tokenizer(sample["content"], truncation=False)["input_ids"]
# tokenized_text = tokenizer.tokenize(text)
# print(tokenized_text)

['Theorem', 'Ġtheorem', '1', 'Ġ:', 'Ġforall', 'Ġ(', 'n', 'Ġ:', 'Ġnat', '),', 'Ġn', 'Ġ+', 'Ġ0', 'Ġ=', 'Ġn', '.', 'ĊĠĠĠĠĊĠĠĠ', 'Ġintuition', '.', 'ĊĠĠĠ', 'Ġeapply', 'ĠH', '.', 'ĊĠĠĠ', 'Ġtrivial', '.', 'ĊĠĠĠ', 'ĠQed', '.']


In [None]:
text = """Theorem theorem1 : forall (n : nat), n + 0 = n.

    intuition.
    eapply H.
    trivial.
    Qed."""
input_ids = tokenizer(text, truncation=False)["input_ids"][-2:]
print(input_ids)
print(tokenizer.decode(input_ids, skip_special_tokens=True))

[560, 14]
 Qed.


In [None]:
print(tokenizer.decode(199) == '\n')

True
