In [None]:
# Environment setup: install the Hugging Face libraries imported below
# (datasets, evaluate, transformers with the sentencepiece extra).
!pip install datasets evaluate transformers[sentencepiece]
# accelerate provides the Accelerator used to wrap the model/optimizer/dataloaders.
!pip install accelerate
# git-lfs is presumably required by the git-based Hub `Repository` workflow used later
# to clone/push the large model weight files -- TODO confirm.
!apt install git-lfs

In [2]:
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

from huggingface_hub import notebook_login, Repository
from datasets import load_dataset, DatasetDict, Dataset
from accelerate import Accelerator
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, pipeline, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from transformers.keras_callbacks import PushToHubCallback

import re
import json
import os
from tqdm import tqdm
from tqdm.notebook import tqdm
from google.colab import drive

### TRAINING


#### HYPER PARAMS, GLOBAL VARS, FUNCTIONS

In [3]:
# HYPER PARAMETERS
# All values below are read as module-level globals by the cells that follow.
context_length                 = 1024    # tokens per training sequence (chunk size used by get_tokenized_dataset)
train_batch_size               = 4       # sequences per forward/backward pass of the training DataLoader
eval_batch_size                = 8       # sequences per batch of the validation DataLoader
weight_decay                   = 0.1     # applied to every parameter not matched by get_wd_parameters' no_decay patterns
lr                             = 5e-4    # learning rate passed to AdamW; the loops stop stepping the scheduler once lr <= 0.1 * this value
lr_scheduler_func              = get_cosine_schedule_with_warmup    # one of the get_*_schedule_with_warmup factories imported above
adamw_b1                       = 0.9     # AdamW betas[0]
adamw_b2                       = 0.95    # AdamW betas[1]
adamw_e                        = 1e-8    # AdamW eps
num_warmup_steps               = 15      # counted in optimizer steps (compared against completed_steps)
gradient_accumulation_steps    = 8       # micro-batches accumulated per optimizer step
gradient_checkpointing         = False   # when True, model.gradient_checkpointing_enable() is called
eval_steps                     = 100     # evaluate/save/push every eval_steps optimizer steps (eval_steps * gradient_accumulation_steps batches)
num_train_epochs               = 8       # passes over the training DataLoader
mixed_precision                = "fp16"  # forwarded to Accelerator(mixed_precision=...)


# GLOBAL VARS

# --- Git identity, written to the global git config by the "CONFIGURING GIT CREDENTIALS" cell ---
have_git_write_access          = True                              # set to True if you need to commit changes
user_email                     = "orest.andrusyshyn@ucu.edu.ua"    # only needed if have_git_write_access == True
user_name                      = "Orest Andrusyshyn"               # only needed if have_git_write_access == True

# --- Google Drive mount and log destinations ---
drive_mounted                  = True
drive_mounted_path             = '/content/gdrive/'    # only needed if drive_mounted == True
save_json_logs                 = True                  # set to True to save training results into a json file (drive_train_res_path)
drive_train_res_path           = drive_mounted_path + 'My Drive/UCU/diploma/training_results/v30_a95_lrcos5em4.json'    # only needed if save_json_logs == True
save_tensorboard_logs          = True                  # set to True to save training results into a tensorboard run (tensorboard_run_path)
tensorboard_run_path           = drive_mounted_path + 'My Drive/UCU/diploma/tensorboard_runs/v30_a95_lrcos5em4'                   # only needed if save_tensorboard_logs == True

# --- Raw dataset locations ---
data_archived                  = True    # set to True if you need to unarchive the data
raw_train_archive              = drive_mounted_path + "My Drive/UCU/diploma/datasets/dataset_train.zip"         # only needed if data_archived == True
raw_valid_archive              = drive_mounted_path + "My Drive/UCU/diploma/datasets/dataset_valid.zip"    # only needed if data_archived == True
raw_train_json                 = "./dataset_train.json"       # if data_archived == True, json file will be in the current working directory
raw_valid_json                 = "./dataset_valid.json"       # if data_archived == True, json file will be in the current working directory

# --- Tokenizer, pinned to an exact Hub revision ---
tokenizer_repo_name            = "Andrusyshyn/gpt2-coq-tokenizer"
tokenizer_commit_hash          = "4e04ba257c544aaad0de61a2d82e64ba22e70847"

# --- Model repositories ---
init_model                     = "gpt2"                                                   # base model Hugging Face name (config source when training from scratch)
model_repo_name                = "Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train"    # Hub repo cloned/pushed to and loaded from when resuming
model_output_dir               = "./gpt2-pretrained-for-coq-pt-custom-train-local"        # local dir to save the model (also the local git clone)

push_to_hub                    = True                 # set to True to push the model
run_name                       = "v30_a95_lrcos5em4"    # branch name (only needed if push_to_hub == True)

# --- Resume state; all values below are only read when partially_trained == True ---
partially_trained              = True                                         # set to True to continue model training from a specific commit hash (model_commit_hash)
model_commit_hash              = "bed04d11daf0a71769a16b61546005f69a1d4dd4"    # Hub revision whose weights are loaded
previously_completed_steps     = 2917                                          # optimizer steps already done; seeds completed_steps and the scheduler replay
previous_global_steps          = 23336                                         # batches already processed; seeds global_steps
previous_step                  = 23334                                          # in epoch stopped_epoch, batches with step <= this value are skipped
stopped_epoch                  = 4                                             # epochs before this one are skipped entirely
# NOTE(review): previous_step (23334) is larger than the 4667 batches per epoch reported in the
# recorded output, so the skip condition covers all of epoch 4 and training resumes at epoch 5,
# step 0 -- confirm this matches where the previous run actually stopped.

# --- Reproducibility ---
torch_seed                     = 7     # global torch RNG seed
data_seed                      = 23    # seed of the dedicated generator handed to the training DataLoader for shuffling
data_generator = torch.Generator().manual_seed(data_seed)
torch.manual_seed(torch_seed)

<torch._C.Generator at 0x7d9acbed7790>

In [4]:
# CUSTOM LOSS FUNCTION
def custom_loss(inputs, logits):
    """
    Next-token cross-entropy, averaged per sequence and then over the batch.

    inputs: token ids,       [batch_size X cl]
    logits: model outputs,   [batch_size X cl X vocab_size]

    The target for position t is the token at position t+1, so the first token
    is dropped from the labels (nothing precedes it) and the last logit is
    dropped from the predictions (nothing follows it).

    Returns a scalar tensor.
    """
    targets = inputs[..., 1:].contiguous()
    predictions = logits[..., :-1, :].contiguous()

    n_sequences = predictions.size(0)
    n_positions = predictions.size(1)
    vocab_size = predictions.size(-1)

    # Unreduced loss over the flattened batch: one value per (sequence, position) pair.
    token_losses = CrossEntropyLoss(reduction='none')(
        predictions.view(-1, vocab_size),
        targets.view(-1),
    )
    # Back to [batch_size X (cl-1)], mean over positions, then mean over sequences.
    return token_losses.view(n_sequences, n_positions).mean(axis=1).mean()

In [5]:
# PARAMETERS FOR WEIGHT DECAY
def get_wd_parameters(model, no_decay=("bias", r"ln_.{1,2}\.weight"), p_weight_decay=None):
    """
    Split the model parameters into two optimizer parameter groups.

    model:          any object exposing `named_parameters()` yielding (name, parameter) pairs
    no_decay:       regex pattern(s); a parameter whose name matches any of them (re.search)
                    is excluded from weight decay. A single string is treated as one pattern.
    p_weight_decay: decay applied to the remaining parameters; when None the
                    module-level `weight_decay` hyper parameter is used.

    Returns a list of two dicts in the format expected by torch optimizers:
    [{"params": decayed, "weight_decay": <decay>}, {"params": not decayed, "weight_decay": 0.0}]
    """
    # The global is looked up lazily so an explicit value never depends on notebook state.
    decay = weight_decay if p_weight_decay is None else p_weight_decay
    # Guard against a bare string, which would otherwise be iterated character by character.
    if isinstance(no_decay, str):
        no_decay = (no_decay,)
    no_decay_patterns = [re.compile(pattern) for pattern in no_decay]

    wd_params = []
    nwd_params = []
    for name, params in model.named_parameters():
        if any(pattern.search(name) for pattern in no_decay_patterns):
            nwd_params.append(params)
        else:
            wd_params.append(params)
    return [
        {"params": wd_params,  "weight_decay": decay},
        {"params": nwd_params, "weight_decay": 0.0},
    ]

In [6]:
# TOKENIZE RAW DATASETS
def get_tokenized_dataset(p_raw_dataset, p_context_length, p_tokenizer):
    """
    Pack a raw text dataset into fixed-length token sequences.

    Every sample's "content" is tokenized without truncation and followed by the
    tokenizer's EOS id; all samples are concatenated into one token stream, which
    is then cut into consecutive chunks of exactly `p_context_length` tokens.
    A trailing chunk shorter than `p_context_length` is discarded.

    Returns a `Dataset` with a single "input_ids" column.
    """
    token_stream = []
    for record in p_raw_dataset:
        sample_ids = p_tokenizer(record["content"], truncation=False)["input_ids"]
        token_stream += sample_ids
        token_stream.append(p_tokenizer.eos_token_id)

    chunk_starts = range(0, len(token_stream), p_context_length)
    candidate_chunks = (token_stream[start : start + p_context_length] for start in chunk_starts)
    # Only full-length chunks are kept, so every row has the same length.
    full_chunks = [
        torch.tensor(chunk) for chunk in candidate_chunks if len(chunk) == p_context_length
    ]

    return Dataset.from_dict({"input_ids": full_chunks})

In [7]:
# SAVING TRAINING RESULTS TO THE JSON FILE
def save_results(filepath:str, split: str, results: list):
    """
    Append `results` to the `split` section of the JSON training log at `filepath`.

    Does nothing when `save_json_logs` is False. An unknown `split` is reported
    with a printed error and ignored. The "hyperparams" section is overwritten on
    every call with the current values of the notebook's global settings.

    filepath: path of the JSON log (created if it does not exist)
    split:    "train" or "valid"
    results:  list of log entries (dicts) to append

    File layout:
    {
        "hyperparams": {},
        "train": [
            results1,
            results2,
            ...
        ],
        "valid": [
            results1,
            results2,
            ...
        ]
    }

    Raises whatever `json.load` raises if an existing file is not valid JSON;
    the existing file is left untouched in that case.
    """
    if not save_json_logs:
        return
    if split not in {"train", "valid"}:
        print("ERROR: INVALID SPLIT")
        return
    # Settings that are not in effect are recorded as empty strings.
    _run_name = run_name if push_to_hub else ""
    _tensorboard_run_path = tensorboard_run_path if save_tensorboard_logs else ""
    _lr_scheduler_type = "cosine"
    if lr_scheduler_func == get_linear_schedule_with_warmup:
        _lr_scheduler_type = "linear"
    elif lr_scheduler_func == get_constant_schedule_with_warmup:
        _lr_scheduler_type = "const"
    hyperparams_dict = {
        "context_length"                 : context_length,
        "train_batch_size"               : train_batch_size,
        "eval_batch_size"                : eval_batch_size,
        "weight_decay"                   : weight_decay,
        "lr"                             : lr,
        "lr_scheduler_type"              : _lr_scheduler_type,
        "adamw_b1"                       : adamw_b1,
        "adamw_b2"                       : adamw_b2,
        "adamw_e"                        : adamw_e,
        "num_warmup_steps"               : num_warmup_steps,
        "gradient_accumulation_steps"    : gradient_accumulation_steps,
        "gradient_checkpointing"         : gradient_checkpointing,
        "eval_steps"                     : eval_steps,
        "num_train_epochs"               : num_train_epochs,
        "mixed_precision"                : mixed_precision,
        "tokenizer_repo_name"            : tokenizer_repo_name,
        "tokenizer_commit_hash"          : tokenizer_commit_hash,
        "init_model"                     : init_model,
        "model_repo_name"                : model_repo_name,
        "run_name"                       : _run_name,
        "drive_train_res_path"           : drive_train_res_path,
        "tensorboard_run_path"           : _tensorboard_run_path,
    }
    json_data = {"train": [], "valid": []}
    if os.path.exists(filepath):
        with open(filepath, 'r') as json_file:
            json_data = json.load(json_file)
    # An existing file may lack one of the sections (e.g. written by hand or by another tool).
    json_data.setdefault("train", [])
    json_data.setdefault("valid", [])
    json_data["hyperparams"] = hyperparams_dict
    json_data[split].extend(results)
    # The complete history is rewritten on every call. Writing to a sibling temp file and
    # swapping it in keeps the previous log intact if the process dies mid-write.
    tmp_filepath = filepath + ".tmp"
    with open(tmp_filepath, 'w') as json_file:
        json.dump(json_data, json_file, indent=4)
    os.replace(tmp_filepath, filepath)

In [8]:
# TENSORBOARD VISUALISATION
def add_to_tensorboard(json_filepath, tensorboard_run_path):
    """
    Replay a JSON training log (as written by save_results) into a TensorBoard run.

    Does nothing when `save_tensorboard_logs` is False; prints an error and returns
    when `json_filepath` does not exist.

    The x axis of every scalar is the number of training tokens consumed:
    completed_steps * train_batch_size * gradient_accumulation_steps * context_length,
    using the hyper parameters stored in the log itself.

    Train entries are grouped by runs of equal "completed_steps": each group yields
    one "Loss/Train" point (mean of the group's losses) and one "Learning Rate"
    point (first lr value of the group's first entry). Every valid entry yields
    one "Loss/Eval" and one "Perplexity/Eval" point.

    json_filepath:        path of the JSON log to read
    tensorboard_run_path: directory handed to SummaryWriter
    """
    from itertools import groupby

    if not save_tensorboard_logs:
        return
    if not os.path.exists(json_filepath):
        print("ERROR: json_filepath DOES NOT EXIST")
        return
    with open(json_filepath, mode="r") as json_file:
        json_data = json.load(json_file)

    hyperparams = json_data["hyperparams"]
    one_step_tokens = hyperparams["train_batch_size"] * hyperparams["gradient_accumulation_steps"] * hyperparams["context_length"]

    writer = SummaryWriter(tensorboard_run_path)
    try:
        # groupby merges consecutive entries only, matching the order in which they were logged;
        # an empty "train" list simply produces no points.
        train_entries = json_data.get("train", [])
        for cs, group in groupby(train_entries, key=lambda entry: entry["completed_steps"]):
            group_entries = list(group)
            mean_loss = sum(entry["loss/train"] for entry in group_entries) / len(group_entries)
            writer.add_scalar("Loss/Train", mean_loss, cs * one_step_tokens)
            writer.add_scalar("Learning Rate", group_entries[0]["lr"][0], cs * one_step_tokens)

        for entry in json_data.get("valid", []):
            cs = entry["completed_steps"]
            writer.add_scalar("Loss/Eval", entry["loss/eval"], cs * one_step_tokens)
            writer.add_scalar("Perplexity/Eval", entry["perplexity"], cs * one_step_tokens)
    finally:
        # Always flush and release the event file, even if a log entry is malformed.
        writer.close()

#### CONFIGURING

In [9]:
# MOUNTING DRIVE FOR SAVING LOGS
# Mounts Google Drive at drive_mounted_path; the JSON/TensorBoard log paths and the
# dataset archive paths configured above are all built on top of that mount point.
if drive_mounted:
    drive.mount(drive_mounted_path)

Mounted at /content/gdrive/


In [10]:
# CONFIGURING GIT CREDENTIALS
# Sets the global git author identity from the user_email / user_name settings.
if have_git_write_access:
    !git config --global user.email "{user_email}"
    !git config --global user.name "{user_name}"

# To set the Hugging Face token (for write access), create an HF_TOKEN secret in Google Colab or use notebook_login()

In [11]:
# CONFIGURING GIT DIRECTORIES
# Clones the model repo into model_output_dir and switches to the run's branch
# (create_branch_ok=True allows the branch to be created if it does not exist yet).
# `repo` is used later by the training cell to push checkpoints.
# NOTE(review): the recorded output links to the huggingface_hub "git vs http" guide, which
# suggests this git-based Repository workflow is discouraged in newer versions -- TODO confirm.
if push_to_hub:
    repo = Repository(model_output_dir, clone_from=model_repo_name)
    repo.git_checkout(run_name, create_branch_ok=True)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train into local empty directory.


Download file model.safetensors:   0%|          | 8.00k/480M [00:00<?, ?B/s]

Clean file model.safetensors:   0%|          | 1.00k/480M [00:00<?, ?B/s]

Checked out v30_a95_lrcos5em4 from v30_a95_lrcos5em4.
Branch 'v30_a95_lrcos5em4' set up to track remote branch 'v30_a95_lrcos5em4' from 'origin'.




#### DATASETS

In [12]:
# UNPACK DATASETS
# Extracts the train/validation archives; the archive type is chosen by file extension.
# The recorded output shows `unzip` inflating the json files into the current working
# directory, which is where raw_train_json / raw_valid_json point.
# NOTE(review): `gzip -dk` presumably writes the decompressed file next to the archive rather
# than into the working directory, so the raw_*_json paths may need adjusting for .gz -- confirm.
if data_archived:
    if raw_train_archive.endswith(".gz"):
        !gzip -dkv "{raw_train_archive}"
    if raw_train_archive.endswith(".zip"):
        !unzip "{raw_train_archive}"
    if raw_valid_archive.endswith(".gz"):
        !gzip -dkv "{raw_valid_archive}"
    if raw_valid_archive.endswith(".zip"):
        !unzip "{raw_valid_archive}"

Archive:  /content/gdrive/My Drive/UCU/diploma/datasets/dataset_train.zip
  inflating: dataset_train.json      
Archive:  /content/gdrive/My Drive/UCU/diploma/datasets/dataset_valid.zip
  inflating: dataset_valid.json      


In [13]:
# LOAD DATASETS
# Both json files keep their records under the top-level "data" field; per the recorded
# output each record has the features 'content' and 'filepath'.
data_files = {"train": raw_train_json, "validation": raw_valid_json}
raw_datasets = load_dataset("json", data_files=data_files, field="data")
print(raw_datasets)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content', 'filepath'],
        num_rows: 4972
    })
    validation: Dataset({
        features: ['content', 'filepath'],
        num_rows: 742
    })
})


In [14]:
# LOADING TOKENIZER
# The tokenizer is pinned to an exact Hub revision so reruns tokenize identically.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo_name, revision=tokenizer_commit_hash)
print("tokenizer vocab size: ", len(tokenizer))
# TOKENIZE RAW DATASETS
# Samples are concatenated and cut into context_length-token chunks (see get_tokenized_dataset),
# so the "sequence length is longer than the specified maximum" tokenizer warning in the
# recorded output refers to whole files before chunking.
train_dataset = get_tokenized_dataset(raw_datasets["train"],      context_length, tokenizer)
valid_dataset = get_tokenized_dataset(raw_datasets["validation"], context_length, tokenizer)
train_dataset.set_format("torch")
valid_dataset.set_format("torch")

# CREATE DATALOADERS
# Training batches are shuffled with the dedicated seeded generator; validation order is fixed.
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, generator=data_generator)
valid_dataloader = DataLoader(valid_dataset, batch_size=eval_batch_size)
print(train_dataset)
print(valid_dataset)
print("len(train_dataloader): ", len(train_dataloader))
print("len(valid_dataloader): ", len(valid_dataloader))

tokenizer_config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/465k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/265k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2260 > 1024). Running this sequence through the model will result in indexing errors


tokenizer vocab size:  30000
Dataset({
    features: ['input_ids'],
    num_rows: 18667
})
Dataset({
    features: ['input_ids'],
    num_rows: 3784
})
len(train_dataloader):  4667
len(valid_dataloader):  473


#### CONFIGURING MODEL AND TRAINING

In [15]:
# CONFIGURING MODEL
# Either load the weights of a previous run (pinned Hub revision) or build a freshly
# initialised GPT-2 from the base config with this notebook's vocabulary size.
model = None
if partially_trained:
    print("Loading partially trained model")
    model = GPT2LMHeadModel.from_pretrained(model_repo_name, revision=model_commit_hash)
else:
    print("Training from scratch")
    config = AutoConfig.from_pretrained(
        init_model,
        vocab_size=len(tokenizer),
        n_ctx=context_length,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    model = GPT2LMHeadModel(config)
print()
if gradient_checkpointing:
    model.gradient_checkpointing_enable()

# NOTE(review): the optimizer is always created fresh, so when resuming only the model weights
# are restored -- no optimizer state is loaded anywhere in this cell.
optimizer = AdamW(get_wd_parameters(model), lr=lr, betas=(adamw_b1, adamw_b2), eps=adamw_e)
accelerator = Accelerator(mixed_precision=mixed_precision)
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, valid_dataloader
)

# Total number of optimizer steps = ceil(total batches / gradient_accumulation_steps).
num_steps_per_epoch = len(train_dataloader)
print("Num steps per epoch: ", num_steps_per_epoch)
num_training_completed_steps = (num_train_epochs * num_steps_per_epoch) // gradient_accumulation_steps
if ((num_train_epochs * num_steps_per_epoch) % gradient_accumulation_steps != 0):
    num_training_completed_steps += 1
print("Num optimizer steps: ", num_training_completed_steps)

# The constant schedule takes no total-step argument; the other schedules need it.
if lr_scheduler_func == get_constant_schedule_with_warmup:
    lr_scheduler = lr_scheduler_func(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps
    )
else:
    lr_scheduler = lr_scheduler_func(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_completed_steps
    )


# When resuming, fast-forward the scheduler by replaying the previous optimizer steps with the
# same rule the training loop uses: always step during warm-up, afterwards step only while
# the learning rate is still above 10% of `lr`.
if partially_trained:
    with torch.no_grad():
        for ind in range(previously_completed_steps):
            if ((ind < num_warmup_steps) or (lr_scheduler.get_lr()[0] > (0.1 * lr))):
                lr_scheduler.step()

print()
print(f"Model size:                                        {model.num_parameters()}")
print(f"Model size (only trainable params):                {model.num_parameters(only_trainable=True)}")
print(f"Model size (only trainable non-embeddings params): {model.num_parameters(only_trainable=True, exclude_embeddings=True)}")

Loading partially trained model


config.json:   0%|          | 0.00/946 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]


Num steps per epoch:  4667
Num optimizer steps:  4667

Model size:                                        108882432
Model size (only trainable params):                108882432
Model size (only trainable non-embeddings params): 85056000


In [16]:
# EVALUATION FUNCTION
def evaluate():
    """
    Evaluate the global `model` on the global `valid_dataloader`.

    Switches the model to eval mode and leaves it there; callers that continue
    training must call `model.train()` afterwards.

    Returns (loss, perplexity) as Python floats, where loss is the unweighted mean
    of the per-batch `custom_loss` values and perplexity is exp(loss).
    `torch.exp` saturates to inf instead of raising, so an overflowing loss yields
    an infinite perplexity rather than an exception.
    """
    torch.cuda.empty_cache()

    model.eval()
    losses = []
    with torch.no_grad():
        for batch in valid_dataloader:
            logits = model(batch["input_ids"]).logits
            losses.append(custom_loss(batch["input_ids"], logits).item())
    loss = torch.mean(torch.Tensor(losses))
    perplexity = torch.exp(loss)

    torch.cuda.empty_cache()
    return loss.item(), perplexity.item()

In [None]:
# TRAINING LOOP
# Counters:
#   global_steps    - batches processed (one forward/backward pass each)
#   completed_steps - optimizer updates (one per gradient_accumulation_steps batches)
# Per-batch logs are buffered in log_buffer and flushed to the JSON log at every evaluation.
log_buffer = []

model.train()
completed_steps = 0
global_steps = 0
if partially_trained:
    completed_steps = previously_completed_steps
    global_steps = previous_global_steps
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=0), total=num_steps_per_epoch
    ):
        # When resuming, batches that belong to already-trained epochs/steps are still drawn
        # from the dataloader but discarded (presumably so the shuffle order stays aligned
        # with the interrupted run -- TODO confirm).
        if partially_trained and ((epoch<stopped_epoch) or ((epoch==stopped_epoch) and (step <= previous_step))):
            continue

        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
################################################################################
        # Log the unscaled loss together with the counters as they were BEFORE this batch
        # was counted (global_steps is incremented further below).
        # NOTE(review): get_lr() is called outside scheduler.step(); get_last_lr() is
        # presumably the intended accessor -- confirm against the scheduler docs.
        log_train = {
                "loss/train": loss.item(),
                "completed_steps": completed_steps,
                "lr": lr_scheduler.get_lr(),
                "global_steps" : global_steps,
                "epoch": epoch,
                "steps": step,
        }
        # Print only on the first batch of every 10th optimizer step to keep the output short.
        if ((completed_steps % 10 == 0) and (global_steps % gradient_accumulation_steps == 0)):
            accelerator.print(log_train)
        if save_json_logs:
            log_buffer.append(log_train)
################################################################################
        # Scale so that the accumulated gradient is the mean over the accumulated batches.
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        global_steps += 1
################################################################################
        # Optimizer update once enough batches have been accumulated.
        if global_steps % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Learning-rate floor: after warm-up the scheduler is only advanced while the
            # learning rate is still above 10% of `lr`; afterwards it stays frozen.
            if ((completed_steps < num_warmup_steps) or (lr_scheduler.get_lr()[0] > (0.1 * lr))):
                lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        # Every eval_steps optimizer steps: flush train logs, evaluate, save and push a checkpoint.
        if (global_steps % (eval_steps * gradient_accumulation_steps)) == 0:
            if save_json_logs:
                save_results(drive_train_res_path, "train", log_buffer)
                log_buffer = []
            eval_loss, perplexity = evaluate()
            log_eval = {
                    "loss/eval": eval_loss,
                    "perplexity": perplexity,
                    "completed_steps": completed_steps,
                    "lr": lr_scheduler.get_lr(),
                    "global_steps" : global_steps,
                    "epoch": epoch,
                    "steps": step,
                    # `loss` was divided by gradient_accumulation_steps above; undo that for logging.
                    "loss/train": loss.item() * gradient_accumulation_steps,
            }
            accelerator.print(log_eval)
            save_results(drive_train_res_path, "valid", [log_eval])
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(model_output_dir, save_function=accelerator.save)
            tokenizer.save_pretrained(model_output_dir)
            if accelerator.is_main_process:
                if push_to_hub:
                    # blocking=False: the push is started without waiting for it to finish.
                    repo.push_to_hub(
                        commit_message=f"Training in progress: completed_steps {completed_steps}; global_steps {global_steps};\
                                         epoch {epoch}; steps {step}; lr {lr_scheduler.get_lr()};\
                                         loss/eval {eval_loss}; perplexity {perplexity}; loss/train {loss.item() * gradient_accumulation_steps}",
                        blocking=False
                    )
            # evaluate() leaves the model in eval mode; switch back before the next batch.
            model.train()

################################################################################
################################################################################
################################################################################

# GRADIENT UPDATE (in case global_steps % gradient_accumulation_steps != 0)
# If the epochs ended in the middle of an accumulation window, draw extra batches from a new
# pass over the training dataloader until the window is full, then apply one last optimizer step.
# `epoch` still holds the value of the last epoch of the main loop.
last_eval_log_train_loss = 0    # stays 0 unless the catch-up loop below runs
if (global_steps % gradient_accumulation_steps != 0):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=0), total=(gradient_accumulation_steps - (global_steps % gradient_accumulation_steps)) - 1    # -1 here is purely for better visualisation of tqdm progress bar
    ):
        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
        last_eval_log_train_loss = loss.item()
        log_train = {
                "loss/train": loss.item(),
                "completed_steps": completed_steps,
                "lr": lr_scheduler.get_lr(),
                "global_steps" : global_steps,
                "epoch": epoch,
                # Extra batches are numbered past the end of the epoch.
                "steps": step + num_steps_per_epoch,
        }
        if save_json_logs:
            log_buffer.append(log_train)
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        global_steps += 1
        if global_steps % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Same learning-rate floor rule as in the main loop.
            if ((completed_steps < num_warmup_steps) or (lr_scheduler.get_lr()[0] > (0.1 * lr))):
                lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
            # The accumulation window is complete; stop drawing extra batches.
            break

################################################################################
################################################################################
################################################################################

# FINAL EVALUATE AND SAVE
# Writes one last train log entry, runs a final evaluation, then saves and pushes the model.
# NOTE(review): the catch-up loop above only exits once global_steps is a multiple of
# gradient_accumulation_steps, so the condition below looks like it can never be true here and
# additional_steps always stays 0 -- confirm whether it should be computed before that loop.
additional_steps = 0
if (global_steps % gradient_accumulation_steps != 0):
    additional_steps = gradient_accumulation_steps - (global_steps % gradient_accumulation_steps)
# Loss of the final weights on the first batch of a new pass over the training dataloader,
# computed without gradients (no parameter update happens here).
with torch.no_grad():
    last_train_loss = 0
    for batch in train_dataloader:
        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
        last_train_loss = loss.item()
        break
log_train = {
        "loss/train": last_train_loss,
        "completed_steps": completed_steps,
        "lr": lr_scheduler.get_lr(),
        "global_steps" : global_steps,
        "epoch": epoch,
        "steps": num_steps_per_epoch + additional_steps,
}
if save_json_logs:
    log_buffer.append(log_train)
    save_results(drive_train_res_path, "train", log_buffer)
    log_buffer = []
accelerator.print(log_train)

eval_loss, perplexity = evaluate()
# NOTE(review): last_eval_log_train_loss is only updated inside the catch-up loop; when that
# loop did not run, "loss/train" is recorded as 0 here (and in the commit message below).
log_eval = {
        "loss/eval": eval_loss,
        "perplexity": perplexity,
        "completed_steps": completed_steps,
        "lr": lr_scheduler.get_lr(),
        "global_steps" : global_steps,
        "epoch": epoch,
        "steps": num_steps_per_epoch + additional_steps - 1,
        "loss/train": last_eval_log_train_loss,
}
accelerator.print(log_eval)
save_results(drive_train_res_path, "valid", [log_eval])

accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(model_output_dir, save_function=accelerator.save)
tokenizer.save_pretrained(model_output_dir)
if accelerator.is_main_process:
    if push_to_hub:
        # blocking=False: the push is started without waiting for it to finish.
        repo.push_to_hub(
            commit_message=f"Final model: completed_steps {completed_steps}; global_steps {global_steps};\
                             epoch {epoch}; steps {num_steps_per_epoch + additional_steps - 1}; lr {lr_scheduler.get_lr()};\
                             loss/eval {eval_loss}; perplexity {perplexity}; loss/train {last_eval_log_train_loss}",
            blocking=False
        )

# evaluate() left the model in eval mode; restore training mode.
model.train()

torch.cuda.empty_cache()

  0%|          | 0/4667 [00:00<?, ?it/s]

  0%|          | 0/4667 [00:00<?, ?it/s]

  0%|          | 0/4667 [00:00<?, ?it/s]

  0%|          | 0/4667 [00:00<?, ?it/s]

  0%|          | 0/4667 [00:00<?, ?it/s]

  0%|          | 0/4667 [00:00<?, ?it/s]

{'loss/train': 1.3134409189224243, 'completed_steps': 2920, 'lr': [0.00015471922502370737, 0.00015471922502370737], 'global_steps': 23360, 'epoch': 5, 'steps': 24}
{'loss/train': 1.6737143993377686, 'completed_steps': 2930, 'lr': [0.0001531605331877427, 0.0001531605331877427], 'global_steps': 23440, 'epoch': 5, 'steps': 104}
{'loss/train': 1.0121465921401978, 'completed_steps': 2940, 'lr': [0.0001516062577789413, 0.0001516062577789413], 'global_steps': 23520, 'epoch': 5, 'steps': 184}
{'loss/train': 1.3697717189788818, 'completed_steps': 2950, 'lr': [0.0001500564696810488, 0.0001500564696810488], 'global_steps': 23600, 'epoch': 5, 'steps': 264}
{'loss/train': 1.323610544204712, 'completed_steps': 2960, 'lr': [0.00014851123957316406, 0.00014851123957316406], 'global_steps': 23680, 'epoch': 5, 'steps': 344}
{'loss/train': 1.428122639656067, 'completed_steps': 2970, 'lr': [0.0001469706379265159, 0.0001469706379265159], 'global_steps': 23760, 'epoch': 5, 'steps': 424}
{'loss/train': 1.4968

Several commits (31) will be pushed upstream.


{'loss/train': 2.070448875427246, 'completed_steps': 3000, 'lr': [0.0001423773052808029, 0.0001423773052808029], 'global_steps': 24000, 'epoch': 5, 'steps': 664}
{'loss/train': 2.0118329524993896, 'completed_steps': 3010, 'lr': [0.00014085591792170377, 0.00014085591792170377], 'global_steps': 24080, 'epoch': 5, 'steps': 744}
{'loss/train': 1.729892611503601, 'completed_steps': 3020, 'lr': [0.00013933950814978745, 0.00013933950814978745], 'global_steps': 24160, 'epoch': 5, 'steps': 824}
{'loss/train': 1.43679678440094, 'completed_steps': 3030, 'lr': [0.00013782814512191253, 0.00013782814512191253], 'global_steps': 24240, 'epoch': 5, 'steps': 904}
{'loss/train': 1.4595532417297363, 'completed_steps': 3040, 'lr': [0.0001363218977647775, 0.0001363218977647775], 'global_steps': 24320, 'epoch': 5, 'steps': 984}
{'loss/train': 1.850962519645691, 'completed_steps': 3050, 'lr': [0.00013482083477177713, 0.00013482083477177713], 'global_steps': 24400, 'epoch': 5, 'steps': 1064}
{'loss/train': 1.2

Several commits (32) will be pushed upstream.


{'loss/train': 2.099788188934326, 'completed_steps': 3100, 'lr': [0.00012739567281255308, 0.00012739567281255308], 'global_steps': 24800, 'epoch': 5, 'steps': 1464}
{'loss/train': 1.9038989543914795, 'completed_steps': 3110, 'lr': [0.0001259271451072965, 0.0001259271451072965], 'global_steps': 24880, 'epoch': 5, 'steps': 1544}
{'loss/train': 1.5049093961715698, 'completed_steps': 3120, 'lr': [0.00012446427582566197, 0.00012446427582566197], 'global_steps': 24960, 'epoch': 5, 'steps': 1624}
{'loss/train': 1.7158173322677612, 'completed_steps': 3130, 'lr': [0.00012300713168275905, 0.00012300713168275905], 'global_steps': 25040, 'epoch': 5, 'steps': 1704}
{'loss/train': 1.0720021724700928, 'completed_steps': 3140, 'lr': [0.00012155577913259869, 0.00012155577913259869], 'global_steps': 25120, 'epoch': 5, 'steps': 1784}
{'loss/train': 1.027511477470398, 'completed_steps': 3150, 'lr': [0.00012011028436506208, 0.00012011028436506208], 'global_steps': 25200, 'epoch': 5, 'steps': 1864}
{'loss/t

Several commits (33) will be pushed upstream.


{'loss/train': 1.1093721389770508, 'completed_steps': 3200, 'lr': [0.0001129729750442115, 0.0001129729750442115], 'global_steps': 25600, 'epoch': 5, 'steps': 2264}
{'loss/train': 1.4639027118682861, 'completed_steps': 3210, 'lr': [0.00011156400178988221, 0.00011156400178988221], 'global_steps': 25680, 'epoch': 5, 'steps': 2344}
{'loss/train': 1.5370361804962158, 'completed_steps': 3220, 'lr': [0.00011016134199971147, 0.00011016134199971147], 'global_steps': 25760, 'epoch': 5, 'steps': 2424}
{'loss/train': 1.209144949913025, 'completed_steps': 3230, 'lr': [0.0001087650596429157, 0.0001087650596429157], 'global_steps': 25840, 'epoch': 5, 'steps': 2504}
{'loss/train': 1.6402822732925415, 'completed_steps': 3240, 'lr': [0.00010737521839786413, 0.00010737521839786413], 'global_steps': 25920, 'epoch': 5, 'steps': 2584}
{'loss/train': 2.5000269412994385, 'completed_steps': 3250, 'lr': [0.00010599188164917495, 0.00010599188164917495], 'global_steps': 26000, 'epoch': 5, 'steps': 2664}
{'loss/tr

Several commits (34) will be pushed upstream.


{'loss/train': 1.5102977752685547, 'completed_steps': 3300, 'lr': [9.917496288713612e-05, 9.917496288713612e-05], 'global_steps': 26400, 'epoch': 5, 'steps': 3064}
{'loss/train': 1.2810618877410889, 'completed_steps': 3310, 'lr': [9.78319673810284e-05, 9.78319673810284e-05], 'global_steps': 26480, 'epoch': 5, 'steps': 3144}
{'loss/train': 1.2442820072174072, 'completed_steps': 3320, 'lr': [9.649591159748148e-05, 9.649591159748148e-05], 'global_steps': 26560, 'epoch': 5, 'steps': 3224}
{'loss/train': 1.2431848049163818, 'completed_steps': 3330, 'lr': [9.516685646819268e-05, 9.516685646819268e-05], 'global_steps': 26640, 'epoch': 5, 'steps': 3304}
{'loss/train': 1.8437385559082031, 'completed_steps': 3340, 'lr': [9.384486260558964e-05, 9.384486260558964e-05], 'global_steps': 26720, 'epoch': 5, 'steps': 3384}
{'loss/train': 1.701820731163025, 'completed_steps': 3350, 'lr': [9.252999030006612e-05, 9.252999030006612e-05], 'global_steps': 26800, 'epoch': 5, 'steps': 3464}
{'loss/train': 1.11

Several commits (35) will be pushed upstream.


{'loss/train': 1.7824238538742065, 'completed_steps': 3400, 'lr': [8.606453940487505e-05, 8.606453940487505e-05], 'global_steps': 27200, 'epoch': 5, 'steps': 3864}
{'loss/train': 1.351090669631958, 'completed_steps': 3410, 'lr': [8.47936441616455e-05, 8.47936441616455e-05], 'global_steps': 27280, 'epoch': 5, 'steps': 3944}
{'loss/train': 2.0483312606811523, 'completed_steps': 3420, 'lr': [8.35302832622272e-05, 8.35302832622272e-05], 'global_steps': 27360, 'epoch': 5, 'steps': 4024}
{'loss/train': 1.1263028383255005, 'completed_steps': 3430, 'lr': [8.22745143230195e-05, 8.22745143230195e-05], 'global_steps': 27440, 'epoch': 5, 'steps': 4104}
{'loss/train': 1.1250059604644775, 'completed_steps': 3440, 'lr': [8.102639461418548e-05, 8.102639461418548e-05], 'global_steps': 27520, 'epoch': 5, 'steps': 4184}
{'loss/train': 1.2255275249481201, 'completed_steps': 3450, 'lr': [7.978598105703994e-05, 7.978598105703994e-05], 'global_steps': 27600, 'epoch': 5, 'steps': 4264}
{'loss/train': 1.807575

Several commits (36) will be pushed upstream.


{'loss/train': 1.04175865650177, 'completed_steps': 3500, 'lr': [7.370147304755412e-05, 7.370147304755412e-05], 'global_steps': 28000, 'epoch': 5, 'steps': 4664}


  0%|          | 0/4667 [00:00<?, ?it/s]

{'loss/train': 1.1439944505691528, 'completed_steps': 3510, 'lr': [7.250847188759807e-05, 7.250847188759807e-05], 'global_steps': 28080, 'epoch': 6, 'steps': 77}
{'loss/train': 1.3946657180786133, 'completed_steps': 3520, 'lr': [7.132356534476241e-05, 7.132356534476241e-05], 'global_steps': 28160, 'epoch': 6, 'steps': 157}
{'loss/train': 1.4995490312576294, 'completed_steps': 3530, 'lr': [7.01468074574844e-05, 7.01468074574844e-05], 'global_steps': 28240, 'epoch': 6, 'steps': 237}
{'loss/train': 1.8195347785949707, 'completed_steps': 3540, 'lr': [6.897825189257647e-05, 6.897825189257647e-05], 'global_steps': 28320, 'epoch': 6, 'steps': 317}
{'loss/train': 1.4884679317474365, 'completed_steps': 3550, 'lr': [6.781795194277854e-05, 6.781795194277854e-05], 'global_steps': 28400, 'epoch': 6, 'steps': 397}
{'loss/train': 0.9074446558952332, 'completed_steps': 3560, 'lr': [6.666596052432825e-05, 6.666596052432825e-05], 'global_steps': 28480, 'epoch': 6, 'steps': 477}
{'loss/train': 1.16452145

Several commits (37) will be pushed upstream.


{'loss/train': 1.6961241960525513, 'completed_steps': 3600, 'lr': [6.21421251764965e-05, 6.21421251764965e-05], 'global_steps': 28800, 'epoch': 6, 'steps': 797}
{'loss/train': 0.8950042724609375, 'completed_steps': 3610, 'lr': [6.103245681279101e-05, 6.103245681279101e-05], 'global_steps': 28880, 'epoch': 6, 'steps': 877}
{'loss/train': 1.2990057468414307, 'completed_steps': 3620, 'lr': [5.993140643737236e-05, 5.993140643737236e-05], 'global_steps': 28960, 'epoch': 6, 'steps': 957}
{'loss/train': 1.4846739768981934, 'completed_steps': 3630, 'lr': [5.883902426436239e-05, 5.883902426436239e-05], 'global_steps': 29040, 'epoch': 6, 'steps': 1037}
{'loss/train': 1.0364611148834229, 'completed_steps': 3640, 'lr': [5.77553601125641e-05, 5.77553601125641e-05], 'global_steps': 29120, 'epoch': 6, 'steps': 1117}
{'loss/train': 1.263566017150879, 'completed_steps': 3650, 'lr': [5.668046340318905e-05, 5.668046340318905e-05], 'global_steps': 29200, 'epoch': 6, 'steps': 1197}
{'loss/train': 2.0369064

Several commits (38) will be pushed upstream.


{'loss/train': 1.4961575269699097, 'completed_steps': 3700, 'lr': [5.1439193121247116e-05, 5.1439193121247116e-05], 'global_steps': 29600, 'epoch': 6, 'steps': 1597}
{'loss/train': 0.7587864398956299, 'completed_steps': 3710, 'lr': [5.041791636507631e-05, 5.041791636507631e-05], 'global_steps': 29680, 'epoch': 6, 'steps': 1677}
{'loss/train': 1.6942452192306519, 'completed_steps': 3720, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 29760, 'epoch': 6, 'steps': 1757}
{'loss/train': 1.71875, 'completed_steps': 3730, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 29840, 'epoch': 6, 'steps': 1837}
{'loss/train': 0.873186469078064, 'completed_steps': 3740, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 29920, 'epoch': 6, 'steps': 1917}
{'loss/train': 1.249830722808838, 'completed_steps': 3750, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 30000, 'epoch': 6, 'steps': 1997}
{'loss/train': 1.5239977836

Several commits (39) will be pushed upstream.


{'loss/train': 0.9050050377845764, 'completed_steps': 3800, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 30400, 'epoch': 6, 'steps': 2397}
{'loss/train': 1.8786206245422363, 'completed_steps': 3810, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 30480, 'epoch': 6, 'steps': 2477}
{'loss/train': 1.429602861404419, 'completed_steps': 3820, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 30560, 'epoch': 6, 'steps': 2557}
{'loss/train': 1.1819339990615845, 'completed_steps': 3830, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 30640, 'epoch': 6, 'steps': 2637}
{'loss/train': 1.3256292343139648, 'completed_steps': 3840, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 30720, 'epoch': 6, 'steps': 2717}
{'loss/train': 1.185213327407837, 'completed_steps': 3850, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 30800, 'epoch': 6, 'steps': 2797}
{'loss/train': 1.4

Several commits (40) will be pushed upstream.


{'loss/train': 1.373070478439331, 'completed_steps': 3900, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 31200, 'epoch': 6, 'steps': 3197}
{'loss/train': 1.2205897569656372, 'completed_steps': 3910, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 31280, 'epoch': 6, 'steps': 3277}
{'loss/train': 0.8007035851478577, 'completed_steps': 3920, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 31360, 'epoch': 6, 'steps': 3357}
{'loss/train': 1.1174876689910889, 'completed_steps': 3930, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 31440, 'epoch': 6, 'steps': 3437}
{'loss/train': 1.0402395725250244, 'completed_steps': 3940, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 31520, 'epoch': 6, 'steps': 3517}
{'loss/train': 1.4119932651519775, 'completed_steps': 3950, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 31600, 'epoch': 6, 'steps': 3597}
{'loss/train': 1.

Several commits (41) will be pushed upstream.


{'loss/train': 1.4097490310668945, 'completed_steps': 4000, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32000, 'epoch': 6, 'steps': 3997}
{'loss/train': 1.399709701538086, 'completed_steps': 4010, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32080, 'epoch': 6, 'steps': 4077}
{'loss/train': 1.3567140102386475, 'completed_steps': 4020, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32160, 'epoch': 6, 'steps': 4157}
{'loss/train': 1.2883665561676025, 'completed_steps': 4030, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32240, 'epoch': 6, 'steps': 4237}
{'loss/train': 1.278282642364502, 'completed_steps': 4040, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32320, 'epoch': 6, 'steps': 4317}
{'loss/train': 1.1682870388031006, 'completed_steps': 4050, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32400, 'epoch': 6, 'steps': 4397}
{'loss/train': 1.6

  0%|          | 0/4667 [00:00<?, ?it/s]

{'loss/train': 1.2647790908813477, 'completed_steps': 4090, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32720, 'epoch': 7, 'steps': 50}
{'loss/eval': 1.8730721473693848, 'perplexity': 6.508260250091553, 'completed_steps': 4100, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32800, 'epoch': 7, 'steps': 129, 'loss/train': 1.4353737831115723}


Several commits (42) will be pushed upstream.


{'loss/train': 0.7560071349143982, 'completed_steps': 4100, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32800, 'epoch': 7, 'steps': 130}
{'loss/train': 1.2004482746124268, 'completed_steps': 4110, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32880, 'epoch': 7, 'steps': 210}
{'loss/train': 1.0510972738265991, 'completed_steps': 4120, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 32960, 'epoch': 7, 'steps': 290}
{'loss/train': 1.1580150127410889, 'completed_steps': 4130, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 33040, 'epoch': 7, 'steps': 370}
{'loss/train': 1.21113920211792, 'completed_steps': 4140, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 33120, 'epoch': 7, 'steps': 450}
{'loss/train': 1.45094895362854, 'completed_steps': 4150, 'lr': [4.991068836893414e-05, 4.991068836893414e-05], 'global_steps': 33200, 'epoch': 7, 'steps': 530}
{'loss/train': 2.174126148

In [None]:
# Post-training cleanup and log export.
# Release unoccupied cached GPU memory held by PyTorch's caching allocator so it
# is available again before further work in this notebook session.
torch.cuda.empty_cache()
# NOTE(review): add_to_tensorboard is defined earlier in the notebook (not visible
# here). Presumably it reads the JSON training log at drive_train_res_path and
# writes it as TensorBoard data under tensorboard_run_path — confirm against its
# definition. Requires both path variables to be set by the config cell.
add_to_tensorboard(drive_train_res_path, tensorboard_run_path)

In [None]:
# Open the TensorBoard UI inline, pointed at the run directory. IPython expands
# {tensorboard_run_path} to the Python variable's value before the magic runs;
# the magic itself is made available by `%load_ext tensorboard` in the imports cell.
%tensorboard --logdir "{tensorboard_run_path}"