In [None]:
# Install the Python packages this notebook imports (datasets, evaluate,
# transformers with the sentencepiece extra, accelerate) and the git-lfs binary.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve newer
# releases than the ones this notebook was last run with.
# NOTE(review): git-lfs is presumably required by the `Repository` clone/push
# used later in the notebook - confirm.
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!apt install git-lfs

In [None]:
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard

from huggingface_hub import notebook_login, Repository
from datasets import load_dataset, DatasetDict, Dataset
from accelerate import Accelerator
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, pipeline, get_scheduler
from transformers.keras_callbacks import PushToHubCallback

import re
import json
import os
from tqdm import tqdm
from tqdm.notebook import tqdm
from google.colab import drive

### TRAINING


#### HYPER PARAMS, GLOBAL VARS, FUNCTIONS

In [None]:
# HYPER PARAMETERS
# All of these values are also written to the JSON log by save_results().
# Length (in tokens) of every training/validation chunk produced by get_tokenized_dataset().
context_length                 = 1024
# Micro-batch sizes handed to the train / validation DataLoaders.
train_batch_size               = 4
eval_batch_size                = 4
# Weight decay for the parameters that get_wd_parameters() does not exempt.
weight_decay                   = 0.1
# Peak learning rate given to AdamW; the training loop stops stepping the
# scheduler once the learning rate is no longer above 0.1 * lr.
lr                             = 5e-4
lr_scheduler_type              = "linear"    # "linear" | "cosine" | "constant"
# AdamW betas and epsilon.
adamw_b1                       = 0.9
adamw_b2                       = 0.95
adamw_e                        = 1e-8
# Number of scheduler warm-up steps (counted in optimizer steps).
num_warmup_steps               = 50
# Number of micro-batches whose gradients are accumulated per optimizer step.
gradient_accumulation_steps    = 8
gradient_checkpointing         = False
# Evaluation interval in optimizer steps; the training loop multiplies it back by
# gradient_accumulation_steps, so evaluation runs every
# eval_steps * gradient_accumulation_steps micro-batches.
eval_steps                     = 500 // gradient_accumulation_steps
num_train_epochs               = 5
# Passed to accelerate.Accelerator(mixed_precision=...).
mixed_precision                = "fp16"


#GLOBAL VARS
# --- Git identity (applied with `git config --global` in the configuration cell) ---
have_git_write_access          = True                              # set to True if you need to commit changes
user_email                     = "orest.andrusyshyn@ucu.edu.ua"    # only needed if above have_git_write_access == True
user_name                      = "Orest Andrusyshyn"               # only needed if above have_git_write_access == True

# --- Google Drive mount and log destinations ---
drive_mounted                  = True
drive_mounted_path             = '/content/gdrive/'    # only needed if drive_mounted == True
save_json_logs                 = True                  # set to True to save training results into json file (drive_train_res_path)
drive_train_res_path           = drive_mounted_path + 'My Drive/UCU/diploma/training_results/testing_ds_random.json'    # only needed if save_json_logs == True
save_tensorboard_logs          = True                  # set to True to save training results into tensorboard run (tensorboard_run_path)
tensorboard_run_path           = drive_mounted_path + 'My Drive/UCU/diploma/tensorboard_runs/testing_ds_random'                   # only needed if save_tensorboard_logs == True

# --- Raw dataset locations ---
data_archived                  = True    # set to True if you need to unarchive the data
raw_train_archive              = drive_mounted_path + "My Drive/UCU/diploma/datasets/dataset_train.zip"         # only needed if data_archived == True
raw_valid_archive              = drive_mounted_path + "My Drive/UCU/diploma/datasets/dataset_valid.zip"    # only needed if data_archived == True
raw_train_json                 = "./dataset_train.json"       # if data_archived == True, json file will be in the current working directory
raw_valid_json                 = "./dataset_valid.json"       # if data_archived == True, json file will be in the current working directory

# --- Tokenizer: loaded from this Hub repo at this exact revision ---
tokenizer_repo_name            = "Andrusyshyn/gpt2-coq-tokenizer"
tokenizer_commit_hash          = "90c3858771e6120dfc8eab7a6754295964e8d8c3"

# --- Model ---
init_model                     = "gpt2"                                                   # base model Hugging Face name
model_repo_name                = "Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train"
model_output_dir               = "./gpt2-pretrained-for-coq-pt-custom-train-local"        # local dir to save the model

push_to_hub                    = True                 # set to True to push the model
run_name                       = "testing_ds_random"    # branch name (only needed if push_to_hub == True)

# --- Resume state ---
# previously_completed_steps: optimizer steps already done; seeds `completed_steps`
#                             and the number of scheduler fast-forward iterations.
# previous_global_steps:      micro-batches already processed; seeds `global_steps`.
# previous_step:              last dataloader index processed in `stopped_epoch`;
#                             batches with step <= previous_step are skipped there.
# stopped_epoch:              epoch index to resume in; earlier epochs are skipped entirely.
partially_trained              = True                                         # set to True to continue model training from specific commit hash (model_commit_hash)
model_commit_hash              = "6623a63e7f66e075818959abc715d488d393abc3"    # only needed if partially_trained == True
previously_completed_steps     = 682                                          # only needed if partially_trained == True
previous_global_steps          = 5456                                         # only needed if partially_trained == True
previous_step                  = 811                                          # only needed if partially_trained == True
stopped_epoch                  = 1                                             # only needed if partially_trained == True

# --- Seeds ---
# torch_seed is applied once here; data_seed is re-applied at the start of every
# epoch inside the training loop.
torch_seed                     = 7
data_seed                      = 23
torch.manual_seed(torch_seed)

<torch._C.Generator at 0x78feaaf91ad0>

In [None]:
# CUSTOM LOSS FUNCTION
def custom_loss(inputs, logits):
    """Next-token cross-entropy, averaged per sequence and then over the batch.

    Args:
        inputs: token ids, shape [batch_size X cl].
        logits: model outputs, shape [batch_size X cl X vocab_size].

    Returns:
        A scalar tensor: the mean over sequences of each sequence's mean
        per-token loss.
    """
    # Labels start from the second token because the first one has no preceding token.
    # The last logit is dropped because the last token has no subsequent token to predict.
    shifted_labels = inputs[..., 1:].contiguous()
    shifted_logits = logits[..., :-1, :].contiguous()

    # reduction="none" keeps one loss value per token. It replaces the deprecated
    # `reduce=False` argument, which emits a deprecation warning on every call.
    loss_func = CrossEntropyLoss(reduction="none")
    # loss [batch_size * (cl-1)] = loss_func([batch_size * (cl-1) X vocab_size], [batch_size * (cl-1)])
    loss = loss_func(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1))
    # loss_per_sequence [batch_size]
    loss_per_sequence = loss.view(shifted_logits.size(0), shifted_logits.size(1)).mean(dim=1)
    return loss_per_sequence.mean()

In [None]:
# PARAMETERS FOR WEIGHT DECAY
def get_wd_parameters(model, no_decay=["bias", r"ln_.{1,2}\.weight"]):
    """Split the model parameters into two AdamW parameter groups.

    A parameter whose name matches (``re.search``) any pattern in ``no_decay``
    goes into the group with ``weight_decay`` 0.0. Every other parameter goes
    into the group that uses the notebook-level ``weight_decay`` value.

    Args:
        model: object exposing ``named_parameters()`` yielding (name, parameter) pairs.
        no_decay: regular expressions selecting parameters exempt from weight decay.

    Returns:
        A two-element list of parameter-group dicts: decayed group first, exempt group second.
    """
    decayed, exempt = [], []
    for param_name, tensor in model.named_parameters():
        for pattern in no_decay:
            if re.search(pattern, param_name):
                exempt.append(tensor)
                break
        else:
            # No exemption pattern matched this parameter name.
            decayed.append(tensor)

    decayed_group = {"params": decayed, "weight_decay": weight_decay}
    exempt_group = {"params": exempt, "weight_decay": 0.0}
    return [decayed_group, exempt_group]

In [None]:
# TOKENIZE RAW DATASETS
def get_tokenized_dataset(p_raw_dataset, p_context_length, p_tokenizer):
    """Tokenize, concatenate and cut a raw dataset into shuffled fixed-length chunks.

    Every sample's "content" is tokenized without truncation and followed by the
    tokenizer's EOS id. The resulting token stream is cut into consecutive windows
    of ``p_context_length`` tokens, which are visited in an order drawn from
    ``torch.randperm``. A trailing window shorter than ``p_context_length`` is dropped.

    Args:
        p_raw_dataset: iterable of samples exposing a "content" text field.
        p_context_length: number of tokens per chunk.
        p_tokenizer: callable tokenizer returning a mapping with "input_ids";
            must expose ``eos_token_id``.

    Returns:
        ``Dataset.from_dict({"input_ids": [...]})`` holding one tensor per full chunk.
    """
    token_stream = []
    for sample in p_raw_dataset:
        sample_ids = p_tokenizer(sample["content"], truncation=False)["input_ids"]
        token_stream += sample_ids + [p_tokenizer.eos_token_id]

    # Number of windows including a possible short trailing one (ceil division).
    full_chunks, leftover = divmod(len(token_stream), p_context_length)
    chunk_count = full_chunks + 1 if leftover != 0 else full_chunks

    chunks = []
    for chunk_index in torch.randperm(chunk_count).tolist():
        start = chunk_index * p_context_length
        window = token_stream[start : start + p_context_length]
        if len(window) != p_context_length:
            # Only the short trailing window can end up here; it is discarded.
            continue
        chunks.append(torch.tensor(window))

    return Dataset.from_dict({"input_ids": chunks})

In [None]:
# SAVING TRAINING RESULTS TO THE JSON FILE
def save_results(filepath:str, split: str, results: list):
    """Append log entries to the JSON results file and refresh its hyper-parameters.

    Does nothing when the notebook-level ``save_json_logs`` flag is off. Prints an
    error and returns when ``split`` is neither "train" nor "valid".

    File layout:
    {
        "hyperparams": {},
        "train": [
            results1,
            results2,
            ...
        ],
        "valid": [
            results1,
            results2,
            ...
        ]
    }

    Args:
        filepath: path of the JSON file; created if it does not exist.
        split: "train" or "valid" - the list that ``results`` is appended to.
        results: log entries to append.
    """
    if not save_json_logs:
        return
    allowed_splits = {"train", "valid"}
    if split not in allowed_splits:
        print("ERROR: INVALID SPLIT")
        return

    # The branch name and the tensorboard path are only recorded when the
    # corresponding feature is enabled; otherwise an empty string is logged.
    logged_run_name = run_name if push_to_hub else ""
    logged_tensorboard_path = tensorboard_run_path if save_tensorboard_logs else ""

    hyperparams_dict = {
        "context_length": context_length,
        "train_batch_size": train_batch_size,
        "eval_batch_size": eval_batch_size,
        "weight_decay": weight_decay,
        "lr": lr,
        "lr_scheduler_type": lr_scheduler_type,
        "adamw_b1": adamw_b1,
        "adamw_b2": adamw_b2,
        "adamw_e": adamw_e,
        "num_warmup_steps": num_warmup_steps,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "gradient_checkpointing": gradient_checkpointing,
        "eval_steps": eval_steps,
        "num_train_epochs": num_train_epochs,
        "mixed_precision": mixed_precision,
        "tokenizer_repo_name": tokenizer_repo_name,
        "tokenizer_commit_hash": tokenizer_commit_hash,
        "init_model": init_model,
        "model_repo_name": model_repo_name,
        "run_name": logged_run_name,
        "drive_train_res_path": drive_train_res_path,
        "tensorboard_run_path": logged_tensorboard_path,
    }

    # Start from the existing file content when present, otherwise from an empty log.
    if os.path.exists(filepath):
        with open(filepath, 'r') as existing_file:
            json_data = json.load(existing_file)
    else:
        json_data = {"train": [], "valid": []}

    # The hyper-parameters are overwritten on every call with the current values.
    json_data["hyperparams"] = hyperparams_dict
    json_data[split].extend(results)

    with open(filepath, 'w') as output_file:
        json.dump(json_data, output_file, indent=4)

In [None]:
# TENSORBOARD VISUALISATION
def add_to_tensorboard(json_filepath, tensorboard_run_path):
    """Replay the JSON training log into a TensorBoard run.

    Does nothing when the notebook-level ``save_tensorboard_logs`` flag is off.
    Prints an error and returns when ``json_filepath`` does not exist.

    Consecutive "train" entries sharing the same ``completed_steps`` are collapsed
    into one point: the mean of their "loss/train" values, plus the learning rate
    of the first entry of the group. "valid" entries are written one by one.
    The x-axis of every scalar is ``completed_steps`` multiplied by the number of
    tokens per optimizer step (train_batch_size * gradient_accumulation_steps *
    context_length, read from the file's "hyperparams").

    An empty or missing "train" / "valid" list is skipped instead of raising.

    Args:
        json_filepath: path of the JSON file written by ``save_results``.
        tensorboard_run_path: directory handed to ``SummaryWriter``.
    """
    if not save_tensorboard_logs:
        return
    if not os.path.exists(json_filepath):
        print("ERROR: json_filepath DOES NOT EXIST")
        return

    # Read the log completely and release the file before writing any scalars.
    with open(json_filepath, mode="r") as json_file:
        json_data = json.load(json_file)

    hyperparams = json_data["hyperparams"]
    one_step_tokens = hyperparams["train_batch_size"] * hyperparams["gradient_accumulation_steps"] * hyperparams["context_length"]

    writer = SummaryWriter(tensorboard_run_path)
    try:
        def flush_group(step, learning_rate, losses):
            # Emit one averaged point for a finished group of micro-batch entries.
            writer.add_scalar("Loss/Train", sum(losses)/len(losses), step * one_step_tokens)
            writer.add_scalar("Learning Rate", learning_rate, step * one_step_tokens)

        group_step = None
        group_lr = None
        group_losses = []
        for entry in json_data.get("train", []):
            cs = entry["completed_steps"]
            if group_losses and cs != group_step:
                flush_group(group_step, group_lr, group_losses)
                group_losses = []
            if not group_losses:
                # First entry of a new group fixes its step and learning rate.
                group_step = cs
                group_lr = entry["lr"][0]
            group_losses.append(entry["loss/train"])
        if group_losses:
            flush_group(group_step, group_lr, group_losses)

        for entry in json_data.get("valid", []):
            cs = entry["completed_steps"]
            writer.add_scalar("Loss/Eval", entry["loss/eval"], cs * one_step_tokens)
            writer.add_scalar("Perplexity/Eval", entry["perplexity"], cs * one_step_tokens)
    finally:
        # Always release the writer, even when a malformed entry raises.
        writer.close()

#### CONFIGURING

In [None]:
# MOUNTING DRIVE FOR SAVING LOGS
# The dataset archives, the JSON log and the tensorboard run all live under
# drive_mounted_path, so the mount is skipped only when drive_mounted is False.
if drive_mounted:
    drive.mount(drive_mounted_path)

Mounted at /content/gdrive/


In [None]:
# CONFIGURING GIT CREDENTIALS
# Sets the global git author identity from user_email / user_name.
if have_git_write_access:
    !git config --global user.email "{user_email}"
    !git config --global user.name "{user_name}"

# To set the Hugging Face token (for write access), create an HF_TOKEN secret in Google Colab or use notebook_login()

In [None]:
# CONFIGURING GIT DIRECTORIES
# Clone the model repo into model_output_dir and switch to the branch named
# run_name (create_branch_ok=True allows the branch to be created).
# `repo` is later used by the training loop to push checkpoints.
if push_to_hub:
    repo = Repository(model_output_dir, clone_from=model_repo_name)
    repo.git_checkout(run_name, create_branch_ok=True)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/./gpt2-pretrained-for-coq-pt-custom-train-local is already a clone of https://huggingface.co/Andrusyshyn/gpt2-pretrained-for-coq-pt-custom-train. Make sure you pull the latest changes with `repo.git_pull()`.
Checked out testing_ds_random from testing_ds_random.
Your branch is up to date with 'origin/testing_ds_random'.




#### DATASETS

In [None]:
# UNPACK DATASETS
# The archive type is chosen by file extension: ".gz" goes through gzip, ".zip" through unzip.
# unzip extracts into the current working directory, which is where
# raw_train_json / raw_valid_json point.
# NOTE(review): `gzip -dk` writes next to the archive (i.e. on Drive), not into the
# working directory - confirm the json paths if .gz archives are used.
if data_archived:
    if raw_train_archive.endswith(".gz"):
        !gzip -dkv "{raw_train_archive}"
    if raw_train_archive.endswith(".zip"):
        !unzip "{raw_train_archive}"
    if raw_valid_archive.endswith(".gz"):
        !gzip -dkv "{raw_valid_archive}"
    if raw_valid_archive.endswith(".zip"):
        !unzip "{raw_valid_archive}"

Archive:  /content/gdrive/My Drive/UCU/diploma/datasets/dataset_train.zip
  inflating: dataset_train.json      
Archive:  /content/gdrive/My Drive/UCU/diploma/datasets/dataset_valid.zip
  inflating: dataset_valid.json      


In [None]:
# LOAD DATASETS
# Both JSON files are read with field="data", i.e. the records are taken from
# the top-level "data" key of each file.
data_files = {"train": raw_train_json, "validation": raw_valid_json}
raw_datasets = load_dataset("json", data_files=data_files, field="data")
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['filepath', 'content'],
        num_rows: 4972
    })
    validation: Dataset({
        features: ['filepath', 'content'],
        num_rows: 742
    })
})


In [None]:
# LOADING TOKENIZER
# The tokenizer is pinned to a specific commit of its Hub repo.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo_name, revision=tokenizer_commit_hash)
print("tokenizer vocab size: ", len(tokenizer))
# TOKENIZE RAW DATASETS
# Each split becomes a dataset of fixed-length (context_length) chunks of token ids.
train_dataset = get_tokenized_dataset(raw_datasets["train"],      context_length, tokenizer)
valid_dataset = get_tokenized_dataset(raw_datasets["validation"], context_length, tokenizer)
train_dataset.set_format("torch")
valid_dataset.set_format("torch")

# CREATE DATALOADERS
# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=eval_batch_size)
print(train_dataset)
print(valid_dataset)
print("len(train_dataloader): ", len(train_dataloader))
print("len(valid_dataloader): ", len(valid_dataloader))

Token indices sequence length is longer than the specified maximum sequence length for this model (2233 > 1024). Running this sequence through the model will result in indexing errors


tokenizer vocab size:  50000
Dataset({
    features: ['input_ids'],
    num_rows: 18574
})
Dataset({
    features: ['input_ids'],
    num_rows: 3638
})
len(train_dataloader):  4644
len(valid_dataloader):  910


#### CONFIGURING MODEL AND TRAINING

In [None]:
# CONFIGURING MODEL
# Either resume from a pinned commit of the model repo, or build a new GPT-2
# from the init_model config resized to the tokenizer's vocabulary.
model = None
if partially_trained:
    print("Loading partially trained model")
    model = GPT2LMHeadModel.from_pretrained(model_repo_name, revision=model_commit_hash)
else:
    print("Training from scratch")
    config = AutoConfig.from_pretrained(
        init_model,
        vocab_size=len(tokenizer),
        n_ctx=context_length,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    model = GPT2LMHeadModel(config)
print()
if gradient_checkpointing:
    model.gradient_checkpointing_enable()

# AdamW with two parameter groups (see get_wd_parameters): decayed and non-decayed.
# NOTE(review): when resuming, only the model weights are loaded here - the
# optimizer is created fresh, so its moment estimates are not restored.
optimizer = AdamW(get_wd_parameters(model), lr=lr, betas=(adamw_b1, adamw_b2), eps=adamw_e)
accelerator = Accelerator(mixed_precision=mixed_precision)
# This rebinds the global dataloaders to the accelerator-prepared versions.
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, valid_dataloader
)

# Micro-batches per epoch, and total optimizer steps for the whole run
# (ceil of total micro-batches / gradient_accumulation_steps).
num_steps_per_epoch = len(train_dataloader)
print("Num steps per epoch: ", num_steps_per_epoch)
num_training_completed_steps = (num_train_epochs * num_steps_per_epoch) // gradient_accumulation_steps
if ((num_train_epochs * num_steps_per_epoch) % gradient_accumulation_steps != 0):
    num_training_completed_steps += 1
print("Num optimizer steps: ", num_training_completed_steps)


# Build the learning-rate schedule selected by the `lr_scheduler_type`
# hyper-parameter ("linear" | "cosine" | "constant"). The name used to be
# hard-coded to "linear", so changing the hyper-parameter had no effect even
# though its value was written to the JSON logs.
# Warm-up and total length are both counted in optimizer steps.
lr_scheduler = get_scheduler(
    name=lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_completed_steps
)

# When resuming, replay the scheduler for the optimizer steps that were already
# completed. The stepping condition mirrors the one in the training loop: step
# during warm-up, then only while the learning rate is still above 0.1 * lr.
if partially_trained:
    with torch.no_grad():
        for ind in range(previously_completed_steps):
            if ((ind < num_warmup_steps) or (lr_scheduler.get_lr()[0] > (0.1 * lr))):
                lr_scheduler.step()

# Report parameter counts: all, trainable only, and trainable excluding embeddings.
print()
print(f"Model size:                                        {model.num_parameters()}")
print(f"Model size (only trainable params):                {model.num_parameters(only_trainable=True)}")
print(f"Model size (only trainable non-embeddings params): {model.num_parameters(only_trainable=True, exclude_embeddings=True)}")

Loading partially trained model


config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]


Num steps per epoch:  4644
Num optimizer steps:  2903

Model size:                                        124242432
Model size (only trainable params):                124242432
Model size (only trainable non-embeddings params): 85056000


In [None]:
# EVALUATION FUNCTION
def evaluate():
    """Compute the validation loss and perplexity of the global ``model``.

    Switches ``model`` to eval mode and runs every batch of the global
    ``valid_dataloader`` through it under ``torch.no_grad()``, scoring each batch
    with ``custom_loss``. The mode is NOT switched back: callers are responsible
    for calling ``model.train()`` afterwards.

    Returns:
        (loss, perplexity) as Python floats. ``loss`` is the mean of the
        per-batch losses and ``perplexity`` is ``exp(loss)``; it is ``inf`` when
        the exponential overflows. Both are ``nan`` if the dataloader yields no
        batches.
    """
    torch.cuda.empty_cache()

    model.eval()
    losses = []
    with torch.no_grad():
        for batch in valid_dataloader:
            logits = model(batch["input_ids"]).logits
            losses.append(custom_loss(batch["input_ids"], logits).item())
    loss = torch.mean(torch.Tensor(losses))
    # torch.exp saturates to inf instead of raising, so no OverflowError handling
    # is needed (the old except-branch was unreachable, and it would have bound a
    # plain float that then failed on `.item()`).
    perplexity = torch.exp(loss)

    torch.cuda.empty_cache()
    return loss.item(), perplexity.item()

In [None]:
# TRAINING LOOP
# Counters:
#   global_steps    - number of micro-batches processed (forward/backward passes)
#   completed_steps - number of optimizer steps taken
# Train log entries are buffered in log_buffer and flushed to JSON at every evaluation.
log_buffer = []

model.train()
completed_steps = 0
global_steps = 0
if partially_trained:
    completed_steps = previously_completed_steps
    global_steps = previous_global_steps
for epoch in range(num_train_epochs):
    # This is needed when continuing training from checkpoint to prevent training
    # multiple times on the same batch during one epoch
    # The seed is reset at the start of every epoch and the dataloader is then
    # iterated `epoch` times with the batches discarded.
    # NOTE(review): presumably this advances the shuffling RNG so that epoch k always
    # sees the same permutation regardless of where training was resumed - confirm.
    torch.manual_seed(data_seed)
    for _ in range(epoch):
        for step, batch in enumerate(train_dataloader, start=0):
            pass
    for step, batch in tqdm(
        enumerate(train_dataloader, start=0), total=num_steps_per_epoch
    ):
        # When resuming, skip every batch up to and including the last one
        # processed before the checkpoint.
        if partially_trained and ((epoch<stopped_epoch) or ((epoch==stopped_epoch) and (step <= previous_step))):
            continue

        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
################################################################################
        # One log entry per micro-batch; the loss is logged before it is scaled below.
        log_train = {
                "loss/train": loss.item(),
                "completed_steps": completed_steps,
                "lr": lr_scheduler.get_lr(),
                "global_steps" : global_steps,
                "epoch": epoch,
                "steps": step,
        }
        # `completed_steps % 1 == 0` is always true, so this prints the first
        # micro-batch of every accumulation window.
        if ((completed_steps % 1 == 0) and (global_steps % gradient_accumulation_steps == 0)):
            accelerator.print(log_train)
        if save_json_logs:
            log_buffer.append(log_train)
################################################################################
        # Scale the loss so the accumulated gradient is the average over the window.
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        global_steps += 1
################################################################################
        # Optimizer step once a full accumulation window has been processed.
        if global_steps % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # The scheduler is stepped during warm-up and afterwards only while the
            # learning rate is above 0.1 * lr, which keeps it from decaying further.
            if ((completed_steps < num_warmup_steps) or (lr_scheduler.get_lr()[0] > (0.1 * lr))):
                lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        # Periodic checkpoint: every eval_steps optimizer steps, expressed in micro-batches.
        if (global_steps % (eval_steps * gradient_accumulation_steps)) == 0:
            if save_json_logs:
                save_results(drive_train_res_path, "train", log_buffer)
                log_buffer = []
            # evaluate() leaves the model in eval mode; model.train() is restored below.
            eval_loss, perplexity = evaluate()
            log_eval = {
                    "loss/eval": eval_loss,
                    "perplexity": perplexity,
                    "completed_steps": completed_steps,
                    "lr": lr_scheduler.get_lr(),
                    "global_steps" : global_steps,
                    "epoch": epoch,
                    "steps": step,
                    # `loss` was divided by gradient_accumulation_steps above; undo that for logging.
                    "loss/train": loss.item() * gradient_accumulation_steps,
            }
            accelerator.print(log_eval)
            save_results(drive_train_res_path, "valid", [log_eval])
            accelerator.wait_for_everyone()
            # Save the unwrapped model and the tokenizer into the local repo clone.
            # NOTE(review): the save is not guarded by is_main_process; only the push is.
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(model_output_dir, save_function=accelerator.save)
            tokenizer.save_pretrained(model_output_dir)
            if accelerator.is_main_process:
                if push_to_hub:
                    # blocking=False: the push is started without waiting for it to finish.
                    repo.push_to_hub(
                        commit_message=f"Training in progress: completed_steps {completed_steps}; global_steps {global_steps};\
                                         epoch {epoch}; steps {step}; lr {lr_scheduler.get_lr()};\
                                         loss/eval {eval_loss}; perplexity {perplexity}; loss/train {loss.item() * gradient_accumulation_steps}",
                        blocking=False
                    )
            model.train()

################################################################################
################################################################################
################################################################################

#GRADIENT UPDATE (In case (global_steps % gradient_accumulation_steps != 0)
# If the last accumulation window is incomplete after the final epoch, draw extra
# micro-batches from a new pass over train_dataloader until the window is full,
# then take one last optimizer step and stop.
# last_eval_log_train_loss stays 0 when no extra micro-batches are needed.
# NOTE(review): `epoch` below is the value left over from the training loop above.
last_eval_log_train_loss = 0
if (global_steps % gradient_accumulation_steps != 0):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=0), total=(gradient_accumulation_steps - (global_steps % gradient_accumulation_steps)) - 1    # -1 here is purely for better visualisation of tqdm progress bar
    ):
        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
        last_eval_log_train_loss = loss.item()
        log_train = {
                "loss/train": loss.item(),
                "completed_steps": completed_steps,
                "lr": lr_scheduler.get_lr(),
                "global_steps" : global_steps,
                "epoch": epoch,
                # Step index continues past the end of the last epoch.
                "steps": step + (num_train_epochs * num_steps_per_epoch),
        }
        if save_json_logs:
            log_buffer.append(log_train)
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        global_steps += 1
        if global_steps % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Same scheduler stepping rule as in the main training loop.
            if ((completed_steps < num_warmup_steps) or (lr_scheduler.get_lr()[0] > (0.1 * lr))):
                lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
            # The window is complete: stop drawing extra batches.
            break

################################################################################
################################################################################
################################################################################

# FINAL EVALUATE AND SAVE
# additional_steps is the number of micro-batches still missing from the current
# accumulation window; it is 0 when global_steps is a multiple of
# gradient_accumulation_steps.
additional_steps = 0
if (global_steps % gradient_accumulation_steps != 0):
    additional_steps = gradient_accumulation_steps - (global_steps % gradient_accumulation_steps)
# Take the loss of a single training batch (no gradients) for the final train log entry.
# NOTE(review): the model has not been switched to eval mode at this point - confirm
# whether that is intended for this measurement.
with torch.no_grad():
    last_train_loss = 0
    for batch in train_dataloader:
        logits = model(batch["input_ids"]).logits
        loss = custom_loss(batch["input_ids"], logits)
        last_train_loss = loss.item()
        break
log_train = {
        "loss/train": last_train_loss,
        "completed_steps": completed_steps,
        "lr": lr_scheduler.get_lr(),
        "global_steps" : global_steps,
        "epoch": epoch,
        "steps": (num_train_epochs * num_steps_per_epoch) + additional_steps,
}
# Flush the remaining buffered train entries together with the final one.
if save_json_logs:
    log_buffer.append(log_train)
    save_results(drive_train_res_path, "train", log_buffer)
    log_buffer = []
accelerator.print(log_train)

# Final validation pass; evaluate() leaves the model in eval mode.
eval_loss, perplexity = evaluate()
log_eval = {
        "loss/eval": eval_loss,
        "perplexity": perplexity,
        "completed_steps": completed_steps,
        "lr": lr_scheduler.get_lr(),
        "global_steps" : global_steps,
        "epoch": epoch,
        "steps": (num_train_epochs * num_steps_per_epoch) + additional_steps - 1,
        # Loss of the last extra micro-batch from the leftover gradient update;
        # it is 0 when that update did not run.
        "loss/train": last_eval_log_train_loss,
}
accelerator.print(log_eval)
save_results(drive_train_res_path, "valid", [log_eval])

# Save the final model and tokenizer locally, then push from the main process only.
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(model_output_dir, save_function=accelerator.save)
tokenizer.save_pretrained(model_output_dir)
if accelerator.is_main_process:
    if push_to_hub:
        # blocking=False: the push is started without waiting for it to finish.
        repo.push_to_hub(
            commit_message=f"Final model: completed_steps {completed_steps}; global_steps {global_steps};\
                             epoch {epoch}; steps {(num_train_epochs * num_steps_per_epoch) + additional_steps - 1}; lr {lr_scheduler.get_lr()};\
                             loss/eval {eval_loss}; perplexity {perplexity}; loss/train {last_eval_log_train_loss}",
            blocking=False
        )
model.train()

torch.cuda.empty_cache()

  0%|          | 0/4644 [00:00<?, ?it/s]

  0%|          | 0/4644 [00:00<?, ?it/s]



{'loss/train': 2.859977960586548, 'completed_steps': 682, 'lr': [0.0003892393971258324, 0.0003892393971258324], 'global_steps': 5456, 'epoch': 1, 'steps': 812}
{'loss/train': 2.582609176635742, 'completed_steps': 683, 'lr': [0.00038906414300736073, 0.00038906414300736073], 'global_steps': 5464, 'epoch': 1, 'steps': 820}
{'loss/train': 3.084263801574707, 'completed_steps': 684, 'lr': [0.0003888888888888889, 0.0003888888888888889], 'global_steps': 5472, 'epoch': 1, 'steps': 828}
{'loss/train': 2.960275173187256, 'completed_steps': 685, 'lr': [0.0003887136347704171, 0.0003887136347704171], 'global_steps': 5480, 'epoch': 1, 'steps': 836}
{'loss/train': 3.39548397064209, 'completed_steps': 686, 'lr': [0.00038853838065194536, 0.00038853838065194536], 'global_steps': 5488, 'epoch': 1, 'steps': 844}
{'loss/train': 2.811929225921631, 'completed_steps': 687, 'lr': [0.00038836312653347355, 0.00038836312653347355], 'global_steps': 5496, 'epoch': 1, 'steps': 852}
{'loss/train': 3.218229293823242, '

Several commits (14) will be pushed upstream.


{'loss/train': 2.2700326442718506, 'completed_steps': 744, 'lr': [0.00037837364178058185, 0.00037837364178058185], 'global_steps': 5952, 'epoch': 1, 'steps': 1308}
{'loss/train': 2.2608296871185303, 'completed_steps': 745, 'lr': [0.00037819838766211004, 0.00037819838766211004], 'global_steps': 5960, 'epoch': 1, 'steps': 1316}
{'loss/train': 2.9898009300231934, 'completed_steps': 746, 'lr': [0.0003780231335436383, 0.0003780231335436383], 'global_steps': 5968, 'epoch': 1, 'steps': 1324}
{'loss/train': 2.668977737426758, 'completed_steps': 747, 'lr': [0.00037784787942516654, 0.00037784787942516654], 'global_steps': 5976, 'epoch': 1, 'steps': 1332}
{'loss/train': 3.7207179069519043, 'completed_steps': 748, 'lr': [0.00037767262530669473, 0.00037767262530669473], 'global_steps': 5984, 'epoch': 1, 'steps': 1340}
{'loss/train': 3.069397449493408, 'completed_steps': 749, 'lr': [0.0003774973711882229, 0.0003774973711882229], 'global_steps': 5992, 'epoch': 1, 'steps': 1348}
{'loss/train': 2.78328

Several commits (15) will be pushed upstream.


{'loss/train': 2.249023199081421, 'completed_steps': 806, 'lr': [0.0003675078864353312, 0.0003675078864353312], 'global_steps': 6448, 'epoch': 1, 'steps': 1804}
{'loss/train': 2.6038870811462402, 'completed_steps': 807, 'lr': [0.0003673326323168594, 0.0003673326323168594], 'global_steps': 6456, 'epoch': 1, 'steps': 1812}
{'loss/train': 2.694669723510742, 'completed_steps': 808, 'lr': [0.00036715737819838765, 0.00036715737819838765], 'global_steps': 6464, 'epoch': 1, 'steps': 1820}
{'loss/train': 2.3287506103515625, 'completed_steps': 809, 'lr': [0.0003669821240799159, 0.0003669821240799159], 'global_steps': 6472, 'epoch': 1, 'steps': 1828}
{'loss/train': 2.8866031169891357, 'completed_steps': 810, 'lr': [0.0003668068699614441, 0.0003668068699614441], 'global_steps': 6480, 'epoch': 1, 'steps': 1836}
{'loss/train': 2.6179919242858887, 'completed_steps': 811, 'lr': [0.00036663161584297234, 0.00036663161584297234], 'global_steps': 6488, 'epoch': 1, 'steps': 1844}
{'loss/train': 2.466934919

Several commits (16) will be pushed upstream.


{'loss/train': 2.4696412086486816, 'completed_steps': 868, 'lr': [0.00035664213109008064, 0.00035664213109008064], 'global_steps': 6944, 'epoch': 1, 'steps': 2300}
{'loss/train': 2.9418015480041504, 'completed_steps': 869, 'lr': [0.00035646687697160883, 0.00035646687697160883], 'global_steps': 6952, 'epoch': 1, 'steps': 2308}
{'loss/train': 2.695702075958252, 'completed_steps': 870, 'lr': [0.000356291622853137, 0.000356291622853137], 'global_steps': 6960, 'epoch': 1, 'steps': 2316}
{'loss/train': 2.475986957550049, 'completed_steps': 871, 'lr': [0.0003561163687346653, 0.0003561163687346653], 'global_steps': 6968, 'epoch': 1, 'steps': 2324}
{'loss/train': 2.2626519203186035, 'completed_steps': 872, 'lr': [0.0003559411146161935, 0.0003559411146161935], 'global_steps': 6976, 'epoch': 1, 'steps': 2332}
{'loss/train': 2.3377606868743896, 'completed_steps': 873, 'lr': [0.0003557658604977217, 0.0003557658604977217], 'global_steps': 6984, 'epoch': 1, 'steps': 2340}
{'loss/train': 3.15747022628

In [None]:
# Convert the JSON training log on Drive into a TensorBoard run.
# This is a no-op when save_tensorboard_logs is False.
torch.cuda.empty_cache()
add_to_tensorboard(drive_train_res_path, tensorboard_run_path)

In [None]:
# Open TensorBoard inline on the run directory written by add_to_tensorboard().
%tensorboard --logdir "{tensorboard_run_path}"