## Imports & Definitions

In [2]:
import warnings

warnings.filterwarnings(
    "ignore",
    message="torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.",
)

warnings.filterwarnings(
    "ignore",
    message="torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.",
)


In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import sys
sys.path.append("../../utils")
from definitions import *

In [13]:
! ls ../../../../

LongLoRA-diploma-research
OpenBookQA
YandexGPT-api-call_ru.ipynb
cache
dataflow_en.ipynb
gpt-week
long_context_LLMs
modelcache
nlp_course
venv
view_machine.ipynb
wandb
wandb_try.ipynb


In [5]:
CACHE_DIR = Path("../../../../cache/")
DATASET_DIR = Path("/home/jupyter/mnt/datasets/diplomas/russian_dataset/")

## Example of Fine-Tuning

### Helper imports & definitions

In [6]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np



In [7]:
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
    "prompt_no_input_llama2":(
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} [/INST]"
    ),
    "prompt_input_llama2": (
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} \n{input} [/INST]"
    ),
    "prompt_llama2": "[INST]{instruction}[/INST]",
    "prompt_input_diploma_special":(
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\nBelow is a diploma text. Your task is to generate abstract of this diploma.\n\n### Input:\n{input}\n\n### Response:"
    ),
}

In [8]:
from typing import Dict, Optional, Sequence

def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Preprocess the data by tokenizing."""
    examples = [s + t for s, t in zip(sources, targets)]
    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label[:source_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)

In [9]:
from torch.utils.data import Dataset
import logging

class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, nrows: int, diploma_prefix_len: int):
        super(SupervisedDataset, self).__init__()
        logging.warning("Loading data...")
        data_table = pd.read_csv(data_path)
        data_table = data_table.sample(min(len(data_table), nrows))

        logging.warning("Formatting inputs...")

        prompt_input_diploma = PROMPT_DICT["prompt_input_diploma_special"]
        sources = [
            prompt_input_diploma.format(input=diploma[:diploma_prefix_len])
            for diploma in data_table["diploma"]
        ]

        targets = [f"{abstract}{tokenizer.eos_token}" for abstract in data_table["abstract"]]

        logging.warning("Tokenizing inputs... This may take some time...")
        data_dict = preprocess(sources, targets, tokenizer)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])

### Downloading model & tokenizer

In [10]:
model_name = LLAMA_2_7B

In [11]:
model = AutoModelForCausalLM.from_pretrained(HUGGINGFACE_MODEL_TO_REPO[model_name], cache_dir=CACHE_DIR, device_map='auto')

Loading checkpoint shards: 100%|██████████| 2/2 [03:39<00:00, 109.78s/it]


In [12]:
model.device

device(type='cuda', index=0)

In [13]:
MODEL_MAX_LENGTH = 16384

In [14]:
tokenizer = AutoTokenizer.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[model_name], 
    cache_dir=CACHE_DIR, 
    model_max_length=MODEL_MAX_LENGTH,
    padding_side="right",
    use_fast=True)

In [15]:
tokenizer.model_max_length

16384

### Add new tokens

In [16]:
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg

special_tokens_dict = dict()
if tokenizer.pad_token is None:
    special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
    special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
    special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
)

Using pad_token, but it is not set yet.


### Load train/val/test datasets

In [17]:
train_dataset = SupervisedDataset(DATASET_DIR.joinpath("russian_dataset_train.csv"), tokenizer, nrows=720 * 3, diploma_prefix_len=8000)



In [18]:
train_dataset.__len__()

2160

In [19]:
val_dataset = SupervisedDataset(DATASET_DIR.joinpath("russian_dataset_val.csv"), tokenizer,  nrows=720 * 3, diploma_prefix_len=8000)
val_dataset.__len__()



1397

In [20]:
test_dataset = SupervisedDataset(DATASET_DIR.joinpath("russian_dataset_test.csv"), tokenizer,  nrows=10, diploma_prefix_len=8000)



### Ensure that diploma_prefix_len is ok

In [21]:
list(set(test_dataset[9]["labels"].tolist())).__len__() > 1

True

### View how dataset built

In [56]:
text_labels = tokenizer.decode(test_dataset[9]["input_ids"])
print(text_labels)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
Санкт-Петербургский государственный университет


АВРАМЕНКО Полина Андреевна
Выпускная квалификационная работа
Веб-туны как часть южнокорейской культуры в XXI веке (на примере романтических историй) 
Уровень образования: магистратура
Направление 58.04.01 «Востоковедение и африканистика»
Основная образовательная программа BM.5808 «Культура народов Азии и Африки (с изучением языков Азии и Африки)»


Научный руководитель:
доцент, Кафедра корееведения, Санкт-Петербургский государственный университет Гурьева Анастасия Александровна

Рецензент:
приглашенный преподаватель, Кафедра корееведения, Санкт-Петербургская школа социальных наук и востоковедения,
доцент, Санкт-Петербургский филиал федерального государственного автономного 

In [61]:
unignored_tokens = []
for token in test_dataset[9]["labels"]:
    if token != IGNORE_INDEX:
        unignored_tokens.append(token)
text_labels = tokenizer.decode(unignored_tokens)
print(text_labels)

Данная выпускная квалификационная работа посвящена одному из основных элементов, формирующих массовый культурный контент Республики Корея - веб-тунам (webtoon) – цифровым комиксам, появившимся в начале XXI века. Целью работы является выявление места веб-тунов в южнокорейской культуре, а также их культурной специфики. Актуальность исследования обусловлена тем, что в наши дни в Южной Корее к веб-тунам наблюдается повышенный интерес общества. В ходе исследования был собран, изучен и систематизирован материал об истории манхва как предшественника веб-тунов. Были рассмотрены этапы развития веб-тунов, причины популярности и основные характеристики. В качестве материала для исследования были выбраны и проанализированы три популярных южнокорейских веб-туна. Посредством анализа была выявлена специфика подачи материала и связь с культурой. Благодаря анализу удалось выявить взаимосвязь веб-тунов с традиционной литературой и ролью текста в корейской культуре и традиционным распределением ролей.</s

### Generating with raw model before learning

In [21]:
# inference train sample
model.eval()
prefix_len = np.sum(np.array(train_dataset[9]["labels"]) == IGNORE_INDEX)
prefix_tokens = train_dataset[9]["input_ids"][:prefix_len]
generated = model.generate(prefix_tokens.reshape((1, -1)).to(device))
generated_text = tokenizer.decode(generated.to('cpu').flatten())
print(generated_text)



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

In [62]:
prefix_len = np.sum(np.array(test_dataset[9]["labels"]) == IGNORE_INDEX)
prefix_len

443

In [63]:
model.eval()
prefix_tokens = test_dataset[9]["input_ids"][:prefix_len]
text = tokenizer.decode(prefix_tokens)
print(text)
print()
generated = model.generate(prefix_tokens.reshape((1, -1)).to(device))
generated_text = tokenizer.decode(generated.to('cpu').flatten())
print(generated_text)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
Санкт-Петербургский государственный университет


АВРАМЕНКО Полина Андреевна
Выпускная квалификационная работа
Веб-туны как часть южнокорейской культуры в XXI веке (на примере романтических историй) 
Уровень образования: магистратура
Направление 58.04.01 «Востоковедение и африканистика»
Основная образовательная программа BM.5808 «Культура народов Азии и Африки (с изучением языков Азии и Африки)»


Научный руководитель:
доцент, Кафедра корееведения, Санкт-Петербургский государственный университет Гурьева Анастасия Александровна

Рецензент:
приглашенный преподаватель, Кафедра корееведения, Санкт-Петербургская школа социальных наук и востоковедения,
доцент, Санкт-Петербургский филиал федерального государственного автономного 

### Train model

In [22]:
model_type = "llama" # default
if model_type == "gpt-neox":
    # added `dense` to match with llama as the basic LoRA would only target 'query_key_value'
    targets = ["query_key_value", "dense"]
else:
    targets=["q_proj", "k_proj", "v_proj", "o_proj"]

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=targets,
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM",
)

In [23]:
peft_model = get_peft_model(model, config)

In [24]:
trainable_params = "embed,norm"

In [25]:
[p.requires_grad_() for n, p in peft_model.named_parameters() if any([k in n for k in trainable_params.split(",")])]
pass

In [26]:
peft_model.config.use_cache = False         # required for gradient checkpointing
peft_model.enable_input_require_grads()     # required for gradient checkpointing
peft_model.gradient_checkpointing_enable()  # enable gradient checkpointing

In [27]:
OUTPUT_DIR = "output_dir_8k_big_data"

In [28]:
from dataclasses import dataclass, field

@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

In [29]:
# from accelerate.utils import DistributedType

@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=8192 * 4,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    use_flash_attn: bool = field(
        default=True,
        metadata={"help": "Whether use flash attention for training."},
    )
    use_full_attn: bool = field(
        default=False,
        metadata={"help": "Whether to use plain, full-attention for training."},
    )
    low_rank_training: bool = field(
        default=True,
        metadata={"help": "Whether use low rank adaptation for training."},
    )
    trainable_params: str = field(
        default="embed,norm",
        metadata={"help": "Additional trainable parameters except LoRA weights, if low rank training."},
    )
    
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    bf16=True,
    use_flash_attn=True,
    low_rank_training=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    evaluation_strategy="no",
    # save_strategy="steps",
    # save_steps=1,
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.0,
    warmup_steps=20,
    lr_scheduler_type="constant_with_warmup",
    logging_steps=1,
    deepspeed="ds_configs/stage2.json",
    tf32=True,
    report_to=['tensorboard'],
)
# training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED
training_args



In [31]:
trainer = Trainer(
    model=peft_model, 
    tokenizer=tokenizer, 
    args=training_args, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset, 
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [54]:
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)

  0%|          | 0/1350 [00:00<?, ?it/s][A
                                                  [A
  0%|          | 1/1350 [01:11<8:49:28, 23.55s/it]
  0%|          | 1/1350 [00:23<8:49:23, 23.55s/it][A

{'loss': 1.6495, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}



                                                  [A
  0%|          | 1/1350 [01:32<8:49:28, 23.55s/it]
  0%|          | 2/1350 [00:44<8:20:34, 22.28s/it][A

{'loss': 1.0973, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}



                                                  [A
  0%|          | 1/1350 [01:52<8:49:28, 23.55s/it]
  0%|          | 3/1350 [01:05<8:00:47, 21.42s/it][A

{'loss': 1.1142, 'learning_rate': 3e-06, 'epoch': 0.01}



                                                  [A
  0%|          | 1/1350 [02:13<8:49:28, 23.55s/it]
  0%|          | 4/1350 [01:26<7:57:38, 21.29s/it][A

{'loss': 1.5671, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}



                                                  [A
  0%|          | 1/1350 [02:32<8:49:28, 23.55s/it]
  0%|          | 5/1350 [01:44<7:32:12, 20.17s/it][A

{'loss': 1.242, 'learning_rate': 5e-06, 'epoch': 0.02}



                                                  [A
  0%|          | 1/1350 [02:52<8:49:28, 23.55s/it]
  0%|          | 6/1350 [02:04<7:32:43, 20.21s/it][A

{'loss': 1.0608, 'learning_rate': 6e-06, 'epoch': 0.02}



                                                  [A
  0%|          | 1/1350 [03:15<8:49:28, 23.55s/it]
  1%|          | 7/1350 [02:27<7:51:26, 21.06s/it][A

{'loss': 0.986, 'learning_rate': 7e-06, 'epoch': 0.03}



                                                  [A
  0%|          | 1/1350 [03:33<8:49:28, 23.55s/it]
  1%|          | 8/1350 [02:45<7:29:34, 20.10s/it][A

{'loss': 1.1857, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.03}



                                                  [A
  0%|          | 1/1350 [03:53<8:49:28, 23.55s/it]
  1%|          | 9/1350 [03:06<7:33:08, 20.27s/it][A

{'loss': 1.564, 'learning_rate': 9e-06, 'epoch': 0.03}



                                                  ][A
  0%|          | 1/1350 [04:13<8:49:28, 23.55s/it] 
  1%|          | 10/1350 [03:25<7:25:57, 19.97s/it][A

{'loss': 1.1665, 'learning_rate': 1e-05, 'epoch': 0.04}



                                                  ][A
  0%|          | 1/1350 [04:33<8:49:28, 23.55s/it] 
  1%|          | 11/1350 [03:45<7:26:53, 20.02s/it][A

{'loss': 1.0275, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.04}



                                                  ][A
  0%|          | 1/1350 [04:52<8:49:28, 23.55s/it] 
  1%|          | 12/1350 [04:04<7:17:25, 19.62s/it][A

{'loss': 1.2889, 'learning_rate': 1.2e-05, 'epoch': 0.04}



                                                  ][A
  0%|          | 1/1350 [05:10<8:49:28, 23.55s/it] 
  1%|          | 13/1350 [04:22<7:08:41, 19.24s/it][A

{'loss': 0.96, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.05}



                                                  ][A
  0%|          | 1/1350 [05:29<8:49:28, 23.55s/it] 
  1%|          | 14/1350 [04:41<7:05:10, 19.09s/it][A

{'loss': 0.8579, 'learning_rate': 1.4e-05, 'epoch': 0.05}



                                                  ][A
  0%|          | 1/1350 [05:48<8:49:28, 23.55s/it] 
  1%|          | 15/1350 [05:01<7:07:21, 19.21s/it][A

{'loss': 1.5223, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.06}



                                                  ][A
  0%|          | 1/1350 [06:05<8:49:28, 23.55s/it] 
  1%|          | 16/1350 [05:18<6:52:54, 18.57s/it][A

{'loss': 1.0516, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.06}



                                                  ][A
  0%|          | 1/1350 [06:23<8:49:28, 23.55s/it] 
  1%|▏         | 17/1350 [05:36<6:48:06, 18.37s/it][A

{'loss': 1.0442, 'learning_rate': 1.7e-05, 'epoch': 0.06}



                                                  ][A
  0%|          | 1/1350 [06:42<8:49:28, 23.55s/it] 
  1%|▏         | 18/1350 [05:55<6:53:44, 18.64s/it][A

{'loss': 0.9041, 'learning_rate': 1.8e-05, 'epoch': 0.07}



                                                  ][A
  0%|          | 1/1350 [07:00<8:49:28, 23.55s/it] 
  1%|▏         | 19/1350 [06:13<6:46:36, 18.33s/it][A

{'loss': 1.051, 'learning_rate': 1.9e-05, 'epoch': 0.07}



                                                  ][A
  0%|          | 1/1350 [07:18<8:49:28, 23.55s/it] 
  1%|▏         | 20/1350 [06:30<6:43:28, 18.20s/it][A

{'loss': 0.8983, 'learning_rate': 2e-05, 'epoch': 0.07}



                                                  ][A
  0%|          | 1/1350 [07:36<8:49:28, 23.55s/it] 
  2%|▏         | 21/1350 [06:48<6:39:33, 18.04s/it][A

{'loss': 1.2203, 'learning_rate': 2e-05, 'epoch': 0.08}



                                                  ][A
  0%|          | 1/1350 [07:57<8:49:28, 23.55s/it] 
  2%|▏         | 22/1350 [07:09<6:58:38, 18.91s/it][A

{'loss': 1.7296, 'learning_rate': 2e-05, 'epoch': 0.08}



                                                  ][A
  0%|          | 1/1350 [08:19<8:49:28, 23.55s/it] 
  2%|▏         | 23/1350 [07:31<7:21:51, 19.98s/it][A

{'loss': 1.9189, 'learning_rate': 2e-05, 'epoch': 0.09}



                                                  ][A
  0%|          | 1/1350 [08:37<8:49:28, 23.55s/it] 
  2%|▏         | 24/1350 [07:50<7:09:48, 19.45s/it][A

{'loss': 0.8973, 'learning_rate': 2e-05, 'epoch': 0.09}



                                                  ][A
  0%|          | 1/1350 [08:58<8:49:28, 23.55s/it] 
  2%|▏         | 25/1350 [08:11<7:21:04, 19.97s/it][A

{'loss': 1.5491, 'learning_rate': 2e-05, 'epoch': 0.09}



                                                  ][A
  0%|          | 1/1350 [09:18<8:49:28, 23.55s/it] 
  2%|▏         | 26/1350 [08:31<7:21:13, 20.00s/it][A

{'loss': 1.0886, 'learning_rate': 2e-05, 'epoch': 0.1}



                                                  ][A
  0%|          | 1/1350 [09:38<8:49:28, 23.55s/it] 
  2%|▏         | 27/1350 [08:51<7:20:34, 19.98s/it][A

{'loss': 0.9792, 'learning_rate': 2e-05, 'epoch': 0.1}



                                                  ][A
  0%|          | 1/1350 [09:58<8:49:28, 23.55s/it] 
  2%|▏         | 28/1350 [09:11<7:18:52, 19.92s/it][A

{'loss': 1.0754, 'learning_rate': 2e-05, 'epoch': 0.1}



                                                  ][A
  0%|          | 1/1350 [10:19<8:49:28, 23.55s/it] 
  2%|▏         | 29/1350 [09:32<7:24:56, 20.21s/it][A

{'loss': 1.0363, 'learning_rate': 2e-05, 'epoch': 0.11}



                                                  ][A
  0%|          | 1/1350 [10:38<8:49:28, 23.55s/it] 
  2%|▏         | 30/1350 [09:50<7:12:59, 19.68s/it][A

{'loss': 0.833, 'learning_rate': 2e-05, 'epoch': 0.11}



                                                  ][A
  0%|          | 1/1350 [10:55<8:49:28, 23.55s/it] 
  2%|▏         | 31/1350 [10:08<7:01:27, 19.17s/it][A

{'loss': 0.9397, 'learning_rate': 2e-05, 'epoch': 0.11}



                                                  ][A
  0%|          | 1/1350 [11:17<8:49:28, 23.55s/it] 
  2%|▏         | 32/1350 [10:29<7:13:25, 19.73s/it][A

{'loss': 1.5333, 'learning_rate': 2e-05, 'epoch': 0.12}



                                                  ][A
  0%|          | 1/1350 [11:37<8:49:28, 23.55s/it] 
  2%|▏         | 33/1350 [10:49<7:16:45, 19.90s/it][A

{'loss': 1.2699, 'learning_rate': 2e-05, 'epoch': 0.12}



                                                  ][A
  0%|          | 1/1350 [11:56<8:49:28, 23.55s/it] 
  3%|▎         | 34/1350 [11:09<7:14:25, 19.81s/it][A

{'loss': 1.0356, 'learning_rate': 2e-05, 'epoch': 0.13}



                                                  ][A
  0%|          | 1/1350 [12:16<8:49:28, 23.55s/it] 
  3%|▎         | 35/1350 [11:28<7:12:10, 19.72s/it][A

{'loss': 1.1792, 'learning_rate': 2e-05, 'epoch': 0.13}



                                                  ][A
  0%|          | 1/1350 [12:33<8:49:28, 23.55s/it] 
  3%|▎         | 36/1350 [11:46<6:54:59, 18.95s/it][A

{'loss': 1.0532, 'learning_rate': 2e-05, 'epoch': 0.13}



                                                  ][A
  0%|          | 1/1350 [12:52<8:49:28, 23.55s/it] 
  3%|▎         | 37/1350 [12:05<6:56:31, 19.03s/it][A

{'loss': 0.8306, 'learning_rate': 2e-05, 'epoch': 0.14}



                                                  ][A
  0%|          | 1/1350 [13:10<8:49:28, 23.55s/it] 
  3%|▎         | 38/1350 [12:23<6:49:37, 18.73s/it][A

{'loss': 1.0542, 'learning_rate': 2e-05, 'epoch': 0.14}



                                                  ][A
  0%|          | 1/1350 [13:31<8:49:28, 23.55s/it] 
  3%|▎         | 39/1350 [12:44<7:05:04, 19.45s/it][A

{'loss': 1.3102, 'learning_rate': 2e-05, 'epoch': 0.14}



                                                  ][A
  0%|          | 1/1350 [13:51<8:49:28, 23.55s/it] 
  3%|▎         | 40/1350 [13:03<7:02:53, 19.37s/it][A

{'loss': 0.7745, 'learning_rate': 2e-05, 'epoch': 0.15}



                                                  ][A
  0%|          | 1/1350 [14:10<8:49:28, 23.55s/it] 
  3%|▎         | 41/1350 [13:23<7:02:48, 19.38s/it][A

{'loss': 0.9433, 'learning_rate': 2e-05, 'epoch': 0.15}



                                                  ][A
  0%|          | 1/1350 [14:30<8:49:28, 23.55s/it] 
  3%|▎         | 42/1350 [13:43<7:08:51, 19.67s/it][A

{'loss': 1.0653, 'learning_rate': 2e-05, 'epoch': 0.16}



                                                  ][A
  0%|          | 1/1350 [14:49<8:49:28, 23.55s/it] 
  3%|▎         | 43/1350 [14:02<7:02:59, 19.42s/it][A

{'loss': 1.0165, 'learning_rate': 2e-05, 'epoch': 0.16}



                                                  ][A
  0%|          | 1/1350 [15:08<8:49:28, 23.55s/it] 
  3%|▎         | 44/1350 [14:21<7:01:09, 19.35s/it][A

{'loss': 0.9938, 'learning_rate': 2e-05, 'epoch': 0.16}



                                                  ][A
  0%|          | 1/1350 [15:26<8:49:28, 23.55s/it] 
  3%|▎         | 45/1350 [14:38<6:46:11, 18.68s/it][A

{'loss': 0.9777, 'learning_rate': 2e-05, 'epoch': 0.17}



                                                  ][A
  0%|          | 1/1350 [15:43<8:49:28, 23.55s/it] 
  3%|▎         | 46/1350 [14:56<6:41:08, 18.46s/it][A

{'loss': 0.8733, 'learning_rate': 2e-05, 'epoch': 0.17}



                                                  ][A
  0%|          | 1/1350 [16:04<8:49:28, 23.55s/it] 
  3%|▎         | 47/1350 [15:16<6:53:01, 19.02s/it][A

{'loss': 0.7873, 'learning_rate': 2e-05, 'epoch': 0.17}



                                                  ][A
  0%|          | 1/1350 [16:22<8:49:28, 23.55s/it] 
  4%|▎         | 48/1350 [15:35<6:50:10, 18.90s/it][A

{'loss': 0.7786, 'learning_rate': 2e-05, 'epoch': 0.18}



                                                  ][A
  0%|          | 1/1350 [16:42<8:49:28, 23.55s/it] 
  4%|▎         | 49/1350 [15:55<6:55:08, 19.15s/it][A

{'loss': 1.071, 'learning_rate': 2e-05, 'epoch': 0.18}



                                                  ][A
  0%|          | 1/1350 [17:02<8:49:28, 23.55s/it] 
  4%|▎         | 50/1350 [16:15<7:01:25, 19.45s/it][A

{'loss': 2.0156, 'learning_rate': 2e-05, 'epoch': 0.19}



                                                  ][A
  0%|          | 1/1350 [17:22<8:49:28, 23.55s/it] 
  4%|▍         | 51/1350 [16:34<7:00:05, 19.40s/it][A

{'loss': 0.9436, 'learning_rate': 2e-05, 'epoch': 0.19}



                                                  ][A
  0%|          | 1/1350 [17:42<8:49:28, 23.55s/it] 
  4%|▍         | 52/1350 [16:55<7:08:39, 19.81s/it][A

{'loss': 1.4652, 'learning_rate': 2e-05, 'epoch': 0.19}



                                                  ][A
  0%|          | 1/1350 [18:01<8:49:28, 23.55s/it] 
  4%|▍         | 53/1350 [17:14<7:01:06, 19.48s/it][A

{'loss': 0.741, 'learning_rate': 2e-05, 'epoch': 0.2}



                                                  ][A
  0%|          | 1/1350 [18:25<8:49:28, 23.55s/it] 
  4%|▍         | 54/1350 [17:38<7:29:58, 20.83s/it][A

{'loss': 1.5086, 'learning_rate': 2e-05, 'epoch': 0.2}



                                                  ][A
  0%|          | 1/1350 [18:45<8:49:28, 23.55s/it] 
  4%|▍         | 55/1350 [17:58<7:26:04, 20.67s/it][A

{'loss': 1.1596, 'learning_rate': 2e-05, 'epoch': 0.2}



                                                  ][A
  0%|          | 1/1350 [19:06<8:49:28, 23.55s/it] 
  4%|▍         | 56/1350 [18:18<7:24:28, 20.61s/it][A

{'loss': 0.8642, 'learning_rate': 2e-05, 'epoch': 0.21}



                                                  ][A
  0%|          | 1/1350 [19:26<8:49:28, 23.55s/it] 
  4%|▍         | 57/1350 [18:39<7:21:58, 20.51s/it][A

{'loss': 0.8354, 'learning_rate': 2e-05, 'epoch': 0.21}



                                                  ][A
  0%|          | 1/1350 [19:48<8:49:28, 23.55s/it] 
  4%|▍         | 58/1350 [19:00<7:27:50, 20.80s/it][A

{'loss': 1.3893, 'learning_rate': 2e-05, 'epoch': 0.21}



                                                  ][A
  0%|          | 1/1350 [20:06<8:49:28, 23.55s/it] 
  4%|▍         | 59/1350 [19:18<7:11:28, 20.05s/it][A

{'loss': 0.7298, 'learning_rate': 2e-05, 'epoch': 0.22}



                                                  ][A
  0%|          | 1/1350 [20:23<8:49:28, 23.55s/it] 
  4%|▍         | 60/1350 [19:36<6:53:18, 19.22s/it][A

{'loss': 1.0632, 'learning_rate': 2e-05, 'epoch': 0.22}



                                                  ][A
  0%|          | 1/1350 [20:41<8:49:28, 23.55s/it] 
  5%|▍         | 61/1350 [19:53<6:40:52, 18.66s/it][A

{'loss': 0.9528, 'learning_rate': 2e-05, 'epoch': 0.23}



                                                  ][A
  0%|          | 1/1350 [21:02<8:49:28, 23.55s/it] 
  5%|▍         | 62/1350 [20:14<6:56:34, 19.41s/it][A

{'loss': 1.2759, 'learning_rate': 2e-05, 'epoch': 0.23}



                                                  ][A
  0%|          | 1/1350 [21:20<8:49:28, 23.55s/it] 
  5%|▍         | 63/1350 [20:33<6:49:34, 19.09s/it][A

{'loss': 1.0606, 'learning_rate': 2e-05, 'epoch': 0.23}



                                                  ][A
  0%|          | 1/1350 [21:40<8:49:28, 23.55s/it] 
  5%|▍         | 64/1350 [20:52<6:52:16, 19.24s/it][A

{'loss': 1.067, 'learning_rate': 2e-05, 'epoch': 0.24}



                                                  ][A
  0%|          | 1/1350 [21:59<8:49:28, 23.55s/it] 
  5%|▍         | 65/1350 [21:11<6:50:59, 19.19s/it][A

{'loss': 1.1082, 'learning_rate': 2e-05, 'epoch': 0.24}



                                                  ][A
  0%|          | 1/1350 [22:18<8:49:28, 23.55s/it] 
  5%|▍         | 66/1350 [21:30<6:49:16, 19.13s/it][A

{'loss': 1.098, 'learning_rate': 2e-05, 'epoch': 0.24}



                                                  ][A
  0%|          | 1/1350 [22:37<8:49:28, 23.55s/it] 
  5%|▍         | 67/1350 [21:49<6:49:22, 19.14s/it][A

{'loss': 1.1464, 'learning_rate': 2e-05, 'epoch': 0.25}



                                                  ][A
  0%|          | 1/1350 [22:56<8:49:28, 23.55s/it] 
  5%|▌         | 68/1350 [22:09<6:50:08, 19.20s/it][A

{'loss': 0.9531, 'learning_rate': 2e-05, 'epoch': 0.25}



                                                  ][A
  0%|          | 1/1350 [23:14<8:49:28, 23.55s/it] 
  5%|▌         | 69/1350 [22:27<6:42:11, 18.84s/it][A

{'loss': 1.0147, 'learning_rate': 2e-05, 'epoch': 0.26}



                                                  ][A
  0%|          | 1/1350 [23:32<8:49:28, 23.55s/it] 
  5%|▌         | 70/1350 [22:44<6:32:53, 18.42s/it][A

{'loss': 1.0095, 'learning_rate': 2e-05, 'epoch': 0.26}



                                                  ][A
  0%|          | 1/1350 [23:53<8:49:28, 23.55s/it] 
  5%|▌         | 71/1350 [23:06<6:52:55, 19.37s/it][A

{'loss': 1.1511, 'learning_rate': 2e-05, 'epoch': 0.26}



                                                  ][A
  0%|          | 1/1350 [24:14<8:49:28, 23.55s/it] 
  5%|▌         | 72/1350 [23:26<7:00:00, 19.72s/it][A

{'loss': 1.29, 'learning_rate': 2e-05, 'epoch': 0.27}



                                                  ][A
  0%|          | 1/1350 [24:42<8:49:28, 23.55s/it] 
  5%|▌         | 73/1350 [23:54<7:51:43, 22.16s/it][A

{'loss': 2.3011, 'learning_rate': 2e-05, 'epoch': 0.27}



                                                  ][A
  0%|          | 1/1350 [24:59<8:49:28, 23.55s/it] 
  5%|▌         | 74/1350 [24:12<7:21:19, 20.75s/it][A

{'loss': 0.9373, 'learning_rate': 2e-05, 'epoch': 0.27}



                                                  ][A
  0%|          | 1/1350 [25:19<8:49:28, 23.55s/it] 
  6%|▌         | 75/1350 [24:31<7:14:28, 20.45s/it][A

{'loss': 1.1206, 'learning_rate': 2e-05, 'epoch': 0.28}



                                                  ][A
  0%|          | 1/1350 [25:36<8:49:28, 23.55s/it] 
  6%|▌         | 76/1350 [24:49<6:54:34, 19.52s/it][A

{'loss': 1.1128, 'learning_rate': 2e-05, 'epoch': 0.28}



                                                  ][A
  0%|          | 1/1350 [25:55<8:49:28, 23.55s/it] 
  6%|▌         | 77/1350 [25:08<6:52:51, 19.46s/it][A

{'loss': 1.1069, 'learning_rate': 2e-05, 'epoch': 0.29}



                                                  ][A
  0%|          | 1/1350 [26:17<8:49:28, 23.55s/it] 
  6%|▌         | 78/1350 [25:30<7:06:28, 20.12s/it][A

{'loss': 1.0736, 'learning_rate': 2e-05, 'epoch': 0.29}



                                                  ][A
  0%|          | 1/1350 [26:37<8:49:28, 23.55s/it] 
  6%|▌         | 79/1350 [25:49<7:02:32, 19.95s/it][A

{'loss': 0.9069, 'learning_rate': 2e-05, 'epoch': 0.29}



                                                  ][A
  0%|          | 1/1350 [26:55<8:49:28, 23.55s/it] 
  6%|▌         | 80/1350 [26:07<6:49:27, 19.34s/it][A

{'loss': 0.8666, 'learning_rate': 2e-05, 'epoch': 0.3}



                                                  ][A
  0%|          | 1/1350 [27:14<8:49:28, 23.55s/it] 
  6%|▌         | 81/1350 [26:27<6:49:40, 19.37s/it][A

{'loss': 0.7585, 'learning_rate': 2e-05, 'epoch': 0.3}



                                                  ][A
  0%|          | 1/1350 [27:33<8:49:28, 23.55s/it] 
  6%|▌         | 82/1350 [26:45<6:45:15, 19.18s/it][A

{'loss': 0.7758, 'learning_rate': 2e-05, 'epoch': 0.3}



                                                  ][A
  0%|          | 1/1350 [27:53<8:49:28, 23.55s/it] 
  6%|▌         | 83/1350 [27:06<6:53:11, 19.57s/it][A

{'loss': 0.9695, 'learning_rate': 2e-05, 'epoch': 0.31}



                                                  ][A
  0%|          | 1/1350 [28:11<8:49:28, 23.55s/it] 
  6%|▌         | 84/1350 [27:24<6:43:17, 19.11s/it][A

{'loss': 1.044, 'learning_rate': 2e-05, 'epoch': 0.31}



                                                  ][A
  0%|          | 1/1350 [28:30<8:49:28, 23.55s/it] 
  6%|▋         | 85/1350 [27:43<6:41:48, 19.06s/it][A

{'loss': 1.0333, 'learning_rate': 2e-05, 'epoch': 0.31}



                                                  ][A
  0%|          | 1/1350 [28:51<8:49:28, 23.55s/it] 
  6%|▋         | 86/1350 [28:03<6:50:17, 19.48s/it][A

{'loss': 1.0432, 'learning_rate': 2e-05, 'epoch': 0.32}



                                                  ][A
  0%|          | 1/1350 [29:11<8:49:28, 23.55s/it] 
  6%|▋         | 87/1350 [28:23<6:52:28, 19.60s/it][A

{'loss': 0.7642, 'learning_rate': 2e-05, 'epoch': 0.32}



                                                  ][A
  0%|          | 1/1350 [29:33<8:49:28, 23.55s/it] 
  7%|▋         | 88/1350 [28:45<7:07:48, 20.34s/it][A

{'loss': 1.0001, 'learning_rate': 2e-05, 'epoch': 0.33}



                                                  ][A
  0%|          | 1/1350 [29:50<8:49:28, 23.55s/it] 
  7%|▋         | 89/1350 [29:03<6:49:19, 19.48s/it][A

{'loss': 1.0698, 'learning_rate': 2e-05, 'epoch': 0.33}



                                                  ][A
  0%|          | 1/1350 [30:09<8:49:28, 23.55s/it] 
  7%|▋         | 90/1350 [29:22<6:46:25, 19.35s/it][A

{'loss': 1.1594, 'learning_rate': 2e-05, 'epoch': 0.33}



                                                  ][A
  0%|          | 1/1350 [30:29<8:49:28, 23.55s/it] 
  7%|▋         | 91/1350 [29:42<6:52:15, 19.65s/it][A

{'loss': 1.11, 'learning_rate': 2e-05, 'epoch': 0.34}



                                                  ][A
  0%|          | 1/1350 [30:49<8:49:28, 23.55s/it] 
  7%|▋         | 92/1350 [30:02<6:53:36, 19.73s/it][A

{'loss': 0.7767, 'learning_rate': 2e-05, 'epoch': 0.34}



                                                  ][A
  0%|          | 1/1350 [31:08<8:49:28, 23.55s/it] 
  7%|▋         | 93/1350 [30:20<6:44:10, 19.29s/it][A

{'loss': 0.7941, 'learning_rate': 2e-05, 'epoch': 0.34}



                                                  ][A
  0%|          | 1/1350 [31:28<8:49:28, 23.55s/it] 
  7%|▋         | 94/1350 [30:41<6:51:34, 19.66s/it][A

{'loss': 0.5869, 'learning_rate': 2e-05, 'epoch': 0.35}



                                                  ][A
  0%|          | 1/1350 [31:48<8:49:28, 23.55s/it] 
  7%|▋         | 95/1350 [31:00<6:49:38, 19.58s/it][A

{'loss': 0.6892, 'learning_rate': 2e-05, 'epoch': 0.35}



                                                  ][A
  0%|          | 1/1350 [32:06<8:49:28, 23.55s/it] 
  7%|▋         | 96/1350 [31:18<6:40:10, 19.15s/it][A

{'loss': 1.0942, 'learning_rate': 2e-05, 'epoch': 0.36}



                                                  ][A
  0%|          | 1/1350 [32:25<8:49:28, 23.55s/it] 
  7%|▋         | 97/1350 [31:38<6:43:04, 19.30s/it][A

{'loss': 0.9785, 'learning_rate': 2e-05, 'epoch': 0.36}



                                                  ][A
  0%|          | 1/1350 [32:46<8:49:28, 23.55s/it] 
  7%|▋         | 98/1350 [31:58<6:48:47, 19.59s/it][A

{'loss': 0.7582, 'learning_rate': 2e-05, 'epoch': 0.36}



                                                  ][A
  0%|          | 1/1350 [33:06<8:49:28, 23.55s/it] 
  7%|▋         | 99/1350 [32:18<6:52:20, 19.78s/it][A

{'loss': 0.9182, 'learning_rate': 2e-05, 'epoch': 0.37}



                                                  t][A
  0%|          | 1/1350 [33:23<8:49:28, 23.55s/it]  
  7%|▋         | 100/1350 [32:36<6:37:38, 19.09s/it][A

{'loss': 0.9132, 'learning_rate': 2e-05, 'epoch': 0.37}



                                                  t][A
  0%|          | 1/1350 [33:41<8:49:28, 23.55s/it]  
  7%|▋         | 101/1350 [32:54<6:30:04, 18.74s/it][A

{'loss': 1.1594, 'learning_rate': 2e-05, 'epoch': 0.37}



                                                  t][A
  0%|          | 1/1350 [33:58<8:49:28, 23.55s/it]  
  8%|▊         | 102/1350 [33:11<6:19:18, 18.24s/it][A

{'loss': 0.9674, 'learning_rate': 2e-05, 'epoch': 0.38}



                                                  t][A
  0%|          | 1/1350 [34:18<8:49:28, 23.55s/it]  
  8%|▊         | 103/1350 [33:31<6:29:49, 18.76s/it][A

{'loss': 1.155, 'learning_rate': 2e-05, 'epoch': 0.38}



                                                  t][A
  0%|          | 1/1350 [34:41<8:49:28, 23.55s/it]  
  8%|▊         | 104/1350 [33:53<6:52:31, 19.86s/it][A

{'loss': 1.0151, 'learning_rate': 2e-05, 'epoch': 0.39}



                                                  t][A
  0%|          | 1/1350 [35:00<8:49:28, 23.55s/it]  
  8%|▊         | 105/1350 [34:13<6:50:47, 19.80s/it][A

{'loss': 1.0118, 'learning_rate': 2e-05, 'epoch': 0.39}



                                                  t][A
  0%|          | 1/1350 [35:24<8:49:28, 23.55s/it]  
  8%|▊         | 106/1350 [34:37<7:14:49, 20.97s/it][A

{'loss': 1.3116, 'learning_rate': 2e-05, 'epoch': 0.39}



                                                  t][A
  0%|          | 1/1350 [35:41<8:49:28, 23.55s/it]  
  8%|▊         | 107/1350 [34:53<6:47:07, 19.65s/it][A

{'loss': 1.1338, 'learning_rate': 2e-05, 'epoch': 0.4}



                                                  t][A
  0%|          | 1/1350 [35:58<8:49:28, 23.55s/it]  
  8%|▊         | 108/1350 [35:10<6:30:41, 18.87s/it][A

{'loss': 1.1176, 'learning_rate': 2e-05, 'epoch': 0.4}



                                                  t][A
  0%|          | 1/1350 [36:16<8:49:28, 23.55s/it]  
  8%|▊         | 109/1350 [35:28<6:25:33, 18.64s/it][A

{'loss': 1.0553, 'learning_rate': 2e-05, 'epoch': 0.4}



                                                  t][A
  0%|          | 1/1350 [36:36<8:49:28, 23.55s/it]  
  8%|▊         | 110/1350 [35:48<6:32:03, 18.97s/it][A

{'loss': 0.8129, 'learning_rate': 2e-05, 'epoch': 0.41}



                                                  t][A
  0%|          | 1/1350 [36:53<8:49:28, 23.55s/it]  
  8%|▊         | 111/1350 [36:05<6:21:51, 18.49s/it][A

{'loss': 0.9779, 'learning_rate': 2e-05, 'epoch': 0.41}



                                                  t][A
  0%|          | 1/1350 [37:14<8:49:28, 23.55s/it]  
  8%|▊         | 112/1350 [36:26<6:34:51, 19.14s/it][A

{'loss': 1.3079, 'learning_rate': 2e-05, 'epoch': 0.41}



                                                  t][A
  0%|          | 1/1350 [37:32<8:49:28, 23.55s/it]  
  8%|▊         | 113/1350 [36:45<6:30:46, 18.95s/it][A

{'loss': 1.0467, 'learning_rate': 2e-05, 'epoch': 0.42}



                                                  t][A
  0%|          | 1/1350 [37:52<8:49:28, 23.55s/it]  
  8%|▊         | 114/1350 [37:05<6:36:40, 19.26s/it][A

{'loss': 0.9661, 'learning_rate': 2e-05, 'epoch': 0.42}



                                                  t][A
  0%|          | 1/1350 [38:11<8:49:28, 23.55s/it]  
  9%|▊         | 115/1350 [37:24<6:34:51, 19.18s/it][A

{'loss': 0.9769, 'learning_rate': 2e-05, 'epoch': 0.43}



                                                  t][A
  0%|          | 1/1350 [38:31<8:49:28, 23.55s/it]  
  9%|▊         | 116/1350 [37:44<6:40:44, 19.48s/it][A

{'loss': 0.681, 'learning_rate': 2e-05, 'epoch': 0.43}



                                                  t][A
  0%|          | 1/1350 [38:51<8:49:28, 23.55s/it]  
  9%|▊         | 117/1350 [38:03<6:40:06, 19.47s/it][A

{'loss': 1.0187, 'learning_rate': 2e-05, 'epoch': 0.43}



                                                  t][A
  0%|          | 1/1350 [39:09<8:49:28, 23.55s/it]  
  9%|▊         | 118/1350 [38:22<6:33:11, 19.15s/it][A

{'loss': 1.0789, 'learning_rate': 2e-05, 'epoch': 0.44}



                                                  t][A
  0%|          | 1/1350 [39:28<8:49:28, 23.55s/it]  
  9%|▉         | 119/1350 [38:41<6:33:08, 19.16s/it][A

{'loss': 0.9411, 'learning_rate': 2e-05, 'epoch': 0.44}



                                                  t][A
  0%|          | 1/1350 [39:48<8:49:28, 23.55s/it]  
  9%|▉         | 120/1350 [39:00<6:35:08, 19.27s/it][A

{'loss': 1.125, 'learning_rate': 2e-05, 'epoch': 0.44}



                                                  t][A
  0%|          | 1/1350 [40:06<8:49:28, 23.55s/it]  
  9%|▉         | 121/1350 [39:18<6:26:40, 18.88s/it][A

{'loss': 0.9242, 'learning_rate': 2e-05, 'epoch': 0.45}



                                                  t][A
  0%|          | 1/1350 [40:26<8:49:28, 23.55s/it]  
  9%|▉         | 122/1350 [39:39<6:37:06, 19.40s/it][A

{'loss': 1.1904, 'learning_rate': 2e-05, 'epoch': 0.45}



                                                  t][A
  0%|          | 1/1350 [40:46<8:49:28, 23.55s/it]  
  9%|▉         | 123/1350 [39:58<6:37:37, 19.44s/it][A

{'loss': 1.0103, 'learning_rate': 2e-05, 'epoch': 0.46}



                                                  t][A
  0%|          | 1/1350 [41:05<8:49:28, 23.55s/it]  
  9%|▉         | 124/1350 [40:17<6:32:25, 19.21s/it][A

{'loss': 0.8261, 'learning_rate': 2e-05, 'epoch': 0.46}



                                                  t][A
  0%|          | 1/1350 [41:24<8:49:28, 23.55s/it]  
  9%|▉         | 125/1350 [40:37<6:35:05, 19.35s/it][A

{'loss': 0.9392, 'learning_rate': 2e-05, 'epoch': 0.46}



                                                  t][A
  0%|          | 1/1350 [41:44<8:49:28, 23.55s/it]  
  9%|▉         | 126/1350 [40:56<6:35:43, 19.40s/it][A

{'loss': 1.074, 'learning_rate': 2e-05, 'epoch': 0.47}



                                                  t][A
  0%|          | 1/1350 [42:01<8:49:28, 23.55s/it]  
  9%|▉         | 127/1350 [41:14<6:22:33, 18.77s/it][A

{'loss': 0.9428, 'learning_rate': 2e-05, 'epoch': 0.47}



                                                  t][A
  0%|          | 1/1350 [42:21<8:49:28, 23.55s/it]  
  9%|▉         | 128/1350 [41:33<6:28:15, 19.06s/it][A

{'loss': 0.9557, 'learning_rate': 2e-05, 'epoch': 0.47}



                                                  t][A
  0%|          | 1/1350 [42:39<8:49:28, 23.55s/it]  
 10%|▉         | 129/1350 [41:51<6:20:39, 18.71s/it][A

{'loss': 1.0452, 'learning_rate': 2e-05, 'epoch': 0.48}



                                                  t][A
  0%|          | 1/1350 [43:00<8:49:28, 23.55s/it]  
 10%|▉         | 130/1350 [42:13<6:37:26, 19.55s/it][A

{'loss': 1.1761, 'learning_rate': 2e-05, 'epoch': 0.48}



                                                  t][A
  0%|          | 1/1350 [43:27<8:49:28, 23.55s/it]  
 10%|▉         | 131/1350 [42:39<7:19:01, 21.61s/it][A

{'loss': 1.8256, 'learning_rate': 2e-05, 'epoch': 0.49}



                                                  t][A
  0%|          | 1/1350 [43:47<8:49:28, 23.55s/it]  
 10%|▉         | 132/1350 [42:59<7:10:14, 21.19s/it][A

{'loss': 0.9861, 'learning_rate': 2e-05, 'epoch': 0.49}



                                                  t][A
  0%|          | 1/1350 [44:08<8:49:28, 23.55s/it]  
 10%|▉         | 133/1350 [43:21<7:11:02, 21.25s/it][A

{'loss': 0.9597, 'learning_rate': 2e-05, 'epoch': 0.49}



                                                  t][A
  0%|          | 1/1350 [44:27<8:49:28, 23.55s/it]  
 10%|▉         | 134/1350 [43:39<6:54:46, 20.47s/it][A

{'loss': 0.8508, 'learning_rate': 2e-05, 'epoch': 0.5}



                                                  t][A
  0%|          | 1/1350 [44:46<8:49:28, 23.55s/it]  
 10%|█         | 135/1350 [43:58<6:43:07, 19.91s/it][A

{'loss': 1.0814, 'learning_rate': 2e-05, 'epoch': 0.5}



                                                  t][A
  0%|          | 1/1350 [45:04<8:49:28, 23.55s/it]  
 10%|█         | 136/1350 [44:17<6:36:08, 19.58s/it][A

{'loss': 1.1386, 'learning_rate': 2e-05, 'epoch': 0.5}



                                                  t][A
  0%|          | 1/1350 [45:26<8:49:28, 23.55s/it]  
 10%|█         | 137/1350 [44:38<6:46:35, 20.11s/it][A

{'loss': 0.9824, 'learning_rate': 2e-05, 'epoch': 0.51}



                                                  t][A
  0%|          | 1/1350 [45:44<8:49:28, 23.55s/it]  
 10%|█         | 138/1350 [44:56<6:34:54, 19.55s/it][A

{'loss': 1.1065, 'learning_rate': 2e-05, 'epoch': 0.51}



                                                  t][A
  0%|          | 1/1350 [46:05<8:49:28, 23.55s/it]  
 10%|█         | 139/1350 [45:18<6:45:43, 20.10s/it][A

{'loss': 1.0455, 'learning_rate': 2e-05, 'epoch': 0.51}



                                                  t][A
  0%|          | 1/1350 [46:28<8:49:28, 23.55s/it]  
 10%|█         | 140/1350 [45:41<7:01:59, 20.93s/it][A

{'loss': 1.1284, 'learning_rate': 2e-05, 'epoch': 0.52}



                                                  t][A
  0%|          | 1/1350 [46:47<8:49:28, 23.55s/it]  
 10%|█         | 141/1350 [46:00<6:51:59, 20.45s/it][A

{'loss': 0.949, 'learning_rate': 2e-05, 'epoch': 0.52}



                                                  t][A
  0%|          | 1/1350 [47:10<8:49:28, 23.55s/it]  
 11%|█         | 142/1350 [46:22<7:03:38, 21.04s/it][A

{'loss': 0.9569, 'learning_rate': 2e-05, 'epoch': 0.53}



                                                  t][A
  0%|          | 1/1350 [47:27<8:49:28, 23.55s/it]  
 11%|█         | 143/1350 [46:39<6:36:37, 19.72s/it][A

{'loss': 0.9156, 'learning_rate': 2e-05, 'epoch': 0.53}



                                                  t][A
  0%|          | 1/1350 [47:42<8:49:28, 23.55s/it]  
 11%|█         | 144/1350 [46:55<6:13:24, 18.58s/it][A

{'loss': 0.9817, 'learning_rate': 2e-05, 'epoch': 0.53}



                                                  t][A
  0%|          | 1/1350 [48:03<8:49:28, 23.55s/it]  
 11%|█         | 145/1350 [47:15<6:22:22, 19.04s/it][A

{'loss': 0.8395, 'learning_rate': 2e-05, 'epoch': 0.54}



                                                  t][A
  0%|          | 1/1350 [48:21<8:49:28, 23.55s/it]  
 11%|█         | 146/1350 [47:33<6:17:42, 18.82s/it][A

{'loss': 0.8359, 'learning_rate': 2e-05, 'epoch': 0.54}



                                                  t][A
  0%|          | 1/1350 [48:41<8:49:28, 23.55s/it]  
 11%|█         | 147/1350 [47:53<6:23:29, 19.13s/it][A

{'loss': 0.6674, 'learning_rate': 2e-05, 'epoch': 0.54}



                                                  t][A
  0%|          | 1/1350 [49:02<8:49:28, 23.55s/it]  
 11%|█         | 148/1350 [48:15<6:38:40, 19.90s/it][A

{'loss': 0.8276, 'learning_rate': 2e-05, 'epoch': 0.55}



                                                  t][A
  0%|          | 1/1350 [49:24<8:49:28, 23.55s/it]  
 11%|█         | 149/1350 [48:36<6:46:03, 20.29s/it][A

{'loss': 1.0184, 'learning_rate': 2e-05, 'epoch': 0.55}



                                                  t][A
  0%|          | 1/1350 [49:46<8:49:28, 23.55s/it]  
 11%|█         | 150/1350 [48:58<6:57:19, 20.87s/it][A

{'loss': 1.0829, 'learning_rate': 2e-05, 'epoch': 0.56}



                                                  t][A
  0%|          | 1/1350 [50:04<8:49:28, 23.55s/it]  
 11%|█         | 151/1350 [49:17<6:41:15, 20.08s/it][A

{'loss': 0.9885, 'learning_rate': 2e-05, 'epoch': 0.56}



                                                  t][A
  0%|          | 1/1350 [50:24<8:49:28, 23.55s/it]  
 11%|█▏        | 152/1350 [49:37<6:40:22, 20.05s/it][A

{'loss': 0.9973, 'learning_rate': 2e-05, 'epoch': 0.56}



                                                  t][A
  0%|          | 1/1350 [50:43<8:49:28, 23.55s/it]  
 11%|█▏        | 153/1350 [49:55<6:31:40, 19.63s/it][A

{'loss': 0.8484, 'learning_rate': 2e-05, 'epoch': 0.57}



                                                  t][A
  0%|          | 1/1350 [51:01<8:49:28, 23.55s/it]  
 11%|█▏        | 154/1350 [50:13<6:20:26, 19.09s/it][A

{'loss': 0.674, 'learning_rate': 2e-05, 'epoch': 0.57}



                                                  t][A
  0%|          | 1/1350 [51:18<8:49:28, 23.55s/it]  
 11%|█▏        | 155/1350 [50:31<6:11:24, 18.65s/it][A

{'loss': 1.0884, 'learning_rate': 2e-05, 'epoch': 0.57}



                                                  t][A
  0%|          | 1/1350 [51:38<8:49:28, 23.55s/it]  
 12%|█▏        | 156/1350 [50:51<6:19:50, 19.09s/it][A

{'loss': 0.7661, 'learning_rate': 2e-05, 'epoch': 0.58}



                                                  t][A
  0%|          | 1/1350 [52:00<8:49:28, 23.55s/it]  
 12%|█▏        | 157/1350 [51:13<6:38:16, 20.03s/it][A

{'loss': 1.0236, 'learning_rate': 2e-05, 'epoch': 0.58}



                                                  t][A
  0%|          | 1/1350 [52:20<8:49:28, 23.55s/it]  
 12%|█▏        | 158/1350 [51:33<6:34:58, 19.88s/it][A

{'loss': 1.096, 'learning_rate': 2e-05, 'epoch': 0.59}



                                                  t][A
  0%|          | 1/1350 [52:37<8:49:28, 23.55s/it]  
 12%|█▏        | 159/1350 [51:50<6:18:53, 19.09s/it][A

{'loss': 0.7776, 'learning_rate': 2e-05, 'epoch': 0.59}



                                                  t][A
  0%|          | 1/1350 [52:59<8:49:28, 23.55s/it]  
 12%|█▏        | 160/1350 [52:12<6:36:05, 19.97s/it][A

{'loss': 1.2198, 'learning_rate': 2e-05, 'epoch': 0.59}



                                                  t][A
  0%|          | 1/1350 [53:17<8:49:28, 23.55s/it]  
 12%|█▏        | 161/1350 [52:29<6:21:49, 19.27s/it][A

{'loss': 0.9028, 'learning_rate': 2e-05, 'epoch': 0.6}



                                                  t][A
  0%|          | 1/1350 [53:35<8:49:28, 23.55s/it]  
 12%|█▏        | 162/1350 [52:47<6:12:48, 18.83s/it][A

{'loss': 1.0687, 'learning_rate': 2e-05, 'epoch': 0.6}



                                                  t][A
  0%|          | 1/1350 [53:53<8:49:28, 23.55s/it]  
 12%|█▏        | 163/1350 [53:05<6:08:32, 18.63s/it][A

{'loss': 1.0215, 'learning_rate': 2e-05, 'epoch': 0.6}



                                                  t][A
  0%|          | 1/1350 [54:11<8:49:28, 23.55s/it]  
 12%|█▏        | 164/1350 [53:24<6:07:25, 18.59s/it][A

{'loss': 0.8577, 'learning_rate': 2e-05, 'epoch': 0.61}



                                                  t][A
  0%|          | 1/1350 [54:29<8:49:28, 23.55s/it]  
 12%|█▏        | 165/1350 [53:42<6:02:35, 18.36s/it][A

{'loss': 0.799, 'learning_rate': 2e-05, 'epoch': 0.61}



                                                  t][A
  0%|          | 1/1350 [54:51<8:49:28, 23.55s/it]  
 12%|█▏        | 166/1350 [54:03<6:21:11, 19.32s/it][A

{'loss': 1.054, 'learning_rate': 2e-05, 'epoch': 0.61}



                                                  t][A
  0%|          | 1/1350 [55:12<8:49:28, 23.55s/it]  
 12%|█▏        | 167/1350 [54:25<6:35:04, 20.04s/it][A

{'loss': 1.0163, 'learning_rate': 2e-05, 'epoch': 0.62}



                                                  t][A
  0%|          | 1/1350 [55:33<8:49:28, 23.55s/it]  
 12%|█▏        | 168/1350 [54:46<6:39:21, 20.27s/it][A

{'loss': 0.885, 'learning_rate': 2e-05, 'epoch': 0.62}



                                                  t][A
  0%|          | 1/1350 [55:56<8:49:28, 23.55s/it]  
 13%|█▎        | 169/1350 [55:08<6:50:34, 20.86s/it][A

{'loss': 0.8226, 'learning_rate': 2e-05, 'epoch': 0.63}



                                                  t][A
  0%|          | 1/1350 [56:15<8:49:28, 23.55s/it]  
 13%|█▎        | 170/1350 [55:27<6:40:54, 20.39s/it][A

{'loss': 1.0212, 'learning_rate': 2e-05, 'epoch': 0.63}



                                                  t][A
  0%|          | 1/1350 [56:32<8:49:28, 23.55s/it]  
 13%|█▎        | 171/1350 [55:45<6:23:41, 19.53s/it][A

{'loss': 0.999, 'learning_rate': 2e-05, 'epoch': 0.63}



                                                  t][A
  0%|          | 1/1350 [56:52<8:49:28, 23.55s/it]  
 13%|█▎        | 172/1350 [56:04<6:21:37, 19.44s/it][A

{'loss': 0.7681, 'learning_rate': 2e-05, 'epoch': 0.64}



                                                  t][A
  0%|          | 1/1350 [57:11<8:49:28, 23.55s/it]  
 13%|█▎        | 173/1350 [56:24<6:22:56, 19.52s/it][A

{'loss': 0.8537, 'learning_rate': 2e-05, 'epoch': 0.64}



                                                  t][A
  0%|          | 1/1350 [57:31<8:49:28, 23.55s/it]  
 13%|█▎        | 174/1350 [56:43<6:21:14, 19.45s/it][A

{'loss': 0.9715, 'learning_rate': 2e-05, 'epoch': 0.64}



                                                  t][A
  0%|          | 1/1350 [57:48<8:49:28, 23.55s/it]  
 13%|█▎        | 175/1350 [57:01<6:11:04, 18.95s/it][A

{'loss': 1.1372, 'learning_rate': 2e-05, 'epoch': 0.65}



                                                  t][A
  0%|          | 1/1350 [58:05<8:49:28, 23.55s/it]  
 13%|█▎        | 176/1350 [57:17<5:57:00, 18.25s/it][A

{'loss': 0.9863, 'learning_rate': 2e-05, 'epoch': 0.65}



                                                  t][A
  0%|          | 1/1350 [58:22<8:49:28, 23.55s/it]  
 13%|█▎        | 177/1350 [57:35<5:50:54, 17.95s/it][A

{'loss': 0.965, 'learning_rate': 2e-05, 'epoch': 0.66}



                                                  t][A
  0%|          | 1/1350 [58:42<8:49:28, 23.55s/it]  
 13%|█▎        | 178/1350 [57:55<6:04:20, 18.65s/it][A

{'loss': 0.9927, 'learning_rate': 2e-05, 'epoch': 0.66}



                                                  t][A
  0%|          | 1/1350 [59:03<8:49:28, 23.55s/it]  
 13%|█▎        | 179/1350 [58:15<6:13:17, 19.13s/it][A

{'loss': 0.5951, 'learning_rate': 2e-05, 'epoch': 0.66}



                                                  t][A
  0%|          | 1/1350 [59:24<8:49:28, 23.55s/it]  
 13%|█▎        | 180/1350 [58:37<6:26:36, 19.83s/it][A

{'loss': 0.9889, 'learning_rate': 2e-05, 'epoch': 0.67}



                                                  t][A
  0%|          | 1/1350 [59:44<8:49:28, 23.55s/it]  
 13%|█▎        | 181/1350 [58:56<6:24:58, 19.76s/it][A

{'loss': 0.8075, 'learning_rate': 2e-05, 'epoch': 0.67}



                                                  t][A
  0%|          | 1/1350 [1:00:04<8:49:28, 23.55s/it]
 13%|█▎        | 182/1350 [59:17<6:27:44, 19.92s/it][A

{'loss': 1.2269, 'learning_rate': 2e-05, 'epoch': 0.67}



                                                    [A
  0%|          | 1/1350 [1:00:24<8:49:28, 23.55s/it]
 14%|█▎        | 183/1350 [59:37<6:29:54, 20.05s/it][A

{'loss': 0.8124, 'learning_rate': 2e-05, 'epoch': 0.68}



                                                    [A
  0%|          | 1/1350 [1:00:45<8:49:28, 23.55s/it]
 14%|█▎        | 184/1350 [59:58<6:32:39, 20.21s/it][A

{'loss': 1.0636, 'learning_rate': 2e-05, 'epoch': 0.68}



                                                    t][A
  0%|          | 1/1350 [1:01:03<8:49:28, 23.55s/it]  
 14%|█▎        | 185/1350 [1:00:15<6:18:42, 19.50s/it][A

{'loss': 0.8362, 'learning_rate': 2e-05, 'epoch': 0.69}



                                                    t][A
  0%|          | 1/1350 [1:01:20<8:49:28, 23.55s/it]  
 14%|█▍        | 186/1350 [1:00:33<6:07:13, 18.93s/it][A

{'loss': 0.8828, 'learning_rate': 2e-05, 'epoch': 0.69}



                                                    t][A
  0%|          | 1/1350 [1:01:40<8:49:28, 23.55s/it]  
 14%|█▍        | 187/1350 [1:00:53<6:10:44, 19.13s/it][A

{'loss': 0.8706, 'learning_rate': 2e-05, 'epoch': 0.69}



                                                    t][A
  0%|          | 1/1350 [1:01:59<8:49:28, 23.55s/it]  
 14%|█▍        | 188/1350 [1:01:11<6:07:28, 18.97s/it][A

{'loss': 0.8415, 'learning_rate': 2e-05, 'epoch': 0.7}



                                                    t][A
  0%|          | 1/1350 [1:02:17<8:49:28, 23.55s/it]  
 14%|█▍        | 189/1350 [1:01:29<6:01:01, 18.66s/it][A

{'loss': 0.9263, 'learning_rate': 2e-05, 'epoch': 0.7}



                                                    t][A
  0%|          | 1/1350 [1:02:33<8:49:28, 23.55s/it]  
 14%|█▍        | 190/1350 [1:01:46<5:50:25, 18.13s/it][A

{'loss': 1.125, 'learning_rate': 2e-05, 'epoch': 0.7}



                                                    t][A
  0%|          | 1/1350 [1:02:51<8:49:28, 23.55s/it]  
 14%|█▍        | 191/1350 [1:02:04<5:47:40, 18.00s/it][A

{'loss': 0.7335, 'learning_rate': 2e-05, 'epoch': 0.71}



                                                    t][A
  0%|          | 1/1350 [1:03:15<8:49:28, 23.55s/it]  
 14%|█▍        | 192/1350 [1:02:27<6:20:18, 19.71s/it][A

{'loss': 0.9849, 'learning_rate': 2e-05, 'epoch': 0.71}



                                                    t][A
  0%|          | 1/1350 [1:03:33<8:49:28, 23.55s/it]  
 14%|█▍        | 193/1350 [1:02:46<6:11:11, 19.25s/it][A

{'loss': 1.0536, 'learning_rate': 2e-05, 'epoch': 0.71}



                                                    t][A
  0%|          | 1/1350 [1:04:00<8:49:28, 23.55s/it]  
 14%|█▍        | 194/1350 [1:03:13<6:56:41, 21.63s/it][A

{'loss': 1.699, 'learning_rate': 2e-05, 'epoch': 0.72}



                                                    t][A
  0%|          | 1/1350 [1:04:20<8:49:28, 23.55s/it]  
 14%|█▍        | 195/1350 [1:03:33<6:47:06, 21.15s/it][A

{'loss': 0.7838, 'learning_rate': 2e-05, 'epoch': 0.72}



                                                    t][A
  0%|          | 1/1350 [1:04:40<8:49:28, 23.55s/it]  
 15%|█▍        | 196/1350 [1:03:52<6:37:55, 20.69s/it][A

{'loss': 0.8451, 'learning_rate': 2e-05, 'epoch': 0.73}



                                                    t][A
  0%|          | 1/1350 [1:05:00<8:49:28, 23.55s/it]  
 15%|█▍        | 197/1350 [1:04:13<6:35:52, 20.60s/it][A

{'loss': 1.1626, 'learning_rate': 2e-05, 'epoch': 0.73}



                                                    t][A
  0%|          | 1/1350 [1:05:19<8:49:28, 23.55s/it]  
 15%|█▍        | 198/1350 [1:04:31<6:22:45, 19.94s/it][A

{'loss': 0.6274, 'learning_rate': 2e-05, 'epoch': 0.73}



                                                    t][A
  0%|          | 1/1350 [1:05:37<8:49:28, 23.55s/it]  
 15%|█▍        | 199/1350 [1:04:50<6:14:01, 19.50s/it][A

{'loss': 0.8469, 'learning_rate': 2e-05, 'epoch': 0.74}



                                                    t][A
  0%|          | 1/1350 [1:05:54<8:49:28, 23.55s/it]  
 15%|█▍        | 200/1350 [1:05:07<5:59:43, 18.77s/it][A

{'loss': 1.2816, 'learning_rate': 2e-05, 'epoch': 0.74}



                                                    t][A
  0%|          | 1/1350 [1:06:16<8:49:28, 23.55s/it]  
 15%|█▍        | 201/1350 [1:05:28<6:15:35, 19.61s/it][A

{'loss': 1.0247, 'learning_rate': 2e-05, 'epoch': 0.74}



                                                    t][A
  0%|          | 1/1350 [1:06:33<8:49:28, 23.55s/it]  
 15%|█▍        | 202/1350 [1:05:45<5:59:55, 18.81s/it][A

{'loss': 0.8989, 'learning_rate': 2e-05, 'epoch': 0.75}



                                                    t][A
  0%|          | 1/1350 [1:06:51<8:49:28, 23.55s/it]  
 15%|█▌        | 203/1350 [1:06:04<5:58:00, 18.73s/it][A

{'loss': 0.9511, 'learning_rate': 2e-05, 'epoch': 0.75}



                                                    t][A
  0%|          | 1/1350 [1:07:11<8:49:28, 23.55s/it]  
 15%|█▌        | 204/1350 [1:06:24<6:06:17, 19.18s/it][A

{'loss': 1.1216, 'learning_rate': 2e-05, 'epoch': 0.76}



                                                    t][A
  0%|          | 1/1350 [1:07:31<8:49:28, 23.55s/it]  
 15%|█▌        | 205/1350 [1:06:43<6:06:36, 19.21s/it][A

{'loss': 0.9972, 'learning_rate': 2e-05, 'epoch': 0.76}



                                                    t][A
  0%|          | 1/1350 [1:07:52<8:49:28, 23.55s/it]  
 15%|█▌        | 206/1350 [1:07:04<6:16:25, 19.74s/it][A

{'loss': 0.8327, 'learning_rate': 2e-05, 'epoch': 0.76}



                                                    t][A
  0%|          | 1/1350 [1:08:14<8:49:28, 23.55s/it]  
 15%|█▌        | 207/1350 [1:07:26<6:29:46, 20.46s/it][A

{'loss': 0.9975, 'learning_rate': 2e-05, 'epoch': 0.77}



                                                    t][A
  0%|          | 1/1350 [1:08:33<8:49:28, 23.55s/it]  
 15%|█▌        | 208/1350 [1:07:46<6:23:55, 20.17s/it][A

{'loss': 0.9656, 'learning_rate': 2e-05, 'epoch': 0.77}



                                                    t][A
  0%|          | 1/1350 [1:08:54<8:49:28, 23.55s/it]  
 15%|█▌        | 209/1350 [1:08:06<6:25:42, 20.28s/it][A

{'loss': 0.9615, 'learning_rate': 2e-05, 'epoch': 0.77}



                                                    t][A
  0%|          | 1/1350 [1:09:14<8:49:28, 23.55s/it]  
 16%|█▌        | 210/1350 [1:08:26<6:21:23, 20.07s/it][A

{'loss': 1.0888, 'learning_rate': 2e-05, 'epoch': 0.78}



                                                    t][A
  0%|          | 1/1350 [1:09:34<8:49:28, 23.55s/it]  
 16%|█▌        | 211/1350 [1:08:46<6:21:03, 20.07s/it][A

{'loss': 0.6961, 'learning_rate': 2e-05, 'epoch': 0.78}



                                                    t][A
  0%|          | 1/1350 [1:09:52<8:49:28, 23.55s/it]  
 16%|█▌        | 212/1350 [1:09:04<6:08:37, 19.44s/it][A

{'loss': 1.0419, 'learning_rate': 2e-05, 'epoch': 0.79}



                                                    t][A
  0%|          | 1/1350 [1:10:09<8:49:28, 23.55s/it]  
 16%|█▌        | 213/1350 [1:09:22<5:58:07, 18.90s/it][A

{'loss': 1.115, 'learning_rate': 2e-05, 'epoch': 0.79}



                                                    t][A
  0%|          | 1/1350 [1:10:28<8:49:28, 23.55s/it]  
 16%|█▌        | 214/1350 [1:09:41<5:57:57, 18.91s/it][A

{'loss': 0.9965, 'learning_rate': 2e-05, 'epoch': 0.79}



                                                    t][A
  0%|          | 1/1350 [1:10:48<8:49:28, 23.55s/it]  
 16%|█▌        | 215/1350 [1:10:00<6:03:04, 19.19s/it][A

{'loss': 0.8239, 'learning_rate': 2e-05, 'epoch': 0.8}



                                                    t][A
  0%|          | 1/1350 [1:11:07<8:49:28, 23.55s/it]  
 16%|█▌        | 216/1350 [1:10:19<6:00:11, 19.06s/it][A

{'loss': 0.9977, 'learning_rate': 2e-05, 'epoch': 0.8}



                                                    t][A
  0%|          | 1/1350 [1:11:31<8:49:28, 23.55s/it]  
 16%|█▌        | 217/1350 [1:10:43<6:27:41, 20.53s/it][A

{'loss': 1.2513, 'learning_rate': 2e-05, 'epoch': 0.8}



                                                    t][A
  0%|          | 1/1350 [1:11:50<8:49:28, 23.55s/it]  
 16%|█▌        | 218/1350 [1:11:02<6:20:13, 20.15s/it][A

{'loss': 1.1562, 'learning_rate': 2e-05, 'epoch': 0.81}



                                                    t][A
  0%|          | 1/1350 [1:12:11<8:49:28, 23.55s/it]  
 16%|█▌        | 219/1350 [1:11:23<6:24:57, 20.42s/it][A

{'loss': 0.9157, 'learning_rate': 2e-05, 'epoch': 0.81}



                                                    t][A
  0%|          | 1/1350 [1:12:31<8:49:28, 23.55s/it]  
 16%|█▋        | 220/1350 [1:11:44<6:23:11, 20.35s/it][A

{'loss': 0.7446, 'learning_rate': 2e-05, 'epoch': 0.81}



                                                    t][A
  0%|          | 1/1350 [1:12:51<8:49:28, 23.55s/it]  
 16%|█▋        | 221/1350 [1:12:03<6:17:55, 20.08s/it][A

{'loss': 0.8569, 'learning_rate': 2e-05, 'epoch': 0.82}



                                                    t][A
  0%|          | 1/1350 [1:13:09<8:49:28, 23.55s/it]  
 16%|█▋        | 222/1350 [1:12:22<6:09:12, 19.64s/it][A

{'loss': 1.1378, 'learning_rate': 2e-05, 'epoch': 0.82}



                                                    t][A
  0%|          | 1/1350 [1:13:29<8:49:28, 23.55s/it]  
 17%|█▋        | 223/1350 [1:12:42<6:10:17, 19.71s/it][A

{'loss': 1.0561, 'learning_rate': 2e-05, 'epoch': 0.83}



                                                    t][A
  0%|          | 1/1350 [1:13:48<8:49:28, 23.55s/it]  
 17%|█▋        | 224/1350 [1:13:00<6:04:22, 19.42s/it][A

{'loss': 0.8081, 'learning_rate': 2e-05, 'epoch': 0.83}



                                                    t][A
  0%|          | 1/1350 [1:14:05<8:49:28, 23.55s/it]  
 17%|█▋        | 225/1350 [1:13:18<5:54:06, 18.89s/it][A

{'loss': 0.8238, 'learning_rate': 2e-05, 'epoch': 0.83}



                                                    t][A
  0%|          | 1/1350 [1:14:26<8:49:28, 23.55s/it]  
 17%|█▋        | 226/1350 [1:13:38<6:02:29, 19.35s/it][A

{'loss': 1.0701, 'learning_rate': 2e-05, 'epoch': 0.84}



                                                    t][A
  0%|          | 1/1350 [1:14:46<8:49:28, 23.55s/it]  
 17%|█▋        | 227/1350 [1:13:59<6:06:37, 19.59s/it][A

{'loss': 0.967, 'learning_rate': 2e-05, 'epoch': 0.84}



                                                    t][A
  0%|          | 1/1350 [1:15:09<8:49:28, 23.55s/it]  
 17%|█▋        | 228/1350 [1:14:22<6:25:40, 20.62s/it][A

{'loss': 0.9843, 'learning_rate': 2e-05, 'epoch': 0.84}



                                                    t][A
  0%|          | 1/1350 [1:15:30<8:49:28, 23.55s/it]  
 17%|█▋        | 229/1350 [1:14:43<6:28:16, 20.78s/it][A

{'loss': 1.0846, 'learning_rate': 2e-05, 'epoch': 0.85}



                                                    t][A
  0%|          | 1/1350 [1:15:48<8:49:28, 23.55s/it]  
 17%|█▋        | 230/1350 [1:15:00<6:09:42, 19.81s/it][A

{'loss': 0.9353, 'learning_rate': 2e-05, 'epoch': 0.85}



                                                    t][A
  0%|          | 1/1350 [1:16:05<8:49:28, 23.55s/it]  
 17%|█▋        | 231/1350 [1:15:18<5:55:59, 19.09s/it][A

{'loss': 1.1145, 'learning_rate': 2e-05, 'epoch': 0.86}



                                                    t][A
  0%|          | 1/1350 [1:16:27<8:49:28, 23.55s/it]  
 17%|█▋        | 232/1350 [1:15:39<6:10:22, 19.88s/it][A

{'loss': 1.2259, 'learning_rate': 2e-05, 'epoch': 0.86}



                                                    t][A
  0%|          | 1/1350 [1:16:45<8:49:28, 23.55s/it]  
 17%|█▋        | 233/1350 [1:15:57<5:58:28, 19.26s/it][A

{'loss': 1.0319, 'learning_rate': 2e-05, 'epoch': 0.86}



                                                    t][A
  0%|          | 1/1350 [1:17:05<8:49:28, 23.55s/it]  
 17%|█▋        | 234/1350 [1:16:17<6:03:28, 19.54s/it][A

{'loss': 0.6845, 'learning_rate': 2e-05, 'epoch': 0.87}



                                                    t][A
  0%|          | 1/1350 [1:17:24<8:49:28, 23.55s/it]  
 17%|█▋        | 235/1350 [1:16:37<6:02:31, 19.51s/it][A

{'loss': 0.9108, 'learning_rate': 2e-05, 'epoch': 0.87}



                                                    t][A
  0%|          | 1/1350 [1:17:45<8:49:28, 23.55s/it]  
 17%|█▋        | 236/1350 [1:16:57<6:08:19, 19.84s/it][A

{'loss': 0.8882, 'learning_rate': 2e-05, 'epoch': 0.87}



                                                    t][A
  0%|          | 1/1350 [1:18:07<8:49:28, 23.55s/it]  
 18%|█▊        | 237/1350 [1:17:20<6:20:51, 20.53s/it][A

{'loss': 1.1119, 'learning_rate': 2e-05, 'epoch': 0.88}



                                                    t][A
  0%|          | 1/1350 [1:18:31<8:49:28, 23.55s/it]  
 18%|█▊        | 238/1350 [1:17:44<6:39:36, 21.56s/it][A

{'loss': 1.0153, 'learning_rate': 2e-05, 'epoch': 0.88}



                                                    t][A
  0%|          | 1/1350 [1:18:51<8:49:28, 23.55s/it]  
 18%|█▊        | 239/1350 [1:18:04<6:30:48, 21.11s/it][A

{'loss': 0.9323, 'learning_rate': 2e-05, 'epoch': 0.89}



                                                    t][A
  0%|          | 1/1350 [1:19:15<8:49:28, 23.55s/it]  
 18%|█▊        | 240/1350 [1:18:28<6:46:08, 21.95s/it][A

{'loss': 1.1381, 'learning_rate': 2e-05, 'epoch': 0.89}



                                                    t][A
  0%|          | 1/1350 [1:19:32<8:49:28, 23.55s/it]  
 18%|█▊        | 241/1350 [1:18:45<6:20:15, 20.57s/it][A

{'loss': 0.8053, 'learning_rate': 2e-05, 'epoch': 0.89}



                                                    t][A
  0%|          | 1/1350 [1:19:53<8:49:28, 23.55s/it]  
 18%|█▊        | 242/1350 [1:19:06<6:21:36, 20.66s/it][A

{'loss': 1.1677, 'learning_rate': 2e-05, 'epoch': 0.9}



                                                    t][A
  0%|          | 1/1350 [1:20:11<8:49:28, 23.55s/it]  
 18%|█▊        | 243/1350 [1:19:24<6:05:56, 19.83s/it][A

{'loss': 1.1645, 'learning_rate': 2e-05, 'epoch': 0.9}



                                                    t][A
  0%|          | 1/1350 [1:20:29<8:49:28, 23.55s/it]  
 18%|█▊        | 244/1350 [1:19:41<5:53:59, 19.20s/it][A

{'loss': 1.0214, 'learning_rate': 2e-05, 'epoch': 0.9}



                                                    t][A
  0%|          | 1/1350 [1:20:48<8:49:28, 23.55s/it]  
 18%|█▊        | 245/1350 [1:20:00<5:53:00, 19.17s/it][A

{'loss': 0.8579, 'learning_rate': 2e-05, 'epoch': 0.91}



                                                    t][A
  0%|          | 1/1350 [1:21:05<8:49:28, 23.55s/it]  
 18%|█▊        | 246/1350 [1:20:18<5:41:16, 18.55s/it][A

{'loss': 0.8706, 'learning_rate': 2e-05, 'epoch': 0.91}



                                                    t][A
  0%|          | 1/1350 [1:21:22<8:49:28, 23.55s/it]  
 18%|█▊        | 247/1350 [1:20:35<5:32:32, 18.09s/it][A

{'loss': 0.77, 'learning_rate': 2e-05, 'epoch': 0.91}



                                                    t][A
  0%|          | 1/1350 [1:21:42<8:49:28, 23.55s/it]  
 18%|█▊        | 248/1350 [1:20:55<5:42:18, 18.64s/it][A

{'loss': 1.093, 'learning_rate': 2e-05, 'epoch': 0.92}



                                                    t][A
  0%|          | 1/1350 [1:22:00<8:49:28, 23.55s/it]  
 18%|█▊        | 249/1350 [1:21:12<5:37:17, 18.38s/it][A

{'loss': 0.826, 'learning_rate': 2e-05, 'epoch': 0.92}



                                                    t][A
  0%|          | 1/1350 [1:22:21<8:49:28, 23.55s/it]  
 19%|█▊        | 250/1350 [1:21:34<5:52:50, 19.25s/it][A

{'loss': 1.0865, 'learning_rate': 2e-05, 'epoch': 0.93}



                                                    t][A
  0%|          | 1/1350 [1:22:41<8:49:28, 23.55s/it]  
 19%|█▊        | 251/1350 [1:21:54<5:57:47, 19.53s/it][A

{'loss': 0.8563, 'learning_rate': 2e-05, 'epoch': 0.93}



                                                    t][A
  0%|          | 1/1350 [1:23:02<8:49:28, 23.55s/it]  
 19%|█▊        | 252/1350 [1:22:15<6:04:45, 19.93s/it][A

{'loss': 0.9891, 'learning_rate': 2e-05, 'epoch': 0.93}



                                                    t][A
  0%|          | 1/1350 [1:23:29<8:49:28, 23.55s/it]  
 19%|█▊        | 253/1350 [1:22:42<6:44:57, 22.15s/it][A

{'loss': 1.2768, 'learning_rate': 2e-05, 'epoch': 0.94}



                                                    t][A
  0%|          | 1/1350 [1:23:48<8:49:28, 23.55s/it]  
 19%|█▉        | 254/1350 [1:23:00<6:22:55, 20.96s/it][A

{'loss': 1.04, 'learning_rate': 2e-05, 'epoch': 0.94}



                                                    t][A
  0%|          | 1/1350 [1:24:06<8:49:28, 23.55s/it]  
 19%|█▉        | 255/1350 [1:23:19<6:08:35, 20.20s/it][A

{'loss': 0.7674, 'learning_rate': 2e-05, 'epoch': 0.94}



                                                    t][A
  0%|          | 1/1350 [1:24:26<8:49:28, 23.55s/it]  
 19%|█▉        | 256/1350 [1:23:39<6:07:10, 20.14s/it][A

{'loss': 0.8145, 'learning_rate': 2e-05, 'epoch': 0.95}



                                                    t][A
  0%|          | 1/1350 [1:24:46<8:49:28, 23.55s/it]  
 19%|█▉        | 257/1350 [1:23:58<6:04:25, 20.01s/it][A

{'loss': 0.7823, 'learning_rate': 2e-05, 'epoch': 0.95}



                                                    t][A
  0%|          | 1/1350 [1:25:04<8:49:28, 23.55s/it]  
 19%|█▉        | 258/1350 [1:24:16<5:52:13, 19.35s/it][A

{'loss': 0.8872, 'learning_rate': 2e-05, 'epoch': 0.96}



                                                    t][A
  0%|          | 1/1350 [1:25:21<8:49:28, 23.55s/it]  
 19%|█▉        | 259/1350 [1:24:34<5:41:49, 18.80s/it][A

{'loss': 0.804, 'learning_rate': 2e-05, 'epoch': 0.96}



                                                    t][A
  0%|          | 1/1350 [1:25:41<8:49:28, 23.55s/it]  
 19%|█▉        | 260/1350 [1:24:54<5:49:07, 19.22s/it][A

{'loss': 0.8013, 'learning_rate': 2e-05, 'epoch': 0.96}



                                                    t][A
  0%|          | 1/1350 [1:26:01<8:49:28, 23.55s/it]  
 19%|█▉        | 261/1350 [1:25:13<5:49:36, 19.26s/it][A

{'loss': 0.9494, 'learning_rate': 2e-05, 'epoch': 0.97}



                                                    t][A
  0%|          | 1/1350 [1:26:19<8:49:28, 23.55s/it]  
 19%|█▉        | 262/1350 [1:25:32<5:46:48, 19.13s/it][A

{'loss': 0.9956, 'learning_rate': 2e-05, 'epoch': 0.97}



                                                    t][A
  0%|          | 1/1350 [1:26:38<8:49:28, 23.55s/it]  
 19%|█▉        | 263/1350 [1:25:51<5:44:42, 19.03s/it][A

{'loss': 1.0707, 'learning_rate': 2e-05, 'epoch': 0.97}



                                                    t][A
  0%|          | 1/1350 [1:26:59<8:49:28, 23.55s/it]  
 20%|█▉        | 264/1350 [1:26:11<5:53:05, 19.51s/it][A

{'loss': 1.3308, 'learning_rate': 2e-05, 'epoch': 0.98}



                                                    t][A
  0%|          | 1/1350 [1:27:20<8:49:28, 23.55s/it]  
 20%|█▉        | 265/1350 [1:26:32<5:59:25, 19.88s/it][A

{'loss': 1.1884, 'learning_rate': 2e-05, 'epoch': 0.98}



                                                    t][A
  0%|          | 1/1350 [1:27:38<8:49:28, 23.55s/it]  
 20%|█▉        | 266/1350 [1:26:51<5:51:51, 19.48s/it][A

{'loss': 0.9459, 'learning_rate': 2e-05, 'epoch': 0.99}



                                                    t][A
  0%|          | 1/1350 [1:27:58<8:49:28, 23.55s/it]  
 20%|█▉        | 267/1350 [1:27:10<5:51:09, 19.45s/it][A

{'loss': 0.9376, 'learning_rate': 2e-05, 'epoch': 0.99}



                                                    t][A
  0%|          | 1/1350 [1:28:15<8:49:28, 23.55s/it]  
 20%|█▉        | 268/1350 [1:27:27<5:38:08, 18.75s/it][A

{'loss': 0.7441, 'learning_rate': 2e-05, 'epoch': 0.99}



                                                    t][A
  0%|          | 1/1350 [1:28:36<8:49:28, 23.55s/it]  
 20%|█▉        | 269/1350 [1:27:49<5:52:23, 19.56s/it][A

{'loss': 1.3308, 'learning_rate': 2e-05, 'epoch': 1.0}



                                                    t][A
  0%|          | 1/1350 [1:28:54<8:49:28, 23.55s/it]  
 20%|██        | 270/1350 [1:28:07<5:44:03, 19.11s/it][A

{'loss': 0.7245, 'learning_rate': 2e-05, 'epoch': 1.0}



                                                    t][A
  0%|          | 1/1350 [1:29:15<8:49:28, 23.55s/it]  
 20%|██        | 271/1350 [1:28:28<5:54:02, 19.69s/it][A

{'loss': 1.0476, 'learning_rate': 2e-05, 'epoch': 1.0}



                                                    t][A
  0%|          | 1/1350 [1:29:33<8:49:28, 23.55s/it]  
 20%|██        | 272/1350 [1:28:45<5:42:10, 19.05s/it][A

{'loss': 0.7897, 'learning_rate': 2e-05, 'epoch': 1.01}



                                                    t][A
  0%|          | 1/1350 [1:29:53<8:49:28, 23.55s/it]  
 20%|██        | 273/1350 [1:29:06<5:48:51, 19.43s/it][A

{'loss': 0.8912, 'learning_rate': 2e-05, 'epoch': 1.01}



                                                    t][A
  0%|          | 1/1350 [1:30:13<8:49:28, 23.55s/it]  
 20%|██        | 274/1350 [1:29:25<5:49:13, 19.47s/it][A

{'loss': 1.0383, 'learning_rate': 2e-05, 'epoch': 1.01}



                                                    t][A
  0%|          | 1/1350 [1:30:31<8:49:28, 23.55s/it]  
 20%|██        | 275/1350 [1:29:44<5:42:43, 19.13s/it][A

{'loss': 0.9816, 'learning_rate': 2e-05, 'epoch': 1.02}



                                                    t][A
  0%|          | 1/1350 [1:30:51<8:49:28, 23.55s/it]  
 20%|██        | 276/1350 [1:30:04<5:49:09, 19.51s/it][A

{'loss': 0.8742, 'learning_rate': 2e-05, 'epoch': 1.02}



                                                    t][A
  0%|          | 1/1350 [1:31:10<8:49:28, 23.55s/it]  
 21%|██        | 277/1350 [1:30:23<5:44:07, 19.24s/it][A

{'loss': 0.8141, 'learning_rate': 2e-05, 'epoch': 1.03}



                                                    t][A
  0%|          | 1/1350 [1:31:28<8:49:28, 23.55s/it]  
 21%|██        | 278/1350 [1:30:41<5:38:59, 18.97s/it][A

{'loss': 0.931, 'learning_rate': 2e-05, 'epoch': 1.03}



                                                    t][A
  0%|          | 1/1350 [1:31:48<8:49:28, 23.55s/it]  
 21%|██        | 279/1350 [1:31:00<5:40:00, 19.05s/it][A

{'loss': 0.9368, 'learning_rate': 2e-05, 'epoch': 1.03}



                                                    t][A
  0%|          | 1/1350 [1:32:10<8:49:28, 23.55s/it]  
 21%|██        | 280/1350 [1:31:22<5:55:30, 19.93s/it][A

{'loss': 1.0989, 'learning_rate': 2e-05, 'epoch': 1.04}



                                                    t][A
  0%|          | 1/1350 [1:32:29<8:49:28, 23.55s/it]  
 21%|██        | 281/1350 [1:31:41<5:49:49, 19.63s/it][A

{'loss': 0.9939, 'learning_rate': 2e-05, 'epoch': 1.04}



                                                    t][A
  0%|          | 1/1350 [1:32:50<8:49:28, 23.55s/it]  
 21%|██        | 282/1350 [1:32:03<6:00:50, 20.27s/it][A

{'loss': 1.2596, 'learning_rate': 2e-05, 'epoch': 1.04}



                                                    t][A
  0%|          | 1/1350 [1:33:08<8:49:28, 23.55s/it]  
 21%|██        | 283/1350 [1:32:21<5:47:22, 19.53s/it][A

{'loss': 0.8136, 'learning_rate': 2e-05, 'epoch': 1.05}



                                                    t][A
  0%|          | 1/1350 [1:33:27<8:49:28, 23.55s/it]  
 21%|██        | 284/1350 [1:32:40<5:43:46, 19.35s/it][A

{'loss': 1.0307, 'learning_rate': 2e-05, 'epoch': 1.05}



                                                    t][A
  0%|          | 1/1350 [1:33:45<8:49:28, 23.55s/it]  
 21%|██        | 285/1350 [1:32:58<5:38:27, 19.07s/it][A

{'loss': 0.8622, 'learning_rate': 2e-05, 'epoch': 1.06}



                                                    t][A
  0%|          | 1/1350 [1:34:06<8:49:28, 23.55s/it]  
 21%|██        | 286/1350 [1:33:19<5:46:17, 19.53s/it][A

{'loss': 1.0971, 'learning_rate': 2e-05, 'epoch': 1.06}



                                                    t][A
  0%|          | 1/1350 [1:34:26<8:49:28, 23.55s/it]  
 21%|██▏       | 287/1350 [1:33:39<5:48:49, 19.69s/it][A

{'loss': 0.7586, 'learning_rate': 2e-05, 'epoch': 1.06}



                                                    t][A
  0%|          | 1/1350 [1:34:48<8:49:28, 23.55s/it]  
 21%|██▏       | 288/1350 [1:34:01<6:01:52, 20.45s/it][A

{'loss': 0.9058, 'learning_rate': 2e-05, 'epoch': 1.07}



                                                    t][A
  0%|          | 1/1350 [1:35:07<8:49:28, 23.55s/it]  
 21%|██▏       | 289/1350 [1:34:20<5:53:45, 20.00s/it][A

{'loss': 0.9993, 'learning_rate': 2e-05, 'epoch': 1.07}



                                                    t][A
  0%|          | 1/1350 [1:35:29<8:49:28, 23.55s/it]  
 21%|██▏       | 290/1350 [1:34:42<6:04:31, 20.63s/it][A

{'loss': 1.1998, 'learning_rate': 2e-05, 'epoch': 1.07}



                                                    t][A
  0%|          | 1/1350 [1:35:49<8:49:28, 23.55s/it]  
 22%|██▏       | 291/1350 [1:35:02<5:59:03, 20.34s/it][A

{'loss': 0.8979, 'learning_rate': 2e-05, 'epoch': 1.08}



                                                    t][A
  0%|          | 1/1350 [1:36:11<8:49:28, 23.55s/it]  
 22%|██▏       | 292/1350 [1:35:24<6:08:46, 20.91s/it][A

{'loss': 0.9472, 'learning_rate': 2e-05, 'epoch': 1.08}



                                                    t][A
  0%|          | 1/1350 [1:36:32<8:49:28, 23.55s/it]  
 22%|██▏       | 293/1350 [1:35:45<6:07:19, 20.85s/it][A

{'loss': 1.0445, 'learning_rate': 2e-05, 'epoch': 1.09}



                                                    t][A
  0%|          | 1/1350 [1:36:49<8:49:28, 23.55s/it]  
 22%|██▏       | 294/1350 [1:36:01<5:43:58, 19.54s/it][A

{'loss': 0.8972, 'learning_rate': 2e-05, 'epoch': 1.09}



                                                    t][A
  0%|          | 1/1350 [1:37:07<8:49:28, 23.55s/it]  
 22%|██▏       | 295/1350 [1:36:19<5:36:13, 19.12s/it][A

{'loss': 1.169, 'learning_rate': 2e-05, 'epoch': 1.09}



                                                    t][A
  0%|          | 1/1350 [1:37:26<8:49:28, 23.55s/it]  
 22%|██▏       | 296/1350 [1:36:38<5:36:47, 19.17s/it][A

{'loss': 0.869, 'learning_rate': 2e-05, 'epoch': 1.1}



                                                    t][A
  0%|          | 1/1350 [1:37:44<8:49:28, 23.55s/it]  
 22%|██▏       | 297/1350 [1:36:56<5:28:32, 18.72s/it][A

{'loss': 0.9066, 'learning_rate': 2e-05, 'epoch': 1.1}



                                                    t][A
  0%|          | 1/1350 [1:38:02<8:49:28, 23.55s/it]  
 22%|██▏       | 298/1350 [1:37:15<5:28:25, 18.73s/it][A

{'loss': 0.76, 'learning_rate': 2e-05, 'epoch': 1.1}



                                                    t][A
  0%|          | 1/1350 [1:38:24<8:49:28, 23.55s/it]  
 22%|██▏       | 299/1350 [1:37:36<5:42:30, 19.55s/it][A

{'loss': 0.9721, 'learning_rate': 2e-05, 'epoch': 1.11}



                                                    t][A
  0%|          | 1/1350 [1:38:44<8:49:28, 23.55s/it]  
 22%|██▏       | 300/1350 [1:37:56<5:43:19, 19.62s/it][A

{'loss': 1.203, 'learning_rate': 2e-05, 'epoch': 1.11}



                                                    t][A
  0%|          | 1/1350 [1:39:02<8:49:28, 23.55s/it]  
 22%|██▏       | 301/1350 [1:38:15<5:39:09, 19.40s/it][A

{'loss': 1.179, 'learning_rate': 2e-05, 'epoch': 1.11}



                                                    t][A
  0%|          | 1/1350 [1:39:22<8:49:28, 23.55s/it]  
 22%|██▏       | 302/1350 [1:38:34<5:36:55, 19.29s/it][A

{'loss': 0.7698, 'learning_rate': 2e-05, 'epoch': 1.12}



                                                    t][A
  0%|          | 1/1350 [1:39:43<8:49:28, 23.55s/it]  
 22%|██▏       | 303/1350 [1:38:55<5:46:34, 19.86s/it][A

{'loss': 0.9535, 'learning_rate': 2e-05, 'epoch': 1.12}



                                                    t][A
  0%|          | 1/1350 [1:40:03<8:49:28, 23.55s/it]  
 23%|██▎       | 304/1350 [1:39:15<5:46:56, 19.90s/it][A

{'loss': 1.0994, 'learning_rate': 2e-05, 'epoch': 1.13}



                                                    t][A
  0%|          | 1/1350 [1:40:23<8:49:28, 23.55s/it]  
 23%|██▎       | 305/1350 [1:39:36<5:49:22, 20.06s/it][A

{'loss': 1.0193, 'learning_rate': 2e-05, 'epoch': 1.13}



                                                    t][A
  0%|          | 1/1350 [1:40:46<8:49:28, 23.55s/it]  
 23%|██▎       | 306/1350 [1:39:58<6:02:32, 20.84s/it][A

{'loss': 1.0694, 'learning_rate': 2e-05, 'epoch': 1.13}



                                                    t][A
  0%|          | 1/1350 [1:41:08<8:49:28, 23.55s/it]  
 23%|██▎       | 307/1350 [1:40:20<6:07:10, 21.12s/it][A

{'loss': 0.9992, 'learning_rate': 2e-05, 'epoch': 1.14}



                                                    t][A
  0%|          | 1/1350 [1:41:26<8:49:28, 23.55s/it]  
 23%|██▎       | 308/1350 [1:40:38<5:50:37, 20.19s/it][A

{'loss': 0.7329, 'learning_rate': 2e-05, 'epoch': 1.14}



                                                    t][A
  0%|          | 1/1350 [1:41:44<8:49:28, 23.55s/it]  
 23%|██▎       | 309/1350 [1:40:56<5:39:54, 19.59s/it][A

{'loss': 0.9309, 'learning_rate': 2e-05, 'epoch': 1.14}



                                                    t][A
  0%|          | 1/1350 [1:42:02<8:49:28, 23.55s/it]  
 23%|██▎       | 310/1350 [1:41:15<5:32:35, 19.19s/it][A

{'loss': 1.0346, 'learning_rate': 2e-05, 'epoch': 1.15}



                                                    t][A
  0%|          | 1/1350 [1:42:21<8:49:28, 23.55s/it]  
 23%|██▎       | 311/1350 [1:41:34<5:32:45, 19.22s/it][A

{'loss': 0.9296, 'learning_rate': 2e-05, 'epoch': 1.15}



                                                    t][A
  0%|          | 1/1350 [1:42:42<8:49:28, 23.55s/it]  
 23%|██▎       | 312/1350 [1:41:55<5:40:23, 19.68s/it][A

{'loss': 0.998, 'learning_rate': 2e-05, 'epoch': 1.16}



                                                    t][A
  0%|          | 1/1350 [1:43:02<8:49:28, 23.55s/it]  
 23%|██▎       | 313/1350 [1:42:15<5:43:19, 19.86s/it][A

{'loss': 0.8913, 'learning_rate': 2e-05, 'epoch': 1.16}



                                                    t][A
  0%|          | 1/1350 [1:43:20<8:49:28, 23.55s/it]  
 23%|██▎       | 314/1350 [1:42:32<5:29:50, 19.10s/it][A

{'loss': 0.8941, 'learning_rate': 2e-05, 'epoch': 1.16}



                                                    t][A
  0%|          | 1/1350 [1:43:46<8:49:28, 23.55s/it]  
 23%|██▎       | 315/1350 [1:42:58<6:04:15, 21.12s/it][A

{'loss': 1.8265, 'learning_rate': 2e-05, 'epoch': 1.17}



                                                    t][A
  0%|          | 1/1350 [1:44:05<8:49:28, 23.55s/it]  
 23%|██▎       | 316/1350 [1:43:18<5:56:18, 20.68s/it][A

{'loss': 1.0869, 'learning_rate': 2e-05, 'epoch': 1.17}



                                                    t][A
  0%|          | 1/1350 [1:44:22<8:49:28, 23.55s/it]  
 23%|██▎       | 317/1350 [1:43:35<5:38:01, 19.63s/it][A

{'loss': 0.8422, 'learning_rate': 2e-05, 'epoch': 1.17}



                                                    t][A
  0%|          | 1/1350 [1:44:39<8:49:28, 23.55s/it]  
 24%|██▎       | 318/1350 [1:43:52<5:24:51, 18.89s/it][A

{'loss': 0.8418, 'learning_rate': 2e-05, 'epoch': 1.18}



                                                    t][A
  0%|          | 1/1350 [1:45:00<8:49:28, 23.55s/it]  
 24%|██▎       | 319/1350 [1:44:12<5:32:40, 19.36s/it][A

{'loss': 0.9221, 'learning_rate': 2e-05, 'epoch': 1.18}



                                                    t][A
  0%|          | 1/1350 [1:45:22<8:49:28, 23.55s/it]  
 24%|██▎       | 320/1350 [1:44:34<5:45:13, 20.11s/it][A

{'loss': 1.2065, 'learning_rate': 2e-05, 'epoch': 1.19}



                                                    t][A
  0%|          | 1/1350 [1:45:40<8:49:28, 23.55s/it]  
 24%|██▍       | 321/1350 [1:44:52<5:32:45, 19.40s/it][A

{'loss': 1.0972, 'learning_rate': 2e-05, 'epoch': 1.19}



                                                    t][A
  0%|          | 1/1350 [1:45:58<8:49:28, 23.55s/it]  
 24%|██▍       | 322/1350 [1:45:11<5:27:31, 19.12s/it][A

{'loss': 1.0011, 'learning_rate': 2e-05, 'epoch': 1.19}



                                                    t][A
  0%|          | 1/1350 [1:46:16<8:49:28, 23.55s/it]  
 24%|██▍       | 323/1350 [1:45:28<5:21:08, 18.76s/it][A

{'loss': 0.6553, 'learning_rate': 2e-05, 'epoch': 1.2}



                                                    t][A
  0%|          | 1/1350 [1:46:38<8:49:28, 23.55s/it]  
 24%|██▍       | 324/1350 [1:45:50<5:37:22, 19.73s/it][A

{'loss': 0.889, 'learning_rate': 2e-05, 'epoch': 1.2}



                                                    t][A
  0%|          | 1/1350 [1:46:58<8:49:28, 23.55s/it]  
 24%|██▍       | 325/1350 [1:46:11<5:40:16, 19.92s/it][A

{'loss': 0.9633, 'learning_rate': 2e-05, 'epoch': 1.2}



                                                    t][A
  0%|          | 1/1350 [1:47:16<8:49:28, 23.55s/it]  
 24%|██▍       | 326/1350 [1:46:29<5:28:41, 19.26s/it][A

{'loss': 0.9174, 'learning_rate': 2e-05, 'epoch': 1.21}



                                                    t][A
  0%|          | 1/1350 [1:47:39<8:49:28, 23.55s/it]  
 24%|██▍       | 327/1350 [1:46:52<5:48:23, 20.43s/it][A

{'loss': 1.2632, 'learning_rate': 2e-05, 'epoch': 1.21}



                                                    t][A
  0%|          | 1/1350 [1:47:59<8:49:28, 23.55s/it]  
 24%|██▍       | 328/1350 [1:47:11<5:43:47, 20.18s/it][A

{'loss': 0.8784, 'learning_rate': 2e-05, 'epoch': 1.21}



                                                    t][A
  0%|          | 1/1350 [1:48:19<8:49:28, 23.55s/it]  
 24%|██▍       | 329/1350 [1:47:32<5:45:59, 20.33s/it][A

{'loss': 0.7619, 'learning_rate': 2e-05, 'epoch': 1.22}



                                                    t][A
  0%|          | 1/1350 [1:48:38<8:49:28, 23.55s/it]  
 24%|██▍       | 330/1350 [1:47:51<5:38:16, 19.90s/it][A

{'loss': 0.7578, 'learning_rate': 2e-05, 'epoch': 1.22}



                                                    t][A
  0%|          | 1/1350 [1:48:58<8:49:28, 23.55s/it]  
 25%|██▍       | 331/1350 [1:48:10<5:35:20, 19.75s/it][A

{'loss': 0.8881, 'learning_rate': 2e-05, 'epoch': 1.23}



                                                    t][A
  0%|          | 1/1350 [1:49:21<8:49:28, 23.55s/it]  
 25%|██▍       | 332/1350 [1:48:33<5:50:39, 20.67s/it][A

{'loss': 0.9224, 'learning_rate': 2e-05, 'epoch': 1.23}



                                                    t][A
  0%|          | 1/1350 [1:49:40<8:49:28, 23.55s/it]  
 25%|██▍       | 333/1350 [1:48:53<5:44:28, 20.32s/it][A

{'loss': 1.0281, 'learning_rate': 2e-05, 'epoch': 1.23}



                                                    t][A
  0%|          | 1/1350 [1:49:59<8:49:28, 23.55s/it]  
 25%|██▍       | 334/1350 [1:49:11<5:34:34, 19.76s/it][A

{'loss': 1.1359, 'learning_rate': 2e-05, 'epoch': 1.24}



                                                    t][A
  0%|          | 1/1350 [1:50:17<8:49:28, 23.55s/it]  
 25%|██▍       | 335/1350 [1:49:29<5:27:23, 19.35s/it][A

{'loss': 1.1873, 'learning_rate': 2e-05, 'epoch': 1.24}



                                                    t][A
  0%|          | 1/1350 [1:50:36<8:49:28, 23.55s/it]  
 25%|██▍       | 336/1350 [1:49:49<5:27:04, 19.35s/it][A

{'loss': 0.9966, 'learning_rate': 2e-05, 'epoch': 1.24}



                                                    t][A
  0%|          | 1/1350 [1:50:57<8:49:28, 23.55s/it]  
 25%|██▍       | 337/1350 [1:50:09<5:32:21, 19.69s/it][A

{'loss': 0.9993, 'learning_rate': 2e-05, 'epoch': 1.25}



                                                    t][A
  0%|          | 1/1350 [1:51:18<8:49:28, 23.55s/it]  
 25%|██▌       | 338/1350 [1:50:30<5:37:57, 20.04s/it][A

{'loss': 0.943, 'learning_rate': 2e-05, 'epoch': 1.25}



                                                    t][A
  0%|          | 1/1350 [1:51:35<8:49:28, 23.55s/it]  
 25%|██▌       | 339/1350 [1:50:48<5:26:06, 19.35s/it][A

{'loss': 1.0904, 'learning_rate': 2e-05, 'epoch': 1.26}



                                                    t][A
  0%|          | 1/1350 [1:51:57<8:49:28, 23.55s/it]  
 25%|██▌       | 340/1350 [1:51:09<5:35:26, 19.93s/it][A

{'loss': 1.145, 'learning_rate': 2e-05, 'epoch': 1.26}



                                                    t][A
  0%|          | 1/1350 [1:52:17<8:49:28, 23.55s/it]  
 25%|██▌       | 341/1350 [1:51:29<5:36:52, 20.03s/it][A

{'loss': 0.9512, 'learning_rate': 2e-05, 'epoch': 1.26}



                                                    t][A
  0%|          | 1/1350 [1:52:40<8:49:28, 23.55s/it]  
 25%|██▌       | 342/1350 [1:51:52<5:49:40, 20.81s/it][A

{'loss': 1.0715, 'learning_rate': 2e-05, 'epoch': 1.27}



                                                    t][A
  0%|          | 1/1350 [1:52:59<8:49:28, 23.55s/it]  
 25%|██▌       | 343/1350 [1:52:12<5:44:00, 20.50s/it][A

{'loss': 1.1505, 'learning_rate': 2e-05, 'epoch': 1.27}



                                                    t][A
  0%|          | 1/1350 [1:53:17<8:49:28, 23.55s/it]  
 25%|██▌       | 344/1350 [1:52:29<5:29:27, 19.65s/it][A

{'loss': 0.8151, 'learning_rate': 2e-05, 'epoch': 1.27}



                                                    t][A
  0%|          | 1/1350 [1:53:36<8:49:28, 23.55s/it]  
 26%|██▌       | 345/1350 [1:52:49<5:27:41, 19.56s/it][A

{'loss': 0.9655, 'learning_rate': 2e-05, 'epoch': 1.28}



                                                    t][A
  0%|          | 1/1350 [1:53:56<8:49:28, 23.55s/it]  
 26%|██▌       | 346/1350 [1:53:08<5:26:08, 19.49s/it][A

{'loss': 0.6346, 'learning_rate': 2e-05, 'epoch': 1.28}



                                                    t][A
  0%|          | 1/1350 [1:54:14<8:49:28, 23.55s/it]  
 26%|██▌       | 347/1350 [1:53:26<5:17:35, 19.00s/it][A

{'loss': 1.1487, 'learning_rate': 2e-05, 'epoch': 1.29}



                                                    t][A
  0%|          | 1/1350 [1:54:36<8:49:28, 23.55s/it]  
 26%|██▌       | 348/1350 [1:53:48<5:32:22, 19.90s/it][A

{'loss': 1.03, 'learning_rate': 2e-05, 'epoch': 1.29}



                                                    t][A
  0%|          | 1/1350 [1:54:55<8:49:28, 23.55s/it]  
 26%|██▌       | 349/1350 [1:54:08<5:31:12, 19.85s/it][A

{'loss': 0.9564, 'learning_rate': 2e-05, 'epoch': 1.29}



                                                    t][A
  0%|          | 1/1350 [1:55:14<8:49:28, 23.55s/it]  
 26%|██▌       | 350/1350 [1:54:27<5:26:54, 19.61s/it][A

{'loss': 1.0754, 'learning_rate': 2e-05, 'epoch': 1.3}



                                                    t][A
  0%|          | 1/1350 [1:55:32<8:49:28, 23.55s/it]  
 26%|██▌       | 351/1350 [1:54:45<5:17:29, 19.07s/it][A

{'loss': 0.8478, 'learning_rate': 2e-05, 'epoch': 1.3}



                                                    t][A
  0%|          | 1/1350 [1:55:52<8:49:28, 23.55s/it]  
 26%|██▌       | 352/1350 [1:55:04<5:20:45, 19.28s/it][A

{'loss': 1.1766, 'learning_rate': 2e-05, 'epoch': 1.3}



                                                    t][A
  0%|          | 1/1350 [1:56:12<8:49:28, 23.55s/it]  
 26%|██▌       | 353/1350 [1:55:24<5:24:02, 19.50s/it][A

{'loss': 1.6071, 'learning_rate': 2e-05, 'epoch': 1.31}



                                                    t][A
  0%|          | 1/1350 [1:56:30<8:49:28, 23.55s/it]  
 26%|██▌       | 354/1350 [1:55:42<5:16:28, 19.07s/it][A

{'loss': 0.7738, 'learning_rate': 2e-05, 'epoch': 1.31}



                                                    t][A
  0%|          | 1/1350 [1:56:49<8:49:28, 23.55s/it]  
 26%|██▋       | 355/1350 [1:56:02<5:16:10, 19.07s/it][A

{'loss': 1.0837, 'learning_rate': 2e-05, 'epoch': 1.31}



                                                    t][A
  0%|          | 1/1350 [1:57:12<8:49:28, 23.55s/it]  
 26%|██▋       | 356/1350 [1:56:24<5:33:45, 20.15s/it][A

{'loss': 1.1472, 'learning_rate': 2e-05, 'epoch': 1.32}



                                                    t][A
  0%|          | 1/1350 [1:57:28<8:49:28, 23.55s/it]  
 26%|██▋       | 357/1350 [1:56:40<5:14:17, 18.99s/it][A

{'loss': 0.8948, 'learning_rate': 2e-05, 'epoch': 1.32}



                                                    t][A
  0%|          | 1/1350 [1:57:47<8:49:28, 23.55s/it]  
 27%|██▋       | 358/1350 [1:57:00<5:16:07, 19.12s/it][A

{'loss': 0.6962, 'learning_rate': 2e-05, 'epoch': 1.33}



                                                    t][A
  0%|          | 1/1350 [1:58:04<8:49:28, 23.55s/it]  
 27%|██▋       | 359/1350 [1:57:17<5:03:57, 18.40s/it][A

{'loss': 0.8765, 'learning_rate': 2e-05, 'epoch': 1.33}



                                                    t][A
  0%|          | 1/1350 [1:58:23<8:49:28, 23.55s/it]  
 27%|██▋       | 360/1350 [1:57:36<5:07:35, 18.64s/it][A

{'loss': 0.6898, 'learning_rate': 2e-05, 'epoch': 1.33}



                                                    t][A
  0%|          | 1/1350 [1:58:50<8:49:28, 23.55s/it]  
 27%|██▋       | 361/1350 [1:58:02<5:45:10, 20.94s/it][A

{'loss': 1.2451, 'learning_rate': 2e-05, 'epoch': 1.34}



                                                    t][A
  0%|          | 1/1350 [1:59:08<8:49:28, 23.55s/it]  
 27%|██▋       | 362/1350 [1:58:20<5:30:26, 20.07s/it][A

{'loss': 0.7218, 'learning_rate': 2e-05, 'epoch': 1.34}



                                                    t][A
  0%|          | 1/1350 [1:59:27<8:49:28, 23.55s/it]  
 27%|██▋       | 363/1350 [1:58:40<5:27:30, 19.91s/it][A

{'loss': 0.9488, 'learning_rate': 2e-05, 'epoch': 1.34}



                                                    t][A
  0%|          | 1/1350 [1:59:47<8:49:28, 23.55s/it]  
 27%|██▋       | 364/1350 [1:59:00<5:27:04, 19.90s/it][A

{'loss': 0.6982, 'learning_rate': 2e-05, 'epoch': 1.35}



                                                    t][A
  0%|          | 1/1350 [2:00:08<8:49:28, 23.55s/it]  
 27%|██▋       | 365/1350 [1:59:21<5:34:03, 20.35s/it][A

{'loss': 0.8331, 'learning_rate': 2e-05, 'epoch': 1.35}



                                                    t][A
  0%|          | 1/1350 [2:00:27<8:49:28, 23.55s/it]  
 27%|██▋       | 366/1350 [1:59:40<5:26:02, 19.88s/it][A

{'loss': 0.983, 'learning_rate': 2e-05, 'epoch': 1.36}



                                                    t][A
  0%|          | 1/1350 [2:00:46<8:49:28, 23.55s/it]  
 27%|██▋       | 367/1350 [1:59:58<5:18:23, 19.43s/it][A

{'loss': 0.8535, 'learning_rate': 2e-05, 'epoch': 1.36}



                                                    t][A
  0%|          | 1/1350 [2:01:05<8:49:28, 23.55s/it]  
 27%|██▋       | 368/1350 [2:00:18<5:18:52, 19.48s/it][A

{'loss': 0.9559, 'learning_rate': 2e-05, 'epoch': 1.36}



                                                    t][A
  0%|          | 1/1350 [2:01:25<8:49:28, 23.55s/it]  
 27%|██▋       | 369/1350 [2:00:37<5:17:38, 19.43s/it][A

{'loss': 0.894, 'learning_rate': 2e-05, 'epoch': 1.37}



                                                    t][A
  0%|          | 1/1350 [2:01:43<8:49:28, 23.55s/it]  
 27%|██▋       | 370/1350 [2:00:55<5:11:42, 19.08s/it][A

{'loss': 1.0305, 'learning_rate': 2e-05, 'epoch': 1.37}



                                                    t][A
  0%|          | 1/1350 [2:02:05<8:49:28, 23.55s/it]  
 27%|██▋       | 371/1350 [2:01:18<5:26:44, 20.03s/it][A

{'loss': 1.0876, 'learning_rate': 2e-05, 'epoch': 1.37}



                                                    t][A
  0%|          | 1/1350 [2:02:25<8:49:28, 23.55s/it]  
 28%|██▊       | 372/1350 [2:01:38<5:26:49, 20.05s/it][A

{'loss': 0.9982, 'learning_rate': 2e-05, 'epoch': 1.38}



                                                    t][A
  0%|          | 1/1350 [2:02:46<8:49:28, 23.55s/it]  
 28%|██▊       | 373/1350 [2:01:59<5:32:23, 20.41s/it][A

{'loss': 0.9228, 'learning_rate': 2e-05, 'epoch': 1.38}



                                                    t][A
  0%|          | 1/1350 [2:03:07<8:49:28, 23.55s/it]  
 28%|██▊       | 374/1350 [2:02:19<5:32:27, 20.44s/it][A

{'loss': 0.9694, 'learning_rate': 2e-05, 'epoch': 1.39}



                                                    t][A
  0%|          | 1/1350 [2:03:26<8:49:28, 23.55s/it]  
 28%|██▊       | 375/1350 [2:02:38<5:23:59, 19.94s/it][A

{'loss': 0.9067, 'learning_rate': 2e-05, 'epoch': 1.39}



                                                    t][A
  0%|          | 1/1350 [2:03:47<8:49:28, 23.55s/it]  
 28%|██▊       | 376/1350 [2:03:00<5:30:42, 20.37s/it][A

{'loss': 1.2304, 'learning_rate': 2e-05, 'epoch': 1.39}



                                                    t][A
  0%|          | 1/1350 [2:04:05<8:49:28, 23.55s/it]  
 28%|██▊       | 377/1350 [2:03:18<5:18:57, 19.67s/it][A

{'loss': 0.8335, 'learning_rate': 2e-05, 'epoch': 1.4}



                                                    t][A
  0%|          | 1/1350 [2:04:26<8:49:28, 23.55s/it]  
 28%|██▊       | 378/1350 [2:03:39<5:25:36, 20.10s/it][A

{'loss': 0.9534, 'learning_rate': 2e-05, 'epoch': 1.4}



                                                    t][A
  0%|          | 1/1350 [2:04:45<8:49:28, 23.55s/it]  
 28%|██▊       | 379/1350 [2:03:58<5:20:40, 19.81s/it][A

{'loss': 0.7256, 'learning_rate': 2e-05, 'epoch': 1.4}



                                                    t][A
  0%|          | 1/1350 [2:05:04<8:49:28, 23.55s/it]  
 28%|██▊       | 380/1350 [2:04:17<5:15:33, 19.52s/it][A

{'loss': 0.959, 'learning_rate': 2e-05, 'epoch': 1.41}



                                                    t][A
  0%|          | 1/1350 [2:05:23<8:49:28, 23.55s/it]  
 28%|██▊       | 381/1350 [2:04:35<5:11:13, 19.27s/it][A

{'loss': 1.0742, 'learning_rate': 2e-05, 'epoch': 1.41}



                                                    t][A
  0%|          | 1/1350 [2:05:46<8:49:28, 23.55s/it]  
 28%|██▊       | 382/1350 [2:04:58<5:28:24, 20.36s/it][A

{'loss': 0.6539, 'learning_rate': 2e-05, 'epoch': 1.41}



                                                    t][A
  0%|          | 1/1350 [2:06:05<8:49:28, 23.55s/it]  
 28%|██▊       | 383/1350 [2:05:17<5:20:55, 19.91s/it][A

{'loss': 0.8127, 'learning_rate': 2e-05, 'epoch': 1.42}



                                                    t][A
  0%|          | 1/1350 [2:06:25<8:49:28, 23.55s/it]  
 28%|██▊       | 384/1350 [2:05:37<5:20:58, 19.94s/it][A

{'loss': 1.1013, 'learning_rate': 2e-05, 'epoch': 1.42}



                                                    t][A
  0%|          | 1/1350 [2:06:43<8:49:28, 23.55s/it]  
 29%|██▊       | 385/1350 [2:05:55<5:11:53, 19.39s/it][A

{'loss': 0.9003, 'learning_rate': 2e-05, 'epoch': 1.43}



                                                    t][A
  0%|          | 1/1350 [2:07:02<8:49:28, 23.55s/it]  
 29%|██▊       | 386/1350 [2:06:15<5:12:10, 19.43s/it][A

{'loss': 0.8804, 'learning_rate': 2e-05, 'epoch': 1.43}



                                                    t][A
  0%|          | 1/1350 [2:07:21<8:49:28, 23.55s/it]  
 29%|██▊       | 387/1350 [2:06:34<5:09:31, 19.29s/it][A

{'loss': 0.9523, 'learning_rate': 2e-05, 'epoch': 1.43}



                                                    t][A
  0%|          | 1/1350 [2:07:43<8:49:28, 23.55s/it]  
 29%|██▊       | 388/1350 [2:06:55<5:18:47, 19.88s/it][A

{'loss': 0.8938, 'learning_rate': 2e-05, 'epoch': 1.44}



                                                    t][A
  0%|          | 1/1350 [2:08:02<8:49:28, 23.55s/it]  
 29%|██▉       | 389/1350 [2:07:15<5:18:18, 19.87s/it][A

{'loss': 0.9994, 'learning_rate': 2e-05, 'epoch': 1.44}



                                                    t][A
  0%|          | 1/1350 [2:08:19<8:49:28, 23.55s/it]  
 29%|██▉       | 390/1350 [2:07:32<5:03:54, 18.99s/it][A

{'loss': 0.8905, 'learning_rate': 2e-05, 'epoch': 1.44}



                                                    t][A
  0%|          | 1/1350 [2:08:39<8:49:28, 23.55s/it]  
 29%|██▉       | 391/1350 [2:07:51<5:05:59, 19.14s/it][A

{'loss': 0.9337, 'learning_rate': 2e-05, 'epoch': 1.45}



                                                    t][A
  0%|          | 1/1350 [2:09:00<8:49:28, 23.55s/it]  
 29%|██▉       | 392/1350 [2:08:12<5:15:12, 19.74s/it][A

{'loss': 0.9561, 'learning_rate': 2e-05, 'epoch': 1.45}



                                                    t][A
  0%|          | 1/1350 [2:09:22<8:49:28, 23.55s/it]  
 29%|██▉       | 393/1350 [2:08:34<5:24:09, 20.32s/it][A

{'loss': 0.9915, 'learning_rate': 2e-05, 'epoch': 1.46}



                                                    t][A
  0%|          | 1/1350 [2:09:41<8:49:28, 23.55s/it]  
 29%|██▉       | 394/1350 [2:08:54<5:21:14, 20.16s/it][A

{'loss': 1.1285, 'learning_rate': 2e-05, 'epoch': 1.46}



                                                    t][A
  0%|          | 1/1350 [2:10:00<8:49:28, 23.55s/it]  
 29%|██▉       | 395/1350 [2:09:12<5:12:01, 19.60s/it][A

{'loss': 0.8703, 'learning_rate': 2e-05, 'epoch': 1.46}



                                                    t][A
  0%|          | 1/1350 [2:10:19<8:49:28, 23.55s/it]  
 29%|██▉       | 396/1350 [2:09:31<5:09:39, 19.47s/it][A

{'loss': 0.9535, 'learning_rate': 2e-05, 'epoch': 1.47}



                                                    t][A
  0%|          | 1/1350 [2:10:41<8:49:28, 23.55s/it]  
 29%|██▉       | 397/1350 [2:09:54<5:22:11, 20.28s/it][A

{'loss': 1.166, 'learning_rate': 2e-05, 'epoch': 1.47}



                                                    t][A
  0%|          | 1/1350 [2:10:59<8:49:28, 23.55s/it]  
 29%|██▉       | 398/1350 [2:10:12<5:12:17, 19.68s/it][A

{'loss': 0.6918, 'learning_rate': 2e-05, 'epoch': 1.47}



                                                    t][A
  0%|          | 1/1350 [2:11:19<8:49:28, 23.55s/it]  
 30%|██▉       | 399/1350 [2:10:32<5:12:54, 19.74s/it][A

{'loss': 0.9731, 'learning_rate': 2e-05, 'epoch': 1.48}



                                                    t][A
  0%|          | 1/1350 [2:11:39<8:49:28, 23.55s/it]  
 30%|██▉       | 400/1350 [2:10:52<5:14:48, 19.88s/it][A

{'loss': 0.9348, 'learning_rate': 2e-05, 'epoch': 1.48}



                                                    t][A
  0%|          | 1/1350 [2:11:57<8:49:28, 23.55s/it]  
 30%|██▉       | 401/1350 [2:11:10<5:05:04, 19.29s/it][A

{'loss': 1.0799, 'learning_rate': 2e-05, 'epoch': 1.49}



                                                    t][A
  0%|          | 1/1350 [2:12:16<8:49:28, 23.55s/it]  
 30%|██▉       | 402/1350 [2:11:28<5:01:08, 19.06s/it][A

{'loss': 0.8805, 'learning_rate': 2e-05, 'epoch': 1.49}



                                                    t][A
  0%|          | 1/1350 [2:12:36<8:49:28, 23.55s/it]  
 30%|██▉       | 403/1350 [2:11:49<5:07:31, 19.48s/it][A

{'loss': 0.7519, 'learning_rate': 2e-05, 'epoch': 1.49}



                                                    t][A
  0%|          | 1/1350 [2:12:56<8:49:28, 23.55s/it]  
 30%|██▉       | 404/1350 [2:12:08<5:06:11, 19.42s/it][A

{'loss': 0.8837, 'learning_rate': 2e-05, 'epoch': 1.5}



                                                    t][A
  0%|          | 1/1350 [2:13:21<8:49:28, 23.55s/it]  
 30%|███       | 405/1350 [2:12:33<5:33:39, 21.18s/it][A

{'loss': 1.3375, 'learning_rate': 2e-05, 'epoch': 1.5}



                                                    t][A
  0%|          | 1/1350 [2:13:38<8:49:28, 23.55s/it]  
 30%|███       | 406/1350 [2:12:51<5:14:20, 19.98s/it][A

{'loss': 1.0224, 'learning_rate': 2e-05, 'epoch': 1.5}



                                                    t][A
  0%|          | 1/1350 [2:13:57<8:49:28, 23.55s/it]  
 30%|███       | 407/1350 [2:13:10<5:11:03, 19.79s/it][A

{'loss': 1.0625, 'learning_rate': 2e-05, 'epoch': 1.51}



                                                    t][A
  0%|          | 1/1350 [2:14:17<8:49:28, 23.55s/it]  
 30%|███       | 408/1350 [2:13:29<5:07:47, 19.60s/it][A

{'loss': 1.1804, 'learning_rate': 2e-05, 'epoch': 1.51}



                                                    t][A
  0%|          | 1/1350 [2:14:35<8:49:28, 23.55s/it]  
 30%|███       | 409/1350 [2:13:48<5:03:33, 19.36s/it][A

{'loss': 0.8016, 'learning_rate': 2e-05, 'epoch': 1.51}



                                                    t][A
  0%|          | 1/1350 [2:14:53<8:49:28, 23.55s/it]  
 30%|███       | 410/1350 [2:14:05<4:55:01, 18.83s/it][A

{'loss': 0.8629, 'learning_rate': 2e-05, 'epoch': 1.52}



                                                    t][A
  0%|          | 1/1350 [2:15:14<8:49:28, 23.55s/it]  
 30%|███       | 411/1350 [2:14:27<5:05:17, 19.51s/it][A

{'loss': 0.8826, 'learning_rate': 2e-05, 'epoch': 1.52}



                                                    t][A
  0%|          | 1/1350 [2:15:37<8:49:28, 23.55s/it]  
 31%|███       | 412/1350 [2:14:49<5:20:36, 20.51s/it][A

{'loss': 0.9419, 'learning_rate': 2e-05, 'epoch': 1.53}



                                                    t][A
  0%|          | 1/1350 [2:15:54<8:49:28, 23.55s/it]  
 31%|███       | 413/1350 [2:15:07<5:05:53, 19.59s/it][A

{'loss': 0.9025, 'learning_rate': 2e-05, 'epoch': 1.53}



                                                    t][A
  0%|          | 1/1350 [2:16:13<8:49:28, 23.55s/it]  
 31%|███       | 414/1350 [2:15:26<5:02:00, 19.36s/it][A

{'loss': 1.0793, 'learning_rate': 2e-05, 'epoch': 1.53}



                                                    t][A
  0%|          | 1/1350 [2:16:34<8:49:28, 23.55s/it]  
 31%|███       | 415/1350 [2:15:47<5:08:53, 19.82s/it][A

{'loss': 1.0183, 'learning_rate': 2e-05, 'epoch': 1.54}



                                                    t][A
  0%|          | 1/1350 [2:16:54<8:49:28, 23.55s/it]  
 31%|███       | 416/1350 [2:16:07<5:09:36, 19.89s/it][A

{'loss': 0.8291, 'learning_rate': 2e-05, 'epoch': 1.54}



                                                    t][A
  0%|          | 1/1350 [2:17:14<8:49:28, 23.55s/it]  
 31%|███       | 417/1350 [2:16:26<5:08:45, 19.86s/it][A

{'loss': 1.2048, 'learning_rate': 2e-05, 'epoch': 1.54}



                                                    t][A
  0%|          | 1/1350 [2:17:33<8:49:28, 23.55s/it]  
 31%|███       | 418/1350 [2:16:46<5:06:16, 19.72s/it][A

{'loss': 0.986, 'learning_rate': 2e-05, 'epoch': 1.55}



                                                    t][A
  0%|          | 1/1350 [2:17:54<8:49:28, 23.55s/it]  
 31%|███       | 419/1350 [2:17:07<5:11:17, 20.06s/it][A

{'loss': 0.845, 'learning_rate': 2e-05, 'epoch': 1.55}



                                                    t][A
  0%|          | 1/1350 [2:18:15<8:49:28, 23.55s/it]  
 31%|███       | 420/1350 [2:17:27<5:14:29, 20.29s/it][A

{'loss': 1.0042, 'learning_rate': 2e-05, 'epoch': 1.56}



                                                    t][A
  0%|          | 1/1350 [2:18:35<8:49:28, 23.55s/it]  
 31%|███       | 421/1350 [2:17:47<5:12:39, 20.19s/it][A

{'loss': 0.8828, 'learning_rate': 2e-05, 'epoch': 1.56}



                                                    t][A
  0%|          | 1/1350 [2:18:55<8:49:28, 23.55s/it]  
 31%|███▏      | 422/1350 [2:18:07<5:10:37, 20.08s/it][A

{'loss': 0.9288, 'learning_rate': 2e-05, 'epoch': 1.56}



                                                    t][A
  0%|          | 1/1350 [2:19:14<8:49:28, 23.55s/it]  
 31%|███▏      | 423/1350 [2:18:26<5:05:36, 19.78s/it][A

{'loss': 0.7054, 'learning_rate': 2e-05, 'epoch': 1.57}



                                                    t][A
  0%|          | 1/1350 [2:19:34<8:49:28, 23.55s/it]  
 31%|███▏      | 424/1350 [2:18:47<5:07:41, 19.94s/it][A

{'loss': 0.7633, 'learning_rate': 2e-05, 'epoch': 1.57}



                                                    t][A
  0%|          | 1/1350 [2:19:51<8:49:28, 23.55s/it]  
 31%|███▏      | 425/1350 [2:19:04<4:54:19, 19.09s/it][A

{'loss': 0.7754, 'learning_rate': 2e-05, 'epoch': 1.57}



                                                    t][A
  0%|          | 1/1350 [2:20:14<8:49:28, 23.55s/it]  
 32%|███▏      | 426/1350 [2:19:26<5:09:23, 20.09s/it][A

{'loss': 0.8997, 'learning_rate': 2e-05, 'epoch': 1.58}



                                                    t][A
  0%|          | 1/1350 [2:20:34<8:49:28, 23.55s/it]  
 32%|███▏      | 427/1350 [2:19:47<5:11:26, 20.25s/it][A

{'loss': 1.0507, 'learning_rate': 2e-05, 'epoch': 1.58}



                                                    t][A
  0%|          | 1/1350 [2:20:54<8:49:28, 23.55s/it]  
 32%|███▏      | 428/1350 [2:20:07<5:09:22, 20.13s/it][A

{'loss': 1.0435, 'learning_rate': 2e-05, 'epoch': 1.59}



                                                    t][A
  0%|          | 1/1350 [2:21:13<8:49:28, 23.55s/it]  
 32%|███▏      | 429/1350 [2:20:26<5:04:18, 19.82s/it][A

{'loss': 0.6594, 'learning_rate': 2e-05, 'epoch': 1.59}



                                                    t][A
  0%|          | 1/1350 [2:21:32<8:49:28, 23.55s/it]  
 32%|███▏      | 430/1350 [2:20:44<4:56:48, 19.36s/it][A

{'loss': 0.7741, 'learning_rate': 2e-05, 'epoch': 1.59}



                                                    t][A
  0%|          | 1/1350 [2:21:50<8:49:28, 23.55s/it]  
 32%|███▏      | 431/1350 [2:21:02<4:51:37, 19.04s/it][A

{'loss': 1.1059, 'learning_rate': 2e-05, 'epoch': 1.6}



                                                    t][A
  0%|          | 1/1350 [2:22:10<8:49:28, 23.55s/it]  
 32%|███▏      | 432/1350 [2:21:23<4:56:54, 19.41s/it][A

{'loss': 0.7822, 'learning_rate': 2e-05, 'epoch': 1.6}



                                                    t][A
  0%|          | 1/1350 [2:22:28<8:49:28, 23.55s/it]  
 32%|███▏      | 433/1350 [2:21:41<4:50:31, 19.01s/it][A

{'loss': 0.8304, 'learning_rate': 2e-05, 'epoch': 1.6}



                                                    t][A
  0%|          | 1/1350 [2:22:47<8:49:28, 23.55s/it]  
 32%|███▏      | 434/1350 [2:21:59<4:49:13, 18.94s/it][A

{'loss': 0.952, 'learning_rate': 2e-05, 'epoch': 1.61}



                                                    t][A
  0%|          | 1/1350 [2:23:08<8:49:28, 23.55s/it]  
 32%|███▏      | 435/1350 [2:22:20<4:58:19, 19.56s/it][A

{'loss': 0.9755, 'learning_rate': 2e-05, 'epoch': 1.61}



                                                    t][A
  0%|          | 1/1350 [2:23:29<8:49:28, 23.55s/it]  
 32%|███▏      | 436/1350 [2:22:42<5:05:18, 20.04s/it][A

{'loss': 1.013, 'learning_rate': 2e-05, 'epoch': 1.61}



                                                    t][A
  0%|          | 1/1350 [2:23:47<8:49:28, 23.55s/it]  
 32%|███▏      | 437/1350 [2:22:59<4:54:59, 19.39s/it][A

{'loss': 0.9188, 'learning_rate': 2e-05, 'epoch': 1.62}



                                                    t][A
  0%|          | 1/1350 [2:24:06<8:49:28, 23.55s/it]  
 32%|███▏      | 438/1350 [2:23:18<4:51:00, 19.15s/it][A

{'loss': 1.0722, 'learning_rate': 2e-05, 'epoch': 1.62}



                                                    t][A
  0%|          | 1/1350 [2:24:23<8:49:28, 23.55s/it]  
 33%|███▎      | 439/1350 [2:23:35<4:40:59, 18.51s/it][A

{'loss': 0.8029, 'learning_rate': 2e-05, 'epoch': 1.63}



                                                    t][A
  0%|          | 1/1350 [2:24:46<8:49:28, 23.55s/it]  
 33%|███▎      | 440/1350 [2:23:58<5:01:37, 19.89s/it][A

{'loss': 0.8774, 'learning_rate': 2e-05, 'epoch': 1.63}



                                                    t][A
  0%|          | 1/1350 [2:25:03<8:49:28, 23.55s/it]  
 33%|███▎      | 441/1350 [2:24:15<4:47:45, 18.99s/it][A

{'loss': 0.9157, 'learning_rate': 2e-05, 'epoch': 1.63}



                                                    t][A
  0%|          | 1/1350 [2:25:23<8:49:28, 23.55s/it]  
 33%|███▎      | 442/1350 [2:24:35<4:53:38, 19.40s/it][A

{'loss': 0.9619, 'learning_rate': 2e-05, 'epoch': 1.64}



                                                    t][A
  0%|          | 1/1350 [2:25:41<8:49:28, 23.55s/it]  
 33%|███▎      | 443/1350 [2:24:54<4:48:35, 19.09s/it][A

{'loss': 0.8164, 'learning_rate': 2e-05, 'epoch': 1.64}



                                                    t][A
  0%|          | 1/1350 [2:26:01<8:49:28, 23.55s/it]  
 33%|███▎      | 444/1350 [2:25:13<4:50:21, 19.23s/it][A

{'loss': 0.8753, 'learning_rate': 2e-05, 'epoch': 1.64}



                                                    t][A
  0%|          | 1/1350 [2:26:20<8:49:28, 23.55s/it]  
 33%|███▎      | 445/1350 [2:25:32<4:47:23, 19.05s/it][A

{'loss': 1.1227, 'learning_rate': 2e-05, 'epoch': 1.65}



                                                    t][A
  0%|          | 1/1350 [2:26:39<8:49:28, 23.55s/it]  
 33%|███▎      | 446/1350 [2:25:52<4:50:55, 19.31s/it][A

{'loss': 0.8894, 'learning_rate': 2e-05, 'epoch': 1.65}



                                                    t][A
  0%|          | 1/1350 [2:26:57<8:49:28, 23.55s/it]  
 33%|███▎      | 447/1350 [2:26:10<4:44:28, 18.90s/it][A

{'loss': 0.973, 'learning_rate': 2e-05, 'epoch': 1.66}



                                                    t][A
  0%|          | 1/1350 [2:27:15<8:49:28, 23.55s/it]  
 33%|███▎      | 448/1350 [2:26:28<4:39:58, 18.62s/it][A

{'loss': 0.8647, 'learning_rate': 2e-05, 'epoch': 1.66}



                                                    t][A
  0%|          | 1/1350 [2:27:35<8:49:28, 23.55s/it]  
 33%|███▎      | 449/1350 [2:26:47<4:43:00, 18.85s/it][A

{'loss': 1.0778, 'learning_rate': 2e-05, 'epoch': 1.66}



                                                    t][A
  0%|          | 1/1350 [2:27:55<8:49:28, 23.55s/it]  
 33%|███▎      | 450/1350 [2:27:07<4:46:57, 19.13s/it][A

{'loss': 0.9414, 'learning_rate': 2e-05, 'epoch': 1.67}



                                                    t][A
  0%|          | 1/1350 [2:28:13<8:49:28, 23.55s/it]  
 33%|███▎      | 451/1350 [2:27:26<4:44:34, 18.99s/it][A

{'loss': 0.9415, 'learning_rate': 2e-05, 'epoch': 1.67}



                                                    t][A
  0%|          | 1/1350 [2:28:31<8:49:28, 23.55s/it]  
 33%|███▎      | 452/1350 [2:27:43<4:37:05, 18.51s/it][A

{'loss': 0.9842, 'learning_rate': 2e-05, 'epoch': 1.67}



                                                    t][A
  0%|          | 1/1350 [2:28:52<8:49:28, 23.55s/it]  
 34%|███▎      | 453/1350 [2:28:04<4:47:44, 19.25s/it][A

{'loss': 0.9097, 'learning_rate': 2e-05, 'epoch': 1.68}



                                                    t][A
  0%|          | 1/1350 [2:29:11<8:49:28, 23.55s/it]  

{'loss': 0.9119, 'learning_rate': 2e-05, 'epoch': 1.68}



 34%|███▎      | 454/1350 [2:28:23<4:46:48, 19.21s/it][A
                                                    t][A
  0%|          | 1/1350 [2:29:31<8:49:28, 23.55s/it]  
 34%|███▎      | 455/1350 [2:28:43<4:50:18, 19.46s/it][A

{'loss': 0.905, 'learning_rate': 2e-05, 'epoch': 1.69}



                                                    t][A
  0%|          | 1/1350 [2:29:50<8:49:28, 23.55s/it]  
 34%|███▍      | 456/1350 [2:29:03<4:49:30, 19.43s/it][A

{'loss': 0.7439, 'learning_rate': 2e-05, 'epoch': 1.69}



                                                    t][A
  0%|          | 1/1350 [2:30:07<8:49:28, 23.55s/it]  
 34%|███▍      | 457/1350 [2:29:20<4:39:25, 18.77s/it][A

{'loss': 0.9904, 'learning_rate': 2e-05, 'epoch': 1.69}



                                                    t][A
  0%|          | 1/1350 [2:30:27<8:49:28, 23.55s/it]  
 34%|███▍      | 458/1350 [2:29:39<4:42:02, 18.97s/it][A

{'loss': 0.8771, 'learning_rate': 2e-05, 'epoch': 1.7}



                                                    t][A
  0%|          | 1/1350 [2:30:45<8:49:28, 23.55s/it]  
 34%|███▍      | 459/1350 [2:29:58<4:39:36, 18.83s/it][A

{'loss': 0.9494, 'learning_rate': 2e-05, 'epoch': 1.7}



                                                    t][A
  0%|          | 1/1350 [2:31:04<8:49:28, 23.55s/it]  
 34%|███▍      | 460/1350 [2:30:17<4:41:07, 18.95s/it][A

{'loss': 0.976, 'learning_rate': 2e-05, 'epoch': 1.7}



                                                    t][A
  0%|          | 1/1350 [2:31:25<8:49:28, 23.55s/it]  
 34%|███▍      | 461/1350 [2:30:37<4:47:05, 19.38s/it][A

{'loss': 0.8648, 'learning_rate': 2e-05, 'epoch': 1.71}



                                                    t][A
  0%|          | 1/1350 [2:31:43<8:49:28, 23.55s/it]  
 34%|███▍      | 462/1350 [2:30:55<4:39:35, 18.89s/it][A

{'loss': 0.6113, 'learning_rate': 2e-05, 'epoch': 1.71}



                                                    t][A
  0%|          | 1/1350 [2:32:00<8:49:28, 23.55s/it]  
 34%|███▍      | 463/1350 [2:31:13<4:34:14, 18.55s/it][A

{'loss': 1.0269, 'learning_rate': 2e-05, 'epoch': 1.71}



                                                    t][A
  0%|          | 1/1350 [2:32:22<8:49:28, 23.55s/it]  
 34%|███▍      | 464/1350 [2:31:34<4:46:03, 19.37s/it][A

{'loss': 0.9822, 'learning_rate': 2e-05, 'epoch': 1.72}



                                                    t][A
  0%|          | 1/1350 [2:32:41<8:49:28, 23.55s/it]  
 34%|███▍      | 465/1350 [2:31:54<4:46:13, 19.40s/it][A

{'loss': 0.8202, 'learning_rate': 2e-05, 'epoch': 1.72}



                                                    t][A
  0%|          | 1/1350 [2:33:02<8:49:28, 23.55s/it]  
 35%|███▍      | 466/1350 [2:32:15<4:53:14, 19.90s/it][A

{'loss': 0.8614, 'learning_rate': 2e-05, 'epoch': 1.73}



                                                    t][A
  0%|          | 1/1350 [2:33:22<8:49:28, 23.55s/it]  
 35%|███▍      | 467/1350 [2:32:34<4:52:12, 19.86s/it][A

{'loss': 0.7149, 'learning_rate': 2e-05, 'epoch': 1.73}



                                                    t][A
  0%|          | 1/1350 [2:33:41<8:49:28, 23.55s/it]  
 35%|███▍      | 468/1350 [2:32:54<4:49:43, 19.71s/it][A

{'loss': 1.071, 'learning_rate': 2e-05, 'epoch': 1.73}



                                                    t][A
  0%|          | 1/1350 [2:34:01<8:49:28, 23.55s/it]  
 35%|███▍      | 469/1350 [2:33:13<4:48:50, 19.67s/it][A

{'loss': 0.8317, 'learning_rate': 2e-05, 'epoch': 1.74}



                                                    t][A
  0%|          | 1/1350 [2:34:21<8:49:28, 23.55s/it]  
 35%|███▍      | 470/1350 [2:33:33<4:49:50, 19.76s/it][A

{'loss': 1.089, 'learning_rate': 2e-05, 'epoch': 1.74}



                                                    t][A
  0%|          | 1/1350 [2:34:44<8:49:28, 23.55s/it]  
 35%|███▍      | 471/1350 [2:33:56<5:03:22, 20.71s/it][A

{'loss': 1.6451, 'learning_rate': 2e-05, 'epoch': 1.74}



                                                    t][A
  0%|          | 1/1350 [2:35:01<8:49:28, 23.55s/it]  
 35%|███▍      | 472/1350 [2:34:14<4:49:45, 19.80s/it][A

{'loss': 0.9793, 'learning_rate': 2e-05, 'epoch': 1.75}



                                                    t][A
  0%|          | 1/1350 [2:35:22<8:49:28, 23.55s/it]  
 35%|███▌      | 473/1350 [2:34:34<4:52:22, 20.00s/it][A

{'loss': 0.9791, 'learning_rate': 2e-05, 'epoch': 1.75}



                                                    t][A
  0%|          | 1/1350 [2:35:42<8:49:28, 23.55s/it]  
 35%|███▌      | 474/1350 [2:34:54<4:52:03, 20.00s/it][A

{'loss': 0.8881, 'learning_rate': 2e-05, 'epoch': 1.76}



                                                    t][A
  0%|          | 1/1350 [2:36:00<8:49:28, 23.55s/it]  
 35%|███▌      | 475/1350 [2:35:12<4:42:30, 19.37s/it][A

{'loss': 0.7654, 'learning_rate': 2e-05, 'epoch': 1.76}



                                                    t][A
  0%|          | 1/1350 [2:36:19<8:49:28, 23.55s/it]  
 35%|███▌      | 476/1350 [2:35:31<4:40:48, 19.28s/it][A

{'loss': 0.7818, 'learning_rate': 2e-05, 'epoch': 1.76}



                                                    t][A
  0%|          | 1/1350 [2:36:40<8:49:28, 23.55s/it]  
 35%|███▌      | 477/1350 [2:35:52<4:47:26, 19.76s/it][A

{'loss': 0.8141, 'learning_rate': 2e-05, 'epoch': 1.77}



                                                    t][A
  0%|          | 1/1350 [2:36:58<8:49:28, 23.55s/it]  
 35%|███▌      | 478/1350 [2:36:10<4:38:43, 19.18s/it][A

{'loss': 0.9512, 'learning_rate': 2e-05, 'epoch': 1.77}



                                                    t][A
  0%|          | 1/1350 [2:37:16<8:49:28, 23.55s/it]  
 35%|███▌      | 479/1350 [2:36:29<4:35:53, 19.01s/it][A

{'loss': 1.0581, 'learning_rate': 2e-05, 'epoch': 1.77}



                                                    t][A
  0%|          | 1/1350 [2:37:34<8:49:28, 23.55s/it]  
 36%|███▌      | 480/1350 [2:36:47<4:31:38, 18.73s/it][A

{'loss': 1.0774, 'learning_rate': 2e-05, 'epoch': 1.78}



                                                    t][A
  0%|          | 1/1350 [2:37:53<8:49:28, 23.55s/it]  
 36%|███▌      | 481/1350 [2:37:05<4:30:44, 18.69s/it][A

{'loss': 0.899, 'learning_rate': 2e-05, 'epoch': 1.78}



                                                    t][A
  0%|          | 1/1350 [2:38:13<8:49:28, 23.55s/it]  
 36%|███▌      | 482/1350 [2:37:26<4:36:41, 19.13s/it][A

{'loss': 0.7886, 'learning_rate': 2e-05, 'epoch': 1.79}



                                                    t][A
  0%|          | 1/1350 [2:38:31<8:49:28, 23.55s/it]  
 36%|███▌      | 483/1350 [2:37:44<4:31:30, 18.79s/it][A

{'loss': 0.969, 'learning_rate': 2e-05, 'epoch': 1.79}



                                                    t][A
  0%|          | 1/1350 [2:38:51<8:49:28, 23.55s/it]  
 36%|███▌      | 484/1350 [2:38:04<4:38:18, 19.28s/it][A

{'loss': 0.6556, 'learning_rate': 2e-05, 'epoch': 1.79}



                                                    t][A
  0%|          | 1/1350 [2:39:09<8:49:28, 23.55s/it]  
 36%|███▌      | 485/1350 [2:38:22<4:32:10, 18.88s/it][A

{'loss': 0.9568, 'learning_rate': 2e-05, 'epoch': 1.8}



                                                    t][A
  0%|          | 1/1350 [2:39:31<8:49:28, 23.55s/it]  
 36%|███▌      | 486/1350 [2:38:43<4:43:30, 19.69s/it][A

{'loss': 1.1716, 'learning_rate': 2e-05, 'epoch': 1.8}



                                                    t][A
  0%|          | 1/1350 [2:39:50<8:49:28, 23.55s/it]  
 36%|███▌      | 487/1350 [2:39:03<4:41:50, 19.59s/it][A

{'loss': 1.0293, 'learning_rate': 2e-05, 'epoch': 1.8}



                                                    t][A
  0%|          | 1/1350 [2:40:12<8:49:28, 23.55s/it]  
 36%|███▌      | 488/1350 [2:39:24<4:49:26, 20.15s/it][A

{'loss': 0.6453, 'learning_rate': 2e-05, 'epoch': 1.81}



                                                    t][A
  0%|          | 1/1350 [2:40:32<8:49:28, 23.55s/it]  
 36%|███▌      | 489/1350 [2:39:45<4:49:29, 20.17s/it][A

{'loss': 0.8583, 'learning_rate': 2e-05, 'epoch': 1.81}



                                                    t][A
  0%|          | 1/1350 [2:40:50<8:49:28, 23.55s/it]  
 36%|███▋      | 490/1350 [2:40:03<4:41:11, 19.62s/it][A

{'loss': 0.9042, 'learning_rate': 2e-05, 'epoch': 1.81}



                                                    t][A
  0%|          | 1/1350 [2:41:10<8:49:28, 23.55s/it]  
 36%|███▋      | 491/1350 [2:40:23<4:42:19, 19.72s/it][A

{'loss': 0.8932, 'learning_rate': 2e-05, 'epoch': 1.82}



                                                    t][A
  0%|          | 1/1350 [2:41:27<8:49:28, 23.55s/it]  
 36%|███▋      | 492/1350 [2:40:39<4:28:19, 18.76s/it][A

{'loss': 0.6144, 'learning_rate': 2e-05, 'epoch': 1.82}



                                                    t][A
  0%|          | 1/1350 [2:41:47<8:49:28, 23.55s/it]  
 37%|███▋      | 493/1350 [2:40:59<4:32:22, 19.07s/it][A

{'loss': 0.8905, 'learning_rate': 2e-05, 'epoch': 1.83}



                                                    t][A
  0%|          | 1/1350 [2:42:07<8:49:28, 23.55s/it]  
 37%|███▋      | 494/1350 [2:41:19<4:36:01, 19.35s/it][A

{'loss': 0.9108, 'learning_rate': 2e-05, 'epoch': 1.83}



                                                    t][A
  0%|          | 1/1350 [2:42:27<8:49:28, 23.55s/it]  
 37%|███▋      | 495/1350 [2:41:40<4:41:13, 19.74s/it][A

{'loss': 0.9893, 'learning_rate': 2e-05, 'epoch': 1.83}



                                                    t][A
  0%|          | 1/1350 [2:42:48<8:49:28, 23.55s/it]  
 37%|███▋      | 496/1350 [2:42:00<4:44:40, 20.00s/it][A

{'loss': 0.6814, 'learning_rate': 2e-05, 'epoch': 1.84}



                                                    t][A
  0%|          | 1/1350 [2:43:07<8:49:28, 23.55s/it]  
 37%|███▋      | 497/1350 [2:42:19<4:39:40, 19.67s/it][A

{'loss': 0.7628, 'learning_rate': 2e-05, 'epoch': 1.84}



                                                    t][A
  0%|          | 1/1350 [2:43:26<8:49:28, 23.55s/it]  
 37%|███▋      | 498/1350 [2:42:39<4:37:43, 19.56s/it][A

{'loss': 0.9163, 'learning_rate': 2e-05, 'epoch': 1.84}



                                                    t][A
  0%|          | 1/1350 [2:43:45<8:49:28, 23.55s/it]  
 37%|███▋      | 499/1350 [2:42:58<4:36:44, 19.51s/it][A

{'loss': 0.9997, 'learning_rate': 2e-05, 'epoch': 1.85}



                                                    t][A
  0%|          | 1/1350 [2:44:03<8:49:28, 23.55s/it]  


{'loss': 0.9896, 'learning_rate': 2e-05, 'epoch': 1.85}


AttributeError: 'LlamaForCausalLM' object has no attribute 'save_checkpoint'

In [59]:
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
  0%|          | 1/1350 [4:54:38<6624:21:56, 17678.07s/it]
 37%|███▋      | 500/1350 [4:53:50<8:19:31, 35.26s/it]
  0%|          | 1/1350 [00:23<8:49:10, 23.54s/it]

{'loss': 1.2017, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 2/1350 [00:44<8:20:21, 22.27s/it]

{'loss': 1.0156, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}


KeyboardInterrupt: 

### Try to train from checkpoint

In [40]:
trainer.train(resume_from_checkpoint=True)

ValueError: Can't find a valid checkpoint at output_dir_8k_big_data/checkpoint-500

### Push learn model to hf

In [55]:
from huggingface_hub import login
login("")
model_id = "nvdenisov2002/llama-longLoRA-v2"
peft_model.push_to_hub(model_id)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /tmp/xdg_cache/huggingface/token
Login successful




adapter_model.bin:   0%|          | 0.00/1.08G [00:00<?, ?B/s][A[A

adapter_model.bin:   0%|          | 8.19k/1.08G [00:00<7:19:38, 41.0kB/s][A[A

adapter_model.bin:   0%|          | 336k/1.08G [00:00<14:54, 1.21MB/s]   [A[A

adapter_model.bin:   0%|          | 2.06M/1.08G [00:00<02:59, 6.01MB/s][A[A

adapter_model.bin:   0%|          | 5.27M/1.08G [00:00<01:31, 11.8MB/s][A[A

adapter_model.bin:   1%|          | 8.15M/1.08G [00:00<01:08, 15.7MB/s][A[A

adapter_model.bin:   1%|          | 12.1M/1.08G [00:00<00:50, 21.2MB/s][A[A

adapter_model.bin:   1%|▏         | 16.0M/1.08G [00:01<01:29, 11.9MB/s][A[A

adapter_model.bin:   2%|▏         | 24.4M/1.08G [00:01<00:46, 22.7MB/s][A[A

adapter_model.bin:   3%|▎         | 32.0M/1.08G [00:02<00:56, 18.4MB/s][A[A

adapter_model.bin:   4%|▎         | 40.0M/1.08G [00:02<00:43, 23.8MB/s][A[A

adapter_model.bin:   4%|▍         | 43.5M/1.08G [00:02<00:45, 22.6MB/s][A[A

adapter_model.bin:   4%|▍         | 46.5M/1.08G [00:02<

CommitInfo(commit_url='https://huggingface.co/nvdenisov2002/llama-longLoRA-v2/commit/25186ac0cee59ead03b56ef117cddfcd942b26ad', commit_message='Upload model', commit_description='', oid='25186ac0cee59ead03b56ef117cddfcd942b26ad', pr_url=None, pr_revision=None, pr_num=None)

### Infer learnt model on asessors questions

In [57]:
device='cuda'

In [58]:
# test_df = pd.read_csv("/home/jupyter/mnt/datasets/diplomas/russian_dataset/russian_dataset_test.csv")
# with open(REPOSITOTY_DIR_PATH.joinpath("src/notebooks/junk/mcs_df_human_filled_processed.json"), "r") as f:
#     asessors_questions = json.load(f) 
# ids = [int(x['meta']['id']) for x in asessors_questions]
test_df = pd.read_csv(ARTIFACTS_DIR_PATH.joinpath("diplomas_abstracts/mcs_raw_learnt_abstract.csv"))
diploma_prefix_len=8000

def get_some_model_result(prefix_len, prefix_tokens, some_model):
    some_model.eval()
    generated = some_model.generate(prefix_tokens.reshape((1, -1)).to(device))
    generated_continue = tokenizer.decode(generated.to('cpu').flatten()[prefix_len:])
    return generated_continue

def get_prefix_len_and_tokens(row):
    prompt_input_diploma = PROMPT_DICT["prompt_input_diploma_special"]
    source = prompt_input_diploma.format(input=row["diploma"][:diploma_prefix_len])

    target = f"{row['abstract']}{tokenizer.eos_token}"

    data_dict = preprocess([source], [target], tokenizer)
    
    prefix_len = np.sum(np.array(data_dict["labels"][0]) == IGNORE_INDEX)
    prefix_tokens = data_dict["input_ids"][0][:prefix_len]

    return prefix_len, prefix_tokens

new_rows = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Rows..."):
    new_row = copy.deepcopy(row)
    prefix_len, prefix_tokens = get_prefix_len_and_tokens(row)
    new_row["learnt_8k"] = get_some_model_result(prefix_len, prefix_tokens, peft_model)
    new_rows.append(new_row)
new_df = pd.DataFrame(new_rows)
new_df.to_csv(ARTIFACTS_DIR_PATH.joinpath("diplomas_abstracts/mcs_raw_learnt_abstract_learnt8k.csv"))
new_df.head()

Rows...:   0%|          | 0/70 [00:00<?, ?it/s]



Unnamed: 0.1,Unnamed: 0,id,year,diploma,abstract,study_field,degree,original_diploma_extension,raw_model,learnt,learnt_8k
0,12,45042,2023,АЙВАЗЬЯН Аршак Владимирович\nВыпускная квалифи...,В этой работе мы строим правую трансферную мод...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматривается модельная структура н...,В данной работе рассматривается модельная стру...,В работе рассматриваются алгебраические теории...
1,25,45043,2023,Санкт-Петербургский государственный университе...,"Пусть 𝐾 выпуклое тело в ℝ^𝑛. Определим 𝑑𝑛,𝑛−1(...",MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,Плотность решетки трансляций - это минимальная...,В работе рассматриваются плотности решеток тра...,В
2,37,45044,2023,Санкт-Петербургский государственный университе...,Работа посвящена повышению производительности ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе представлены результаты исслед...,В работе рассматривается задача булевой выполн...,In this work we propose a method for improving...
3,101,45046,2023,Санкт-Петербургский государственный университе...,В работе мы обобщаем результаты об энергии нат...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматриваются классы случайных проц...,В данной работе рассматривается энергетически-...,В работе мы обобщаем результаты об энергии нат...
4,152,45047,2023,Санкт–Петербургский государственный университе...,В рамках данной работы рассматривается подход ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе рассматривается задача настраи...,В данной работе рассматривается задача добавле...,В работе рассматривается применение добавления...


### Comparing lernt vs learn_8k

In [5]:
! ls ../../../artifacts/

[1m[36mbook_full_texts[m[m    [1m[36mdiplomas_abstracts[m[m [1m[36mmetrics[m[m            [1m[36mtokens[m[m
[1m[36mdatasets[m[m           [1m[36mjunk[m[m               [1m[36mparsing[m[m


In [7]:
import pandas as pd
df = pd.read_csv("../../../artifacts/diplomas_abstracts/mcs_raw_learnt_abstract_learnt8k.csv")

In [23]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,year,diploma,abstract,study_field,degree,original_diploma_extension,raw_model,learnt,learnt_8k
0,0,12,45042,2023,АЙВАЗЬЯН Аршак Владимирович\nВыпускная квалифи...,В этой работе мы строим правую трансферную мод...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматривается модельная структура н...,В данной работе рассматривается модельная стру...,В работе рассматриваются алгебраические теории...
1,1,25,45043,2023,Санкт-Петербургский государственный университе...,"Пусть 𝐾 выпуклое тело в ℝ^𝑛. Определим 𝑑𝑛,𝑛−1(...",MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,Плотность решетки трансляций - это минимальная...,В работе рассматриваются плотности решеток тра...,В
2,2,37,45044,2023,Санкт-Петербургский государственный университе...,Работа посвящена повышению производительности ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе представлены результаты исслед...,В работе рассматривается задача булевой выполн...,In this work we propose a method for improving...
3,3,101,45046,2023,Санкт-Петербургский государственный университе...,В работе мы обобщаем результаты об энергии нат...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматриваются классы случайных проц...,В данной работе рассматривается энергетически-...,В работе мы обобщаем результаты об энергии нат...
4,4,152,45047,2023,Санкт–Петербургский государственный университе...,В рамках данной работы рассматривается подход ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе рассматривается задача настраи...,В данной работе рассматривается задача добавле...,В работе рассматривается применение добавления...


In [26]:
import numpy as np

np.mean(list(map(len, df["learnt"]))), np.mean(list(map(len, df["raw_model"]))), np.mean(list(map(len, df["learnt_8k"].astype(str)))), np.mean(list(map(len, df["abstract"])))

(700.0428571428571, 446.9428571428571, 444.0285714285714, 587.1285714285714)

In [20]:
print(*df["learnt"].tolist(), sep='\n')

В данной работе рассматривается модельная структура алгебр над теорией Ловера.</s>
В работе рассматриваются плотности решеток трансляций в трёхмерном пространстве. В частности, получается оценка плотности решеток трансляций для любой замкнутой выпуклой конвексной области 𝐾 в трёхмерном пространстве.</s>
В работе рассматривается задача булевой выполнимости. Помимо известных методов решения, в работе предлагается использовать методы машинного обучения для поиска эвристических правил. Для этого используются методы сжатия данных и методы машинного обучения.</s>
В данной работе рассматривается энергетически-эффективная аппроксимация многомерных случайных процессов. В качестве аппроксимационных процессов рассматриваются процессы с медленно убывающими вторыми моментами, а также их комбинации. В качестве энергетически-эффективной аппроксимации рассматривается процесс с медленно убывающими вторыми моментами, а также его комбинация с процессом с медленно убывающими вторыми моментами. В работе ра

In [19]:
print(*df["learnt_8k"].tolist(), sep='\n')

В работе рассматриваются алгебраические теории над конечнопорожденными свободными T -алгебрами. Определяется структура монад на T -объектах, которые оказываются финитарными. Предлагается определение алгебраической теории в терминах монад.</s>
В
In this work we propose a method for improving the performance of SAT solvers by embedding a graph neural network into a state-of-the-art solver MiniSat. The network predicts the next variable to branch on during each step based on the current state of the problem. The proposed method is compared with the original Graph-Q-SAT and other solvers on a set of SAT instances from the Uniform Random-3-SAT distribution and achieves better results in terms of wall-clock time and the number of branching decisions required to solve an instance.</s>
В работе мы обобщаем результаты об энергии натянутых струн, сопровождающих Винеровский процесс, на случай многомерного Винеровского процесса.</s>
В работе рассматривается применение добавления признаков композиц

In [16]:
print(*list(zip(df["learnt"], df["learnt_8k"])), sep='\n\n')

('В данной работе рассматривается модельная структура алгебр над теорией Ловера.</s>', 'В работе рассматриваются алгебраические теории над конечнопорожденными свободными T -алгебрами. Определяется структура монад на T -объектах, которые оказываются финитарными. Предлагается определение алгебраической теории в терминах монад.</s>')

('В работе рассматриваются плотности решеток трансляций в трёхмерном пространстве. В частности, получается оценка плотности решеток трансляций для любой замкнутой выпуклой конвексной области 𝐾 в трёхмерном пространстве.</s>', 'В')

('В работе рассматривается задача булевой выполнимости. Помимо известных методов решения, в работе предлагается использовать методы машинного обучения для поиска эвристических правил. Для этого используются методы сжатия данных и методы машинного обучения.</s>', 'In this work we propose a method for improving the performance of SAT solvers by embedding a graph neural network into a state-of-the-art solver MiniSat. The network pred

### Comparing learn_8k vs longlora-64k & 16k

In [31]:
import pandas as pd
df = pd.read_csv("../../../artifacts/diplomas_abstracts/baselines_with_learnt_16k.csv")
df.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'id', 'year', 'diploma',
       'abstract', 'study_field', 'degree', 'original_diploma_extension',
       'raw_model', 'learnt', 'learnt_8k', 'learnt_16k'],
      dtype='object')

In [32]:
df["learnt_16k"].tolist()

['#',
 'Д',
 '.',
 '</s>',
 '</s>',
 'Д',
 '4',
 'жи',
 '</s>',
 'ции',
 '</s>',
 '</s>',
 '∩',
 '</s>',
 'lications',
 '0',
 'сли',
 'Proof',
 '</s>',
 'Command',
 'на',
 '6',
 ').',
 'reach',
 '</s>',
 'W',
 'ч',
 '</s>',
 'для',
 'a',
 '∈',
 '</s>',
 '</s>',
 'э',
 'пара',
 'ти',
 '</s>',
 '</s>',
 '</s>',
 '�',
 '9',
 '(',
 '×',
 nan,
 '7',
 '</s>',
 '</s>',
 '</s>',
 'сту',
 'ти',
 'вы',
 '</s>',
 nan,
 '�',
 '</s>',
 'ors',
 'бка',
 'бер',
 '</s>',
 '[',
 'да',
 ',',
 '</s>',
 'problem',
 '</s>',
 'ной',
 '</s>',
 ',',
 'мо',
 'би']

In [34]:
import numpy as np

np.mean(list(map(len, df["learnt"]))), np.mean(list(map(len, df["learnt_8k"].astype(str)))), np.mean(list(map(len, df["learnt_16k"].astype(str))))

(700.0428571428571, 444.0285714285714, 2.842857142857143)