## Imports & Definitions

In [1]:
import warnings

# Silence two known, noisy deprecation notices so they do not clutter cell output.
# `message` is interpreted by `warnings.filterwarnings` as a regular expression that
# must match the beginning of the warning text.
warnings.filterwarnings(
    "ignore",
    message="torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.",
)

# Same treatment for the `use_reentrant` notice emitted by torch.utils.checkpoint.
warnings.filterwarnings(
    "ignore",
    message="torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.",
)


In [2]:
from huggingface_hub import notebook_login
# Render the interactive Hugging Face login widget (shown as the VBox output of this cell).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import sys
# Make the shared helper module importable from this notebook's location.
sys.path.append("../../utils")
# NOTE(review): the wildcard import hides where names come from. Later cells use
# `transformers`, `pd`, `Path`, `copy`, `AutoModelForCausalLM`, `LLAMA_2_7B` and
# `HUGGINGFACE_MODEL_TO_REPO` without importing them anywhere visible, so they
# presumably arrive through this line -- confirm, and prefer explicit imports.
from definitions import *

In [13]:
# Sanity check: list the directory four levels up, which is where `cache/` is expected to live.
! ls ../../../../

LongLoRA-diploma-research
OpenBookQA
YandexGPT-api-call_ru.ipynb
cache
dataflow_en.ipynb
gpt-week
long_context_LLMs
modelcache
nlp_course
venv
view_machine.ipynb
wandb
wandb_try.ipynb


In [4]:
# Download cache for model/tokenizer files (passed as `cache_dir` when loading them).
CACHE_DIR = Path("../../../../cache/")
# NOTE(review): absolute, machine-specific path -- the notebook will not run on another
# machine without editing this line.
DATASET_DIR = Path("/home/jupyter/mnt/datasets/diplomas/russian_dataset/")

## Example of Fine-Tuning

### Helper imports & definitions

In [5]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np



In [6]:
# Prompt templates, filled in with `str.format`. Placeholders are `{instruction}` and/or
# `{input}`. Only "prompt_input_diploma_special" is referenced by the code visible in
# this notebook (see SupervisedDataset).
PROMPT_DICT = {
    # Instruction + input context.
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    # Instruction only.
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
    # [INST]/<<SYS>> chat layout with a system prompt, instruction only.
    "prompt_no_input_llama2":(
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} [/INST]"
    ),
    # Same chat layout, with the input appended after the instruction.
    "prompt_input_llama2": (
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} \n{input} [/INST]"
    ),
    # Bare [INST] wrapper without a system prompt.
    "prompt_llama2": "[INST]{instruction}[/INST]",
    # Fixed "generate an abstract" instruction; only `{input}` (the diploma text) is substituted.
    "prompt_input_diploma_special":(
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\nBelow is a diploma text. Your task is to generate abstract of this diploma.\n\n### Input:\n{input}\n\n### Response:"
    ),
}

In [7]:
from typing import Dict, Optional, Sequence

def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt/answer pairs and mask the prompt part of the labels.

    Every source (prompt) is concatenated with its target (answer) and tokenized.
    The sources are also tokenized alone to learn how many leading tokens belong
    to the prompt. Labels start as a deep copy of the token ids, and those
    leading positions are then overwritten with ``IGNORE_INDEX``.

    Args:
        sources: Prompt texts.
        targets: Answer texts, paired with ``sources`` by position.
        tokenizer: Tokenizer forwarded to ``_tokenize_fn``.

    Returns:
        A dict with ``input_ids`` and ``labels``, each a list of 1-D tensors.
    """
    full_texts = [prompt + answer for prompt, answer in zip(sources, targets)]

    tokenized_full = _tokenize_fn(full_texts, tokenizer)
    tokenized_prompts = _tokenize_fn(sources, tokenizer)

    input_ids = tokenized_full["input_ids"]
    # Deep copy so that masking the labels leaves the input ids untouched.
    labels = copy.deepcopy(input_ids)

    prompt_lengths = tokenized_prompts["input_ids_lens"]
    for label_row, prompt_len in zip(labels, prompt_lengths):
        label_row[:prompt_len] = IGNORE_INDEX

    return {"input_ids": input_ids, "labels": labels}

In [8]:
from torch.utils.data import Dataset
import logging

class SupervisedDataset(Dataset):
    """Diploma/abstract pairs read from a CSV file and pre-tokenized for fine-tuning.

    The CSV must provide ``diploma`` and ``abstract`` columns. All tokenization
    happens in ``__init__``; items are served from the in-memory lists afterwards.

    NOTE(review): the base class is ``torch.utils.data.Dataset`` (imported in this
    cell), which rebinds the name ``Dataset`` imported earlier from ``datasets``.
    """

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, nrows: int, diploma_prefix_len: int):
        """Read the CSV, build prompts and tokenize everything.

        Args:
            data_path: Location of the CSV file, handed to ``pd.read_csv``.
            tokenizer: Tokenizer used for encoding; its ``eos_token`` is appended
                to every abstract.
            nrows: Number of CSV rows to read (``pd.read_csv``'s ``nrows``).
            diploma_prefix_len: Number of leading characters of each diploma text
                that are inserted into the prompt.
        """
        super().__init__()
        logging.warning("Loading data...")
        frame = pd.read_csv(data_path, nrows=nrows)

        logging.warning("Formatting inputs...")

        template = PROMPT_DICT["prompt_input_diploma_special"]

        # Prompt = fixed instruction + the first `diploma_prefix_len` characters of the diploma.
        sources = []
        for diploma in frame["diploma"]:
            sources.append(template.format(input=diploma[:diploma_prefix_len]))

        # Target = abstract followed by the end-of-sequence token.
        targets = []
        for abstract in frame["abstract"]:
            targets.append(f"{abstract}{tokenizer.eos_token}")

        logging.warning("Tokenizing inputs... This may take some time...")
        tokenized = preprocess(sources, targets, tokenizer)

        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids``/``labels`` pair stored at position ``i``."""
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}

### Downloading model & tokenizer

In [9]:
# Key into HUGGINGFACE_MODEL_TO_REPO that selects which checkpoint is loaded below.
model_name = LLAMA_2_7B

In [10]:
# Load the pretrained causal LM for the selected key; downloaded files are cached under CACHE_DIR.
# `device_map='auto'` delegates placement of the weights on the available device(s) to the loader.
model = AutoModelForCausalLM.from_pretrained(HUGGINGFACE_MODEL_TO_REPO[model_name], cache_dir=CACHE_DIR, device_map='auto')

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.67s/it]


In [11]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [12]:
# NOTE(review): the model was already placed via `device_map='auto'` when it was loaded,
# so this explicit move looks redundant -- confirm it is needed. Being the last
# expression of the cell, it also echoes the whole module tree as output.
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_

In [13]:
# Maximum sequence length in tokens; given to the tokenizer, which uses it as the truncation limit.
MODEL_MAX_LENGTH = 16384

In [14]:
# Tokenizer for the same checkpoint as the model. `model_max_length` is set explicitly
# to MODEL_MAX_LENGTH and padding is applied on the right-hand side of a sequence.
tokenizer = AutoTokenizer.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[model_name], 
    cache_dir=CACHE_DIR, 
    model_max_length=MODEL_MAX_LENGTH,
    padding_side="right",
    use_fast=True)

In [15]:
# Display the effective limit to confirm that the value requested at load time was applied.
tokenizer.model_max_length

16384

### Add new tokens

In [16]:
# Label value written over prompt tokens and label padding so those positions are masked.
IGNORE_INDEX = -100
# Fallback special tokens; each is registered only if the tokenizer lacks the corresponding one.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Registers ``special_tokens_dict`` with the tokenizer, resizes the model's
    token embeddings to ``len(tokenizer)`` and initializes the rows of the newly
    added tokens with the mean of the rows that existed before.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.

    Args:
        special_tokens_dict: Mapping passed to ``tokenizer.add_special_tokens``.
        tokenizer: Tokenizer that receives the new special tokens.
        model: Model whose embedding matrices are resized and initialized in place.

    Returns:
        None. ``tokenizer`` and ``model`` are modified in place.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    # Nothing was added, so there are no fresh rows to initialize.
    if num_new_tokens <= 0:
        return

    embedding_layers = (model.get_input_embeddings(), model.get_output_embeddings())
    for layer in embedding_layers:
        # A model without an output projection reports None here; previously this
        # raised AttributeError after the resize had already happened.
        if layer is None:
            continue
        weights = layer.weight.data
        # The mean is taken over the pre-existing rows only, so the result is the
        # same even when input and output embeddings share one tensor.
        weights[-num_new_tokens:] = weights[:-num_new_tokens].mean(dim=0, keepdim=True)

# Collect a default for every special token the tokenizer does not define yet,
# then register them and grow the model's embeddings accordingly.
special_tokens_dict = dict()
for _token_attr, _default_token in (
    ("pad_token", DEFAULT_PAD_TOKEN),
    ("eos_token", DEFAULT_EOS_TOKEN),
    ("bos_token", DEFAULT_BOS_TOKEN),
    ("unk_token", DEFAULT_UNK_TOKEN),
):
    if getattr(tokenizer, _token_attr) is None:
        special_tokens_dict[_token_attr] = _default_token

smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
)

Using pad_token, but it is not set yet.


### Load train/val/test datasets

In [17]:
# Training split: at most 720 CSV rows; each diploma is cut to its first 1000 characters.
train_dataset = SupervisedDataset(DATASET_DIR.joinpath("russian_dataset_train.csv"), tokenizer, nrows=720, diploma_prefix_len=1000)



In [18]:
# Validation split: at most 720 CSV rows; each diploma is cut to its first 1000 characters.
val_dataset = SupervisedDataset(DATASET_DIR.joinpath("russian_dataset_val.csv"), tokenizer,  nrows=720, diploma_prefix_len=1000)



In [49]:
# Test split: only 10 CSV rows, used for the manual inspection cells below.
test_dataset = SupervisedDataset(DATASET_DIR.joinpath("russian_dataset_test.csv"), tokenizer,  nrows=10, diploma_prefix_len=1000)



### Ensure that diploma_prefix_len is ok

In [20]:
# Peek at up to 30 distinct label values of one test example (set iteration order is arbitrary).
list(set(test_dataset[9]["labels"].tolist()))[:30]

[2,
 517,
 24072,
 24585,
 23567,
 8211,
 531,
 26133,
 1046,
 2583,
 1561,
 24090,
 6687,
 14367,
 10786,
 2082,
 551,
 23082,
 8747,
 6188,
 6193,
 6195,
 1587,
 1086,
 3648,
 1604,
 25671,
 15432,
 1097,
 16970]

### View how the dataset is built

In [56]:
# Decode the full token sequence of one test example to inspect the prompt layout.
# NOTE(review): despite its name, `text_labels` holds the decoded *input_ids* here.
text_labels = tokenizer.decode(test_dataset[9]["input_ids"])
print(text_labels)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
Санкт-Петербургский государственный университет


АВРАМЕНКО Полина Андреевна
Выпускная квалификационная работа
Веб-туны как часть южнокорейской культуры в XXI веке (на примере романтических историй) 
Уровень образования: магистратура
Направление 58.04.01 «Востоковедение и африканистика»
Основная образовательная программа BM.5808 «Культура народов Азии и Африки (с изучением языков Азии и Африки)»


Научный руководитель:
доцент, Кафедра корееведения, Санкт-Петербургский государственный университет Гурьева Анастасия Александровна

Рецензент:
приглашенный преподаватель, Кафедра корееведения, Санкт-Петербургская школа социальных наук и востоковедения,
доцент, Санкт-Петербургский филиал федерального государственного автономного 

In [61]:
# Decode only the supervised part of the example: every label that is not IGNORE_INDEX.
unignored_tokens = [
    label_id for label_id in test_dataset[9]["labels"] if label_id != IGNORE_INDEX
]
text_labels = tokenizer.decode(unignored_tokens)
print(text_labels)

Данная выпускная квалификационная работа посвящена одному из основных элементов, формирующих массовый культурный контент Республики Корея - веб-тунам (webtoon) – цифровым комиксам, появившимся в начале XXI века. Целью работы является выявление места веб-тунов в южнокорейской культуре, а также их культурной специфики. Актуальность исследования обусловлена тем, что в наши дни в Южной Корее к веб-тунам наблюдается повышенный интерес общества. В ходе исследования был собран, изучен и систематизирован материал об истории манхва как предшественника веб-тунов. Были рассмотрены этапы развития веб-тунов, причины популярности и основные характеристики. В качестве материала для исследования были выбраны и проанализированы три популярных южнокорейских веб-туна. Посредством анализа была выявлена специфика подачи материала и связь с культурой. Благодаря анализу удалось выявить взаимосвязь веб-тунов с традиционной литературой и ролью текста в корейской культуре и традиционным распределением ролей.</s

### Generating with the raw model before fine-tuning

In [72]:
# inference train sample
model.eval()
prefix_len = np.sum(np.array(train_dataset[9]["labels"]) == IGNORE_INDEX)
prefix_tokens = train_dataset[9]["input_ids"][:prefix_len]
generated = model.generate(prefix_tokens.reshape((1, -1)).to(device))
generated_text = tokenizer.decode(generated.to('cpu').flatten())
print(generated_text)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
САНКТ-ПЕТЕРБУРГСКИЙ ГОСУДАРСТВЕННЫЙ
УНИВЕРСИТЕТ

Мануэль Антонио Агилар Ривера
Выпускная квалификационная работа
«Multiplication Operators in Cauchy-de Branges Spaces»
Уровень образования: магистратура
Направление 01.04.01 “Математика”
Основная образовательная программа BM.5832.2019
“Современная математика”
Научный руководитель:
Профессор, математико-механический факультет СПбГУ,
доктор физ.-мат. наук, профессор РАН
Баранов Антон Дмитриевич.
Рецензент:
Профессор, Факультет математики,
Автономный университет Мадрида, кандидат физ.-мат. наук
Якубович Дмитрий Владимирович
Санкт-Петербург
2021

Contents
Contents

2

Cauchy–de Branges spaces
3
Cauchy–de Branges spaces as Reproducing Kernel Hilbert spaces . 4
The Division Prope

In [62]:
# Number of label positions masked with IGNORE_INDEX in one test example; used as the
# prompt length (in tokens) by the generation cell that follows.
prefix_len = np.sum(np.array(test_dataset[9]["labels"]) == IGNORE_INDEX)
prefix_len

443

In [63]:
# Baseline before fine-tuning: show the prompt of one *test* example, then the raw model's continuation.
model.eval()
prefix_tokens = test_dataset[9]["input_ids"][:prefix_len]
text = tokenizer.decode(prefix_tokens)
print(text)
print()
# Add a batch dimension of size 1 and move the prompt to the target device before generating.
generated = model.generate(prefix_tokens.reshape((1, -1)).to(device))
generated_text = tokenizer.decode(generated.to('cpu').flatten())
print(generated_text)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
Санкт-Петербургский государственный университет


АВРАМЕНКО Полина Андреевна
Выпускная квалификационная работа
Веб-туны как часть южнокорейской культуры в XXI веке (на примере романтических историй) 
Уровень образования: магистратура
Направление 58.04.01 «Востоковедение и африканистика»
Основная образовательная программа BM.5808 «Культура народов Азии и Африки (с изучением языков Азии и Африки)»


Научный руководитель:
доцент, Кафедра корееведения, Санкт-Петербургский государственный университет Гурьева Анастасия Александровна

Рецензент:
приглашенный преподаватель, Кафедра корееведения, Санкт-Петербургская школа социальных наук и востоковедения,
доцент, Санкт-Петербургский филиал федерального государственного автономного 

### Train model

In [21]:
# Choose, per architecture, the module names that receive LoRA adapters.
model_type = "llama" # default
if model_type == "gpt-neox":
    # added `dense` to match with llama as the basic LoRA would only target 'query_key_value'
    targets = ["query_key_value", "dense"]
else:
    # The four attention projections of each decoder layer (see the module tree printed above).
    targets=["q_proj", "k_proj", "v_proj", "o_proj"]

# LoRA settings: rank 8, alpha 16, no dropout, bias="none", causal-LM task type.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=targets,
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM",
)

In [22]:
 # Wrap the base model with the LoRA adapters described by `config`.
 peft_model = get_peft_model(model, config)

In [23]:
# Comma-separated name fragments; parameters whose name contains one of them are unfrozen below.
trainable_params = "embed,norm"

In [24]:
# Enable gradients for every parameter whose name contains one of the
# comma-separated fragments listed in `trainable_params`.
for _param_name, _param in peft_model.named_parameters():
    if any(fragment in _param_name for fragment in trainable_params.split(",")):
        _param.requires_grad_()

In [25]:
# Prepare the adapted model for training with gradient checkpointing.
peft_model.config.use_cache = False         # required for gradient checkpointing
peft_model.enable_input_require_grads()     # required for gradient checkpointing
peft_model.gradient_checkpointing_enable()  # enable gradient checkpointing

In [26]:
# Directory (relative to the notebook) passed to the training arguments as `output_dir`.
OUTPUT_DIR = "output_dir"

In [27]:
from dataclasses import dataclass, field

@dataclass
class DataCollatorForSupervisedDataset(object):
    """Batch variable-length examples for supervised fine-tuning.

    Attributes are limited to ``tokenizer``, whose ``pad_token_id`` is used both
    to pad ``input_ids`` and to derive the attention mask.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples to a common length.

        Args:
            instances: Examples, each a mapping with 1-D ``input_ids`` and
                ``labels`` tensors.

        Returns:
            A dict with ``input_ids`` (padded with the pad token id), ``labels``
            (padded with ``IGNORE_INDEX``) and ``attention_mask`` (True wherever
            ``input_ids`` differs from the pad token id).
        """
        id_rows = [example["input_ids"] for example in instances]
        label_rows = [example["labels"] for example in instances]

        pad_sequence = torch.nn.utils.rnn.pad_sequence
        batch_ids = pad_sequence(
            id_rows, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        batch_labels = pad_sequence(label_rows, batch_first=True, padding_value=IGNORE_INDEX)

        return {
            "input_ids": batch_ids,
            "labels": batch_labels,
            "attention_mask": batch_ids.ne(self.tokenizer.pad_token_id),
        }

# Collator instance handed to the Trainer; it pads each batch using the tokenizer's pad token id.
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

In [46]:
# from accelerate.utils import DistributedType

@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """``transformers.TrainingArguments`` extended with a few extra fields.

    NOTE(review): defining this class rebinds the name ``TrainingArguments`` that an
    earlier cell imported from ``transformers``. Apart from ``optim`` (presumably an
    override of the base-class field of the same name), the fields declared here are
    not read by any code visible in this notebook -- confirm they are still needed.
    """

    # Cache directory; defaults to None.
    cache_dir: Optional[str] = field(default=None)
    # Optimizer name.
    optim: str = field(default="adamw_torch")
    # 8192 * 4 = 32768 tokens. NOTE(review): differs from MODEL_MAX_LENGTH (16384)
    # used for the tokenizer -- confirm which value is intended.
    model_max_length: int = field(
        default=8192 * 4,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    # Flag only; nothing visible here switches the attention implementation based on it.
    use_flash_attn: bool = field(
        default=True,
        metadata={"help": "Whether use flash attention for training."},
    )
    use_full_attn: bool = field(
        default=False,
        metadata={"help": "Whether to use plain, full-attention for training."},
    )
    low_rank_training: bool = field(
        default=True,
        metadata={"help": "Whether use low rank adaptation for training."},
    )
    # Same default as the notebook-level `trainable_params` string.
    trainable_params: str = field(
        default="embed,norm",
        metadata={"help": "Additional trainable parameters except LoRA weights, if low rank training."},
    )
    
# Training configuration. With batch size 1 and 8 accumulation steps, the 720 training
# rows give 90 optimizer steps per epoch, i.e. 450 steps over 5 epochs (matches the
# progress bar below; assumes a single device).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    bf16=True,
    use_flash_attn=True,
    low_rank_training=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # NOTE(review): with "no", the eval_dataset given to the Trainer is presumably never
    # evaluated during training -- confirm this is intended.
    evaluation_strategy="no",
    # save_strategy="steps",
    # save_steps=1,
    save_total_limit=2,
    # The learning rate ramps up over the first 20 steps and then stays at 2e-5.
    learning_rate=2e-5,
    weight_decay=0.0,
    warmup_steps=20,
    lr_scheduler_type="constant_with_warmup",
    logging_steps=1,
    # Path is relative to the notebook's working directory.
    deepspeed="ds_configs/stage2.json",
    tf32=True,
    report_to=['tensorboard'],
)
# training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED
# Echo the resolved arguments as the cell output.
training_args



In [47]:
# Fine-tune the LoRA-wrapped model, then persist the trainer state and the model
# into `training_args.output_dir`.
trainer = Trainer(
    model=peft_model, 
    tokenizer=tokenizer, 
    args=training_args, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset, 
    data_collator=data_collator,
)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)

  0%|          | 0/450 [00:00<?, ?it/s][A
                                               [A
  0%|          | 1/450 [00:55<27:06,  3.62s/it]
  0%|          | 1/450 [00:03<27:03,  3.62s/it][A

{'loss': 1.3015, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}



                                               [A
  0%|          | 1/450 [00:59<27:06,  3.62s/it]
  0%|          | 2/450 [00:07<27:28,  3.68s/it][A

{'loss': 1.4281, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}



                                               [A
  0%|          | 1/450 [01:03<27:06,  3.62s/it]
  1%|          | 3/450 [00:10<27:09,  3.65s/it][A

{'loss': 1.4498, 'learning_rate': 3e-06, 'epoch': 0.03}



                                               [A
  0%|          | 1/450 [01:06<27:06,  3.62s/it]
  1%|          | 4/450 [00:14<27:43,  3.73s/it][A

{'loss': 1.3943, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}



                                               [A
  0%|          | 1/450 [01:10<27:06,  3.62s/it]
  1%|          | 5/450 [00:18<27:37,  3.72s/it][A

{'loss': 1.455, 'learning_rate': 5e-06, 'epoch': 0.06}



                                               [A
  0%|          | 1/450 [01:14<27:06,  3.62s/it]
  1%|▏         | 6/450 [00:22<27:04,  3.66s/it][A

{'loss': 1.4377, 'learning_rate': 6e-06, 'epoch': 0.07}



                                               [A
  0%|          | 1/450 [01:17<27:06,  3.62s/it]
  2%|▏         | 7/450 [00:25<26:22,  3.57s/it][A

{'loss': 1.5221, 'learning_rate': 7e-06, 'epoch': 0.08}



                                               [A
  0%|          | 1/450 [01:21<27:06,  3.62s/it]
  2%|▏         | 8/450 [00:29<27:26,  3.72s/it][A

{'loss': 1.4089, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.09}



                                               [A
  0%|          | 1/450 [01:25<27:06,  3.62s/it]
  2%|▏         | 9/450 [00:33<27:57,  3.80s/it][A

{'loss': 1.4574, 'learning_rate': 9e-06, 'epoch': 0.1}



                                               ][A
  0%|          | 1/450 [01:30<27:06,  3.62s/it] 
  2%|▏         | 10/450 [00:38<30:46,  4.20s/it][A

{'loss': 1.3287, 'learning_rate': 1e-05, 'epoch': 0.11}



                                               ][A
  0%|          | 1/450 [01:34<27:06,  3.62s/it] 
  2%|▏         | 11/450 [00:41<28:57,  3.96s/it][A

{'loss': 1.4322, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.12}



                                               ][A
  0%|          | 1/450 [01:37<27:06,  3.62s/it] 
  3%|▎         | 12/450 [00:45<28:33,  3.91s/it][A

{'loss': 1.4901, 'learning_rate': 1.2e-05, 'epoch': 0.13}



                                               ][A
  0%|          | 1/450 [01:41<27:06,  3.62s/it] 
  3%|▎         | 13/450 [00:49<28:18,  3.89s/it][A

{'loss': 1.3765, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.14}



                                               ][A
  0%|          | 1/450 [01:45<27:06,  3.62s/it] 
  3%|▎         | 14/450 [00:53<28:45,  3.96s/it][A

{'loss': 1.4277, 'learning_rate': 1.4e-05, 'epoch': 0.16}



                                               ][A
  0%|          | 1/450 [01:49<27:06,  3.62s/it] 
  3%|▎         | 15/450 [00:57<28:51,  3.98s/it][A

{'loss': 1.4893, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.17}



                                               ][A
  0%|          | 1/450 [01:54<27:06,  3.62s/it] 
  4%|▎         | 16/450 [01:02<29:28,  4.07s/it][A

{'loss': 1.2648, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.18}



                                               ][A
  0%|          | 1/450 [01:58<27:06,  3.62s/it] 
  4%|▍         | 17/450 [01:06<30:37,  4.24s/it][A

{'loss': 1.4851, 'learning_rate': 1.7e-05, 'epoch': 0.19}



                                               ][A
  0%|          | 1/450 [02:02<27:06,  3.62s/it] 
  4%|▍         | 18/450 [01:10<29:31,  4.10s/it][A

{'loss': 1.3235, 'learning_rate': 1.8e-05, 'epoch': 0.2}



                                               ][A
  0%|          | 1/450 [02:06<27:06,  3.62s/it] 
  4%|▍         | 19/450 [01:14<29:21,  4.09s/it][A

{'loss': 1.3674, 'learning_rate': 1.9e-05, 'epoch': 0.21}



                                               ][A
  0%|          | 1/450 [02:10<27:06,  3.62s/it] 
  4%|▍         | 20/450 [01:18<28:06,  3.92s/it][A

{'loss': 1.3619, 'learning_rate': 2e-05, 'epoch': 0.22}



                                               ][A
  0%|          | 1/450 [02:13<27:06,  3.62s/it] 
  5%|▍         | 21/450 [01:21<26:25,  3.70s/it][A

{'loss': 1.3814, 'learning_rate': 2e-05, 'epoch': 0.23}



                                               ][A
  0%|          | 1/450 [02:16<27:06,  3.62s/it] 
  5%|▍         | 22/450 [01:24<26:07,  3.66s/it][A

{'loss': 1.3902, 'learning_rate': 2e-05, 'epoch': 0.24}



                                               ][A
  0%|          | 1/450 [02:20<27:06,  3.62s/it] 
  5%|▌         | 23/450 [01:28<25:37,  3.60s/it][A

{'loss': 1.4724, 'learning_rate': 2e-05, 'epoch': 0.26}



                                               ][A
  0%|          | 1/450 [02:27<27:06,  3.62s/it] 
  5%|▌         | 24/450 [01:34<32:02,  4.51s/it][A

{'loss': 1.4082, 'learning_rate': 2e-05, 'epoch': 0.27}



                                               ][A
  0%|          | 1/450 [02:30<27:06,  3.62s/it] 
  6%|▌         | 25/450 [01:38<30:03,  4.24s/it][A

{'loss': 1.4753, 'learning_rate': 2e-05, 'epoch': 0.28}



                                               ][A
  0%|          | 1/450 [02:34<27:06,  3.62s/it] 
  6%|▌         | 26/450 [01:42<30:00,  4.25s/it][A

{'loss': 1.3142, 'learning_rate': 2e-05, 'epoch': 0.29}



                                               ][A
  0%|          | 1/450 [02:39<27:06,  3.62s/it] 
  6%|▌         | 27/450 [01:47<30:35,  4.34s/it][A

{'loss': 1.3124, 'learning_rate': 2e-05, 'epoch': 0.3}



                                               ][A
  0%|          | 1/450 [02:43<27:06,  3.62s/it] 
  6%|▌         | 28/450 [01:51<29:08,  4.14s/it][A

{'loss': 1.3489, 'learning_rate': 2e-05, 'epoch': 0.31}



                                               ][A
  0%|          | 1/450 [02:47<27:06,  3.62s/it] 
  6%|▋         | 29/450 [01:55<30:05,  4.29s/it][A

{'loss': 1.3424, 'learning_rate': 2e-05, 'epoch': 0.32}



                                               ][A
  0%|          | 1/450 [02:51<27:06,  3.62s/it] 
  7%|▋         | 30/450 [01:59<28:44,  4.11s/it][A

{'loss': 1.4087, 'learning_rate': 2e-05, 'epoch': 0.33}



                                               ][A
  0%|          | 1/450 [02:55<27:06,  3.62s/it] 
  7%|▋         | 31/450 [02:03<28:23,  4.06s/it][A

{'loss': 1.4061, 'learning_rate': 2e-05, 'epoch': 0.34}



                                               ][A
  0%|          | 1/450 [02:59<27:06,  3.62s/it] 
  7%|▋         | 32/450 [02:07<29:09,  4.19s/it][A

{'loss': 1.4727, 'learning_rate': 2e-05, 'epoch': 0.36}



                                               ][A
  0%|          | 1/450 [03:03<27:06,  3.62s/it] 
  7%|▋         | 33/450 [02:11<27:08,  3.91s/it][A

{'loss': 1.3032, 'learning_rate': 2e-05, 'epoch': 0.37}



                                               ][A
  0%|          | 1/450 [03:07<27:06,  3.62s/it] 
  8%|▊         | 34/450 [02:14<27:08,  3.91s/it][A

{'loss': 1.3527, 'learning_rate': 2e-05, 'epoch': 0.38}



                                               ][A
  0%|          | 1/450 [03:11<27:06,  3.62s/it] 
  8%|▊         | 35/450 [02:19<28:20,  4.10s/it][A

{'loss': 1.4014, 'learning_rate': 2e-05, 'epoch': 0.39}



                                               ][A
  0%|          | 1/450 [03:15<27:06,  3.62s/it] 
  8%|▊         | 36/450 [02:23<27:23,  3.97s/it][A

{'loss': 1.3887, 'learning_rate': 2e-05, 'epoch': 0.4}



                                               ][A
  0%|          | 1/450 [03:18<27:06,  3.62s/it] 
  8%|▊         | 37/450 [02:26<25:50,  3.75s/it][A

{'loss': 1.3897, 'learning_rate': 2e-05, 'epoch': 0.41}



                                               ][A
  0%|          | 1/450 [03:21<27:06,  3.62s/it] 
  8%|▊         | 38/450 [02:29<25:07,  3.66s/it][A

{'loss': 1.3574, 'learning_rate': 2e-05, 'epoch': 0.42}



                                               ][A
  0%|          | 1/450 [03:25<27:06,  3.62s/it] 
  9%|▊         | 39/450 [02:33<24:50,  3.63s/it][A

{'loss': 1.4434, 'learning_rate': 2e-05, 'epoch': 0.43}



                                               ][A
  0%|          | 1/450 [03:30<27:06,  3.62s/it] 
  9%|▉         | 40/450 [02:38<28:00,  4.10s/it][A

{'loss': 1.2705, 'learning_rate': 2e-05, 'epoch': 0.44}



                                               ][A
  0%|          | 1/450 [03:34<27:06,  3.62s/it] 
  9%|▉         | 41/450 [02:42<27:06,  3.98s/it][A

{'loss': 1.4504, 'learning_rate': 2e-05, 'epoch': 0.46}



                                               ][A
  0%|          | 1/450 [03:37<27:06,  3.62s/it] 
  9%|▉         | 42/450 [02:45<25:41,  3.78s/it][A

{'loss': 1.5292, 'learning_rate': 2e-05, 'epoch': 0.47}



                                               ][A
  0%|          | 1/450 [03:42<27:06,  3.62s/it] 
 10%|▉         | 43/450 [02:50<27:05,  3.99s/it][A

{'loss': 1.3911, 'learning_rate': 2e-05, 'epoch': 0.48}



                                               ][A
  0%|          | 1/450 [03:46<27:06,  3.62s/it] 
 10%|▉         | 44/450 [02:54<26:57,  3.98s/it][A

{'loss': 1.2904, 'learning_rate': 2e-05, 'epoch': 0.49}



                                               ][A
  0%|          | 1/450 [03:49<27:06,  3.62s/it] 
 10%|█         | 45/450 [02:57<26:31,  3.93s/it][A

{'loss': 1.3152, 'learning_rate': 2e-05, 'epoch': 0.5}



                                               ][A
  0%|          | 1/450 [03:53<27:06,  3.62s/it] 
 10%|█         | 46/450 [03:01<25:06,  3.73s/it][A

{'loss': 1.2907, 'learning_rate': 2e-05, 'epoch': 0.51}



                                               ][A
  0%|          | 1/450 [03:57<27:06,  3.62s/it] 
 10%|█         | 47/450 [03:05<25:39,  3.82s/it][A

{'loss': 1.3705, 'learning_rate': 2e-05, 'epoch': 0.52}



                                               ][A
  0%|          | 1/450 [04:01<27:06,  3.62s/it] 
 11%|█         | 48/450 [03:09<26:54,  4.02s/it][A

{'loss': 1.3643, 'learning_rate': 2e-05, 'epoch': 0.53}



                                               ][A
  0%|          | 1/450 [04:05<27:06,  3.62s/it] 
 11%|█         | 49/450 [03:13<26:16,  3.93s/it][A

{'loss': 1.4056, 'learning_rate': 2e-05, 'epoch': 0.54}



                                               ][A
  0%|          | 1/450 [04:08<27:06,  3.62s/it] 
 11%|█         | 50/450 [03:16<25:10,  3.78s/it][A

{'loss': 1.502, 'learning_rate': 2e-05, 'epoch': 0.56}



                                               ][A
  0%|          | 1/450 [04:12<27:06,  3.62s/it] 
 11%|█▏        | 51/450 [03:20<24:33,  3.69s/it][A

{'loss': 1.4266, 'learning_rate': 2e-05, 'epoch': 0.57}



                                               ][A
  0%|          | 1/450 [04:16<27:06,  3.62s/it] 
 12%|█▏        | 52/450 [03:23<24:34,  3.70s/it][A

{'loss': 1.3296, 'learning_rate': 2e-05, 'epoch': 0.58}



                                               ][A
  0%|          | 1/450 [04:20<27:06,  3.62s/it] 
 12%|█▏        | 53/450 [03:27<25:00,  3.78s/it][A

{'loss': 1.3355, 'learning_rate': 2e-05, 'epoch': 0.59}



                                               ][A
  0%|          | 1/450 [04:25<27:06,  3.62s/it] 
 12%|█▏        | 54/450 [03:33<28:02,  4.25s/it][A

{'loss': 1.3942, 'learning_rate': 2e-05, 'epoch': 0.6}



                                               ][A
  0%|          | 1/450 [04:29<27:06,  3.62s/it] 
 12%|█▏        | 55/450 [03:36<26:51,  4.08s/it][A

{'loss': 1.3474, 'learning_rate': 2e-05, 'epoch': 0.61}



                                               ][A
  0%|          | 1/450 [04:33<27:06,  3.62s/it] 
 12%|█▏        | 56/450 [03:41<27:26,  4.18s/it][A

{'loss': 1.3985, 'learning_rate': 2e-05, 'epoch': 0.62}



                                               ][A
  0%|          | 1/450 [04:37<27:06,  3.62s/it] 
 13%|█▎        | 57/450 [03:45<26:40,  4.07s/it][A

{'loss': 1.4883, 'learning_rate': 2e-05, 'epoch': 0.63}



                                               ][A
  0%|          | 1/450 [04:41<27:06,  3.62s/it] 
 13%|█▎        | 58/450 [03:49<26:20,  4.03s/it][A

{'loss': 1.1958, 'learning_rate': 2e-05, 'epoch': 0.64}



                                               ][A
  0%|          | 1/450 [04:45<27:06,  3.62s/it] 
 13%|█▎        | 59/450 [03:53<26:49,  4.12s/it][A

{'loss': 1.5296, 'learning_rate': 2e-05, 'epoch': 0.66}



                                               ][A
  0%|          | 1/450 [04:49<27:06,  3.62s/it] 
 13%|█▎        | 60/450 [03:57<25:46,  3.97s/it][A

{'loss': 1.3338, 'learning_rate': 2e-05, 'epoch': 0.67}



                                               ][A
  0%|          | 1/450 [04:53<27:06,  3.62s/it] 
 14%|█▎        | 61/450 [04:01<25:45,  3.97s/it][A

{'loss': 1.2165, 'learning_rate': 2e-05, 'epoch': 0.68}



                                               ][A
  0%|          | 1/450 [04:57<27:06,  3.62s/it] 
 14%|█▍        | 62/450 [04:05<26:02,  4.03s/it][A

{'loss': 1.3481, 'learning_rate': 2e-05, 'epoch': 0.69}



                                               ][A
  0%|          | 1/450 [05:01<27:06,  3.62s/it] 
 14%|█▍        | 63/450 [04:09<26:27,  4.10s/it][A

{'loss': 1.3667, 'learning_rate': 2e-05, 'epoch': 0.7}



                                               ][A
  0%|          | 1/450 [05:09<27:06,  3.62s/it] 
 14%|█▍        | 64/450 [04:17<33:21,  5.18s/it][A

{'loss': 1.3844, 'learning_rate': 2e-05, 'epoch': 0.71}



                                               ][A
  0%|          | 1/450 [05:13<27:06,  3.62s/it] 
 14%|█▍        | 65/450 [04:21<31:58,  4.98s/it][A

{'loss': 1.3475, 'learning_rate': 2e-05, 'epoch': 0.72}



                                               ][A
  0%|          | 1/450 [05:18<27:06,  3.62s/it] 
 15%|█▍        | 66/450 [04:26<31:15,  4.88s/it][A

{'loss': 1.3695, 'learning_rate': 2e-05, 'epoch': 0.73}



                                               ][A
  0%|          | 1/450 [05:22<27:06,  3.62s/it] 
 15%|█▍        | 67/450 [04:30<30:18,  4.75s/it][A

{'loss': 1.212, 'learning_rate': 2e-05, 'epoch': 0.74}



                                               ][A
  0%|          | 1/450 [05:26<27:06,  3.62s/it] 
 15%|█▌        | 68/450 [04:33<27:05,  4.26s/it][A

{'loss': 1.2033, 'learning_rate': 2e-05, 'epoch': 0.76}



                                               ][A
  0%|          | 1/450 [05:29<27:06,  3.62s/it] 
 15%|█▌        | 69/450 [04:37<25:43,  4.05s/it][A

{'loss': 1.3812, 'learning_rate': 2e-05, 'epoch': 0.77}



                                               ][A
  0%|          | 1/450 [05:32<27:06,  3.62s/it] 
 16%|█▌        | 70/450 [04:40<24:04,  3.80s/it][A

{'loss': 1.5949, 'learning_rate': 2e-05, 'epoch': 0.78}



                                               ][A
  0%|          | 1/450 [05:37<27:06,  3.62s/it] 
 16%|█▌        | 71/450 [04:44<24:49,  3.93s/it][A

{'loss': 1.3994, 'learning_rate': 2e-05, 'epoch': 0.79}



                                               ][A
  0%|          | 1/450 [05:41<27:06,  3.62s/it] 
 16%|█▌        | 72/450 [04:49<25:47,  4.09s/it][A

{'loss': 1.3861, 'learning_rate': 2e-05, 'epoch': 0.8}



                                               ][A
  0%|          | 1/450 [05:46<27:06,  3.62s/it] 
 16%|█▌        | 73/450 [04:54<28:17,  4.50s/it][A

{'loss': 1.2532, 'learning_rate': 2e-05, 'epoch': 0.81}



                                               ][A
  0%|          | 1/450 [05:51<27:06,  3.62s/it] 
 16%|█▋        | 74/450 [04:59<27:38,  4.41s/it][A

{'loss': 1.3334, 'learning_rate': 2e-05, 'epoch': 0.82}



                                               ][A
  0%|          | 1/450 [05:55<27:06,  3.62s/it] 
 17%|█▋        | 75/450 [05:03<28:06,  4.50s/it][A

{'loss': 1.3905, 'learning_rate': 2e-05, 'epoch': 0.83}



                                               ][A
  0%|          | 1/450 [05:59<27:06,  3.62s/it] 
 17%|█▋        | 76/450 [05:07<25:46,  4.14s/it][A

{'loss': 1.2549, 'learning_rate': 2e-05, 'epoch': 0.84}



                                               ][A
  0%|          | 1/450 [06:02<27:06,  3.62s/it] 
 17%|█▋        | 77/450 [05:10<24:10,  3.89s/it][A

{'loss': 1.3188, 'learning_rate': 2e-05, 'epoch': 0.86}



                                               ][A
  0%|          | 1/450 [06:06<27:06,  3.62s/it] 
 17%|█▋        | 78/450 [05:14<23:54,  3.86s/it][A

{'loss': 1.2879, 'learning_rate': 2e-05, 'epoch': 0.87}



                                               ][A
  0%|          | 1/450 [06:11<27:06,  3.62s/it] 
 18%|█▊        | 79/450 [05:19<26:05,  4.22s/it][A

{'loss': 1.3814, 'learning_rate': 2e-05, 'epoch': 0.88}



                                               ][A
  0%|          | 1/450 [06:14<27:06,  3.62s/it] 
 18%|█▊        | 80/450 [05:22<24:05,  3.91s/it][A

{'loss': 1.4786, 'learning_rate': 2e-05, 'epoch': 0.89}



                                               ][A
  0%|          | 1/450 [06:18<27:06,  3.62s/it] 
 18%|█▊        | 81/450 [05:26<24:24,  3.97s/it][A

{'loss': 1.2379, 'learning_rate': 2e-05, 'epoch': 0.9}



                                               ][A
  0%|          | 1/450 [06:22<27:06,  3.62s/it] 
 18%|█▊        | 82/450 [05:30<23:53,  3.89s/it][A

{'loss': 1.2746, 'learning_rate': 2e-05, 'epoch': 0.91}



                                               ][A
  0%|          | 1/450 [06:25<27:06,  3.62s/it] 
 18%|█▊        | 83/450 [05:33<22:28,  3.67s/it][A

{'loss': 1.341, 'learning_rate': 2e-05, 'epoch': 0.92}



                                               ][A
  0%|          | 1/450 [06:29<27:06,  3.62s/it] 
 19%|█▊        | 84/450 [05:37<23:02,  3.78s/it][A

{'loss': 1.4664, 'learning_rate': 2e-05, 'epoch': 0.93}



                                               ][A
  0%|          | 1/450 [06:32<27:06,  3.62s/it] 
 19%|█▉        | 85/450 [05:40<22:12,  3.65s/it][A

{'loss': 1.2359, 'learning_rate': 2e-05, 'epoch': 0.94}



                                               ][A
  0%|          | 1/450 [06:36<27:06,  3.62s/it] 
 19%|█▉        | 86/450 [05:44<22:15,  3.67s/it][A

{'loss': 1.3863, 'learning_rate': 2e-05, 'epoch': 0.96}



                                               ][A
  0%|          | 1/450 [06:40<27:06,  3.62s/it] 
 19%|█▉        | 87/450 [05:48<23:30,  3.89s/it][A

{'loss': 1.3409, 'learning_rate': 2e-05, 'epoch': 0.97}



                                               ][A
  0%|          | 1/450 [06:45<27:06,  3.62s/it] 
 20%|█▉        | 88/450 [05:53<23:57,  3.97s/it][A

{'loss': 1.261, 'learning_rate': 2e-05, 'epoch': 0.98}



                                               ][A
  0%|          | 1/450 [06:49<27:06,  3.62s/it] 
 20%|█▉        | 89/450 [05:57<24:29,  4.07s/it][A

{'loss': 1.4486, 'learning_rate': 2e-05, 'epoch': 0.99}



                                               ][A
  0%|          | 1/450 [06:52<27:06,  3.62s/it] 
 20%|██        | 90/450 [06:00<23:14,  3.87s/it][A

{'loss': 1.3002, 'learning_rate': 2e-05, 'epoch': 1.0}



                                               ][A
  0%|          | 1/450 [06:56<27:06,  3.62s/it] 
 20%|██        | 91/450 [06:04<23:03,  3.85s/it][A

{'loss': 1.3119, 'learning_rate': 2e-05, 'epoch': 1.01}



                                               ][A
  0%|          | 1/450 [07:00<27:06,  3.62s/it] 
 20%|██        | 92/450 [06:08<22:23,  3.75s/it][A

{'loss': 1.3295, 'learning_rate': 2e-05, 'epoch': 1.02}



                                               ][A
  0%|          | 1/450 [07:03<27:06,  3.62s/it] 
 21%|██        | 93/450 [06:11<22:11,  3.73s/it][A

{'loss': 1.3181, 'learning_rate': 2e-05, 'epoch': 1.03}



                                               ][A
  0%|          | 1/450 [07:07<27:06,  3.62s/it] 
 21%|██        | 94/450 [06:15<21:23,  3.61s/it][A

{'loss': 1.2759, 'learning_rate': 2e-05, 'epoch': 1.04}



                                               ][A
  0%|          | 1/450 [07:11<27:06,  3.62s/it] 
 21%|██        | 95/450 [06:19<22:09,  3.74s/it][A

{'loss': 1.309, 'learning_rate': 2e-05, 'epoch': 1.06}



                                               ][A
  0%|          | 1/450 [07:14<27:06,  3.62s/it] 
 21%|██▏       | 96/450 [06:22<21:24,  3.63s/it][A

{'loss': 1.3587, 'learning_rate': 2e-05, 'epoch': 1.07}



                                               ][A
  0%|          | 1/450 [07:17<27:06,  3.62s/it] 
 22%|██▏       | 97/450 [06:25<20:41,  3.52s/it][A

{'loss': 1.2882, 'learning_rate': 2e-05, 'epoch': 1.08}



                                               ][A
  0%|          | 1/450 [07:23<27:06,  3.62s/it] 
 22%|██▏       | 98/450 [06:31<24:52,  4.24s/it][A

{'loss': 1.4208, 'learning_rate': 2e-05, 'epoch': 1.09}



                                               ][A
  0%|          | 1/450 [07:27<27:06,  3.62s/it] 
 22%|██▏       | 99/450 [06:35<23:23,  4.00s/it][A

{'loss': 1.139, 'learning_rate': 2e-05, 'epoch': 1.1}



                                               t][A
  0%|          | 1/450 [07:30<27:06,  3.62s/it]  
 22%|██▏       | 100/450 [06:38<22:00,  3.77s/it][A

{'loss': 1.1894, 'learning_rate': 2e-05, 'epoch': 1.11}



                                               t][A
  0%|          | 1/450 [07:34<27:06,  3.62s/it]  
 22%|██▏       | 101/450 [06:42<22:57,  3.95s/it][A

{'loss': 1.3628, 'learning_rate': 2e-05, 'epoch': 1.12}



                                               t][A
  0%|          | 1/450 [07:39<27:06,  3.62s/it]  
 23%|██▎       | 102/450 [06:47<24:40,  4.25s/it][A

{'loss': 1.27, 'learning_rate': 2e-05, 'epoch': 1.13}



                                               t][A
  0%|          | 1/450 [07:43<27:06,  3.62s/it]  
 23%|██▎       | 103/450 [06:51<23:25,  4.05s/it][A

{'loss': 1.2595, 'learning_rate': 2e-05, 'epoch': 1.14}



                                               t][A
  0%|          | 1/450 [07:47<27:06,  3.62s/it]  
 23%|██▎       | 104/450 [06:55<23:24,  4.06s/it][A

{'loss': 1.33, 'learning_rate': 2e-05, 'epoch': 1.16}



                                               t][A
  0%|          | 1/450 [07:51<27:06,  3.62s/it]  
 23%|██▎       | 105/450 [06:59<23:38,  4.11s/it][A

{'loss': 1.4013, 'learning_rate': 2e-05, 'epoch': 1.17}



                                               t][A
  0%|          | 1/450 [07:55<27:06,  3.62s/it]  
 24%|██▎       | 106/450 [07:02<22:22,  3.90s/it][A

{'loss': 1.1994, 'learning_rate': 2e-05, 'epoch': 1.18}



                                               t][A
  0%|          | 1/450 [07:58<27:06,  3.62s/it]  
 24%|██▍       | 107/450 [07:06<21:33,  3.77s/it][A

{'loss': 1.3142, 'learning_rate': 2e-05, 'epoch': 1.19}



                                               t][A
  0%|          | 1/450 [08:02<27:06,  3.62s/it]  
 24%|██▍       | 108/450 [07:10<21:40,  3.80s/it][A

{'loss': 1.4185, 'learning_rate': 2e-05, 'epoch': 1.2}



                                               t][A
  0%|          | 1/450 [08:06<27:06,  3.62s/it]  
 24%|██▍       | 109/450 [07:14<21:45,  3.83s/it][A

{'loss': 1.2608, 'learning_rate': 2e-05, 'epoch': 1.21}



                                               t][A
  0%|          | 1/450 [08:10<27:06,  3.62s/it]  
 24%|██▍       | 110/450 [07:17<21:37,  3.81s/it][A

{'loss': 1.3811, 'learning_rate': 2e-05, 'epoch': 1.22}



                                               t][A
  0%|          | 1/450 [08:13<27:06,  3.62s/it]  
 25%|██▍       | 111/450 [07:21<20:21,  3.60s/it][A

{'loss': 1.3882, 'learning_rate': 2e-05, 'epoch': 1.23}



                                               t][A
  0%|          | 1/450 [08:16<27:06,  3.62s/it]  
 25%|██▍       | 112/450 [07:24<20:18,  3.61s/it][A

{'loss': 1.2802, 'learning_rate': 2e-05, 'epoch': 1.24}



                                               t][A
  0%|          | 1/450 [08:21<27:06,  3.62s/it]  
 25%|██▌       | 113/450 [07:29<21:37,  3.85s/it][A

{'loss': 1.1823, 'learning_rate': 2e-05, 'epoch': 1.26}



                                               t][A
  0%|          | 1/450 [08:24<27:06,  3.62s/it]  
 25%|██▌       | 114/450 [07:32<20:38,  3.69s/it][A

{'loss': 1.2833, 'learning_rate': 2e-05, 'epoch': 1.27}



                                               t][A
  0%|          | 1/450 [08:28<27:06,  3.62s/it]  
 26%|██▌       | 115/450 [07:36<20:23,  3.65s/it][A

{'loss': 1.4519, 'learning_rate': 2e-05, 'epoch': 1.28}



                                               t][A
  0%|          | 1/450 [08:33<27:06,  3.62s/it]  
 26%|██▌       | 116/450 [07:40<22:23,  4.02s/it][A

{'loss': 1.2855, 'learning_rate': 2e-05, 'epoch': 1.29}



                                               t][A
  0%|          | 1/450 [08:40<27:06,  3.62s/it]  
 26%|██▌       | 117/450 [07:48<27:30,  4.96s/it][A

{'loss': 1.2367, 'learning_rate': 2e-05, 'epoch': 1.3}



                                               t][A
  0%|          | 1/450 [08:43<27:06,  3.62s/it]  
 26%|██▌       | 118/450 [07:51<25:09,  4.55s/it][A

{'loss': 1.3691, 'learning_rate': 2e-05, 'epoch': 1.31}



                                               t][A
  0%|          | 1/450 [08:47<27:06,  3.62s/it]  
 26%|██▋       | 119/450 [07:55<24:13,  4.39s/it][A

{'loss': 1.2726, 'learning_rate': 2e-05, 'epoch': 1.32}



                                               t][A
  0%|          | 1/450 [08:52<27:06,  3.62s/it]  
 27%|██▋       | 120/450 [08:00<24:08,  4.39s/it][A

{'loss': 1.2895, 'learning_rate': 2e-05, 'epoch': 1.33}



                                               t][A
  0%|          | 1/450 [08:57<27:06,  3.62s/it]  
 27%|██▋       | 121/450 [08:05<25:46,  4.70s/it][A

{'loss': 1.3119, 'learning_rate': 2e-05, 'epoch': 1.34}



                                               t][A
  0%|          | 1/450 [09:00<27:06,  3.62s/it]  
 27%|██▋       | 122/450 [08:08<23:31,  4.30s/it][A

{'loss': 1.279, 'learning_rate': 2e-05, 'epoch': 1.36}



                                               t][A
  0%|          | 1/450 [09:05<27:06,  3.62s/it]  
 27%|██▋       | 123/450 [08:13<24:19,  4.46s/it][A

{'loss': 1.2333, 'learning_rate': 2e-05, 'epoch': 1.37}



                                               t][A
  0%|          | 1/450 [09:10<27:06,  3.62s/it]  
 28%|██▊       | 124/450 [08:18<24:04,  4.43s/it][A

{'loss': 1.2206, 'learning_rate': 2e-05, 'epoch': 1.38}



                                               t][A
  0%|          | 1/450 [09:13<27:06,  3.62s/it]  
 28%|██▊       | 125/450 [08:21<22:33,  4.17s/it][A

{'loss': 1.3423, 'learning_rate': 2e-05, 'epoch': 1.39}



                                               t][A
  0%|          | 1/450 [09:17<27:06,  3.62s/it]  
 28%|██▊       | 126/450 [08:25<21:29,  3.98s/it][A

{'loss': 1.2588, 'learning_rate': 2e-05, 'epoch': 1.4}



                                               t][A
  0%|          | 1/450 [09:20<27:06,  3.62s/it]  
 28%|██▊       | 127/450 [08:28<20:14,  3.76s/it][A

{'loss': 1.1811, 'learning_rate': 2e-05, 'epoch': 1.41}



                                               t][A
  0%|          | 1/450 [09:23<27:06,  3.62s/it]  
 28%|██▊       | 128/450 [08:31<19:25,  3.62s/it][A

{'loss': 1.3719, 'learning_rate': 2e-05, 'epoch': 1.42}



                                               t][A
  0%|          | 1/450 [09:26<27:06,  3.62s/it]  
 29%|██▊       | 129/450 [08:34<18:33,  3.47s/it][A

{'loss': 1.4495, 'learning_rate': 2e-05, 'epoch': 1.43}



                                               t][A
  0%|          | 1/450 [09:31<27:06,  3.62s/it]  

{'loss': 1.4182, 'learning_rate': 2e-05, 'epoch': 1.44}



 29%|██▉       | 130/450 [08:39<20:56,  3.93s/it][A
                                               t][A
  0%|          | 1/450 [09:35<27:06,  3.62s/it]  
 29%|██▉       | 131/450 [08:43<20:07,  3.78s/it][A

{'loss': 1.3553, 'learning_rate': 2e-05, 'epoch': 1.46}



                                               t][A
  0%|          | 1/450 [09:38<27:06,  3.62s/it]  
 29%|██▉       | 132/450 [08:46<19:38,  3.71s/it][A

{'loss': 1.3562, 'learning_rate': 2e-05, 'epoch': 1.47}



                                               t][A
  0%|          | 1/450 [09:42<27:06,  3.62s/it]  
 30%|██▉       | 133/450 [08:50<19:40,  3.72s/it][A

{'loss': 1.2997, 'learning_rate': 2e-05, 'epoch': 1.48}



                                               t][A
  0%|          | 1/450 [09:47<27:06,  3.62s/it]  
 30%|██▉       | 134/450 [08:55<21:13,  4.03s/it][A

{'loss': 1.2518, 'learning_rate': 2e-05, 'epoch': 1.49}



                                               t][A
  0%|          | 1/450 [09:51<27:06,  3.62s/it]  
 30%|███       | 135/450 [08:59<20:56,  3.99s/it][A

{'loss': 1.3145, 'learning_rate': 2e-05, 'epoch': 1.5}



                                               t][A
  0%|          | 1/450 [09:54<27:06,  3.62s/it]  
 30%|███       | 136/450 [09:02<19:25,  3.71s/it][A

{'loss': 1.3927, 'learning_rate': 2e-05, 'epoch': 1.51}



                                               t][A
  0%|          | 1/450 [09:57<27:06,  3.62s/it]  
 30%|███       | 137/450 [09:05<18:59,  3.64s/it][A

{'loss': 1.4503, 'learning_rate': 2e-05, 'epoch': 1.52}



                                               t][A
  0%|          | 1/450 [10:02<27:06,  3.62s/it]  
 31%|███       | 138/450 [09:10<20:00,  3.85s/it][A

{'loss': 1.4147, 'learning_rate': 2e-05, 'epoch': 1.53}



                                               t][A
  0%|          | 1/450 [10:06<27:06,  3.62s/it]  
 31%|███       | 139/450 [09:14<20:40,  3.99s/it][A

{'loss': 1.2696, 'learning_rate': 2e-05, 'epoch': 1.54}



                                               t][A
  0%|          | 1/450 [10:11<27:06,  3.62s/it]  
 31%|███       | 140/450 [09:19<22:23,  4.34s/it][A

{'loss': 1.2751, 'learning_rate': 2e-05, 'epoch': 1.56}



                                               t][A
  0%|          | 1/450 [10:15<27:06,  3.62s/it]  
 31%|███▏      | 141/450 [09:22<20:53,  4.06s/it][A

{'loss': 1.303, 'learning_rate': 2e-05, 'epoch': 1.57}



                                               t][A
  0%|          | 1/450 [10:18<27:06,  3.62s/it]  
 32%|███▏      | 142/450 [09:26<19:54,  3.88s/it][A

{'loss': 1.295, 'learning_rate': 2e-05, 'epoch': 1.58}



                                               t][A
  0%|          | 1/450 [10:22<27:06,  3.62s/it]  
 32%|███▏      | 143/450 [09:30<20:46,  4.06s/it][A

{'loss': 1.3347, 'learning_rate': 2e-05, 'epoch': 1.59}



                                               t][A
  0%|          | 1/450 [10:26<27:06,  3.62s/it]  
 32%|███▏      | 144/450 [09:34<20:32,  4.03s/it][A

{'loss': 1.3195, 'learning_rate': 2e-05, 'epoch': 1.6}



                                               t][A
  0%|          | 1/450 [10:31<27:06,  3.62s/it]  
 32%|███▏      | 145/450 [09:39<20:52,  4.11s/it][A

{'loss': 1.2815, 'learning_rate': 2e-05, 'epoch': 1.61}



                                               t][A
  0%|          | 1/450 [10:34<27:06,  3.62s/it]  
 32%|███▏      | 146/450 [09:42<19:22,  3.82s/it][A

{'loss': 1.3128, 'learning_rate': 2e-05, 'epoch': 1.62}



                                               t][A
  0%|          | 1/450 [10:37<27:06,  3.62s/it]  
 33%|███▎      | 147/450 [09:45<18:28,  3.66s/it][A

{'loss': 1.1768, 'learning_rate': 2e-05, 'epoch': 1.63}



                                               t][A
  0%|          | 1/450 [10:41<27:06,  3.62s/it]  
 33%|███▎      | 148/450 [09:49<18:43,  3.72s/it][A

{'loss': 1.3368, 'learning_rate': 2e-05, 'epoch': 1.64}



                                               t][A
  0%|          | 1/450 [10:45<27:06,  3.62s/it]  
 33%|███▎      | 149/450 [09:53<18:58,  3.78s/it][A

{'loss': 1.3746, 'learning_rate': 2e-05, 'epoch': 1.66}



                                               t][A
  0%|          | 1/450 [10:48<27:06,  3.62s/it]  
 33%|███▎      | 150/450 [09:56<18:26,  3.69s/it][A

{'loss': 1.3906, 'learning_rate': 2e-05, 'epoch': 1.67}



                                               t][A
  0%|          | 1/450 [10:53<27:06,  3.62s/it]  
 34%|███▎      | 151/450 [10:01<19:11,  3.85s/it][A

{'loss': 1.321, 'learning_rate': 2e-05, 'epoch': 1.68}



                                               t][A
  0%|          | 1/450 [10:58<27:06,  3.62s/it]  
 34%|███▍      | 152/450 [10:06<21:57,  4.42s/it][A

{'loss': 1.2435, 'learning_rate': 2e-05, 'epoch': 1.69}



                                               t][A
  0%|          | 1/450 [11:02<27:06,  3.62s/it]  
 34%|███▍      | 153/450 [10:10<21:17,  4.30s/it][A

{'loss': 1.3075, 'learning_rate': 2e-05, 'epoch': 1.7}



                                               t][A
  0%|          | 1/450 [11:06<27:06,  3.62s/it]  
 34%|███▍      | 154/450 [10:14<20:32,  4.16s/it][A

{'loss': 1.3707, 'learning_rate': 2e-05, 'epoch': 1.71}



                                               t][A
  0%|          | 1/450 [11:10<27:06,  3.62s/it]  
 34%|███▍      | 155/450 [10:18<20:34,  4.18s/it][A

{'loss': 1.3653, 'learning_rate': 2e-05, 'epoch': 1.72}



                                               t][A
  0%|          | 1/450 [11:14<27:06,  3.62s/it]  
 35%|███▍      | 156/450 [10:22<19:24,  3.96s/it][A

{'loss': 1.3269, 'learning_rate': 2e-05, 'epoch': 1.73}



                                               t][A
  0%|          | 1/450 [11:18<27:06,  3.62s/it]  
 35%|███▍      | 157/450 [10:26<19:55,  4.08s/it][A

{'loss': 1.2515, 'learning_rate': 2e-05, 'epoch': 1.74}



                                               t][A
  0%|          | 1/450 [11:22<27:06,  3.62s/it]  
 35%|███▌      | 158/450 [10:30<19:23,  3.98s/it][A

{'loss': 1.3876, 'learning_rate': 2e-05, 'epoch': 1.76}



                                               t][A
  0%|          | 1/450 [11:26<27:06,  3.62s/it]  
 35%|███▌      | 159/450 [10:34<19:13,  3.96s/it][A

{'loss': 1.1814, 'learning_rate': 2e-05, 'epoch': 1.77}



                                               t][A
  0%|          | 1/450 [11:30<27:06,  3.62s/it]  
 36%|███▌      | 160/450 [10:38<18:46,  3.89s/it][A

{'loss': 1.3542, 'learning_rate': 2e-05, 'epoch': 1.78}



                                               t][A
  0%|          | 1/450 [11:34<27:06,  3.62s/it]  
 36%|███▌      | 161/450 [10:41<18:48,  3.90s/it][A

{'loss': 1.2409, 'learning_rate': 2e-05, 'epoch': 1.79}



                                               t][A
  0%|          | 1/450 [11:37<27:06,  3.62s/it]  
 36%|███▌      | 162/450 [10:45<18:35,  3.87s/it][A

{'loss': 1.4083, 'learning_rate': 2e-05, 'epoch': 1.8}



                                               t][A
  0%|          | 1/450 [11:41<27:06,  3.62s/it]  
 36%|███▌      | 163/450 [10:49<18:10,  3.80s/it][A

{'loss': 1.3068, 'learning_rate': 2e-05, 'epoch': 1.81}



                                               t][A
  0%|          | 1/450 [11:46<27:06,  3.62s/it]  
 36%|███▋      | 164/450 [10:53<19:06,  4.01s/it][A

{'loss': 1.3397, 'learning_rate': 2e-05, 'epoch': 1.82}



                                               t][A
  0%|          | 1/450 [11:50<27:06,  3.62s/it]  
 37%|███▋      | 165/450 [10:58<19:56,  4.20s/it][A

{'loss': 1.3989, 'learning_rate': 2e-05, 'epoch': 1.83}



                                               t][A
  0%|          | 1/450 [11:54<27:06,  3.62s/it]  
 37%|███▋      | 166/450 [11:01<18:45,  3.96s/it][A

{'loss': 1.1801, 'learning_rate': 2e-05, 'epoch': 1.84}



                                               t][A
  0%|          | 1/450 [11:57<27:06,  3.62s/it]  
 37%|███▋      | 167/450 [11:05<18:27,  3.91s/it][A

{'loss': 1.231, 'learning_rate': 2e-05, 'epoch': 1.86}



                                               t][A
  0%|          | 1/450 [12:01<27:06,  3.62s/it]  
 37%|███▋      | 168/450 [11:09<18:38,  3.97s/it][A

{'loss': 1.2747, 'learning_rate': 2e-05, 'epoch': 1.87}



                                               t][A
  0%|          | 1/450 [12:05<27:06,  3.62s/it]  
 38%|███▊      | 169/450 [11:13<18:07,  3.87s/it][A

{'loss': 1.3697, 'learning_rate': 2e-05, 'epoch': 1.88}



                                               t][A
  0%|          | 1/450 [12:09<27:06,  3.62s/it]  
 38%|███▊      | 170/450 [11:17<17:57,  3.85s/it][A

{'loss': 1.1737, 'learning_rate': 2e-05, 'epoch': 1.89}



                                               t][A
  0%|          | 1/450 [12:13<27:06,  3.62s/it]  
 38%|███▊      | 171/450 [11:21<18:12,  3.92s/it][A

{'loss': 1.4005, 'learning_rate': 2e-05, 'epoch': 1.9}



                                               t][A
  0%|          | 1/450 [12:18<27:06,  3.62s/it]  
 38%|███▊      | 172/450 [11:26<19:59,  4.31s/it][A

{'loss': 1.3068, 'learning_rate': 2e-05, 'epoch': 1.91}



                                               t][A
  0%|          | 1/450 [12:22<27:06,  3.62s/it]  
 38%|███▊      | 173/450 [11:30<19:37,  4.25s/it][A

{'loss': 1.314, 'learning_rate': 2e-05, 'epoch': 1.92}



                                               t][A
  0%|          | 1/450 [12:29<27:06,  3.62s/it]  
 39%|███▊      | 174/450 [11:37<23:11,  5.04s/it][A

{'loss': 1.2888, 'learning_rate': 2e-05, 'epoch': 1.93}



                                               t][A
  0%|          | 1/450 [12:34<27:06,  3.62s/it]  
 39%|███▉      | 175/450 [11:42<22:59,  5.02s/it][A

{'loss': 1.4259, 'learning_rate': 2e-05, 'epoch': 1.94}



                                               t][A
  0%|          | 1/450 [12:38<27:06,  3.62s/it]  
 39%|███▉      | 176/450 [11:46<21:07,  4.62s/it][A

{'loss': 1.346, 'learning_rate': 2e-05, 'epoch': 1.96}



                                               t][A
  0%|          | 1/450 [12:41<27:06,  3.62s/it]  
 39%|███▉      | 177/450 [11:49<19:04,  4.19s/it][A

{'loss': 1.1118, 'learning_rate': 2e-05, 'epoch': 1.97}



                                               t][A
  0%|          | 1/450 [12:45<27:06,  3.62s/it]  
 40%|███▉      | 178/450 [11:53<19:07,  4.22s/it][A

{'loss': 1.3533, 'learning_rate': 2e-05, 'epoch': 1.98}



                                               t][A
  0%|          | 1/450 [12:49<27:06,  3.62s/it]  
 40%|███▉      | 179/450 [11:57<18:20,  4.06s/it][A

{'loss': 1.2803, 'learning_rate': 2e-05, 'epoch': 1.99}



                                               t][A
  0%|          | 1/450 [12:53<27:06,  3.62s/it]  
 40%|████      | 180/450 [12:01<18:08,  4.03s/it][A

{'loss': 1.2263, 'learning_rate': 2e-05, 'epoch': 2.0}



                                               t][A
  0%|          | 1/450 [12:57<27:06,  3.62s/it]  
 40%|████      | 181/450 [12:05<17:45,  3.96s/it][A

{'loss': 1.211, 'learning_rate': 2e-05, 'epoch': 2.01}



                                               t][A
  0%|          | 1/450 [13:02<27:06,  3.62s/it]  
 40%|████      | 182/450 [12:09<18:45,  4.20s/it][A

{'loss': 1.3963, 'learning_rate': 2e-05, 'epoch': 2.02}



                                               t][A
  0%|          | 1/450 [13:05<27:06,  3.62s/it]  
 41%|████      | 183/450 [12:13<17:21,  3.90s/it][A

{'loss': 1.3386, 'learning_rate': 2e-05, 'epoch': 2.03}



                                               t][A
  0%|          | 1/450 [13:09<27:06,  3.62s/it]  
 41%|████      | 184/450 [12:17<17:58,  4.05s/it][A

{'loss': 1.2399, 'learning_rate': 2e-05, 'epoch': 2.04}



                                               t][A
  0%|          | 1/450 [13:14<27:06,  3.62s/it]  
 41%|████      | 185/450 [12:22<18:40,  4.23s/it][A

{'loss': 1.2932, 'learning_rate': 2e-05, 'epoch': 2.06}



                                               t][A
  0%|          | 1/450 [13:18<27:06,  3.62s/it]  
 41%|████▏     | 186/450 [12:26<18:06,  4.12s/it][A

{'loss': 1.3952, 'learning_rate': 2e-05, 'epoch': 2.07}



                                               t][A
  0%|          | 1/450 [13:22<27:06,  3.62s/it]  
 42%|████▏     | 187/450 [12:30<18:26,  4.21s/it][A

{'loss': 1.3003, 'learning_rate': 2e-05, 'epoch': 2.08}



                                               t][A
  0%|          | 1/450 [13:30<27:06,  3.62s/it]  
 42%|████▏     | 188/450 [12:38<23:04,  5.28s/it][A

{'loss': 1.2899, 'learning_rate': 2e-05, 'epoch': 2.09}



                                               t][A
  0%|          | 1/450 [13:34<27:06,  3.62s/it]  
 42%|████▏     | 189/450 [12:42<21:51,  5.03s/it][A

{'loss': 1.363, 'learning_rate': 2e-05, 'epoch': 2.1}



                                               t][A
  0%|          | 1/450 [13:38<27:06,  3.62s/it]  
 42%|████▏     | 190/450 [12:45<19:26,  4.49s/it][A

{'loss': 1.2344, 'learning_rate': 2e-05, 'epoch': 2.11}



                                               t][A
  0%|          | 1/450 [13:41<27:06,  3.62s/it]  
 42%|████▏     | 191/450 [12:49<17:42,  4.10s/it][A

{'loss': 1.1305, 'learning_rate': 2e-05, 'epoch': 2.12}



                                               t][A
  0%|          | 1/450 [13:45<27:06,  3.62s/it]  
 43%|████▎     | 192/450 [12:53<17:22,  4.04s/it][A

{'loss': 1.186, 'learning_rate': 2e-05, 'epoch': 2.13}



                                               t][A
  0%|          | 1/450 [13:50<27:06,  3.62s/it]  
 43%|████▎     | 193/450 [12:58<18:47,  4.39s/it][A

{'loss': 1.2804, 'learning_rate': 2e-05, 'epoch': 2.14}



                                               t][A
  0%|          | 1/450 [13:54<27:06,  3.62s/it]  
 43%|████▎     | 194/450 [13:01<17:48,  4.18s/it][A

{'loss': 1.1893, 'learning_rate': 2e-05, 'epoch': 2.16}



                                               t][A
  0%|          | 1/450 [13:57<27:06,  3.62s/it]  
 43%|████▎     | 195/450 [13:05<16:51,  3.97s/it][A

{'loss': 1.2607, 'learning_rate': 2e-05, 'epoch': 2.17}



                                               t][A
  0%|          | 1/450 [14:01<27:06,  3.62s/it]  
 44%|████▎     | 196/450 [13:09<16:32,  3.91s/it][A

{'loss': 1.2015, 'learning_rate': 2e-05, 'epoch': 2.18}



                                               t][A
  0%|          | 1/450 [14:05<27:06,  3.62s/it]  
 44%|████▍     | 197/450 [13:12<16:16,  3.86s/it][A

{'loss': 1.3948, 'learning_rate': 2e-05, 'epoch': 2.19}



                                               t][A
  0%|          | 1/450 [14:10<27:06,  3.62s/it]  
 44%|████▍     | 198/450 [13:18<18:47,  4.47s/it][A

{'loss': 1.2904, 'learning_rate': 2e-05, 'epoch': 2.2}



                                               t][A
  0%|          | 1/450 [14:14<27:06,  3.62s/it]  
 44%|████▍     | 199/450 [13:22<17:36,  4.21s/it][A

{'loss': 1.4623, 'learning_rate': 2e-05, 'epoch': 2.21}



                                               t][A
  0%|          | 1/450 [14:18<27:06,  3.62s/it]  
 44%|████▍     | 200/450 [13:26<16:53,  4.05s/it][A

{'loss': 1.3776, 'learning_rate': 2e-05, 'epoch': 2.22}



                                               t][A
  0%|          | 1/450 [14:21<27:06,  3.62s/it]  
 45%|████▍     | 201/450 [13:29<16:21,  3.94s/it][A

{'loss': 1.2358, 'learning_rate': 2e-05, 'epoch': 2.23}



                                               t][A
  0%|          | 1/450 [14:25<27:06,  3.62s/it]  
 45%|████▍     | 202/450 [13:33<15:42,  3.80s/it][A

{'loss': 1.2964, 'learning_rate': 2e-05, 'epoch': 2.24}



                                               t][A
  0%|          | 1/450 [14:29<27:06,  3.62s/it]  
 45%|████▌     | 203/450 [13:37<15:57,  3.88s/it][A

{'loss': 1.095, 'learning_rate': 2e-05, 'epoch': 2.26}



                                               t][A
  0%|          | 1/450 [14:34<27:06,  3.62s/it]  
 45%|████▌     | 204/450 [13:42<17:14,  4.20s/it][A

{'loss': 1.3107, 'learning_rate': 2e-05, 'epoch': 2.27}



                                               t][A
  0%|          | 1/450 [14:38<27:06,  3.62s/it]  
 46%|████▌     | 205/450 [13:46<16:36,  4.07s/it][A

{'loss': 1.1573, 'learning_rate': 2e-05, 'epoch': 2.28}



                                               t][A
  0%|          | 1/450 [14:41<27:06,  3.62s/it]  
 46%|████▌     | 206/450 [13:49<16:09,  3.97s/it][A

{'loss': 1.2963, 'learning_rate': 2e-05, 'epoch': 2.29}



                                               t][A
  0%|          | 1/450 [14:45<27:06,  3.62s/it]  
 46%|████▌     | 207/450 [13:53<15:33,  3.84s/it][A

{'loss': 1.212, 'learning_rate': 2e-05, 'epoch': 2.3}



                                               t][A
  0%|          | 1/450 [14:50<27:06,  3.62s/it]  
 46%|████▌     | 208/450 [13:58<16:54,  4.19s/it][A

{'loss': 1.3345, 'learning_rate': 2e-05, 'epoch': 2.31}



                                               t][A
  0%|          | 1/450 [14:54<27:06,  3.62s/it]  
 46%|████▋     | 209/450 [14:02<16:19,  4.06s/it][A

{'loss': 1.5012, 'learning_rate': 2e-05, 'epoch': 2.32}



                                               t][A
  0%|          | 1/450 [14:58<27:06,  3.62s/it]  
 47%|████▋     | 210/450 [14:06<16:54,  4.23s/it][A

{'loss': 1.2649, 'learning_rate': 2e-05, 'epoch': 2.33}



                                               t][A
  0%|          | 1/450 [15:02<27:06,  3.62s/it]  
 47%|████▋     | 211/450 [14:10<16:32,  4.15s/it][A

{'loss': 1.1612, 'learning_rate': 2e-05, 'epoch': 2.34}



                                               t][A
  0%|          | 1/450 [15:07<27:06,  3.62s/it]  
 47%|████▋     | 212/450 [14:15<16:48,  4.24s/it][A

{'loss': 1.2271, 'learning_rate': 2e-05, 'epoch': 2.36}



                                               t][A
  0%|          | 1/450 [15:11<27:06,  3.62s/it]  
 47%|████▋     | 213/450 [14:19<17:11,  4.35s/it][A

{'loss': 1.3223, 'learning_rate': 2e-05, 'epoch': 2.37}



                                               t][A
  0%|          | 1/450 [15:15<27:06,  3.62s/it]  
 48%|████▊     | 214/450 [14:23<16:12,  4.12s/it][A

{'loss': 1.343, 'learning_rate': 2e-05, 'epoch': 2.38}



                                               t][A
  0%|          | 1/450 [15:18<27:06,  3.62s/it]  
 48%|████▊     | 215/450 [14:26<15:18,  3.91s/it][A

{'loss': 1.3141, 'learning_rate': 2e-05, 'epoch': 2.39}



                                               t][A
  0%|          | 1/450 [15:22<27:06,  3.62s/it]  
 48%|████▊     | 216/450 [14:30<14:54,  3.82s/it][A

{'loss': 1.3062, 'learning_rate': 2e-05, 'epoch': 2.4}



                                               t][A
  0%|          | 1/450 [15:26<27:06,  3.62s/it]  
 48%|████▊     | 217/450 [14:34<15:06,  3.89s/it][A

{'loss': 1.3975, 'learning_rate': 2e-05, 'epoch': 2.41}



                                               t][A
  0%|          | 1/450 [15:30<27:06,  3.62s/it]  
 48%|████▊     | 218/450 [14:38<14:48,  3.83s/it][A

{'loss': 1.1237, 'learning_rate': 2e-05, 'epoch': 2.42}



                                               t][A
  0%|          | 1/450 [15:34<27:06,  3.62s/it]  
 49%|████▊     | 219/450 [14:42<15:23,  4.00s/it][A

{'loss': 1.2495, 'learning_rate': 2e-05, 'epoch': 2.43}



                                               t][A
  0%|          | 1/450 [15:38<27:06,  3.62s/it]  
 49%|████▉     | 220/450 [14:46<15:13,  3.97s/it][A

{'loss': 1.419, 'learning_rate': 2e-05, 'epoch': 2.44}



                                               t][A
  0%|          | 1/450 [15:42<27:06,  3.62s/it]  
 49%|████▉     | 221/450 [14:50<14:48,  3.88s/it][A

{'loss': 1.2452, 'learning_rate': 2e-05, 'epoch': 2.46}



                                               t][A
  0%|          | 1/450 [15:45<27:06,  3.62s/it]  
 49%|████▉     | 222/450 [14:53<14:22,  3.78s/it][A

{'loss': 1.4173, 'learning_rate': 2e-05, 'epoch': 2.47}



                                               t][A
  0%|          | 1/450 [15:49<27:06,  3.62s/it]  
 50%|████▉     | 223/450 [14:57<14:45,  3.90s/it][A

{'loss': 1.4226, 'learning_rate': 2e-05, 'epoch': 2.48}



                                               t][A
  0%|          | 1/450 [15:53<27:06,  3.62s/it]  
 50%|████▉     | 224/450 [15:01<14:22,  3.81s/it][A

{'loss': 1.303, 'learning_rate': 2e-05, 'epoch': 2.49}



                                               t][A
  0%|          | 1/450 [15:57<27:06,  3.62s/it]  
 50%|█████     | 225/450 [15:05<14:45,  3.93s/it][A

{'loss': 1.3202, 'learning_rate': 2e-05, 'epoch': 2.5}



                                               t][A
  0%|          | 1/450 [16:04<27:06,  3.62s/it]  
 50%|█████     | 226/450 [15:11<17:19,  4.64s/it][A

{'loss': 1.3554, 'learning_rate': 2e-05, 'epoch': 2.51}



                                               t][A
  0%|          | 1/450 [16:07<27:06,  3.62s/it]  
 50%|█████     | 227/450 [15:15<16:15,  4.37s/it][A

{'loss': 1.1988, 'learning_rate': 2e-05, 'epoch': 2.52}



                                               t][A
  0%|          | 1/450 [16:11<27:06,  3.62s/it]  
 51%|█████     | 228/450 [15:19<15:46,  4.26s/it][A

{'loss': 1.2966, 'learning_rate': 2e-05, 'epoch': 2.53}



                                               t][A
  0%|          | 1/450 [16:15<27:06,  3.62s/it]  
 51%|█████     | 229/450 [15:23<15:08,  4.11s/it][A

{'loss': 1.1834, 'learning_rate': 2e-05, 'epoch': 2.54}



                                               t][A
  0%|          | 1/450 [16:19<27:06,  3.62s/it]  
 51%|█████     | 230/450 [15:26<14:22,  3.92s/it][A

{'loss': 1.204, 'learning_rate': 2e-05, 'epoch': 2.56}



                                               t][A
  0%|          | 1/450 [16:22<27:06,  3.62s/it]  
 51%|█████▏    | 231/450 [15:30<13:50,  3.79s/it][A

{'loss': 1.2432, 'learning_rate': 2e-05, 'epoch': 2.57}



                                               t][A
  0%|          | 1/450 [16:27<27:06,  3.62s/it]  
 52%|█████▏    | 232/450 [15:35<14:51,  4.09s/it][A

{'loss': 1.2509, 'learning_rate': 2e-05, 'epoch': 2.58}



                                               t][A
  0%|          | 1/450 [16:31<27:06,  3.62s/it]  
 52%|█████▏    | 233/450 [15:38<14:29,  4.01s/it][A

{'loss': 1.2474, 'learning_rate': 2e-05, 'epoch': 2.59}



                                               t][A
  0%|          | 1/450 [16:34<27:06,  3.62s/it]  
 52%|█████▏    | 234/450 [15:42<14:03,  3.91s/it][A

{'loss': 1.2901, 'learning_rate': 2e-05, 'epoch': 2.6}



                                               t][A
  0%|          | 1/450 [16:38<27:06,  3.62s/it]  
 52%|█████▏    | 235/450 [15:46<14:00,  3.91s/it][A

{'loss': 1.2185, 'learning_rate': 2e-05, 'epoch': 2.61}



                                               t][A
  0%|          | 1/450 [16:41<27:06,  3.62s/it]  
 52%|█████▏    | 236/450 [15:49<13:14,  3.71s/it][A

{'loss': 1.1084, 'learning_rate': 2e-05, 'epoch': 2.62}



                                               t][A
  0%|          | 1/450 [16:45<27:06,  3.62s/it]  
 53%|█████▎    | 237/450 [15:53<12:58,  3.66s/it][A

{'loss': 1.4245, 'learning_rate': 2e-05, 'epoch': 2.63}



                                               t][A
  0%|          | 1/450 [16:49<27:06,  3.62s/it]  
 53%|█████▎    | 238/450 [15:57<13:25,  3.80s/it][A

{'loss': 1.2993, 'learning_rate': 2e-05, 'epoch': 2.64}



                                               t][A
  0%|          | 1/450 [16:53<27:06,  3.62s/it]  
 53%|█████▎    | 239/450 [16:01<13:39,  3.89s/it][A

{'loss': 1.313, 'learning_rate': 2e-05, 'epoch': 2.66}



                                               t][A
  0%|          | 1/450 [16:56<27:06,  3.62s/it]  
 53%|█████▎    | 240/450 [16:04<12:38,  3.61s/it][A

{'loss': 1.3624, 'learning_rate': 2e-05, 'epoch': 2.67}



                                               t][A
  0%|          | 1/450 [17:01<27:06,  3.62s/it]  
 54%|█████▎    | 241/450 [16:09<13:44,  3.94s/it][A

{'loss': 1.1682, 'learning_rate': 2e-05, 'epoch': 2.68}



                                               t][A
  0%|          | 1/450 [17:05<27:06,  3.62s/it]  
 54%|█████▍    | 242/450 [16:13<13:50,  3.99s/it][A

{'loss': 1.2933, 'learning_rate': 2e-05, 'epoch': 2.69}



                                               t][A
  0%|          | 1/450 [17:09<27:06,  3.62s/it]  
 54%|█████▍    | 243/450 [16:17<14:00,  4.06s/it][A

{'loss': 1.243, 'learning_rate': 2e-05, 'epoch': 2.7}



                                               t][A
  0%|          | 1/450 [17:14<27:06,  3.62s/it]  
 54%|█████▍    | 244/450 [16:22<14:20,  4.18s/it][A

{'loss': 1.1929, 'learning_rate': 2e-05, 'epoch': 2.71}



                                               t][A
  0%|          | 1/450 [17:18<27:06,  3.62s/it]  
 54%|█████▍    | 245/450 [16:26<14:28,  4.24s/it][A

{'loss': 1.3383, 'learning_rate': 2e-05, 'epoch': 2.72}



                                               t][A
  0%|          | 1/450 [17:22<27:06,  3.62s/it]  
 55%|█████▍    | 246/450 [16:30<13:52,  4.08s/it][A

{'loss': 1.1987, 'learning_rate': 2e-05, 'epoch': 2.73}



                                               t][A
  0%|          | 1/450 [17:25<27:06,  3.62s/it]  
 55%|█████▍    | 247/450 [16:33<13:07,  3.88s/it][A

{'loss': 1.2616, 'learning_rate': 2e-05, 'epoch': 2.74}



                                               t][A
  0%|          | 1/450 [17:30<27:06,  3.62s/it]  
 55%|█████▌    | 248/450 [16:38<14:04,  4.18s/it][A

{'loss': 1.2453, 'learning_rate': 2e-05, 'epoch': 2.76}



                                               t][A
  0%|          | 1/450 [17:34<27:06,  3.62s/it]  
 55%|█████▌    | 249/450 [16:42<13:45,  4.11s/it][A

{'loss': 1.2939, 'learning_rate': 2e-05, 'epoch': 2.77}



                                               t][A
  0%|          | 1/450 [17:37<27:06,  3.62s/it]  
 56%|█████▌    | 250/450 [16:45<12:48,  3.84s/it][A

{'loss': 1.2291, 'learning_rate': 2e-05, 'epoch': 2.78}



                                               t][A
  0%|          | 1/450 [17:41<27:06,  3.62s/it]  
 56%|█████▌    | 251/450 [16:49<12:27,  3.76s/it][A

{'loss': 1.223, 'learning_rate': 2e-05, 'epoch': 2.79}



                                               t][A
  0%|          | 1/450 [17:44<27:06,  3.62s/it]  
 56%|█████▌    | 252/450 [16:52<12:05,  3.66s/it][A

{'loss': 1.2218, 'learning_rate': 2e-05, 'epoch': 2.8}



                                               t][A
  0%|          | 1/450 [17:49<27:06,  3.62s/it]  
 56%|█████▌    | 253/450 [16:56<12:44,  3.88s/it][A

{'loss': 1.325, 'learning_rate': 2e-05, 'epoch': 2.81}



                                               t][A
  0%|          | 1/450 [17:54<27:06,  3.62s/it]  
 56%|█████▋    | 254/450 [17:02<14:10,  4.34s/it][A

{'loss': 1.1948, 'learning_rate': 2e-05, 'epoch': 2.82}



                                               t][A
  0%|          | 1/450 [17:58<27:06,  3.62s/it]  
 57%|█████▋    | 255/450 [17:06<14:05,  4.33s/it][A

{'loss': 1.3043, 'learning_rate': 2e-05, 'epoch': 2.83}



                                               t][A
  0%|          | 1/450 [18:02<27:06,  3.62s/it]  
 57%|█████▋    | 256/450 [17:10<13:40,  4.23s/it][A

{'loss': 1.2954, 'learning_rate': 2e-05, 'epoch': 2.84}



                                               t][A
  0%|          | 1/450 [18:06<27:06,  3.62s/it]  
 57%|█████▋    | 257/450 [17:13<12:42,  3.95s/it][A

{'loss': 1.2569, 'learning_rate': 2e-05, 'epoch': 2.86}



                                               t][A
  0%|          | 1/450 [18:09<27:06,  3.62s/it]  
 57%|█████▋    | 258/450 [17:17<12:02,  3.76s/it][A

{'loss': 1.3089, 'learning_rate': 2e-05, 'epoch': 2.87}



                                               t][A
  0%|          | 1/450 [18:14<27:06,  3.62s/it]  
 58%|█████▊    | 259/450 [17:22<13:06,  4.12s/it][A

{'loss': 1.2996, 'learning_rate': 2e-05, 'epoch': 2.88}



                                               t][A
  0%|          | 1/450 [18:18<27:06,  3.62s/it]  
 58%|█████▊    | 260/450 [17:26<12:49,  4.05s/it][A

{'loss': 1.299, 'learning_rate': 2e-05, 'epoch': 2.89}



                                               t][A
  0%|          | 1/450 [18:21<27:06,  3.62s/it]  
 58%|█████▊    | 261/450 [17:29<12:18,  3.91s/it][A

{'loss': 1.2845, 'learning_rate': 2e-05, 'epoch': 2.9}



                                               t][A
  0%|          | 1/450 [18:24<27:06,  3.62s/it]  
 58%|█████▊    | 262/450 [17:32<11:28,  3.66s/it][A

{'loss': 1.1922, 'learning_rate': 2e-05, 'epoch': 2.91}



                                               t][A
  0%|          | 1/450 [18:28<27:06,  3.62s/it]  
 58%|█████▊    | 263/450 [17:36<11:04,  3.55s/it][A

{'loss': 1.1089, 'learning_rate': 2e-05, 'epoch': 2.92}



                                               t][A
  0%|          | 1/450 [18:32<27:06,  3.62s/it]  
 59%|█████▊    | 264/450 [17:39<11:17,  3.64s/it][A

{'loss': 1.2926, 'learning_rate': 2e-05, 'epoch': 2.93}



                                               t][A
  0%|          | 1/450 [18:36<27:06,  3.62s/it]  
 59%|█████▉    | 265/450 [17:43<11:31,  3.74s/it][A

{'loss': 1.2205, 'learning_rate': 2e-05, 'epoch': 2.94}



                                               t][A
  0%|          | 1/450 [18:39<27:06,  3.62s/it]  
 59%|█████▉    | 266/450 [17:47<11:18,  3.69s/it][A

{'loss': 1.2509, 'learning_rate': 2e-05, 'epoch': 2.96}



                                               t][A
  0%|          | 1/450 [18:43<27:06,  3.62s/it]  
 59%|█████▉    | 267/450 [17:50<11:01,  3.61s/it][A

{'loss': 1.3576, 'learning_rate': 2e-05, 'epoch': 2.97}



                                               t][A
  0%|          | 1/450 [18:46<27:06,  3.62s/it]  
 60%|█████▉    | 268/450 [17:54<11:13,  3.70s/it][A

{'loss': 1.3584, 'learning_rate': 2e-05, 'epoch': 2.98}



                                               t][A
  0%|          | 1/450 [18:50<27:06,  3.62s/it]  
 60%|█████▉    | 269/450 [17:58<10:59,  3.64s/it][A

{'loss': 1.2712, 'learning_rate': 2e-05, 'epoch': 2.99}



                                               t][A
  0%|          | 1/450 [18:54<27:06,  3.62s/it]  
 60%|██████    | 270/450 [18:02<11:01,  3.68s/it][A

{'loss': 1.3389, 'learning_rate': 2e-05, 'epoch': 3.0}



                                               t][A
  0%|          | 1/450 [18:58<27:06,  3.62s/it]  
 60%|██████    | 271/450 [18:05<11:07,  3.73s/it][A

{'loss': 1.217, 'learning_rate': 2e-05, 'epoch': 3.01}



                                               t][A
  0%|          | 1/450 [19:01<27:06,  3.62s/it]  
 60%|██████    | 272/450 [18:09<10:29,  3.54s/it][A

{'loss': 1.2596, 'learning_rate': 2e-05, 'epoch': 3.02}



                                               t][A
  0%|          | 1/450 [19:05<27:06,  3.62s/it]  
 61%|██████    | 273/450 [18:13<10:55,  3.70s/it][A

{'loss': 1.2238, 'learning_rate': 2e-05, 'epoch': 3.03}



                                               t][A
  0%|          | 1/450 [19:09<27:06,  3.62s/it]  
 61%|██████    | 274/450 [18:17<11:08,  3.80s/it][A

{'loss': 1.1841, 'learning_rate': 2e-05, 'epoch': 3.04}



                                               t][A
  0%|          | 1/450 [19:12<27:06,  3.62s/it]  
 61%|██████    | 275/450 [18:20<10:54,  3.74s/it][A

{'loss': 1.3192, 'learning_rate': 2e-05, 'epoch': 3.06}



                                               t][A
  0%|          | 1/450 [19:16<27:06,  3.62s/it]  
 61%|██████▏   | 276/450 [18:24<10:51,  3.75s/it][A

{'loss': 1.3277, 'learning_rate': 2e-05, 'epoch': 3.07}



                                               t][A
  0%|          | 1/450 [19:20<27:06,  3.62s/it]  
 62%|██████▏   | 277/450 [18:28<11:03,  3.83s/it][A

{'loss': 1.3483, 'learning_rate': 2e-05, 'epoch': 3.08}



                                               t][A
  0%|          | 1/450 [19:24<27:06,  3.62s/it]  
 62%|██████▏   | 278/450 [18:32<10:51,  3.79s/it][A

{'loss': 1.2256, 'learning_rate': 2e-05, 'epoch': 3.09}



                                               t][A
  0%|          | 1/450 [19:27<27:06,  3.62s/it]  
 62%|██████▏   | 279/450 [18:35<10:39,  3.74s/it][A

{'loss': 1.1878, 'learning_rate': 2e-05, 'epoch': 3.1}



                                               t][A
  0%|          | 1/450 [19:32<27:06,  3.62s/it]  
 62%|██████▏   | 280/450 [18:40<11:02,  3.90s/it][A

{'loss': 1.2736, 'learning_rate': 2e-05, 'epoch': 3.11}



                                               t][A
  0%|          | 1/450 [19:36<27:06,  3.62s/it]  
 62%|██████▏   | 281/450 [18:44<11:08,  3.95s/it][A

{'loss': 1.2108, 'learning_rate': 2e-05, 'epoch': 3.12}



                                               t][A
  0%|          | 1/450 [19:39<27:06,  3.62s/it]  
 63%|██████▎   | 282/450 [18:47<10:48,  3.86s/it][A

{'loss': 1.1168, 'learning_rate': 2e-05, 'epoch': 3.13}



                                               t][A
  0%|          | 1/450 [19:43<27:06,  3.62s/it]  
 63%|██████▎   | 283/450 [18:51<10:20,  3.71s/it][A

{'loss': 1.2841, 'learning_rate': 2e-05, 'epoch': 3.14}



                                               t][A
  0%|          | 1/450 [19:47<27:06,  3.62s/it]  
 63%|██████▎   | 284/450 [18:55<10:24,  3.76s/it][A

{'loss': 1.215, 'learning_rate': 2e-05, 'epoch': 3.16}



                                               t][A
  0%|          | 1/450 [19:51<27:06,  3.62s/it]  
 63%|██████▎   | 285/450 [18:58<10:28,  3.81s/it][A

{'loss': 1.1552, 'learning_rate': 2e-05, 'epoch': 3.17}



                                               t][A
  0%|          | 1/450 [19:55<27:06,  3.62s/it]  
 64%|██████▎   | 286/450 [19:02<10:29,  3.84s/it][A

{'loss': 1.2161, 'learning_rate': 2e-05, 'epoch': 3.18}



                                               t][A
  0%|          | 1/450 [19:58<27:06,  3.62s/it]  
 64%|██████▍   | 287/450 [19:06<10:24,  3.83s/it][A

{'loss': 1.3477, 'learning_rate': 2e-05, 'epoch': 3.19}



                                               t][A
  0%|          | 1/450 [20:03<27:06,  3.62s/it]  
 64%|██████▍   | 288/450 [19:11<11:06,  4.11s/it][A

{'loss': 1.2334, 'learning_rate': 2e-05, 'epoch': 3.2}



                                               t][A
  0%|          | 1/450 [20:06<27:06,  3.62s/it]  
 64%|██████▍   | 289/450 [19:14<10:14,  3.82s/it][A

{'loss': 1.3001, 'learning_rate': 2e-05, 'epoch': 3.21}



                                               t][A
  0%|          | 1/450 [20:09<27:06,  3.62s/it]  
 64%|██████▍   | 290/450 [19:17<09:30,  3.56s/it][A

{'loss': 1.0189, 'learning_rate': 2e-05, 'epoch': 3.22}



                                               t][A
  0%|          | 1/450 [20:13<27:06,  3.62s/it]  
 65%|██████▍   | 291/450 [19:20<09:14,  3.49s/it][A

{'loss': 1.254, 'learning_rate': 2e-05, 'epoch': 3.23}



                                               t][A
  0%|          | 1/450 [20:16<27:06,  3.62s/it]  
 65%|██████▍   | 292/450 [19:24<09:28,  3.60s/it][A

{'loss': 1.0806, 'learning_rate': 2e-05, 'epoch': 3.24}



                                               t][A
  0%|          | 1/450 [20:21<27:06,  3.62s/it]  
 65%|██████▌   | 293/450 [19:29<10:25,  3.98s/it][A

{'loss': 1.3138, 'learning_rate': 2e-05, 'epoch': 3.26}



                                               t][A
  0%|          | 1/450 [20:26<27:06,  3.62s/it]  
 65%|██████▌   | 294/450 [19:34<10:43,  4.12s/it][A

{'loss': 1.2069, 'learning_rate': 2e-05, 'epoch': 3.27}



                                               t][A
  0%|          | 1/450 [20:30<27:06,  3.62s/it]  
 66%|██████▌   | 295/450 [19:38<10:31,  4.08s/it][A

{'loss': 1.2564, 'learning_rate': 2e-05, 'epoch': 3.28}



                                               t][A
  0%|          | 1/450 [20:33<27:06,  3.62s/it]  
 66%|██████▌   | 296/450 [19:41<10:01,  3.91s/it][A

{'loss': 1.2159, 'learning_rate': 2e-05, 'epoch': 3.29}



                                               t][A
  0%|          | 1/450 [20:38<27:06,  3.62s/it]  
 66%|██████▌   | 297/450 [19:46<10:36,  4.16s/it][A

{'loss': 1.035, 'learning_rate': 2e-05, 'epoch': 3.3}



                                               t][A
  0%|          | 1/450 [20:41<27:06,  3.62s/it]  
 66%|██████▌   | 298/450 [19:49<10:04,  3.98s/it][A

{'loss': 1.3348, 'learning_rate': 2e-05, 'epoch': 3.31}



                                               t][A
  0%|          | 1/450 [20:45<27:06,  3.62s/it]  
 66%|██████▋   | 299/450 [19:53<09:52,  3.92s/it][A

{'loss': 1.1858, 'learning_rate': 2e-05, 'epoch': 3.32}



                                               t][A
  0%|          | 1/450 [20:49<27:06,  3.62s/it]  
 67%|██████▋   | 300/450 [19:57<09:39,  3.86s/it][A

{'loss': 1.2503, 'learning_rate': 2e-05, 'epoch': 3.33}



                                               t][A
  0%|          | 1/450 [20:54<27:06,  3.62s/it]  
 67%|██████▋   | 301/450 [20:02<10:15,  4.13s/it][A

{'loss': 1.2748, 'learning_rate': 2e-05, 'epoch': 3.34}



                                               t][A
  0%|          | 1/450 [20:58<27:06,  3.62s/it]  
 67%|██████▋   | 302/450 [20:06<10:01,  4.07s/it][A

{'loss': 1.2131, 'learning_rate': 2e-05, 'epoch': 3.36}



                                               t][A
  0%|          | 1/450 [21:01<27:06,  3.62s/it]  
 67%|██████▋   | 303/450 [20:09<09:37,  3.93s/it][A

{'loss': 1.2046, 'learning_rate': 2e-05, 'epoch': 3.37}



                                               t][A
  0%|          | 1/450 [21:06<27:06,  3.62s/it]  
 68%|██████▊   | 304/450 [20:14<10:00,  4.12s/it][A

{'loss': 1.2716, 'learning_rate': 2e-05, 'epoch': 3.38}



                                               t][A
  0%|          | 1/450 [21:10<27:06,  3.62s/it]  
 68%|██████▊   | 305/450 [20:18<10:10,  4.21s/it][A

{'loss': 1.3449, 'learning_rate': 2e-05, 'epoch': 3.39}



                                               t][A
  0%|          | 1/450 [21:14<27:06,  3.62s/it]  
 68%|██████▊   | 306/450 [20:22<09:38,  4.02s/it][A

{'loss': 1.1789, 'learning_rate': 2e-05, 'epoch': 3.4}



                                               t][A
  0%|          | 1/450 [21:18<27:06,  3.62s/it]  
 68%|██████▊   | 307/450 [20:26<09:41,  4.07s/it][A

{'loss': 1.2985, 'learning_rate': 2e-05, 'epoch': 3.41}



                                               t][A
  0%|          | 1/450 [21:22<27:06,  3.62s/it]  
 68%|██████▊   | 308/450 [20:30<09:34,  4.05s/it][A

{'loss': 1.2788, 'learning_rate': 2e-05, 'epoch': 3.42}



                                               t][A
  0%|          | 1/450 [21:26<27:06,  3.62s/it]  
 69%|██████▊   | 309/450 [20:34<09:22,  3.99s/it][A

{'loss': 1.2677, 'learning_rate': 2e-05, 'epoch': 3.43}



                                               t][A
  0%|          | 1/450 [21:30<27:06,  3.62s/it]  
 69%|██████▉   | 310/450 [20:37<09:05,  3.90s/it][A

{'loss': 1.2218, 'learning_rate': 2e-05, 'epoch': 3.44}



                                               t][A
  0%|          | 1/450 [21:35<27:06,  3.62s/it]  
 69%|██████▉   | 311/450 [20:43<09:52,  4.26s/it][A

{'loss': 1.271, 'learning_rate': 2e-05, 'epoch': 3.46}



                                               t][A
  0%|          | 1/450 [21:38<27:06,  3.62s/it]  
 69%|██████▉   | 312/450 [20:46<09:23,  4.08s/it][A

{'loss': 1.2715, 'learning_rate': 2e-05, 'epoch': 3.47}



                                               t][A
  0%|          | 1/450 [21:42<27:06,  3.62s/it]  
 70%|██████▉   | 313/450 [20:50<08:48,  3.86s/it][A

{'loss': 1.2005, 'learning_rate': 2e-05, 'epoch': 3.48}



                                               t][A
  0%|          | 1/450 [21:47<27:06,  3.62s/it]  
 70%|██████▉   | 314/450 [20:54<09:25,  4.16s/it][A

{'loss': 1.1717, 'learning_rate': 2e-05, 'epoch': 3.49}



                                               t][A
  0%|          | 1/450 [21:50<27:06,  3.62s/it]  
 70%|███████   | 315/450 [20:58<08:44,  3.89s/it][A

{'loss': 1.3096, 'learning_rate': 2e-05, 'epoch': 3.5}



                                               t][A
  0%|          | 1/450 [21:53<27:06,  3.62s/it]  
 70%|███████   | 316/450 [21:01<08:18,  3.72s/it][A

{'loss': 1.3051, 'learning_rate': 2e-05, 'epoch': 3.51}



                                               t][A
  0%|          | 1/450 [21:58<27:06,  3.62s/it]  
 70%|███████   | 317/450 [21:05<08:45,  3.95s/it][A

{'loss': 1.1878, 'learning_rate': 2e-05, 'epoch': 3.52}



                                               t][A
  0%|          | 1/450 [22:02<27:06,  3.62s/it]  
 71%|███████   | 318/450 [21:09<08:40,  3.94s/it][A

{'loss': 1.2418, 'learning_rate': 2e-05, 'epoch': 3.53}



                                               t][A
  0%|          | 1/450 [22:05<27:06,  3.62s/it]  
 71%|███████   | 319/450 [21:13<08:26,  3.86s/it][A

{'loss': 1.3962, 'learning_rate': 2e-05, 'epoch': 3.54}



                                               t][A
  0%|          | 1/450 [22:10<27:06,  3.62s/it]  
 71%|███████   | 320/450 [21:17<08:40,  4.00s/it][A

{'loss': 1.299, 'learning_rate': 2e-05, 'epoch': 3.56}



                                               t][A
  0%|          | 1/450 [22:14<27:06,  3.62s/it]  
 71%|███████▏  | 321/450 [21:22<08:54,  4.14s/it][A

{'loss': 1.3695, 'learning_rate': 2e-05, 'epoch': 3.57}



                                               t][A
  0%|          | 1/450 [22:18<27:06,  3.62s/it]  
 72%|███████▏  | 322/450 [21:25<08:28,  3.97s/it][A

{'loss': 1.1061, 'learning_rate': 2e-05, 'epoch': 3.58}



                                               t][A
  0%|          | 1/450 [22:21<27:06,  3.62s/it]  
 72%|███████▏  | 323/450 [21:29<07:50,  3.71s/it][A

{'loss': 1.2401, 'learning_rate': 2e-05, 'epoch': 3.59}



                                               t][A
  0%|          | 1/450 [22:25<27:06,  3.62s/it]  
 72%|███████▏  | 324/450 [21:33<08:07,  3.87s/it][A

{'loss': 1.3242, 'learning_rate': 2e-05, 'epoch': 3.6}



                                               t][A
  0%|          | 1/450 [22:29<27:06,  3.62s/it]  
 72%|███████▏  | 325/450 [21:37<08:02,  3.86s/it][A

{'loss': 1.2457, 'learning_rate': 2e-05, 'epoch': 3.61}



                                               t][A
  0%|          | 1/450 [22:32<27:06,  3.62s/it]  
 72%|███████▏  | 326/450 [21:40<07:38,  3.70s/it][A

{'loss': 1.2978, 'learning_rate': 2e-05, 'epoch': 3.62}



                                               t][A
  0%|          | 1/450 [22:41<27:06,  3.62s/it]  
 73%|███████▎  | 327/450 [21:49<10:34,  5.16s/it][A

{'loss': 1.3897, 'learning_rate': 2e-05, 'epoch': 3.63}



                                               t][A
  0%|          | 1/450 [22:45<27:06,  3.62s/it]  
 73%|███████▎  | 328/450 [21:53<09:58,  4.91s/it][A

{'loss': 1.2661, 'learning_rate': 2e-05, 'epoch': 3.64}



                                               t][A
  0%|          | 1/450 [22:48<27:06,  3.62s/it]  
 73%|███████▎  | 329/450 [21:56<09:02,  4.48s/it][A

{'loss': 1.2118, 'learning_rate': 2e-05, 'epoch': 3.66}



                                               t][A
  0%|          | 1/450 [22:51<27:06,  3.62s/it]  
 73%|███████▎  | 330/450 [21:59<08:05,  4.04s/it][A

{'loss': 1.1694, 'learning_rate': 2e-05, 'epoch': 3.67}



                                               t][A
  0%|          | 1/450 [22:56<27:06,  3.62s/it]  

{'loss': 1.3988, 'learning_rate': 2e-05, 'epoch': 3.68}



 74%|███████▎  | 331/450 [22:03<08:03,  4.06s/it][A
                                               t][A
  0%|          | 1/450 [22:59<27:06,  3.62s/it]  
 74%|███████▍  | 332/450 [22:07<07:24,  3.77s/it][A

{'loss': 1.2731, 'learning_rate': 2e-05, 'epoch': 3.69}



                                               t][A
  0%|          | 1/450 [23:02<27:06,  3.62s/it]  
 74%|███████▍  | 333/450 [22:10<07:20,  3.77s/it][A

{'loss': 1.2065, 'learning_rate': 2e-05, 'epoch': 3.7}



                                               t][A
  0%|          | 1/450 [23:07<27:06,  3.62s/it]  
 74%|███████▍  | 334/450 [22:15<07:54,  4.09s/it][A

{'loss': 1.2534, 'learning_rate': 2e-05, 'epoch': 3.71}



                                               t][A
  0%|          | 1/450 [23:12<27:06,  3.62s/it]  
 74%|███████▍  | 335/450 [22:20<08:03,  4.21s/it][A

{'loss': 1.27, 'learning_rate': 2e-05, 'epoch': 3.72}



                                               t][A
  0%|          | 1/450 [23:15<27:06,  3.62s/it]  
 75%|███████▍  | 336/450 [22:23<07:30,  3.95s/it][A

{'loss': 1.2694, 'learning_rate': 2e-05, 'epoch': 3.73}



                                               t][A
  0%|          | 1/450 [23:19<27:06,  3.62s/it]  
 75%|███████▍  | 337/450 [22:27<07:26,  3.95s/it][A

{'loss': 1.3454, 'learning_rate': 2e-05, 'epoch': 3.74}



                                               t][A
  0%|          | 1/450 [23:23<27:06,  3.62s/it]  
 75%|███████▌  | 338/450 [22:31<07:10,  3.84s/it][A

{'loss': 1.2267, 'learning_rate': 2e-05, 'epoch': 3.76}



                                               t][A
  0%|          | 1/450 [23:28<27:06,  3.62s/it]  
 75%|███████▌  | 339/450 [22:36<07:52,  4.25s/it][A

{'loss': 1.218, 'learning_rate': 2e-05, 'epoch': 3.77}



                                               t][A
  0%|          | 1/450 [23:31<27:06,  3.62s/it]  
 76%|███████▌  | 340/450 [22:39<07:16,  3.97s/it][A

{'loss': 1.2273, 'learning_rate': 2e-05, 'epoch': 3.78}



                                               t][A
  0%|          | 1/450 [23:35<27:06,  3.62s/it]  
 76%|███████▌  | 341/450 [22:43<07:05,  3.90s/it][A

{'loss': 1.2489, 'learning_rate': 2e-05, 'epoch': 3.79}



                                               t][A
  0%|          | 1/450 [23:40<27:06,  3.62s/it]  
 76%|███████▌  | 342/450 [22:47<07:24,  4.11s/it][A

{'loss': 1.2881, 'learning_rate': 2e-05, 'epoch': 3.8}



                                               t][A
  0%|          | 1/450 [23:43<27:06,  3.62s/it]  
 76%|███████▌  | 343/450 [22:51<07:08,  4.01s/it][A

{'loss': 1.2662, 'learning_rate': 2e-05, 'epoch': 3.81}



                                               t][A
  0%|          | 1/450 [23:48<27:06,  3.62s/it]  
 76%|███████▋  | 344/450 [22:56<07:38,  4.33s/it][A

{'loss': 1.2939, 'learning_rate': 2e-05, 'epoch': 3.82}



                                               t][A
  0%|          | 1/450 [23:52<27:06,  3.62s/it]  
 77%|███████▋  | 345/450 [23:00<07:25,  4.25s/it][A

{'loss': 1.2775, 'learning_rate': 2e-05, 'epoch': 3.83}



                                               t][A
  0%|          | 1/450 [23:58<27:06,  3.62s/it]  
 77%|███████▋  | 346/450 [23:05<07:48,  4.51s/it][A

{'loss': 1.2644, 'learning_rate': 2e-05, 'epoch': 3.84}



                                               t][A
  0%|          | 1/450 [24:01<27:06,  3.62s/it]  
 77%|███████▋  | 347/450 [23:09<07:24,  4.31s/it][A

{'loss': 1.2018, 'learning_rate': 2e-05, 'epoch': 3.86}



                                               t][A
  0%|          | 1/450 [24:05<27:06,  3.62s/it]  
 77%|███████▋  | 348/450 [23:13<06:58,  4.11s/it][A

{'loss': 1.2024, 'learning_rate': 2e-05, 'epoch': 3.87}



                                               t][A
  0%|          | 1/450 [24:09<27:06,  3.62s/it]  
 78%|███████▊  | 349/450 [23:17<06:45,  4.02s/it][A

{'loss': 1.1545, 'learning_rate': 2e-05, 'epoch': 3.88}



                                               t][A
  0%|          | 1/450 [24:13<27:06,  3.62s/it]  
 78%|███████▊  | 350/450 [23:20<06:32,  3.93s/it][A

{'loss': 1.2889, 'learning_rate': 2e-05, 'epoch': 3.89}



                                               t][A
  0%|          | 1/450 [24:16<27:06,  3.62s/it]  
 78%|███████▊  | 351/450 [23:24<06:21,  3.86s/it][A

{'loss': 1.325, 'learning_rate': 2e-05, 'epoch': 3.9}



                                               t][A
  0%|          | 1/450 [24:20<27:06,  3.62s/it]  
 78%|███████▊  | 352/450 [23:28<06:09,  3.77s/it][A

{'loss': 1.2361, 'learning_rate': 2e-05, 'epoch': 3.91}



                                               t][A
  0%|          | 1/450 [24:24<27:06,  3.62s/it]  
 78%|███████▊  | 353/450 [23:32<06:27,  3.99s/it][A

{'loss': 1.1623, 'learning_rate': 2e-05, 'epoch': 3.92}



                                               t][A
  0%|          | 1/450 [24:28<27:06,  3.62s/it]  
 79%|███████▊  | 354/450 [23:36<06:28,  4.04s/it][A

{'loss': 1.2293, 'learning_rate': 2e-05, 'epoch': 3.93}



                                               t][A
  0%|          | 1/450 [24:32<27:06,  3.62s/it]  
 79%|███████▉  | 355/450 [23:40<06:17,  3.98s/it][A

{'loss': 1.2276, 'learning_rate': 2e-05, 'epoch': 3.94}



                                               t][A
  0%|          | 1/450 [24:36<27:06,  3.62s/it]  
 79%|███████▉  | 356/450 [23:44<06:02,  3.86s/it][A

{'loss': 1.3927, 'learning_rate': 2e-05, 'epoch': 3.96}



                                               t][A
  0%|          | 1/450 [24:40<27:06,  3.62s/it]  
 79%|███████▉  | 357/450 [23:48<06:12,  4.00s/it][A

{'loss': 1.2086, 'learning_rate': 2e-05, 'epoch': 3.97}



                                               t][A
  0%|          | 1/450 [24:46<27:06,  3.62s/it]  
 80%|███████▉  | 358/450 [23:54<06:48,  4.44s/it][A

{'loss': 1.2342, 'learning_rate': 2e-05, 'epoch': 3.98}



                                               t][A
  0%|          | 1/450 [24:50<27:06,  3.62s/it]  
 80%|███████▉  | 359/450 [23:58<06:49,  4.50s/it][A

{'loss': 1.3277, 'learning_rate': 2e-05, 'epoch': 3.99}



                                               t][A
  0%|          | 1/450 [24:54<27:06,  3.62s/it]  
 80%|████████  | 360/450 [24:02<06:24,  4.27s/it][A

{'loss': 1.095, 'learning_rate': 2e-05, 'epoch': 4.0}



                                               t][A
  0%|          | 1/450 [24:58<27:06,  3.62s/it]  
 80%|████████  | 361/450 [24:05<06:01,  4.06s/it][A

{'loss': 1.2089, 'learning_rate': 2e-05, 'epoch': 4.01}



                                               t][A
  0%|          | 1/450 [25:01<27:06,  3.62s/it]  
 80%|████████  | 362/450 [24:09<05:52,  4.00s/it][A

{'loss': 1.1913, 'learning_rate': 2e-05, 'epoch': 4.02}



                                               t][A
  0%|          | 1/450 [25:05<27:06,  3.62s/it]  
 81%|████████  | 363/450 [24:13<05:43,  3.95s/it][A

{'loss': 1.2266, 'learning_rate': 2e-05, 'epoch': 4.03}



                                               t][A
  0%|          | 1/450 [25:10<27:06,  3.62s/it]  
 81%|████████  | 364/450 [24:18<05:50,  4.07s/it][A

{'loss': 1.2277, 'learning_rate': 2e-05, 'epoch': 4.04}



                                               t][A
  0%|          | 1/450 [25:14<27:06,  3.62s/it]  
 81%|████████  | 365/450 [24:22<05:53,  4.16s/it][A

{'loss': 1.1979, 'learning_rate': 2e-05, 'epoch': 4.06}



                                               t][A
  0%|          | 1/450 [25:18<27:06,  3.62s/it]  
 81%|████████▏ | 366/450 [24:25<05:32,  3.96s/it][A

{'loss': 1.0823, 'learning_rate': 2e-05, 'epoch': 4.07}



                                               t][A
  0%|          | 1/450 [25:21<27:06,  3.62s/it]  
 82%|████████▏ | 367/450 [24:29<05:22,  3.89s/it][A

{'loss': 1.154, 'learning_rate': 2e-05, 'epoch': 4.08}



                                               t][A
  0%|          | 1/450 [25:25<27:06,  3.62s/it]  
 82%|████████▏ | 368/450 [24:33<05:08,  3.77s/it][A

{'loss': 1.0512, 'learning_rate': 2e-05, 'epoch': 4.09}



                                               t][A
  0%|          | 1/450 [25:29<27:06,  3.62s/it]  
 82%|████████▏ | 369/450 [24:37<05:19,  3.95s/it][A

{'loss': 1.2045, 'learning_rate': 2e-05, 'epoch': 4.1}



                                               t][A
  0%|          | 1/450 [25:33<27:06,  3.62s/it]  
 82%|████████▏ | 370/450 [24:40<05:04,  3.81s/it][A

{'loss': 1.014, 'learning_rate': 2e-05, 'epoch': 4.11}



                                               t][A
  0%|          | 1/450 [25:37<27:06,  3.62s/it]  
 82%|████████▏ | 371/450 [24:44<05:03,  3.84s/it][A

{'loss': 1.3316, 'learning_rate': 2e-05, 'epoch': 4.12}



                                               t][A
  0%|          | 1/450 [25:42<27:06,  3.62s/it]  
 83%|████████▎ | 372/450 [24:50<05:47,  4.45s/it][A

{'loss': 1.203, 'learning_rate': 2e-05, 'epoch': 4.13}



                                               t][A
  0%|          | 1/450 [25:46<27:06,  3.62s/it]  
 83%|████████▎ | 373/450 [24:54<05:29,  4.27s/it][A

{'loss': 1.12, 'learning_rate': 2e-05, 'epoch': 4.14}



                                               t][A
  0%|          | 1/450 [25:50<27:06,  3.62s/it]  
 83%|████████▎ | 374/450 [24:58<05:08,  4.06s/it][A

{'loss': 1.1264, 'learning_rate': 2e-05, 'epoch': 4.16}



                                               t][A
  0%|          | 1/450 [25:54<27:06,  3.62s/it]  
 83%|████████▎ | 375/450 [25:02<05:10,  4.14s/it][A

{'loss': 1.1057, 'learning_rate': 2e-05, 'epoch': 4.17}



                                               t][A
  0%|          | 1/450 [25:58<27:06,  3.62s/it]  
 84%|████████▎ | 376/450 [25:06<04:54,  3.98s/it][A

{'loss': 1.2399, 'learning_rate': 2e-05, 'epoch': 4.18}



                                               t][A
  0%|          | 1/450 [26:01<27:06,  3.62s/it]  
 84%|████████▍ | 377/450 [25:09<04:43,  3.88s/it][A

{'loss': 1.1301, 'learning_rate': 2e-05, 'epoch': 4.19}



                                               t][A
  0%|          | 1/450 [26:05<27:06,  3.62s/it]  
 84%|████████▍ | 378/450 [25:13<04:39,  3.88s/it][A

{'loss': 1.3355, 'learning_rate': 2e-05, 'epoch': 4.2}



                                               t][A
  0%|          | 1/450 [26:09<27:06,  3.62s/it]  
 84%|████████▍ | 379/450 [25:17<04:36,  3.90s/it][A

{'loss': 1.1899, 'learning_rate': 2e-05, 'epoch': 4.21}



                                               t][A
  0%|          | 1/450 [26:14<27:06,  3.62s/it]  
 84%|████████▍ | 380/450 [25:22<04:47,  4.10s/it][A

{'loss': 1.2065, 'learning_rate': 2e-05, 'epoch': 4.22}



                                               t][A
  0%|          | 1/450 [26:17<27:06,  3.62s/it]  
 85%|████████▍ | 381/450 [25:25<04:30,  3.92s/it][A

{'loss': 1.136, 'learning_rate': 2e-05, 'epoch': 4.23}



                                               t][A
  0%|          | 1/450 [26:21<27:06,  3.62s/it]  
 85%|████████▍ | 382/450 [25:29<04:21,  3.85s/it][A

{'loss': 1.1253, 'learning_rate': 2e-05, 'epoch': 4.24}



                                               t][A
  0%|          | 1/450 [26:24<27:06,  3.62s/it]  
 85%|████████▌ | 383/450 [25:32<04:07,  3.69s/it][A

{'loss': 1.0939, 'learning_rate': 2e-05, 'epoch': 4.26}



                                               t][A
  0%|          | 1/450 [26:28<27:06,  3.62s/it]  
 85%|████████▌ | 384/450 [25:36<03:57,  3.60s/it][A

{'loss': 1.1914, 'learning_rate': 2e-05, 'epoch': 4.27}



                                               t][A
  0%|          | 1/450 [26:31<27:06,  3.62s/it]  
 86%|████████▌ | 385/450 [25:39<03:44,  3.45s/it]

{'loss': 1.0254, 'learning_rate': 2e-05, 'epoch': 4.28}


[A
                                               t][A
  0%|          | 1/450 [26:36<27:06,  3.62s/it]  
 86%|████████▌ | 386/450 [25:44<04:11,  3.94s/it][A

{'loss': 1.2813, 'learning_rate': 2e-05, 'epoch': 4.29}



                                               t][A
  0%|          | 1/450 [26:39<27:06,  3.62s/it]  
 86%|████████▌ | 387/450 [25:47<03:52,  3.69s/it][A

{'loss': 1.33, 'learning_rate': 2e-05, 'epoch': 4.3}



                                               t][A
  0%|          | 1/450 [26:43<27:06,  3.62s/it]  
 86%|████████▌ | 388/450 [25:50<03:46,  3.66s/it][A

{'loss': 1.2726, 'learning_rate': 2e-05, 'epoch': 4.31}



                                               t][A
  0%|          | 1/450 [26:46<27:06,  3.62s/it]  
 86%|████████▋ | 389/450 [25:54<03:40,  3.62s/it][A

{'loss': 1.285, 'learning_rate': 2e-05, 'epoch': 4.32}



                                               t][A
  0%|          | 1/450 [26:50<27:06,  3.62s/it]  
 87%|████████▋ | 390/450 [25:58<03:42,  3.70s/it][A

{'loss': 1.2668, 'learning_rate': 2e-05, 'epoch': 4.33}



                                               t][A
  0%|          | 1/450 [26:54<27:06,  3.62s/it]  
 87%|████████▋ | 391/450 [26:02<03:43,  3.78s/it][A

{'loss': 1.1492, 'learning_rate': 2e-05, 'epoch': 4.34}



                                               t][A
  0%|          | 1/450 [26:59<27:06,  3.62s/it]  
 87%|████████▋ | 392/450 [26:07<03:56,  4.08s/it][A

{'loss': 1.3813, 'learning_rate': 2e-05, 'epoch': 4.36}



                                               t][A
  0%|          | 1/450 [27:02<27:06,  3.62s/it]  
 87%|████████▋ | 393/450 [26:10<03:46,  3.98s/it][A

{'loss': 1.2568, 'learning_rate': 2e-05, 'epoch': 4.37}



                                               t][A
  0%|          | 1/450 [27:07<27:06,  3.62s/it]  
 88%|████████▊ | 394/450 [26:15<03:54,  4.19s/it][A

{'loss': 1.2187, 'learning_rate': 2e-05, 'epoch': 4.38}



                                               t][A
  0%|          | 1/450 [27:11<27:06,  3.62s/it]  
 88%|████████▊ | 395/450 [26:19<03:43,  4.06s/it][A

{'loss': 1.2138, 'learning_rate': 2e-05, 'epoch': 4.39}



                                               t][A
  0%|          | 1/450 [27:15<27:06,  3.62s/it]  
 88%|████████▊ | 396/450 [26:22<03:32,  3.94s/it][A

{'loss': 1.1566, 'learning_rate': 2e-05, 'epoch': 4.4}



                                               t][A
  0%|          | 1/450 [27:19<27:06,  3.62s/it]  
 88%|████████▊ | 397/450 [26:26<03:29,  3.96s/it][A

{'loss': 1.1737, 'learning_rate': 2e-05, 'epoch': 4.41}



                                               t][A
  0%|          | 1/450 [27:23<27:06,  3.62s/it]  
 88%|████████▊ | 398/450 [26:31<03:33,  4.10s/it][A

{'loss': 1.0587, 'learning_rate': 2e-05, 'epoch': 4.42}



                                               t][A
  0%|          | 1/450 [27:28<27:06,  3.62s/it]  
 89%|████████▊ | 399/450 [26:36<03:48,  4.47s/it][A

{'loss': 1.2795, 'learning_rate': 2e-05, 'epoch': 4.43}



                                               t][A
  0%|          | 1/450 [27:33<27:06,  3.62s/it]  
 89%|████████▉ | 400/450 [26:41<03:43,  4.47s/it][A

{'loss': 1.1618, 'learning_rate': 2e-05, 'epoch': 4.44}



                                               t][A
  0%|          | 1/450 [27:36<27:06,  3.62s/it]  
 89%|████████▉ | 401/450 [26:44<03:23,  4.15s/it][A

{'loss': 1.34, 'learning_rate': 2e-05, 'epoch': 4.46}



                                               t][A
  0%|          | 1/450 [27:41<27:06,  3.62s/it]  
 89%|████████▉ | 402/450 [26:49<03:28,  4.35s/it][A

{'loss': 1.209, 'learning_rate': 2e-05, 'epoch': 4.47}



                                               t][A
  0%|          | 1/450 [27:46<27:06,  3.62s/it]  
 90%|████████▉ | 403/450 [26:54<03:30,  4.49s/it][A

{'loss': 1.2528, 'learning_rate': 2e-05, 'epoch': 4.48}



                                               t][A
  0%|          | 1/450 [27:49<27:06,  3.62s/it]  
 90%|████████▉ | 404/450 [26:57<03:14,  4.23s/it][A

{'loss': 1.2385, 'learning_rate': 2e-05, 'epoch': 4.49}



                                               t][A
  0%|          | 1/450 [27:54<27:06,  3.62s/it]  
 90%|█████████ | 405/450 [27:02<03:17,  4.40s/it][A

{'loss': 1.1926, 'learning_rate': 2e-05, 'epoch': 4.5}



                                               t][A
  0%|          | 1/450 [27:57<27:06,  3.62s/it]  
 90%|█████████ | 406/450 [27:05<02:55,  4.00s/it][A

{'loss': 1.1949, 'learning_rate': 2e-05, 'epoch': 4.51}



                                               t][A
  0%|          | 1/450 [28:01<27:06,  3.62s/it]  
 90%|█████████ | 407/450 [27:09<02:49,  3.94s/it][A

{'loss': 1.2489, 'learning_rate': 2e-05, 'epoch': 4.52}



                                               t][A
  0%|          | 1/450 [28:05<27:06,  3.62s/it]  
 91%|█████████ | 408/450 [27:13<02:44,  3.92s/it][A

{'loss': 1.2361, 'learning_rate': 2e-05, 'epoch': 4.53}



                                               t][A
  0%|          | 1/450 [28:09<27:06,  3.62s/it]  
 91%|█████████ | 409/450 [27:17<02:48,  4.10s/it][A

{'loss': 1.186, 'learning_rate': 2e-05, 'epoch': 4.54}



                                               t][A
  0%|          | 1/450 [28:13<27:06,  3.62s/it]  
 91%|█████████ | 410/450 [27:20<02:32,  3.81s/it][A

{'loss': 1.2088, 'learning_rate': 2e-05, 'epoch': 4.56}



                                               t][A
  0%|          | 1/450 [28:16<27:06,  3.62s/it]  
 91%|█████████▏| 411/450 [27:24<02:25,  3.74s/it][A

{'loss': 1.2573, 'learning_rate': 2e-05, 'epoch': 4.57}



                                               t][A
  0%|          | 1/450 [28:20<27:06,  3.62s/it]  
 92%|█████████▏| 412/450 [27:28<02:24,  3.79s/it][A

{'loss': 1.3231, 'learning_rate': 2e-05, 'epoch': 4.58}



                                               t][A
  0%|          | 1/450 [28:24<27:06,  3.62s/it]  
 92%|█████████▏| 413/450 [27:32<02:19,  3.77s/it][A

{'loss': 1.1717, 'learning_rate': 2e-05, 'epoch': 4.59}



                                               t][A
  0%|          | 1/450 [28:31<27:06,  3.62s/it]  
 92%|█████████▏| 414/450 [27:39<02:51,  4.77s/it][A

{'loss': 1.2754, 'learning_rate': 2e-05, 'epoch': 4.6}



                                               t][A
  0%|          | 1/450 [28:34<27:06,  3.62s/it]  
 92%|█████████▏| 415/450 [27:42<02:29,  4.29s/it][A

{'loss': 1.1537, 'learning_rate': 2e-05, 'epoch': 4.61}



                                               t][A
  0%|          | 1/450 [28:39<27:06,  3.62s/it]  
 92%|█████████▏| 416/450 [27:47<02:29,  4.40s/it][A

{'loss': 1.3393, 'learning_rate': 2e-05, 'epoch': 4.62}



                                               t][A
  0%|          | 1/450 [28:42<27:06,  3.62s/it]  
 93%|█████████▎| 417/450 [27:50<02:17,  4.16s/it][A

{'loss': 1.3616, 'learning_rate': 2e-05, 'epoch': 4.63}



                                               t][A
  0%|          | 1/450 [28:46<27:06,  3.62s/it]  
 93%|█████████▎| 418/450 [27:54<02:11,  4.11s/it][A

{'loss': 1.2624, 'learning_rate': 2e-05, 'epoch': 4.64}



                                               t][A
  0%|          | 1/450 [28:51<27:06,  3.62s/it]  
 93%|█████████▎| 419/450 [27:59<02:10,  4.21s/it][A

{'loss': 1.3203, 'learning_rate': 2e-05, 'epoch': 4.66}



                                               t][A
  0%|          | 1/450 [28:54<27:06,  3.62s/it]  
 93%|█████████▎| 420/450 [28:02<01:58,  3.94s/it][A

{'loss': 1.1883, 'learning_rate': 2e-05, 'epoch': 4.67}



                                               t][A
  0%|          | 1/450 [28:58<27:06,  3.62s/it]  
 94%|█████████▎| 421/450 [28:06<01:53,  3.93s/it][A

{'loss': 1.1471, 'learning_rate': 2e-05, 'epoch': 4.68}



                                               t][A
  0%|          | 1/450 [29:02<27:06,  3.62s/it]  
 94%|█████████▍| 422/450 [28:10<01:52,  4.01s/it][A

{'loss': 1.2844, 'learning_rate': 2e-05, 'epoch': 4.69}



                                               t][A
  0%|          | 1/450 [29:06<27:06,  3.62s/it]  
 94%|█████████▍| 423/450 [28:14<01:47,  3.98s/it][A

{'loss': 1.1279, 'learning_rate': 2e-05, 'epoch': 4.7}



                                               t][A
  0%|          | 1/450 [29:10<27:06,  3.62s/it]  
 94%|█████████▍| 424/450 [28:18<01:45,  4.07s/it][A

{'loss': 1.1775, 'learning_rate': 2e-05, 'epoch': 4.71}



                                               t][A
  0%|          | 1/450 [29:14<27:06,  3.62s/it]  
 94%|█████████▍| 425/450 [28:22<01:40,  4.02s/it][A

{'loss': 1.172, 'learning_rate': 2e-05, 'epoch': 4.72}



                                               t][A
  0%|          | 1/450 [29:19<27:06,  3.62s/it]  
 95%|█████████▍| 426/450 [28:27<01:41,  4.22s/it][A

{'loss': 1.1898, 'learning_rate': 2e-05, 'epoch': 4.73}



                                               t][A
  0%|          | 1/450 [29:22<27:06,  3.62s/it]  
 95%|█████████▍| 427/450 [28:30<01:30,  3.95s/it][A

{'loss': 1.3115, 'learning_rate': 2e-05, 'epoch': 4.74}



                                               t][A
  0%|          | 1/450 [29:27<27:06,  3.62s/it]  
 95%|█████████▌| 428/450 [28:35<01:31,  4.14s/it][A

{'loss': 1.1819, 'learning_rate': 2e-05, 'epoch': 4.76}



                                               t][A
  0%|          | 1/450 [29:31<27:06,  3.62s/it]  
 95%|█████████▌| 429/450 [28:39<01:25,  4.09s/it][A

{'loss': 1.4133, 'learning_rate': 2e-05, 'epoch': 4.77}



                                               t][A
  0%|          | 1/450 [29:34<27:06,  3.62s/it]  
 96%|█████████▌| 430/450 [28:42<01:14,  3.70s/it][A

{'loss': 1.0906, 'learning_rate': 2e-05, 'epoch': 4.78}



                                               t][A
  0%|          | 1/450 [29:38<27:06,  3.62s/it]  
 96%|█████████▌| 431/450 [28:46<01:12,  3.81s/it][A

{'loss': 1.1073, 'learning_rate': 2e-05, 'epoch': 4.79}



                                               t][A
  0%|          | 1/450 [29:41<27:06,  3.62s/it]  
 96%|█████████▌| 432/450 [28:49<01:06,  3.71s/it][A

{'loss': 1.2781, 'learning_rate': 2e-05, 'epoch': 4.8}



                                               t][A
  0%|          | 1/450 [29:45<27:06,  3.62s/it]  
 96%|█████████▌| 433/450 [28:53<01:05,  3.84s/it][A

{'loss': 1.2681, 'learning_rate': 2e-05, 'epoch': 4.81}



                                               t][A
  0%|          | 1/450 [29:49<27:06,  3.62s/it]  
 96%|█████████▋| 434/450 [28:57<01:00,  3.75s/it][A

{'loss': 1.1153, 'learning_rate': 2e-05, 'epoch': 4.82}



                                               t][A
  0%|          | 1/450 [29:53<27:06,  3.62s/it]  
 97%|█████████▋| 435/450 [29:01<00:56,  3.77s/it][A

{'loss': 1.3497, 'learning_rate': 2e-05, 'epoch': 4.83}



                                               t][A
  0%|          | 1/450 [29:57<27:06,  3.62s/it]  
 97%|█████████▋| 436/450 [29:05<00:53,  3.85s/it][A

{'loss': 1.2134, 'learning_rate': 2e-05, 'epoch': 4.84}



                                               t][A
  0%|          | 1/450 [30:00<27:06,  3.62s/it]  
 97%|█████████▋| 437/450 [29:08<00:49,  3.79s/it][A

{'loss': 1.3535, 'learning_rate': 2e-05, 'epoch': 4.86}



                                               t][A
  0%|          | 1/450 [30:04<27:06,  3.62s/it]  
 97%|█████████▋| 438/450 [29:12<00:45,  3.83s/it][A

{'loss': 1.3021, 'learning_rate': 2e-05, 'epoch': 4.87}



                                               t][A
  0%|          | 1/450 [30:08<27:06,  3.62s/it]  
 98%|█████████▊| 439/450 [29:16<00:41,  3.79s/it][A

{'loss': 1.1148, 'learning_rate': 2e-05, 'epoch': 4.88}



                                               t][A
  0%|          | 1/450 [30:12<27:06,  3.62s/it]  
 98%|█████████▊| 440/450 [29:20<00:37,  3.77s/it][A

{'loss': 1.0422, 'learning_rate': 2e-05, 'epoch': 4.89}



                                               t][A
  0%|          | 1/450 [30:15<27:06,  3.62s/it]  
 98%|█████████▊| 441/450 [29:23<00:32,  3.65s/it][A

{'loss': 1.3359, 'learning_rate': 2e-05, 'epoch': 4.9}



                                               t][A
  0%|          | 1/450 [30:19<27:06,  3.62s/it]  
 98%|█████████▊| 442/450 [29:27<00:30,  3.82s/it][A

{'loss': 1.128, 'learning_rate': 2e-05, 'epoch': 4.91}



                                               t][A
  0%|          | 1/450 [30:24<27:06,  3.62s/it]  
 98%|█████████▊| 443/450 [29:32<00:28,  4.07s/it][A

{'loss': 1.2882, 'learning_rate': 2e-05, 'epoch': 4.92}



                                               t][A
  0%|          | 1/450 [30:29<27:06,  3.62s/it]  
 99%|█████████▊| 444/450 [29:37<00:25,  4.26s/it][A

{'loss': 1.2593, 'learning_rate': 2e-05, 'epoch': 4.93}



                                               t][A
  0%|          | 1/450 [30:34<27:06,  3.62s/it]  
 99%|█████████▉| 445/450 [29:42<00:23,  4.62s/it][A

{'loss': 1.2965, 'learning_rate': 2e-05, 'epoch': 4.94}



                                               t][A
  0%|          | 1/450 [30:38<27:06,  3.62s/it]  
 99%|█████████▉| 446/450 [29:46<00:17,  4.43s/it][A

{'loss': 1.2561, 'learning_rate': 2e-05, 'epoch': 4.96}



                                               t][A
  0%|          | 1/450 [30:44<27:06,  3.62s/it]  
 99%|█████████▉| 447/450 [29:52<00:14,  4.83s/it][A

{'loss': 1.195, 'learning_rate': 2e-05, 'epoch': 4.97}



                                               t][A
  0%|          | 1/450 [30:47<27:06,  3.62s/it]  
100%|█████████▉| 448/450 [29:55<00:08,  4.36s/it][A

{'loss': 1.1439, 'learning_rate': 2e-05, 'epoch': 4.98}



                                               t][A
  0%|          | 1/450 [30:51<27:06,  3.62s/it]  
100%|█████████▉| 449/450 [29:59<00:04,  4.13s/it][A

{'loss': 1.2452, 'learning_rate': 2e-05, 'epoch': 4.99}



                                               t][A
  0%|          | 1/450 [30:54<27:06,  3.62s/it]  
                                               t][A
  0%|          | 1/450 [30:54<27:06,  3.62s/it]  
100%|██████████| 450/450 [30:02<00:00,  4.01s/it][A


{'loss': 1.3703, 'learning_rate': 2e-05, 'epoch': 5.0}
{'train_runtime': 1802.7215, 'train_samples_per_second': 1.997, 'train_steps_per_second': 0.25, 'train_loss': 1.2846886356671652, 'epoch': 5.0}


### Generating answer with tuned model

In [66]:
# 720 train, 720 eval
# Generate with the tuned model from test sample 9. The prompt is taken to be
# the leading tokens whose count equals the number of IGNORE_INDEX labels
# (presumably the masked instruction/input part; confirm against the dataset
# preprocessing).
prefix_len = np.sum(np.array(test_dataset[9]["labels"]) == IGNORE_INDEX)
prefix_tokens = test_dataset[9]["input_ids"][:prefix_len]
peft_model.eval()
# Pass the prompt as `input_ids=`: PEFT releases that declare
# PeftModelForCausalLM.generate(self, **kwargs) reject a positional tensor,
# while the keyword form is accepted there and by newer releases alike.
generated = peft_model.generate(input_ids=prefix_tokens.reshape((1, -1)).to(device))
# Decode the whole returned sequence (prompt included) for visual inspection.
generated_text = tokenizer.decode(generated.to("cpu").flatten())
print(generated_text)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
Санкт-Петербургский государственный университет


АВРАМЕНКО Полина Андреевна
Выпускная квалификационная работа
Веб-туны как часть южнокорейской культуры в XXI веке (на примере романтических историй) 
Уровень образования: магистратура
Направление 58.04.01 «Востоковедение и африканистика»
Основная образовательная программа BM.5808 «Культура народов Азии и Африки (с изучением языков Азии и Африки)»


Научный руководитель:
доцент, Кафедра корееведения, Санкт-Петербургский государственный университет Гурьева Анастасия Александровна

Рецензент:
приглашенный преподаватель, Кафедра корееведения, Санкт-Петербургская школа социальных наук и востоковедения,
доцент, Санкт-Петербургский филиал федерального государственного автономного 

In [71]:
# 720 train, 720 eval
# inference train sample
# Same check on train sample 9: the prompt is the leading tokens whose count
# equals the number of IGNORE_INDEX labels (presumably the masked
# instruction/input part; confirm against the dataset preprocessing).
peft_model.eval()
prefix_len = np.sum(np.array(train_dataset[9]["labels"]) == IGNORE_INDEX)
prefix_tokens = train_dataset[9]["input_ids"][:prefix_len]
# Pass the prompt as `input_ids=`: PEFT releases that declare
# PeftModelForCausalLM.generate(self, **kwargs) reject a positional tensor,
# while the keyword form is accepted there and by newer releases alike.
generated = peft_model.generate(input_ids=prefix_tokens.reshape((1, -1)).to(device))
# Decode the whole returned sequence (prompt included) for visual inspection.
generated_text = tokenizer.decode(generated.to("cpu").flatten())
print(generated_text)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
САНКТ-ПЕТЕРБУРГСКИЙ ГОСУДАРСТВЕННЫЙ
УНИВЕРСИТЕТ

Мануэль Антонио Агилар Ривера
Выпускная квалификационная работа
«Multiplication Operators in Cauchy-de Branges Spaces»
Уровень образования: магистратура
Направление 01.04.01 “Математика”
Основная образовательная программа BM.5832.2019
“Современная математика”
Научный руководитель:
Профессор, математико-механический факультет СПбГУ,
доктор физ.-мат. наук, профессор РАН
Баранов Антон Дмитриевич.
Рецензент:
Профессор, Факультет математики,
Автономный университет Мадрида, кандидат физ.-мат. наук
Якубович Дмитрий Владимирович
Санкт-Петербург
2021

Contents
Contents

2

Cauchy–de Branges spaces
3
Cauchy–de Branges spaces as Reproducing Kernel Hilbert spaces . 4
The Division Prope

### View test vs train

In [69]:
# 720 train, 720 eval
# Decode the complete stored `input_ids` of test sample 9 for visual inspection.
# No prefix slicing is applied and the model is not called in this cell.
# NOTE: despite their names, `prefix_tokens` holds the whole token sequence here
# and `generated_text` is decoded dataset text, not model output.
prefix_tokens = test_dataset[9]["input_ids"]
generated_text = tokenizer.decode(prefix_tokens)
print(generated_text)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
Санкт-Петербургский государственный университет


АВРАМЕНКО Полина Андреевна
Выпускная квалификационная работа
Веб-туны как часть южнокорейской культуры в XXI веке (на примере романтических историй) 
Уровень образования: магистратура
Направление 58.04.01 «Востоковедение и африканистика»
Основная образовательная программа BM.5808 «Культура народов Азии и Африки (с изучением языков Азии и Африки)»


Научный руководитель:
доцент, Кафедра корееведения, Санкт-Петербургский государственный университет Гурьева Анастасия Александровна

Рецензент:
приглашенный преподаватель, Кафедра корееведения, Санкт-Петербургская школа социальных наук и востоковедения,
доцент, Санкт-Петербургский филиал федерального государственного автономного 

In [70]:
# 720 train, 720 eval
# Decode the complete stored `input_ids` of train sample 9 for visual inspection.
# No prefix slicing is applied and the model is not called in this cell.
# NOTE: despite their names, `prefix_tokens` holds the whole token sequence here
# and `generated_text` is decoded dataset text, not model output.
prefix_tokens = train_dataset[9]["input_ids"]
generated_text = tokenizer.decode(prefix_tokens)
print(generated_text)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Below is a diploma text. Your task is to generate abstract of this diploma.

### Input:
САНКТ-ПЕТЕРБУРГСКИЙ ГОСУДАРСТВЕННЫЙ
УНИВЕРСИТЕТ

Мануэль Антонио Агилар Ривера
Выпускная квалификационная работа
«Multiplication Operators in Cauchy-de Branges Spaces»
Уровень образования: магистратура
Направление 01.04.01 “Математика”
Основная образовательная программа BM.5832.2019
“Современная математика”
Научный руководитель:
Профессор, математико-механический факультет СПбГУ,
доктор физ.-мат. наук, профессор РАН
Баранов Антон Дмитриевич.
Рецензент:
Профессор, Факультет математики,
Автономный университет Мадрида, кандидат физ.-мат. наук
Якубович Дмитрий Владимирович
Санкт-Петербург
2021

Contents
Contents

2

Cauchy–de Branges spaces
3
Cauchy–de Branges spaces as Reproducing Kernel Hilbert spaces . 4
The Division Prope

## Run inference with the learnt and raw models & compare to the target
- raw — model
- learnt — peft_model
- target — abstract

In [79]:
# Load the test split. The path is built from DATASET_DIR (set in the
# configuration cell near the top of the notebook) instead of repeating the
# absolute directory, so the dataset location is configured in one place.
test_df = pd.read_csv(DATASET_DIR.joinpath("russian_dataset_test.csv"))
test_df.head()

Unnamed: 0,id,year,diploma,abstract,study_field,degree,original_diploma_extension
0,41453,2023,Санкт-Петербургский государственный университе...,Абдуллаев Ш.У. тема диссертации: «Роль политик...,INTERNATIONAL RELATIONS,MASTER'S STUDIES,.doc
1,43790,2023,Санкт-Петербургский государственный университе...,Выпускная квалификационная работа посвящена из...,POLITICAL SCIENCE,BACHELOR STUDIES,.docx
2,41165,2023,Санкт-Петербургский государственный университе...,В данной работе описывается реализация обобщен...,MATHEMATICS AND MECHANICS,BACHELOR STUDIES,.pdf
3,42349,2023,ПРАВИТЕЛЬСТВО РОССИЙСКОЙ ФЕДЕРАЦИИ \nСАНКТ-ПЕТ...,В выпускной квалификационной работе раскрывает...,SOCIOLOGY,BACHELOR STUDIES,.docx
4,40166,2023,Федеральное государственное бюджетное образова...,На сегодняшний день наблюдается высокий спрос ...,MANAGEMENT,BACHELOR STUDIES,.docx


In [82]:
# Load the processed, human-filled question records. Each record is later read
# as record["meta"]["id"] to pick rows of the test split.
# The encoding is given explicitly so that decoding does not depend on the
# locale default of the machine running the kernel.
with open(
    REPOSITOTY_DIR_PATH.joinpath("src/notebooks/junk/mcs_df_human_filled_processed.json"),
    "r",
    encoding="utf-8",
) as f:
    asessors_questions = json.load(f)

In [86]:
# Integer `meta.id` of every assessor record; these values are later passed to
# `test_df.iloc[...]`, i.e. they are used as row positions of the test split.
ids = [int(question["meta"]["id"]) for question in asessors_questions]

In [91]:
# Number of leading characters of each diploma text that is placed into the
# prompt (applied as row["diploma"][:diploma_prefix_len]).
diploma_prefix_len = 1000

In [141]:
def get_learnt_model_result(prefix_len, prefix_tokens):
    """Generate a continuation of the prompt with the fine-tuned ``peft_model``.

    Args:
        prefix_len: Number of leading positions to drop from the generated
            sequence before decoding, so that only the continuation is returned.
        prefix_tokens: Prompt token ids. Must support ``reshape`` and ``to``
            (presumably a 1-D torch tensor; confirm against ``preprocess``).

    Returns:
        The decoded text of the generated tokens from position ``prefix_len``
        onwards.
    """
    peft_model.eval()
    prompt_batch = prefix_tokens.reshape((1, -1)).to(device)
    # Pass the prompt as `input_ids=`: PEFT releases that declare
    # PeftModelForCausalLM.generate(self, **kwargs) reject a positional tensor,
    # while the keyword form is accepted there and by newer releases alike.
    generated = peft_model.generate(input_ids=prompt_batch)
    generated_continue = tokenizer.decode(generated.to("cpu").flatten()[prefix_len:])
    return generated_continue

In [142]:
def get_prefix_len_and_tokens(row):
    """Build the prompt for one dataset row and return its tokenized prefix.

    The prompt is the ``"prompt_input_diploma_special"`` template filled with
    the first ``diploma_prefix_len`` characters of ``row["diploma"]``; the
    target is ``row["abstract"]`` followed by the tokenizer's EOS token. Both
    are passed through ``preprocess``.

    Args:
        row: Mapping-like object with ``"diploma"`` and ``"abstract"`` entries
            (a DataFrame row in the loop that calls this).

    Returns:
        ``(prefix_len, prefix_tokens)``: the number of labels equal to
        ``IGNORE_INDEX`` in the first encoded example, and the first
        ``prefix_len`` entries of that example's ``input_ids``.
        Presumably these are exactly the prompt tokens; that depends on
        ``preprocess`` masking only the prompt part (confirm).
    """
    template = PROMPT_DICT["prompt_input_diploma_special"]
    diploma_head = row["diploma"][:diploma_prefix_len]
    prompt_text = template.format(input=diploma_head)

    expected_completion = "{}{}".format(row["abstract"], tokenizer.eos_token)

    encoded = preprocess([prompt_text], [expected_completion], tokenizer)

    # Count the label positions that are excluded from the loss.
    label_is_ignored = np.array(encoded["labels"][0]) == IGNORE_INDEX
    prefix_len = np.sum(label_is_ignored)
    prefix_tokens = encoded["input_ids"][0][:prefix_len]
    return prefix_len, prefix_tokens

In [143]:
def get_raw_model_result(prefix_len, prefix_tokens):
    """Generate a continuation of the prompt with ``model`` and decode it.

    NOTE(review): if ``peft_model`` was created from this same ``model``
    object with ``get_peft_model``, the LoRA layers may have been injected
    into ``model`` in place, in which case this would not be the untouched
    base model. Confirm how ``model`` and ``peft_model`` are constructed.

    Args:
        prefix_len: Number of leading positions to drop from the generated
            sequence before decoding.
        prefix_tokens: Prompt token ids. Must support ``reshape`` and ``to``
            (presumably a 1-D torch tensor; confirm against ``preprocess``).

    Returns:
        The decoded text of the generated tokens from position ``prefix_len``
        onwards.
    """
    model.eval()
    prompt_batch = prefix_tokens.reshape((1, -1))
    output_ids = model.generate(prompt_batch.to(device)).to("cpu").flatten()
    # Keep only what follows the prompt positions.
    return tokenizer.decode(output_ids[prefix_len:])

In [144]:
# For every selected test row, generate one continuation with `model` and one
# with `peft_model` from the same prompt, and store both next to the original
# columns (including the reference abstract).
new_rows = []
selected_rows = test_df.iloc[ids]
for _, row in tqdm(selected_rows.iterrows(), total=len(ids), desc="Rows..."):
    prefix_len, prefix_tokens = get_prefix_len_and_tokens(row)
    # Work on a copy so the row coming from `test_df` is left untouched.
    new_row = copy.deepcopy(row)
    new_row["raw_model"] = get_raw_model_result(prefix_len, prefix_tokens)
    new_row["learnt"] = get_learnt_model_result(prefix_len, prefix_tokens)
    new_rows.append(new_row)
new_df = pd.DataFrame(new_rows)
new_df.head()

Rows...:   0%|          | 0/70 [00:00<?, ?it/s]

Unnamed: 0,id,year,diploma,abstract,study_field,degree,original_diploma_extension,raw_model,learnt
12,45042,2023,АЙВАЗЬЯН Аршак Владимирович\nВыпускная квалифи...,В этой работе мы строим правую трансферную мод...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматривается модельная структура н...,В данной работе рассматривается модельная стру...
25,45043,2023,Санкт-Петербургский государственный университе...,"Пусть 𝐾 выпуклое тело в ℝ^𝑛. Определим 𝑑𝑛,𝑛−1(...",MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,Плотность решетки трансляций - это минимальная...,В работе рассматриваются плотности решеток тра...
37,45044,2023,Санкт-Петербургский государственный университе...,Работа посвящена повышению производительности ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе представлены результаты исслед...,В работе рассматривается задача булевой выполн...
101,45046,2023,Санкт-Петербургский государственный университе...,В работе мы обобщаем результаты об энергии нат...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматриваются классы случайных проц...,В данной работе рассматривается энергетически-...
152,45047,2023,Санкт–Петербургский государственный университе...,В рамках данной работы рассматривается подход ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе рассматривается задача настраи...,В данной работе рассматривается задача добавле...


In [145]:
# Persist the raw/learnt/target comparison table.
# The output directory is created first so that a missing `diplomas_abstracts/`
# folder does not make the write fail after the long generation loop.
# Assumes ARTIFACTS_DIR_PATH is a pathlib.Path (it is already used via `.joinpath`).
abstracts_csv_path = ARTIFACTS_DIR_PATH.joinpath("diplomas_abstracts/mcs_raw_learnt_abstract.csv")
abstracts_csv_path.parent.mkdir(parents=True, exist_ok=True)
new_df.to_csv(abstracts_csv_path)

## Try to save & load learnt model

In [146]:
# Log in to the Hugging Face Hub again before pushing; this repeats the
# import and login call from the top of the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [147]:
# Hub repository id passed to the push and re-load calls in the cells below.
model_id = "nvdenisov2002/llama-longLoRA-v1"

In [None]:
# Upload the PEFT model to the Hub repository `model_id`.
# This cell has no execution count in the saved notebook.
peft_model.push_to_hub(model_id)

In [None]:
# NOTE(review): per the Transformers documentation, the first positional
# parameter of `Trainer.push_to_hub` is `commit_message`, so `model_id` is
# likely used as the commit message here, not as the target repository
# (which is taken from the TrainingArguments, e.g. `hub_model_id`) — confirm
# against the arguments used to build `trainer`.
# This cell has no execution count in the saved notebook.
trainer.push_to_hub(model_id)

In [150]:
# Download the adapter config and weights from the Hub and attach them to `model`.
# `another_peft_config` is not used in the cells that follow.
# NOTE(review): if `model` is the same object that was wrapped to build
# `peft_model`, it may already contain injected LoRA layers, so the adapter
# would not be loaded onto a clean base model — confirm.
another_peft_config = PeftConfig.from_pretrained(model_id)
another_peft_model = PeftModel.from_pretrained(model, model_id)


Downloading adapter_config.json: 100%|██████████| 674/674 [00:00<00:00, 4.63MB/s]

Downloading adapter_model.bin:   0%|          | 0.00/1.08G [00:00<?, ?B/s][A
Downloading adapter_model.bin:   1%|          | 10.5M/1.08G [00:00<00:45, 23.6MB/s][A
Downloading adapter_model.bin:   2%|▏         | 21.0M/1.08G [00:00<00:35, 29.6MB/s][A
Downloading adapter_model.bin:   3%|▎         | 31.5M/1.08G [00:00<00:28, 36.5MB/s][A
Downloading adapter_model.bin:   4%|▍         | 41.9M/1.08G [00:01<00:23, 43.8MB/s][A
Downloading adapter_model.bin:   5%|▍         | 52.4M/1.08G [00:01<00:20, 50.8MB/s][A
Downloading adapter_model.bin:   6%|▌         | 62.9M/1.08G [00:01<00:18, 56.5MB/s][A
Downloading adapter_model.bin:   7%|▋         | 73.4M/1.08G [00:01<00:17, 58.8MB/s][A
Downloading adapter_model.bin:   8%|▊         | 83.9M/1.08G [00:01<00:16, 61.2MB/s][A
Downloading adapter_model.bin:   9%|▊         | 94.4M/1.08G [00:01<00:15, 63.4MB/s][A
Downloading adapter_model.bin:  10%|▉         | 105M/1.

In [151]:
# Show the sample used for the save/load check.
# NOTE(review): `row` is not assigned in this section; presumably it is the
# leftover loop variable from the generation loop above (hidden state).
# Select it explicitly so the cell survives Restart & Run All — confirm.
row

id                                                                        45137
year                                                                       2023
diploma                       Санкт–Петербургский государственный университе...
abstract                      В данной работе представлены реализация фронте...
study_field                                    MATHEMATICS AND COMPUTER SCIENCE
degree                                                         BACHELOR STUDIES
original_diploma_extension                                                 .pdf
Name: 1238, dtype: object

In [152]:
# Build the prompt prefix for `row` and generate a reference continuation
# through the notebook's helper (presumably using the in-session fine-tuned
# model — confirm against the helper definition).
prefix_len, prefix_tokens = get_prefix_len_and_tokens(row)
learnt_old = get_learnt_model_result(prefix_len, prefix_tokens)

In [153]:
# Generate a continuation for the same prefix with the adapter re-loaded from the Hub.
another_peft_model.eval()

# `generate` receives a single sequence shaped as a batch of one, on the model's device.
reloaded_input_ids = prefix_tokens.reshape((1, -1)).to(device)

# No generation arguments are passed, so decoding uses the model's defaults.
# NOTE(review): confirm these match what `get_learnt_model_result` uses.
generated = another_peft_model.generate(reloaded_input_ids)

# Drop the prompt tokens and decode only the newly generated part.
new_token_ids = generated.to('cpu').flatten()[prefix_len:]
generated_continue = tokenizer.decode(new_token_ids)
learnt_new = generated_continue

In [155]:
# Exact string comparison of the two generations.
# NOTE(review): the outputs shown for `learnt_old` and `learnt_old_2` differ
# even though both come from the same helper call with the same inputs, so
# generation is not deterministic. A `False` here therefore does not show
# that the re-loaded adapter differs from the in-session one. Fixing the
# seed or using greedy decoding for both calls would make this meaningful.
learnt_old == learnt_new

False

In [156]:
# Display the first continuation produced by `get_learnt_model_result`.
learnt_old

'В работе рассмотрена задача создания персонажа для игры в жанре «космическая охота на разумных существ». Разработана архитектура мобильного приложения для создания персонажа, которая состоит из 3 модулей: Front-end, Back-end и сервис. В Front-end модуле представлены экраны, которые позволяют пользователю создавать персонажа. В Back-end модуле реализован сервис, который позволяет пользователю сохранить созданный персонаж на сервере и использовать его в игре. В Back-end модуле также реализован сервис, который позволяет пользователю изменить свой персонаж, а также сервис, который позволяет пользователю загрузить созданный им персонаж на сервер.</s>'

In [158]:
# Call the same helper again with identical inputs. The displayed text
# differs from `learnt_old`, so repeated calls do not reproduce each other.
learnt_old_2 = get_learnt_model_result(prefix_len, prefix_tokens)
learnt_old_2

'В работе рассмотрена задача создания персонажа для онлайн-игры в жанре массовой многопользовательской ролевой игры. Рассматривается архитектура приложения, применяемые технологии, а также реализованные экраны.</s>'

In [157]:
# Display the continuation generated with the adapter re-loaded from the Hub.
learnt_new

'В работе рассматривается проблема создания персонажа в онлайн-игре. На основе анализа существующих решений и анализа потребностей игроков была разработана архитектура мобильного приложения, реализованного на основе фреймворка Flutter. В результате было создано мобильное приложение для создания персонажа в онлайн-игре.</s>'