In [1]:
import warnings

warnings.filterwarnings(
    "ignore",
    message="torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.",
)

warnings.filterwarnings(
    "ignore",
    message="torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.",
)

In [2]:
# Written by Yukang Chen
# Some code based on https://github.com/epfml/landmark-attention
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tqdm.notebook import tqdm

import sys
import io
import os
import copy
import json
import math
import logging
import pandas as pd
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import torch
import transformers
from torch.utils.data import Dataset
from transformers import Trainer, DataCollatorForLanguageModeling
from llama_attn_replace_sft import replace_llama_attn
from gptneox_attn_replace import replace_gpt_neox_attn
from peft import LoraConfig, get_peft_model
from torch.distributed import barrier

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f

def jload(f, mode="r"):
    """Load a .json file into a dictionary."""
    f = _make_r_io_base(f, mode)
    jdict = json.load(f)
    f.close()
    return jdict

PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
    "prompt_no_input_llama2":(
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} [/INST]"
    ),
    "prompt_input_llama2": (
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} \n{input} [/INST]"
    ),
    "prompt_llama2": "[INST]{instruction}[/INST]",
    "prompt_input_diploma_special":(
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\nBelow is a diploma text. Your task is to generate abstract of this diploma.\n\n### Input:\n{input}\n\n"
    ),
}


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="EleutherAI/pythia-1.4b-deduped")
    model_type: Optional[str] = field(default="llama")


@dataclass
class DataArguments:
    train_data_path: str = field(default=None, metadata={"help": "Path to the training data."})
    val_data_path: str = field(default=None, metadata={"help": "Path to the validation data."})
    nrows: int = 1
    diploma_prefix_len: int = 1


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=8192 * 4,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    use_flash_attn: bool = field(
        default=True,
        metadata={"help": "Whether use flash attention for training."},
    )
    use_full_attn: bool = field(
        default=False,
        metadata={"help": "Whether to use plain, full-attention for training."},
    )
    low_rank_training: bool = field(
        default=True,
        metadata={"help": "Whether use low rank adaptation for training."},
    )
    trainable_params: str = field(
        default="embed,norm",
        metadata={"help": "Additional trainable parameters except LoRA weights, if low rank training."},
    )

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg


def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = []
    for text in tqdm(strings, desc="Texts..."):
        tokenized_list.append(tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ))
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Preprocess the data by tokenizing."""
    examples = [s + t for s, t in zip(sources, targets)]
    targets_tokenized = _tokenize_fn(targets, tokenizer)
    examples_tokenized = _tokenize_fn(examples, tokenizer)
    input_ids = [] 
    for example_input_id, target_input_id, example_len, target_len in zip(examples_tokenized["input_ids"], targets_tokenized["input_ids"], examples_tokenized["input_ids_lens"], targets_tokenized["input_ids_lens"]):
        limit = tokenizer.model_max_length
        res = example_input_id
        if example_len == limit:
            res = example_input_id.tolist()[:-target_len] + target_input_id.tolist()[:target_len]
        input_id = torch.tensor(res, dtype=torch.int)
        input_id = input_id.type(torch.LongTensor)
        input_ids.append(input_id)
    labels = copy.deepcopy(input_ids)
    for label, example_len, target_len in zip(labels, examples_tokenized["input_ids_lens"], targets_tokenized["input_ids_lens"]):
        ignore_end = example_len - target_len
        label[:ignore_end] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, nrows: int, diploma_prefix_len: int):
        super(SupervisedDataset, self).__init__()
        logging.warning("Loading data...")
        data_table = pd.read_csv(data_path)
        if nrows is not None:
            data_table = data_table.sample(min(len(data_table), nrows))
        else:
            data_table = data_table.sample(frac=1)

        logging.warning("Formatting inputs...")

        prompt_input_diploma = PROMPT_DICT["prompt_input_diploma_special"]
        sources = [
            prompt_input_diploma.format(input=diploma[:diploma_prefix_len])
            for diploma in data_table["diploma"]
        ]

        targets = [f"### Response:{abstract}{tokenizer.eos_token}" for abstract in data_table["abstract"]]

        logging.warning("Tokenizing inputs... This may take some time...")
        data_dict = preprocess(sources, targets, tokenizer)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.train_data_path, nrows=data_args.nrows, diploma_prefix_len=data_args.diploma_prefix_len)
    val_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.val_data_path, nrows=data_args.nrows, diploma_prefix_len=data_args.diploma_prefix_len)
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return dict(train_dataset=train_dataset, eval_dataset=val_dataset, data_collator=data_collator)


def train(model_args, data_args, training_args):
    print("Begin train")
    
    print("Parsed arguments")

    # NOTE: May expand supported model types in the future
    if model_args.model_type == "gpt-neox":
        replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
    else:
        replace_llama_attn(training_args.use_flash_attn, training_args.use_full_attn)

    # Set RoPE scaling factor
    config = transformers.AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
    )

    orig_rope_scaling = getattr(config, "rope_scaling", None)
    if orig_rope_scaling is None:
        orig_rope_scaling = {"factor": 1}
    orig_rope_scaling_factor = orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
    orig_ctx_len = getattr(config, "max_position_embeddings", None)
    if orig_ctx_len:
        orig_ctx_len *= orig_rope_scaling_factor
        if training_args.model_max_length > orig_ctx_len:
            scaling_factor = float(math.ceil(training_args.model_max_length / orig_ctx_len))
            config.rope_scaling = {"type": "linear", "factor": scaling_factor}
            
    print("Created config")

    # Load model and tokenizer
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=training_args.cache_dir,
        torch_dtype=torch.bfloat16,
    )
    
    print("Loaded model")

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
    )
    
    print("Loaded tokenizer")

    special_tokens_dict = dict()
    if tokenizer.pad_token is None:
        special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
    if tokenizer.eos_token is None:
        special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
    if tokenizer.bos_token is None:
        special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
    if tokenizer.unk_token is None:
        special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=special_tokens_dict,
        tokenizer=tokenizer,
        model=model,
    )

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    
    print("Created data_module")

    if training_args.low_rank_training:
        if model_args.model_type == "gpt-neox":
            # added `dense` to match with llama as the basic LoRA would only target 'query_key_value'
            targets = ["query_key_value", "dense"]
        else:
            targets=["q_proj", "k_proj", "v_proj", "o_proj"]

        config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=targets,
            lora_dropout=0,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, config)
        # enable trainable params
        [p.requires_grad_() for n, p in model.named_parameters() if any([k in n for k in training_args.trainable_params.split(",")])]

    model.config.use_cache = False         # required for gradient checkpointing
    model.enable_input_require_grads()     # required for gradient checkpointing
    model.gradient_checkpointing_enable()  # enable gradient checkpointing
    
    print("Prepared model to learn")

    trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)
    
    print("Learnt model")



In [3]:
import os
from huggingface_hub import login
login(os.environ['hf-read-token'])

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /tmp/xdg_cache/huggingface/token
Login successful


In [4]:
from pathlib import Path
CACHE_DIR = Path("../cache/")
DATASET_DIR = Path("/home/jupyter/mnt/datasets/spbu_diplomas/russian_dataset/")
OUTPUT_DIR = Path("./llama_replaced_attn_8k_without_checkpoints_new_tokenize_attempt_3_output_dir/")
LOGGING_DIR = Path("./llama_replaced_attn_8k_without_checkpoints_new_tokenize_attempt_3_logging_dir/")
MODEL_MAX_LENGTH = 8192
INF = int(1e7)

In [6]:
model_args = ModelArguments(
    model_name_or_path="meta-llama/Llama-2-7b-hf",
)


data_args = DataArguments(
    train_data_path=DATASET_DIR.joinpath("russian_dataset_train.csv").as_posix(),
    val_data_path=DATASET_DIR.joinpath("russian_dataset_val.csv").as_posix(),
    nrows=None, # 720 * 3,
    diploma_prefix_len=INF,
)

training_args = TrainingArguments(
    bf16=True,
    output_dir=OUTPUT_DIR.as_posix(),
    model_max_length=MODEL_MAX_LENGTH,
    use_flash_attn=True,
    low_rank_training=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    evaluation_strategy="no",
    # eval_steps=1,
    learning_rate=2e-5,
    weight_decay=0.0,
    warmup_steps=20,
    lr_scheduler_type="constant_with_warmup",
    logging_steps=1,
    logging_dir=LOGGING_DIR.as_posix(),
    deepspeed="ds_configs/stage2.json",
    tf32=True,
    cache_dir=CACHE_DIR.as_posix(),
    report_to=['tensorboard'],
    save_strategy="no",
)
training_args



In [7]:
print("Begin train")

print("Parsed arguments")

# NOTE: May expand supported model types in the future
if model_args.model_type == "gpt-neox":
    replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
else:
    replace_llama_attn(training_args.use_flash_attn, training_args.use_full_attn)

# Set RoPE scaling factor
config = transformers.AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
)

orig_rope_scaling = getattr(config, "rope_scaling", None)
if orig_rope_scaling is None:
    orig_rope_scaling = {"factor": 1}
orig_rope_scaling_factor = orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
orig_ctx_len = getattr(config, "max_position_embeddings", None)
if orig_ctx_len:
    orig_ctx_len *= orig_rope_scaling_factor
    if training_args.model_max_length > orig_ctx_len:
        scaling_factor = float(math.ceil(training_args.model_max_length / orig_ctx_len))
        config.rope_scaling = {"type": "linear", "factor": scaling_factor}

print("Created config")

# Load model and tokenizer
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=training_args.cache_dir,
    torch_dtype=torch.bfloat16,
)

print("Loaded model")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    use_fast=True,
)

print("Loaded tokenizer")

special_tokens_dict = dict()
if tokenizer.pad_token is None:
    special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
    special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
    special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
)

Begin train
Parsed arguments
Created config


Loading checkpoint shards: 100%|██████████| 2/2 [02:05<00:00, 62.54s/it]

Loaded model





Loaded tokenizer

Using pad_token, but it is not set yet.





In [8]:
data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)

print("Created data_module")

if training_args.low_rank_training:
    if model_args.model_type == "gpt-neox":
        # added `dense` to match with llama as the basic LoRA would only target 'query_key_value'
        targets = ["query_key_value", "dense"]
    else:
        targets=["q_proj", "k_proj", "v_proj", "o_proj"]

    config = LoraConfig(
        r=4,
        lora_alpha=16,
        target_modules=targets,
        lora_dropout=0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    # enable trainable params
    [p.requires_grad_() for n, p in model.named_parameters() if any([k in n for k in training_args.trainable_params.split(",")])]

model.config.use_cache = False         # required for gradient checkpointing
model.enable_input_require_grads()     # required for gradient checkpointing
model.gradient_checkpointing_enable()  # enable gradient checkpointing

print("Prepared model to learn")



Texts...:   0%|          | 0/7623 [00:00<?, ?it/s]

Texts...:   0%|          | 0/7623 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=tor

Texts...:   0%|          | 0/1397 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1397 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)
  input_id = torch.tensor(res, dtype=torch.int)


Created data_module
Prepared model to learn


### Example of new tokenization

In [94]:
tokenizer.decode(data_module["train_dataset"][0]["input_ids"])

'<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nBelow is a diploma text. Your task is to generate abstract of this diploma.\n\n### Input:\nСАНКТ-ПЕТЕРБУРГСКИЙ\nГОСУДАРСТВЕННЫЙ УНИВЕРСИТЕТ\nФизический факультет\nКафедра Статистической физики\n\nУравнения стохастической динамики в\nокрестности λ-точки\nВыпускная квалификационная работа студента 406\nгруппы\nЖаворонкова Юрия Александровича\n\nНаучный руководитель:\nд.ф. - м.н., профессор Налимов М.Ю.\nРецензент:\nд.ф. - м.н., профессор Антонов Н.В.\n\nСанкт-Петербург\n2017 г.\n\n\x0cСодержание\n1 Введение\n\n2\n\n2 Модель F стохастической динамики\n\n3\n\n3 Модификация модели\n\n5\n\n4 Заключение\n\n12\n\nСписок литературы\n\n13\n\n1\n\n\x0c1\n\nВведение\n\nНастоящая работа посвящена изучению явления Бозе-конденсации. Актуальность данного направления исследования подтверждается многочисленными работами ка

### Continue training

In [None]:
import numpy as np

check = []
lens = []
for x in data_module["train_dataset"]:
    check.append(list(set(x["labels"].tolist())).__len__() == 1)
    lens.append(x["input_ids"].ne(tokenizer.pad_token_id).sum())
np.mean(check), np.mean(lens)

In [9]:
trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)

print("Learnt model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


[2024-05-18 11:28:26,517] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  0%|          | 1/2856 [00:20<16:22:10, 20.64s/it]

{'loss': 2.9548, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 2/2856 [00:38<15:07:48, 19.08s/it]

{'loss': 1.9645, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 3/2856 [00:56<14:43:15, 18.58s/it]

{'loss': 2.8266, 'learning_rate': 3e-06, 'epoch': 0.0}


  0%|          | 4/2856 [01:13<14:17:36, 18.04s/it]

{'loss': 1.8719, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


  0%|          | 5/2856 [01:31<14:16:09, 18.02s/it]

{'loss': 2.1615, 'learning_rate': 5e-06, 'epoch': 0.01}


  0%|          | 6/2856 [01:49<14:15:16, 18.01s/it]

{'loss': 3.2755, 'learning_rate': 6e-06, 'epoch': 0.01}


  0%|          | 7/2856 [02:07<14:14:40, 18.00s/it]

{'loss': 2.1006, 'learning_rate': 7e-06, 'epoch': 0.01}


  0%|          | 8/2856 [02:25<14:14:07, 17.99s/it]

{'loss': 2.4925, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


  0%|          | 9/2856 [02:43<14:13:35, 17.99s/it]

{'loss': 2.1734, 'learning_rate': 9e-06, 'epoch': 0.01}


  0%|          | 10/2856 [03:01<14:13:06, 17.99s/it]

{'loss': 3.1877, 'learning_rate': 1e-05, 'epoch': 0.01}


  0%|          | 11/2856 [03:19<14:12:39, 17.98s/it]

{'loss': 3.3237, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


  0%|          | 12/2856 [03:37<14:12:22, 17.98s/it]

{'loss': 1.8983, 'learning_rate': 1.2e-05, 'epoch': 0.01}


  0%|          | 13/2856 [03:55<14:12:16, 17.99s/it]

{'loss': 1.9597, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


  0%|          | 14/2856 [04:13<14:11:51, 17.98s/it]

{'loss': 1.8581, 'learning_rate': 1.4e-05, 'epoch': 0.01}


  1%|          | 15/2856 [04:31<14:11:33, 17.98s/it]

{'loss': 1.8683, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.02}


  1%|          | 16/2856 [04:49<14:11:10, 17.98s/it]

{'loss': 1.7584, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.02}


  1%|          | 17/2856 [05:07<14:11:04, 17.99s/it]

{'loss': 1.9928, 'learning_rate': 1.7e-05, 'epoch': 0.02}


  1%|          | 18/2856 [05:25<14:10:35, 17.98s/it]

{'loss': 2.1075, 'learning_rate': 1.8e-05, 'epoch': 0.02}


  1%|          | 19/2856 [05:43<14:10:15, 17.98s/it]

{'loss': 1.8724, 'learning_rate': 1.9e-05, 'epoch': 0.02}


  1%|          | 20/2856 [06:01<14:10:04, 17.98s/it]

{'loss': 1.8487, 'learning_rate': 2e-05, 'epoch': 0.02}


  1%|          | 21/2856 [06:19<14:09:38, 17.98s/it]

{'loss': 2.7135, 'learning_rate': 2e-05, 'epoch': 0.02}


  1%|          | 22/2856 [06:37<14:09:19, 17.98s/it]

{'loss': 1.8158, 'learning_rate': 2e-05, 'epoch': 0.02}


  1%|          | 23/2856 [06:55<14:08:10, 17.96s/it]

{'loss': 1.6795, 'learning_rate': 2e-05, 'epoch': 0.02}


  1%|          | 24/2856 [07:13<14:08:19, 17.97s/it]

{'loss': 2.0106, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 25/2856 [07:31<14:08:14, 17.98s/it]

{'loss': 1.893, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 26/2856 [07:49<14:08:04, 17.98s/it]

{'loss': 3.7759, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 27/2856 [08:07<14:07:50, 17.98s/it]

{'loss': 1.8675, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 28/2856 [08:25<14:07:37, 17.98s/it]

{'loss': 3.0361, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 29/2856 [08:43<14:07:17, 17.98s/it]

{'loss': 3.0266, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 30/2856 [09:01<14:07:04, 17.98s/it]

{'loss': 1.9269, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 31/2856 [09:19<14:06:51, 17.99s/it]

{'loss': 1.8861, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 32/2856 [09:37<14:06:24, 17.98s/it]

{'loss': 2.7717, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 33/2856 [09:55<14:05:57, 17.98s/it]

{'loss': 2.1063, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 34/2856 [10:13<14:05:47, 17.98s/it]

{'loss': 1.8299, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|          | 35/2856 [10:31<14:05:26, 17.98s/it]

{'loss': 1.7339, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|▏         | 36/2856 [10:49<14:05:06, 17.98s/it]

{'loss': 1.9513, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|▏         | 37/2856 [11:07<14:05:04, 17.99s/it]

{'loss': 2.0943, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|▏         | 38/2856 [11:25<14:04:39, 17.98s/it]

{'loss': 2.8494, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|▏         | 39/2856 [11:43<14:04:10, 17.98s/it]

{'loss': 1.692, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|▏         | 40/2856 [12:01<14:03:51, 17.98s/it]

{'loss': 1.8844, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|▏         | 41/2856 [12:19<14:03:35, 17.98s/it]

{'loss': 2.001, 'learning_rate': 2e-05, 'epoch': 0.04}


  1%|▏         | 42/2856 [12:37<14:03:18, 17.98s/it]

{'loss': 2.4501, 'learning_rate': 2e-05, 'epoch': 0.04}


  2%|▏         | 43/2856 [12:55<14:03:01, 17.98s/it]

{'loss': 2.3171, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 44/2856 [13:13<14:02:41, 17.98s/it]

{'loss': 1.8258, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 45/2856 [13:31<14:02:08, 17.98s/it]

{'loss': 1.6276, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 46/2856 [13:49<14:01:49, 17.97s/it]

{'loss': 1.8152, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 47/2856 [14:07<14:01:48, 17.98s/it]

{'loss': 1.6543, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 48/2856 [14:25<14:01:35, 17.98s/it]

{'loss': 1.7437, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 49/2856 [14:42<14:01:12, 17.98s/it]

{'loss': 2.6575, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 50/2856 [15:00<14:00:52, 17.98s/it]

{'loss': 1.8811, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 51/2856 [15:18<14:00:37, 17.98s/it]

{'loss': 1.8973, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 52/2856 [15:36<14:00:23, 17.98s/it]

{'loss': 1.7018, 'learning_rate': 2e-05, 'epoch': 0.05}


  2%|▏         | 53/2856 [15:54<14:00:12, 17.99s/it]

{'loss': 2.0214, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 54/2856 [16:12<13:59:26, 17.98s/it]

{'loss': 2.0681, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 55/2856 [16:30<13:59:13, 17.98s/it]

{'loss': 1.8788, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 56/2856 [16:48<13:59:00, 17.98s/it]

{'loss': 1.8276, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 57/2856 [17:06<13:58:52, 17.98s/it]

{'loss': 1.784, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 58/2856 [17:24<13:55:00, 17.91s/it]

{'loss': 2.3151, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 59/2856 [17:42<13:55:47, 17.93s/it]

{'loss': 2.9176, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 60/2856 [18:00<13:56:16, 17.95s/it]

{'loss': 1.7253, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 61/2856 [18:18<13:56:34, 17.96s/it]

{'loss': 1.792, 'learning_rate': 2e-05, 'epoch': 0.06}


  2%|▏         | 62/2856 [18:36<13:56:45, 17.97s/it]

{'loss': 1.6837, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 63/2856 [18:54<13:56:27, 17.97s/it]

{'loss': 1.702, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 64/2856 [19:12<13:56:28, 17.98s/it]

{'loss': 1.9383, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 65/2856 [19:30<13:56:32, 17.98s/it]

{'loss': 2.2871, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 66/2856 [19:48<13:56:11, 17.98s/it]

{'loss': 2.2085, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 67/2856 [20:04<13:31:42, 17.46s/it]

{'loss': 1.7817, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 68/2856 [20:22<13:38:51, 17.62s/it]

{'loss': 1.8188, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 69/2856 [20:40<13:43:33, 17.73s/it]

{'loss': 1.6767, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 70/2856 [20:58<13:41:35, 17.69s/it]

{'loss': 1.6983, 'learning_rate': 2e-05, 'epoch': 0.07}


  2%|▏         | 71/2856 [21:16<13:45:27, 17.78s/it]

{'loss': 1.6172, 'learning_rate': 2e-05, 'epoch': 0.07}


  3%|▎         | 72/2856 [21:34<13:47:59, 17.84s/it]

{'loss': 1.6615, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 73/2856 [21:52<13:49:31, 17.88s/it]

{'loss': 1.5779, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 74/2856 [22:10<13:50:42, 17.92s/it]

{'loss': 1.7634, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 75/2856 [22:28<13:51:21, 17.94s/it]

{'loss': 1.7767, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 76/2856 [22:46<13:51:43, 17.95s/it]

{'loss': 1.7934, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 77/2856 [23:04<13:52:01, 17.96s/it]

{'loss': 1.6584, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 78/2856 [23:22<13:52:13, 17.97s/it]

{'loss': 1.7498, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 79/2856 [23:38<13:25:42, 17.41s/it]

{'loss': 1.8385, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 80/2856 [23:56<13:33:38, 17.59s/it]

{'loss': 1.7916, 'learning_rate': 2e-05, 'epoch': 0.08}


  3%|▎         | 81/2856 [24:14<13:39:06, 17.71s/it]

{'loss': 1.5739, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 82/2856 [24:32<13:42:30, 17.79s/it]

{'loss': 1.6437, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 83/2856 [24:50<13:44:51, 17.85s/it]

{'loss': 1.6518, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 84/2856 [25:08<13:46:29, 17.89s/it]

{'loss': 1.5532, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 85/2856 [25:26<13:47:28, 17.92s/it]

{'loss': 1.8538, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 86/2856 [25:44<13:48:13, 17.94s/it]

{'loss': 2.3484, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 87/2856 [26:02<13:48:45, 17.96s/it]

{'loss': 1.8384, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 88/2856 [26:20<13:48:59, 17.97s/it]

{'loss': 2.0644, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 89/2856 [26:38<13:49:14, 17.98s/it]

{'loss': 1.6552, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 90/2856 [26:56<13:48:58, 17.98s/it]

{'loss': 1.4055, 'learning_rate': 2e-05, 'epoch': 0.09}


  3%|▎         | 91/2856 [27:14<13:48:49, 17.99s/it]

{'loss': 1.8388, 'learning_rate': 2e-05, 'epoch': 0.1}


  3%|▎         | 92/2856 [27:32<13:48:19, 17.98s/it]

{'loss': 1.6876, 'learning_rate': 2e-05, 'epoch': 0.1}


  3%|▎         | 93/2856 [27:50<13:48:08, 17.98s/it]

{'loss': 1.6682, 'learning_rate': 2e-05, 'epoch': 0.1}


  3%|▎         | 94/2856 [28:08<13:47:56, 17.99s/it]

{'loss': 1.6322, 'learning_rate': 2e-05, 'epoch': 0.1}


  3%|▎         | 95/2856 [28:26<13:48:28, 18.00s/it]

{'loss': 1.6248, 'learning_rate': 2e-05, 'epoch': 0.1}


  3%|▎         | 96/2856 [28:44<13:47:49, 18.00s/it]

{'loss': 1.681, 'learning_rate': 2e-05, 'epoch': 0.1}


  3%|▎         | 97/2856 [29:02<13:47:23, 17.99s/it]

{'loss': 2.0232, 'learning_rate': 2e-05, 'epoch': 0.1}


  3%|▎         | 98/2856 [29:20<13:46:53, 17.99s/it]

{'loss': 1.7449, 'learning_rate': 2e-05, 'epoch': 0.1}


  3%|▎         | 99/2856 [29:38<13:46:41, 17.99s/it]

{'loss': 1.6629, 'learning_rate': 2e-05, 'epoch': 0.1}


  4%|▎         | 100/2856 [29:56<13:46:24, 17.99s/it]

{'loss': 1.8204, 'learning_rate': 2e-05, 'epoch': 0.1}


  4%|▎         | 101/2856 [30:14<13:46:14, 17.99s/it]

{'loss': 1.577, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▎         | 102/2856 [30:32<13:45:55, 17.99s/it]

{'loss': 1.7236, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▎         | 103/2856 [30:48<13:18:29, 17.40s/it]

{'loss': 1.5563, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▎         | 104/2856 [31:06<13:26:18, 17.58s/it]

{'loss': 1.7547, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▎         | 105/2856 [31:24<13:31:55, 17.71s/it]

{'loss': 1.6868, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▎         | 106/2856 [31:42<13:35:25, 17.79s/it]

{'loss': 1.5728, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▎         | 107/2856 [32:00<13:37:51, 17.85s/it]

{'loss': 1.923, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▍         | 108/2856 [32:18<13:39:34, 17.89s/it]

{'loss': 1.6464, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▍         | 109/2856 [32:36<13:40:32, 17.92s/it]

{'loss': 1.6406, 'learning_rate': 2e-05, 'epoch': 0.11}


  4%|▍         | 110/2856 [32:54<13:41:01, 17.94s/it]

{'loss': 2.0917, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 111/2856 [33:12<13:41:20, 17.95s/it]

{'loss': 1.5883, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 112/2856 [33:30<13:41:27, 17.96s/it]

{'loss': 1.6312, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 113/2856 [33:48<13:41:31, 17.97s/it]

{'loss': 1.7181, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 114/2856 [34:06<13:41:31, 17.98s/it]

{'loss': 1.5295, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 115/2856 [34:24<13:41:25, 17.98s/it]

{'loss': 1.6653, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 116/2856 [34:41<13:41:04, 17.98s/it]

{'loss': 1.8284, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 117/2856 [34:59<13:40:53, 17.98s/it]

{'loss': 1.7138, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 118/2856 [35:17<13:40:37, 17.98s/it]

{'loss': 1.6282, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 119/2856 [35:35<13:40:25, 17.99s/it]

{'loss': 1.5208, 'learning_rate': 2e-05, 'epoch': 0.12}


  4%|▍         | 120/2856 [35:53<13:39:55, 17.98s/it]

{'loss': 1.7472, 'learning_rate': 2e-05, 'epoch': 0.13}


  4%|▍         | 121/2856 [36:11<13:39:41, 17.98s/it]

{'loss': 1.6311, 'learning_rate': 2e-05, 'epoch': 0.13}


  4%|▍         | 122/2856 [36:29<13:39:22, 17.98s/it]

{'loss': 1.544, 'learning_rate': 2e-05, 'epoch': 0.13}


  4%|▍         | 123/2856 [36:47<13:39:02, 17.98s/it]

{'loss': 1.9773, 'learning_rate': 2e-05, 'epoch': 0.13}


  4%|▍         | 124/2856 [37:05<13:38:55, 17.99s/it]

{'loss': 1.6573, 'learning_rate': 2e-05, 'epoch': 0.13}


  4%|▍         | 125/2856 [37:23<13:38:41, 17.99s/it]

{'loss': 1.5989, 'learning_rate': 2e-05, 'epoch': 0.13}


  4%|▍         | 126/2856 [37:41<13:38:19, 17.99s/it]

{'loss': 1.5736, 'learning_rate': 2e-05, 'epoch': 0.13}


  4%|▍         | 127/2856 [37:59<13:38:16, 17.99s/it]

{'loss': 1.5641, 'learning_rate': 2e-05, 'epoch': 0.13}


  4%|▍         | 128/2856 [38:17<13:37:59, 17.99s/it]

{'loss': 1.6119, 'learning_rate': 2e-05, 'epoch': 0.13}


  5%|▍         | 129/2856 [38:35<13:37:34, 17.99s/it]

{'loss': 2.3771, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 130/2856 [38:53<13:37:07, 17.99s/it]

{'loss': 1.5588, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 131/2856 [39:11<13:36:33, 17.98s/it]

{'loss': 1.6175, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 132/2856 [39:29<13:35:54, 17.97s/it]

{'loss': 1.5713, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 133/2856 [39:47<13:35:31, 17.97s/it]

{'loss': 1.5704, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 134/2856 [40:05<13:35:02, 17.97s/it]

{'loss': 1.637, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 135/2856 [40:23<13:34:35, 17.96s/it]

{'loss': 1.6192, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 136/2856 [40:41<13:34:23, 17.96s/it]

{'loss': 1.6326, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 137/2856 [40:59<13:34:23, 17.97s/it]

{'loss': 1.6268, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 138/2856 [41:17<13:34:26, 17.98s/it]

{'loss': 1.7479, 'learning_rate': 2e-05, 'epoch': 0.14}


  5%|▍         | 139/2856 [41:35<13:34:17, 17.98s/it]

{'loss': 1.5584, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▍         | 140/2856 [41:53<13:34:15, 17.99s/it]

{'loss': 1.6264, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▍         | 141/2856 [42:11<13:34:04, 17.99s/it]

{'loss': 1.4505, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▍         | 142/2856 [42:29<13:33:40, 17.99s/it]

{'loss': 1.7946, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▌         | 143/2856 [42:47<13:33:21, 17.99s/it]

{'loss': 1.5146, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▌         | 144/2856 [43:05<13:33:05, 17.99s/it]

{'loss': 1.7168, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▌         | 145/2856 [43:23<13:32:52, 17.99s/it]

{'loss': 1.6169, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▌         | 146/2856 [43:41<13:32:20, 17.99s/it]

{'loss': 1.5213, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▌         | 147/2856 [43:59<13:32:04, 17.99s/it]

{'loss': 1.6885, 'learning_rate': 2e-05, 'epoch': 0.15}


  5%|▌         | 148/2856 [44:17<13:31:56, 17.99s/it]

{'loss': 1.4236, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 149/2856 [44:35<13:32:47, 18.02s/it]

{'loss': 1.6189, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 150/2856 [44:53<13:31:43, 18.00s/it]

{'loss': 2.3399, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 151/2856 [45:10<13:23:17, 17.82s/it]

{'loss': 1.4512, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 152/2856 [45:28<13:25:11, 17.87s/it]

{'loss': 1.5513, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 153/2856 [45:46<13:26:25, 17.90s/it]

{'loss': 1.4192, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 154/2856 [46:04<13:27:06, 17.92s/it]

{'loss': 1.5592, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 155/2856 [46:22<13:27:43, 17.94s/it]

{'loss': 1.6604, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 156/2856 [46:40<13:27:46, 17.95s/it]

{'loss': 1.5532, 'learning_rate': 2e-05, 'epoch': 0.16}


  5%|▌         | 157/2856 [46:58<13:27:47, 17.96s/it]

{'loss': 1.6398, 'learning_rate': 2e-05, 'epoch': 0.16}


  6%|▌         | 158/2856 [47:16<13:27:42, 17.96s/it]

{'loss': 1.7956, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 159/2856 [47:34<13:27:24, 17.96s/it]

{'loss': 1.6002, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 160/2856 [47:52<13:27:07, 17.96s/it]

{'loss': 1.5479, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 161/2856 [48:10<13:27:09, 17.97s/it]

{'loss': 1.656, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 162/2856 [48:28<13:27:17, 17.98s/it]

{'loss': 1.6525, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 163/2856 [48:46<13:26:59, 17.98s/it]

{'loss': 1.6351, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 164/2856 [49:04<13:26:48, 17.98s/it]

{'loss': 1.7944, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 165/2856 [49:22<13:26:28, 17.98s/it]

{'loss': 1.5305, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 166/2856 [49:40<13:26:30, 17.99s/it]

{'loss': 1.5355, 'learning_rate': 2e-05, 'epoch': 0.17}


  6%|▌         | 167/2856 [49:58<13:26:18, 17.99s/it]

{'loss': 1.4912, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 168/2856 [50:16<13:22:06, 17.90s/it]

{'loss': 1.6213, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 169/2856 [50:34<13:23:04, 17.93s/it]

{'loss': 1.6685, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 170/2856 [50:52<13:23:41, 17.95s/it]

{'loss': 1.7277, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 171/2856 [51:10<13:23:59, 17.97s/it]

{'loss': 1.4742, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 172/2856 [51:28<13:24:11, 17.98s/it]

{'loss': 1.6549, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 173/2856 [51:46<13:24:23, 17.99s/it]

{'loss': 1.6614, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 174/2856 [52:04<13:24:22, 18.00s/it]

{'loss': 2.2194, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 175/2856 [52:22<13:24:14, 18.00s/it]

{'loss': 1.6921, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 176/2856 [52:40<13:24:01, 18.00s/it]

{'loss': 1.3014, 'learning_rate': 2e-05, 'epoch': 0.18}


  6%|▌         | 177/2856 [52:58<13:23:54, 18.00s/it]

{'loss': 1.4845, 'learning_rate': 2e-05, 'epoch': 0.19}


  6%|▌         | 178/2856 [53:16<13:23:47, 18.01s/it]

{'loss': 1.608, 'learning_rate': 2e-05, 'epoch': 0.19}


  6%|▋         | 179/2856 [53:34<13:23:23, 18.01s/it]

{'loss': 1.5585, 'learning_rate': 2e-05, 'epoch': 0.19}


  6%|▋         | 180/2856 [53:52<13:22:41, 18.00s/it]

{'loss': 1.5525, 'learning_rate': 2e-05, 'epoch': 0.19}


  6%|▋         | 181/2856 [54:10<13:22:27, 18.00s/it]

{'loss': 1.603, 'learning_rate': 2e-05, 'epoch': 0.19}


  6%|▋         | 182/2856 [54:28<13:22:21, 18.00s/it]

{'loss': 1.4403, 'learning_rate': 2e-05, 'epoch': 0.19}


  6%|▋         | 183/2856 [54:46<13:22:14, 18.01s/it]

{'loss': 1.5125, 'learning_rate': 2e-05, 'epoch': 0.19}


  6%|▋         | 184/2856 [55:04<13:21:50, 18.01s/it]

{'loss': 1.6269, 'learning_rate': 2e-05, 'epoch': 0.19}


  6%|▋         | 185/2856 [55:22<13:21:10, 18.00s/it]

{'loss': 1.6518, 'learning_rate': 2e-05, 'epoch': 0.19}


  7%|▋         | 186/2856 [55:40<13:21:00, 18.00s/it]

{'loss': 1.517, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 187/2856 [55:58<13:20:38, 18.00s/it]

{'loss': 1.4373, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 188/2856 [56:16<13:20:13, 18.00s/it]

{'loss': 1.5434, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 189/2856 [56:34<13:20:07, 18.00s/it]

{'loss': 1.9057, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 190/2856 [56:52<13:19:55, 18.00s/it]

{'loss': 1.4471, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 191/2856 [57:10<13:19:54, 18.01s/it]

{'loss': 1.4629, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 192/2856 [57:28<13:19:38, 18.01s/it]

{'loss': 1.4319, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 193/2856 [57:46<13:19:14, 18.01s/it]

{'loss': 1.5625, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 194/2856 [58:04<13:18:40, 18.00s/it]

{'loss': 2.0253, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 195/2856 [58:22<13:18:19, 18.00s/it]

{'loss': 1.5204, 'learning_rate': 2e-05, 'epoch': 0.2}


  7%|▋         | 196/2856 [58:40<13:18:01, 18.00s/it]

{'loss': 1.646, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 197/2856 [58:58<13:17:40, 18.00s/it]

{'loss': 1.4522, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 198/2856 [59:16<13:17:15, 18.00s/it]

{'loss': 1.5856, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 199/2856 [59:34<13:16:51, 17.99s/it]

{'loss': 1.5923, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 200/2856 [59:50<12:52:56, 17.46s/it]

{'loss': 1.3269, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 201/2856 [1:00:08<12:59:23, 17.61s/it]

{'loss': 1.4347, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 202/2856 [1:00:26<13:03:45, 17.72s/it]

{'loss': 1.4359, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 203/2856 [1:00:42<12:46:54, 17.34s/it]

{'loss': 1.4802, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 204/2856 [1:01:00<12:55:13, 17.54s/it]

{'loss': 1.566, 'learning_rate': 2e-05, 'epoch': 0.21}


  7%|▋         | 205/2856 [1:01:18<13:00:50, 17.67s/it]

{'loss': 1.9004, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 206/2856 [1:01:36<13:04:56, 17.77s/it]

{'loss': 1.4574, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 207/2856 [1:01:54<13:07:54, 17.85s/it]

{'loss': 1.4806, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 208/2856 [1:02:12<13:09:41, 17.89s/it]

{'loss': 1.4602, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 209/2856 [1:02:30<13:10:43, 17.92s/it]

{'loss': 1.5622, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 210/2856 [1:02:48<13:11:29, 17.95s/it]

{'loss': 1.58, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 211/2856 [1:03:06<13:12:03, 17.97s/it]

{'loss': 1.5841, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 212/2856 [1:03:24<13:12:17, 17.98s/it]

{'loss': 1.5753, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 213/2856 [1:03:42<13:12:12, 17.98s/it]

{'loss': 1.584, 'learning_rate': 2e-05, 'epoch': 0.22}


  7%|▋         | 214/2856 [1:04:00<13:12:06, 17.99s/it]

{'loss': 1.5284, 'learning_rate': 2e-05, 'epoch': 0.22}


  8%|▊         | 215/2856 [1:04:18<13:11:48, 17.99s/it]

{'loss': 1.763, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 216/2856 [1:04:36<13:11:12, 17.98s/it]

{'loss': 1.5705, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 217/2856 [1:04:54<13:10:53, 17.98s/it]

{'loss': 1.4739, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 218/2856 [1:05:12<13:10:24, 17.98s/it]

{'loss': 1.4286, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 219/2856 [1:05:30<13:09:54, 17.97s/it]

{'loss': 1.4706, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 220/2856 [1:05:48<13:09:34, 17.97s/it]

{'loss': 1.5634, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 221/2856 [1:06:06<13:09:17, 17.97s/it]

{'loss': 1.5871, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 222/2856 [1:06:24<13:08:48, 17.97s/it]

{'loss': 1.7175, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 223/2856 [1:06:42<13:08:34, 17.97s/it]

{'loss': 1.4971, 'learning_rate': 2e-05, 'epoch': 0.23}


  8%|▊         | 224/2856 [1:07:00<13:08:17, 17.97s/it]

{'loss': 1.6742, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 225/2856 [1:07:18<13:07:54, 17.97s/it]

{'loss': 1.476, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 226/2856 [1:07:36<13:07:43, 17.97s/it]

{'loss': 1.5784, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 227/2856 [1:07:54<13:07:24, 17.97s/it]

{'loss': 1.5633, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 228/2856 [1:08:12<13:07:05, 17.97s/it]

{'loss': 1.668, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 229/2856 [1:08:30<13:06:49, 17.97s/it]

{'loss': 1.7534, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 230/2856 [1:08:48<13:06:30, 17.97s/it]

{'loss': 1.6081, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 231/2856 [1:09:06<13:06:11, 17.97s/it]

{'loss': 2.0696, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 232/2856 [1:09:24<13:05:48, 17.97s/it]

{'loss': 1.4998, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 233/2856 [1:09:42<13:05:27, 17.97s/it]

{'loss': 1.5754, 'learning_rate': 2e-05, 'epoch': 0.24}


  8%|▊         | 234/2856 [1:10:00<13:05:11, 17.97s/it]

{'loss': 1.4892, 'learning_rate': 2e-05, 'epoch': 0.25}


  8%|▊         | 235/2856 [1:10:18<13:04:57, 17.97s/it]

{'loss': 1.5134, 'learning_rate': 2e-05, 'epoch': 0.25}


  8%|▊         | 236/2856 [1:10:36<13:04:36, 17.97s/it]

{'loss': 1.5771, 'learning_rate': 2e-05, 'epoch': 0.25}


  8%|▊         | 237/2856 [1:10:54<13:04:13, 17.97s/it]

{'loss': 1.4651, 'learning_rate': 2e-05, 'epoch': 0.25}


  8%|▊         | 238/2856 [1:11:12<13:03:50, 17.96s/it]

{'loss': 1.6542, 'learning_rate': 2e-05, 'epoch': 0.25}


  8%|▊         | 239/2856 [1:11:30<13:03:27, 17.96s/it]

{'loss': 1.4359, 'learning_rate': 2e-05, 'epoch': 0.25}


  8%|▊         | 240/2856 [1:11:48<13:03:05, 17.96s/it]

{'loss': 1.4455, 'learning_rate': 2e-05, 'epoch': 0.25}


  8%|▊         | 241/2856 [1:12:06<13:02:53, 17.96s/it]

{'loss': 1.4884, 'learning_rate': 2e-05, 'epoch': 0.25}


  8%|▊         | 242/2856 [1:12:24<13:02:28, 17.96s/it]

{'loss': 1.4481, 'learning_rate': 2e-05, 'epoch': 0.25}


  9%|▊         | 243/2856 [1:12:42<13:02:14, 17.96s/it]

{'loss': 1.609, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▊         | 244/2856 [1:12:58<12:36:00, 17.37s/it]

{'loss': 1.513, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▊         | 245/2856 [1:13:15<12:43:36, 17.55s/it]

{'loss': 1.4593, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▊         | 246/2856 [1:13:33<12:48:38, 17.67s/it]

{'loss': 1.5698, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▊         | 247/2856 [1:13:51<12:52:11, 17.76s/it]

{'loss': 1.6117, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▊         | 248/2856 [1:14:09<12:54:38, 17.82s/it]

{'loss': 1.5024, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▊         | 249/2856 [1:14:27<12:56:14, 17.86s/it]

{'loss': 1.4374, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▉         | 250/2856 [1:14:45<12:57:09, 17.89s/it]

{'loss': 1.5462, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▉         | 251/2856 [1:15:03<12:57:52, 17.92s/it]

{'loss': 1.6853, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▉         | 252/2856 [1:15:21<12:58:05, 17.93s/it]

{'loss': 1.4496, 'learning_rate': 2e-05, 'epoch': 0.26}


  9%|▉         | 253/2856 [1:15:39<12:58:10, 17.94s/it]

{'loss': 1.4345, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 254/2856 [1:15:57<12:58:19, 17.95s/it]

{'loss': 1.5804, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 255/2856 [1:16:15<12:58:16, 17.95s/it]

{'loss': 1.5325, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 256/2856 [1:16:33<12:58:19, 17.96s/it]

{'loss': 1.5122, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 257/2856 [1:16:51<12:58:10, 17.96s/it]

{'loss': 1.8649, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 258/2856 [1:17:09<12:57:48, 17.96s/it]

{'loss': 1.4377, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 259/2856 [1:17:27<12:57:34, 17.96s/it]

{'loss': 1.6147, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 260/2856 [1:17:45<12:57:10, 17.96s/it]

{'loss': 1.5139, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 261/2856 [1:18:03<12:56:58, 17.96s/it]

{'loss': 1.5424, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 262/2856 [1:18:21<12:56:48, 17.97s/it]

{'loss': 1.4392, 'learning_rate': 2e-05, 'epoch': 0.27}


  9%|▉         | 263/2856 [1:18:39<12:56:29, 17.97s/it]

{'loss': 1.5255, 'learning_rate': 2e-05, 'epoch': 0.28}


  9%|▉         | 264/2856 [1:18:57<12:56:05, 17.96s/it]

{'loss': 1.6433, 'learning_rate': 2e-05, 'epoch': 0.28}


  9%|▉         | 265/2856 [1:19:15<12:55:37, 17.96s/it]

{'loss': 1.4723, 'learning_rate': 2e-05, 'epoch': 0.28}


  9%|▉         | 266/2856 [1:19:33<12:55:14, 17.96s/it]

{'loss': 1.567, 'learning_rate': 2e-05, 'epoch': 0.28}


  9%|▉         | 267/2856 [1:19:51<12:55:00, 17.96s/it]

{'loss': 1.5696, 'learning_rate': 2e-05, 'epoch': 0.28}


  9%|▉         | 268/2856 [1:20:09<12:54:48, 17.96s/it]

{'loss': 1.4817, 'learning_rate': 2e-05, 'epoch': 0.28}


  9%|▉         | 269/2856 [1:20:27<12:54:44, 17.97s/it]

{'loss': 1.5067, 'learning_rate': 2e-05, 'epoch': 0.28}


  9%|▉         | 270/2856 [1:20:45<12:54:23, 17.97s/it]

{'loss': 1.5714, 'learning_rate': 2e-05, 'epoch': 0.28}


  9%|▉         | 271/2856 [1:21:03<12:54:02, 17.97s/it]

{'loss': 1.4794, 'learning_rate': 2e-05, 'epoch': 0.28}


 10%|▉         | 272/2856 [1:21:21<12:53:50, 17.97s/it]

{'loss': 1.4069, 'learning_rate': 2e-05, 'epoch': 0.29}


KeyboardInterrupt: 

In [11]:
import os
from huggingface_hub import login
login(os.environ['hf-write-token'])
model_id = "nvdenisov2002/llama-longLoRA-v5-8k-all-samples-3-epochs"
model.push_to_hub(model_id)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /tmp/xdg_cache/huggingface/token
Login successful



adapter_model.bin:   0%|          | 0.00/533M [00:00<?, ?B/s][A
adapter_model.bin:   0%|          | 8.19k/533M [00:00<3:34:54, 41.3kB/s][A
adapter_model.bin:   0%|          | 328k/533M [00:00<07:25, 1.19MB/s]   [A
adapter_model.bin:   1%|          | 3.01M/533M [00:00<00:58, 8.98MB/s][A
adapter_model.bin:   1%|          | 4.55M/533M [00:00<00:52, 10.1MB/s][A
adapter_model.bin:   1%|          | 6.48M/533M [00:00<00:45, 11.6MB/s][A
adapter_model.bin:   1%|▏         | 7.89M/533M [00:00<00:53, 9.73MB/s][A
adapter_model.bin:   2%|▏         | 12.6M/533M [00:01<00:29, 17.8MB/s][A
adapter_model.bin:   3%|▎         | 16.0M/533M [00:01<00:43, 11.8MB/s][A
adapter_model.bin:   5%|▍         | 24.1M/533M [00:01<00:25, 20.2MB/s][A
adapter_model.bin:   5%|▌         | 27.1M/533M [00:02<00:36, 13.9MB/s][A
adapter_model.bin:   5%|▌         | 29.1M/533M [00:02<00:35, 14.2MB/s][A
adapter_model.bin:   6%|▌         | 32.0M/533M [00:02<00:51, 9.73MB/s][A
adapter_model.bin:   8%|▊         | 41.1M/

CommitInfo(commit_url='https://huggingface.co/nvdenisov2002/llama-longLoRA-v5-8k-all-samples-3-epochs/commit/83f0033350c63e1ec4a388387e27fccbfe17c0ac', commit_message='Upload model', commit_description='', oid='83f0033350c63e1ec4a388387e27fccbfe17c0ac', pr_url=None, pr_revision=None, pr_num=None)

### Try to inference model

In [21]:
input_ids = tokenizer("a cat sat on a")["input_ids"]
input_ids = torch.tensor(input_ids, dtype=torch.int).reshape((1, -1)).to('cuda')
input_ids

tensor([[   1,  263, 6635, 3290,  373,  263]], device='cuda:0',
       dtype=torch.int32)

In [None]:
print("kek")

In [None]:
generated = model.generate(input_ids=input_ids)



In [None]:
tokenizer.decode(generated)

### Interrupt because of bad tokenizing

In [9]:
trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)

print("Learnt model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


[2024-05-10 03:40:40,924] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  0%|          | 1/1350 [00:34<13:02:07, 34.79s/it]

{'loss': 27.1375, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 2/1350 [01:10<13:15:38, 35.41s/it]

{'loss': 0.0, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}


  0%|          | 3/1350 [01:46<13:19:25, 35.61s/it]

{'loss': 0.0, 'learning_rate': 3e-06, 'epoch': 0.01}


  0%|          | 4/1350 [02:22<13:20:55, 35.70s/it]

{'loss': 0.0, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}


  0%|          | 5/1350 [02:58<13:21:36, 35.76s/it]

{'loss': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}


  0%|          | 6/1350 [03:34<13:21:45, 35.79s/it]

{'loss': 0.0, 'learning_rate': 6e-06, 'epoch': 0.02}


  1%|          | 7/1350 [04:09<13:21:55, 35.83s/it]

{'loss': 24.0691, 'learning_rate': 7e-06, 'epoch': 0.03}


  1%|          | 8/1350 [04:45<13:17:50, 35.67s/it]

{'loss': 59.9522, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.03}


KeyboardInterrupt: 

In [None]:
from huggingface_hub import login
login("")
model_id = "nvdenisov2002/llama-longLoRA-v3-16k-2160-samples"
model.push_to_hub(model_id)

### Fail to save checkpoint

In [43]:
trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)

print("Learnt model")


                                                    it][A
  0%|          | 4/1350 [3:39:10<6:06:46, 16.35s/it]   
 18%|█▊        | 248/1350 [2:26:44<10:56:37, 35.75s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.92}



                                                    it][A
  0%|          | 4/1350 [3:39:46<6:06:46, 16.35s/it]   
 18%|█▊        | 249/1350 [2:27:20<10:56:48, 35.79s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.92}



                                                    it][A
  0%|          | 4/1350 [3:40:21<6:06:46, 16.35s/it]   
 19%|█▊        | 250/1350 [2:27:56<10:56:48, 35.83s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.93}



                                                    it][A
  0%|          | 4/1350 [3:40:57<6:06:46, 16.35s/it]   
 19%|█▊        | 251/1350 [2:28:32<10:56:38, 35.85s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.93}



                                                    it][A
  0%|          | 4/1350 [3:41:33<6:06:46, 16.35s/it]   
 19%|█▊        | 252/1350 [2:29:08<10:56:25, 35.87s/it][A

{'loss': 0.2232, 'learning_rate': 2e-05, 'epoch': 0.93}



                                                    it][A
  0%|          | 4/1350 [3:42:09<6:06:46, 16.35s/it]   
 19%|█▊        | 253/1350 [2:29:43<10:55:50, 35.87s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.94}



                                                    it][A
  0%|          | 4/1350 [3:42:45<6:06:46, 16.35s/it]   
 19%|█▉        | 254/1350 [2:30:19<10:55:15, 35.87s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.94}



                                                    it][A
  0%|          | 4/1350 [3:43:20<6:06:46, 16.35s/it]   
 19%|█▉        | 255/1350 [2:30:54<10:48:00, 35.51s/it][A

{'loss': 5.5682, 'learning_rate': 2e-05, 'epoch': 0.94}



                                                    it][A
  0%|          | 4/1350 [3:43:56<6:06:46, 16.35s/it]   
 19%|█▉        | 256/1350 [2:31:30<10:49:38, 35.63s/it][A

{'loss': 11.3305, 'learning_rate': 2e-05, 'epoch': 0.95}



                                                    it][A
  0%|          | 4/1350 [3:44:31<6:06:46, 16.35s/it]   
 19%|█▉        | 257/1350 [2:32:06<10:50:26, 35.71s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.95}



                                                    it][A
  0%|          | 4/1350 [3:45:06<6:06:46, 16.35s/it]   
 19%|█▉        | 258/1350 [2:32:40<10:43:43, 35.37s/it][A

{'loss': 1.3391, 'learning_rate': 2e-05, 'epoch': 0.96}



                                                    it][A
  0%|          | 4/1350 [3:45:42<6:06:46, 16.35s/it]   
 19%|█▉        | 259/1350 [2:33:16<10:46:01, 35.53s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.96}



                                                    it][A
  0%|          | 4/1350 [3:46:18<6:06:46, 16.35s/it]   
 19%|█▉        | 260/1350 [2:33:52<10:47:18, 35.63s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.96}



                                                    it][A
  0%|          | 4/1350 [3:46:54<6:06:46, 16.35s/it]   
 19%|█▉        | 261/1350 [2:34:28<10:47:59, 35.70s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.97}



                                                    it][A
  0%|          | 4/1350 [3:47:24<6:06:46, 16.35s/it]   
 19%|█▉        | 262/1350 [2:34:59<10:19:09, 34.14s/it][A

{'loss': 0.7814, 'learning_rate': 2e-05, 'epoch': 0.97}



                                                    it][A
  0%|          | 4/1350 [3:47:59<6:06:46, 16.35s/it]   
 19%|█▉        | 263/1350 [2:35:34<10:24:06, 34.45s/it][A

{'loss': 2.4559, 'learning_rate': 2e-05, 'epoch': 0.97}



                                                    it][A
  0%|          | 4/1350 [3:48:34<6:06:46, 16.35s/it]   
 20%|█▉        | 264/1350 [2:36:08<10:22:57, 34.42s/it][A

{'loss': 3.1618, 'learning_rate': 2e-05, 'epoch': 0.98}



                                                    it][A
  0%|          | 4/1350 [3:49:09<6:06:46, 16.35s/it]   
 20%|█▉        | 265/1350 [2:36:43<10:27:25, 34.70s/it][A

{'loss': 2.3243, 'learning_rate': 2e-05, 'epoch': 0.98}



                                                    it][A
  0%|          | 4/1350 [3:49:41<6:06:46, 16.35s/it]   
 20%|█▉        | 266/1350 [2:37:16<10:12:47, 33.92s/it][A

{'loss': 5.0069, 'learning_rate': 2e-05, 'epoch': 0.99}



                                                    it][A
  0%|          | 4/1350 [3:50:17<6:06:46, 16.35s/it]   
 20%|█▉        | 267/1350 [2:37:51<10:22:51, 34.51s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 0.99}



                                                    it][A
  0%|          | 4/1350 [3:50:51<6:06:46, 16.35s/it]   
 20%|█▉        | 268/1350 [2:38:25<10:18:50, 34.32s/it][A

{'loss': 1.0675, 'learning_rate': 2e-05, 'epoch': 0.99}



                                                    it][A
  0%|          | 4/1350 [3:51:27<6:06:46, 16.35s/it]   
 20%|█▉        | 269/1350 [2:39:01<10:26:46, 34.79s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.0}



                                                    it][A
  0%|          | 4/1350 [3:52:03<6:06:46, 16.35s/it]   
 20%|██        | 270/1350 [2:39:37<10:32:10, 35.12s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.0}



                                                    it][A
  0%|          | 4/1350 [3:52:39<6:06:46, 16.35s/it]   
 20%|██        | 271/1350 [2:40:13<10:35:44, 35.35s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.0}



                                                    it][A
  0%|          | 4/1350 [3:53:14<6:06:46, 16.35s/it]   
 20%|██        | 272/1350 [2:40:49<10:37:55, 35.51s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.01}



                                                    it][A
  0%|          | 4/1350 [3:53:49<6:06:46, 16.35s/it]   
 20%|██        | 273/1350 [2:41:23<10:30:37, 35.13s/it][A

{'loss': 11.2734, 'learning_rate': 2e-05, 'epoch': 1.01}



                                                    it][A
  0%|          | 4/1350 [3:54:25<6:06:46, 16.35s/it]   
 20%|██        | 274/1350 [2:41:59<10:34:05, 35.36s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.01}



                                                    it][A
  0%|          | 4/1350 [3:55:01<6:06:46, 16.35s/it]   
 20%|██        | 275/1350 [2:42:35<10:36:21, 35.52s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.02}



                                                    it][A
  0%|          | 4/1350 [3:55:36<6:06:46, 16.35s/it]   
 20%|██        | 276/1350 [2:43:11<10:37:41, 35.63s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.02}



                                                    it][A
  0%|          | 4/1350 [3:56:12<6:06:46, 16.35s/it]   
 21%|██        | 277/1350 [2:43:47<10:38:28, 35.70s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.03}



                                                    it][A
  0%|          | 4/1350 [3:56:47<6:06:46, 16.35s/it]   
 21%|██        | 278/1350 [2:44:22<10:34:43, 35.53s/it][A

{'loss': 2.6169, 'learning_rate': 2e-05, 'epoch': 1.03}



                                                    it][A
  0%|          | 4/1350 [3:57:23<6:06:46, 16.35s/it]   
 21%|██        | 279/1350 [2:44:58<10:36:02, 35.63s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.03}



                                                    it][A
  0%|          | 4/1350 [3:57:59<6:06:46, 16.35s/it]   
 21%|██        | 280/1350 [2:45:33<10:36:53, 35.71s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.04}



                                                    it][A
  0%|          | 4/1350 [3:58:35<6:06:46, 16.35s/it]   
 21%|██        | 281/1350 [2:46:09<10:37:10, 35.76s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.04}



                                                    it][A
  0%|          | 4/1350 [3:59:11<6:06:46, 16.35s/it]   
 21%|██        | 282/1350 [2:46:45<10:37:12, 35.80s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.04}



                                                    it][A
  0%|          | 4/1350 [3:59:47<6:06:46, 16.35s/it]   
 21%|██        | 283/1350 [2:47:21<10:37:11, 35.83s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.05}



                                                    it][A
  0%|          | 4/1350 [4:00:22<6:06:46, 16.35s/it]   
 21%|██        | 284/1350 [2:47:56<10:31:34, 35.55s/it][A

{'loss': 0.2573, 'learning_rate': 2e-05, 'epoch': 1.05}



                                                    it][A
  0%|          | 4/1350 [4:00:58<6:06:46, 16.35s/it]   
 21%|██        | 285/1350 [2:48:32<10:32:46, 35.65s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.06}



                                                    it][A
  0%|          | 4/1350 [4:01:32<6:06:46, 16.35s/it]   
 21%|██        | 286/1350 [2:49:06<10:25:09, 35.25s/it][A

{'loss': 5.4466, 'learning_rate': 2e-05, 'epoch': 1.06}



                                                    it][A
  0%|          | 4/1350 [4:02:08<6:06:46, 16.35s/it]   
 21%|██▏       | 287/1350 [2:49:42<10:27:51, 35.44s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.06}



                                                    it][A
  0%|          | 4/1350 [4:02:44<6:06:46, 16.35s/it]   
 21%|██▏       | 288/1350 [2:50:18<10:29:36, 35.57s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.07}



                                                    it][A
  0%|          | 4/1350 [4:03:20<6:06:46, 16.35s/it]   
 21%|██▏       | 289/1350 [2:50:54<10:32:31, 35.77s/it][A

{'loss': 6.3529, 'learning_rate': 2e-05, 'epoch': 1.07}



                                                    it][A
  0%|          | 4/1350 [4:03:56<6:06:46, 16.35s/it]   
 21%|██▏       | 290/1350 [2:51:30<10:32:25, 35.80s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.07}



                                                    it][A
  0%|          | 4/1350 [4:04:32<6:06:46, 16.35s/it]   
 22%|██▏       | 291/1350 [2:52:06<10:32:15, 35.82s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.08}



                                                    it][A
  0%|          | 4/1350 [4:05:08<6:06:46, 16.35s/it]   


{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.08}


 22%|██▏       | 292/1350 [2:52:42<10:31:58, 35.84s/it][A
                                                    it][A
  0%|          | 4/1350 [4:05:43<6:06:46, 16.35s/it]   
 22%|██▏       | 293/1350 [2:53:18<10:31:36, 35.85s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.09}



                                                    it][A
  0%|          | 4/1350 [4:06:19<6:06:46, 16.35s/it]   
 22%|██▏       | 294/1350 [2:53:54<10:31:03, 35.86s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.09}



                                                    it][A
  0%|          | 4/1350 [4:06:55<6:06:46, 16.35s/it]   
 22%|██▏       | 295/1350 [2:54:30<10:30:46, 35.87s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.09}



                                                    it][A
  0%|          | 4/1350 [4:07:31<6:06:46, 16.35s/it]   
 22%|██▏       | 296/1350 [2:55:05<10:30:07, 35.87s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.1}



                                                    it][A
  0%|          | 4/1350 [4:08:05<6:06:46, 16.35s/it]   
 22%|██▏       | 297/1350 [2:55:40<10:21:06, 35.39s/it][A

{'loss': 1.7712, 'learning_rate': 2e-05, 'epoch': 1.1}



                                                    it][A
  0%|          | 4/1350 [4:08:41<6:06:46, 16.35s/it]   
 22%|██▏       | 298/1350 [2:56:16<10:23:05, 35.54s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.1}



                                                    it][A
  0%|          | 4/1350 [4:09:17<6:06:46, 16.35s/it]   
 22%|██▏       | 299/1350 [2:56:51<10:24:24, 35.65s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.11}



                                                    it][A
  0%|          | 4/1350 [4:09:53<6:06:46, 16.35s/it]   
 22%|██▏       | 300/1350 [2:57:27<10:25:02, 35.72s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.11}



                                                    it][A
  0%|          | 4/1350 [4:10:29<6:06:46, 16.35s/it]   
 22%|██▏       | 301/1350 [2:58:03<10:25:14, 35.76s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.11}



                                                    it][A
  0%|          | 4/1350 [4:11:04<6:06:46, 16.35s/it]   
 22%|██▏       | 302/1350 [2:58:38<10:21:52, 35.60s/it][A

{'loss': 2.3707, 'learning_rate': 2e-05, 'epoch': 1.12}



                                                    it][A
  0%|          | 4/1350 [4:11:40<6:06:46, 16.35s/it]   
 22%|██▏       | 303/1350 [2:59:14<10:22:47, 35.69s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.12}



                                                    it][A
  0%|          | 4/1350 [4:12:13<6:06:46, 16.35s/it]   
 23%|██▎       | 304/1350 [2:59:48<10:09:45, 34.98s/it][A

{'loss': 3.8183, 'learning_rate': 2e-05, 'epoch': 1.13}



                                                    it][A
  0%|          | 4/1350 [4:12:49<6:06:46, 16.35s/it]   
 23%|██▎       | 305/1350 [3:00:23<10:13:50, 35.24s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.13}



                                                    it][A
  0%|          | 4/1350 [4:13:24<6:06:46, 16.35s/it]   
 23%|██▎       | 306/1350 [3:00:58<10:10:48, 35.10s/it][A

{'loss': 7.1336, 'learning_rate': 2e-05, 'epoch': 1.13}



                                                    it][A
  0%|          | 4/1350 [4:14:00<6:06:46, 16.35s/it]   
 23%|██▎       | 307/1350 [3:01:34<10:14:16, 35.34s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.14}



                                                    it][A
  0%|          | 4/1350 [4:14:36<6:06:46, 16.35s/it]   
 23%|██▎       | 308/1350 [3:02:10<10:16:42, 35.51s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.14}



                                                    it][A
  0%|          | 4/1350 [4:15:12<6:06:46, 16.35s/it]   
 23%|██▎       | 309/1350 [3:02:46<10:18:02, 35.62s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.14}



                                                    it][A
  0%|          | 4/1350 [4:15:47<6:06:46, 16.35s/it]   
 23%|██▎       | 310/1350 [3:03:22<10:18:37, 35.69s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.15}



                                                    it][A
  0%|          | 4/1350 [4:16:23<6:06:46, 16.35s/it]   
 23%|██▎       | 311/1350 [3:03:58<10:19:05, 35.75s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.15}



                                                    it][A
  0%|          | 4/1350 [4:16:59<6:06:46, 16.35s/it]   
 23%|██▎       | 312/1350 [3:04:34<10:19:07, 35.79s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.16}



                                                    it][A
  0%|          | 4/1350 [4:17:35<6:06:46, 16.35s/it]   
 23%|██▎       | 313/1350 [3:05:09<10:19:02, 35.82s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.16}



                                                    it][A
  0%|          | 4/1350 [4:18:11<6:06:46, 16.35s/it]   
 23%|██▎       | 314/1350 [3:05:45<10:19:28, 35.88s/it][A

{'loss': 0.4689, 'learning_rate': 2e-05, 'epoch': 1.16}



                                                    it][A
  0%|          | 4/1350 [4:18:47<6:06:46, 16.35s/it]   
 23%|██▎       | 315/1350 [3:06:21<10:18:58, 35.88s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.17}



                                                    it][A
  0%|          | 4/1350 [4:19:23<6:06:46, 16.35s/it]   
 23%|██▎       | 316/1350 [3:06:57<10:18:22, 35.88s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.17}



                                                    it][A
  0%|          | 4/1350 [4:19:59<6:06:46, 16.35s/it]   
 23%|██▎       | 317/1350 [3:07:33<10:17:43, 35.88s/it][A

{'loss': 0.283, 'learning_rate': 2e-05, 'epoch': 1.17}



                                                    it][A
  0%|          | 4/1350 [4:20:34<6:06:46, 16.35s/it]   
 24%|██▎       | 318/1350 [3:08:08<10:12:06, 35.59s/it][A

{'loss': 1.9401, 'learning_rate': 2e-05, 'epoch': 1.18}



                                                    it][A
  0%|          | 4/1350 [4:21:10<6:06:46, 16.35s/it]   
 24%|██▎       | 319/1350 [3:08:44<10:13:04, 35.68s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.18}



                                                    it][A
  0%|          | 4/1350 [4:21:45<6:06:46, 16.35s/it]   
 24%|██▎       | 320/1350 [3:09:20<10:13:34, 35.74s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.19}



                                                    it][A
  0%|          | 4/1350 [4:22:19<6:06:46, 16.35s/it]   
 24%|██▍       | 321/1350 [3:09:53<10:02:18, 35.12s/it][A

{'loss': 16.0999, 'learning_rate': 2e-05, 'epoch': 1.19}



                                                    t] [A
  0%|          | 4/1350 [4:22:53<6:06:46, 16.35s/it]  
 24%|██▍       | 322/1350 [3:10:28<9:56:51, 34.84s/it][A

{'loss': 4.1742, 'learning_rate': 2e-05, 'epoch': 1.19}



                                                    it][A
  0%|          | 4/1350 [4:23:29<6:06:46, 16.35s/it]   
 24%|██▍       | 323/1350 [3:11:04<10:01:42, 35.15s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.2}



                                                    it][A
  0%|          | 4/1350 [4:24:05<6:06:46, 16.35s/it]   
 24%|██▍       | 324/1350 [3:11:39<10:04:50, 35.37s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.2}



                                                    it][A
  0%|          | 4/1350 [4:24:41<6:06:46, 16.35s/it]   
 24%|██▍       | 325/1350 [3:12:15<10:06:54, 35.53s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.2}



                                                    it][A
  0%|          | 4/1350 [4:25:16<6:06:46, 16.35s/it]   
 24%|██▍       | 326/1350 [3:12:51<10:05:33, 35.48s/it][A

{'loss': 2.1165, 'learning_rate': 2e-05, 'epoch': 1.21}



                                                    it][A
  0%|          | 4/1350 [4:25:51<6:06:46, 16.35s/it]   
 24%|██▍       | 327/1350 [3:13:25<10:00:36, 35.23s/it][A

{'loss': 8.2507, 'learning_rate': 2e-05, 'epoch': 1.21}



                                                    it][A
  0%|          | 4/1350 [4:26:27<6:06:46, 16.35s/it]   
 24%|██▍       | 328/1350 [3:14:01<10:04:34, 35.49s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.21}



                                                    t] [A
  0%|          | 4/1350 [4:27:01<6:06:46, 16.35s/it]  
 24%|██▍       | 329/1350 [3:14:36<9:56:58, 35.08s/it][A

{'loss': 1.4066, 'learning_rate': 2e-05, 'epoch': 1.22}



                                                    t][A
  0%|          | 4/1350 [4:27:36<6:06:46, 16.35s/it]  
 24%|██▍       | 330/1350 [3:15:11<9:56:36, 35.09s/it][A

{'loss': 3.531, 'learning_rate': 2e-05, 'epoch': 1.22}



                                                    it][A
  0%|          | 4/1350 [4:28:13<6:06:46, 16.35s/it]   
 25%|██▍       | 331/1350 [3:15:47<10:01:40, 35.43s/it][A

{'loss': 0.4917, 'learning_rate': 2e-05, 'epoch': 1.23}



                                                    it][A
  0%|          | 4/1350 [4:28:49<6:06:46, 16.35s/it]   
 25%|██▍       | 332/1350 [3:16:23<10:03:52, 35.59s/it][A

{'loss': 34.5847, 'learning_rate': 2e-05, 'epoch': 1.23}



                                                    it][A
  0%|          | 4/1350 [4:29:24<6:06:46, 16.35s/it]   
 25%|██▍       | 333/1350 [3:16:59<10:04:42, 35.68s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.23}



                                                    it][A
  0%|          | 4/1350 [4:30:00<6:06:46, 16.35s/it]   
 25%|██▍       | 334/1350 [3:17:34<10:03:10, 35.62s/it][A

{'loss': 1.5376, 'learning_rate': 2e-05, 'epoch': 1.24}



                                                    it][A
  0%|          | 4/1350 [4:30:36<6:06:46, 16.35s/it]   
 25%|██▍       | 335/1350 [3:18:10<10:03:55, 35.70s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.24}



                                                    it][A
  0%|          | 4/1350 [4:31:12<6:06:46, 16.35s/it]   
 25%|██▍       | 336/1350 [3:18:46<10:04:12, 35.75s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.24}



                                                    it][A
  0%|          | 4/1350 [4:31:47<6:06:46, 16.35s/it]   
 25%|██▍       | 337/1350 [3:19:21<10:02:08, 35.67s/it][A

{'loss': 4.3105, 'learning_rate': 2e-05, 'epoch': 1.25}



                                                    it][A
  0%|          | 4/1350 [4:32:23<6:06:46, 16.35s/it]   
 25%|██▌       | 338/1350 [3:19:57<10:02:44, 35.74s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.25}



                                                    it][A
  0%|          | 4/1350 [4:32:59<6:06:46, 16.35s/it]   
 25%|██▌       | 339/1350 [3:20:33<10:02:53, 35.78s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.26}



                                                    it][A
  0%|          | 4/1350 [4:33:35<6:06:46, 16.35s/it]   
 25%|██▌       | 340/1350 [3:21:09<10:02:55, 35.82s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.26}



                                                    it][A
  0%|          | 4/1350 [4:34:11<6:06:46, 16.35s/it]   
 25%|██▌       | 341/1350 [3:21:45<10:02:40, 35.84s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.26}



                                                    it][A
  0%|          | 4/1350 [4:34:47<6:06:46, 16.35s/it]   
 25%|██▌       | 342/1350 [3:22:21<10:02:13, 35.85s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.27}



                                                    t] [A
  0%|          | 4/1350 [4:35:22<6:06:46, 16.35s/it]  
 25%|██▌       | 343/1350 [3:22:56<9:57:06, 35.58s/it][A

{'loss': 7.4707, 'learning_rate': 2e-05, 'epoch': 1.27}



                                                    t][A
  0%|          | 4/1350 [4:35:57<6:06:46, 16.35s/it]  
 25%|██▌       | 344/1350 [3:23:32<9:58:00, 35.67s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.27}



                                                    t][A
  0%|          | 4/1350 [4:36:33<6:06:46, 16.35s/it]  
 26%|██▌       | 345/1350 [3:24:08<9:58:22, 35.72s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.28}



                                                    t][A
  0%|          | 4/1350 [4:37:09<6:06:46, 16.35s/it]  
 26%|██▌       | 346/1350 [3:24:43<9:58:43, 35.78s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.28}



                                                    t][A
  0%|          | 4/1350 [4:37:45<6:06:46, 16.35s/it]  
 26%|██▌       | 347/1350 [3:25:19<9:58:32, 35.81s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.29}



                                                    t][A
  0%|          | 4/1350 [4:38:21<6:06:46, 16.35s/it]  
 26%|██▌       | 348/1350 [3:25:55<9:58:18, 35.83s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.29}



                                                    t][A
  0%|          | 4/1350 [4:38:55<6:06:46, 16.35s/it]  
 26%|██▌       | 349/1350 [3:26:29<9:49:15, 35.32s/it][A

{'loss': 2.1028, 'learning_rate': 2e-05, 'epoch': 1.29}



                                                    t][A
  0%|          | 4/1350 [4:39:31<6:06:46, 16.35s/it]  
 26%|██▌       | 350/1350 [3:27:05<9:51:26, 35.49s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.3}



                                                    t][A
  0%|          | 4/1350 [4:40:07<6:06:46, 16.35s/it]  
 26%|██▌       | 351/1350 [3:27:41<9:52:49, 35.61s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.3}



                                                    t][A
  0%|          | 4/1350 [4:40:43<6:06:46, 16.35s/it]  
 26%|██▌       | 352/1350 [3:28:17<9:53:36, 35.69s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.3}



                                                    t][A
  0%|          | 4/1350 [4:41:19<6:06:46, 16.35s/it]  

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.31}



 26%|██▌       | 353/1350 [3:28:53<9:54:04, 35.75s/it][A
                                                    t][A
  0%|          | 4/1350 [4:41:55<6:06:46, 16.35s/it]  
 26%|██▌       | 354/1350 [3:29:29<9:55:31, 35.87s/it][A

{'loss': 0.4242, 'learning_rate': 2e-05, 'epoch': 1.31}



                                                    t][A
  0%|          | 4/1350 [4:42:30<6:06:46, 16.35s/it]  
 26%|██▋       | 355/1350 [3:30:04<9:51:14, 35.65s/it][A

{'loss': 9.4839, 'learning_rate': 2e-05, 'epoch': 1.31}



                                                    t][A
  0%|          | 4/1350 [4:43:04<6:06:46, 16.35s/it]  
 26%|██▋       | 356/1350 [3:30:39<9:44:07, 35.26s/it][A

{'loss': 7.4787, 'learning_rate': 2e-05, 'epoch': 1.32}



                                                    t][A
  0%|          | 4/1350 [4:43:39<6:06:46, 16.35s/it]  
 26%|██▋       | 357/1350 [3:31:13<9:41:08, 35.11s/it][A

{'loss': 0.3009, 'learning_rate': 2e-05, 'epoch': 1.32}



                                                    t][A
  0%|          | 4/1350 [4:44:15<6:06:46, 16.35s/it]  
 27%|██▋       | 358/1350 [3:31:49<9:44:22, 35.35s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.33}



                                                    t][A
  0%|          | 4/1350 [4:44:51<6:06:46, 16.35s/it]  
 27%|██▋       | 359/1350 [3:32:25<9:46:32, 35.51s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.33}



                                                    t][A
  0%|          | 4/1350 [4:45:27<6:06:46, 16.35s/it]  
 27%|██▋       | 360/1350 [3:33:01<9:47:53, 35.63s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.33}



                                                    t][A
  0%|          | 4/1350 [4:46:03<6:06:46, 16.35s/it]  
 27%|██▋       | 361/1350 [3:33:37<9:48:33, 35.71s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.34}



                                                    t][A
  0%|          | 4/1350 [4:46:38<6:06:46, 16.35s/it]  
 27%|██▋       | 362/1350 [3:34:13<9:48:53, 35.76s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.34}



                                                    t][A
  0%|          | 4/1350 [4:47:14<6:06:46, 16.35s/it]  
 27%|██▋       | 363/1350 [3:34:49<9:48:55, 35.80s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.34}



                                                    t][A
  0%|          | 4/1350 [4:47:50<6:06:46, 16.35s/it]  
 27%|██▋       | 364/1350 [3:35:25<9:48:45, 35.83s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.35}



                                                    t][A
  0%|          | 4/1350 [4:48:26<6:06:46, 16.35s/it]  
 27%|██▋       | 365/1350 [3:36:00<9:48:26, 35.84s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.35}



                                                    t][A
  0%|          | 4/1350 [4:49:02<6:06:46, 16.35s/it]  
 27%|██▋       | 366/1350 [3:36:36<9:45:44, 35.72s/it][A

{'loss': 1.7465, 'learning_rate': 2e-05, 'epoch': 1.36}



                                                    t][A
  0%|          | 4/1350 [4:49:37<6:06:46, 16.35s/it]  
 27%|██▋       | 367/1350 [3:37:12<9:45:54, 35.76s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.36}



                                                    t][A
  0%|          | 4/1350 [4:50:13<6:06:46, 16.35s/it]  
 27%|██▋       | 368/1350 [3:37:48<9:45:50, 35.80s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.36}



                                                    t][A
  0%|          | 4/1350 [4:50:49<6:06:46, 16.35s/it]  
 27%|██▋       | 369/1350 [3:38:23<9:43:35, 35.69s/it][A

{'loss': 1.529, 'learning_rate': 2e-05, 'epoch': 1.37}



                                                    t][A
  0%|          | 4/1350 [4:51:25<6:06:46, 16.35s/it]  
 27%|██▋       | 370/1350 [3:38:59<9:43:59, 35.75s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.37}



                                                    t][A
  0%|          | 4/1350 [4:52:01<6:06:46, 16.35s/it]  
 27%|██▋       | 371/1350 [3:39:35<9:44:00, 35.79s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.37}



                                                    t][A
  0%|          | 4/1350 [4:52:35<6:06:46, 16.35s/it]  
 28%|██▊       | 372/1350 [3:40:10<9:38:21, 35.48s/it][A

{'loss': 10.8289, 'learning_rate': 2e-05, 'epoch': 1.38}



                                                    t][A
  0%|          | 4/1350 [4:53:11<6:06:46, 16.35s/it]  
 28%|██▊       | 373/1350 [3:40:46<9:39:50, 35.61s/it][A

{'loss': 11.0621, 'learning_rate': 2e-05, 'epoch': 1.38}



                                                    t][A
  0%|          | 4/1350 [4:53:47<6:06:46, 16.35s/it]  
 28%|██▊       | 374/1350 [3:41:21<9:40:36, 35.69s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.39}



                                                    t][A
  0%|          | 4/1350 [4:54:23<6:06:46, 16.35s/it]  
 28%|██▊       | 375/1350 [3:41:57<9:41:03, 35.76s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.39}



                                                    t][A
  0%|          | 4/1350 [4:54:59<6:06:46, 16.35s/it]  
 28%|██▊       | 376/1350 [3:42:33<9:41:04, 35.80s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.39}



                                                    t][A
  0%|          | 4/1350 [4:55:28<6:06:46, 16.35s/it]  
 28%|██▊       | 377/1350 [3:43:03<9:10:12, 33.93s/it][A

{'loss': 0.9222, 'learning_rate': 2e-05, 'epoch': 1.4}



                                                    t][A
  0%|          | 4/1350 [4:56:04<6:06:46, 16.35s/it]  
 28%|██▊       | 378/1350 [3:43:39<9:19:10, 34.52s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.4}



                                                    t][A
  0%|          | 4/1350 [4:56:39<6:06:46, 16.35s/it]  
 28%|██▊       | 379/1350 [3:44:13<9:18:54, 34.54s/it][A

{'loss': 30.0378, 'learning_rate': 2e-05, 'epoch': 1.4}



                                                    t][A
  0%|          | 4/1350 [4:57:15<6:06:46, 16.35s/it]  
 28%|██▊       | 380/1350 [3:44:49<9:26:00, 35.01s/it][A

{'loss': 0.3621, 'learning_rate': 2e-05, 'epoch': 1.41}



                                                    t][A
  0%|          | 4/1350 [4:57:51<6:06:46, 16.35s/it]  
 28%|██▊       | 381/1350 [3:45:25<9:29:41, 35.28s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.41}



                                                    t][A
  0%|          | 4/1350 [4:58:27<6:06:46, 16.35s/it]  
 28%|██▊       | 382/1350 [3:46:01<9:32:07, 35.46s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.41}



                                                    t][A
  0%|          | 4/1350 [4:59:03<6:06:46, 16.35s/it]  
 28%|██▊       | 383/1350 [3:46:37<9:34:47, 35.66s/it][A

{'loss': 30.561, 'learning_rate': 2e-05, 'epoch': 1.42}



                                                    t][A
  0%|          | 4/1350 [4:59:36<6:06:46, 16.35s/it]  
 28%|██▊       | 384/1350 [3:47:11<9:23:23, 34.99s/it][A

{'loss': 0.4959, 'learning_rate': 2e-05, 'epoch': 1.42}



                                                    t][A
  0%|          | 4/1350 [5:00:12<6:06:46, 16.35s/it]  
 29%|██▊       | 385/1350 [3:47:47<9:27:42, 35.30s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.43}



                                                    t][A
  0%|          | 4/1350 [5:00:48<6:06:46, 16.35s/it]  
 29%|██▊       | 386/1350 [3:48:23<9:29:42, 35.46s/it][A

{'loss': 0.5149, 'learning_rate': 2e-05, 'epoch': 1.43}



                                                    t][A
  0%|          | 4/1350 [5:01:24<6:06:46, 16.35s/it]  
 29%|██▊       | 387/1350 [3:48:58<9:31:06, 35.58s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.43}



                                                    t][A
  0%|          | 4/1350 [5:02:00<6:06:46, 16.35s/it]  
 29%|██▊       | 388/1350 [3:49:34<9:31:55, 35.67s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.44}



                                                    t][A
  0%|          | 4/1350 [5:02:36<6:06:46, 16.35s/it]  
 29%|██▉       | 389/1350 [3:50:10<9:32:24, 35.74s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.44}



                                                    t][A
  0%|          | 4/1350 [5:03:11<6:06:46, 16.35s/it]  
 29%|██▉       | 390/1350 [3:50:45<9:28:55, 35.56s/it][A

{'loss': 42.1481, 'learning_rate': 2e-05, 'epoch': 1.44}



                                                    t][A
  0%|          | 4/1350 [5:03:47<6:06:46, 16.35s/it]  
 29%|██▉       | 391/1350 [3:51:21<9:29:56, 35.66s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.45}



                                                    t][A
  0%|          | 4/1350 [5:04:23<6:06:46, 16.35s/it]  
 29%|██▉       | 392/1350 [3:51:57<9:31:43, 35.81s/it][A

{'loss': 15.0322, 'learning_rate': 2e-05, 'epoch': 1.45}



                                                    t][A
  0%|          | 4/1350 [5:04:59<6:06:46, 16.35s/it]  
 29%|██▉       | 393/1350 [3:52:33<9:31:33, 35.83s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.46}



                                                    t][A
  0%|          | 4/1350 [5:05:35<6:06:46, 16.35s/it]  
 29%|██▉       | 394/1350 [3:53:09<9:31:14, 35.85s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.46}



                                                    t][A
  0%|          | 4/1350 [5:06:08<6:06:46, 16.35s/it]  
 29%|██▉       | 395/1350 [3:53:42<9:18:21, 35.08s/it][A

{'loss': 24.9077, 'learning_rate': 2e-05, 'epoch': 1.46}



                                                    t][A
  0%|          | 4/1350 [5:06:44<6:06:46, 16.35s/it]  
 29%|██▉       | 396/1350 [3:54:18<9:21:36, 35.32s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.47}



                                                    t][A
  0%|          | 4/1350 [5:07:18<6:06:46, 16.35s/it]  
 29%|██▉       | 397/1350 [3:54:53<9:16:38, 35.05s/it][A

{'loss': 0.6553, 'learning_rate': 2e-05, 'epoch': 1.47}



                                                    t][A
  0%|          | 4/1350 [5:07:54<6:06:46, 16.35s/it]  
 29%|██▉       | 398/1350 [3:55:29<9:20:10, 35.30s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.47}



                                                    t][A
  0%|          | 4/1350 [5:08:30<6:06:46, 16.35s/it]  
 30%|██▉       | 399/1350 [3:56:04<9:21:19, 35.41s/it][A

{'loss': 0.5423, 'learning_rate': 2e-05, 'epoch': 1.48}



                                                    t][A
  0%|          | 4/1350 [5:09:06<6:06:46, 16.35s/it]  
 30%|██▉       | 400/1350 [3:56:40<9:23:07, 35.57s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.48}



                                                    t][A
  0%|          | 4/1350 [5:09:42<6:06:46, 16.35s/it]  
 30%|██▉       | 401/1350 [3:57:16<9:24:12, 35.67s/it][A

{'loss': 3.7628, 'learning_rate': 2e-05, 'epoch': 1.49}



                                                    t][A
  0%|          | 4/1350 [5:10:16<6:06:46, 16.35s/it]  
 30%|██▉       | 402/1350 [3:57:51<9:18:00, 35.32s/it][A

{'loss': 31.5327, 'learning_rate': 2e-05, 'epoch': 1.49}



                                                    t][A
  0%|          | 4/1350 [5:10:52<6:06:46, 16.35s/it]  
 30%|██▉       | 403/1350 [3:58:27<9:20:04, 35.48s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.49}



                                                    t][A
  0%|          | 4/1350 [5:11:28<6:06:46, 16.35s/it]  
 30%|██▉       | 404/1350 [3:59:02<9:21:27, 35.61s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.5}



                                                    t][A
  0%|          | 4/1350 [5:12:04<6:06:46, 16.35s/it]  
 30%|███       | 405/1350 [3:59:38<9:22:19, 35.70s/it][A

{'loss': 0.0847, 'learning_rate': 2e-05, 'epoch': 1.5}



                                                    t][A
  0%|          | 4/1350 [5:12:36<6:06:46, 16.35s/it]  
 30%|███       | 406/1350 [4:00:10<9:03:55, 34.57s/it][A

{'loss': 1.0173, 'learning_rate': 2e-05, 'epoch': 1.5}



                                                    t][A
  0%|          | 4/1350 [5:13:12<6:06:46, 16.35s/it]  
 30%|███       | 407/1350 [4:00:46<9:09:38, 34.97s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.51}



                                                    t][A
  0%|          | 4/1350 [5:13:48<6:06:46, 16.35s/it]  
 30%|███       | 408/1350 [4:01:22<9:13:16, 35.24s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.51}



                                                    t][A
  0%|          | 4/1350 [5:14:24<6:06:46, 16.35s/it]  
 30%|███       | 409/1350 [4:01:58<9:15:43, 35.43s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.51}



                                                    t][A
  0%|          | 4/1350 [5:15:00<6:06:46, 16.35s/it]  
 30%|███       | 410/1350 [4:02:34<9:18:20, 35.64s/it][A

{'loss': 1.0929, 'learning_rate': 2e-05, 'epoch': 1.52}



                                                    t][A
  0%|          | 4/1350 [5:15:36<6:06:46, 16.35s/it]  
 30%|███       | 411/1350 [4:03:10<9:19:02, 35.72s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.52}



                                                    t][A
  0%|          | 4/1350 [5:16:12<6:06:46, 16.35s/it]  
 31%|███       | 412/1350 [4:03:46<9:19:31, 35.79s/it][A

{'loss': 1.7598, 'learning_rate': 2e-05, 'epoch': 1.53}



                                                    t][A
  0%|          | 4/1350 [5:16:43<6:06:46, 16.35s/it]  
 31%|███       | 413/1350 [4:04:18<8:59:39, 34.56s/it][A

{'loss': 0.7089, 'learning_rate': 2e-05, 'epoch': 1.53}



                                                    t][A
  0%|          | 4/1350 [5:17:17<6:06:46, 16.35s/it]  
 31%|███       | 414/1350 [4:04:52<8:57:00, 34.42s/it][A

{'loss': 15.1172, 'learning_rate': 2e-05, 'epoch': 1.53}



                                                    t][A
  0%|          | 4/1350 [5:17:53<6:06:46, 16.35s/it]  
 31%|███       | 415/1350 [4:05:28<9:03:16, 34.86s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.54}



                                                    t][A
  0%|          | 4/1350 [5:18:29<6:06:46, 16.35s/it]  
 31%|███       | 416/1350 [4:06:03<9:07:33, 35.17s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.54}



                                                    t][A
  0%|          | 4/1350 [5:19:05<6:06:46, 16.35s/it]  
 31%|███       | 417/1350 [4:06:39<9:10:20, 35.39s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.54}



                                                    t][A
  0%|          | 4/1350 [5:19:41<6:06:46, 16.35s/it]  
 31%|███       | 418/1350 [4:07:15<9:12:04, 35.54s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.55}



                                                    t][A
  0%|          | 4/1350 [5:20:17<6:06:46, 16.35s/it]  
 31%|███       | 419/1350 [4:07:51<9:13:09, 35.65s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.55}



                                                    t][A
  0%|          | 4/1350 [5:20:51<6:06:46, 16.35s/it]  
 31%|███       | 420/1350 [4:08:25<9:05:11, 35.17s/it][A

{'loss': 3.139, 'learning_rate': 2e-05, 'epoch': 1.56}



                                                    t][A
  0%|          | 4/1350 [5:21:27<6:06:46, 16.35s/it]  
 31%|███       | 421/1350 [4:09:01<9:07:54, 35.39s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.56}



                                                    t][A
  0%|          | 4/1350 [5:22:03<6:06:46, 16.35s/it]  
 31%|███▏      | 422/1350 [4:09:37<9:09:34, 35.53s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.56}



                                                    t][A
  0%|          | 4/1350 [5:22:36<6:06:46, 16.35s/it]  
 31%|███▏      | 423/1350 [4:10:11<9:00:00, 34.95s/it][A

{'loss': 5.5087, 'learning_rate': 2e-05, 'epoch': 1.57}



                                                    t][A
  0%|          | 4/1350 [5:23:12<6:06:46, 16.35s/it]  
 31%|███▏      | 424/1350 [4:10:47<9:03:50, 35.24s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.57}



                                                    t][A
  0%|          | 4/1350 [5:23:48<6:06:46, 16.35s/it]  
 31%|███▏      | 425/1350 [4:11:22<9:06:15, 35.43s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.57}



                                                    t][A
  0%|          | 4/1350 [5:24:24<6:06:46, 16.35s/it]  
 32%|███▏      | 426/1350 [4:11:58<9:07:45, 35.57s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.58}



                                                    t][A
  0%|          | 4/1350 [5:25:00<6:06:46, 16.35s/it]  
 32%|███▏      | 427/1350 [4:12:34<9:08:44, 35.67s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.58}



                                                    t][A
  0%|          | 4/1350 [5:25:36<6:06:46, 16.35s/it]  
 32%|███▏      | 428/1350 [4:13:10<9:09:12, 35.74s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.59}



                                                    t][A
  0%|          | 4/1350 [5:26:11<6:06:46, 16.35s/it]  
 32%|███▏      | 429/1350 [4:13:45<9:04:41, 35.48s/it][A

{'loss': 11.6278, 'learning_rate': 2e-05, 'epoch': 1.59}



                                                    t][A
  0%|          | 4/1350 [5:26:47<6:06:46, 16.35s/it]  
 32%|███▏      | 430/1350 [4:14:21<9:05:57, 35.61s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.59}



                                                    t][A
  0%|          | 4/1350 [5:27:22<6:06:46, 16.35s/it]  
 32%|███▏      | 431/1350 [4:14:57<9:06:32, 35.68s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.6}



                                                    t][A
  0%|          | 4/1350 [5:27:58<6:06:46, 16.35s/it]  
 32%|███▏      | 432/1350 [4:15:33<9:06:53, 35.74s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.6}



                                                    t][A
  0%|          | 4/1350 [5:28:34<6:06:46, 16.35s/it]  
 32%|███▏      | 433/1350 [4:16:09<9:06:59, 35.79s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.6}



                                                    t][A
  0%|          | 4/1350 [5:29:10<6:06:46, 16.35s/it]  
 32%|███▏      | 434/1350 [4:16:44<9:06:58, 35.83s/it][A

{'loss': 0.1871, 'learning_rate': 2e-05, 'epoch': 1.61}



                                                    t][A
  0%|          | 4/1350 [5:29:46<6:06:46, 16.35s/it]  
 32%|███▏      | 435/1350 [4:17:20<9:06:43, 35.85s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.61}



                                                    t][A
  0%|          | 4/1350 [5:30:22<6:06:46, 16.35s/it]  
 32%|███▏      | 436/1350 [4:17:56<9:06:20, 35.86s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.61}



                                                    t][A
  0%|          | 4/1350 [5:30:58<6:06:46, 16.35s/it]  
 32%|███▏      | 437/1350 [4:18:32<9:05:50, 35.87s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.62}



                                                    t][A
  0%|          | 4/1350 [5:31:34<6:06:46, 16.35s/it]  
 32%|███▏      | 438/1350 [4:19:08<9:05:20, 35.88s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.62}



                                                    t][A
  0%|          | 4/1350 [5:32:10<6:06:46, 16.35s/it]  
 33%|███▎      | 439/1350 [4:19:44<9:04:50, 35.88s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.63}



                                                    t][A
  0%|          | 4/1350 [5:32:45<6:06:46, 16.35s/it]  
 33%|███▎      | 440/1350 [4:20:20<9:04:20, 35.89s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.63}



                                                    t][A
  0%|          | 4/1350 [5:33:21<6:06:46, 16.35s/it]  
 33%|███▎      | 441/1350 [4:20:56<9:03:45, 35.89s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.63}



                                                    t][A
  0%|          | 4/1350 [5:33:57<6:06:46, 16.35s/it]  
 33%|███▎      | 442/1350 [4:21:32<9:03:08, 35.89s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.64}



                                                    t][A
  0%|          | 4/1350 [5:34:33<6:06:46, 16.35s/it]  
 33%|███▎      | 443/1350 [4:22:07<9:02:30, 35.89s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.64}



                                                    t][A
  0%|          | 4/1350 [5:35:09<6:06:46, 16.35s/it]  
 33%|███▎      | 444/1350 [4:22:43<9:01:55, 35.89s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.64}



                                                    t][A
  0%|          | 4/1350 [5:35:45<6:06:46, 16.35s/it]  
 33%|███▎      | 445/1350 [4:23:19<9:01:13, 35.88s/it][A

{'loss': 0.7607, 'learning_rate': 2e-05, 'epoch': 1.65}



                                                    t][A
  0%|          | 4/1350 [5:36:21<6:06:46, 16.35s/it]  
 33%|███▎      | 446/1350 [4:23:55<9:00:39, 35.88s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.65}



                                                    t][A
  0%|          | 4/1350 [5:36:56<6:06:46, 16.35s/it]  
 33%|███▎      | 447/1350 [4:24:30<8:57:39, 35.72s/it][A

{'loss': 2.0233, 'learning_rate': 2e-05, 'epoch': 1.66}



                                                    t][A
  0%|          | 4/1350 [5:37:32<6:06:46, 16.35s/it]  
 33%|███▎      | 448/1350 [4:25:06<8:57:46, 35.77s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.66}



                                                    t][A
  0%|          | 4/1350 [5:38:08<6:06:46, 16.35s/it]  
 33%|███▎      | 449/1350 [4:25:42<8:57:42, 35.81s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.66}



                                                    t][A
  0%|          | 4/1350 [5:38:44<6:06:46, 16.35s/it]  
 33%|███▎      | 450/1350 [4:26:18<8:57:35, 35.84s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.67}



                                                    t][A
  0%|          | 4/1350 [5:39:18<6:06:46, 16.35s/it]  
 33%|███▎      | 451/1350 [4:26:53<8:50:22, 35.40s/it][A

{'loss': 10.3477, 'learning_rate': 2e-05, 'epoch': 1.67}



                                                    t][A
  0%|          | 4/1350 [5:39:54<6:06:46, 16.35s/it]  
 33%|███▎      | 452/1350 [4:27:28<8:50:48, 35.47s/it][A

{'loss': 7.7737, 'learning_rate': 2e-05, 'epoch': 1.67}



                                                    t][A
  0%|          | 4/1350 [5:40:30<6:06:46, 16.35s/it]  
 34%|███▎      | 453/1350 [4:28:04<8:52:03, 35.59s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.68}



                                                    t][A
  0%|          | 4/1350 [5:41:06<6:06:46, 16.35s/it]  
 34%|███▎      | 454/1350 [4:28:40<8:53:36, 35.73s/it][A

{'loss': 25.9005, 'learning_rate': 2e-05, 'epoch': 1.68}



                                                    t][A
  0%|          | 4/1350 [5:41:39<6:06:46, 16.35s/it]  
 34%|███▎      | 455/1350 [4:29:13<8:39:59, 34.86s/it][A

{'loss': 8.6223, 'learning_rate': 2e-05, 'epoch': 1.69}



                                                    t][A
  0%|          | 4/1350 [5:42:14<6:06:46, 16.35s/it]  
 34%|███▍      | 456/1350 [4:29:49<8:44:00, 35.17s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.69}



                                                    t][A
  0%|          | 4/1350 [5:42:50<6:06:46, 16.35s/it]  
 34%|███▍      | 457/1350 [4:30:25<8:46:37, 35.38s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.69}



                                                    t][A
  0%|          | 4/1350 [5:43:26<6:06:46, 16.35s/it]  
 34%|███▍      | 458/1350 [4:31:01<8:48:23, 35.54s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.7}



                                                    t][A
  0%|          | 4/1350 [5:44:02<6:06:46, 16.35s/it]  
 34%|███▍      | 459/1350 [4:31:36<8:48:30, 35.59s/it][A

{'loss': 0.2966, 'learning_rate': 2e-05, 'epoch': 1.7}



                                                    t][A
  0%|          | 4/1350 [5:44:38<6:06:46, 16.35s/it]  
 34%|███▍      | 460/1350 [4:32:12<8:49:16, 35.68s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.7}



                                                    t][A
  0%|          | 4/1350 [5:45:13<6:06:46, 16.35s/it]  
 34%|███▍      | 461/1350 [4:32:47<8:46:46, 35.55s/it][A

{'loss': 13.484, 'learning_rate': 2e-05, 'epoch': 1.71}



                                                    t][A
  0%|          | 4/1350 [5:45:49<6:06:46, 16.35s/it]  
 34%|███▍      | 462/1350 [4:33:23<8:47:34, 35.65s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.71}



                                                    t][A
  0%|          | 4/1350 [5:46:24<6:06:46, 16.35s/it]  
 34%|███▍      | 463/1350 [4:33:58<8:43:18, 35.40s/it][A

{'loss': 4.9293, 'learning_rate': 2e-05, 'epoch': 1.71}



                                                    t][A
  0%|          | 4/1350 [5:47:00<6:06:46, 16.35s/it]  
 34%|███▍      | 464/1350 [4:34:34<8:44:53, 35.55s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.72}



                                                    t][A
  0%|          | 4/1350 [5:47:36<6:06:46, 16.35s/it]  
 34%|███▍      | 465/1350 [4:35:10<8:45:50, 35.65s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.72}



                                                    t][A
  0%|          | 4/1350 [5:48:11<6:06:46, 16.35s/it]  
 35%|███▍      | 466/1350 [4:35:46<8:46:15, 35.72s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.73}



                                                    t][A
  0%|          | 4/1350 [5:48:47<6:06:46, 16.35s/it]  
 35%|███▍      | 467/1350 [4:36:22<8:46:21, 35.77s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.73}



                                                    t][A
  0%|          | 4/1350 [5:49:23<6:06:46, 16.35s/it]  
 35%|███▍      | 468/1350 [4:36:58<8:46:22, 35.81s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.73}



                                                    t][A
  0%|          | 4/1350 [5:49:59<6:06:46, 16.35s/it]  
 35%|███▍      | 469/1350 [4:37:34<8:47:30, 35.93s/it][A

{'loss': 16.7825, 'learning_rate': 2e-05, 'epoch': 1.74}



                                                    t][A
  0%|          | 4/1350 [5:50:35<6:06:46, 16.35s/it]  
 35%|███▍      | 470/1350 [4:38:10<8:46:46, 35.92s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.74}



                                                    t][A
  0%|          | 4/1350 [5:51:09<6:06:46, 16.35s/it]  
 35%|███▍      | 471/1350 [4:38:44<8:37:24, 35.32s/it][A

{'loss': 1.1626, 'learning_rate': 2e-05, 'epoch': 1.74}



                                                    t][A
  0%|          | 4/1350 [5:51:43<6:06:46, 16.35s/it]  
 35%|███▍      | 472/1350 [4:39:17<8:29:52, 34.84s/it][A

{'loss': 2.3504, 'learning_rate': 2e-05, 'epoch': 1.75}



                                                    t][A
  0%|          | 4/1350 [5:52:19<6:06:46, 16.35s/it]  
 35%|███▌      | 473/1350 [4:39:53<8:33:51, 35.16s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.75}



                                                    t][A
  0%|          | 4/1350 [5:52:55<6:06:46, 16.35s/it]  
 35%|███▌      | 474/1350 [4:40:29<8:36:26, 35.37s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.76}



                                                    t][A
  0%|          | 4/1350 [5:53:31<6:06:46, 16.35s/it]  
 35%|███▌      | 475/1350 [4:41:05<8:38:08, 35.53s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.76}



                                                    t][A
  0%|          | 4/1350 [5:54:07<6:06:46, 16.35s/it]  
 35%|███▌      | 476/1350 [4:41:41<8:39:10, 35.64s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.76}



                                                    t][A
  0%|          | 4/1350 [5:54:42<6:06:46, 16.35s/it]  
 35%|███▌      | 477/1350 [4:42:17<8:39:41, 35.72s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.77}



                                                    t][A
  0%|          | 4/1350 [5:55:17<6:06:46, 16.35s/it]  
 35%|███▌      | 478/1350 [4:42:51<8:34:06, 35.37s/it][A

{'loss': 2.1152, 'learning_rate': 2e-05, 'epoch': 1.77}



                                                    t][A
  0%|          | 4/1350 [5:55:52<6:06:46, 16.35s/it]  
 35%|███▌      | 479/1350 [4:43:27<8:33:39, 35.38s/it][A

{'loss': 5.7765, 'learning_rate': 2e-05, 'epoch': 1.77}



                                                    t][A
  0%|          | 4/1350 [5:56:28<6:06:46, 16.35s/it]  
 36%|███▌      | 480/1350 [4:44:03<8:35:18, 35.54s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.78}



                                                    t][A
  0%|          | 4/1350 [5:57:04<6:06:46, 16.35s/it]  
 36%|███▌      | 481/1350 [4:44:38<8:34:24, 35.52s/it][A

{'loss': 0.0717, 'learning_rate': 2e-05, 'epoch': 1.78}



                                                    t][A
  0%|          | 4/1350 [5:57:40<6:06:46, 16.35s/it]  
 36%|███▌      | 482/1350 [4:45:14<8:35:24, 35.63s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.79}



                                                    t][A
  0%|          | 4/1350 [5:58:14<6:06:46, 16.35s/it]  
 36%|███▌      | 483/1350 [4:45:48<8:29:26, 35.26s/it][A

{'loss': 10.1828, 'learning_rate': 2e-05, 'epoch': 1.79}



                                                    t][A
  0%|          | 4/1350 [5:58:50<6:06:46, 16.35s/it]  
 36%|███▌      | 484/1350 [4:46:24<8:31:35, 35.45s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.79}



                                                    t][A
  0%|          | 4/1350 [5:59:26<6:06:46, 16.35s/it]  
 36%|███▌      | 485/1350 [4:47:00<8:32:55, 35.58s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.8}



                                                    t][A
  0%|          | 4/1350 [6:00:02<6:06:46, 16.35s/it]  
 36%|███▌      | 486/1350 [4:47:36<8:33:38, 35.67s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.8}



                                                    t][A
  0%|          | 4/1350 [6:00:38<6:06:46, 16.35s/it]  
 36%|███▌      | 487/1350 [4:48:12<8:34:00, 35.74s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.8}



                                                    t][A
  0%|          | 4/1350 [6:01:14<6:06:46, 16.35s/it]  
 36%|███▌      | 488/1350 [4:48:48<8:34:10, 35.79s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.81}



                                                    t][A
  0%|          | 4/1350 [6:01:49<6:06:46, 16.35s/it]  
 36%|███▌      | 489/1350 [4:49:24<8:33:58, 35.82s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.81}



                                                    t][A
  0%|          | 4/1350 [6:02:25<6:06:46, 16.35s/it]  
 36%|███▋      | 490/1350 [4:50:00<8:33:39, 35.84s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.81}



                                                    t][A
  0%|          | 4/1350 [6:03:01<6:06:46, 16.35s/it]  
 36%|███▋      | 491/1350 [4:50:35<8:31:48, 35.75s/it][A

{'loss': 0.5235, 'learning_rate': 2e-05, 'epoch': 1.82}



                                                    t][A
  0%|          | 4/1350 [6:03:33<6:06:46, 16.35s/it]  
 36%|███▋      | 492/1350 [4:51:07<8:13:42, 34.52s/it][A

{'loss': 6.5001, 'learning_rate': 2e-05, 'epoch': 1.82}



                                                    t][A
  0%|          | 4/1350 [6:04:06<6:06:46, 16.35s/it]  
 37%|███▋      | 493/1350 [4:51:41<8:10:27, 34.34s/it][A

{'loss': 7.6994, 'learning_rate': 2e-05, 'epoch': 1.83}



                                                    t][A
  0%|          | 4/1350 [6:04:42<6:06:46, 16.35s/it]  
 37%|███▋      | 494/1350 [4:52:17<8:16:31, 34.80s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.83}



                                                    t][A
  0%|          | 4/1350 [6:05:17<6:06:46, 16.35s/it]  
 37%|███▋      | 495/1350 [4:52:51<8:15:46, 34.79s/it][A

{'loss': 0.4588, 'learning_rate': 2e-05, 'epoch': 1.83}



                                                    t][A
  0%|          | 4/1350 [6:05:52<6:06:46, 16.35s/it]  
 37%|███▋      | 496/1350 [4:53:26<8:14:16, 34.73s/it][A

{'loss': 31.3838, 'learning_rate': 2e-05, 'epoch': 1.84}



                                                    t][A
  0%|          | 4/1350 [6:06:27<6:06:46, 16.35s/it]  
 37%|███▋      | 497/1350 [4:54:01<8:16:07, 34.90s/it][A

{'loss': 7.1576, 'learning_rate': 2e-05, 'epoch': 1.84}



                                                    t][A
  0%|          | 4/1350 [6:07:03<6:06:46, 16.35s/it]  
 37%|███▋      | 498/1350 [4:54:37<8:19:54, 35.21s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.84}



                                                    t][A
  0%|          | 4/1350 [6:07:38<6:06:46, 16.35s/it]  
 37%|███▋      | 499/1350 [4:55:12<8:17:31, 35.08s/it][A

{'loss': 3.5492, 'learning_rate': 2e-05, 'epoch': 1.85}



                                                    t][A
  0%|          | 4/1350 [6:08:14<6:06:46, 16.35s/it]  
 37%|███▋      | 500/1350 [4:55:48<8:20:23, 35.32s/it][A

{'loss': 0.0, 'learning_rate': 2e-05, 'epoch': 1.85}




AttributeError: 'LlamaForCausalLM' object has no attribute 'save_checkpoint'

In [None]:
print("kek")

In [None]:
model

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
model_id = "nvdenisov2002/llama-longLoRA-v3-16k"
model.push_to_hub(model_id)

### Unlucky try

In [28]:
train(model_args, data_args, training_args)

Begin train
Parsed arguments
Created config


Loading checkpoint shards: 100%|██████████| 2/2 [03:39<00:00, 109.68s/it]

Loaded model



Using pad_token, but it is not set yet.


Loaded tokenizer




Created data_module
Prepared model to learn


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


AttributeError: module 'wandb.proto.wandb_internal_pb2' has no attribute 'Result'