# Libraries

In [1]:
# %pip install -U transformers
# %pip install -U datasets
# %pip install -U accelerate
# %pip install -U peft
# %pip install -U trl
# %pip install -U bitsandbytes

In [1]:
import os, torch, wandb

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

from datasets import load_dataset, concatenate_datasets
from trl import SFTTrainer, setup_chat_format
from dataclasses import dataclass

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# User-tunable settings for the whole notebook.
#
# The two filesystem locations can be overridden with environment variables so
# the notebook can run on another machine without editing this cell; when the
# variables are unset, the original values are used unchanged.
# If a path does not resolve, try a relative path instead of an absolute one
# (or the other way round).
file_with_tokens = os.environ.get("TOKENS_FILE", "tokens.txt")
model_casual_name = "Qwen 1.5B"
path_to_source_model = os.environ.get(
    "SOURCE_MODEL_PATH", "C:\\Users\\USER_ELISEY\\qwen2.5_1.5"
)
path_to_save_finetuned_model = "russia_chad_1.5"

path_to_dataset = "miracl/miracl"
dataset_casual_name = "MiraCl Russian dataset"

"\nIf paths don't work, please try local except of global path(or reversed)\n"

## Set up Hugging Face 🤗 & Weights & Biases

In [4]:
from huggingface_hub import login

# The tokens file holds the Hugging Face token on its first line and the
# Weights & Biases API key on its second line.
with open(file_with_tokens, "r", encoding="utf-8") as tokens_file:
    # strip() drops the line terminator (and stray whitespace) without cutting
    # off the last character when the final line has no trailing newline.
    hf_t = tokens_file.readline().strip()
    wndb_t = tokens_file.readline().strip()

# Fail early with a readable message instead of an opaque login error.
if not hf_t or not wndb_t:
    raise ValueError(
        f"{file_with_tokens!r} must contain the Hugging Face token on line 1 "
        "and the Weights & Biases key on line 2"
    )

login(token=hf_t)

wandb.login(key=wndb_t)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\USER_ELISEY\.cache\huggingface\token
Login successful


[34m[1mwandb[0m: Currently logged in as: [33mez1071[0m ([33mez1071-mipt[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\USER_ELISEY\_netrc


True

In [5]:
# Open the Weights & Biases run; the project title embeds the model's display name.
run = wandb.init(job_type="training", project=f'Fine-tune {model_casual_name} on Russian Dataset')

In [6]:
@dataclass
class Config:
    """Run-wide settings gathered from the notebook's configuration cell.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: the values can then be overridden per instance
    (``Config(new_model="other_dir")``) and show up in the repr.
    """

    # Display names of the model and of the dataset.
    model_name: str = model_casual_name
    dataset_name: str = dataset_casual_name
    # Directory name passed to the trainer as ``output_dir``.
    new_model: str = path_to_save_finetuned_model
    # Used as the 4-bit compute dtype in the quantisation config.
    # NOTE(review): the training arguments enable bf16 while this is float16;
    # confirm the mismatch is intended.
    torch_dtype: torch.dtype = torch.float16
    # Attention backend forwarded to ``from_pretrained``.
    attn_implementation: str = "eager"
cfg = Config()

# Loading model and tokenizer

In [7]:
# 4-bit NF4 quantisation with double quantisation enabled; matmuls run in the
# dtype taken from the run configuration.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=cfg.torch_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the base checkpoint from the local directory, quantising it on load and
# letting `device_map="auto"` decide the placement.
model = AutoModelForCausalLM.from_pretrained(
    path_to_source_model,
    attn_implementation=cfg.attn_implementation,
    device_map="auto",
    quantization_config=bnb_config,
)
# Display the module tree of the loaded model.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear4bit(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear4bit(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
    

In [8]:
# Load the tokenizer that ships with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(path_to_source_model)
tokenizer.padding_side = 'right'
# `pad_token` is the attribute the tokenizer uses for padding. Keep the
# checkpoint's own pad token when it defines one; otherwise fall back to EOS so
# that no new vocabulary entry (and no embedding resize) is required.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(len(tokenizer))

151665


## LoRA adapter

In [9]:
# LoRA adapter settings, applied to the attention and MLP projection layers.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,  # 16 (presumably an alternative value that was tried)
    lora_alpha=64,  # 64
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'up_proj', 'down_proj', 'gate_proj',
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
    ],
)
# Wrap the quantised base model with the adapters.
model = get_peft_model(model, peft_config)

# Data

## Load

In [10]:
# Load the 'ru' configuration of the dataset; trust_remote_code=True permits
# the dataset repository's own loading code to run.
dataset = load_dataset(
    path_to_dataset,
    name='ru',
    trust_remote_code=True,
)

In [11]:
# dataset["train"]["positive_passages"]

## Format to chat 

In [12]:
def format_chat_template(row):
    """Attach a chat-formatted ``text`` field to a dataset row.

    The row's ``query`` becomes the user turn. The assistant turn is the text
    of every entry in ``positive_passages``, each followed by ``". "``. The
    two-message conversation is rendered to a string with the tokenizer's chat
    template and stored under ``row["text"]``.

    Args:
        row: mapping with a ``query`` string and a non-empty
            ``positive_passages`` list whose items have a ``text`` key.

    Returns:
        The same ``row`` object, with the ``text`` key added.

    Raises:
        IndexError: if ``positive_passages`` is empty.
    """
    passages = row["positive_passages"]
    # The first passage is indexed explicitly, so an empty list still raises.
    answer = passages[0]["text"] + ". "
    answer += "".join(passage["text"] + ". " for passage in passages[1:])
    messages = [
        {"role": "user", "content": row["query"]},
        {"role": "assistant", "content": answer},
    ]
    row["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return row

In [13]:
# Merge the 'dev' and 'train' splits (dev first) into one dataset, then drop
# the negative passages column.
dataset = concatenate_datasets(
    [dataset[split_name] for split_name in ('dev', 'train')]
).remove_columns('negative_passages')

In [14]:
# Peek at the first positive passage of the first row. Indexing the row first
# avoids reading the whole 'positive_passages' column to show one string.
dataset[0]['positive_passages'][0]["text"]

'Кари́бский кризис\xa0— исторический термин, определяющий чрезвычайно напряжённое политическое, дипломатическое и военное противостояние между Советским Союзом и Соединёнными Штатами в октябре 1962 года, которое было вызвано размещением США ядерного оружия в Турции в 1961 году и впоследствии тайной переброской и размещением на Кубе военных частей и подразделений Вооружённых Сил СССР, техники и вооружения, включая ядерное оружие. Кризис мог привести к глобальной ядерной войне. Кубинцы называют его «Октябрьским кризисом» (), в США распространено название «Кубинский ракетный кризис» ().'

In [15]:
# Add the chat-formatted 'text' field to every row, using a single process.
dataset = dataset.map(format_chat_template, num_proc=1)

Map: 100%|██████████| 5935/5935 [00:00<00:00, 6299.91 examples/s]


## Shuffle and split (subsampling currently disabled)

In [16]:
# Shuffle with a fixed seed. The trailing commented-out `.select(...)` would
# restrict the data to the first 10_000 shuffled rows.
dataset_sh = dataset.shuffle(
    seed=2024,
)  # .select(range(10_000))
# Show the features and row count of the shuffled dataset.
dataset_sh

Dataset({
    features: ['query_id', 'query', 'positive_passages', 'text'],
    num_rows: 5935
})

In [17]:
# Hold out 10% of the rows for evaluation. The seed makes the split
# reproducible across kernel restarts, matching the seeded shuffle.
dataset_sh = dataset_sh.train_test_split(test_size=0.1, seed=2024)

In [18]:
# Keep only the chat-formatted 'text' column; drop the source fields.
dataset_sh = dataset_sh.remove_columns(['positive_passages', 'query', 'query_id'])

In [19]:
# Display the train/test splits with their remaining columns and row counts.
dataset_sh

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 5341
    })
    test: Dataset({
        features: ['text'],
        num_rows: 594
    })
})

# Train model

## Training arguments

In [20]:
# Optimisation, evaluation and logging schedule for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=cfg.new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Two accumulated micro-batches of one sample each per optimiser step.
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
#     num_train_epochs=1,
    # The run length is set in optimiser steps rather than epochs.
    max_steps=5000,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=20,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    group_by_length=True,
    report_to="wandb",
    # Name the run after the model being produced so the tracked run can be
    # matched to its output directory.
    run_name=cfg.new_model,
)

In [21]:
# Print the tokenizer length again right before the trainer is built.
print(len(tokenizer))

151665


## Build the trainer and run training

In [22]:
# Supervised fine-tuning trainer over the chat-formatted 'text' column.
# Packing is off, and the maximum sequence length is set to 512.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset_sh["train"],
    eval_dataset=dataset_sh["test"],
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 5341/5341 [00:01<00:00, 3018.79 examples/s]
Map: 100%|██████████| 594/594 [00:00<00:00, 2795.89 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [23]:
# Start fine-tuning; loss and evaluation metrics are logged as dicts below.
trainer.train()

  0%|          | 20/5000 [00:14<57:12,  1.45it/s] 

{'loss': 1.9043, 'grad_norm': 1.0541912317276, 'learning_rate': 0.0001995991983967936, 'epoch': 0.01}


  1%|          | 40/5000 [00:28<56:16,  1.47it/s]

{'loss': 1.722, 'grad_norm': 2.072873115539551, 'learning_rate': 0.00019879759519038077, 'epoch': 0.01}


  1%|          | 60/5000 [00:42<56:43,  1.45it/s]

{'loss': 1.554, 'grad_norm': 1.0307807922363281, 'learning_rate': 0.00019799599198396794, 'epoch': 0.02}


  2%|▏         | 80/5000 [00:55<55:43,  1.47it/s]

{'loss': 1.8057, 'grad_norm': 1.3169037103652954, 'learning_rate': 0.0001971943887775551, 'epoch': 0.03}


  2%|▏         | 100/5000 [01:09<55:25,  1.47it/s]

{'loss': 1.4794, 'grad_norm': 2.464627981185913, 'learning_rate': 0.0001963927855711423, 'epoch': 0.04}


  2%|▏         | 120/5000 [01:23<55:31,  1.47it/s]

{'loss': 1.7326, 'grad_norm': 1.291088581085205, 'learning_rate': 0.00019559118236472947, 'epoch': 0.04}


  3%|▎         | 140/5000 [01:36<55:09,  1.47it/s]

{'loss': 1.6255, 'grad_norm': 1.6883840560913086, 'learning_rate': 0.00019478957915831664, 'epoch': 0.05}


  3%|▎         | 160/5000 [01:50<55:25,  1.46it/s]

{'loss': 1.5661, 'grad_norm': 0.9823126196861267, 'learning_rate': 0.0001939879759519038, 'epoch': 0.06}


  4%|▎         | 180/5000 [02:04<54:45,  1.47it/s]

{'loss': 1.6617, 'grad_norm': 1.3230966329574585, 'learning_rate': 0.000193186372745491, 'epoch': 0.07}


  4%|▍         | 200/5000 [02:17<54:16,  1.47it/s]

{'loss': 1.5171, 'grad_norm': 1.5884487628936768, 'learning_rate': 0.00019238476953907816, 'epoch': 0.07}


                                                  
  4%|▍         | 200/5000 [03:41<54:16,  1.47it/s]

{'eval_loss': 1.6288844347000122, 'eval_runtime': 84.0188, 'eval_samples_per_second': 7.07, 'eval_steps_per_second': 7.07, 'epoch': 0.07}


  4%|▍         | 220/5000 [03:55<56:58,  1.40it/s]   

{'loss': 1.7859, 'grad_norm': 1.0874289274215698, 'learning_rate': 0.00019158316633266533, 'epoch': 0.08}


  5%|▍         | 240/5000 [04:09<53:53,  1.47it/s]

{'loss': 1.5485, 'grad_norm': 1.6375373601913452, 'learning_rate': 0.0001907815631262525, 'epoch': 0.09}


  5%|▌         | 260/5000 [04:22<54:10,  1.46it/s]

{'loss': 1.5493, 'grad_norm': 1.0219361782073975, 'learning_rate': 0.00018997995991983967, 'epoch': 0.1}


  6%|▌         | 280/5000 [04:36<53:38,  1.47it/s]

{'loss': 1.6642, 'grad_norm': 1.627577543258667, 'learning_rate': 0.00018917835671342686, 'epoch': 0.1}


  6%|▌         | 300/5000 [04:50<53:19,  1.47it/s]

{'loss': 1.5032, 'grad_norm': 1.8321025371551514, 'learning_rate': 0.00018837675350701405, 'epoch': 0.11}


  6%|▋         | 320/5000 [05:03<53:36,  1.46it/s]

{'loss': 1.6663, 'grad_norm': 1.2727171182632446, 'learning_rate': 0.00018757515030060122, 'epoch': 0.12}


  7%|▋         | 340/5000 [05:17<52:53,  1.47it/s]

{'loss': 1.5674, 'grad_norm': 1.516869306564331, 'learning_rate': 0.0001867735470941884, 'epoch': 0.13}


  7%|▋         | 360/5000 [05:31<53:06,  1.46it/s]

{'loss': 1.6258, 'grad_norm': 1.1156344413757324, 'learning_rate': 0.00018597194388777556, 'epoch': 0.13}


  8%|▊         | 380/5000 [05:44<52:33,  1.46it/s]

{'loss': 1.6553, 'grad_norm': 1.3841392993927002, 'learning_rate': 0.00018517034068136275, 'epoch': 0.14}


  8%|▊         | 400/5000 [05:58<51:55,  1.48it/s]

{'loss': 1.4772, 'grad_norm': 2.5458805561065674, 'learning_rate': 0.00018436873747494992, 'epoch': 0.15}


                                                  
  8%|▊         | 400/5000 [07:22<51:55,  1.48it/s]

{'eval_loss': 1.6064434051513672, 'eval_runtime': 83.6746, 'eval_samples_per_second': 7.099, 'eval_steps_per_second': 7.099, 'epoch': 0.15}


  8%|▊         | 420/5000 [07:35<54:24,  1.40it/s]   

{'loss': 1.6446, 'grad_norm': 1.4299770593643188, 'learning_rate': 0.00018356713426853708, 'epoch': 0.16}


  9%|▉         | 440/5000 [07:49<51:32,  1.47it/s]

{'loss': 1.5958, 'grad_norm': 1.955458402633667, 'learning_rate': 0.00018276553106212425, 'epoch': 0.16}


  9%|▉         | 460/5000 [08:03<51:54,  1.46it/s]

{'loss': 1.5323, 'grad_norm': 0.9640012979507446, 'learning_rate': 0.00018196392785571145, 'epoch': 0.17}


 10%|▉         | 480/5000 [08:16<51:17,  1.47it/s]

{'loss': 1.7115, 'grad_norm': 1.398743987083435, 'learning_rate': 0.0001811623246492986, 'epoch': 0.18}


 10%|█         | 500/5000 [08:30<50:53,  1.47it/s]

{'loss': 1.5177, 'grad_norm': 2.3355050086975098, 'learning_rate': 0.00018036072144288578, 'epoch': 0.19}


 10%|█         | 520/5000 [08:47<51:18,  1.46it/s]  

{'loss': 1.7811, 'grad_norm': 1.2081446647644043, 'learning_rate': 0.00017955911823647295, 'epoch': 0.19}


 11%|█         | 540/5000 [09:01<50:36,  1.47it/s]

{'loss': 1.6713, 'grad_norm': 1.564025640487671, 'learning_rate': 0.00017875751503006014, 'epoch': 0.2}


 11%|█         | 560/5000 [09:15<50:49,  1.46it/s]

{'loss': 1.5712, 'grad_norm': 1.161700963973999, 'learning_rate': 0.0001779559118236473, 'epoch': 0.21}


 12%|█▏        | 580/5000 [09:29<50:10,  1.47it/s]

{'loss': 1.6899, 'grad_norm': 1.1293479204177856, 'learning_rate': 0.00017715430861723447, 'epoch': 0.22}


 12%|█▏        | 600/5000 [09:42<49:46,  1.47it/s]

{'loss': 1.4668, 'grad_norm': 1.971906065940857, 'learning_rate': 0.00017635270541082164, 'epoch': 0.22}


                                                  
 12%|█▏        | 600/5000 [11:06<49:46,  1.47it/s]

{'eval_loss': 1.5881478786468506, 'eval_runtime': 83.5701, 'eval_samples_per_second': 7.108, 'eval_steps_per_second': 7.108, 'epoch': 0.22}


 12%|█▏        | 620/5000 [11:20<52:00,  1.40it/s]   

{'loss': 1.6687, 'grad_norm': 0.9735192060470581, 'learning_rate': 0.00017555110220440884, 'epoch': 0.23}


 13%|█▎        | 640/5000 [11:33<49:14,  1.48it/s]

{'loss': 1.5495, 'grad_norm': 1.4759401082992554, 'learning_rate': 0.000174749498997996, 'epoch': 0.24}


 13%|█▎        | 660/5000 [11:47<49:36,  1.46it/s]

{'loss': 1.4846, 'grad_norm': 1.1133471727371216, 'learning_rate': 0.00017394789579158317, 'epoch': 0.25}


 14%|█▎        | 680/5000 [12:00<49:00,  1.47it/s]

{'loss': 1.6521, 'grad_norm': 1.1063028573989868, 'learning_rate': 0.00017314629258517034, 'epoch': 0.25}


 14%|█▍        | 700/5000 [12:14<48:46,  1.47it/s]

{'loss': 1.4472, 'grad_norm': 2.6171388626098633, 'learning_rate': 0.0001723446893787575, 'epoch': 0.26}


 14%|█▍        | 720/5000 [12:28<48:52,  1.46it/s]

{'loss': 1.7503, 'grad_norm': 1.0857268571853638, 'learning_rate': 0.0001715430861723447, 'epoch': 0.27}


 15%|█▍        | 740/5000 [12:41<48:13,  1.47it/s]

{'loss': 1.5318, 'grad_norm': 1.5839183330535889, 'learning_rate': 0.00017074148296593187, 'epoch': 0.28}


 15%|█▌        | 760/5000 [12:55<48:30,  1.46it/s]

{'loss': 1.5601, 'grad_norm': 1.0815448760986328, 'learning_rate': 0.00016993987975951903, 'epoch': 0.28}


 16%|█▌        | 780/5000 [13:09<47:54,  1.47it/s]

{'loss': 1.6634, 'grad_norm': 1.246779203414917, 'learning_rate': 0.0001691382765531062, 'epoch': 0.29}


 16%|█▌        | 800/5000 [13:22<47:35,  1.47it/s]

{'loss': 1.4713, 'grad_norm': 2.5761847496032715, 'learning_rate': 0.0001683366733466934, 'epoch': 0.3}


                                                  
 16%|█▌        | 800/5000 [14:46<47:35,  1.47it/s]

{'eval_loss': 1.579858660697937, 'eval_runtime': 83.3788, 'eval_samples_per_second': 7.124, 'eval_steps_per_second': 7.124, 'epoch': 0.3}


 16%|█▋        | 820/5000 [15:00<49:56,  1.40it/s]   

{'loss': 1.7404, 'grad_norm': 1.0723878145217896, 'learning_rate': 0.00016753507014028056, 'epoch': 0.31}


 17%|█▋        | 840/5000 [15:13<47:02,  1.47it/s]

{'loss': 1.5773, 'grad_norm': 1.2389100790023804, 'learning_rate': 0.00016673346693386773, 'epoch': 0.31}


 17%|█▋        | 860/5000 [15:27<47:25,  1.46it/s]

{'loss': 1.474, 'grad_norm': 1.0127276182174683, 'learning_rate': 0.00016593186372745492, 'epoch': 0.32}


 18%|█▊        | 880/5000 [15:40<46:37,  1.47it/s]

{'loss': 1.6265, 'grad_norm': 1.3076599836349487, 'learning_rate': 0.0001651302605210421, 'epoch': 0.33}


 18%|█▊        | 900/5000 [15:54<46:31,  1.47it/s]

{'loss': 1.3396, 'grad_norm': 1.7678700685501099, 'learning_rate': 0.00016432865731462928, 'epoch': 0.34}


 18%|█▊        | 920/5000 [16:08<46:32,  1.46it/s]

{'loss': 1.647, 'grad_norm': 0.9690725803375244, 'learning_rate': 0.00016352705410821645, 'epoch': 0.34}


 19%|█▉        | 940/5000 [16:22<45:59,  1.47it/s]

{'loss': 1.5366, 'grad_norm': 1.5145351886749268, 'learning_rate': 0.00016272545090180362, 'epoch': 0.35}


 19%|█▉        | 960/5000 [16:35<46:09,  1.46it/s]

{'loss': 1.6242, 'grad_norm': 1.7017486095428467, 'learning_rate': 0.00016192384769539078, 'epoch': 0.36}


 20%|█▉        | 980/5000 [16:49<45:42,  1.47it/s]

{'loss': 1.5289, 'grad_norm': 1.110245704650879, 'learning_rate': 0.00016112224448897798, 'epoch': 0.37}


 20%|██        | 1000/5000 [17:02<45:17,  1.47it/s]

{'loss': 1.48, 'grad_norm': 1.8722481727600098, 'learning_rate': 0.00016032064128256515, 'epoch': 0.37}


                                                   
 20%|██        | 1000/5000 [18:26<45:17,  1.47it/s]

{'eval_loss': 1.5687843561172485, 'eval_runtime': 83.4616, 'eval_samples_per_second': 7.117, 'eval_steps_per_second': 7.117, 'epoch': 0.37}


 20%|██        | 1020/5000 [18:40<47:26,  1.40it/s]   

{'loss': 1.6802, 'grad_norm': 0.9624963998794556, 'learning_rate': 0.0001595190380761523, 'epoch': 0.38}


 21%|██        | 1040/5000 [18:54<44:51,  1.47it/s]

{'loss': 1.5803, 'grad_norm': 1.5215131044387817, 'learning_rate': 0.00015871743486973948, 'epoch': 0.39}


 21%|██        | 1060/5000 [19:08<44:56,  1.46it/s]

{'loss': 1.4768, 'grad_norm': 1.3324047327041626, 'learning_rate': 0.00015791583166332667, 'epoch': 0.4}


 22%|██▏       | 1080/5000 [19:21<44:24,  1.47it/s]

{'loss': 1.6579, 'grad_norm': 1.3307372331619263, 'learning_rate': 0.00015711422845691384, 'epoch': 0.4}


 22%|██▏       | 1100/5000 [19:35<44:02,  1.48it/s]

{'loss': 1.4496, 'grad_norm': 2.021751642227173, 'learning_rate': 0.000156312625250501, 'epoch': 0.41}


 22%|██▏       | 1120/5000 [19:49<44:18,  1.46it/s]

{'loss': 1.6927, 'grad_norm': 1.1367692947387695, 'learning_rate': 0.00015551102204408818, 'epoch': 0.42}


 23%|██▎       | 1140/5000 [20:02<43:52,  1.47it/s]

{'loss': 1.5586, 'grad_norm': 1.8711525201797485, 'learning_rate': 0.00015470941883767537, 'epoch': 0.43}


 23%|██▎       | 1160/5000 [20:16<43:52,  1.46it/s]

{'loss': 1.5158, 'grad_norm': 1.0642542839050293, 'learning_rate': 0.00015390781563126254, 'epoch': 0.43}


 24%|██▎       | 1180/5000 [20:30<43:16,  1.47it/s]

{'loss': 1.6944, 'grad_norm': 1.1477900743484497, 'learning_rate': 0.0001531062124248497, 'epoch': 0.44}


 24%|██▍       | 1200/5000 [20:43<43:03,  1.47it/s]

{'loss': 1.5182, 'grad_norm': 1.5678701400756836, 'learning_rate': 0.00015230460921843687, 'epoch': 0.45}


                                                   
 24%|██▍       | 1200/5000 [22:07<43:03,  1.47it/s]

{'eval_loss': 1.558111310005188, 'eval_runtime': 83.3627, 'eval_samples_per_second': 7.125, 'eval_steps_per_second': 7.125, 'epoch': 0.45}


 24%|██▍       | 1220/5000 [22:20<45:09,  1.39it/s]   

{'loss': 1.6791, 'grad_norm': 1.2056505680084229, 'learning_rate': 0.00015150300601202404, 'epoch': 0.46}


 25%|██▍       | 1240/5000 [22:34<42:34,  1.47it/s]

{'loss': 1.5597, 'grad_norm': 1.6304683685302734, 'learning_rate': 0.00015070140280561123, 'epoch': 0.46}


 25%|██▌       | 1260/5000 [22:48<42:47,  1.46it/s]

{'loss': 1.4971, 'grad_norm': 1.1565040349960327, 'learning_rate': 0.0001498997995991984, 'epoch': 0.47}


 26%|██▌       | 1280/5000 [23:01<42:08,  1.47it/s]

{'loss': 1.6517, 'grad_norm': 1.1526302099227905, 'learning_rate': 0.00014909819639278557, 'epoch': 0.48}


 26%|██▌       | 1300/5000 [23:15<41:56,  1.47it/s]

{'loss': 1.2761, 'grad_norm': 1.969957709312439, 'learning_rate': 0.00014829659318637273, 'epoch': 0.49}


 26%|██▋       | 1320/5000 [23:29<41:59,  1.46it/s]

{'loss': 1.7298, 'grad_norm': 1.1366448402404785, 'learning_rate': 0.00014749498997995993, 'epoch': 0.49}


 27%|██▋       | 1340/5000 [23:42<41:22,  1.47it/s]

{'loss': 1.5363, 'grad_norm': 1.5389230251312256, 'learning_rate': 0.0001466933867735471, 'epoch': 0.5}


 27%|██▋       | 1360/5000 [23:56<41:32,  1.46it/s]

{'loss': 1.438, 'grad_norm': 1.4921152591705322, 'learning_rate': 0.00014589178356713426, 'epoch': 0.51}


 28%|██▊       | 1380/5000 [24:10<41:01,  1.47it/s]

{'loss': 1.7374, 'grad_norm': 1.3817801475524902, 'learning_rate': 0.00014509018036072143, 'epoch': 0.52}


 28%|██▊       | 1400/5000 [24:23<40:45,  1.47it/s]

{'loss': 1.3932, 'grad_norm': 2.279097080230713, 'learning_rate': 0.00014428857715430862, 'epoch': 0.52}


                                                   
 28%|██▊       | 1400/5000 [25:47<40:45,  1.47it/s]

{'eval_loss': 1.5473897457122803, 'eval_runtime': 83.3465, 'eval_samples_per_second': 7.127, 'eval_steps_per_second': 7.127, 'epoch': 0.52}


 28%|██▊       | 1420/5000 [26:00<42:40,  1.40it/s]   

{'loss': 1.6431, 'grad_norm': 0.9636339545249939, 'learning_rate': 0.00014348697394789582, 'epoch': 0.53}


 29%|██▉       | 1440/5000 [26:14<40:26,  1.47it/s]

{'loss': 1.5556, 'grad_norm': 1.8030067682266235, 'learning_rate': 0.00014268537074148298, 'epoch': 0.54}


 29%|██▉       | 1460/5000 [26:28<40:28,  1.46it/s]

{'loss': 1.485, 'grad_norm': 0.9336861371994019, 'learning_rate': 0.00014188376753507015, 'epoch': 0.55}


 30%|██▉       | 1480/5000 [26:41<39:58,  1.47it/s]

{'loss': 1.5434, 'grad_norm': 1.4420720338821411, 'learning_rate': 0.00014108216432865732, 'epoch': 0.55}


 30%|███       | 1500/5000 [26:55<40:15,  1.45it/s]

{'loss': 1.3811, 'grad_norm': 1.9901541471481323, 'learning_rate': 0.0001402805611222445, 'epoch': 0.56}


 30%|███       | 1520/5000 [27:09<39:51,  1.46it/s]

{'loss': 1.6632, 'grad_norm': 1.3771893978118896, 'learning_rate': 0.00013947895791583168, 'epoch': 0.57}


 31%|███       | 1540/5000 [27:23<38:15,  1.51it/s]

{'loss': 1.6033, 'grad_norm': 1.461626410484314, 'learning_rate': 0.00013867735470941885, 'epoch': 0.58}


 31%|███       | 1560/5000 [27:37<40:23,  1.42it/s]

{'loss': 1.5016, 'grad_norm': 1.3385494947433472, 'learning_rate': 0.00013787575150300601, 'epoch': 0.58}


 32%|███▏      | 1580/5000 [27:51<39:12,  1.45it/s]

{'loss': 1.6584, 'grad_norm': 1.1523663997650146, 'learning_rate': 0.0001370741482965932, 'epoch': 0.59}


 32%|███▏      | 1600/5000 [28:05<38:47,  1.46it/s]

{'loss': 1.4261, 'grad_norm': 2.146028995513916, 'learning_rate': 0.00013627254509018038, 'epoch': 0.6}


                                                   
 32%|███▏      | 1600/5000 [29:30<38:47,  1.46it/s]

{'eval_loss': 1.5338919162750244, 'eval_runtime': 85.2954, 'eval_samples_per_second': 6.964, 'eval_steps_per_second': 6.964, 'epoch': 0.6}


 32%|███▏      | 1620/5000 [29:44<41:21,  1.36it/s]   

{'loss': 1.6695, 'grad_norm': 1.0545390844345093, 'learning_rate': 0.00013547094188376754, 'epoch': 0.61}


 33%|███▎      | 1640/5000 [29:58<38:14,  1.46it/s]

{'loss': 1.5556, 'grad_norm': 1.6395013332366943, 'learning_rate': 0.0001346693386773547, 'epoch': 0.61}


 33%|███▎      | 1660/5000 [30:12<39:11,  1.42it/s]

{'loss': 1.4413, 'grad_norm': 0.9542720913887024, 'learning_rate': 0.0001338677354709419, 'epoch': 0.62}


 34%|███▎      | 1680/5000 [30:26<37:59,  1.46it/s]

{'loss': 1.6816, 'grad_norm': 1.4211734533309937, 'learning_rate': 0.00013306613226452907, 'epoch': 0.63}


 34%|███▍      | 1700/5000 [30:39<37:27,  1.47it/s]

{'loss': 1.4445, 'grad_norm': 1.793872356414795, 'learning_rate': 0.00013226452905811624, 'epoch': 0.64}


 34%|███▍      | 1720/5000 [30:53<38:23,  1.42it/s]

{'loss': 1.7046, 'grad_norm': 0.9207528233528137, 'learning_rate': 0.0001314629258517034, 'epoch': 0.64}


 35%|███▍      | 1740/5000 [31:07<37:09,  1.46it/s]

{'loss': 1.4233, 'grad_norm': 2.2340400218963623, 'learning_rate': 0.00013066132264529057, 'epoch': 0.65}


 35%|███▌      | 1760/5000 [31:21<37:58,  1.42it/s]

{'loss': 1.4292, 'grad_norm': 1.0795091390609741, 'learning_rate': 0.00012985971943887777, 'epoch': 0.66}


 36%|███▌      | 1780/5000 [31:35<36:45,  1.46it/s]

{'loss': 1.6053, 'grad_norm': 1.2430328130722046, 'learning_rate': 0.00012905811623246493, 'epoch': 0.67}


 36%|███▌      | 1800/5000 [31:49<38:57,  1.37it/s]

{'loss': 1.3089, 'grad_norm': 2.434885025024414, 'learning_rate': 0.0001282565130260521, 'epoch': 0.67}


                                                   
 36%|███▌      | 1800/5000 [33:14<38:57,  1.37it/s]

{'eval_loss': 1.524707317352295, 'eval_runtime': 85.4848, 'eval_samples_per_second': 6.949, 'eval_steps_per_second': 6.949, 'epoch': 0.67}


 36%|███▋      | 1820/5000 [33:28<38:57,  1.36it/s]   

{'loss': 1.6885, 'grad_norm': 1.0547446012496948, 'learning_rate': 0.00012745490981963927, 'epoch': 0.68}


 37%|███▋      | 1840/5000 [33:42<36:01,  1.46it/s]

{'loss': 1.5364, 'grad_norm': 1.676247239112854, 'learning_rate': 0.00012665330661322646, 'epoch': 0.69}


 37%|███▋      | 1860/5000 [33:56<36:47,  1.42it/s]

{'loss': 1.4809, 'grad_norm': 1.2054561376571655, 'learning_rate': 0.00012585170340681363, 'epoch': 0.7}


 38%|███▊      | 1880/5000 [34:10<35:43,  1.46it/s]

{'loss': 1.5993, 'grad_norm': 1.3020884990692139, 'learning_rate': 0.0001250501002004008, 'epoch': 0.7}


 38%|███▊      | 1900/5000 [34:23<35:17,  1.46it/s]

{'loss': 1.4, 'grad_norm': 2.064380407333374, 'learning_rate': 0.00012424849699398796, 'epoch': 0.71}


 38%|███▊      | 1920/5000 [34:38<36:08,  1.42it/s]

{'loss': 1.5846, 'grad_norm': 1.1366585493087769, 'learning_rate': 0.00012344689378757516, 'epoch': 0.72}


 39%|███▉      | 1940/5000 [34:51<34:49,  1.46it/s]

{'loss': 1.4979, 'grad_norm': 1.557286262512207, 'learning_rate': 0.00012264529058116232, 'epoch': 0.73}


 39%|███▉      | 1960/5000 [35:05<35:42,  1.42it/s]

{'loss': 1.5257, 'grad_norm': 1.1726192235946655, 'learning_rate': 0.00012184368737474952, 'epoch': 0.73}


 40%|███▉      | 1980/5000 [35:19<34:36,  1.45it/s]

{'loss': 1.6307, 'grad_norm': 1.4017503261566162, 'learning_rate': 0.00012104208416833669, 'epoch': 0.74}


 40%|████      | 2000/5000 [35:33<34:08,  1.46it/s]

{'loss': 1.3188, 'grad_norm': 3.7888283729553223, 'learning_rate': 0.00012024048096192387, 'epoch': 0.75}


                                                   
 40%|████      | 2000/5000 [36:58<34:08,  1.46it/s]

{'eval_loss': 1.5191594362258911, 'eval_runtime': 85.295, 'eval_samples_per_second': 6.964, 'eval_steps_per_second': 6.964, 'epoch': 0.75}


 40%|████      | 2020/5000 [37:13<36:23,  1.36it/s]   

{'loss': 1.6589, 'grad_norm': 1.1343111991882324, 'learning_rate': 0.00011943887775551103, 'epoch': 0.76}


 41%|████      | 2040/5000 [37:27<33:46,  1.46it/s]

{'loss': 1.5446, 'grad_norm': 1.4205430746078491, 'learning_rate': 0.00011863727454909821, 'epoch': 0.76}


 41%|████      | 2060/5000 [37:41<34:30,  1.42it/s]

{'loss': 1.5921, 'grad_norm': 1.1528855562210083, 'learning_rate': 0.00011783567134268538, 'epoch': 0.77}


 42%|████▏     | 2080/5000 [37:55<33:27,  1.45it/s]

{'loss': 1.6286, 'grad_norm': 1.454329252243042, 'learning_rate': 0.00011703406813627256, 'epoch': 0.78}


 42%|████▏     | 2100/5000 [38:08<33:01,  1.46it/s]

{'loss': 1.3517, 'grad_norm': 2.458158016204834, 'learning_rate': 0.00011623246492985973, 'epoch': 0.79}


 42%|████▏     | 2120/5000 [38:23<33:44,  1.42it/s]

{'loss': 1.5771, 'grad_norm': 1.0197527408599854, 'learning_rate': 0.00011543086172344691, 'epoch': 0.79}


 43%|████▎     | 2140/5000 [38:36<32:35,  1.46it/s]

{'loss': 1.4286, 'grad_norm': 1.8009154796600342, 'learning_rate': 0.00011462925851703408, 'epoch': 0.8}


 43%|████▎     | 2160/5000 [38:50<33:13,  1.42it/s]

{'loss': 1.4654, 'grad_norm': 1.0570141077041626, 'learning_rate': 0.00011382765531062126, 'epoch': 0.81}


 44%|████▎     | 2180/5000 [39:04<32:11,  1.46it/s]

{'loss': 1.523, 'grad_norm': 1.7726699113845825, 'learning_rate': 0.00011302605210420842, 'epoch': 0.82}


 44%|████▍     | 2200/5000 [39:18<31:46,  1.47it/s]

{'loss': 1.2127, 'grad_norm': 1.8036680221557617, 'learning_rate': 0.00011222444889779559, 'epoch': 0.82}


                                                   
 44%|████▍     | 2200/5000 [40:43<31:46,  1.47it/s]

{'eval_loss': 1.512000322341919, 'eval_runtime': 85.3873, 'eval_samples_per_second': 6.957, 'eval_steps_per_second': 6.957, 'epoch': 0.82}


 44%|████▍     | 2220/5000 [40:57<33:56,  1.37it/s]   

{'loss': 1.6248, 'grad_norm': 1.1107584238052368, 'learning_rate': 0.00011142284569138277, 'epoch': 0.83}


 45%|████▍     | 2240/5000 [41:11<31:23,  1.47it/s]

{'loss': 1.5368, 'grad_norm': 1.9869110584259033, 'learning_rate': 0.00011062124248496994, 'epoch': 0.84}


 45%|████▌     | 2260/5000 [41:25<32:11,  1.42it/s]

{'loss': 1.4358, 'grad_norm': 1.0753988027572632, 'learning_rate': 0.00010981963927855712, 'epoch': 0.85}


 46%|████▌     | 2280/5000 [41:39<30:56,  1.46it/s]

{'loss': 1.6695, 'grad_norm': 1.315610408782959, 'learning_rate': 0.00010901803607214429, 'epoch': 0.85}


 46%|████▌     | 2300/5000 [41:52<30:47,  1.46it/s]

{'loss': 1.4014, 'grad_norm': 2.7059433460235596, 'learning_rate': 0.00010821643286573147, 'epoch': 0.86}


 46%|████▋     | 2320/5000 [42:06<31:24,  1.42it/s]

{'loss': 1.6108, 'grad_norm': 1.0656530857086182, 'learning_rate': 0.00010741482965931863, 'epoch': 0.87}


 47%|████▋     | 2340/5000 [42:20<30:28,  1.45it/s]

{'loss': 1.5003, 'grad_norm': 1.4567065238952637, 'learning_rate': 0.00010661322645290582, 'epoch': 0.88}


 47%|████▋     | 2360/5000 [42:34<30:53,  1.42it/s]

{'loss': 1.4493, 'grad_norm': 1.2955718040466309, 'learning_rate': 0.00010581162324649298, 'epoch': 0.88}


 48%|████▊     | 2380/5000 [42:48<29:53,  1.46it/s]

{'loss': 1.6511, 'grad_norm': 1.2045758962631226, 'learning_rate': 0.00010501002004008016, 'epoch': 0.89}


 48%|████▊     | 2400/5000 [43:01<29:31,  1.47it/s]

{'loss': 1.385, 'grad_norm': 2.665609121322632, 'learning_rate': 0.00010420841683366733, 'epoch': 0.9}


                                                   
 48%|████▊     | 2400/5000 [44:27<29:31,  1.47it/s]

{'eval_loss': 1.501488447189331, 'eval_runtime': 85.4324, 'eval_samples_per_second': 6.953, 'eval_steps_per_second': 6.953, 'epoch': 0.9}


 48%|████▊     | 2420/5000 [44:41<31:48,  1.35it/s]   

{'loss': 1.6903, 'grad_norm': 1.018442988395691, 'learning_rate': 0.00010340681362725451, 'epoch': 0.91}


 49%|████▉     | 2440/5000 [44:55<29:13,  1.46it/s]

{'loss': 1.5172, 'grad_norm': 2.3696532249450684, 'learning_rate': 0.00010260521042084168, 'epoch': 0.91}


 49%|████▉     | 2460/5000 [45:09<29:47,  1.42it/s]

{'loss': 1.4301, 'grad_norm': 1.1025930643081665, 'learning_rate': 0.00010180360721442886, 'epoch': 0.92}


 50%|████▉     | 2480/5000 [45:23<28:54,  1.45it/s]

{'loss': 1.6441, 'grad_norm': 1.0844132900238037, 'learning_rate': 0.00010100200400801603, 'epoch': 0.93}


 50%|█████     | 2500/5000 [45:36<28:25,  1.47it/s]

{'loss': 1.3241, 'grad_norm': 2.5800840854644775, 'learning_rate': 0.0001002004008016032, 'epoch': 0.94}


 50%|█████     | 2520/5000 [45:51<29:05,  1.42it/s]

{'loss': 1.7339, 'grad_norm': 1.233261227607727, 'learning_rate': 9.939879759519039e-05, 'epoch': 0.94}


 51%|█████     | 2540/5000 [46:05<28:09,  1.46it/s]

{'loss': 1.5417, 'grad_norm': 1.609622597694397, 'learning_rate': 9.859719438877755e-05, 'epoch': 0.95}


 51%|█████     | 2560/5000 [46:19<28:36,  1.42it/s]

{'loss': 1.4683, 'grad_norm': 1.108019471168518, 'learning_rate': 9.779559118236473e-05, 'epoch': 0.96}


 52%|█████▏    | 2580/5000 [46:33<28:04,  1.44it/s]

{'loss': 1.5761, 'grad_norm': 1.2064259052276611, 'learning_rate': 9.69939879759519e-05, 'epoch': 0.97}


 52%|█████▏    | 2600/5000 [46:47<27:13,  1.47it/s]

{'loss': 1.3874, 'grad_norm': 1.808660626411438, 'learning_rate': 9.619238476953908e-05, 'epoch': 0.97}


                                                   
 52%|█████▏    | 2600/5000 [48:12<27:13,  1.47it/s]

{'eval_loss': 1.494887113571167, 'eval_runtime': 85.2928, 'eval_samples_per_second': 6.964, 'eval_steps_per_second': 6.964, 'epoch': 0.97}


 52%|█████▏    | 2620/5000 [48:26<29:02,  1.37it/s]   

{'loss': 1.6834, 'grad_norm': 1.2946499586105347, 'learning_rate': 9.539078156312625e-05, 'epoch': 0.98}


 53%|█████▎    | 2640/5000 [48:40<26:50,  1.47it/s]

{'loss': 1.5213, 'grad_norm': 1.313568353652954, 'learning_rate': 9.458917835671343e-05, 'epoch': 0.99}


 53%|█████▎    | 2660/5000 [48:54<27:26,  1.42it/s]

{'loss': 1.4112, 'grad_norm': 1.097413420677185, 'learning_rate': 9.378757515030061e-05, 'epoch': 1.0}


 54%|█████▎    | 2680/5000 [49:08<27:12,  1.42it/s]

{'loss': 1.4424, 'grad_norm': 1.0071860551834106, 'learning_rate': 9.298597194388778e-05, 'epoch': 1.0}


 54%|█████▍    | 2700/5000 [49:21<26:23,  1.45it/s]

{'loss': 1.2859, 'grad_norm': 1.5696388483047485, 'learning_rate': 9.218436873747496e-05, 'epoch': 1.01}


 54%|█████▍    | 2720/5000 [49:35<25:54,  1.47it/s]

{'loss': 1.0605, 'grad_norm': 2.02787184715271, 'learning_rate': 9.138276553106213e-05, 'epoch': 1.02}


 55%|█████▍    | 2740/5000 [49:49<26:11,  1.44it/s]

{'loss': 1.3061, 'grad_norm': 1.012282133102417, 'learning_rate': 9.05811623246493e-05, 'epoch': 1.03}


 55%|█████▌    | 2760/5000 [50:03<25:36,  1.46it/s]

{'loss': 1.2411, 'grad_norm': 1.2875914573669434, 'learning_rate': 8.977955911823647e-05, 'epoch': 1.03}


 56%|█████▌    | 2780/5000 [50:17<26:04,  1.42it/s]

{'loss': 1.1356, 'grad_norm': 1.1595932245254517, 'learning_rate': 8.897795591182365e-05, 'epoch': 1.04}


 56%|█████▌    | 2800/5000 [50:31<25:10,  1.46it/s]

{'loss': 1.3921, 'grad_norm': 1.334858775138855, 'learning_rate': 8.817635270541082e-05, 'epoch': 1.05}


                                                   
 56%|█████▌    | 2800/5000 [51:56<25:10,  1.46it/s]

{'eval_loss': 1.4995152950286865, 'eval_runtime': 85.3665, 'eval_samples_per_second': 6.958, 'eval_steps_per_second': 6.958, 'epoch': 1.05}


 56%|█████▋    | 2820/5000 [52:10<25:52,  1.40it/s]   

{'loss': 1.0905, 'grad_norm': 1.852837085723877, 'learning_rate': 8.7374749498998e-05, 'epoch': 1.06}


 57%|█████▋    | 2840/5000 [52:24<25:25,  1.42it/s]

{'loss': 1.3495, 'grad_norm': 1.1645323038101196, 'learning_rate': 8.657314629258517e-05, 'epoch': 1.06}


 57%|█████▋    | 2860/5000 [52:38<24:22,  1.46it/s]

{'loss': 1.252, 'grad_norm': 1.5137531757354736, 'learning_rate': 8.577154308617235e-05, 'epoch': 1.07}


 58%|█████▊    | 2880/5000 [52:52<24:51,  1.42it/s]

{'loss': 1.1444, 'grad_norm': 1.1166441440582275, 'learning_rate': 8.496993987975952e-05, 'epoch': 1.08}


 58%|█████▊    | 2900/5000 [53:06<24:06,  1.45it/s]

{'loss': 1.3389, 'grad_norm': 1.5741355419158936, 'learning_rate': 8.41683366733467e-05, 'epoch': 1.09}


 58%|█████▊    | 2920/5000 [53:19<23:38,  1.47it/s]

{'loss': 1.0452, 'grad_norm': 1.8961611986160278, 'learning_rate': 8.336673346693386e-05, 'epoch': 1.09}


 59%|█████▉    | 2940/5000 [53:34<24:10,  1.42it/s]

{'loss': 1.4036, 'grad_norm': 1.1647696495056152, 'learning_rate': 8.256513026052104e-05, 'epoch': 1.1}


 59%|█████▉    | 2960/5000 [53:47<23:20,  1.46it/s]

{'loss': 1.2493, 'grad_norm': 1.4920778274536133, 'learning_rate': 8.176352705410823e-05, 'epoch': 1.11}


 60%|█████▉    | 2980/5000 [54:01<23:41,  1.42it/s]

{'loss': 1.1145, 'grad_norm': 1.078609824180603, 'learning_rate': 8.096192384769539e-05, 'epoch': 1.12}


 60%|██████    | 3000/5000 [54:15<22:55,  1.45it/s]

{'loss': 1.3354, 'grad_norm': 1.448997974395752, 'learning_rate': 8.016032064128257e-05, 'epoch': 1.12}


                                                   
 60%|██████    | 3000/5000 [55:40<22:55,  1.45it/s]

{'eval_loss': 1.5012785196304321, 'eval_runtime': 85.3772, 'eval_samples_per_second': 6.957, 'eval_steps_per_second': 6.957, 'epoch': 1.12}


 60%|██████    | 3020/5000 [55:55<23:28,  1.41it/s]   

{'loss': 1.0305, 'grad_norm': 2.227388381958008, 'learning_rate': 7.935871743486974e-05, 'epoch': 1.13}


 61%|██████    | 3040/5000 [56:09<23:02,  1.42it/s]

{'loss': 1.2876, 'grad_norm': 1.2470111846923828, 'learning_rate': 7.855711422845692e-05, 'epoch': 1.14}


 61%|██████    | 3060/5000 [56:23<22:08,  1.46it/s]

{'loss': 1.178, 'grad_norm': 1.2259728908538818, 'learning_rate': 7.775551102204409e-05, 'epoch': 1.15}


 62%|██████▏   | 3080/5000 [56:37<22:30,  1.42it/s]

{'loss': 1.1764, 'grad_norm': 1.2819321155548096, 'learning_rate': 7.695390781563127e-05, 'epoch': 1.15}


 62%|██████▏   | 3100/5000 [56:51<21:44,  1.46it/s]

{'loss': 1.3234, 'grad_norm': 1.3805221319198608, 'learning_rate': 7.615230460921844e-05, 'epoch': 1.16}


 62%|██████▏   | 3120/5000 [57:04<21:26,  1.46it/s]

{'loss': 1.0388, 'grad_norm': 2.238760471343994, 'learning_rate': 7.535070140280562e-05, 'epoch': 1.17}


 63%|██████▎   | 3140/5000 [57:18<21:48,  1.42it/s]

{'loss': 1.3123, 'grad_norm': 1.1559170484542847, 'learning_rate': 7.454909819639278e-05, 'epoch': 1.18}


 63%|██████▎   | 3160/5000 [57:32<20:55,  1.47it/s]

{'loss': 1.2444, 'grad_norm': 1.7663027048110962, 'learning_rate': 7.374749498997996e-05, 'epoch': 1.18}


 64%|██████▎   | 3180/5000 [57:46<21:16,  1.43it/s]

{'loss': 1.193, 'grad_norm': 1.1202565431594849, 'learning_rate': 7.294589178356713e-05, 'epoch': 1.19}


 64%|██████▍   | 3200/5000 [58:00<20:30,  1.46it/s]

{'loss': 1.3102, 'grad_norm': 1.1128438711166382, 'learning_rate': 7.214428857715431e-05, 'epoch': 1.2}


                                                   
 64%|██████▍   | 3200/5000 [59:25<20:30,  1.46it/s]

{'eval_loss': 1.4961440563201904, 'eval_runtime': 85.3562, 'eval_samples_per_second': 6.959, 'eval_steps_per_second': 6.959, 'epoch': 1.2}


 64%|██████▍   | 3220/5000 [59:39<21:07,  1.40it/s]   

{'loss': 1.1025, 'grad_norm': 2.514831781387329, 'learning_rate': 7.134268537074149e-05, 'epoch': 1.21}


 65%|██████▍   | 3240/5000 [59:53<20:41,  1.42it/s]

{'loss': 1.3054, 'grad_norm': 1.2085232734680176, 'learning_rate': 7.054108216432866e-05, 'epoch': 1.21}


 65%|██████▌   | 3260/5000 [1:00:07<19:53,  1.46it/s]

{'loss': 1.2688, 'grad_norm': 1.3666778802871704, 'learning_rate': 6.973947895791584e-05, 'epoch': 1.22}


 66%|██████▌   | 3280/5000 [1:00:21<20:10,  1.42it/s]

{'loss': 1.2044, 'grad_norm': 1.215809941291809, 'learning_rate': 6.893787575150301e-05, 'epoch': 1.23}


 66%|██████▌   | 3300/5000 [1:00:35<19:24,  1.46it/s]

{'loss': 1.3457, 'grad_norm': 1.47800612449646, 'learning_rate': 6.813627254509019e-05, 'epoch': 1.24}


 66%|██████▋   | 3320/5000 [1:00:48<19:10,  1.46it/s]

{'loss': 0.9702, 'grad_norm': 2.021838665008545, 'learning_rate': 6.733466933867735e-05, 'epoch': 1.24}


 67%|██████▋   | 3340/5000 [1:01:02<20:10,  1.37it/s]

{'loss': 1.3439, 'grad_norm': 1.2958152294158936, 'learning_rate': 6.653306613226454e-05, 'epoch': 1.25}


 67%|██████▋   | 3360/5000 [1:01:16<18:38,  1.47it/s]

{'loss': 1.2303, 'grad_norm': 2.006469964981079, 'learning_rate': 6.57314629258517e-05, 'epoch': 1.26}


 68%|██████▊   | 3380/5000 [1:01:30<19:00,  1.42it/s]

{'loss': 1.1523, 'grad_norm': 1.1778770685195923, 'learning_rate': 6.492985971943888e-05, 'epoch': 1.27}


 68%|██████▊   | 3400/5000 [1:01:44<18:18,  1.46it/s]

{'loss': 1.3057, 'grad_norm': 1.6713106632232666, 'learning_rate': 6.412825651302605e-05, 'epoch': 1.27}


                                                     
 68%|██████▊   | 3400/5000 [1:03:09<18:18,  1.46it/s]

{'eval_loss': 1.4999918937683105, 'eval_runtime': 85.2729, 'eval_samples_per_second': 6.966, 'eval_steps_per_second': 6.966, 'epoch': 1.27}


 68%|██████▊   | 3420/5000 [1:03:23<18:44,  1.41it/s]   

{'loss': 1.0433, 'grad_norm': 1.3931348323822021, 'learning_rate': 6.332665330661323e-05, 'epoch': 1.28}


 69%|██████▉   | 3440/5000 [1:03:37<18:15,  1.42it/s]

{'loss': 1.403, 'grad_norm': 1.15233314037323, 'learning_rate': 6.25250501002004e-05, 'epoch': 1.29}


 69%|██████▉   | 3460/5000 [1:03:51<17:33,  1.46it/s]

{'loss': 1.2673, 'grad_norm': 1.7938209772109985, 'learning_rate': 6.172344689378758e-05, 'epoch': 1.3}


 70%|██████▉   | 3480/5000 [1:04:04<17:50,  1.42it/s]

{'loss': 1.1613, 'grad_norm': 1.1247414350509644, 'learning_rate': 6.092184368737476e-05, 'epoch': 1.3}


 70%|███████   | 3500/5000 [1:04:18<17:12,  1.45it/s]

{'loss': 1.2758, 'grad_norm': 1.124081015586853, 'learning_rate': 6.012024048096193e-05, 'epoch': 1.31}


 70%|███████   | 3520/5000 [1:04:33<16:51,  1.46it/s]

{'loss': 1.0381, 'grad_norm': 1.6677905321121216, 'learning_rate': 5.931863727454911e-05, 'epoch': 1.32}


 71%|███████   | 3540/5000 [1:04:47<16:55,  1.44it/s]

{'loss': 1.4431, 'grad_norm': 1.1523391008377075, 'learning_rate': 5.851703406813628e-05, 'epoch': 1.33}


 71%|███████   | 3560/5000 [1:05:01<16:27,  1.46it/s]

{'loss': 1.2633, 'grad_norm': 1.9322211742401123, 'learning_rate': 5.7715430861723455e-05, 'epoch': 1.33}


 72%|███████▏  | 3580/5000 [1:05:14<16:37,  1.42it/s]

{'loss': 1.2021, 'grad_norm': 1.1392221450805664, 'learning_rate': 5.691382765531063e-05, 'epoch': 1.34}


 72%|███████▏  | 3600/5000 [1:05:28<16:01,  1.46it/s]

{'loss': 1.3394, 'grad_norm': 1.3434017896652222, 'learning_rate': 5.6112224448897796e-05, 'epoch': 1.35}


                                                     
 72%|███████▏  | 3600/5000 [1:06:54<16:01,  1.46it/s]

{'eval_loss': 1.4943053722381592, 'eval_runtime': 85.4751, 'eval_samples_per_second': 6.949, 'eval_steps_per_second': 6.949, 'epoch': 1.35}


 72%|███████▏  | 3620/5000 [1:07:08<16:20,  1.41it/s]   

{'loss': 1.0259, 'grad_norm': 1.8404897451400757, 'learning_rate': 5.531062124248497e-05, 'epoch': 1.36}


 73%|███████▎  | 3640/5000 [1:07:22<15:59,  1.42it/s]

{'loss': 1.2982, 'grad_norm': 1.2737218141555786, 'learning_rate': 5.4509018036072143e-05, 'epoch': 1.36}


 73%|███████▎  | 3660/5000 [1:07:35<15:13,  1.47it/s]

{'loss': 1.233, 'grad_norm': 1.1587128639221191, 'learning_rate': 5.370741482965932e-05, 'epoch': 1.37}


 74%|███████▎  | 3680/5000 [1:07:49<15:29,  1.42it/s]

{'loss': 1.1973, 'grad_norm': 1.0787349939346313, 'learning_rate': 5.290581162324649e-05, 'epoch': 1.38}


 74%|███████▍  | 3700/5000 [1:08:03<14:52,  1.46it/s]

{'loss': 1.2314, 'grad_norm': 1.4011176824569702, 'learning_rate': 5.2104208416833665e-05, 'epoch': 1.39}


 74%|███████▍  | 3720/5000 [1:08:17<14:33,  1.47it/s]

{'loss': 1.1053, 'grad_norm': 1.9793256521224976, 'learning_rate': 5.130260521042084e-05, 'epoch': 1.39}


 75%|███████▍  | 3740/5000 [1:08:31<14:46,  1.42it/s]

{'loss': 1.3722, 'grad_norm': 1.4529658555984497, 'learning_rate': 5.050100200400801e-05, 'epoch': 1.4}


 75%|███████▌  | 3760/5000 [1:08:45<14:11,  1.46it/s]

{'loss': 1.1795, 'grad_norm': 1.6464844942092896, 'learning_rate': 4.9699398797595193e-05, 'epoch': 1.41}


 76%|███████▌  | 3780/5000 [1:08:59<14:17,  1.42it/s]

{'loss': 1.0982, 'grad_norm': 1.1724449396133423, 'learning_rate': 4.889779559118237e-05, 'epoch': 1.42}


 76%|███████▌  | 3800/5000 [1:09:13<13:43,  1.46it/s]

{'loss': 1.4003, 'grad_norm': 1.518367052078247, 'learning_rate': 4.809619238476954e-05, 'epoch': 1.42}


                                                     
 76%|███████▌  | 3800/5000 [1:10:38<13:43,  1.46it/s]

{'eval_loss': 1.486913800239563, 'eval_runtime': 85.3574, 'eval_samples_per_second': 6.959, 'eval_steps_per_second': 6.959, 'epoch': 1.42}


 76%|███████▋  | 3820/5000 [1:10:52<13:59,  1.40it/s]  

{'loss': 1.1139, 'grad_norm': 1.8046170473098755, 'learning_rate': 4.7294589178356715e-05, 'epoch': 1.43}


 77%|███████▋  | 3840/5000 [1:11:06<13:36,  1.42it/s]

{'loss': 1.3669, 'grad_norm': 1.1580557823181152, 'learning_rate': 4.649298597194389e-05, 'epoch': 1.44}


 77%|███████▋  | 3860/5000 [1:11:20<12:58,  1.46it/s]

{'loss': 1.2755, 'grad_norm': 1.9410029649734497, 'learning_rate': 4.569138276553106e-05, 'epoch': 1.45}


 78%|███████▊  | 3880/5000 [1:11:33<13:08,  1.42it/s]

{'loss': 1.0719, 'grad_norm': 1.2409614324569702, 'learning_rate': 4.488977955911824e-05, 'epoch': 1.45}


 78%|███████▊  | 3900/5000 [1:11:47<12:33,  1.46it/s]

{'loss': 1.3201, 'grad_norm': 1.2931753396987915, 'learning_rate': 4.408817635270541e-05, 'epoch': 1.46}


 78%|███████▊  | 3920/5000 [1:12:01<12:19,  1.46it/s]

{'loss': 1.0579, 'grad_norm': 2.4313032627105713, 'learning_rate': 4.3286573146292584e-05, 'epoch': 1.47}


 79%|███████▉  | 3940/5000 [1:12:15<12:24,  1.42it/s]

{'loss': 1.3382, 'grad_norm': 1.1147030591964722, 'learning_rate': 4.248496993987976e-05, 'epoch': 1.48}


 79%|███████▉  | 3960/5000 [1:12:29<11:52,  1.46it/s]

{'loss': 1.1699, 'grad_norm': 1.561671257019043, 'learning_rate': 4.168336673346693e-05, 'epoch': 1.48}


 80%|███████▉  | 3980/5000 [1:12:43<11:56,  1.42it/s]

{'loss': 1.1372, 'grad_norm': 1.1406221389770508, 'learning_rate': 4.088176352705411e-05, 'epoch': 1.49}


 80%|████████  | 4000/5000 [1:12:57<11:27,  1.46it/s]

{'loss': 1.2629, 'grad_norm': 1.410485863685608, 'learning_rate': 4.0080160320641287e-05, 'epoch': 1.5}


                                                     
 80%|████████  | 4000/5000 [1:14:22<11:27,  1.46it/s]

{'eval_loss': 1.4873018264770508, 'eval_runtime': 85.3309, 'eval_samples_per_second': 6.961, 'eval_steps_per_second': 6.961, 'epoch': 1.5}


 80%|████████  | 4020/5000 [1:14:37<11:38,  1.40it/s]  

{'loss': 1.0498, 'grad_norm': 1.772956371307373, 'learning_rate': 3.927855711422846e-05, 'epoch': 1.51}


 81%|████████  | 4040/5000 [1:14:51<11:05,  1.44it/s]

{'loss': 1.2994, 'grad_norm': 1.289461374282837, 'learning_rate': 3.8476953907815634e-05, 'epoch': 1.51}


 81%|████████  | 4060/5000 [1:15:04<10:42,  1.46it/s]

{'loss': 1.0897, 'grad_norm': 1.1330595016479492, 'learning_rate': 3.767535070140281e-05, 'epoch': 1.52}


 82%|████████▏ | 4080/5000 [1:15:18<10:46,  1.42it/s]

{'loss': 1.0865, 'grad_norm': 1.1841365098953247, 'learning_rate': 3.687374749498998e-05, 'epoch': 1.53}


 82%|████████▏ | 4100/5000 [1:15:32<10:25,  1.44it/s]

{'loss': 1.395, 'grad_norm': 1.4628987312316895, 'learning_rate': 3.6072144288577156e-05, 'epoch': 1.54}


 82%|████████▏ | 4120/5000 [1:15:46<10:01,  1.46it/s]

{'loss': 1.0029, 'grad_norm': 3.100632905960083, 'learning_rate': 3.527054108216433e-05, 'epoch': 1.54}


 83%|████████▎ | 4140/5000 [1:16:00<10:05,  1.42it/s]

{'loss': 1.3917, 'grad_norm': 1.1939431428909302, 'learning_rate': 3.4468937875751504e-05, 'epoch': 1.55}


 83%|████████▎ | 4160/5000 [1:16:14<09:33,  1.46it/s]

{'loss': 1.0509, 'grad_norm': 1.2553926706314087, 'learning_rate': 3.366733466933868e-05, 'epoch': 1.56}


 84%|████████▎ | 4180/5000 [1:16:28<09:36,  1.42it/s]

{'loss': 1.121, 'grad_norm': 1.3704503774642944, 'learning_rate': 3.286573146292585e-05, 'epoch': 1.57}


 84%|████████▍ | 4200/5000 [1:16:41<09:06,  1.46it/s]

{'loss': 1.2249, 'grad_norm': 1.3675533533096313, 'learning_rate': 3.2064128256513025e-05, 'epoch': 1.57}


                                                     
 84%|████████▍ | 4200/5000 [1:18:07<09:06,  1.46it/s]

{'eval_loss': 1.485638976097107, 'eval_runtime': 85.4378, 'eval_samples_per_second': 6.952, 'eval_steps_per_second': 6.952, 'epoch': 1.57}


 84%|████████▍ | 4220/5000 [1:18:21<09:15,  1.40it/s]  

{'loss': 1.083, 'grad_norm': 1.9491581916809082, 'learning_rate': 3.12625250501002e-05, 'epoch': 1.58}


 85%|████████▍ | 4240/5000 [1:18:35<08:55,  1.42it/s]

{'loss': 1.3962, 'grad_norm': 1.1605663299560547, 'learning_rate': 3.046092184368738e-05, 'epoch': 1.59}


 85%|████████▌ | 4260/5000 [1:18:49<08:52,  1.39it/s]

{'loss': 1.275, 'grad_norm': 1.7678169012069702, 'learning_rate': 2.9659318637274554e-05, 'epoch': 1.6}


 86%|████████▌ | 4280/5000 [1:19:02<08:26,  1.42it/s]

{'loss': 1.0809, 'grad_norm': 1.2095184326171875, 'learning_rate': 2.8857715430861727e-05, 'epoch': 1.6}


 86%|████████▌ | 4300/5000 [1:19:16<08:01,  1.45it/s]

{'loss': 1.2859, 'grad_norm': 1.5291186571121216, 'learning_rate': 2.8056112224448898e-05, 'epoch': 1.61}


 86%|████████▋ | 4320/5000 [1:19:30<07:45,  1.46it/s]

{'loss': 1.0687, 'grad_norm': 1.6056430339813232, 'learning_rate': 2.7254509018036072e-05, 'epoch': 1.62}


 87%|████████▋ | 4340/5000 [1:19:44<07:45,  1.42it/s]

{'loss': 1.3173, 'grad_norm': 1.357456088066101, 'learning_rate': 2.6452905811623246e-05, 'epoch': 1.63}


 87%|████████▋ | 4360/5000 [1:19:58<07:17,  1.46it/s]

{'loss': 1.1635, 'grad_norm': 1.8755524158477783, 'learning_rate': 2.565130260521042e-05, 'epoch': 1.63}


 88%|████████▊ | 4380/5000 [1:20:12<07:16,  1.42it/s]

{'loss': 1.1357, 'grad_norm': 1.1442514657974243, 'learning_rate': 2.4849699398797597e-05, 'epoch': 1.64}


 88%|████████▊ | 4400/5000 [1:20:26<06:53,  1.45it/s]

{'loss': 1.4194, 'grad_norm': 1.1992889642715454, 'learning_rate': 2.404809619238477e-05, 'epoch': 1.65}


                                                     
 88%|████████▊ | 4400/5000 [1:21:51<06:53,  1.45it/s]

{'eval_loss': 1.4784526824951172, 'eval_runtime': 85.3858, 'eval_samples_per_second': 6.957, 'eval_steps_per_second': 6.957, 'epoch': 1.65}


 88%|████████▊ | 4420/5000 [1:22:05<06:53,  1.40it/s]  

{'loss': 1.0224, 'grad_norm': 1.5648326873779297, 'learning_rate': 2.3246492985971944e-05, 'epoch': 1.66}


 89%|████████▉ | 4440/5000 [1:22:19<06:34,  1.42it/s]

{'loss': 1.3648, 'grad_norm': 1.245650291442871, 'learning_rate': 2.244488977955912e-05, 'epoch': 1.66}


 89%|████████▉ | 4460/5000 [1:22:33<06:09,  1.46it/s]

{'loss': 1.202, 'grad_norm': 1.47214937210083, 'learning_rate': 2.1643286573146292e-05, 'epoch': 1.67}


 90%|████████▉ | 4480/5000 [1:22:47<06:04,  1.43it/s]

{'loss': 1.1579, 'grad_norm': 1.3316459655761719, 'learning_rate': 2.0841683366733466e-05, 'epoch': 1.68}


 90%|█████████ | 4500/5000 [1:23:01<05:42,  1.46it/s]

{'loss': 1.2955, 'grad_norm': 1.6288127899169922, 'learning_rate': 2.0040080160320643e-05, 'epoch': 1.69}


 90%|█████████ | 4520/5000 [1:23:15<05:26,  1.47it/s]

{'loss': 1.0589, 'grad_norm': 2.5039963722229004, 'learning_rate': 1.9238476953907817e-05, 'epoch': 1.69}


 91%|█████████ | 4540/5000 [1:23:29<05:22,  1.43it/s]

{'loss': 1.3356, 'grad_norm': 1.4929897785186768, 'learning_rate': 1.843687374749499e-05, 'epoch': 1.7}


 91%|█████████ | 4560/5000 [1:23:43<05:00,  1.46it/s]

{'loss': 1.1778, 'grad_norm': 1.3762915134429932, 'learning_rate': 1.7635270541082165e-05, 'epoch': 1.71}


 92%|█████████▏| 4580/5000 [1:23:57<04:56,  1.42it/s]

{'loss': 1.0824, 'grad_norm': 1.2498661279678345, 'learning_rate': 1.683366733466934e-05, 'epoch': 1.72}


 92%|█████████▏| 4600/5000 [1:24:11<04:34,  1.46it/s]

{'loss': 1.2931, 'grad_norm': 1.6455055475234985, 'learning_rate': 1.6032064128256513e-05, 'epoch': 1.72}


                                                     
 92%|█████████▏| 4600/5000 [1:25:36<04:34,  1.46it/s]

{'eval_loss': 1.4774328470230103, 'eval_runtime': 85.2649, 'eval_samples_per_second': 6.967, 'eval_steps_per_second': 6.967, 'epoch': 1.72}


 92%|█████████▏| 4620/5000 [1:25:50<04:31,  1.40it/s]  

{'loss': 0.9528, 'grad_norm': 1.7661088705062866, 'learning_rate': 1.523046092184369e-05, 'epoch': 1.73}


 93%|█████████▎| 4640/5000 [1:26:04<04:13,  1.42it/s]

{'loss': 1.3417, 'grad_norm': 1.256549596786499, 'learning_rate': 1.4428857715430864e-05, 'epoch': 1.74}


 93%|█████████▎| 4660/5000 [1:26:17<03:52,  1.46it/s]

{'loss': 1.2179, 'grad_norm': 1.7380388975143433, 'learning_rate': 1.3627254509018036e-05, 'epoch': 1.74}


 94%|█████████▎| 4680/5000 [1:26:31<03:45,  1.42it/s]

{'loss': 1.1011, 'grad_norm': 1.3118456602096558, 'learning_rate': 1.282565130260521e-05, 'epoch': 1.75}


 94%|█████████▍| 4700/5000 [1:26:45<03:26,  1.45it/s]

{'loss': 1.2628, 'grad_norm': 1.6704397201538086, 'learning_rate': 1.2024048096192385e-05, 'epoch': 1.76}


 94%|█████████▍| 4720/5000 [1:26:59<03:11,  1.46it/s]

{'loss': 0.892, 'grad_norm': 2.0167152881622314, 'learning_rate': 1.122244488977956e-05, 'epoch': 1.77}


 95%|█████████▍| 4740/5000 [1:27:13<03:04,  1.41it/s]

{'loss': 1.2762, 'grad_norm': 0.9693607091903687, 'learning_rate': 1.0420841683366733e-05, 'epoch': 1.77}


 95%|█████████▌| 4760/5000 [1:27:27<02:44,  1.46it/s]

{'loss': 1.2676, 'grad_norm': 2.095167636871338, 'learning_rate': 9.619238476953909e-06, 'epoch': 1.78}


 96%|█████████▌| 4780/5000 [1:27:41<02:35,  1.42it/s]

{'loss': 1.1104, 'grad_norm': 1.3370203971862793, 'learning_rate': 8.817635270541082e-06, 'epoch': 1.79}


 96%|█████████▌| 4800/5000 [1:27:55<02:17,  1.46it/s]

{'loss': 1.3409, 'grad_norm': 1.4838082790374756, 'learning_rate': 8.016032064128256e-06, 'epoch': 1.8}


                                                     
 96%|█████████▌| 4800/5000 [1:29:20<02:17,  1.46it/s]

{'eval_loss': 1.4771814346313477, 'eval_runtime': 85.1892, 'eval_samples_per_second': 6.973, 'eval_steps_per_second': 6.973, 'epoch': 1.8}


 96%|█████████▋| 4820/5000 [1:29:33<02:07,  1.41it/s]  

{'loss': 0.9839, 'grad_norm': 1.6164482831954956, 'learning_rate': 7.214428857715432e-06, 'epoch': 1.8}


 97%|█████████▋| 4840/5000 [1:29:47<01:52,  1.42it/s]

{'loss': 1.3622, 'grad_norm': 1.0703555345535278, 'learning_rate': 6.412825651302605e-06, 'epoch': 1.81}


 97%|█████████▋| 4860/5000 [1:30:01<01:35,  1.46it/s]

{'loss': 1.2718, 'grad_norm': 1.5623197555541992, 'learning_rate': 5.61122244488978e-06, 'epoch': 1.82}


 98%|█████████▊| 4880/5000 [1:30:15<01:25,  1.41it/s]

{'loss': 1.0962, 'grad_norm': 1.1740843057632446, 'learning_rate': 4.809619238476954e-06, 'epoch': 1.83}


 98%|█████████▊| 4900/5000 [1:30:29<01:08,  1.45it/s]

{'loss': 1.2595, 'grad_norm': 1.3795653581619263, 'learning_rate': 4.008016032064128e-06, 'epoch': 1.83}


 98%|█████████▊| 4920/5000 [1:30:43<00:54,  1.47it/s]

{'loss': 0.9406, 'grad_norm': 1.9885172843933105, 'learning_rate': 3.2064128256513024e-06, 'epoch': 1.84}


 99%|█████████▉| 4940/5000 [1:30:57<00:42,  1.42it/s]

{'loss': 1.4118, 'grad_norm': 1.1353631019592285, 'learning_rate': 2.404809619238477e-06, 'epoch': 1.85}


 99%|█████████▉| 4960/5000 [1:31:11<00:27,  1.46it/s]

{'loss': 1.1693, 'grad_norm': 1.5633442401885986, 'learning_rate': 1.6032064128256512e-06, 'epoch': 1.86}


100%|█████████▉| 4980/5000 [1:31:25<00:14,  1.42it/s]

{'loss': 1.1441, 'grad_norm': 1.1632230281829834, 'learning_rate': 8.016032064128256e-07, 'epoch': 1.86}


100%|██████████| 5000/5000 [1:31:39<00:00,  1.46it/s]

{'loss': 1.2332, 'grad_norm': 1.7265483140945435, 'learning_rate': 0.0, 'epoch': 1.87}


                                                     
100%|██████████| 5000/5000 [1:33:04<00:00,  1.46it/s]

{'eval_loss': 1.4753975868225098, 'eval_runtime': 85.2535, 'eval_samples_per_second': 6.967, 'eval_steps_per_second': 6.967, 'epoch': 1.87}


100%|██████████| 5000/5000 [1:33:05<00:00,  1.12s/it]

{'train_runtime': 5585.0477, 'train_samples_per_second': 1.79, 'train_steps_per_second': 0.895, 'train_loss': 1.3976033634185792, 'epoch': 1.87}





TrainOutput(global_step=5000, training_loss=1.3976033634185792, metrics={'train_runtime': 5585.0477, 'train_samples_per_second': 1.79, 'train_steps_per_second': 0.895, 'total_flos': 2.956069404358963e+16, 'train_loss': 1.3976033634185792, 'epoch': 1.872308556450103})

In [24]:
# Save the results of the fine-tuning run into a "final" subfolder of the
# configured output directory (path_to_save_finetuned_model, set in the
# config cell at the top of the notebook).
#
# The path is built with os.path.join rather than a hard-coded "\\" so the
# cell produces a real subdirectory on every OS; on Windows the resulting
# string is identical to the previous one.
path_to_save = os.path.join(path_to_save_finetuned_model, "final")

# NOTE(review): trainer.save_model and model.save_pretrained both target the
# same directory -- presumably the second call is redundant; confirm before
# removing either one.
trainer.save_model(path_to_save)
model.save_pretrained(path_to_save)

# Kept as the last expression so the cell displays the tuple of tokenizer
# file paths returned by save_pretrained.
tokenizer.save_pretrained(path_to_save)

('russia_chad_1.5\\final\\tokenizer_config.json',
 'russia_chad_1.5\\final\\special_tokens_map.json',
 'russia_chad_1.5\\final\\vocab.json',
 'russia_chad_1.5\\final\\merges.txt',
 'russia_chad_1.5\\final\\added_tokens.json',
 'russia_chad_1.5\\final\\tokenizer.json')