In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, set_seed, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import pandas as pd
import torch
import random
import os

set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


## Define model

In [3]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Demo

## Nucleus sampling + Top-K sampling + num sequences = 5

In [11]:
# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained("../output/gpt2/final", pad_token_id=tokenizer.eos_token_id).to(torch_device)

# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# encode context the generation is conditioned on
sep_token = "<|reply|>"
input_text = "Oh nice! I got around this (kind of) by being a bio major with a minor in neuro research. My school had 2 neruo programs - bio based and psych based. My minor was in psych based neuro but I took my bio electives as bio based neuro courses. Still had to do calc 1, calc 2, chem 1, chem 2, orgo 1, orgo 2, physics 1, and physics 2. But I DIDNT have to take intro to pharma kinetics, inorganic chemistry and a few other higher level chem classes. I did this bc the psych based neuro courses had almost no bio and I love bio. But math and chem are my kryptonite."
model_inputs = tokenizer([" ".join([input_text, sep_token])], return_tensors='pt').to(torch_device)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    early_stopping=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8,
    num_return_sequences=5,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=False).split('<|reply|>')[1].split('\n')[0][1:]
    print(f"{i}: {text}\n")



Output:
----------------------------------------------------------------------------------------------------
0: This is my favorite part of the bio class.  

1: &gt; My minor was in psych based neuro but I took my bio electives as bio based neuro courses.

2: I have no idea what you're talking about. I'm interested to know. 

3: This is what I think! 

4: How did your bio pass?  I also knew that I had to pay a lot of money, even though I was very good at a math course. I know that was a good one to pay



# Training

## Preprocess

In [4]:
BATCH_SIZE = 2
MAX_LENGTH = 512
EPOCHS = 2

bos_token='<|startoftext|>'
eos_token='<|endoftext|>'
pad_token='<|endoftext|>'
sep_token='<|reply|>'

tokenizer = AutoTokenizer.from_pretrained("distilgpt2", return_tensors='pt', eos_token=eos_token, pad_token=pad_token)

# Tokenizer function for later mapping.
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=MAX_LENGTH)

def insert_tags(pair):
    return " ".join([eos_token, pair[0], sep_token, pair[1], eos_token])

# Create dataset as a DatasetDict object
DATA_DIR = '../data/raw/'
filenames = os.listdir(DATA_DIR)
dfs = [pd.read_csv(DATA_DIR + name, index_col='Unnamed: 0') for name in filenames]
df = pd.concat(dfs)
def create_dataset(df):
    comments = df[['comment', 'reply']]['comment'].apply(str).to_list()
    replies = df[['comment', 'reply']]['reply'].apply(str).to_list()
    texts = [insert_tags(pair) for pair in zip(comments, replies)]

    train_percentage = 0.9
    validation_percentage = 0.07
    test_percentage = 0.03

    random.shuffle(texts)
    texts_size = len(texts)
    texts_train = texts[:int(train_percentage*texts_size)]
    texts_validation = texts[int(train_percentage*texts_size):]

    dataset = dict()
    dataset['train'] = Dataset.from_dict({'text': texts_train})
    dataset['validation'] = Dataset.from_dict({'text': texts_validation})
    datasets = DatasetDict(dataset)
    return datasets

def group_texts(examples):
    examples['labels'] = examples['input_ids']
    return examples

dataset = create_dataset(df)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
    )

tokenized_dataset.set_format("pt", columns=['input_ids', 'attention_mask'], output_all_columns=True)

lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=1,
    )

lm_dataset.set_format("pt", columns=['input_ids', 'attention_mask', 'labels'], output_all_columns=True)

# Data Collator pads the inputs for Causal Language Modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

                                                                      

In [5]:
tokenizer.decode(lm_dataset['train'][1]['labels'])

"<|endoftext|> They can be used well, and they can be handled poorly.\n\nGood Romance Subplots come into play in the story, and use character to inform plot.\n\nFor Good examples let's look at Game of Thrones/ASOIAF.\n\nWe have three main romances that we are going to look at.\n\nRobb Stark and Jeyne Westerling/Talisa, Jon Snow and Ygritte, Eddard Stark and Catelyn Tully.\n\nRobb falls in puppy-love with Jeyne/Talisa, a lustful passionate romance that Robb abandons his previous alliances to engage in.\n\nThey meet and have a night of passion, with Robb later realizing the position he has been put in.\n\nThis makes sense since we see Robb being a passionate young man at war, and he justifies it as being the honourable repercussion of a foolish act.\n\nAnd it alienates his ally, and has real repercussions beyond what happens.\n\nThen we have Jon and Ygritte; similar to Robb and his spouse we have an honourable man falling into passionate love with a woman, and in the end it has repercuss

## Training

In [6]:
model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(torch_device)

training_args = TrainingArguments(
    output_dir="../output/gpt2",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_steps=10000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/124858 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 500/124858 [03:05<12:41:29,  2.72it/s]

{'loss': 3.789, 'learning_rate': 1.991990901664291e-05, 'epoch': 0.01}


  1%|          | 1000/124858 [06:10<12:44:53,  2.70it/s]

{'loss': 3.7322, 'learning_rate': 1.9839818033285815e-05, 'epoch': 0.02}


  1%|          | 1500/124858 [09:15<12:42:43,  2.70it/s]

{'loss': 3.6608, 'learning_rate': 1.975972704992872e-05, 'epoch': 0.02}


  2%|▏         | 2000/124858 [12:20<12:36:22,  2.71it/s]

{'loss': 3.6488, 'learning_rate': 1.967963606657163e-05, 'epoch': 0.03}


  2%|▏         | 2500/124858 [15:24<12:23:54,  2.74it/s]

{'loss': 3.6391, 'learning_rate': 1.9599545083214533e-05, 'epoch': 0.04}


  2%|▏         | 3000/124858 [18:27<12:19:41,  2.75it/s]

{'loss': 3.6252, 'learning_rate': 1.951945409985744e-05, 'epoch': 0.05}


  3%|▎         | 3500/124858 [21:30<12:16:45,  2.75it/s]

{'loss': 3.6513, 'learning_rate': 1.9439363116500343e-05, 'epoch': 0.06}


  3%|▎         | 4000/124858 [24:33<12:04:30,  2.78it/s]

{'loss': 3.5916, 'learning_rate': 1.9359272133143252e-05, 'epoch': 0.06}


  4%|▎         | 4500/124858 [27:37<12:13:49,  2.73it/s]

{'loss': 3.5859, 'learning_rate': 1.927918114978616e-05, 'epoch': 0.07}


  4%|▍         | 5000/124858 [30:40<12:09:04,  2.74it/s]

{'loss': 3.6016, 'learning_rate': 1.9199090166429065e-05, 'epoch': 0.08}


  4%|▍         | 5500/124858 [33:43<12:11:05,  2.72it/s]

{'loss': 3.5772, 'learning_rate': 1.911899918307197e-05, 'epoch': 0.09}


  5%|▍         | 6000/124858 [36:46<12:07:27,  2.72it/s]

{'loss': 3.5823, 'learning_rate': 1.903890819971488e-05, 'epoch': 0.1}


  5%|▌         | 6500/124858 [39:50<12:00:22,  2.74it/s]

{'loss': 3.5969, 'learning_rate': 1.8958817216357784e-05, 'epoch': 0.1}


  6%|▌         | 7000/124858 [42:53<12:01:00,  2.72it/s]

{'loss': 3.584, 'learning_rate': 1.8878726233000692e-05, 'epoch': 0.11}


  6%|▌         | 7500/124858 [45:57<11:59:14,  2.72it/s]

{'loss': 3.5502, 'learning_rate': 1.8798635249643597e-05, 'epoch': 0.12}


  6%|▋         | 8000/124858 [49:00<11:57:53,  2.71it/s]

{'loss': 3.5844, 'learning_rate': 1.8718544266286502e-05, 'epoch': 0.13}


  7%|▋         | 8500/124858 [52:04<11:54:59,  2.71it/s]

{'loss': 3.5204, 'learning_rate': 1.863845328292941e-05, 'epoch': 0.14}


  7%|▋         | 9000/124858 [55:07<11:48:52,  2.72it/s]

{'loss': 3.5567, 'learning_rate': 1.8558362299572315e-05, 'epoch': 0.14}


  8%|▊         | 9500/124858 [58:10<11:51:11,  2.70it/s]

{'loss': 3.5232, 'learning_rate': 1.847827131621522e-05, 'epoch': 0.15}


  8%|▊         | 10000/124858 [1:01:14<11:47:34,  2.71it/s]

{'loss': 3.5665, 'learning_rate': 1.839818033285813e-05, 'epoch': 0.16}


  8%|▊         | 10500/124858 [1:04:19<11:40:31,  2.72it/s]

{'loss': 3.541, 'learning_rate': 1.8318089349501034e-05, 'epoch': 0.17}


  9%|▉         | 11000/124858 [1:07:22<11:29:08,  2.75it/s]

{'loss': 3.5337, 'learning_rate': 1.8237998366143942e-05, 'epoch': 0.18}


  9%|▉         | 11500/124858 [1:10:26<11:36:46,  2.71it/s]

{'loss': 3.5735, 'learning_rate': 1.8157907382786847e-05, 'epoch': 0.18}


 10%|▉         | 12000/124858 [1:13:30<11:33:41,  2.71it/s]

{'loss': 3.5256, 'learning_rate': 1.8077816399429752e-05, 'epoch': 0.19}


 10%|█         | 12500/124858 [1:16:33<11:26:38,  2.73it/s]

{'loss': 3.5418, 'learning_rate': 1.799772541607266e-05, 'epoch': 0.2}


 10%|█         | 13000/124858 [1:19:48<10:59:16,  2.83it/s]

{'loss': 3.5223, 'learning_rate': 1.7917634432715566e-05, 'epoch': 0.21}


 11%|█         | 13500/124858 [1:22:48<11:15:31,  2.75it/s]

{'loss': 3.5415, 'learning_rate': 1.783754344935847e-05, 'epoch': 0.22}


 11%|█         | 14000/124858 [1:25:50<11:12:47,  2.75it/s]

{'loss': 3.5424, 'learning_rate': 1.775745246600138e-05, 'epoch': 0.22}


 12%|█▏        | 14500/124858 [1:28:50<10:54:34,  2.81it/s]

{'loss': 3.5528, 'learning_rate': 1.7677361482644287e-05, 'epoch': 0.23}


 12%|█▏        | 15000/124858 [1:31:52<11:18:25,  2.70it/s]

{'loss': 3.5262, 'learning_rate': 1.7597270499287192e-05, 'epoch': 0.24}


 12%|█▏        | 15500/124858 [1:34:57<11:15:16,  2.70it/s]

{'loss': 3.4837, 'learning_rate': 1.7517179515930097e-05, 'epoch': 0.25}


 13%|█▎        | 16000/124858 [1:38:02<11:09:55,  2.71it/s]

{'loss': 3.5634, 'learning_rate': 1.7437088532573002e-05, 'epoch': 0.26}


 13%|█▎        | 16500/124858 [1:41:07<11:06:57,  2.71it/s]

{'loss': 3.4852, 'learning_rate': 1.735699754921591e-05, 'epoch': 0.26}


 14%|█▎        | 17000/124858 [1:44:12<11:04:03,  2.71it/s]

{'loss': 3.5107, 'learning_rate': 1.727690656585882e-05, 'epoch': 0.27}


 14%|█▍        | 17500/124858 [1:47:16<11:00:47,  2.71it/s]

{'loss': 3.5046, 'learning_rate': 1.7196815582501724e-05, 'epoch': 0.28}


 14%|█▍        | 18000/124858 [1:50:21<10:57:51,  2.71it/s]

{'loss': 3.5044, 'learning_rate': 1.711672459914463e-05, 'epoch': 0.29}


 15%|█▍        | 18500/124858 [1:53:26<10:56:05,  2.70it/s]

{'loss': 3.485, 'learning_rate': 1.7036633615787537e-05, 'epoch': 0.3}


 15%|█▌        | 19000/124858 [1:56:31<10:53:04,  2.70it/s]

{'loss': 3.5044, 'learning_rate': 1.6956542632430443e-05, 'epoch': 0.3}


 16%|█▌        | 19500/124858 [1:59:36<10:49:45,  2.70it/s]

{'loss': 3.4737, 'learning_rate': 1.6876451649073348e-05, 'epoch': 0.31}


 16%|█▌        | 20000/124858 [2:02:41<10:46:47,  2.70it/s]

{'loss': 3.5123, 'learning_rate': 1.6796360665716256e-05, 'epoch': 0.32}


 16%|█▋        | 20500/124858 [2:05:47<10:43:48,  2.70it/s]

{'loss': 3.4974, 'learning_rate': 1.671626968235916e-05, 'epoch': 0.33}


 17%|█▋        | 21000/124858 [2:08:52<10:40:35,  2.70it/s]

{'loss': 3.4962, 'learning_rate': 1.663617869900207e-05, 'epoch': 0.34}


 17%|█▋        | 21500/124858 [2:11:57<10:37:30,  2.70it/s]

{'loss': 3.5032, 'learning_rate': 1.6556087715644974e-05, 'epoch': 0.34}


 18%|█▊        | 22000/124858 [2:15:02<10:34:41,  2.70it/s]

{'loss': 3.503, 'learning_rate': 1.647599673228788e-05, 'epoch': 0.35}


 18%|█▊        | 22500/124858 [2:18:08<10:31:18,  2.70it/s]

{'loss': 3.5242, 'learning_rate': 1.6395905748930788e-05, 'epoch': 0.36}


 18%|█▊        | 23000/124858 [2:21:13<10:28:19,  2.70it/s]

{'loss': 3.4937, 'learning_rate': 1.6315814765573693e-05, 'epoch': 0.37}


 19%|█▉        | 23500/124858 [2:24:18<10:25:59,  2.70it/s]

{'loss': 3.513, 'learning_rate': 1.6235723782216598e-05, 'epoch': 0.38}


 19%|█▉        | 24000/124858 [2:27:23<10:22:03,  2.70it/s]

{'loss': 3.4519, 'learning_rate': 1.6155632798859506e-05, 'epoch': 0.38}


 20%|█▉        | 24500/124858 [2:30:28<10:19:07,  2.70it/s]

{'loss': 3.4945, 'learning_rate': 1.607554181550241e-05, 'epoch': 0.39}


 20%|██        | 25000/124858 [2:33:33<10:15:59,  2.70it/s]

{'loss': 3.5161, 'learning_rate': 1.599545083214532e-05, 'epoch': 0.4}


 20%|██        | 25500/124858 [2:36:38<10:12:41,  2.70it/s]

{'loss': 3.4913, 'learning_rate': 1.5915359848788224e-05, 'epoch': 0.41}


 21%|██        | 26000/124858 [2:39:43<10:09:52,  2.70it/s]

{'loss': 3.4465, 'learning_rate': 1.583526886543113e-05, 'epoch': 0.42}


 21%|██        | 26500/124858 [2:42:48<10:06:51,  2.70it/s]

{'loss': 3.4658, 'learning_rate': 1.5755177882074038e-05, 'epoch': 0.42}


 22%|██▏       | 27000/124858 [2:45:53<10:04:02,  2.70it/s]

{'loss': 3.4875, 'learning_rate': 1.5675086898716946e-05, 'epoch': 0.43}


 22%|██▏       | 27500/124858 [2:48:58<10:00:45,  2.70it/s]

{'loss': 3.5014, 'learning_rate': 1.559499591535985e-05, 'epoch': 0.44}


 22%|██▏       | 28000/124858 [2:52:03<9:57:32,  2.70it/s] 

{'loss': 3.4725, 'learning_rate': 1.5514904932002756e-05, 'epoch': 0.45}


 23%|██▎       | 28500/124858 [2:55:08<9:54:23,  2.70it/s]

{'loss': 3.497, 'learning_rate': 1.543481394864566e-05, 'epoch': 0.46}


 23%|██▎       | 29000/124858 [2:58:14<9:51:27,  2.70it/s]

{'loss': 3.4911, 'learning_rate': 1.535472296528857e-05, 'epoch': 0.46}


 24%|██▎       | 29500/124858 [3:01:19<9:48:14,  2.70it/s]

{'loss': 3.4473, 'learning_rate': 1.5274631981931475e-05, 'epoch': 0.47}


 24%|██▍       | 30000/124858 [3:04:24<9:45:12,  2.70it/s]

{'loss': 3.4521, 'learning_rate': 1.5194540998574381e-05, 'epoch': 0.48}


 24%|██▍       | 30500/124858 [3:07:30<9:42:01,  2.70it/s] 

{'loss': 3.4574, 'learning_rate': 1.5114450015217288e-05, 'epoch': 0.49}


 25%|██▍       | 31000/124858 [3:10:35<9:38:52,  2.70it/s]

{'loss': 3.4401, 'learning_rate': 1.5034359031860195e-05, 'epoch': 0.5}


 25%|██▌       | 31500/124858 [3:13:40<9:35:47,  2.70it/s]

{'loss': 3.4639, 'learning_rate': 1.49542680485031e-05, 'epoch': 0.5}


 26%|██▌       | 32000/124858 [3:16:45<9:32:49,  2.70it/s]

{'loss': 3.441, 'learning_rate': 1.4874177065146006e-05, 'epoch': 0.51}


 26%|██▌       | 32500/124858 [3:19:50<9:29:47,  2.70it/s]

{'loss': 3.4585, 'learning_rate': 1.4794086081788915e-05, 'epoch': 0.52}


 26%|██▋       | 33000/124858 [3:22:55<9:26:43,  2.70it/s]

{'loss': 3.4485, 'learning_rate': 1.471399509843182e-05, 'epoch': 0.53}


 27%|██▋       | 33500/124858 [3:26:00<9:23:37,  2.70it/s]

{'loss': 3.4697, 'learning_rate': 1.4633904115074726e-05, 'epoch': 0.54}


 27%|██▋       | 34000/124858 [3:29:05<9:20:37,  2.70it/s]

{'loss': 3.4953, 'learning_rate': 1.4553813131717633e-05, 'epoch': 0.54}


 28%|██▊       | 34500/124858 [3:32:10<9:17:07,  2.70it/s]

{'loss': 3.4798, 'learning_rate': 1.4473722148360538e-05, 'epoch': 0.55}


 28%|██▊       | 35000/124858 [3:35:15<9:14:17,  2.70it/s]

{'loss': 3.4589, 'learning_rate': 1.4393631165003445e-05, 'epoch': 0.56}


 28%|██▊       | 35500/124858 [3:38:20<9:11:04,  2.70it/s]

{'loss': 3.4343, 'learning_rate': 1.431354018164635e-05, 'epoch': 0.57}


 29%|██▉       | 36000/124858 [3:41:26<9:08:08,  2.70it/s]

{'loss': 3.4466, 'learning_rate': 1.4233449198289258e-05, 'epoch': 0.58}


 29%|██▉       | 36500/124858 [3:44:31<9:05:13,  2.70it/s]

{'loss': 3.4742, 'learning_rate': 1.4153358214932165e-05, 'epoch': 0.58}


 30%|██▉       | 37000/124858 [3:47:36<9:01:53,  2.70it/s]

{'loss': 3.4324, 'learning_rate': 1.407326723157507e-05, 'epoch': 0.59}


 30%|███       | 37500/124858 [3:50:41<8:58:55,  2.70it/s]

{'loss': 3.4704, 'learning_rate': 1.3993176248217977e-05, 'epoch': 0.6}


 30%|███       | 38000/124858 [3:53:46<8:55:39,  2.70it/s]

{'loss': 3.4733, 'learning_rate': 1.3913085264860883e-05, 'epoch': 0.61}


 31%|███       | 38500/124858 [3:56:51<8:52:42,  2.70it/s]

{'loss': 3.4227, 'learning_rate': 1.3832994281503788e-05, 'epoch': 0.62}


 31%|███       | 39000/124858 [3:59:56<8:49:27,  2.70it/s]

{'loss': 3.4532, 'learning_rate': 1.3752903298146695e-05, 'epoch': 0.62}


 32%|███▏      | 39500/124858 [4:03:01<8:46:50,  2.70it/s]

{'loss': 3.4376, 'learning_rate': 1.3672812314789603e-05, 'epoch': 0.63}


 32%|███▏      | 40000/124858 [4:06:06<8:43:34,  2.70it/s]

{'loss': 3.4713, 'learning_rate': 1.3592721331432508e-05, 'epoch': 0.64}


 32%|███▏      | 40500/124858 [4:09:12<8:40:34,  2.70it/s] 

{'loss': 3.4779, 'learning_rate': 1.3512630348075415e-05, 'epoch': 0.65}


 33%|███▎      | 41000/124858 [4:12:17<8:37:28,  2.70it/s]

{'loss': 3.4624, 'learning_rate': 1.3432539364718322e-05, 'epoch': 0.66}


 33%|███▎      | 41500/124858 [4:15:23<8:34:15,  2.70it/s]

{'loss': 3.4651, 'learning_rate': 1.3352448381361227e-05, 'epoch': 0.66}


 34%|███▎      | 42000/124858 [4:18:28<8:31:21,  2.70it/s]

{'loss': 3.4657, 'learning_rate': 1.3272357398004134e-05, 'epoch': 0.67}


 34%|███▍      | 42500/124858 [4:21:33<8:28:09,  2.70it/s]

{'loss': 3.4442, 'learning_rate': 1.3192266414647039e-05, 'epoch': 0.68}


 34%|███▍      | 43000/124858 [4:24:38<8:24:53,  2.70it/s]

{'loss': 3.4303, 'learning_rate': 1.3112175431289947e-05, 'epoch': 0.69}


 35%|███▍      | 43500/124858 [4:27:43<8:21:53,  2.70it/s]

{'loss': 3.4474, 'learning_rate': 1.3032084447932854e-05, 'epoch': 0.7}


 35%|███▌      | 44000/124858 [4:30:48<8:18:52,  2.70it/s]

{'loss': 3.4578, 'learning_rate': 1.2951993464575759e-05, 'epoch': 0.7}


 36%|███▌      | 44500/124858 [4:33:53<8:15:48,  2.70it/s]

{'loss': 3.4502, 'learning_rate': 1.2871902481218665e-05, 'epoch': 0.71}


 36%|███▌      | 45000/124858 [4:36:58<8:12:42,  2.70it/s]

{'loss': 3.4397, 'learning_rate': 1.2791811497861572e-05, 'epoch': 0.72}


 36%|███▋      | 45500/124858 [4:40:03<8:09:18,  2.70it/s]

{'loss': 3.4319, 'learning_rate': 1.2711720514504477e-05, 'epoch': 0.73}


 37%|███▋      | 46000/124858 [4:43:08<8:06:30,  2.70it/s]

{'loss': 3.4467, 'learning_rate': 1.2631629531147385e-05, 'epoch': 0.74}


 37%|███▋      | 46500/124858 [4:46:13<8:03:16,  2.70it/s]

{'loss': 3.4285, 'learning_rate': 1.2551538547790292e-05, 'epoch': 0.74}


 38%|███▊      | 47000/124858 [4:49:18<8:00:09,  2.70it/s]

{'loss': 3.4024, 'learning_rate': 1.2471447564433197e-05, 'epoch': 0.75}


 38%|███▊      | 47500/124858 [4:52:23<7:57:07,  2.70it/s]

{'loss': 3.4058, 'learning_rate': 1.2391356581076104e-05, 'epoch': 0.76}


 38%|███▊      | 48000/124858 [4:55:29<7:54:02,  2.70it/s]

{'loss': 3.4374, 'learning_rate': 1.2311265597719009e-05, 'epoch': 0.77}


 39%|███▉      | 48500/124858 [4:58:34<7:51:02,  2.70it/s]

{'loss': 3.4586, 'learning_rate': 1.2231174614361915e-05, 'epoch': 0.78}


 39%|███▉      | 49000/124858 [5:01:39<7:47:54,  2.70it/s]

{'loss': 3.4125, 'learning_rate': 1.2151083631004822e-05, 'epoch': 0.78}


 40%|███▉      | 49500/124858 [5:04:44<7:44:47,  2.70it/s]

{'loss': 3.4307, 'learning_rate': 1.2070992647647727e-05, 'epoch': 0.79}


 40%|████      | 50000/124858 [5:07:49<7:41:42,  2.70it/s]

{'loss': 3.4073, 'learning_rate': 1.1990901664290636e-05, 'epoch': 0.8}


 40%|████      | 50500/124858 [5:10:55<7:38:41,  2.70it/s] 

{'loss': 3.4279, 'learning_rate': 1.1910810680933542e-05, 'epoch': 0.81}


 41%|████      | 51000/124858 [5:14:00<7:35:44,  2.70it/s]

{'loss': 3.4238, 'learning_rate': 1.1830719697576447e-05, 'epoch': 0.82}


 41%|████      | 51500/124858 [5:17:05<7:32:27,  2.70it/s]

{'loss': 3.4571, 'learning_rate': 1.1750628714219354e-05, 'epoch': 0.82}


 42%|████▏     | 52000/124858 [5:20:10<7:29:13,  2.70it/s]

{'loss': 3.4452, 'learning_rate': 1.167053773086226e-05, 'epoch': 0.83}


 42%|████▏     | 52500/124858 [5:23:15<7:26:37,  2.70it/s]

{'loss': 3.4029, 'learning_rate': 1.1590446747505166e-05, 'epoch': 0.84}


 42%|████▏     | 53000/124858 [5:26:20<7:23:03,  2.70it/s]

{'loss': 3.4555, 'learning_rate': 1.1510355764148074e-05, 'epoch': 0.85}


 43%|████▎     | 53500/124858 [5:29:25<7:20:01,  2.70it/s]

{'loss': 3.439, 'learning_rate': 1.143026478079098e-05, 'epoch': 0.86}


 43%|████▎     | 54000/124858 [5:32:30<7:17:08,  2.70it/s]

{'loss': 3.4278, 'learning_rate': 1.1350173797433886e-05, 'epoch': 0.86}


 44%|████▎     | 54500/124858 [5:35:35<7:14:01,  2.70it/s]

{'loss': 3.4109, 'learning_rate': 1.1270082814076792e-05, 'epoch': 0.87}


 44%|████▍     | 55000/124858 [5:38:40<7:10:52,  2.70it/s]

{'loss': 3.4463, 'learning_rate': 1.1189991830719697e-05, 'epoch': 0.88}


 44%|████▍     | 55500/124858 [5:41:46<7:07:54,  2.70it/s]

{'loss': 3.4627, 'learning_rate': 1.1109900847362604e-05, 'epoch': 0.89}


 45%|████▍     | 56000/124858 [5:44:51<7:04:41,  2.70it/s]

{'loss': 3.4001, 'learning_rate': 1.1029809864005513e-05, 'epoch': 0.9}


 45%|████▌     | 56500/124858 [5:47:56<7:01:31,  2.70it/s]

{'loss': 3.4411, 'learning_rate': 1.0949718880648418e-05, 'epoch': 0.91}


 46%|████▌     | 57000/124858 [5:51:01<6:58:41,  2.70it/s]

{'loss': 3.4242, 'learning_rate': 1.0869627897291324e-05, 'epoch': 0.91}


 46%|████▌     | 57500/124858 [5:54:06<6:55:29,  2.70it/s]

{'loss': 3.4334, 'learning_rate': 1.0789536913934231e-05, 'epoch': 0.92}


 46%|████▋     | 58000/124858 [5:57:11<6:52:31,  2.70it/s]

{'loss': 3.406, 'learning_rate': 1.0709445930577136e-05, 'epoch': 0.93}


 47%|████▋     | 58500/124858 [6:00:16<6:49:19,  2.70it/s]

{'loss': 3.3986, 'learning_rate': 1.0629354947220043e-05, 'epoch': 0.94}


 47%|████▋     | 59000/124858 [6:03:21<6:46:30,  2.70it/s]

{'loss': 3.3967, 'learning_rate': 1.054926396386295e-05, 'epoch': 0.95}


 48%|████▊     | 59500/124858 [6:06:26<6:43:10,  2.70it/s]

{'loss': 3.405, 'learning_rate': 1.0469172980505854e-05, 'epoch': 0.95}


 48%|████▊     | 60000/124858 [6:09:31<6:40:05,  2.70it/s]

{'loss': 3.4363, 'learning_rate': 1.0389081997148763e-05, 'epoch': 0.96}


 48%|████▊     | 60500/124858 [6:12:37<6:36:50,  2.70it/s] 

{'loss': 3.4048, 'learning_rate': 1.0308991013791668e-05, 'epoch': 0.97}


 49%|████▉     | 61000/124858 [6:15:42<6:33:57,  2.70it/s]

{'loss': 3.3979, 'learning_rate': 1.0228900030434574e-05, 'epoch': 0.98}


 49%|████▉     | 61500/124858 [6:18:48<6:30:45,  2.70it/s]

{'loss': 3.4163, 'learning_rate': 1.0148809047077481e-05, 'epoch': 0.99}


 50%|████▉     | 62000/124858 [6:21:53<6:27:42,  2.70it/s]

{'loss': 3.3993, 'learning_rate': 1.0068718063720386e-05, 'epoch': 0.99}


                                                          
 50%|█████     | 62429/124858 [6:37:41<5:35:28,  3.10it/s]

{'eval_loss': 3.311511278152466, 'eval_runtime': 789.5805, 'eval_samples_per_second': 17.571, 'eval_steps_per_second': 8.786, 'epoch': 1.0}


 50%|█████     | 62500/124858 [6:38:07<6:24:36,  2.70it/s]    

{'loss': 3.4013, 'learning_rate': 9.988627080363293e-06, 'epoch': 1.0}


 50%|█████     | 63000/124858 [6:41:12<6:21:33,  2.70it/s]

{'loss': 3.357, 'learning_rate': 9.9085360970062e-06, 'epoch': 1.01}


 51%|█████     | 63500/124858 [6:44:17<6:18:24,  2.70it/s]

{'loss': 3.4161, 'learning_rate': 9.828445113649106e-06, 'epoch': 1.02}


 51%|█████▏    | 64000/124858 [6:47:22<6:15:31,  2.70it/s]

{'loss': 3.3808, 'learning_rate': 9.748354130292013e-06, 'epoch': 1.03}


 52%|█████▏    | 64500/124858 [6:50:27<6:12:20,  2.70it/s]

{'loss': 3.3396, 'learning_rate': 9.668263146934918e-06, 'epoch': 1.03}


 52%|█████▏    | 65000/124858 [6:53:32<6:09:11,  2.70it/s]

{'loss': 3.364, 'learning_rate': 9.588172163577825e-06, 'epoch': 1.04}


 52%|█████▏    | 65500/124858 [6:56:37<6:06:08,  2.70it/s]

{'loss': 3.3602, 'learning_rate': 9.508081180220731e-06, 'epoch': 1.05}


 53%|█████▎    | 66000/124858 [6:59:43<6:03:05,  2.70it/s]

{'loss': 3.3567, 'learning_rate': 9.427990196863638e-06, 'epoch': 1.06}


 53%|█████▎    | 66500/124858 [7:02:48<5:59:55,  2.70it/s]

{'loss': 3.3384, 'learning_rate': 9.347899213506545e-06, 'epoch': 1.07}


 54%|█████▎    | 67000/124858 [7:05:53<5:56:52,  2.70it/s]

{'loss': 3.3523, 'learning_rate': 9.267808230149451e-06, 'epoch': 1.07}


 54%|█████▍    | 67500/124858 [7:08:58<5:53:48,  2.70it/s]

{'loss': 3.3626, 'learning_rate': 9.187717246792356e-06, 'epoch': 1.08}


 54%|█████▍    | 68000/124858 [7:12:03<5:50:43,  2.70it/s]

{'loss': 3.3715, 'learning_rate': 9.107626263435263e-06, 'epoch': 1.09}


 55%|█████▍    | 68500/124858 [7:15:08<5:47:41,  2.70it/s]

{'loss': 3.3573, 'learning_rate': 9.02753528007817e-06, 'epoch': 1.1}


 55%|█████▌    | 69000/124858 [7:18:13<5:44:30,  2.70it/s]

{'loss': 3.3537, 'learning_rate': 8.947444296721076e-06, 'epoch': 1.11}


 56%|█████▌    | 69500/124858 [7:21:18<5:41:50,  2.70it/s]

{'loss': 3.3552, 'learning_rate': 8.867353313363981e-06, 'epoch': 1.11}


 56%|█████▌    | 70000/124858 [7:24:23<5:38:31,  2.70it/s]

{'loss': 3.371, 'learning_rate': 8.787262330006888e-06, 'epoch': 1.12}


 56%|█████▋    | 70500/124858 [7:27:29<5:35:19,  2.70it/s] 

{'loss': 3.3741, 'learning_rate': 8.707171346649795e-06, 'epoch': 1.13}


 57%|█████▋    | 71000/124858 [7:30:34<5:32:07,  2.70it/s]

{'loss': 3.357, 'learning_rate': 8.627080363292702e-06, 'epoch': 1.14}


 57%|█████▋    | 71500/124858 [7:33:39<5:29:12,  2.70it/s]

{'loss': 3.3748, 'learning_rate': 8.546989379935608e-06, 'epoch': 1.15}


 58%|█████▊    | 72000/124858 [7:36:44<5:26:00,  2.70it/s]

{'loss': 3.3658, 'learning_rate': 8.466898396578513e-06, 'epoch': 1.15}


 58%|█████▊    | 72500/124858 [7:39:50<5:23:05,  2.70it/s]

{'loss': 3.3603, 'learning_rate': 8.38680741322142e-06, 'epoch': 1.16}


 58%|█████▊    | 73000/124858 [7:42:55<5:19:55,  2.70it/s]

{'loss': 3.3593, 'learning_rate': 8.306716429864327e-06, 'epoch': 1.17}


 59%|█████▉    | 73500/124858 [7:46:00<5:16:57,  2.70it/s]

{'loss': 3.3752, 'learning_rate': 8.226625446507233e-06, 'epoch': 1.18}


 59%|█████▉    | 74000/124858 [7:49:05<5:13:38,  2.70it/s]

{'loss': 3.3598, 'learning_rate': 8.14653446315014e-06, 'epoch': 1.19}


 60%|█████▉    | 74500/124858 [7:52:10<5:10:39,  2.70it/s]

{'loss': 3.3428, 'learning_rate': 8.066443479793045e-06, 'epoch': 1.19}


 60%|██████    | 75000/124858 [7:55:15<5:07:27,  2.70it/s]

{'loss': 3.3738, 'learning_rate': 7.986352496435952e-06, 'epoch': 1.2}


 60%|██████    | 75500/124858 [7:58:20<5:04:21,  2.70it/s]

{'loss': 3.3747, 'learning_rate': 7.906261513078858e-06, 'epoch': 1.21}


 61%|██████    | 76000/124858 [8:01:25<5:01:30,  2.70it/s]

{'loss': 3.3798, 'learning_rate': 7.826170529721765e-06, 'epoch': 1.22}


 61%|██████▏   | 76500/124858 [8:04:30<4:58:19,  2.70it/s]

{'loss': 3.375, 'learning_rate': 7.746079546364672e-06, 'epoch': 1.23}


 62%|██████▏   | 77000/124858 [8:07:35<4:55:14,  2.70it/s]

{'loss': 3.3506, 'learning_rate': 7.665988563007577e-06, 'epoch': 1.23}


 62%|██████▏   | 77500/124858 [8:10:40<4:52:13,  2.70it/s]

{'loss': 3.3813, 'learning_rate': 7.585897579650484e-06, 'epoch': 1.24}


 62%|██████▏   | 78000/124858 [8:13:45<4:49:07,  2.70it/s]

{'loss': 3.3522, 'learning_rate': 7.50580659629339e-06, 'epoch': 1.25}


 63%|██████▎   | 78500/124858 [8:16:50<4:46:00,  2.70it/s]

{'loss': 3.3637, 'learning_rate': 7.425715612936296e-06, 'epoch': 1.26}


 63%|██████▎   | 79000/124858 [8:19:55<4:42:48,  2.70it/s]

{'loss': 3.3609, 'learning_rate': 7.345624629579202e-06, 'epoch': 1.27}


 64%|██████▎   | 79500/124858 [8:23:01<4:39:40,  2.70it/s]

{'loss': 3.3269, 'learning_rate': 7.265533646222109e-06, 'epoch': 1.27}


 64%|██████▍   | 80000/124858 [8:26:06<4:36:46,  2.70it/s]

{'loss': 3.3736, 'learning_rate': 7.185442662865015e-06, 'epoch': 1.28}


 64%|██████▍   | 80500/124858 [8:29:12<4:33:35,  2.70it/s]

{'loss': 3.3709, 'learning_rate': 7.105351679507921e-06, 'epoch': 1.29}


 65%|██████▍   | 81000/124858 [8:32:17<4:30:29,  2.70it/s]

{'loss': 3.3554, 'learning_rate': 7.025260696150828e-06, 'epoch': 1.3}


 65%|██████▌   | 81500/124858 [8:35:22<4:27:23,  2.70it/s]

{'loss': 3.3673, 'learning_rate': 6.9451697127937345e-06, 'epoch': 1.31}


 66%|██████▌   | 82000/124858 [8:38:27<4:24:24,  2.70it/s]

{'loss': 3.3608, 'learning_rate': 6.86507872943664e-06, 'epoch': 1.31}


 66%|██████▌   | 82500/124858 [8:41:32<4:21:24,  2.70it/s]

{'loss': 3.3368, 'learning_rate': 6.784987746079546e-06, 'epoch': 1.32}


 66%|██████▋   | 83000/124858 [8:44:37<4:18:25,  2.70it/s]

{'loss': 3.3539, 'learning_rate': 6.704896762722454e-06, 'epoch': 1.33}


 67%|██████▋   | 83500/124858 [8:47:42<4:15:10,  2.70it/s]

{'loss': 3.3202, 'learning_rate': 6.62480577936536e-06, 'epoch': 1.34}


 67%|██████▋   | 84000/124858 [8:50:48<4:12:09,  2.70it/s]

{'loss': 3.3295, 'learning_rate': 6.5447147960082654e-06, 'epoch': 1.35}


 68%|██████▊   | 84500/124858 [8:53:53<4:09:00,  2.70it/s]

{'loss': 3.3504, 'learning_rate': 6.464623812651172e-06, 'epoch': 1.35}


 68%|██████▊   | 85000/124858 [8:56:58<4:06:00,  2.70it/s]

{'loss': 3.3079, 'learning_rate': 6.384532829294079e-06, 'epoch': 1.36}


 68%|██████▊   | 85500/124858 [9:00:03<4:02:55,  2.70it/s]

{'loss': 3.3472, 'learning_rate': 6.304441845936985e-06, 'epoch': 1.37}


 69%|██████▉   | 86000/124858 [9:03:08<3:59:46,  2.70it/s]

{'loss': 3.3407, 'learning_rate': 6.224350862579891e-06, 'epoch': 1.38}


 69%|██████▉   | 86500/124858 [9:06:13<3:56:36,  2.70it/s]

{'loss': 3.3549, 'learning_rate': 6.144259879222798e-06, 'epoch': 1.39}


 70%|██████▉   | 87000/124858 [9:09:18<3:53:42,  2.70it/s]

{'loss': 3.3385, 'learning_rate': 6.064168895865704e-06, 'epoch': 1.39}


 70%|███████   | 87500/124858 [9:12:24<3:50:27,  2.70it/s]

{'loss': 3.3384, 'learning_rate': 5.98407791250861e-06, 'epoch': 1.4}


 70%|███████   | 88000/124858 [9:15:29<3:47:21,  2.70it/s]

{'loss': 3.3345, 'learning_rate': 5.9039869291515165e-06, 'epoch': 1.41}


 71%|███████   | 88500/124858 [9:18:34<3:44:20,  2.70it/s]

{'loss': 3.3157, 'learning_rate': 5.823895945794423e-06, 'epoch': 1.42}


 71%|███████▏  | 89000/124858 [9:21:39<3:41:18,  2.70it/s]

{'loss': 3.3564, 'learning_rate': 5.743804962437329e-06, 'epoch': 1.43}


 72%|███████▏  | 89500/124858 [9:24:44<3:38:09,  2.70it/s]

{'loss': 3.3639, 'learning_rate': 5.663713979080236e-06, 'epoch': 1.43}


 72%|███████▏  | 90000/124858 [9:27:49<3:35:03,  2.70it/s]

{'loss': 3.3578, 'learning_rate': 5.583622995723142e-06, 'epoch': 1.44}


 72%|███████▏  | 90500/124858 [9:30:55<3:32:04,  2.70it/s]

{'loss': 3.3817, 'learning_rate': 5.503532012366048e-06, 'epoch': 1.45}


 73%|███████▎  | 91000/124858 [9:34:00<3:28:50,  2.70it/s]

{'loss': 3.3443, 'learning_rate': 5.423441029008955e-06, 'epoch': 1.46}


 73%|███████▎  | 91500/124858 [9:37:05<3:25:47,  2.70it/s]

{'loss': 3.3466, 'learning_rate': 5.343350045651861e-06, 'epoch': 1.47}


 74%|███████▎  | 92000/124858 [9:40:11<3:22:48,  2.70it/s]

{'loss': 3.3565, 'learning_rate': 5.2632590622947675e-06, 'epoch': 1.47}


 74%|███████▍  | 92500/124858 [9:43:16<3:19:41,  2.70it/s]

{'loss': 3.3673, 'learning_rate': 5.183168078937673e-06, 'epoch': 1.48}


 74%|███████▍  | 93000/124858 [9:46:21<3:16:33,  2.70it/s]

{'loss': 3.3284, 'learning_rate': 5.10307709558058e-06, 'epoch': 1.49}


 75%|███████▍  | 93500/124858 [9:49:26<3:13:32,  2.70it/s]

{'loss': 3.3534, 'learning_rate': 5.022986112223486e-06, 'epoch': 1.5}


 75%|███████▌  | 94000/124858 [9:52:31<3:10:26,  2.70it/s]

{'loss': 3.3544, 'learning_rate': 4.9428951288663926e-06, 'epoch': 1.51}


 76%|███████▌  | 94500/124858 [9:55:36<3:07:17,  2.70it/s]

{'loss': 3.3085, 'learning_rate': 4.862804145509299e-06, 'epoch': 1.51}


 76%|███████▌  | 95000/124858 [9:58:41<3:04:13,  2.70it/s]

{'loss': 3.3834, 'learning_rate': 4.782713162152205e-06, 'epoch': 1.52}


 76%|███████▋  | 95500/124858 [10:01:46<3:01:10,  2.70it/s]

{'loss': 3.3688, 'learning_rate': 4.702622178795112e-06, 'epoch': 1.53}


 77%|███████▋  | 96000/124858 [10:04:52<2:58:06,  2.70it/s]

{'loss': 3.3339, 'learning_rate': 4.6225311954380185e-06, 'epoch': 1.54}


 77%|███████▋  | 96500/124858 [10:07:57<2:55:02,  2.70it/s]

{'loss': 3.3389, 'learning_rate': 4.542440212080924e-06, 'epoch': 1.55}


 78%|███████▊  | 97000/124858 [10:11:02<2:51:54,  2.70it/s]

{'loss': 3.3321, 'learning_rate': 4.462349228723831e-06, 'epoch': 1.55}


 78%|███████▊  | 97500/124858 [10:14:07<2:48:44,  2.70it/s]

{'loss': 3.3691, 'learning_rate': 4.382258245366737e-06, 'epoch': 1.56}


 78%|███████▊  | 98000/124858 [10:17:12<2:45:44,  2.70it/s]

{'loss': 3.3593, 'learning_rate': 4.302167262009644e-06, 'epoch': 1.57}


 79%|███████▉  | 98500/124858 [10:20:17<2:42:37,  2.70it/s]

{'loss': 3.3266, 'learning_rate': 4.22207627865255e-06, 'epoch': 1.58}


 79%|███████▉  | 99000/124858 [10:23:22<2:39:31,  2.70it/s]

{'loss': 3.329, 'learning_rate': 4.141985295295456e-06, 'epoch': 1.59}


 80%|███████▉  | 99500/124858 [10:26:27<2:36:30,  2.70it/s]

{'loss': 3.3573, 'learning_rate': 4.061894311938363e-06, 'epoch': 1.59}


 80%|████████  | 100000/124858 [10:29:32<2:33:22,  2.70it/s]

{'loss': 3.3692, 'learning_rate': 3.981803328581269e-06, 'epoch': 1.6}


 80%|████████  | 100500/124858 [10:32:39<2:30:19,  2.70it/s]

{'loss': 3.3372, 'learning_rate': 3.901712345224175e-06, 'epoch': 1.61}


 81%|████████  | 101000/124858 [10:35:44<2:27:11,  2.70it/s]

{'loss': 3.3644, 'learning_rate': 3.821621361867081e-06, 'epoch': 1.62}


 81%|████████▏ | 101500/124858 [10:38:49<2:24:10,  2.70it/s]

{'loss': 3.3384, 'learning_rate': 3.7415303785099875e-06, 'epoch': 1.63}


 82%|████████▏ | 102000/124858 [10:41:54<2:21:03,  2.70it/s]

{'loss': 3.3098, 'learning_rate': 3.661439395152894e-06, 'epoch': 1.63}


 82%|████████▏ | 102500/124858 [10:44:59<2:17:55,  2.70it/s]

{'loss': 3.3365, 'learning_rate': 3.5813484117958005e-06, 'epoch': 1.64}


 82%|████████▏ | 103000/124858 [10:48:04<2:14:53,  2.70it/s]

{'loss': 3.3496, 'learning_rate': 3.5012574284387067e-06, 'epoch': 1.65}


 83%|████████▎ | 103500/124858 [10:51:09<2:11:46,  2.70it/s]

{'loss': 3.2858, 'learning_rate': 3.421166445081613e-06, 'epoch': 1.66}


 83%|████████▎ | 104000/124858 [10:54:15<2:08:38,  2.70it/s]

{'loss': 3.3446, 'learning_rate': 3.3410754617245193e-06, 'epoch': 1.67}


 84%|████████▎ | 104500/124858 [10:57:20<2:05:36,  2.70it/s]

{'loss': 3.3772, 'learning_rate': 3.2609844783674255e-06, 'epoch': 1.67}


 84%|████████▍ | 105000/124858 [11:00:25<2:02:33,  2.70it/s]

{'loss': 3.3605, 'learning_rate': 3.1808934950103322e-06, 'epoch': 1.68}


 84%|████████▍ | 105500/124858 [11:03:30<1:59:24,  2.70it/s]

{'loss': 3.3096, 'learning_rate': 3.100802511653238e-06, 'epoch': 1.69}


 85%|████████▍ | 106000/124858 [11:06:35<1:56:20,  2.70it/s]

{'loss': 3.3343, 'learning_rate': 3.0207115282961448e-06, 'epoch': 1.7}


 85%|████████▌ | 106500/124858 [11:09:40<1:53:18,  2.70it/s]

{'loss': 3.3478, 'learning_rate': 2.940620544939051e-06, 'epoch': 1.71}


 86%|████████▌ | 107000/124858 [11:12:46<1:50:19,  2.70it/s]

{'loss': 3.3493, 'learning_rate': 2.8605295615819573e-06, 'epoch': 1.71}


 86%|████████▌ | 107500/124858 [11:15:51<1:47:09,  2.70it/s]

{'loss': 3.3389, 'learning_rate': 2.780438578224864e-06, 'epoch': 1.72}


 86%|████████▋ | 108000/124858 [11:18:56<1:44:01,  2.70it/s]

{'loss': 3.3462, 'learning_rate': 2.70034759486777e-06, 'epoch': 1.73}


 87%|████████▋ | 108500/124858 [11:22:01<1:40:57,  2.70it/s]

{'loss': 3.324, 'learning_rate': 2.6202566115106766e-06, 'epoch': 1.74}


 87%|████████▋ | 109000/124858 [11:25:06<1:37:53,  2.70it/s]

{'loss': 3.3499, 'learning_rate': 2.5401656281535824e-06, 'epoch': 1.75}


 88%|████████▊ | 109500/124858 [11:28:11<1:34:45,  2.70it/s]

{'loss': 3.3574, 'learning_rate': 2.460074644796489e-06, 'epoch': 1.75}


 88%|████████▊ | 110000/124858 [11:31:17<1:31:42,  2.70it/s]

{'loss': 3.3375, 'learning_rate': 2.3799836614393954e-06, 'epoch': 1.76}


 89%|████████▊ | 110500/124858 [11:34:23<1:28:34,  2.70it/s]

{'loss': 3.3347, 'learning_rate': 2.2998926780823016e-06, 'epoch': 1.77}


 89%|████████▉ | 111000/124858 [11:37:28<1:25:30,  2.70it/s]

{'loss': 3.3747, 'learning_rate': 2.219801694725208e-06, 'epoch': 1.78}


 89%|████████▉ | 111500/124858 [11:40:33<1:22:24,  2.70it/s]

{'loss': 3.3393, 'learning_rate': 2.139710711368114e-06, 'epoch': 1.79}


 90%|████████▉ | 112000/124858 [11:43:38<1:19:20,  2.70it/s]

{'loss': 3.3183, 'learning_rate': 2.0596197280110205e-06, 'epoch': 1.79}


 90%|█████████ | 112500/124858 [11:46:43<1:16:14,  2.70it/s]

{'loss': 3.3464, 'learning_rate': 1.979528744653927e-06, 'epoch': 1.8}


 91%|█████████ | 113000/124858 [11:49:49<1:13:10,  2.70it/s]

{'loss': 3.3505, 'learning_rate': 1.8994377612968334e-06, 'epoch': 1.81}


 91%|█████████ | 113500/124858 [11:52:54<1:10:05,  2.70it/s]

{'loss': 3.3301, 'learning_rate': 1.8193467779397397e-06, 'epoch': 1.82}


 91%|█████████▏| 114000/124858 [11:55:59<1:07:01,  2.70it/s]

{'loss': 3.3285, 'learning_rate': 1.739255794582646e-06, 'epoch': 1.83}


 92%|█████████▏| 114500/124858 [11:59:04<1:03:56,  2.70it/s]

{'loss': 3.3488, 'learning_rate': 1.6591648112255522e-06, 'epoch': 1.83}


 92%|█████████▏| 115000/124858 [12:02:09<1:00:50,  2.70it/s]

{'loss': 3.3732, 'learning_rate': 1.5790738278684587e-06, 'epoch': 1.84}


 93%|█████████▎| 115500/124858 [12:05:14<57:45,  2.70it/s]  

{'loss': 3.3191, 'learning_rate': 1.498982844511365e-06, 'epoch': 1.85}


 93%|█████████▎| 116000/124858 [12:08:19<54:40,  2.70it/s]

{'loss': 3.3681, 'learning_rate': 1.4188918611542713e-06, 'epoch': 1.86}


 93%|█████████▎| 116500/124858 [12:11:25<51:42,  2.69it/s]

{'loss': 3.3265, 'learning_rate': 1.3388008777971778e-06, 'epoch': 1.87}


 94%|█████████▎| 117000/124858 [12:14:30<48:46,  2.69it/s]

{'loss': 3.3147, 'learning_rate': 1.258709894440084e-06, 'epoch': 1.87}


 94%|█████████▍| 117500/124858 [12:17:36<45:32,  2.69it/s]

{'loss': 3.3293, 'learning_rate': 1.1786189110829905e-06, 'epoch': 1.88}


 95%|█████████▍| 118000/124858 [12:20:42<42:26,  2.69it/s]

{'loss': 3.3509, 'learning_rate': 1.0985279277258968e-06, 'epoch': 1.89}


 95%|█████████▍| 118500/124858 [12:23:47<39:20,  2.69it/s]

{'loss': 3.3478, 'learning_rate': 1.018436944368803e-06, 'epoch': 1.9}


 95%|█████████▌| 119000/124858 [12:26:52<36:08,  2.70it/s]

{'loss': 3.3742, 'learning_rate': 9.383459610117093e-07, 'epoch': 1.91}


 96%|█████████▌| 119500/124858 [12:29:57<33:03,  2.70it/s]

{'loss': 3.3446, 'learning_rate': 8.582549776546158e-07, 'epoch': 1.91}


 96%|█████████▌| 120000/124858 [12:33:02<29:57,  2.70it/s]

{'loss': 3.3914, 'learning_rate': 7.781639942975221e-07, 'epoch': 1.92}


 97%|█████████▋| 120500/124858 [12:36:09<26:52,  2.70it/s]

{'loss': 3.3867, 'learning_rate': 6.980730109404284e-07, 'epoch': 1.93}


 97%|█████████▋| 121000/124858 [12:39:14<23:48,  2.70it/s]

{'loss': 3.3367, 'learning_rate': 6.179820275833347e-07, 'epoch': 1.94}


 97%|█████████▋| 121500/124858 [12:42:19<20:44,  2.70it/s]

{'loss': 3.3605, 'learning_rate': 5.37891044226241e-07, 'epoch': 1.95}


 98%|█████████▊| 122000/124858 [12:45:25<17:39,  2.70it/s]

{'loss': 3.3663, 'learning_rate': 4.5780006086914743e-07, 'epoch': 1.95}


 98%|█████████▊| 122500/124858 [12:48:30<14:35,  2.69it/s]

{'loss': 3.3313, 'learning_rate': 3.777090775120537e-07, 'epoch': 1.96}


 99%|█████████▊| 123000/124858 [12:51:35<11:28,  2.70it/s]

{'loss': 3.3542, 'learning_rate': 2.9761809415496003e-07, 'epoch': 1.97}


 99%|█████████▉| 123500/124858 [12:54:41<08:23,  2.70it/s]

{'loss': 3.3372, 'learning_rate': 2.1752711079786638e-07, 'epoch': 1.98}


 99%|█████████▉| 124000/124858 [12:57:47<05:18,  2.69it/s]

{'loss': 3.3596, 'learning_rate': 1.3743612744077273e-07, 'epoch': 1.99}


100%|█████████▉| 124500/124858 [13:00:52<02:12,  2.70it/s]

{'loss': 3.3093, 'learning_rate': 5.734514408367906e-08, 'epoch': 1.99}


                                                          
100%|██████████| 124858/124858 [13:17:15<00:00,  2.61it/s]

{'eval_loss': 3.2779133319854736, 'eval_runtime': 850.3934, 'eval_samples_per_second': 16.315, 'eval_steps_per_second': 8.157, 'epoch': 2.0}
{'train_runtime': 47835.8148, 'train_samples_per_second': 5.22, 'train_steps_per_second': 2.61, 'train_loss': 3.4186777360722407, 'epoch': 2.0}





TrainOutput(global_step=124858, training_loss=3.4186777360722407, metrics={'train_runtime': 47835.8148, 'train_samples_per_second': 5.22, 'train_steps_per_second': 2.61, 'train_loss': 3.4186777360722407, 'epoch': 2.0})