# Installing Dependencies

# Import Libraries

In [27]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, load_metric

# Import Model

In [30]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Importing and Processing Dataset

In [34]:
import pandas as pd
from datasets import Dataset
import torch
from transformers import AutoTokenizer

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and set the padding token
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Replace with your model name
tokenizer.pad_token = tokenizer.eos_token

# Load your dataset from JSON
df_glaive = pd.read_json("hf://datasets/glaiveai/glaive-code-assistant/c9bc9129-eba0-4b10-8292-4ae70fc7fa0d.json")

# Convert the DataFrame to a HuggingFace Dataset
dataset = Dataset.from_pandas(df_glaive)

# Split the dataset into training and validation sets (90% train, 10% validation)
train_data = dataset.select([i for i in range(len(dataset)) if i % 10 != 0])  # 90% for training
val_data = dataset.select([i for i in range(len(dataset)) if i % 10 == 0])  # 10% for validation

# Tokenize the input and target sequences
def tokenize_function(examples):
    inputs = tokenizer(examples['question'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    labels = tokenizer(examples['answer'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': labels['input_ids']}

# Apply tokenization to the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

# Print a sample for debugging
print(train_data[0])
print(val_data[0])

Map:   0%|          | 0/122498 [00:00<?, ? examples/s]

Map:   0%|          | 0/13611 [00:00<?, ? examples/s]

{'question': "How can I install Python 3 on an AWS EC2 instance? I tried using the command `sudo yum install python3`, but I received an error message saying `No package python3 available.`. I searched online but didn't find a solution. Do I need to download and install it manually?", 'answer': "To install Python 3 on an AWS EC2 instance, you can use the Amazon Linux Extras Library. This library is a curated set of software that Amazon provides for the Amazon Linux 2 platform. It includes newer versions of software, like Python 3, that are not included in the default Amazon Linux 2 repositories. Here is a step by step process on how to do it:\n\n1. First, update your instance with the following command:\n\n```bash\nsudo yum update -y\n```\n\n2. Next, list available packages in the Amazon Linux Extras repository by typing:\n\n```bash\nsudo amazon-linux-extras list\n```\n\n3. You should see python3.8 available in the list. To install it, use the following command:\n\n```bash\nsudo amazon

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the pad_token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Load your dataset in a CSV file
dataset = load_dataset("csv", data_files="data.csv")

# Split the dataset into training and validation sets
train_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 != 0])  # Use 90% of the data for training
val_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 == 0])  # Use 10% of the data for validation

# Tokenize the input and target sequences
def tokenize_function(examples):
    inputs = tokenizer(examples['Bad_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    labels = tokenizer(examples['Good_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': labels['input_ids']}

# Apply tokenization to the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

# Train Model

In [35]:
# Example input for pre-trained model
input_text = """Hi how are you?"""

# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate output
model.to(device)
pre_training_output = model.generate(
    input_ids,
    max_length=300,
    num_return_sequences=1,
    temperature=0.7,
    top_k=50,
)

# Decode and print the result
decoded_output = tokenizer.decode(pre_training_output[0], skip_special_tokens=True)
print("Output before training:")
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Output before training:
Hi how are you?

I'm a little bit of a nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm a big nerd. I'm


In [40]:
# Example input for pre-trained model
input_text = """what is the mistake in this python code here? a = [1,2,3,4,5,6)
 for x in a 
 print(x)"""

# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate output
model.to(device)
pre_training_output = model.generate(
    input_ids,
    max_length=100,
    num_return_sequences=1,
    temperature=0.7,
    top_k=50,
)

# Decode and print the result
decoded_output = tokenizer.decode(pre_training_output[0], skip_special_tokens=True)
print("Output before training:")
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Output before training:
what is the mistake in this python code here? a = [1,2,3,4,5,6)
 for x in a 
 print(x)

print(x)

print(x)

print(x)

print(x)

print(x)

print(x)

print(x)

print(x)

print(x)

print(x)

print(


In [41]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./gpt2_glaive',
    overwrite_output_dir=True,
    num_train_epochs=0.5,
    per_device_train_batch_size=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_steps=100,
    logging_dir='./logs',
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Fine-tune the model
trainer.train()



  0%|          | 0/30625 [00:00<?, ?it/s]

{'loss': 4.9745, 'grad_norm': 38.67289733886719, 'learning_rate': 4.983673469387755e-05, 'epoch': 0.0}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.5602641105651855, 'eval_runtime': 111.29, 'eval_samples_per_second': 122.302, 'eval_steps_per_second': 15.293, 'epoch': 0.0}
{'loss': 4.5466, 'grad_norm': 33.64840316772461, 'learning_rate': 4.967346938775511e-05, 'epoch': 0.0}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.46965217590332, 'eval_runtime': 111.5692, 'eval_samples_per_second': 121.996, 'eval_steps_per_second': 15.255, 'epoch': 0.0}
{'loss': 4.6061, 'grad_norm': 17.511735916137695, 'learning_rate': 4.9510204081632655e-05, 'epoch': 0.0}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.373347282409668, 'eval_runtime': 111.6196, 'eval_samples_per_second': 121.941, 'eval_steps_per_second': 15.248, 'epoch': 0.0}
{'loss': 4.5174, 'grad_norm': 36.45682907104492, 'learning_rate': 4.93469387755102e-05, 'epoch': 0.01}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.361853122711182, 'eval_runtime': 111.6429, 'eval_samples_per_second': 121.916, 'eval_steps_per_second': 15.245, 'epoch': 0.01}
{'loss': 4.4957, 'grad_norm': 49.092288970947266, 'learning_rate': 4.918367346938776e-05, 'epoch': 0.01}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.9009599685668945, 'eval_runtime': 110.8343, 'eval_samples_per_second': 122.805, 'eval_steps_per_second': 15.356, 'epoch': 0.01}
{'loss': 4.4774, 'grad_norm': 18.7022705078125, 'learning_rate': 4.902040816326531e-05, 'epoch': 0.01}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.294500827789307, 'eval_runtime': 111.6508, 'eval_samples_per_second': 121.907, 'eval_steps_per_second': 15.244, 'epoch': 0.01}
{'loss': 4.4391, 'grad_norm': 12.33527660369873, 'learning_rate': 4.885714285714286e-05, 'epoch': 0.01}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.301912307739258, 'eval_runtime': 111.411, 'eval_samples_per_second': 122.169, 'eval_steps_per_second': 15.277, 'epoch': 0.01}
{'loss': 4.3404, 'grad_norm': 28.901321411132812, 'learning_rate': 4.869387755102041e-05, 'epoch': 0.01}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.296668529510498, 'eval_runtime': 111.1942, 'eval_samples_per_second': 122.407, 'eval_steps_per_second': 15.307, 'epoch': 0.01}
{'loss': 4.3669, 'grad_norm': 9.255460739135742, 'learning_rate': 4.853061224489796e-05, 'epoch': 0.01}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.222559928894043, 'eval_runtime': 111.4684, 'eval_samples_per_second': 122.106, 'eval_steps_per_second': 15.269, 'epoch': 0.01}
{'loss': 4.1347, 'grad_norm': 5.199283123016357, 'learning_rate': 4.836734693877551e-05, 'epoch': 0.02}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.220297336578369, 'eval_runtime': 111.5953, 'eval_samples_per_second': 121.967, 'eval_steps_per_second': 15.252, 'epoch': 0.02}
{'loss': 4.2863, 'grad_norm': 20.783721923828125, 'learning_rate': 4.8204081632653065e-05, 'epoch': 0.02}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.243503093719482, 'eval_runtime': 111.5886, 'eval_samples_per_second': 121.975, 'eval_steps_per_second': 15.252, 'epoch': 0.02}
{'loss': 4.3054, 'grad_norm': 6.754384517669678, 'learning_rate': 4.804081632653061e-05, 'epoch': 0.02}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.220836639404297, 'eval_runtime': 110.7251, 'eval_samples_per_second': 122.926, 'eval_steps_per_second': 15.371, 'epoch': 0.02}
{'loss': 4.2202, 'grad_norm': 22.855955123901367, 'learning_rate': 4.787755102040817e-05, 'epoch': 0.02}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.207033157348633, 'eval_runtime': 109.5817, 'eval_samples_per_second': 124.209, 'eval_steps_per_second': 15.532, 'epoch': 0.02}
{'loss': 4.2691, 'grad_norm': 26.771291732788086, 'learning_rate': 4.771428571428572e-05, 'epoch': 0.02}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.223531723022461, 'eval_runtime': 109.4897, 'eval_samples_per_second': 124.313, 'eval_steps_per_second': 15.545, 'epoch': 0.02}
{'loss': 4.297, 'grad_norm': 8.300406455993652, 'learning_rate': 4.7551020408163263e-05, 'epoch': 0.02}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.210755825042725, 'eval_runtime': 109.5846, 'eval_samples_per_second': 124.205, 'eval_steps_per_second': 15.531, 'epoch': 0.02}
{'loss': 4.1531, 'grad_norm': 9.440519332885742, 'learning_rate': 4.738775510204082e-05, 'epoch': 0.03}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.229766845703125, 'eval_runtime': 109.4995, 'eval_samples_per_second': 124.302, 'eval_steps_per_second': 15.543, 'epoch': 0.03}
{'loss': 4.1474, 'grad_norm': 5.966073036193848, 'learning_rate': 4.722448979591837e-05, 'epoch': 0.03}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.196539878845215, 'eval_runtime': 109.5675, 'eval_samples_per_second': 124.225, 'eval_steps_per_second': 15.534, 'epoch': 0.03}
{'loss': 4.2709, 'grad_norm': 8.998388290405273, 'learning_rate': 4.7061224489795916e-05, 'epoch': 0.03}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.174525737762451, 'eval_runtime': 109.5312, 'eval_samples_per_second': 124.266, 'eval_steps_per_second': 15.539, 'epoch': 0.03}
{'loss': 4.2288, 'grad_norm': 8.450685501098633, 'learning_rate': 4.6897959183673475e-05, 'epoch': 0.03}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.145242214202881, 'eval_runtime': 109.5796, 'eval_samples_per_second': 124.211, 'eval_steps_per_second': 15.532, 'epoch': 0.03}
{'loss': 4.2263, 'grad_norm': 5.533444881439209, 'learning_rate': 4.673469387755102e-05, 'epoch': 0.03}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.137927532196045, 'eval_runtime': 109.5046, 'eval_samples_per_second': 124.296, 'eval_steps_per_second': 15.543, 'epoch': 0.03}
{'loss': 4.2551, 'grad_norm': 20.7500057220459, 'learning_rate': 4.6571428571428575e-05, 'epoch': 0.03}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.363005638122559, 'eval_runtime': 109.5363, 'eval_samples_per_second': 124.26, 'eval_steps_per_second': 15.538, 'epoch': 0.03}
{'loss': 4.0801, 'grad_norm': 6.6810526847839355, 'learning_rate': 4.640816326530613e-05, 'epoch': 0.04}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.1602373123168945, 'eval_runtime': 109.636, 'eval_samples_per_second': 124.147, 'eval_steps_per_second': 15.524, 'epoch': 0.04}
{'loss': 4.0953, 'grad_norm': 13.332316398620605, 'learning_rate': 4.6244897959183674e-05, 'epoch': 0.04}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.207216739654541, 'eval_runtime': 109.5325, 'eval_samples_per_second': 124.264, 'eval_steps_per_second': 15.539, 'epoch': 0.04}
{'loss': 4.1675, 'grad_norm': 2.4331681728363037, 'learning_rate': 4.608163265306123e-05, 'epoch': 0.04}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.14959716796875, 'eval_runtime': 109.5797, 'eval_samples_per_second': 124.211, 'eval_steps_per_second': 15.532, 'epoch': 0.04}
{'loss': 4.1151, 'grad_norm': 13.208364486694336, 'learning_rate': 4.591836734693878e-05, 'epoch': 0.04}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.348509311676025, 'eval_runtime': 109.6369, 'eval_samples_per_second': 124.146, 'eval_steps_per_second': 15.524, 'epoch': 0.04}
{'loss': 4.225, 'grad_norm': 3.899728775024414, 'learning_rate': 4.5755102040816326e-05, 'epoch': 0.04}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.133893966674805, 'eval_runtime': 109.5619, 'eval_samples_per_second': 124.231, 'eval_steps_per_second': 15.535, 'epoch': 0.04}
{'loss': 4.2497, 'grad_norm': 5.247395038604736, 'learning_rate': 4.559183673469388e-05, 'epoch': 0.04}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.116611957550049, 'eval_runtime': 109.5988, 'eval_samples_per_second': 124.189, 'eval_steps_per_second': 15.529, 'epoch': 0.04}
{'loss': 4.2981, 'grad_norm': 6.629566192626953, 'learning_rate': 4.542857142857143e-05, 'epoch': 0.05}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.117306709289551, 'eval_runtime': 109.6027, 'eval_samples_per_second': 124.185, 'eval_steps_per_second': 15.529, 'epoch': 0.05}
{'loss': 4.1416, 'grad_norm': 7.947585105895996, 'learning_rate': 4.526530612244898e-05, 'epoch': 0.05}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.159780502319336, 'eval_runtime': 109.5032, 'eval_samples_per_second': 124.298, 'eval_steps_per_second': 15.543, 'epoch': 0.05}
{'loss': 4.1711, 'grad_norm': 3.4660732746124268, 'learning_rate': 4.510204081632654e-05, 'epoch': 0.05}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.112354755401611, 'eval_runtime': 109.531, 'eval_samples_per_second': 124.266, 'eval_steps_per_second': 15.539, 'epoch': 0.05}
{'loss': 4.1426, 'grad_norm': 3.0599429607391357, 'learning_rate': 4.4938775510204084e-05, 'epoch': 0.05}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.110419273376465, 'eval_runtime': 109.5872, 'eval_samples_per_second': 124.202, 'eval_steps_per_second': 15.531, 'epoch': 0.05}
{'loss': 4.1325, 'grad_norm': 6.1060285568237305, 'learning_rate': 4.477551020408163e-05, 'epoch': 0.05}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.095597743988037, 'eval_runtime': 109.553, 'eval_samples_per_second': 124.241, 'eval_steps_per_second': 15.536, 'epoch': 0.05}
{'loss': 4.0609, 'grad_norm': 11.919713973999023, 'learning_rate': 4.461224489795919e-05, 'epoch': 0.05}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.068836212158203, 'eval_runtime': 109.4607, 'eval_samples_per_second': 124.346, 'eval_steps_per_second': 15.549, 'epoch': 0.05}
{'loss': 4.0534, 'grad_norm': 4.450563907623291, 'learning_rate': 4.4448979591836737e-05, 'epoch': 0.06}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.107717990875244, 'eval_runtime': 109.5235, 'eval_samples_per_second': 124.275, 'eval_steps_per_second': 15.54, 'epoch': 0.06}
{'loss': 4.4088, 'grad_norm': 4.731905937194824, 'learning_rate': 4.428571428571428e-05, 'epoch': 0.06}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.119265556335449, 'eval_runtime': 109.5752, 'eval_samples_per_second': 124.216, 'eval_steps_per_second': 15.533, 'epoch': 0.06}
{'loss': 3.9978, 'grad_norm': 3.4483375549316406, 'learning_rate': 4.412244897959184e-05, 'epoch': 0.06}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.12792444229126, 'eval_runtime': 109.6135, 'eval_samples_per_second': 124.173, 'eval_steps_per_second': 15.527, 'epoch': 0.06}
{'loss': 4.2172, 'grad_norm': 4.733151435852051, 'learning_rate': 4.395918367346939e-05, 'epoch': 0.06}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.0758376121521, 'eval_runtime': 109.617, 'eval_samples_per_second': 124.169, 'eval_steps_per_second': 15.527, 'epoch': 0.06}
{'loss': 4.218, 'grad_norm': 2.4679746627807617, 'learning_rate': 4.379591836734694e-05, 'epoch': 0.06}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.099019527435303, 'eval_runtime': 109.5383, 'eval_samples_per_second': 124.258, 'eval_steps_per_second': 15.538, 'epoch': 0.06}
{'loss': 4.0344, 'grad_norm': 4.772653579711914, 'learning_rate': 4.3632653061224495e-05, 'epoch': 0.06}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.07487154006958, 'eval_runtime': 109.5907, 'eval_samples_per_second': 124.199, 'eval_steps_per_second': 15.531, 'epoch': 0.06}
{'loss': 4.1976, 'grad_norm': 6.565102577209473, 'learning_rate': 4.346938775510204e-05, 'epoch': 0.07}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.071948528289795, 'eval_runtime': 109.5269, 'eval_samples_per_second': 124.271, 'eval_steps_per_second': 15.54, 'epoch': 0.07}
{'loss': 4.26, 'grad_norm': 4.243940353393555, 'learning_rate': 4.3306122448979594e-05, 'epoch': 0.07}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.095213890075684, 'eval_runtime': 109.6008, 'eval_samples_per_second': 124.187, 'eval_steps_per_second': 15.529, 'epoch': 0.07}
{'loss': 4.1229, 'grad_norm': 3.787357807159424, 'learning_rate': 4.314285714285715e-05, 'epoch': 0.07}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.044898986816406, 'eval_runtime': 109.6119, 'eval_samples_per_second': 124.175, 'eval_steps_per_second': 15.528, 'epoch': 0.07}
{'loss': 4.2724, 'grad_norm': 5.557543754577637, 'learning_rate': 4.297959183673469e-05, 'epoch': 0.07}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.0657477378845215, 'eval_runtime': 109.6368, 'eval_samples_per_second': 124.146, 'eval_steps_per_second': 15.524, 'epoch': 0.07}
{'loss': 4.23, 'grad_norm': 2.291269063949585, 'learning_rate': 4.281632653061225e-05, 'epoch': 0.07}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.0535783767700195, 'eval_runtime': 109.5817, 'eval_samples_per_second': 124.209, 'eval_steps_per_second': 15.532, 'epoch': 0.07}
{'loss': 4.0713, 'grad_norm': 6.41731595993042, 'learning_rate': 4.26530612244898e-05, 'epoch': 0.07}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.069117546081543, 'eval_runtime': 109.6299, 'eval_samples_per_second': 124.154, 'eval_steps_per_second': 15.525, 'epoch': 0.07}
{'loss': 4.0067, 'grad_norm': 2.8631091117858887, 'learning_rate': 4.2489795918367345e-05, 'epoch': 0.08}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.0461039543151855, 'eval_runtime': 109.5428, 'eval_samples_per_second': 124.253, 'eval_steps_per_second': 15.537, 'epoch': 0.08}
{'loss': 4.1803, 'grad_norm': 9.380488395690918, 'learning_rate': 4.2326530612244905e-05, 'epoch': 0.08}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.057071685791016, 'eval_runtime': 109.5603, 'eval_samples_per_second': 124.233, 'eval_steps_per_second': 15.535, 'epoch': 0.08}
{'loss': 4.0202, 'grad_norm': 5.051041603088379, 'learning_rate': 4.216326530612245e-05, 'epoch': 0.08}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.081528663635254, 'eval_runtime': 109.5385, 'eval_samples_per_second': 124.258, 'eval_steps_per_second': 15.538, 'epoch': 0.08}
{'loss': 4.1852, 'grad_norm': 8.982549667358398, 'learning_rate': 4.2e-05, 'epoch': 0.08}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.098198890686035, 'eval_runtime': 109.5157, 'eval_samples_per_second': 124.283, 'eval_steps_per_second': 15.541, 'epoch': 0.08}
{'loss': 4.1755, 'grad_norm': 3.6063663959503174, 'learning_rate': 4.183673469387756e-05, 'epoch': 0.08}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.040622234344482, 'eval_runtime': 109.5484, 'eval_samples_per_second': 124.246, 'eval_steps_per_second': 15.537, 'epoch': 0.08}
{'loss': 4.0325, 'grad_norm': 7.212335586547852, 'learning_rate': 4.1673469387755104e-05, 'epoch': 0.08}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.027671813964844, 'eval_runtime': 109.5383, 'eval_samples_per_second': 124.258, 'eval_steps_per_second': 15.538, 'epoch': 0.08}
{'loss': 4.104, 'grad_norm': 2.3704311847686768, 'learning_rate': 4.151020408163265e-05, 'epoch': 0.08}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.0323967933654785, 'eval_runtime': 109.554, 'eval_samples_per_second': 124.24, 'eval_steps_per_second': 15.536, 'epoch': 0.08}
{'loss': 4.0576, 'grad_norm': 6.652912139892578, 'learning_rate': 4.134693877551021e-05, 'epoch': 0.09}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.0538859367370605, 'eval_runtime': 109.526, 'eval_samples_per_second': 124.272, 'eval_steps_per_second': 15.54, 'epoch': 0.09}
{'loss': 3.986, 'grad_norm': 2.738908529281616, 'learning_rate': 4.1183673469387756e-05, 'epoch': 0.09}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.02589225769043, 'eval_runtime': 109.5269, 'eval_samples_per_second': 124.271, 'eval_steps_per_second': 15.54, 'epoch': 0.09}
{'loss': 4.1869, 'grad_norm': 3.0372278690338135, 'learning_rate': 4.102040816326531e-05, 'epoch': 0.09}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.019473552703857, 'eval_runtime': 109.5516, 'eval_samples_per_second': 124.243, 'eval_steps_per_second': 15.536, 'epoch': 0.09}
{'loss': 4.1128, 'grad_norm': 7.767414569854736, 'learning_rate': 4.085714285714286e-05, 'epoch': 0.09}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.040103912353516, 'eval_runtime': 110.8279, 'eval_samples_per_second': 122.812, 'eval_steps_per_second': 15.357, 'epoch': 0.09}
{'loss': 3.9369, 'grad_norm': 2.3988208770751953, 'learning_rate': 4.069387755102041e-05, 'epoch': 0.09}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.032371997833252, 'eval_runtime': 110.6005, 'eval_samples_per_second': 123.065, 'eval_steps_per_second': 15.389, 'epoch': 0.09}
{'loss': 4.1358, 'grad_norm': 3.522266387939453, 'learning_rate': 4.053061224489796e-05, 'epoch': 0.09}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.021040916442871, 'eval_runtime': 109.5849, 'eval_samples_per_second': 124.205, 'eval_steps_per_second': 15.531, 'epoch': 0.09}
{'loss': 4.0765, 'grad_norm': 5.209183692932129, 'learning_rate': 4.0367346938775514e-05, 'epoch': 0.1}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.024949550628662, 'eval_runtime': 109.6045, 'eval_samples_per_second': 124.183, 'eval_steps_per_second': 15.529, 'epoch': 0.1}
{'loss': 4.3145, 'grad_norm': 2.847734212875366, 'learning_rate': 4.020408163265306e-05, 'epoch': 0.1}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.023052215576172, 'eval_runtime': 109.6039, 'eval_samples_per_second': 124.184, 'eval_steps_per_second': 15.529, 'epoch': 0.1}
{'loss': 4.024, 'grad_norm': 7.358754634857178, 'learning_rate': 4.004081632653062e-05, 'epoch': 0.1}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.028507709503174, 'eval_runtime': 109.6037, 'eval_samples_per_second': 124.184, 'eval_steps_per_second': 15.529, 'epoch': 0.1}
{'loss': 4.0476, 'grad_norm': 4.818171977996826, 'learning_rate': 3.9877551020408166e-05, 'epoch': 0.1}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.012923717498779, 'eval_runtime': 109.5898, 'eval_samples_per_second': 124.2, 'eval_steps_per_second': 15.531, 'epoch': 0.1}
{'loss': 4.0125, 'grad_norm': 2.971646547317505, 'learning_rate': 3.971428571428571e-05, 'epoch': 0.1}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.000617027282715, 'eval_runtime': 109.5888, 'eval_samples_per_second': 124.201, 'eval_steps_per_second': 15.531, 'epoch': 0.1}
{'loss': 4.114, 'grad_norm': 2.0294923782348633, 'learning_rate': 3.955102040816327e-05, 'epoch': 0.1}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.014309883117676, 'eval_runtime': 109.5004, 'eval_samples_per_second': 124.301, 'eval_steps_per_second': 15.543, 'epoch': 0.1}
{'loss': 4.029, 'grad_norm': 4.493895530700684, 'learning_rate': 3.938775510204082e-05, 'epoch': 0.11}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.001533031463623, 'eval_runtime': 109.5809, 'eval_samples_per_second': 124.21, 'eval_steps_per_second': 15.532, 'epoch': 0.11}
{'loss': 4.0195, 'grad_norm': 2.7981343269348145, 'learning_rate': 3.9224489795918365e-05, 'epoch': 0.11}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.012678146362305, 'eval_runtime': 109.5177, 'eval_samples_per_second': 124.281, 'eval_steps_per_second': 15.541, 'epoch': 0.11}
{'loss': 4.206, 'grad_norm': 3.8193840980529785, 'learning_rate': 3.9061224489795925e-05, 'epoch': 0.11}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.010492324829102, 'eval_runtime': 109.4738, 'eval_samples_per_second': 124.331, 'eval_steps_per_second': 15.547, 'epoch': 0.11}
{'loss': 4.0185, 'grad_norm': 2.617654323577881, 'learning_rate': 3.889795918367347e-05, 'epoch': 0.11}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.000938892364502, 'eval_runtime': 109.4921, 'eval_samples_per_second': 124.31, 'eval_steps_per_second': 15.545, 'epoch': 0.11}
{'loss': 4.0195, 'grad_norm': 3.1409425735473633, 'learning_rate': 3.8734693877551024e-05, 'epoch': 0.11}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.0092058181762695, 'eval_runtime': 109.5165, 'eval_samples_per_second': 124.283, 'eval_steps_per_second': 15.541, 'epoch': 0.11}
{'loss': 4.2147, 'grad_norm': 3.8207006454467773, 'learning_rate': 3.857142857142858e-05, 'epoch': 0.11}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.015017509460449, 'eval_runtime': 109.5017, 'eval_samples_per_second': 124.299, 'eval_steps_per_second': 15.543, 'epoch': 0.11}
{'loss': 4.107, 'grad_norm': 4.299478054046631, 'learning_rate': 3.840816326530612e-05, 'epoch': 0.12}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.027975082397461, 'eval_runtime': 109.71, 'eval_samples_per_second': 124.063, 'eval_steps_per_second': 15.514, 'epoch': 0.12}
{'loss': 4.0567, 'grad_norm': 4.485382080078125, 'learning_rate': 3.8244897959183676e-05, 'epoch': 0.12}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.018169403076172, 'eval_runtime': 109.5052, 'eval_samples_per_second': 124.295, 'eval_steps_per_second': 15.543, 'epoch': 0.12}
{'loss': 4.1122, 'grad_norm': 3.8289167881011963, 'learning_rate': 3.808163265306123e-05, 'epoch': 0.12}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.027092456817627, 'eval_runtime': 109.4962, 'eval_samples_per_second': 124.306, 'eval_steps_per_second': 15.544, 'epoch': 0.12}
{'loss': 3.9401, 'grad_norm': 3.1419408321380615, 'learning_rate': 3.7918367346938775e-05, 'epoch': 0.12}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.040889263153076, 'eval_runtime': 109.4671, 'eval_samples_per_second': 124.339, 'eval_steps_per_second': 15.548, 'epoch': 0.12}
{'loss': 3.8952, 'grad_norm': 3.175879716873169, 'learning_rate': 3.775510204081633e-05, 'epoch': 0.12}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.010098457336426, 'eval_runtime': 109.5117, 'eval_samples_per_second': 124.288, 'eval_steps_per_second': 15.542, 'epoch': 0.12}
{'loss': 3.9851, 'grad_norm': 3.3393962383270264, 'learning_rate': 3.759183673469388e-05, 'epoch': 0.12}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.985342502593994, 'eval_runtime': 109.5447, 'eval_samples_per_second': 124.251, 'eval_steps_per_second': 15.537, 'epoch': 0.12}
{'loss': 4.0861, 'grad_norm': 2.089745283126831, 'learning_rate': 3.742857142857143e-05, 'epoch': 0.13}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.009171485900879, 'eval_runtime': 109.5841, 'eval_samples_per_second': 124.206, 'eval_steps_per_second': 15.531, 'epoch': 0.13}
{'loss': 4.0995, 'grad_norm': 5.585481643676758, 'learning_rate': 3.726530612244899e-05, 'epoch': 0.13}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9912455081939697, 'eval_runtime': 109.6068, 'eval_samples_per_second': 124.18, 'eval_steps_per_second': 15.528, 'epoch': 0.13}
{'loss': 4.0191, 'grad_norm': 3.654374122619629, 'learning_rate': 3.7102040816326533e-05, 'epoch': 0.13}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9847769737243652, 'eval_runtime': 109.5893, 'eval_samples_per_second': 124.2, 'eval_steps_per_second': 15.531, 'epoch': 0.13}
{'loss': 3.9891, 'grad_norm': 5.228717803955078, 'learning_rate': 3.693877551020408e-05, 'epoch': 0.13}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.006066799163818, 'eval_runtime': 109.5871, 'eval_samples_per_second': 124.203, 'eval_steps_per_second': 15.531, 'epoch': 0.13}
{'loss': 3.9369, 'grad_norm': 17.643953323364258, 'learning_rate': 3.677551020408164e-05, 'epoch': 0.13}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.994655132293701, 'eval_runtime': 109.5377, 'eval_samples_per_second': 124.259, 'eval_steps_per_second': 15.538, 'epoch': 0.13}
{'loss': 3.9048, 'grad_norm': 5.819009304046631, 'learning_rate': 3.6612244897959186e-05, 'epoch': 0.13}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.981016159057617, 'eval_runtime': 109.5587, 'eval_samples_per_second': 124.235, 'eval_steps_per_second': 15.535, 'epoch': 0.13}
{'loss': 4.0356, 'grad_norm': 1.6630903482437134, 'learning_rate': 3.644897959183673e-05, 'epoch': 0.14}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9871022701263428, 'eval_runtime': 109.5709, 'eval_samples_per_second': 124.221, 'eval_steps_per_second': 15.533, 'epoch': 0.14}
{'loss': 4.0776, 'grad_norm': 3.6497743129730225, 'learning_rate': 3.628571428571429e-05, 'epoch': 0.14}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9906599521636963, 'eval_runtime': 109.5731, 'eval_samples_per_second': 124.218, 'eval_steps_per_second': 15.533, 'epoch': 0.14}
{'loss': 3.7693, 'grad_norm': 4.460656642913818, 'learning_rate': 3.612244897959184e-05, 'epoch': 0.14}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.985089063644409, 'eval_runtime': 109.6105, 'eval_samples_per_second': 124.176, 'eval_steps_per_second': 15.528, 'epoch': 0.14}
{'loss': 3.9655, 'grad_norm': 2.5629570484161377, 'learning_rate': 3.595918367346939e-05, 'epoch': 0.14}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.979598045349121, 'eval_runtime': 109.6327, 'eval_samples_per_second': 124.151, 'eval_steps_per_second': 15.525, 'epoch': 0.14}
{'loss': 4.0646, 'grad_norm': 4.0319037437438965, 'learning_rate': 3.5795918367346944e-05, 'epoch': 0.14}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9874813556671143, 'eval_runtime': 109.6431, 'eval_samples_per_second': 124.139, 'eval_steps_per_second': 15.523, 'epoch': 0.14}
{'loss': 4.0197, 'grad_norm': 1.7747122049331665, 'learning_rate': 3.563265306122449e-05, 'epoch': 0.14}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.988030195236206, 'eval_runtime': 109.6013, 'eval_samples_per_second': 124.187, 'eval_steps_per_second': 15.529, 'epoch': 0.14}
{'loss': 4.0926, 'grad_norm': 2.0107805728912354, 'learning_rate': 3.546938775510204e-05, 'epoch': 0.15}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9755423069000244, 'eval_runtime': 109.5763, 'eval_samples_per_second': 124.215, 'eval_steps_per_second': 15.533, 'epoch': 0.15}
{'loss': 3.8514, 'grad_norm': 3.3216347694396973, 'learning_rate': 3.5306122448979596e-05, 'epoch': 0.15}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.00956392288208, 'eval_runtime': 109.5963, 'eval_samples_per_second': 124.192, 'eval_steps_per_second': 15.53, 'epoch': 0.15}
{'loss': 4.0537, 'grad_norm': 2.0142462253570557, 'learning_rate': 3.514285714285714e-05, 'epoch': 0.15}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9777133464813232, 'eval_runtime': 109.6189, 'eval_samples_per_second': 124.167, 'eval_steps_per_second': 15.527, 'epoch': 0.15}
{'loss': 3.9972, 'grad_norm': 1.7710660696029663, 'learning_rate': 3.4979591836734695e-05, 'epoch': 0.15}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.023947715759277, 'eval_runtime': 109.6149, 'eval_samples_per_second': 124.171, 'eval_steps_per_second': 15.527, 'epoch': 0.15}
{'loss': 4.12, 'grad_norm': 4.356527805328369, 'learning_rate': 3.481632653061225e-05, 'epoch': 0.15}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.97652530670166, 'eval_runtime': 109.5473, 'eval_samples_per_second': 124.248, 'eval_steps_per_second': 15.537, 'epoch': 0.15}
{'loss': 3.9678, 'grad_norm': 7.051150798797607, 'learning_rate': 3.4653061224489795e-05, 'epoch': 0.15}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.975703716278076, 'eval_runtime': 109.5421, 'eval_samples_per_second': 124.254, 'eval_steps_per_second': 15.537, 'epoch': 0.15}
{'loss': 4.0749, 'grad_norm': 4.758042335510254, 'learning_rate': 3.4489795918367354e-05, 'epoch': 0.16}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9756946563720703, 'eval_runtime': 109.5857, 'eval_samples_per_second': 124.204, 'eval_steps_per_second': 15.531, 'epoch': 0.16}
{'loss': 4.0048, 'grad_norm': 3.4862775802612305, 'learning_rate': 3.43265306122449e-05, 'epoch': 0.16}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9742419719696045, 'eval_runtime': 109.5928, 'eval_samples_per_second': 124.196, 'eval_steps_per_second': 15.53, 'epoch': 0.16}
{'loss': 3.986, 'grad_norm': 1.7065801620483398, 'learning_rate': 3.416326530612245e-05, 'epoch': 0.16}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9735803604125977, 'eval_runtime': 109.6797, 'eval_samples_per_second': 124.098, 'eval_steps_per_second': 15.518, 'epoch': 0.16}
{'loss': 3.8687, 'grad_norm': 3.1336231231689453, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.16}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9897584915161133, 'eval_runtime': 109.6645, 'eval_samples_per_second': 124.115, 'eval_steps_per_second': 15.52, 'epoch': 0.16}
{'loss': 4.0095, 'grad_norm': 4.4796142578125, 'learning_rate': 3.383673469387755e-05, 'epoch': 0.16}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 4.005974769592285, 'eval_runtime': 109.6504, 'eval_samples_per_second': 124.131, 'eval_steps_per_second': 15.522, 'epoch': 0.16}
{'loss': 3.9166, 'grad_norm': 2.862626791000366, 'learning_rate': 3.36734693877551e-05, 'epoch': 0.16}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9752705097198486, 'eval_runtime': 109.6174, 'eval_samples_per_second': 124.168, 'eval_steps_per_second': 15.527, 'epoch': 0.16}
{'loss': 4.1396, 'grad_norm': 7.394679546356201, 'learning_rate': 3.351020408163266e-05, 'epoch': 0.16}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9698221683502197, 'eval_runtime': 109.6348, 'eval_samples_per_second': 124.149, 'eval_steps_per_second': 15.524, 'epoch': 0.16}
{'loss': 3.9334, 'grad_norm': 7.905846118927002, 'learning_rate': 3.3346938775510205e-05, 'epoch': 0.17}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9786951541900635, 'eval_runtime': 109.6201, 'eval_samples_per_second': 124.165, 'eval_steps_per_second': 15.526, 'epoch': 0.17}
{'loss': 3.8805, 'grad_norm': 1.856276512145996, 'learning_rate': 3.318367346938776e-05, 'epoch': 0.17}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9898669719696045, 'eval_runtime': 109.6264, 'eval_samples_per_second': 124.158, 'eval_steps_per_second': 15.525, 'epoch': 0.17}
{'loss': 3.8258, 'grad_norm': 1.6188172101974487, 'learning_rate': 3.302040816326531e-05, 'epoch': 0.17}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9695801734924316, 'eval_runtime': 109.6859, 'eval_samples_per_second': 124.091, 'eval_steps_per_second': 15.517, 'epoch': 0.17}
{'loss': 3.9814, 'grad_norm': 3.7882423400878906, 'learning_rate': 3.285714285714286e-05, 'epoch': 0.17}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9569196701049805, 'eval_runtime': 109.5726, 'eval_samples_per_second': 124.219, 'eval_steps_per_second': 15.533, 'epoch': 0.17}
{'loss': 4.0668, 'grad_norm': 4.8499979972839355, 'learning_rate': 3.269387755102041e-05, 'epoch': 0.17}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9648685455322266, 'eval_runtime': 109.6781, 'eval_samples_per_second': 124.1, 'eval_steps_per_second': 15.518, 'epoch': 0.17}
{'loss': 4.0959, 'grad_norm': 5.825951099395752, 'learning_rate': 3.253061224489796e-05, 'epoch': 0.17}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9638001918792725, 'eval_runtime': 109.6467, 'eval_samples_per_second': 124.135, 'eval_steps_per_second': 15.523, 'epoch': 0.17}
{'loss': 3.9219, 'grad_norm': 2.193222999572754, 'learning_rate': 3.236734693877551e-05, 'epoch': 0.18}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.970899820327759, 'eval_runtime': 109.6326, 'eval_samples_per_second': 124.151, 'eval_steps_per_second': 15.525, 'epoch': 0.18}
{'loss': 3.9806, 'grad_norm': 2.2301106452941895, 'learning_rate': 3.220408163265306e-05, 'epoch': 0.18}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9491333961486816, 'eval_runtime': 109.5838, 'eval_samples_per_second': 124.206, 'eval_steps_per_second': 15.531, 'epoch': 0.18}
{'loss': 4.0017, 'grad_norm': 5.2124104499816895, 'learning_rate': 3.2040816326530615e-05, 'epoch': 0.18}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.972430467605591, 'eval_runtime': 109.5794, 'eval_samples_per_second': 124.211, 'eval_steps_per_second': 15.532, 'epoch': 0.18}
{'loss': 3.9525, 'grad_norm': 5.154917240142822, 'learning_rate': 3.187755102040816e-05, 'epoch': 0.18}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9516823291778564, 'eval_runtime': 109.5891, 'eval_samples_per_second': 124.2, 'eval_steps_per_second': 15.531, 'epoch': 0.18}
{'loss': 3.9963, 'grad_norm': 2.9394686222076416, 'learning_rate': 3.1714285714285715e-05, 'epoch': 0.18}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9693291187286377, 'eval_runtime': 109.6337, 'eval_samples_per_second': 124.15, 'eval_steps_per_second': 15.524, 'epoch': 0.18}
{'loss': 3.9465, 'grad_norm': 3.82352352142334, 'learning_rate': 3.155102040816327e-05, 'epoch': 0.18}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9509921073913574, 'eval_runtime': 109.6047, 'eval_samples_per_second': 124.183, 'eval_steps_per_second': 15.529, 'epoch': 0.18}
{'loss': 3.9748, 'grad_norm': 2.6522154808044434, 'learning_rate': 3.1387755102040814e-05, 'epoch': 0.19}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9569458961486816, 'eval_runtime': 109.5791, 'eval_samples_per_second': 124.212, 'eval_steps_per_second': 15.532, 'epoch': 0.19}
{'loss': 3.9213, 'grad_norm': 2.4038891792297363, 'learning_rate': 3.1224489795918374e-05, 'epoch': 0.19}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.952199697494507, 'eval_runtime': 109.6056, 'eval_samples_per_second': 124.182, 'eval_steps_per_second': 15.528, 'epoch': 0.19}
{'loss': 3.8775, 'grad_norm': 4.304388999938965, 'learning_rate': 3.106122448979592e-05, 'epoch': 0.19}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9641926288604736, 'eval_runtime': 109.6784, 'eval_samples_per_second': 124.099, 'eval_steps_per_second': 15.518, 'epoch': 0.19}
{'loss': 3.9582, 'grad_norm': 8.275628089904785, 'learning_rate': 3.0897959183673466e-05, 'epoch': 0.19}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9439146518707275, 'eval_runtime': 109.6656, 'eval_samples_per_second': 124.114, 'eval_steps_per_second': 15.52, 'epoch': 0.19}
{'loss': 4.1851, 'grad_norm': 2.593508720397949, 'learning_rate': 3.0734693877551026e-05, 'epoch': 0.19}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9512853622436523, 'eval_runtime': 109.7069, 'eval_samples_per_second': 124.067, 'eval_steps_per_second': 15.514, 'epoch': 0.19}
{'loss': 3.9274, 'grad_norm': 4.033092975616455, 'learning_rate': 3.057142857142857e-05, 'epoch': 0.19}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9507102966308594, 'eval_runtime': 109.685, 'eval_samples_per_second': 124.092, 'eval_steps_per_second': 15.517, 'epoch': 0.19}
{'loss': 3.9723, 'grad_norm': 6.4450602531433105, 'learning_rate': 3.040816326530612e-05, 'epoch': 0.2}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9640097618103027, 'eval_runtime': 109.6654, 'eval_samples_per_second': 124.114, 'eval_steps_per_second': 15.52, 'epoch': 0.2}
{'loss': 4.0192, 'grad_norm': 3.024960994720459, 'learning_rate': 3.0244897959183678e-05, 'epoch': 0.2}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.949735403060913, 'eval_runtime': 109.6951, 'eval_samples_per_second': 124.08, 'eval_steps_per_second': 15.516, 'epoch': 0.2}
{'loss': 3.7799, 'grad_norm': 2.4874165058135986, 'learning_rate': 3.0081632653061224e-05, 'epoch': 0.2}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.948024272918701, 'eval_runtime': 109.6643, 'eval_samples_per_second': 124.115, 'eval_steps_per_second': 15.52, 'epoch': 0.2}
{'loss': 4.069, 'grad_norm': 2.0504636764526367, 'learning_rate': 2.9918367346938774e-05, 'epoch': 0.2}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.94998836517334, 'eval_runtime': 109.6536, 'eval_samples_per_second': 124.127, 'eval_steps_per_second': 15.522, 'epoch': 0.2}
{'loss': 3.9199, 'grad_norm': 3.3429770469665527, 'learning_rate': 2.975510204081633e-05, 'epoch': 0.2}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.986797571182251, 'eval_runtime': 109.6532, 'eval_samples_per_second': 124.128, 'eval_steps_per_second': 15.522, 'epoch': 0.2}
{'loss': 4.138, 'grad_norm': 6.326511859893799, 'learning_rate': 2.959183673469388e-05, 'epoch': 0.2}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.957427501678467, 'eval_runtime': 109.5796, 'eval_samples_per_second': 124.211, 'eval_steps_per_second': 15.532, 'epoch': 0.2}
{'loss': 4.0424, 'grad_norm': 3.8447580337524414, 'learning_rate': 2.9428571428571426e-05, 'epoch': 0.21}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.941073417663574, 'eval_runtime': 109.6379, 'eval_samples_per_second': 124.145, 'eval_steps_per_second': 15.524, 'epoch': 0.21}
{'loss': 3.9769, 'grad_norm': 3.301389694213867, 'learning_rate': 2.9265306122448982e-05, 'epoch': 0.21}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9544870853424072, 'eval_runtime': 109.6948, 'eval_samples_per_second': 124.081, 'eval_steps_per_second': 15.516, 'epoch': 0.21}
{'loss': 4.0777, 'grad_norm': 2.2017979621887207, 'learning_rate': 2.9102040816326532e-05, 'epoch': 0.21}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.944809675216675, 'eval_runtime': 109.6236, 'eval_samples_per_second': 124.161, 'eval_steps_per_second': 15.526, 'epoch': 0.21}
{'loss': 3.9632, 'grad_norm': 1.7560545206069946, 'learning_rate': 2.8938775510204082e-05, 'epoch': 0.21}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9383327960968018, 'eval_runtime': 109.6044, 'eval_samples_per_second': 124.183, 'eval_steps_per_second': 15.529, 'epoch': 0.21}
{'loss': 3.9965, 'grad_norm': 1.9476699829101562, 'learning_rate': 2.8775510204081635e-05, 'epoch': 0.21}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.935319423675537, 'eval_runtime': 109.6149, 'eval_samples_per_second': 124.171, 'eval_steps_per_second': 15.527, 'epoch': 0.21}
{'loss': 3.9801, 'grad_norm': 2.116424083709717, 'learning_rate': 2.8612244897959184e-05, 'epoch': 0.21}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9420201778411865, 'eval_runtime': 109.6292, 'eval_samples_per_second': 124.155, 'eval_steps_per_second': 15.525, 'epoch': 0.21}
{'loss': 3.9792, 'grad_norm': 5.409671306610107, 'learning_rate': 2.8448979591836737e-05, 'epoch': 0.22}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9576101303100586, 'eval_runtime': 109.6606, 'eval_samples_per_second': 124.119, 'eval_steps_per_second': 15.521, 'epoch': 0.22}
{'loss': 3.9424, 'grad_norm': 2.1299631595611572, 'learning_rate': 2.8285714285714287e-05, 'epoch': 0.22}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.951190948486328, 'eval_runtime': 109.6659, 'eval_samples_per_second': 124.113, 'eval_steps_per_second': 15.52, 'epoch': 0.22}
{'loss': 3.9461, 'grad_norm': 4.184598445892334, 'learning_rate': 2.8122448979591837e-05, 'epoch': 0.22}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9399964809417725, 'eval_runtime': 109.5923, 'eval_samples_per_second': 124.197, 'eval_steps_per_second': 15.53, 'epoch': 0.22}
{'loss': 3.86, 'grad_norm': 3.176572799682617, 'learning_rate': 2.7959183673469393e-05, 'epoch': 0.22}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.946936845779419, 'eval_runtime': 109.6162, 'eval_samples_per_second': 124.17, 'eval_steps_per_second': 15.527, 'epoch': 0.22}
{'loss': 4.0487, 'grad_norm': 4.964203834533691, 'learning_rate': 2.779591836734694e-05, 'epoch': 0.22}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.939271926879883, 'eval_runtime': 109.5724, 'eval_samples_per_second': 124.219, 'eval_steps_per_second': 15.533, 'epoch': 0.22}
{'loss': 4.0617, 'grad_norm': 3.183668375015259, 'learning_rate': 2.763265306122449e-05, 'epoch': 0.22}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.931807041168213, 'eval_runtime': 109.621, 'eval_samples_per_second': 124.164, 'eval_steps_per_second': 15.526, 'epoch': 0.22}
{'loss': 3.9338, 'grad_norm': 4.273491859436035, 'learning_rate': 2.7469387755102045e-05, 'epoch': 0.23}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.963475465774536, 'eval_runtime': 109.6637, 'eval_samples_per_second': 124.116, 'eval_steps_per_second': 15.52, 'epoch': 0.23}
{'loss': 3.9922, 'grad_norm': 8.495768547058105, 'learning_rate': 2.730612244897959e-05, 'epoch': 0.23}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9335756301879883, 'eval_runtime': 109.6954, 'eval_samples_per_second': 124.08, 'eval_steps_per_second': 15.516, 'epoch': 0.23}
{'loss': 3.9348, 'grad_norm': 2.8464090824127197, 'learning_rate': 2.714285714285714e-05, 'epoch': 0.23}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.931912899017334, 'eval_runtime': 109.6483, 'eval_samples_per_second': 124.133, 'eval_steps_per_second': 15.522, 'epoch': 0.23}
{'loss': 3.8891, 'grad_norm': 6.234919548034668, 'learning_rate': 2.6979591836734697e-05, 'epoch': 0.23}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9400713443756104, 'eval_runtime': 109.6398, 'eval_samples_per_second': 124.143, 'eval_steps_per_second': 15.524, 'epoch': 0.23}
{'loss': 3.9416, 'grad_norm': 4.1793904304504395, 'learning_rate': 2.6816326530612247e-05, 'epoch': 0.23}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9409470558166504, 'eval_runtime': 109.6887, 'eval_samples_per_second': 124.088, 'eval_steps_per_second': 15.517, 'epoch': 0.23}
{'loss': 4.0169, 'grad_norm': 3.46290922164917, 'learning_rate': 2.6653061224489793e-05, 'epoch': 0.23}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.928579807281494, 'eval_runtime': 109.7303, 'eval_samples_per_second': 124.04, 'eval_steps_per_second': 15.511, 'epoch': 0.23}
{'loss': 4.1398, 'grad_norm': 2.4413344860076904, 'learning_rate': 2.648979591836735e-05, 'epoch': 0.24}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.931955575942993, 'eval_runtime': 109.6897, 'eval_samples_per_second': 124.086, 'eval_steps_per_second': 15.517, 'epoch': 0.24}
{'loss': 3.9157, 'grad_norm': 3.2747318744659424, 'learning_rate': 2.63265306122449e-05, 'epoch': 0.24}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9316160678863525, 'eval_runtime': 109.6363, 'eval_samples_per_second': 124.147, 'eval_steps_per_second': 15.524, 'epoch': 0.24}
{'loss': 4.0221, 'grad_norm': 3.471789598464966, 'learning_rate': 2.616326530612245e-05, 'epoch': 0.24}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.934507131576538, 'eval_runtime': 109.6704, 'eval_samples_per_second': 124.108, 'eval_steps_per_second': 15.519, 'epoch': 0.24}
{'loss': 4.025, 'grad_norm': 5.072558403015137, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.24}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9355273246765137, 'eval_runtime': 109.7388, 'eval_samples_per_second': 124.031, 'eval_steps_per_second': 15.51, 'epoch': 0.24}
{'loss': 3.9689, 'grad_norm': 5.810727119445801, 'learning_rate': 2.583673469387755e-05, 'epoch': 0.24}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9653425216674805, 'eval_runtime': 109.6998, 'eval_samples_per_second': 124.075, 'eval_steps_per_second': 15.515, 'epoch': 0.24}
{'loss': 3.8218, 'grad_norm': 4.841303825378418, 'learning_rate': 2.5673469387755104e-05, 'epoch': 0.24}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9398481845855713, 'eval_runtime': 109.7085, 'eval_samples_per_second': 124.065, 'eval_steps_per_second': 15.514, 'epoch': 0.24}
{'loss': 3.9434, 'grad_norm': 1.6430224180221558, 'learning_rate': 2.5510204081632654e-05, 'epoch': 0.24}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9350898265838623, 'eval_runtime': 109.7031, 'eval_samples_per_second': 124.071, 'eval_steps_per_second': 15.515, 'epoch': 0.24}
{'loss': 4.0257, 'grad_norm': 2.3612613677978516, 'learning_rate': 2.5346938775510204e-05, 'epoch': 0.25}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.940650701522827, 'eval_runtime': 109.7117, 'eval_samples_per_second': 124.062, 'eval_steps_per_second': 15.513, 'epoch': 0.25}
{'loss': 3.9695, 'grad_norm': 4.628580570220947, 'learning_rate': 2.518367346938776e-05, 'epoch': 0.25}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9326975345611572, 'eval_runtime': 109.7463, 'eval_samples_per_second': 124.022, 'eval_steps_per_second': 15.508, 'epoch': 0.25}
{'loss': 3.9185, 'grad_norm': 6.492946147918701, 'learning_rate': 2.5020408163265306e-05, 'epoch': 0.25}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9193389415740967, 'eval_runtime': 109.7112, 'eval_samples_per_second': 124.062, 'eval_steps_per_second': 15.513, 'epoch': 0.25}
{'loss': 3.9447, 'grad_norm': 4.028794288635254, 'learning_rate': 2.485714285714286e-05, 'epoch': 0.25}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.927025318145752, 'eval_runtime': 109.6822, 'eval_samples_per_second': 124.095, 'eval_steps_per_second': 15.518, 'epoch': 0.25}
{'loss': 3.8839, 'grad_norm': 4.026447772979736, 'learning_rate': 2.469387755102041e-05, 'epoch': 0.25}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9497971534729004, 'eval_runtime': 109.6888, 'eval_samples_per_second': 124.087, 'eval_steps_per_second': 15.517, 'epoch': 0.25}
{'loss': 3.9496, 'grad_norm': 5.279932022094727, 'learning_rate': 2.4530612244897962e-05, 'epoch': 0.25}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.92433500289917, 'eval_runtime': 109.7297, 'eval_samples_per_second': 124.041, 'eval_steps_per_second': 15.511, 'epoch': 0.25}
{'loss': 3.9791, 'grad_norm': 6.317784786224365, 'learning_rate': 2.436734693877551e-05, 'epoch': 0.26}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9253292083740234, 'eval_runtime': 109.7168, 'eval_samples_per_second': 124.056, 'eval_steps_per_second': 15.513, 'epoch': 0.26}
{'loss': 3.9688, 'grad_norm': 3.8488564491271973, 'learning_rate': 2.420408163265306e-05, 'epoch': 0.26}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9351766109466553, 'eval_runtime': 109.7523, 'eval_samples_per_second': 124.016, 'eval_steps_per_second': 15.508, 'epoch': 0.26}
{'loss': 3.998, 'grad_norm': 3.125335931777954, 'learning_rate': 2.4040816326530614e-05, 'epoch': 0.26}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9253597259521484, 'eval_runtime': 109.7493, 'eval_samples_per_second': 124.019, 'eval_steps_per_second': 15.508, 'epoch': 0.26}
{'loss': 3.9287, 'grad_norm': 5.867690086364746, 'learning_rate': 2.3877551020408164e-05, 'epoch': 0.26}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.917262554168701, 'eval_runtime': 109.71, 'eval_samples_per_second': 124.063, 'eval_steps_per_second': 15.514, 'epoch': 0.26}
{'loss': 3.7794, 'grad_norm': 2.477206230163574, 'learning_rate': 2.3714285714285717e-05, 'epoch': 0.26}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9518749713897705, 'eval_runtime': 109.727, 'eval_samples_per_second': 124.044, 'eval_steps_per_second': 15.511, 'epoch': 0.26}
{'loss': 3.9052, 'grad_norm': 2.857478380203247, 'learning_rate': 2.3551020408163266e-05, 'epoch': 0.26}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.931333303451538, 'eval_runtime': 109.7635, 'eval_samples_per_second': 124.003, 'eval_steps_per_second': 15.506, 'epoch': 0.26}
{'loss': 3.8353, 'grad_norm': 2.313563108444214, 'learning_rate': 2.3387755102040816e-05, 'epoch': 0.27}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9321699142456055, 'eval_runtime': 109.7249, 'eval_samples_per_second': 124.047, 'eval_steps_per_second': 15.512, 'epoch': 0.27}
{'loss': 3.8707, 'grad_norm': 2.0386300086975098, 'learning_rate': 2.322448979591837e-05, 'epoch': 0.27}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.922964096069336, 'eval_runtime': 109.6743, 'eval_samples_per_second': 124.104, 'eval_steps_per_second': 15.519, 'epoch': 0.27}
{'loss': 4.0231, 'grad_norm': 3.278372049331665, 'learning_rate': 2.306122448979592e-05, 'epoch': 0.27}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9250547885894775, 'eval_runtime': 109.7679, 'eval_samples_per_second': 123.998, 'eval_steps_per_second': 15.505, 'epoch': 0.27}
{'loss': 3.9823, 'grad_norm': 6.955981254577637, 'learning_rate': 2.289795918367347e-05, 'epoch': 0.27}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9257497787475586, 'eval_runtime': 109.7488, 'eval_samples_per_second': 124.02, 'eval_steps_per_second': 15.508, 'epoch': 0.27}
{'loss': 3.8778, 'grad_norm': 4.278469562530518, 'learning_rate': 2.273469387755102e-05, 'epoch': 0.27}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9221301078796387, 'eval_runtime': 109.7178, 'eval_samples_per_second': 124.055, 'eval_steps_per_second': 15.513, 'epoch': 0.27}
{'loss': 3.9306, 'grad_norm': 6.082777976989746, 'learning_rate': 2.257142857142857e-05, 'epoch': 0.27}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9362633228302, 'eval_runtime': 109.6911, 'eval_samples_per_second': 124.085, 'eval_steps_per_second': 15.516, 'epoch': 0.27}
{'loss': 3.9088, 'grad_norm': 4.292964458465576, 'learning_rate': 2.2408163265306124e-05, 'epoch': 0.28}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.940176486968994, 'eval_runtime': 109.7664, 'eval_samples_per_second': 124.0, 'eval_steps_per_second': 15.506, 'epoch': 0.28}
{'loss': 3.9094, 'grad_norm': 2.8597137928009033, 'learning_rate': 2.2244897959183673e-05, 'epoch': 0.28}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9300124645233154, 'eval_runtime': 109.7537, 'eval_samples_per_second': 124.014, 'eval_steps_per_second': 15.507, 'epoch': 0.28}
{'loss': 4.0245, 'grad_norm': 2.636338472366333, 'learning_rate': 2.2081632653061226e-05, 'epoch': 0.28}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.910710334777832, 'eval_runtime': 109.7118, 'eval_samples_per_second': 124.061, 'eval_steps_per_second': 15.513, 'epoch': 0.28}
{'loss': 3.9749, 'grad_norm': 1.6984196901321411, 'learning_rate': 2.1918367346938776e-05, 'epoch': 0.28}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9182095527648926, 'eval_runtime': 109.7777, 'eval_samples_per_second': 123.987, 'eval_steps_per_second': 15.504, 'epoch': 0.28}
{'loss': 4.0073, 'grad_norm': 5.758031845092773, 'learning_rate': 2.175510204081633e-05, 'epoch': 0.28}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9294357299804688, 'eval_runtime': 109.7829, 'eval_samples_per_second': 123.981, 'eval_steps_per_second': 15.503, 'epoch': 0.28}
{'loss': 3.9566, 'grad_norm': 3.041407346725464, 'learning_rate': 2.159183673469388e-05, 'epoch': 0.28}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9407782554626465, 'eval_runtime': 109.7125, 'eval_samples_per_second': 124.061, 'eval_steps_per_second': 15.513, 'epoch': 0.28}
{'loss': 3.9956, 'grad_norm': 3.3200321197509766, 'learning_rate': 2.1428571428571428e-05, 'epoch': 0.29}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.909761905670166, 'eval_runtime': 109.7104, 'eval_samples_per_second': 124.063, 'eval_steps_per_second': 15.514, 'epoch': 0.29}
{'loss': 4.0972, 'grad_norm': 3.4045281410217285, 'learning_rate': 2.126530612244898e-05, 'epoch': 0.29}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9098057746887207, 'eval_runtime': 109.7598, 'eval_samples_per_second': 124.007, 'eval_steps_per_second': 15.507, 'epoch': 0.29}
{'loss': 3.7637, 'grad_norm': 5.138885974884033, 'learning_rate': 2.110204081632653e-05, 'epoch': 0.29}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.921272039413452, 'eval_runtime': 109.7982, 'eval_samples_per_second': 123.964, 'eval_steps_per_second': 15.501, 'epoch': 0.29}
{'loss': 4.0417, 'grad_norm': 5.809049606323242, 'learning_rate': 2.0938775510204084e-05, 'epoch': 0.29}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9103431701660156, 'eval_runtime': 109.7884, 'eval_samples_per_second': 123.975, 'eval_steps_per_second': 15.503, 'epoch': 0.29}
{'loss': 3.9359, 'grad_norm': 4.2710490226745605, 'learning_rate': 2.0775510204081633e-05, 'epoch': 0.29}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.912792205810547, 'eval_runtime': 109.7111, 'eval_samples_per_second': 124.062, 'eval_steps_per_second': 15.513, 'epoch': 0.29}
{'loss': 3.8617, 'grad_norm': 4.013326168060303, 'learning_rate': 2.0612244897959186e-05, 'epoch': 0.29}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9258084297180176, 'eval_runtime': 109.7535, 'eval_samples_per_second': 124.014, 'eval_steps_per_second': 15.507, 'epoch': 0.29}
{'loss': 4.0174, 'grad_norm': 5.9332356452941895, 'learning_rate': 2.0448979591836736e-05, 'epoch': 0.3}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9140682220458984, 'eval_runtime': 109.7851, 'eval_samples_per_second': 123.979, 'eval_steps_per_second': 15.503, 'epoch': 0.3}
{'loss': 3.8539, 'grad_norm': 3.479311227798462, 'learning_rate': 2.0285714285714286e-05, 'epoch': 0.3}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9207494258880615, 'eval_runtime': 109.7538, 'eval_samples_per_second': 124.014, 'eval_steps_per_second': 15.507, 'epoch': 0.3}
{'loss': 3.8628, 'grad_norm': 4.376101493835449, 'learning_rate': 2.012244897959184e-05, 'epoch': 0.3}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.925036907196045, 'eval_runtime': 109.7392, 'eval_samples_per_second': 124.03, 'eval_steps_per_second': 15.51, 'epoch': 0.3}
{'loss': 3.9075, 'grad_norm': 7.126071929931641, 'learning_rate': 1.9959183673469388e-05, 'epoch': 0.3}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.934779405593872, 'eval_runtime': 109.718, 'eval_samples_per_second': 124.054, 'eval_steps_per_second': 15.512, 'epoch': 0.3}
{'loss': 3.9493, 'grad_norm': 5.32426118850708, 'learning_rate': 1.9795918367346938e-05, 'epoch': 0.3}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.91744327545166, 'eval_runtime': 109.6942, 'eval_samples_per_second': 124.081, 'eval_steps_per_second': 15.516, 'epoch': 0.3}
{'loss': 4.0668, 'grad_norm': 3.869046926498413, 'learning_rate': 1.963265306122449e-05, 'epoch': 0.3}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.911083221435547, 'eval_runtime': 109.7105, 'eval_samples_per_second': 124.063, 'eval_steps_per_second': 15.514, 'epoch': 0.3}
{'loss': 3.6964, 'grad_norm': 4.765562534332275, 'learning_rate': 1.946938775510204e-05, 'epoch': 0.31}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9122204780578613, 'eval_runtime': 109.7496, 'eval_samples_per_second': 124.019, 'eval_steps_per_second': 15.508, 'epoch': 0.31}
{'loss': 3.7868, 'grad_norm': 2.0526883602142334, 'learning_rate': 1.9306122448979593e-05, 'epoch': 0.31}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9164061546325684, 'eval_runtime': 109.7498, 'eval_samples_per_second': 124.018, 'eval_steps_per_second': 15.508, 'epoch': 0.31}
{'loss': 3.8649, 'grad_norm': 3.591797351837158, 'learning_rate': 1.9142857142857143e-05, 'epoch': 0.31}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.915024995803833, 'eval_runtime': 109.7475, 'eval_samples_per_second': 124.021, 'eval_steps_per_second': 15.508, 'epoch': 0.31}
{'loss': 3.9822, 'grad_norm': 3.9879467487335205, 'learning_rate': 1.8979591836734696e-05, 'epoch': 0.31}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9049112796783447, 'eval_runtime': 109.7282, 'eval_samples_per_second': 124.043, 'eval_steps_per_second': 15.511, 'epoch': 0.31}
{'loss': 4.146, 'grad_norm': 3.6801812648773193, 'learning_rate': 1.8816326530612246e-05, 'epoch': 0.31}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.905562400817871, 'eval_runtime': 109.7667, 'eval_samples_per_second': 123.999, 'eval_steps_per_second': 15.506, 'epoch': 0.31}
{'loss': 4.0478, 'grad_norm': 11.985651016235352, 'learning_rate': 1.8653061224489795e-05, 'epoch': 0.31}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.925471544265747, 'eval_runtime': 109.7678, 'eval_samples_per_second': 123.998, 'eval_steps_per_second': 15.505, 'epoch': 0.31}
{'loss': 3.9491, 'grad_norm': 2.1691195964813232, 'learning_rate': 1.8489795918367348e-05, 'epoch': 0.32}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9053890705108643, 'eval_runtime': 109.7511, 'eval_samples_per_second': 124.017, 'eval_steps_per_second': 15.508, 'epoch': 0.32}
{'loss': 3.929, 'grad_norm': 1.7834532260894775, 'learning_rate': 1.8326530612244898e-05, 'epoch': 0.32}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9060354232788086, 'eval_runtime': 109.7012, 'eval_samples_per_second': 124.073, 'eval_steps_per_second': 15.515, 'epoch': 0.32}
{'loss': 4.0266, 'grad_norm': 1.1737375259399414, 'learning_rate': 1.816326530612245e-05, 'epoch': 0.32}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9024524688720703, 'eval_runtime': 109.9425, 'eval_samples_per_second': 123.801, 'eval_steps_per_second': 15.481, 'epoch': 0.32}
{'loss': 4.0528, 'grad_norm': 2.8970844745635986, 'learning_rate': 1.8e-05, 'epoch': 0.32}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.902893543243408, 'eval_runtime': 109.7425, 'eval_samples_per_second': 124.027, 'eval_steps_per_second': 15.509, 'epoch': 0.32}
{'loss': 3.9357, 'grad_norm': 4.222126007080078, 'learning_rate': 1.7836734693877553e-05, 'epoch': 0.32}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.913132905960083, 'eval_runtime': 109.7251, 'eval_samples_per_second': 124.046, 'eval_steps_per_second': 15.511, 'epoch': 0.32}
{'loss': 3.8553, 'grad_norm': 8.25046443939209, 'learning_rate': 1.7673469387755103e-05, 'epoch': 0.32}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9104416370391846, 'eval_runtime': 109.773, 'eval_samples_per_second': 123.992, 'eval_steps_per_second': 15.505, 'epoch': 0.32}
{'loss': 3.8447, 'grad_norm': 4.435224533081055, 'learning_rate': 1.7510204081632653e-05, 'epoch': 0.32}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9045512676239014, 'eval_runtime': 109.7533, 'eval_samples_per_second': 124.015, 'eval_steps_per_second': 15.508, 'epoch': 0.32}
{'loss': 3.9954, 'grad_norm': 1.510097861289978, 'learning_rate': 1.7346938775510206e-05, 'epoch': 0.33}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.906507968902588, 'eval_runtime': 109.7983, 'eval_samples_per_second': 123.964, 'eval_steps_per_second': 15.501, 'epoch': 0.33}
{'loss': 3.8608, 'grad_norm': 1.9023758172988892, 'learning_rate': 1.7183673469387755e-05, 'epoch': 0.33}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.904294490814209, 'eval_runtime': 109.7791, 'eval_samples_per_second': 123.985, 'eval_steps_per_second': 15.504, 'epoch': 0.33}
{'loss': 3.8713, 'grad_norm': 4.336911201477051, 'learning_rate': 1.7020408163265305e-05, 'epoch': 0.33}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9048216342926025, 'eval_runtime': 109.7218, 'eval_samples_per_second': 124.05, 'eval_steps_per_second': 15.512, 'epoch': 0.33}
{'loss': 4.0705, 'grad_norm': 7.344269752502441, 'learning_rate': 1.6857142857142858e-05, 'epoch': 0.33}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9044876098632812, 'eval_runtime': 109.7328, 'eval_samples_per_second': 124.038, 'eval_steps_per_second': 15.51, 'epoch': 0.33}
{'loss': 3.6637, 'grad_norm': 4.20595645904541, 'learning_rate': 1.669387755102041e-05, 'epoch': 0.33}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.908569097518921, 'eval_runtime': 109.7517, 'eval_samples_per_second': 124.016, 'eval_steps_per_second': 15.508, 'epoch': 0.33}
{'loss': 3.9156, 'grad_norm': 2.794297456741333, 'learning_rate': 1.653061224489796e-05, 'epoch': 0.33}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.900724172592163, 'eval_runtime': 109.7296, 'eval_samples_per_second': 124.041, 'eval_steps_per_second': 15.511, 'epoch': 0.33}
{'loss': 4.0097, 'grad_norm': 3.9033756256103516, 'learning_rate': 1.636734693877551e-05, 'epoch': 0.34}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.900508403778076, 'eval_runtime': 109.7377, 'eval_samples_per_second': 124.032, 'eval_steps_per_second': 15.51, 'epoch': 0.34}
{'loss': 3.8829, 'grad_norm': 3.9630625247955322, 'learning_rate': 1.6204081632653063e-05, 'epoch': 0.34}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8964481353759766, 'eval_runtime': 109.7358, 'eval_samples_per_second': 124.034, 'eval_steps_per_second': 15.51, 'epoch': 0.34}
{'loss': 4.0097, 'grad_norm': 6.325133323669434, 'learning_rate': 1.6040816326530613e-05, 'epoch': 0.34}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.913163900375366, 'eval_runtime': 109.7308, 'eval_samples_per_second': 124.04, 'eval_steps_per_second': 15.511, 'epoch': 0.34}
{'loss': 3.915, 'grad_norm': 3.0232019424438477, 'learning_rate': 1.5877551020408162e-05, 'epoch': 0.34}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9177939891815186, 'eval_runtime': 109.7322, 'eval_samples_per_second': 124.038, 'eval_steps_per_second': 15.51, 'epoch': 0.34}
{'loss': 3.9107, 'grad_norm': 1.7064553499221802, 'learning_rate': 1.5714285714285715e-05, 'epoch': 0.34}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.902923107147217, 'eval_runtime': 109.7487, 'eval_samples_per_second': 124.02, 'eval_steps_per_second': 15.508, 'epoch': 0.34}
{'loss': 3.9256, 'grad_norm': 2.6622610092163086, 'learning_rate': 1.5551020408163265e-05, 'epoch': 0.34}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8942954540252686, 'eval_runtime': 109.7735, 'eval_samples_per_second': 123.992, 'eval_steps_per_second': 15.505, 'epoch': 0.34}
{'loss': 3.901, 'grad_norm': 2.353602886199951, 'learning_rate': 1.5387755102040818e-05, 'epoch': 0.35}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.898117780685425, 'eval_runtime': 109.7317, 'eval_samples_per_second': 124.039, 'eval_steps_per_second': 15.511, 'epoch': 0.35}
{'loss': 3.7932, 'grad_norm': 1.4856703281402588, 'learning_rate': 1.5224489795918368e-05, 'epoch': 0.35}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9037482738494873, 'eval_runtime': 109.6586, 'eval_samples_per_second': 124.122, 'eval_steps_per_second': 15.521, 'epoch': 0.35}
{'loss': 3.868, 'grad_norm': 1.9356938600540161, 'learning_rate': 1.5061224489795919e-05, 'epoch': 0.35}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.900517225265503, 'eval_runtime': 109.7187, 'eval_samples_per_second': 124.054, 'eval_steps_per_second': 15.512, 'epoch': 0.35}
{'loss': 3.9823, 'grad_norm': 3.316767930984497, 'learning_rate': 1.4897959183673472e-05, 'epoch': 0.35}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8904895782470703, 'eval_runtime': 109.7107, 'eval_samples_per_second': 124.063, 'eval_steps_per_second': 15.514, 'epoch': 0.35}
{'loss': 3.9814, 'grad_norm': 2.435436964035034, 'learning_rate': 1.473469387755102e-05, 'epoch': 0.35}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8955931663513184, 'eval_runtime': 109.7038, 'eval_samples_per_second': 124.07, 'eval_steps_per_second': 15.515, 'epoch': 0.35}
{'loss': 3.9716, 'grad_norm': 1.850406289100647, 'learning_rate': 1.4571428571428573e-05, 'epoch': 0.35}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9101920127868652, 'eval_runtime': 109.7411, 'eval_samples_per_second': 124.028, 'eval_steps_per_second': 15.509, 'epoch': 0.35}
{'loss': 4.0296, 'grad_norm': 3.9020791053771973, 'learning_rate': 1.4408163265306124e-05, 'epoch': 0.36}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8983817100524902, 'eval_runtime': 109.7354, 'eval_samples_per_second': 124.035, 'eval_steps_per_second': 15.51, 'epoch': 0.36}
{'loss': 4.0231, 'grad_norm': 1.961681842803955, 'learning_rate': 1.4244897959183674e-05, 'epoch': 0.36}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9056594371795654, 'eval_runtime': 109.6961, 'eval_samples_per_second': 124.079, 'eval_steps_per_second': 15.516, 'epoch': 0.36}
{'loss': 3.7184, 'grad_norm': 2.3271117210388184, 'learning_rate': 1.4081632653061225e-05, 'epoch': 0.36}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8993546962738037, 'eval_runtime': 109.6992, 'eval_samples_per_second': 124.076, 'eval_steps_per_second': 15.515, 'epoch': 0.36}
{'loss': 3.84, 'grad_norm': 4.272223949432373, 'learning_rate': 1.3918367346938776e-05, 'epoch': 0.36}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8954355716705322, 'eval_runtime': 109.7189, 'eval_samples_per_second': 124.053, 'eval_steps_per_second': 15.512, 'epoch': 0.36}
{'loss': 3.9338, 'grad_norm': 2.6310129165649414, 'learning_rate': 1.3755102040816328e-05, 'epoch': 0.36}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.897684335708618, 'eval_runtime': 109.6855, 'eval_samples_per_second': 124.091, 'eval_steps_per_second': 15.517, 'epoch': 0.36}
{'loss': 3.8855, 'grad_norm': 1.8985261917114258, 'learning_rate': 1.3591836734693877e-05, 'epoch': 0.36}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.888550043106079, 'eval_runtime': 109.6986, 'eval_samples_per_second': 124.076, 'eval_steps_per_second': 15.515, 'epoch': 0.36}
{'loss': 4.0693, 'grad_norm': 7.012241363525391, 'learning_rate': 1.3428571428571429e-05, 'epoch': 0.37}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9002320766448975, 'eval_runtime': 109.6756, 'eval_samples_per_second': 124.102, 'eval_steps_per_second': 15.518, 'epoch': 0.37}
{'loss': 3.8428, 'grad_norm': 3.572821617126465, 'learning_rate': 1.3265306122448982e-05, 'epoch': 0.37}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.895016670227051, 'eval_runtime': 109.7393, 'eval_samples_per_second': 124.03, 'eval_steps_per_second': 15.509, 'epoch': 0.37}
{'loss': 3.785, 'grad_norm': 3.1776232719421387, 'learning_rate': 1.310204081632653e-05, 'epoch': 0.37}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8976290225982666, 'eval_runtime': 109.7736, 'eval_samples_per_second': 123.992, 'eval_steps_per_second': 15.505, 'epoch': 0.37}
{'loss': 3.9007, 'grad_norm': 3.776785135269165, 'learning_rate': 1.2938775510204082e-05, 'epoch': 0.37}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8996641635894775, 'eval_runtime': 109.8083, 'eval_samples_per_second': 123.952, 'eval_steps_per_second': 15.5, 'epoch': 0.37}
{'loss': 3.8329, 'grad_norm': 2.686429262161255, 'learning_rate': 1.2775510204081634e-05, 'epoch': 0.37}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.892854690551758, 'eval_runtime': 109.7886, 'eval_samples_per_second': 123.975, 'eval_steps_per_second': 15.503, 'epoch': 0.37}
{'loss': 3.8696, 'grad_norm': 6.439247131347656, 'learning_rate': 1.2612244897959185e-05, 'epoch': 0.37}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.89316463470459, 'eval_runtime': 109.7661, 'eval_samples_per_second': 124.0, 'eval_steps_per_second': 15.506, 'epoch': 0.37}
{'loss': 3.8603, 'grad_norm': 2.3870060443878174, 'learning_rate': 1.2448979591836735e-05, 'epoch': 0.38}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.9000160694122314, 'eval_runtime': 109.7945, 'eval_samples_per_second': 123.968, 'eval_steps_per_second': 15.502, 'epoch': 0.38}
{'loss': 3.9348, 'grad_norm': 3.939785957336426, 'learning_rate': 1.2285714285714286e-05, 'epoch': 0.38}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.892699956893921, 'eval_runtime': 109.7006, 'eval_samples_per_second': 124.074, 'eval_steps_per_second': 15.515, 'epoch': 0.38}
{'loss': 3.9871, 'grad_norm': 3.772784948348999, 'learning_rate': 1.2122448979591837e-05, 'epoch': 0.38}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8916757106781006, 'eval_runtime': 109.7421, 'eval_samples_per_second': 124.027, 'eval_steps_per_second': 15.509, 'epoch': 0.38}
{'loss': 3.9714, 'grad_norm': 6.374429225921631, 'learning_rate': 1.1959183673469389e-05, 'epoch': 0.38}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.887117862701416, 'eval_runtime': 109.7443, 'eval_samples_per_second': 124.025, 'eval_steps_per_second': 15.509, 'epoch': 0.38}
{'loss': 3.7317, 'grad_norm': 2.805359125137329, 'learning_rate': 1.179591836734694e-05, 'epoch': 0.38}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8909239768981934, 'eval_runtime': 109.7019, 'eval_samples_per_second': 124.073, 'eval_steps_per_second': 15.515, 'epoch': 0.38}
{'loss': 3.9879, 'grad_norm': 3.6635990142822266, 'learning_rate': 1.163265306122449e-05, 'epoch': 0.38}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8876614570617676, 'eval_runtime': 109.7339, 'eval_samples_per_second': 124.036, 'eval_steps_per_second': 15.51, 'epoch': 0.38}
{'loss': 4.007, 'grad_norm': 3.980118989944458, 'learning_rate': 1.146938775510204e-05, 'epoch': 0.39}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.895789623260498, 'eval_runtime': 109.6878, 'eval_samples_per_second': 124.089, 'eval_steps_per_second': 15.517, 'epoch': 0.39}
{'loss': 3.9814, 'grad_norm': 6.9148383140563965, 'learning_rate': 1.1306122448979592e-05, 'epoch': 0.39}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.893587350845337, 'eval_runtime': 109.7132, 'eval_samples_per_second': 124.06, 'eval_steps_per_second': 15.513, 'epoch': 0.39}
{'loss': 4.0529, 'grad_norm': 5.04536247253418, 'learning_rate': 1.1142857142857143e-05, 'epoch': 0.39}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8957648277282715, 'eval_runtime': 109.7001, 'eval_samples_per_second': 124.075, 'eval_steps_per_second': 15.515, 'epoch': 0.39}
{'loss': 3.9035, 'grad_norm': 2.7389464378356934, 'learning_rate': 1.0979591836734695e-05, 'epoch': 0.39}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8862273693084717, 'eval_runtime': 109.7243, 'eval_samples_per_second': 124.047, 'eval_steps_per_second': 15.512, 'epoch': 0.39}
{'loss': 3.8499, 'grad_norm': 1.8737250566482544, 'learning_rate': 1.0816326530612246e-05, 'epoch': 0.39}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.892137050628662, 'eval_runtime': 109.7086, 'eval_samples_per_second': 124.065, 'eval_steps_per_second': 15.514, 'epoch': 0.39}
{'loss': 4.0293, 'grad_norm': 3.441020965576172, 'learning_rate': 1.0653061224489797e-05, 'epoch': 0.39}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.88917875289917, 'eval_runtime': 109.7646, 'eval_samples_per_second': 124.002, 'eval_steps_per_second': 15.506, 'epoch': 0.39}
{'loss': 3.9671, 'grad_norm': 6.593017101287842, 'learning_rate': 1.0489795918367347e-05, 'epoch': 0.4}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8827390670776367, 'eval_runtime': 109.6893, 'eval_samples_per_second': 124.087, 'eval_steps_per_second': 15.517, 'epoch': 0.4}
{'loss': 3.9136, 'grad_norm': 2.414716958999634, 'learning_rate': 1.0326530612244898e-05, 'epoch': 0.4}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8823606967926025, 'eval_runtime': 109.6175, 'eval_samples_per_second': 124.168, 'eval_steps_per_second': 15.527, 'epoch': 0.4}
{'loss': 4.1004, 'grad_norm': 3.5281319618225098, 'learning_rate': 1.016326530612245e-05, 'epoch': 0.4}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.891822338104248, 'eval_runtime': 109.6917, 'eval_samples_per_second': 124.084, 'eval_steps_per_second': 15.516, 'epoch': 0.4}
{'loss': 3.8975, 'grad_norm': 7.6591057777404785, 'learning_rate': 1e-05, 'epoch': 0.4}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8855373859405518, 'eval_runtime': 109.726, 'eval_samples_per_second': 124.045, 'eval_steps_per_second': 15.511, 'epoch': 0.4}
{'loss': 3.863, 'grad_norm': 5.040950298309326, 'learning_rate': 9.836734693877552e-06, 'epoch': 0.4}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.880448818206787, 'eval_runtime': 109.6729, 'eval_samples_per_second': 124.105, 'eval_steps_per_second': 15.519, 'epoch': 0.4}
{'loss': 3.7894, 'grad_norm': 6.2402119636535645, 'learning_rate': 9.673469387755102e-06, 'epoch': 0.4}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8858187198638916, 'eval_runtime': 109.7478, 'eval_samples_per_second': 124.021, 'eval_steps_per_second': 15.508, 'epoch': 0.4}
{'loss': 3.8899, 'grad_norm': 3.8487086296081543, 'learning_rate': 9.510204081632653e-06, 'epoch': 0.4}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8830225467681885, 'eval_runtime': 109.7122, 'eval_samples_per_second': 124.061, 'eval_steps_per_second': 15.513, 'epoch': 0.4}
{'loss': 4.014, 'grad_norm': 3.3087546825408936, 'learning_rate': 9.346938775510204e-06, 'epoch': 0.41}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.887237310409546, 'eval_runtime': 109.6969, 'eval_samples_per_second': 124.078, 'eval_steps_per_second': 15.515, 'epoch': 0.41}
{'loss': 3.7956, 'grad_norm': 3.5672478675842285, 'learning_rate': 9.183673469387756e-06, 'epoch': 0.41}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.882488250732422, 'eval_runtime': 109.651, 'eval_samples_per_second': 124.13, 'eval_steps_per_second': 15.522, 'epoch': 0.41}
{'loss': 3.9305, 'grad_norm': 5.036694526672363, 'learning_rate': 9.020408163265307e-06, 'epoch': 0.41}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.880845785140991, 'eval_runtime': 109.6425, 'eval_samples_per_second': 124.14, 'eval_steps_per_second': 15.523, 'epoch': 0.41}
{'loss': 3.8521, 'grad_norm': 3.710249900817871, 'learning_rate': 8.857142857142857e-06, 'epoch': 0.41}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8793208599090576, 'eval_runtime': 109.6087, 'eval_samples_per_second': 124.178, 'eval_steps_per_second': 15.528, 'epoch': 0.41}
{'loss': 3.8512, 'grad_norm': 4.3166399002075195, 'learning_rate': 8.69387755102041e-06, 'epoch': 0.41}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8814239501953125, 'eval_runtime': 109.653, 'eval_samples_per_second': 124.128, 'eval_steps_per_second': 15.522, 'epoch': 0.41}
{'loss': 3.9304, 'grad_norm': 4.216821670532227, 'learning_rate': 8.53061224489796e-06, 'epoch': 0.41}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8850138187408447, 'eval_runtime': 109.5928, 'eval_samples_per_second': 124.196, 'eval_steps_per_second': 15.53, 'epoch': 0.41}
{'loss': 3.8906, 'grad_norm': 2.212730884552002, 'learning_rate': 8.36734693877551e-06, 'epoch': 0.42}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.884316921234131, 'eval_runtime': 109.6129, 'eval_samples_per_second': 124.173, 'eval_steps_per_second': 15.527, 'epoch': 0.42}
{'loss': 3.9295, 'grad_norm': 4.883177757263184, 'learning_rate': 8.204081632653062e-06, 'epoch': 0.42}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8771684169769287, 'eval_runtime': 109.5836, 'eval_samples_per_second': 124.207, 'eval_steps_per_second': 15.532, 'epoch': 0.42}
{'loss': 3.9561, 'grad_norm': 3.670799732208252, 'learning_rate': 8.040816326530613e-06, 'epoch': 0.42}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8847010135650635, 'eval_runtime': 109.8169, 'eval_samples_per_second': 123.943, 'eval_steps_per_second': 15.499, 'epoch': 0.42}
{'loss': 3.7766, 'grad_norm': 5.110223293304443, 'learning_rate': 7.877551020408164e-06, 'epoch': 0.42}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8818037509918213, 'eval_runtime': 109.5707, 'eval_samples_per_second': 124.221, 'eval_steps_per_second': 15.533, 'epoch': 0.42}
{'loss': 4.0112, 'grad_norm': 6.718925476074219, 'learning_rate': 7.714285714285714e-06, 'epoch': 0.42}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.888493776321411, 'eval_runtime': 109.6099, 'eval_samples_per_second': 124.177, 'eval_steps_per_second': 15.528, 'epoch': 0.42}
{'loss': 3.963, 'grad_norm': 2.5887787342071533, 'learning_rate': 7.551020408163266e-06, 'epoch': 0.42}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.879132032394409, 'eval_runtime': 109.5901, 'eval_samples_per_second': 124.199, 'eval_steps_per_second': 15.531, 'epoch': 0.42}
{'loss': 3.9085, 'grad_norm': 2.423393964767456, 'learning_rate': 7.387755102040817e-06, 'epoch': 0.43}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8859174251556396, 'eval_runtime': 109.589, 'eval_samples_per_second': 124.2, 'eval_steps_per_second': 15.531, 'epoch': 0.43}
{'loss': 3.9276, 'grad_norm': 3.889587163925171, 'learning_rate': 7.224489795918368e-06, 'epoch': 0.43}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.878406286239624, 'eval_runtime': 109.584, 'eval_samples_per_second': 124.206, 'eval_steps_per_second': 15.531, 'epoch': 0.43}
{'loss': 3.9734, 'grad_norm': 3.7117395401000977, 'learning_rate': 7.061224489795918e-06, 'epoch': 0.43}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8778436183929443, 'eval_runtime': 109.6072, 'eval_samples_per_second': 124.18, 'eval_steps_per_second': 15.528, 'epoch': 0.43}
{'loss': 3.9929, 'grad_norm': 4.956496715545654, 'learning_rate': 6.897959183673469e-06, 'epoch': 0.43}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.875910997390747, 'eval_runtime': 109.5373, 'eval_samples_per_second': 124.259, 'eval_steps_per_second': 15.538, 'epoch': 0.43}
{'loss': 3.8616, 'grad_norm': 4.034073352813721, 'learning_rate': 6.734693877551021e-06, 'epoch': 0.43}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.877609968185425, 'eval_runtime': 109.6163, 'eval_samples_per_second': 124.17, 'eval_steps_per_second': 15.527, 'epoch': 0.43}
{'loss': 3.8346, 'grad_norm': 3.1382510662078857, 'learning_rate': 6.5714285714285714e-06, 'epoch': 0.43}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8765993118286133, 'eval_runtime': 109.5899, 'eval_samples_per_second': 124.199, 'eval_steps_per_second': 15.531, 'epoch': 0.43}
{'loss': 3.8998, 'grad_norm': 4.32956600189209, 'learning_rate': 6.408163265306124e-06, 'epoch': 0.44}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8765430450439453, 'eval_runtime': 109.6019, 'eval_samples_per_second': 124.186, 'eval_steps_per_second': 15.529, 'epoch': 0.44}
{'loss': 3.9147, 'grad_norm': 4.559917449951172, 'learning_rate': 6.244897959183674e-06, 'epoch': 0.44}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.875898599624634, 'eval_runtime': 109.6959, 'eval_samples_per_second': 124.079, 'eval_steps_per_second': 15.516, 'epoch': 0.44}
{'loss': 4.038, 'grad_norm': 4.273165225982666, 'learning_rate': 6.0816326530612245e-06, 'epoch': 0.44}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8730013370513916, 'eval_runtime': 109.6399, 'eval_samples_per_second': 124.143, 'eval_steps_per_second': 15.524, 'epoch': 0.44}
{'loss': 3.8217, 'grad_norm': 5.209369659423828, 'learning_rate': 5.918367346938776e-06, 'epoch': 0.44}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8799736499786377, 'eval_runtime': 109.6091, 'eval_samples_per_second': 124.178, 'eval_steps_per_second': 15.528, 'epoch': 0.44}
{'loss': 3.8701, 'grad_norm': 12.12435531616211, 'learning_rate': 5.755102040816327e-06, 'epoch': 0.44}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8764455318450928, 'eval_runtime': 109.5934, 'eval_samples_per_second': 124.195, 'eval_steps_per_second': 15.53, 'epoch': 0.44}
{'loss': 3.8132, 'grad_norm': 4.1400651931762695, 'learning_rate': 5.5918367346938776e-06, 'epoch': 0.44}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8769991397857666, 'eval_runtime': 109.6769, 'eval_samples_per_second': 124.101, 'eval_steps_per_second': 15.518, 'epoch': 0.44}
{'loss': 3.8946, 'grad_norm': 3.346680164337158, 'learning_rate': 5.428571428571429e-06, 'epoch': 0.45}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.877707004547119, 'eval_runtime': 109.6165, 'eval_samples_per_second': 124.169, 'eval_steps_per_second': 15.527, 'epoch': 0.45}
{'loss': 3.9043, 'grad_norm': 2.0021703243255615, 'learning_rate': 5.26530612244898e-06, 'epoch': 0.45}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.873960018157959, 'eval_runtime': 109.5858, 'eval_samples_per_second': 124.204, 'eval_steps_per_second': 15.531, 'epoch': 0.45}
{'loss': 3.942, 'grad_norm': 6.896919250488281, 'learning_rate': 5.102040816326531e-06, 'epoch': 0.45}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8776142597198486, 'eval_runtime': 109.5868, 'eval_samples_per_second': 124.203, 'eval_steps_per_second': 15.531, 'epoch': 0.45}
{'loss': 4.0011, 'grad_norm': 1.5463961362838745, 'learning_rate': 4.938775510204082e-06, 'epoch': 0.45}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.877866506576538, 'eval_runtime': 109.6696, 'eval_samples_per_second': 124.109, 'eval_steps_per_second': 15.519, 'epoch': 0.45}
{'loss': 4.182, 'grad_norm': 3.1203603744506836, 'learning_rate': 4.775510204081632e-06, 'epoch': 0.45}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8752925395965576, 'eval_runtime': 109.6177, 'eval_samples_per_second': 124.168, 'eval_steps_per_second': 15.527, 'epoch': 0.45}
{'loss': 3.8405, 'grad_norm': 4.707325458526611, 'learning_rate': 4.612244897959184e-06, 'epoch': 0.45}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8763229846954346, 'eval_runtime': 109.6107, 'eval_samples_per_second': 124.176, 'eval_steps_per_second': 15.528, 'epoch': 0.45}
{'loss': 4.0081, 'grad_norm': 5.3422040939331055, 'learning_rate': 4.448979591836735e-06, 'epoch': 0.46}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8762307167053223, 'eval_runtime': 109.5802, 'eval_samples_per_second': 124.21, 'eval_steps_per_second': 15.532, 'epoch': 0.46}
{'loss': 4.041, 'grad_norm': 6.738285064697266, 'learning_rate': 4.285714285714286e-06, 'epoch': 0.46}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8785440921783447, 'eval_runtime': 109.5347, 'eval_samples_per_second': 124.262, 'eval_steps_per_second': 15.538, 'epoch': 0.46}
{'loss': 3.6513, 'grad_norm': 6.024954795837402, 'learning_rate': 4.122448979591837e-06, 'epoch': 0.46}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8737545013427734, 'eval_runtime': 109.5915, 'eval_samples_per_second': 124.198, 'eval_steps_per_second': 15.53, 'epoch': 0.46}
{'loss': 3.9142, 'grad_norm': 3.2857542037963867, 'learning_rate': 3.959183673469388e-06, 'epoch': 0.46}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.87365984916687, 'eval_runtime': 109.6092, 'eval_samples_per_second': 124.178, 'eval_steps_per_second': 15.528, 'epoch': 0.46}
{'loss': 4.1376, 'grad_norm': 2.350471258163452, 'learning_rate': 3.7959183673469385e-06, 'epoch': 0.46}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8731887340545654, 'eval_runtime': 109.5921, 'eval_samples_per_second': 124.197, 'eval_steps_per_second': 15.53, 'epoch': 0.46}
{'loss': 3.955, 'grad_norm': 2.7196216583251953, 'learning_rate': 3.63265306122449e-06, 'epoch': 0.46}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8732964992523193, 'eval_runtime': 109.5677, 'eval_samples_per_second': 124.225, 'eval_steps_per_second': 15.534, 'epoch': 0.46}
{'loss': 3.9054, 'grad_norm': 3.510089159011841, 'learning_rate': 3.469387755102041e-06, 'epoch': 0.47}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8719286918640137, 'eval_runtime': 109.5847, 'eval_samples_per_second': 124.205, 'eval_steps_per_second': 15.531, 'epoch': 0.47}
{'loss': 3.8995, 'grad_norm': 5.223210334777832, 'learning_rate': 3.306122448979592e-06, 'epoch': 0.47}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8711624145507812, 'eval_runtime': 109.5678, 'eval_samples_per_second': 124.224, 'eval_steps_per_second': 15.534, 'epoch': 0.47}
{'loss': 4.0325, 'grad_norm': 6.45933198928833, 'learning_rate': 3.1428571428571433e-06, 'epoch': 0.47}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.872182607650757, 'eval_runtime': 109.5988, 'eval_samples_per_second': 124.189, 'eval_steps_per_second': 15.529, 'epoch': 0.47}
{'loss': 3.963, 'grad_norm': 3.353757858276367, 'learning_rate': 2.979591836734694e-06, 'epoch': 0.47}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8714895248413086, 'eval_runtime': 109.5324, 'eval_samples_per_second': 124.265, 'eval_steps_per_second': 15.539, 'epoch': 0.47}
{'loss': 3.9485, 'grad_norm': 5.918542861938477, 'learning_rate': 2.816326530612245e-06, 'epoch': 0.47}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8699936866760254, 'eval_runtime': 109.5696, 'eval_samples_per_second': 124.222, 'eval_steps_per_second': 15.534, 'epoch': 0.47}
{'loss': 3.7659, 'grad_norm': 2.658674478530884, 'learning_rate': 2.653061224489796e-06, 'epoch': 0.47}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8714182376861572, 'eval_runtime': 109.573, 'eval_samples_per_second': 124.219, 'eval_steps_per_second': 15.533, 'epoch': 0.47}
{'loss': 3.8856, 'grad_norm': 3.697685956954956, 'learning_rate': 2.4897959183673473e-06, 'epoch': 0.48}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8713080883026123, 'eval_runtime': 109.631, 'eval_samples_per_second': 124.153, 'eval_steps_per_second': 15.525, 'epoch': 0.48}
{'loss': 3.9331, 'grad_norm': 2.6389386653900146, 'learning_rate': 2.326530612244898e-06, 'epoch': 0.48}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8702569007873535, 'eval_runtime': 109.6271, 'eval_samples_per_second': 124.157, 'eval_steps_per_second': 15.525, 'epoch': 0.48}
{'loss': 4.0869, 'grad_norm': 4.746092796325684, 'learning_rate': 2.163265306122449e-06, 'epoch': 0.48}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8704230785369873, 'eval_runtime': 109.554, 'eval_samples_per_second': 124.24, 'eval_steps_per_second': 15.536, 'epoch': 0.48}
{'loss': 4.1115, 'grad_norm': 3.1562764644622803, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.48}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8705997467041016, 'eval_runtime': 109.5521, 'eval_samples_per_second': 124.242, 'eval_steps_per_second': 15.536, 'epoch': 0.48}
{'loss': 3.8888, 'grad_norm': 3.312621831893921, 'learning_rate': 1.8367346938775512e-06, 'epoch': 0.48}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8690850734710693, 'eval_runtime': 109.5909, 'eval_samples_per_second': 124.198, 'eval_steps_per_second': 15.53, 'epoch': 0.48}
{'loss': 3.8775, 'grad_norm': 8.784948348999023, 'learning_rate': 1.673469387755102e-06, 'epoch': 0.48}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8713443279266357, 'eval_runtime': 109.5476, 'eval_samples_per_second': 124.247, 'eval_steps_per_second': 15.537, 'epoch': 0.48}
{'loss': 3.9594, 'grad_norm': 3.757291793823242, 'learning_rate': 1.5102040816326532e-06, 'epoch': 0.48}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.870436906814575, 'eval_runtime': 109.5955, 'eval_samples_per_second': 124.193, 'eval_steps_per_second': 15.53, 'epoch': 0.48}
{'loss': 3.8169, 'grad_norm': 2.9188437461853027, 'learning_rate': 1.346938775510204e-06, 'epoch': 0.49}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.869765520095825, 'eval_runtime': 109.5857, 'eval_samples_per_second': 124.204, 'eval_steps_per_second': 15.531, 'epoch': 0.49}
{'loss': 3.9523, 'grad_norm': 2.57680082321167, 'learning_rate': 1.1836734693877552e-06, 'epoch': 0.49}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.869556427001953, 'eval_runtime': 109.609, 'eval_samples_per_second': 124.178, 'eval_steps_per_second': 15.528, 'epoch': 0.49}
{'loss': 3.8597, 'grad_norm': 3.792369842529297, 'learning_rate': 1.020408163265306e-06, 'epoch': 0.49}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8701488971710205, 'eval_runtime': 109.5265, 'eval_samples_per_second': 124.271, 'eval_steps_per_second': 15.54, 'epoch': 0.49}
{'loss': 3.8432, 'grad_norm': 5.377573013305664, 'learning_rate': 8.571428571428572e-07, 'epoch': 0.49}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.87017560005188, 'eval_runtime': 109.5815, 'eval_samples_per_second': 124.209, 'eval_steps_per_second': 15.532, 'epoch': 0.49}
{'loss': 3.7584, 'grad_norm': 4.878894329071045, 'learning_rate': 6.938775510204082e-07, 'epoch': 0.49}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.869622230529785, 'eval_runtime': 109.5605, 'eval_samples_per_second': 124.233, 'eval_steps_per_second': 15.535, 'epoch': 0.49}
{'loss': 3.8639, 'grad_norm': 2.654189109802246, 'learning_rate': 5.306122448979592e-07, 'epoch': 0.49}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.869727373123169, 'eval_runtime': 109.6202, 'eval_samples_per_second': 124.165, 'eval_steps_per_second': 15.526, 'epoch': 0.49}
{'loss': 3.9326, 'grad_norm': 3.435314893722534, 'learning_rate': 3.673469387755102e-07, 'epoch': 0.5}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.869140863418579, 'eval_runtime': 109.6924, 'eval_samples_per_second': 124.083, 'eval_steps_per_second': 15.516, 'epoch': 0.5}
{'loss': 3.88, 'grad_norm': 2.538628339767456, 'learning_rate': 2.0408163265306124e-07, 'epoch': 0.5}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8691937923431396, 'eval_runtime': 109.601, 'eval_samples_per_second': 124.187, 'eval_steps_per_second': 15.529, 'epoch': 0.5}
{'loss': 4.0022, 'grad_norm': 3.833045721054077, 'learning_rate': 4.081632653061225e-08, 'epoch': 0.5}


  0%|          | 0/1702 [00:00<?, ?it/s]

{'eval_loss': 3.8690946102142334, 'eval_runtime': 109.6178, 'eval_samples_per_second': 124.168, 'eval_steps_per_second': 15.527, 'epoch': 0.5}
{'train_runtime': 35623.3323, 'train_samples_per_second': 1.719, 'train_steps_per_second': 0.86, 'train_loss': 4.003233855578364, 'epoch': 0.5}


TrainOutput(global_step=30625, training_loss=4.003233855578364, metrics={'train_runtime': 35623.3323, 'train_samples_per_second': 1.719, 'train_steps_per_second': 0.86, 'total_flos': 1.600413696e+16, 'train_loss': 4.003233855578364, 'epoch': 0.5000081633985861})

# Save Model

In [6]:
trainer.save_model()

# Running Inference

In [7]:
# Inference Example
example_input = "<h1>Heading"
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=50, top_p=0.95, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Original HTML code: <h1>Heading
Corrected HTML code: <h1>Heading</h1>


In [8]:
# Inference Example
example_input = """<p>This is a paragraph."""
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=50, top_p=0.95, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Corrected HTML code: <p>This is a paragraph.</p>


In [10]:
# Inference Example
example_input = """<div><span>This is some text.</p></div>"""
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=50, top_p=0.95, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Corrected HTML code: <div><span>This is some text.</p></div>


In [47]:
# Inference Example
example_input = """list = [1,2,3,4,5,6"""
input_ids = tokenizer(example_input, return_tensors="pt")["input_ids"].to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, top_k=10, top_p=0.97, attention_mask=attention_mask)

# Decode and print the corrected HTML code
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#print("Original HTML code:", example_input)
print("Corrected HTML code:", decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Corrected HTML code: list = [1,2,3,4,5,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
