In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
import torch
from datasets import Dataset

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model
model_name = "facebook/opt-350m"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    device_map="auto"   # Automatically distribute the model across available GPUs
)

# Freeze most parameters in the base model but unfreeze the last few layers for fine-tuning
for name, param in model.named_parameters():
    if "decoder.layers" in name and any(str(i) in name for i in range(22, 24)):  # Unfreeze last two layers (example: layers 22 and 23)
        param.requires_grad = True
    else:
        param.requires_grad = False

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load dataset using kagglehub
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

# Assuming the dataset is in CSV format
# The file is typically named "spam.csv" in this dataset
df_sms = pd.read_csv(f"{path}/spam.csv", encoding="ISO-8859-1")

# Rename columns to match expected names
df_sms = df_sms.rename(columns={"v1": "label", "v2": "text"})  # 'v1' is label, 'v2' is the message

# Drop unnecessary columns if present
df_sms = df_sms[['text', 'label']]

# Ensure consistent label encoding: 'ham' -> 0, 'spam' -> 1
df_sms['label'] = df_sms['label'].map({'ham': 0, 'spam': 1})

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df_sms)

# Split the dataset into training and validation sets (90% train, 10% validation)
train_data = dataset.select([i for i in range(len(dataset)) if i % 10 != 0])  # 90% for training
val_data = dataset.select([i for i in range(len(dataset)) if i % 10 == 0])  # 10% for validation

# Tokenize the input sequences
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', max_length=128, truncation=True)

# Apply tokenization to the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

# Convert labels to a format suitable for classification
train_data = train_data.rename_column("label", "labels")
val_data = val_data.rename_column("label", "labels")

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results_sms_spam",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    fp16=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)

# Start Training
trainer.train()

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5014 [00:00<?, ? examples/s]

Map:   0%|          | 0/558 [00:00<?, ? examples/s]



  0%|          | 0/942 [00:00<?, ?it/s]

{'loss': 0.6556, 'grad_norm': 3.0493531227111816, 'learning_rate': 1.9787685774946922e-05, 'epoch': 0.03}


  0%|          | 0/35 [00:00<?, ?it/s]

{'eval_loss': 0.2882041931152344, 'eval_runtime': 2.0347, 'eval_samples_per_second': 274.241, 'eval_steps_per_second': 17.201, 'epoch': 0.03}
{'loss': 0.265, 'grad_norm': 1.8173232078552246, 'learning_rate': 1.9575371549893843e-05, 'epoch': 0.06}


  0%|          | 0/35 [00:00<?, ?it/s]

{'eval_loss': 0.09563983976840973, 'eval_runtime': 2.0554, 'eval_samples_per_second': 271.48, 'eval_steps_per_second': 17.028, 'epoch': 0.06}
{'loss': 0.084, 'grad_norm': 0.6620710492134094, 'learning_rate': 1.9363057324840767e-05, 'epoch': 0.1}


  0%|          | 0/35 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [2]:
# Testing the model
def test_model(test_texts):
    # Tokenize the test texts
    inputs = tokenizer(test_texts, padding='max_length', max_length=128, truncation=True, return_tensors="pt").to(device)
    
    # Get predictions
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions.cpu().numpy()

# Example test cases
test_texts = [
    "Free entry in 2 a weekly competition to win FA Cup final tickets! Text FA to 87121 to receive entry question.",
    "Hey, are we still meeting for coffee later?",
    "You have won a $1000 Walmart gift card. Call now to claim your prize.",
    "Can you send me the report by tonight?"
]

# Get predictions
predictions = test_model(test_texts)

# Map predictions back to labels
label_map = {0: "ham", 1: "spam"}
results = [label_map[pred] for pred in predictions]

# Print results
for text, label in zip(test_texts, results):
    print(f"Text: {text}\nPredicted Label: {label}\n")

Text: Free entry in 2 a weekly competition to win FA Cup final tickets! Text FA to 87121 to receive entry question.
Predicted Label: spam

Text: Hey, are we still meeting for coffee later?
Predicted Label: ham

Text: You have won a $1000 Walmart gift card. Call now to claim your prize.
Predicted Label: spam

Text: Can you send me the report by tonight?
Predicted Label: ham



In [4]:
ds

NameError: name 'ds' is not defined

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
from datasets import Dataset

# Load the model
model_name = "facebook/opt-350m"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=28,  # Number of unique emotion classes
    device_map="auto"
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Freeze most parameters in the base model but unfreeze the last few layers for fine-tuning
for name, param in model.named_parameters():
    if "decoder.layers" in name and any(str(i) in name for i in range(22, 24)):  # Unfreeze last two layers (example: layers 22 and 23)
        param.requires_grad = True
    else:
        param.requires_grad = False

# Load dataset
ds = load_dataset("google-research-datasets/go_emotions", "simplified")

# Convert to Pandas DataFrame for preprocessing
df_train = pd.DataFrame(ds["train"])

# Retain only the required columns
df_train = df_train[['text', 'labels']]

# Flatten the labels if they are lists
df_train['labels'] = df_train['labels'].apply(lambda x: x[0] if isinstance(x, list) else x)

# Split the dataset (20% train, 80% test)
train_subset = df_train.sample(frac=0.2, random_state=42)  # Random 20% for training
test_subset = df_train.drop(train_subset.index)            # Remaining 80% for testing

# Convert back to Hugging Face Dataset
train_data = Dataset.from_pandas(train_subset)
test_data = Dataset.from_pandas(test_subset)

# Tokenize the input sequences
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', max_length=128, truncation=True)

# Apply tokenization
train_data = train_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results_emotions",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    fp16=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer
)

# Start Training
trainer.train()


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8682 [00:00<?, ? examples/s]

Map:   0%|          | 0/34728 [00:00<?, ? examples/s]



  0%|          | 0/1629 [00:00<?, ?it/s]

{'loss': 4.2637, 'grad_norm': 20.502784729003906, 'learning_rate': 1.9877225291589935e-05, 'epoch': 0.02}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 3.4493839740753174, 'eval_runtime': 132.2367, 'eval_samples_per_second': 262.62, 'eval_steps_per_second': 16.418, 'epoch': 0.02}
{'loss': 3.399, 'grad_norm': 20.662921905517578, 'learning_rate': 1.975445058317987e-05, 'epoch': 0.04}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 3.1418936252593994, 'eval_runtime': 132.344, 'eval_samples_per_second': 262.407, 'eval_steps_per_second': 16.404, 'epoch': 0.04}
{'loss': 2.8836, 'grad_norm': 14.344673156738281, 'learning_rate': 1.96316758747698e-05, 'epoch': 0.06}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 3.0008111000061035, 'eval_runtime': 127.5547, 'eval_samples_per_second': 272.26, 'eval_steps_per_second': 17.02, 'epoch': 0.06}
{'loss': 3.2196, 'grad_norm': 15.672420501708984, 'learning_rate': 1.950890116635973e-05, 'epoch': 0.07}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.907414436340332, 'eval_runtime': 126.6901, 'eval_samples_per_second': 274.118, 'eval_steps_per_second': 17.136, 'epoch': 0.07}
{'loss': 2.9519, 'grad_norm': 15.776799201965332, 'learning_rate': 1.9386126457949662e-05, 'epoch': 0.09}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.8423619270324707, 'eval_runtime': 126.4323, 'eval_samples_per_second': 274.677, 'eval_steps_per_second': 17.171, 'epoch': 0.09}
{'loss': 2.6683, 'grad_norm': 19.41396141052246, 'learning_rate': 1.9263351749539596e-05, 'epoch': 0.11}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.764753818511963, 'eval_runtime': 126.4495, 'eval_samples_per_second': 274.639, 'eval_steps_per_second': 17.169, 'epoch': 0.11}
{'loss': 2.6397, 'grad_norm': 14.85412883758545, 'learning_rate': 1.914057704112953e-05, 'epoch': 0.13}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.683339834213257, 'eval_runtime': 126.2161, 'eval_samples_per_second': 275.147, 'eval_steps_per_second': 17.201, 'epoch': 0.13}
{'loss': 2.672, 'grad_norm': 14.28772258758545, 'learning_rate': 1.9017802332719463e-05, 'epoch': 0.15}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.6350808143615723, 'eval_runtime': 125.6882, 'eval_samples_per_second': 276.303, 'eval_steps_per_second': 17.273, 'epoch': 0.15}
{'loss': 2.4927, 'grad_norm': 15.020136833190918, 'learning_rate': 1.8895027624309393e-05, 'epoch': 0.17}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.5985424518585205, 'eval_runtime': 125.7787, 'eval_samples_per_second': 276.104, 'eval_steps_per_second': 17.26, 'epoch': 0.17}
{'loss': 2.4544, 'grad_norm': 13.63717269897461, 'learning_rate': 1.8772252915899326e-05, 'epoch': 0.18}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.5433852672576904, 'eval_runtime': 126.0393, 'eval_samples_per_second': 275.533, 'eval_steps_per_second': 17.225, 'epoch': 0.18}
{'loss': 2.5823, 'grad_norm': 19.99674415588379, 'learning_rate': 1.8649478207489257e-05, 'epoch': 0.2}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.502124309539795, 'eval_runtime': 125.9242, 'eval_samples_per_second': 275.785, 'eval_steps_per_second': 17.241, 'epoch': 0.2}
{'loss': 2.5748, 'grad_norm': 14.535670280456543, 'learning_rate': 1.852670349907919e-05, 'epoch': 0.22}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.4401469230651855, 'eval_runtime': 126.4228, 'eval_samples_per_second': 274.697, 'eval_steps_per_second': 17.173, 'epoch': 0.22}
{'loss': 2.3763, 'grad_norm': 10.493953704833984, 'learning_rate': 1.8403928790669124e-05, 'epoch': 0.24}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.4299204349517822, 'eval_runtime': 126.0575, 'eval_samples_per_second': 275.493, 'eval_steps_per_second': 17.222, 'epoch': 0.24}
{'loss': 2.4738, 'grad_norm': 11.594463348388672, 'learning_rate': 1.8281154082259057e-05, 'epoch': 0.26}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.385680675506592, 'eval_runtime': 125.8136, 'eval_samples_per_second': 276.027, 'eval_steps_per_second': 17.256, 'epoch': 0.26}
{'loss': 2.3968, 'grad_norm': 14.656950950622559, 'learning_rate': 1.815837937384899e-05, 'epoch': 0.28}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.3592305183410645, 'eval_runtime': 126.0056, 'eval_samples_per_second': 275.607, 'eval_steps_per_second': 17.229, 'epoch': 0.28}
{'loss': 2.5252, 'grad_norm': 12.501265525817871, 'learning_rate': 1.803560466543892e-05, 'epoch': 0.29}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.3071489334106445, 'eval_runtime': 126.304, 'eval_samples_per_second': 274.956, 'eval_steps_per_second': 17.189, 'epoch': 0.29}
{'loss': 2.253, 'grad_norm': 12.870792388916016, 'learning_rate': 1.7912829957028854e-05, 'epoch': 0.31}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.3083901405334473, 'eval_runtime': 125.5558, 'eval_samples_per_second': 276.594, 'eval_steps_per_second': 17.291, 'epoch': 0.31}
{'loss': 2.1349, 'grad_norm': 11.70281982421875, 'learning_rate': 1.7790055248618784e-05, 'epoch': 0.33}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.315347194671631, 'eval_runtime': 125.615, 'eval_samples_per_second': 276.464, 'eval_steps_per_second': 17.283, 'epoch': 0.33}
{'loss': 2.3887, 'grad_norm': 14.166975975036621, 'learning_rate': 1.7667280540208718e-05, 'epoch': 0.35}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.2622058391571045, 'eval_runtime': 126.2489, 'eval_samples_per_second': 275.076, 'eval_steps_per_second': 17.196, 'epoch': 0.35}
{'loss': 2.1084, 'grad_norm': 10.692850112915039, 'learning_rate': 1.754450583179865e-05, 'epoch': 0.37}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.1990485191345215, 'eval_runtime': 125.9219, 'eval_samples_per_second': 275.79, 'eval_steps_per_second': 17.241, 'epoch': 0.37}
{'loss': 2.0733, 'grad_norm': 14.216978073120117, 'learning_rate': 1.7421731123388585e-05, 'epoch': 0.39}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.18493914604187, 'eval_runtime': 126.1722, 'eval_samples_per_second': 275.243, 'eval_steps_per_second': 17.207, 'epoch': 0.39}
{'loss': 2.3805, 'grad_norm': 11.858458518981934, 'learning_rate': 1.7298956414978515e-05, 'epoch': 0.41}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.1623756885528564, 'eval_runtime': 125.5192, 'eval_samples_per_second': 276.675, 'eval_steps_per_second': 17.296, 'epoch': 0.41}
{'loss': 2.2324, 'grad_norm': 13.20415210723877, 'learning_rate': 1.717618170656845e-05, 'epoch': 0.42}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.1281251907348633, 'eval_runtime': 126.1021, 'eval_samples_per_second': 275.396, 'eval_steps_per_second': 17.216, 'epoch': 0.42}
{'loss': 2.1638, 'grad_norm': 10.843575477600098, 'learning_rate': 1.7053406998158382e-05, 'epoch': 0.44}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.1357662677764893, 'eval_runtime': 125.7418, 'eval_samples_per_second': 276.185, 'eval_steps_per_second': 17.266, 'epoch': 0.44}
{'loss': 2.1633, 'grad_norm': 11.151978492736816, 'learning_rate': 1.6930632289748312e-05, 'epoch': 0.46}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.1297812461853027, 'eval_runtime': 125.7971, 'eval_samples_per_second': 276.064, 'eval_steps_per_second': 17.258, 'epoch': 0.46}
{'loss': 2.1282, 'grad_norm': 12.20561408996582, 'learning_rate': 1.6807857581338245e-05, 'epoch': 0.48}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.054462194442749, 'eval_runtime': 126.1127, 'eval_samples_per_second': 275.373, 'eval_steps_per_second': 17.215, 'epoch': 0.48}
{'loss': 1.9187, 'grad_norm': 14.331145286560059, 'learning_rate': 1.668508287292818e-05, 'epoch': 0.5}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.064734697341919, 'eval_runtime': 125.683, 'eval_samples_per_second': 276.314, 'eval_steps_per_second': 17.274, 'epoch': 0.5}
{'loss': 2.1081, 'grad_norm': 11.247084617614746, 'learning_rate': 1.656230816451811e-05, 'epoch': 0.52}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.0552492141723633, 'eval_runtime': 125.8666, 'eval_samples_per_second': 275.911, 'eval_steps_per_second': 17.248, 'epoch': 0.52}
{'loss': 1.8153, 'grad_norm': 12.371651649475098, 'learning_rate': 1.6439533456108043e-05, 'epoch': 0.53}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 2.0196101665496826, 'eval_runtime': 125.4814, 'eval_samples_per_second': 276.758, 'eval_steps_per_second': 17.301, 'epoch': 0.53}
{'loss': 2.1308, 'grad_norm': 12.606748580932617, 'learning_rate': 1.6316758747697976e-05, 'epoch': 0.55}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.9782634973526, 'eval_runtime': 125.6316, 'eval_samples_per_second': 276.427, 'eval_steps_per_second': 17.281, 'epoch': 0.55}
{'loss': 1.9664, 'grad_norm': 14.043485641479492, 'learning_rate': 1.619398403928791e-05, 'epoch': 0.57}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.9771995544433594, 'eval_runtime': 125.7414, 'eval_samples_per_second': 276.186, 'eval_steps_per_second': 17.266, 'epoch': 0.57}
{'loss': 1.9271, 'grad_norm': 12.505230903625488, 'learning_rate': 1.607120933087784e-05, 'epoch': 0.59}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.9668465852737427, 'eval_runtime': 125.6999, 'eval_samples_per_second': 276.277, 'eval_steps_per_second': 17.271, 'epoch': 0.59}
{'loss': 1.8842, 'grad_norm': 18.12596893310547, 'learning_rate': 1.5948434622467773e-05, 'epoch': 0.61}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.9588996171951294, 'eval_runtime': 125.4935, 'eval_samples_per_second': 276.731, 'eval_steps_per_second': 17.3, 'epoch': 0.61}
{'loss': 2.0344, 'grad_norm': 12.718469619750977, 'learning_rate': 1.5825659914057703e-05, 'epoch': 0.63}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.9612551927566528, 'eval_runtime': 125.9924, 'eval_samples_per_second': 275.636, 'eval_steps_per_second': 17.231, 'epoch': 0.63}
{'loss': 1.9063, 'grad_norm': 10.606160163879395, 'learning_rate': 1.5702885205647637e-05, 'epoch': 0.64}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.9116121530532837, 'eval_runtime': 125.4254, 'eval_samples_per_second': 276.882, 'eval_steps_per_second': 17.309, 'epoch': 0.64}
{'loss': 1.9129, 'grad_norm': 13.875346183776855, 'learning_rate': 1.558011049723757e-05, 'epoch': 0.66}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.911092758178711, 'eval_runtime': 125.9392, 'eval_samples_per_second': 275.752, 'eval_steps_per_second': 17.238, 'epoch': 0.66}
{'loss': 1.9525, 'grad_norm': 10.503704071044922, 'learning_rate': 1.5457335788827504e-05, 'epoch': 0.68}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.9166126251220703, 'eval_runtime': 125.3901, 'eval_samples_per_second': 276.96, 'eval_steps_per_second': 17.314, 'epoch': 0.68}
{'loss': 1.9981, 'grad_norm': 12.076457023620605, 'learning_rate': 1.5334561080417437e-05, 'epoch': 0.7}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.8965275287628174, 'eval_runtime': 125.495, 'eval_samples_per_second': 276.728, 'eval_steps_per_second': 17.299, 'epoch': 0.7}
{'loss': 1.8672, 'grad_norm': 14.423935890197754, 'learning_rate': 1.5211786372007367e-05, 'epoch': 0.72}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.8555974960327148, 'eval_runtime': 125.5625, 'eval_samples_per_second': 276.579, 'eval_steps_per_second': 17.29, 'epoch': 0.72}
{'loss': 1.7897, 'grad_norm': 12.205228805541992, 'learning_rate': 1.50890116635973e-05, 'epoch': 0.74}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.8689967393875122, 'eval_runtime': 125.6622, 'eval_samples_per_second': 276.36, 'eval_steps_per_second': 17.276, 'epoch': 0.74}
{'loss': 1.9034, 'grad_norm': 12.100110054016113, 'learning_rate': 1.4966236955187233e-05, 'epoch': 0.76}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.8553743362426758, 'eval_runtime': 126.1283, 'eval_samples_per_second': 275.339, 'eval_steps_per_second': 17.213, 'epoch': 0.76}
{'loss': 1.8335, 'grad_norm': 11.607989311218262, 'learning_rate': 1.4843462246777164e-05, 'epoch': 0.77}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.872910737991333, 'eval_runtime': 125.8496, 'eval_samples_per_second': 275.948, 'eval_steps_per_second': 17.251, 'epoch': 0.77}
{'loss': 1.7738, 'grad_norm': 14.125314712524414, 'learning_rate': 1.4720687538367098e-05, 'epoch': 0.79}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.8755189180374146, 'eval_runtime': 128.2578, 'eval_samples_per_second': 270.767, 'eval_steps_per_second': 16.927, 'epoch': 0.79}
{'loss': 1.8562, 'grad_norm': 11.631851196289062, 'learning_rate': 1.459791282995703e-05, 'epoch': 0.81}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.8472983837127686, 'eval_runtime': 133.1432, 'eval_samples_per_second': 260.832, 'eval_steps_per_second': 16.306, 'epoch': 0.81}
{'loss': 1.7959, 'grad_norm': 12.617838859558105, 'learning_rate': 1.4475138121546963e-05, 'epoch': 0.83}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.826341152191162, 'eval_runtime': 128.7334, 'eval_samples_per_second': 269.767, 'eval_steps_per_second': 16.864, 'epoch': 0.83}
{'loss': 1.9249, 'grad_norm': 11.359489440917969, 'learning_rate': 1.4352363413136897e-05, 'epoch': 0.85}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.8221261501312256, 'eval_runtime': 128.3529, 'eval_samples_per_second': 270.567, 'eval_steps_per_second': 16.914, 'epoch': 0.85}
{'loss': 2.0772, 'grad_norm': 12.595452308654785, 'learning_rate': 1.4229588704726827e-05, 'epoch': 0.87}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7951661348342896, 'eval_runtime': 126.8348, 'eval_samples_per_second': 273.805, 'eval_steps_per_second': 17.117, 'epoch': 0.87}
{'loss': 1.8644, 'grad_norm': 9.98974895477295, 'learning_rate': 1.4106813996316759e-05, 'epoch': 0.88}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.777665615081787, 'eval_runtime': 126.7736, 'eval_samples_per_second': 273.937, 'eval_steps_per_second': 17.125, 'epoch': 0.88}
{'loss': 1.6587, 'grad_norm': 11.5889253616333, 'learning_rate': 1.3984039287906692e-05, 'epoch': 0.9}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7965433597564697, 'eval_runtime': 127.2127, 'eval_samples_per_second': 272.992, 'eval_steps_per_second': 17.066, 'epoch': 0.9}
{'loss': 1.986, 'grad_norm': 14.02219295501709, 'learning_rate': 1.3861264579496626e-05, 'epoch': 0.92}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7672375440597534, 'eval_runtime': 125.5705, 'eval_samples_per_second': 276.562, 'eval_steps_per_second': 17.289, 'epoch': 0.92}
{'loss': 1.7847, 'grad_norm': 19.87342071533203, 'learning_rate': 1.3738489871086557e-05, 'epoch': 0.94}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7573490142822266, 'eval_runtime': 125.5067, 'eval_samples_per_second': 276.702, 'eval_steps_per_second': 17.298, 'epoch': 0.94}
{'loss': 1.6932, 'grad_norm': 11.342931747436523, 'learning_rate': 1.3615715162676491e-05, 'epoch': 0.96}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7526469230651855, 'eval_runtime': 125.6519, 'eval_samples_per_second': 276.383, 'eval_steps_per_second': 17.278, 'epoch': 0.96}
{'loss': 1.7044, 'grad_norm': 13.00281810760498, 'learning_rate': 1.3492940454266423e-05, 'epoch': 0.98}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7581565380096436, 'eval_runtime': 125.5575, 'eval_samples_per_second': 276.59, 'eval_steps_per_second': 17.291, 'epoch': 0.98}
{'loss': 1.8713, 'grad_norm': 13.146998405456543, 'learning_rate': 1.3370165745856355e-05, 'epoch': 0.99}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7597558498382568, 'eval_runtime': 125.6849, 'eval_samples_per_second': 276.31, 'eval_steps_per_second': 17.273, 'epoch': 0.99}
{'loss': 1.7487, 'grad_norm': 14.83298110961914, 'learning_rate': 1.3247391037446286e-05, 'epoch': 1.01}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.754740595817566, 'eval_runtime': 125.7523, 'eval_samples_per_second': 276.162, 'eval_steps_per_second': 17.264, 'epoch': 1.01}
{'loss': 1.6765, 'grad_norm': 16.125356674194336, 'learning_rate': 1.312461632903622e-05, 'epoch': 1.03}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7434734106063843, 'eval_runtime': 125.9367, 'eval_samples_per_second': 275.758, 'eval_steps_per_second': 17.239, 'epoch': 1.03}
{'loss': 1.6028, 'grad_norm': 12.32500171661377, 'learning_rate': 1.3001841620626152e-05, 'epoch': 1.05}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7536464929580688, 'eval_runtime': 125.7774, 'eval_samples_per_second': 276.107, 'eval_steps_per_second': 17.261, 'epoch': 1.05}
{'loss': 1.6293, 'grad_norm': 9.613348007202148, 'learning_rate': 1.2879066912216085e-05, 'epoch': 1.07}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7602289915084839, 'eval_runtime': 126.0001, 'eval_samples_per_second': 275.619, 'eval_steps_per_second': 17.23, 'epoch': 1.07}
{'loss': 1.5469, 'grad_norm': 10.894640922546387, 'learning_rate': 1.2756292203806017e-05, 'epoch': 1.09}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7431530952453613, 'eval_runtime': 125.917, 'eval_samples_per_second': 275.801, 'eval_steps_per_second': 17.242, 'epoch': 1.09}
{'loss': 1.4878, 'grad_norm': 12.131148338317871, 'learning_rate': 1.263351749539595e-05, 'epoch': 1.1}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7390605211257935, 'eval_runtime': 125.8256, 'eval_samples_per_second': 276.001, 'eval_steps_per_second': 17.254, 'epoch': 1.1}
{'loss': 1.6253, 'grad_norm': 13.241412162780762, 'learning_rate': 1.251074278698588e-05, 'epoch': 1.12}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.759644627571106, 'eval_runtime': 126.41, 'eval_samples_per_second': 274.725, 'eval_steps_per_second': 17.174, 'epoch': 1.12}
{'loss': 1.5314, 'grad_norm': 15.77187728881836, 'learning_rate': 1.2387968078575814e-05, 'epoch': 1.14}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7438377141952515, 'eval_runtime': 127.6541, 'eval_samples_per_second': 272.048, 'eval_steps_per_second': 17.007, 'epoch': 1.14}
{'loss': 1.4164, 'grad_norm': 10.593084335327148, 'learning_rate': 1.2265193370165746e-05, 'epoch': 1.16}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7381561994552612, 'eval_runtime': 127.0122, 'eval_samples_per_second': 273.423, 'eval_steps_per_second': 17.093, 'epoch': 1.16}
{'loss': 1.5945, 'grad_norm': 12.348043441772461, 'learning_rate': 1.214241866175568e-05, 'epoch': 1.18}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7329989671707153, 'eval_runtime': 126.4588, 'eval_samples_per_second': 274.619, 'eval_steps_per_second': 17.168, 'epoch': 1.18}
{'loss': 1.4549, 'grad_norm': 9.91938591003418, 'learning_rate': 1.2019643953345613e-05, 'epoch': 1.2}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7175971269607544, 'eval_runtime': 126.9274, 'eval_samples_per_second': 273.605, 'eval_steps_per_second': 17.104, 'epoch': 1.2}
{'loss': 1.481, 'grad_norm': 13.80865478515625, 'learning_rate': 1.1896869244935545e-05, 'epoch': 1.22}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7326555252075195, 'eval_runtime': 126.471, 'eval_samples_per_second': 274.593, 'eval_steps_per_second': 17.166, 'epoch': 1.22}
{'loss': 1.7298, 'grad_norm': 11.069134712219238, 'learning_rate': 1.1774094536525478e-05, 'epoch': 1.23}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7224061489105225, 'eval_runtime': 125.7401, 'eval_samples_per_second': 276.189, 'eval_steps_per_second': 17.266, 'epoch': 1.23}
{'loss': 1.5465, 'grad_norm': 13.585461616516113, 'learning_rate': 1.1651319828115408e-05, 'epoch': 1.25}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7021195888519287, 'eval_runtime': 125.8623, 'eval_samples_per_second': 275.921, 'eval_steps_per_second': 17.249, 'epoch': 1.25}
{'loss': 1.4086, 'grad_norm': 15.277249336242676, 'learning_rate': 1.1528545119705342e-05, 'epoch': 1.27}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7097028493881226, 'eval_runtime': 126.1715, 'eval_samples_per_second': 275.245, 'eval_steps_per_second': 17.207, 'epoch': 1.27}
{'loss': 1.4801, 'grad_norm': 15.01574420928955, 'learning_rate': 1.1405770411295274e-05, 'epoch': 1.29}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7066168785095215, 'eval_runtime': 125.6886, 'eval_samples_per_second': 276.302, 'eval_steps_per_second': 17.273, 'epoch': 1.29}
{'loss': 1.6849, 'grad_norm': 11.566949844360352, 'learning_rate': 1.1282995702885207e-05, 'epoch': 1.31}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7010235786437988, 'eval_runtime': 127.2907, 'eval_samples_per_second': 272.824, 'eval_steps_per_second': 17.055, 'epoch': 1.31}
{'loss': 1.6954, 'grad_norm': 11.21959400177002, 'learning_rate': 1.1160220994475139e-05, 'epoch': 1.33}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6944050788879395, 'eval_runtime': 127.1515, 'eval_samples_per_second': 273.123, 'eval_steps_per_second': 17.074, 'epoch': 1.33}
{'loss': 1.3983, 'grad_norm': 12.239365577697754, 'learning_rate': 1.1037446286065072e-05, 'epoch': 1.34}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6862884759902954, 'eval_runtime': 125.5321, 'eval_samples_per_second': 276.646, 'eval_steps_per_second': 17.294, 'epoch': 1.34}
{'loss': 1.3438, 'grad_norm': 8.048335075378418, 'learning_rate': 1.0914671577655004e-05, 'epoch': 1.36}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.690748691558838, 'eval_runtime': 141.3592, 'eval_samples_per_second': 245.672, 'eval_steps_per_second': 15.358, 'epoch': 1.36}
{'loss': 1.3531, 'grad_norm': 12.17499828338623, 'learning_rate': 1.0791896869244936e-05, 'epoch': 1.38}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7023658752441406, 'eval_runtime': 135.3249, 'eval_samples_per_second': 256.627, 'eval_steps_per_second': 16.043, 'epoch': 1.38}
{'loss': 1.5706, 'grad_norm': 15.833398818969727, 'learning_rate': 1.0669122160834868e-05, 'epoch': 1.4}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7161729335784912, 'eval_runtime': 138.868, 'eval_samples_per_second': 250.079, 'eval_steps_per_second': 15.634, 'epoch': 1.4}
{'loss': 1.4832, 'grad_norm': 10.746454238891602, 'learning_rate': 1.0546347452424801e-05, 'epoch': 1.42}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.7038041353225708, 'eval_runtime': 142.3768, 'eval_samples_per_second': 243.916, 'eval_steps_per_second': 15.248, 'epoch': 1.42}
{'loss': 1.5544, 'grad_norm': 14.22551441192627, 'learning_rate': 1.0423572744014733e-05, 'epoch': 1.44}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6849215030670166, 'eval_runtime': 142.4146, 'eval_samples_per_second': 243.851, 'eval_steps_per_second': 15.244, 'epoch': 1.44}
{'loss': 1.3975, 'grad_norm': 11.801504135131836, 'learning_rate': 1.0300798035604667e-05, 'epoch': 1.45}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.670257329940796, 'eval_runtime': 142.4393, 'eval_samples_per_second': 243.809, 'eval_steps_per_second': 15.242, 'epoch': 1.45}
{'loss': 1.5193, 'grad_norm': 13.438996315002441, 'learning_rate': 1.01780233271946e-05, 'epoch': 1.47}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.680382490158081, 'eval_runtime': 131.8192, 'eval_samples_per_second': 263.452, 'eval_steps_per_second': 16.47, 'epoch': 1.47}
{'loss': 1.7805, 'grad_norm': 12.407393455505371, 'learning_rate': 1.0055248618784532e-05, 'epoch': 1.49}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.685682773590088, 'eval_runtime': 137.3543, 'eval_samples_per_second': 252.835, 'eval_steps_per_second': 15.806, 'epoch': 1.49}
{'loss': 1.7143, 'grad_norm': 15.133367538452148, 'learning_rate': 9.932473910374464e-06, 'epoch': 1.51}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6791750192642212, 'eval_runtime': 142.7703, 'eval_samples_per_second': 243.244, 'eval_steps_per_second': 15.206, 'epoch': 1.51}
{'loss': 1.3765, 'grad_norm': 9.829365730285645, 'learning_rate': 9.809699201964397e-06, 'epoch': 1.53}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.682852864265442, 'eval_runtime': 142.355, 'eval_samples_per_second': 243.954, 'eval_steps_per_second': 15.251, 'epoch': 1.53}
{'loss': 1.4298, 'grad_norm': 13.038516998291016, 'learning_rate': 9.686924493554329e-06, 'epoch': 1.55}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6850080490112305, 'eval_runtime': 142.2982, 'eval_samples_per_second': 244.051, 'eval_steps_per_second': 15.257, 'epoch': 1.55}
{'loss': 1.2952, 'grad_norm': 9.886930465698242, 'learning_rate': 9.56414978514426e-06, 'epoch': 1.57}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6762882471084595, 'eval_runtime': 142.1784, 'eval_samples_per_second': 244.256, 'eval_steps_per_second': 15.27, 'epoch': 1.57}
{'loss': 1.5816, 'grad_norm': 13.442007064819336, 'learning_rate': 9.441375076734194e-06, 'epoch': 1.58}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6746947765350342, 'eval_runtime': 142.0793, 'eval_samples_per_second': 244.427, 'eval_steps_per_second': 15.28, 'epoch': 1.58}
{'loss': 1.3146, 'grad_norm': 13.750150680541992, 'learning_rate': 9.318600368324126e-06, 'epoch': 1.6}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6763293743133545, 'eval_runtime': 142.2519, 'eval_samples_per_second': 244.13, 'eval_steps_per_second': 15.262, 'epoch': 1.6}
{'loss': 1.5387, 'grad_norm': 12.575230598449707, 'learning_rate': 9.195825659914058e-06, 'epoch': 1.62}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6802382469177246, 'eval_runtime': 142.2685, 'eval_samples_per_second': 244.102, 'eval_steps_per_second': 15.26, 'epoch': 1.62}
{'loss': 1.5649, 'grad_norm': 13.815862655639648, 'learning_rate': 9.073050951503991e-06, 'epoch': 1.64}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6762770414352417, 'eval_runtime': 142.2661, 'eval_samples_per_second': 244.106, 'eval_steps_per_second': 15.26, 'epoch': 1.64}
{'loss': 1.4986, 'grad_norm': 12.668876647949219, 'learning_rate': 8.950276243093923e-06, 'epoch': 1.66}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6890311241149902, 'eval_runtime': 141.9861, 'eval_samples_per_second': 244.587, 'eval_steps_per_second': 15.29, 'epoch': 1.66}
{'loss': 1.2592, 'grad_norm': 12.194942474365234, 'learning_rate': 8.827501534683855e-06, 'epoch': 1.68}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.686991572380066, 'eval_runtime': 142.1883, 'eval_samples_per_second': 244.24, 'eval_steps_per_second': 15.268, 'epoch': 1.68}
{'loss': 1.7536, 'grad_norm': 15.266300201416016, 'learning_rate': 8.704726826273789e-06, 'epoch': 1.69}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.685783863067627, 'eval_runtime': 142.1875, 'eval_samples_per_second': 244.241, 'eval_steps_per_second': 15.269, 'epoch': 1.69}
{'loss': 1.4127, 'grad_norm': 12.362787246704102, 'learning_rate': 8.58195211786372e-06, 'epoch': 1.71}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6675289869308472, 'eval_runtime': 142.1723, 'eval_samples_per_second': 244.267, 'eval_steps_per_second': 15.27, 'epoch': 1.71}
{'loss': 1.4147, 'grad_norm': 11.505629539489746, 'learning_rate': 8.459177409453652e-06, 'epoch': 1.73}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.661782145500183, 'eval_runtime': 142.1733, 'eval_samples_per_second': 244.265, 'eval_steps_per_second': 15.27, 'epoch': 1.73}
{'loss': 1.3303, 'grad_norm': 14.534158706665039, 'learning_rate': 8.336402701043586e-06, 'epoch': 1.75}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6618016958236694, 'eval_runtime': 142.9913, 'eval_samples_per_second': 242.868, 'eval_steps_per_second': 15.183, 'epoch': 1.75}
{'loss': 1.3295, 'grad_norm': 11.81948184967041, 'learning_rate': 8.213627992633517e-06, 'epoch': 1.77}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.665461778640747, 'eval_runtime': 143.2512, 'eval_samples_per_second': 242.427, 'eval_steps_per_second': 15.155, 'epoch': 1.77}
{'loss': 1.5191, 'grad_norm': 11.189281463623047, 'learning_rate': 8.090853284223451e-06, 'epoch': 1.79}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6745972633361816, 'eval_runtime': 143.4933, 'eval_samples_per_second': 242.018, 'eval_steps_per_second': 15.13, 'epoch': 1.79}
{'loss': 1.4737, 'grad_norm': 14.082925796508789, 'learning_rate': 7.968078575813383e-06, 'epoch': 1.8}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6656789779663086, 'eval_runtime': 143.0646, 'eval_samples_per_second': 242.743, 'eval_steps_per_second': 15.175, 'epoch': 1.8}
{'loss': 1.4288, 'grad_norm': 11.57168960571289, 'learning_rate': 7.845303867403316e-06, 'epoch': 1.82}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.655276894569397, 'eval_runtime': 142.2499, 'eval_samples_per_second': 244.134, 'eval_steps_per_second': 15.262, 'epoch': 1.82}
{'loss': 1.4432, 'grad_norm': 12.874267578125, 'learning_rate': 7.722529158993248e-06, 'epoch': 1.84}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.661490559577942, 'eval_runtime': 142.1107, 'eval_samples_per_second': 244.373, 'eval_steps_per_second': 15.277, 'epoch': 1.84}
{'loss': 1.4737, 'grad_norm': 11.042892456054688, 'learning_rate': 7.59975445058318e-06, 'epoch': 1.86}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6655654907226562, 'eval_runtime': 142.4062, 'eval_samples_per_second': 243.866, 'eval_steps_per_second': 15.245, 'epoch': 1.86}
{'loss': 1.6817, 'grad_norm': 14.56442642211914, 'learning_rate': 7.4769797421731125e-06, 'epoch': 1.88}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.651874303817749, 'eval_runtime': 142.3764, 'eval_samples_per_second': 243.917, 'eval_steps_per_second': 15.248, 'epoch': 1.88}
{'loss': 1.5141, 'grad_norm': 13.907103538513184, 'learning_rate': 7.354205033763045e-06, 'epoch': 1.9}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.66858971118927, 'eval_runtime': 142.1022, 'eval_samples_per_second': 244.387, 'eval_steps_per_second': 15.278, 'epoch': 1.9}
{'loss': 1.4648, 'grad_norm': 9.685714721679688, 'learning_rate': 7.231430325352979e-06, 'epoch': 1.92}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6475385427474976, 'eval_runtime': 142.2035, 'eval_samples_per_second': 244.213, 'eval_steps_per_second': 15.267, 'epoch': 1.92}
{'loss': 1.5493, 'grad_norm': 11.358406066894531, 'learning_rate': 7.10865561694291e-06, 'epoch': 1.93}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6399909257888794, 'eval_runtime': 142.4066, 'eval_samples_per_second': 243.865, 'eval_steps_per_second': 15.245, 'epoch': 1.93}
{'loss': 1.3458, 'grad_norm': 12.132195472717285, 'learning_rate': 6.985880908532843e-06, 'epoch': 1.95}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6425790786743164, 'eval_runtime': 142.079, 'eval_samples_per_second': 244.427, 'eval_steps_per_second': 15.28, 'epoch': 1.95}
{'loss': 1.4831, 'grad_norm': 13.09170150756836, 'learning_rate': 6.863106200122776e-06, 'epoch': 1.97}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6361024379730225, 'eval_runtime': 142.2174, 'eval_samples_per_second': 244.19, 'eval_steps_per_second': 15.265, 'epoch': 1.97}
{'loss': 1.3088, 'grad_norm': 15.755821228027344, 'learning_rate': 6.740331491712708e-06, 'epoch': 1.99}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6395447254180908, 'eval_runtime': 142.1411, 'eval_samples_per_second': 244.321, 'eval_steps_per_second': 15.274, 'epoch': 1.99}
{'loss': 1.4544, 'grad_norm': 13.779105186462402, 'learning_rate': 6.61755678330264e-06, 'epoch': 2.01}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6442103385925293, 'eval_runtime': 142.1157, 'eval_samples_per_second': 244.364, 'eval_steps_per_second': 15.276, 'epoch': 2.01}
{'loss': 1.0162, 'grad_norm': 10.920365333557129, 'learning_rate': 6.494782074892573e-06, 'epoch': 2.03}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6469438076019287, 'eval_runtime': 142.5325, 'eval_samples_per_second': 243.65, 'eval_steps_per_second': 15.232, 'epoch': 2.03}
{'loss': 1.3362, 'grad_norm': 12.01740837097168, 'learning_rate': 6.3720073664825055e-06, 'epoch': 2.04}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6378309726715088, 'eval_runtime': 142.1957, 'eval_samples_per_second': 244.227, 'eval_steps_per_second': 15.268, 'epoch': 2.04}
{'loss': 1.2688, 'grad_norm': 10.232728958129883, 'learning_rate': 6.249232658072437e-06, 'epoch': 2.06}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6363862752914429, 'eval_runtime': 142.0942, 'eval_samples_per_second': 244.401, 'eval_steps_per_second': 15.279, 'epoch': 2.06}
{'loss': 1.2827, 'grad_norm': 11.152498245239258, 'learning_rate': 6.12645794966237e-06, 'epoch': 2.08}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6505430936813354, 'eval_runtime': 146.235, 'eval_samples_per_second': 237.481, 'eval_steps_per_second': 14.846, 'epoch': 2.08}
{'loss': 1.3192, 'grad_norm': 13.91309642791748, 'learning_rate': 6.003683241252303e-06, 'epoch': 2.1}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6372748613357544, 'eval_runtime': 142.0166, 'eval_samples_per_second': 244.535, 'eval_steps_per_second': 15.287, 'epoch': 2.1}
{'loss': 1.1308, 'grad_norm': 11.93741512298584, 'learning_rate': 5.880908532842235e-06, 'epoch': 2.12}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6405781507492065, 'eval_runtime': 141.9687, 'eval_samples_per_second': 244.617, 'eval_steps_per_second': 15.292, 'epoch': 2.12}
{'loss': 1.3819, 'grad_norm': 11.54050350189209, 'learning_rate': 5.758133824432167e-06, 'epoch': 2.14}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6431386470794678, 'eval_runtime': 142.0004, 'eval_samples_per_second': 244.563, 'eval_steps_per_second': 15.289, 'epoch': 2.14}
{'loss': 1.361, 'grad_norm': 10.870129585266113, 'learning_rate': 5.6353591160221e-06, 'epoch': 2.15}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6403518915176392, 'eval_runtime': 142.0155, 'eval_samples_per_second': 244.537, 'eval_steps_per_second': 15.287, 'epoch': 2.15}
{'loss': 1.2937, 'grad_norm': 15.896017074584961, 'learning_rate': 5.512584407612032e-06, 'epoch': 2.17}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.638947606086731, 'eval_runtime': 141.9223, 'eval_samples_per_second': 244.697, 'eval_steps_per_second': 15.297, 'epoch': 2.17}
{'loss': 1.1318, 'grad_norm': 13.167329788208008, 'learning_rate': 5.389809699201964e-06, 'epoch': 2.19}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6408213376998901, 'eval_runtime': 142.828, 'eval_samples_per_second': 243.146, 'eval_steps_per_second': 15.2, 'epoch': 2.19}
{'loss': 1.2538, 'grad_norm': 14.103668212890625, 'learning_rate': 5.267034990791897e-06, 'epoch': 2.21}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6436198949813843, 'eval_runtime': 142.883, 'eval_samples_per_second': 243.052, 'eval_steps_per_second': 15.194, 'epoch': 2.21}
{'loss': 1.2662, 'grad_norm': 13.54637336730957, 'learning_rate': 5.14426028238183e-06, 'epoch': 2.23}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6428894996643066, 'eval_runtime': 142.8286, 'eval_samples_per_second': 243.145, 'eval_steps_per_second': 15.2, 'epoch': 2.23}
{'loss': 1.3243, 'grad_norm': 14.847944259643555, 'learning_rate': 5.021485573971763e-06, 'epoch': 2.25}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6434639692306519, 'eval_runtime': 142.9831, 'eval_samples_per_second': 242.882, 'eval_steps_per_second': 15.184, 'epoch': 2.25}
{'loss': 1.1801, 'grad_norm': 11.984455108642578, 'learning_rate': 4.898710865561695e-06, 'epoch': 2.27}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.647178053855896, 'eval_runtime': 142.079, 'eval_samples_per_second': 244.427, 'eval_steps_per_second': 15.28, 'epoch': 2.27}
{'loss': 1.2507, 'grad_norm': 12.554903984069824, 'learning_rate': 4.7759361571516274e-06, 'epoch': 2.28}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6467456817626953, 'eval_runtime': 141.7195, 'eval_samples_per_second': 245.047, 'eval_steps_per_second': 15.319, 'epoch': 2.28}
{'loss': 1.2696, 'grad_norm': 9.391874313354492, 'learning_rate': 4.653161448741559e-06, 'epoch': 2.3}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6450824737548828, 'eval_runtime': 141.891, 'eval_samples_per_second': 244.751, 'eval_steps_per_second': 15.3, 'epoch': 2.3}
{'loss': 1.412, 'grad_norm': 16.565555572509766, 'learning_rate': 4.530386740331492e-06, 'epoch': 2.32}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6418428421020508, 'eval_runtime': 142.1888, 'eval_samples_per_second': 244.239, 'eval_steps_per_second': 15.268, 'epoch': 2.32}
{'loss': 1.4664, 'grad_norm': 13.9568510055542, 'learning_rate': 4.4076120319214245e-06, 'epoch': 2.34}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6395347118377686, 'eval_runtime': 141.954, 'eval_samples_per_second': 244.643, 'eval_steps_per_second': 15.294, 'epoch': 2.34}
{'loss': 1.2071, 'grad_norm': 12.362855911254883, 'learning_rate': 4.284837323511357e-06, 'epoch': 2.36}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6363493204116821, 'eval_runtime': 141.9302, 'eval_samples_per_second': 244.684, 'eval_steps_per_second': 15.296, 'epoch': 2.36}
{'loss': 1.2132, 'grad_norm': 13.342720985412598, 'learning_rate': 4.162062615101289e-06, 'epoch': 2.38}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6343231201171875, 'eval_runtime': 141.892, 'eval_samples_per_second': 244.75, 'eval_steps_per_second': 15.3, 'epoch': 2.38}
{'loss': 1.1498, 'grad_norm': 10.817745208740234, 'learning_rate': 4.0392879066912225e-06, 'epoch': 2.39}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6367486715316772, 'eval_runtime': 142.0014, 'eval_samples_per_second': 244.561, 'eval_steps_per_second': 15.289, 'epoch': 2.39}
{'loss': 1.2611, 'grad_norm': 14.915385246276855, 'learning_rate': 3.916513198281154e-06, 'epoch': 2.41}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6400378942489624, 'eval_runtime': 142.0166, 'eval_samples_per_second': 244.535, 'eval_steps_per_second': 15.287, 'epoch': 2.41}
{'loss': 1.1928, 'grad_norm': 15.71280288696289, 'learning_rate': 3.7937384898710865e-06, 'epoch': 2.43}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6461905241012573, 'eval_runtime': 142.141, 'eval_samples_per_second': 244.321, 'eval_steps_per_second': 15.274, 'epoch': 2.43}
{'loss': 1.0488, 'grad_norm': 11.685186386108398, 'learning_rate': 3.6709637814610196e-06, 'epoch': 2.45}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.652025580406189, 'eval_runtime': 141.8124, 'eval_samples_per_second': 244.887, 'eval_steps_per_second': 15.309, 'epoch': 2.45}
{'loss': 1.2378, 'grad_norm': 16.240461349487305, 'learning_rate': 3.548189073050952e-06, 'epoch': 2.47}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6461347341537476, 'eval_runtime': 134.4275, 'eval_samples_per_second': 258.34, 'eval_steps_per_second': 16.15, 'epoch': 2.47}
{'loss': 1.1329, 'grad_norm': 13.344804763793945, 'learning_rate': 3.4254143646408845e-06, 'epoch': 2.49}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6488584280014038, 'eval_runtime': 126.6642, 'eval_samples_per_second': 274.174, 'eval_steps_per_second': 17.14, 'epoch': 2.49}
{'loss': 1.2247, 'grad_norm': 11.343757629394531, 'learning_rate': 3.3026396562308167e-06, 'epoch': 2.5}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6492007970809937, 'eval_runtime': 126.6555, 'eval_samples_per_second': 274.193, 'eval_steps_per_second': 17.141, 'epoch': 2.5}
{'loss': 1.3476, 'grad_norm': 17.670005798339844, 'learning_rate': 3.1798649478207493e-06, 'epoch': 2.52}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.648794412612915, 'eval_runtime': 127.3211, 'eval_samples_per_second': 272.759, 'eval_steps_per_second': 17.051, 'epoch': 2.52}
{'loss': 1.2441, 'grad_norm': 10.33586597442627, 'learning_rate': 3.0570902394106816e-06, 'epoch': 2.54}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.650249719619751, 'eval_runtime': 128.7222, 'eval_samples_per_second': 269.79, 'eval_steps_per_second': 16.866, 'epoch': 2.54}
{'loss': 1.3282, 'grad_norm': 15.424220085144043, 'learning_rate': 2.934315531000614e-06, 'epoch': 2.56}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6553305387496948, 'eval_runtime': 127.4053, 'eval_samples_per_second': 272.579, 'eval_steps_per_second': 17.04, 'epoch': 2.56}
{'loss': 1.2605, 'grad_norm': 14.33436393737793, 'learning_rate': 2.8115408225905465e-06, 'epoch': 2.58}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6548856496810913, 'eval_runtime': 128.2061, 'eval_samples_per_second': 270.876, 'eval_steps_per_second': 16.934, 'epoch': 2.58}
{'loss': 1.2976, 'grad_norm': 16.606101989746094, 'learning_rate': 2.6887661141804787e-06, 'epoch': 2.6}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6513653993606567, 'eval_runtime': 128.7752, 'eval_samples_per_second': 269.679, 'eval_steps_per_second': 16.859, 'epoch': 2.6}
{'loss': 1.2572, 'grad_norm': 15.757474899291992, 'learning_rate': 2.5659914057704118e-06, 'epoch': 2.62}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.652012825012207, 'eval_runtime': 127.9703, 'eval_samples_per_second': 271.375, 'eval_steps_per_second': 16.965, 'epoch': 2.62}
{'loss': 1.4063, 'grad_norm': 17.389781951904297, 'learning_rate': 2.443216697360344e-06, 'epoch': 2.63}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6506754159927368, 'eval_runtime': 128.318, 'eval_samples_per_second': 270.64, 'eval_steps_per_second': 16.919, 'epoch': 2.63}
{'loss': 1.4115, 'grad_norm': 15.386919021606445, 'learning_rate': 2.320441988950276e-06, 'epoch': 2.65}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6451449394226074, 'eval_runtime': 129.7671, 'eval_samples_per_second': 267.618, 'eval_steps_per_second': 16.73, 'epoch': 2.65}
{'loss': 1.3405, 'grad_norm': 17.086894989013672, 'learning_rate': 2.197667280540209e-06, 'epoch': 2.67}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6409125328063965, 'eval_runtime': 128.5262, 'eval_samples_per_second': 270.202, 'eval_steps_per_second': 16.891, 'epoch': 2.67}
{'loss': 1.2578, 'grad_norm': 18.194576263427734, 'learning_rate': 2.0748925721301415e-06, 'epoch': 2.69}


  0%|          | 0/2171 [00:00<?, ?it/s]

{'eval_loss': 1.6431468725204468, 'eval_runtime': 129.3176, 'eval_samples_per_second': 268.548, 'eval_steps_per_second': 16.788, 'epoch': 2.69}
{'loss': 1.2846, 'grad_norm': 15.103840827941895, 'learning_rate': 1.9521178637200737e-06, 'epoch': 2.71}


  0%|          | 0/2171 [00:00<?, ?it/s]

: 

In [None]:
import matplotlib.pyplot as plt

# Access the training log history
training_history = trainer.state.log_history

# Extract the training loss and evaluation loss
training_loss = [log['loss'] for log in training_history if 'loss' in log]
eval_loss = [log['eval_loss'] for log in training_history if 'eval_loss' in log]

# Extract the steps where metrics were logged
steps = [log['step'] for log in training_history if 'loss' in log]

# Plot the training and evaluation loss
plt.figure(figsize=(10, 6))
plt.plot(steps, training_loss, label='Training Loss', marker='o')
if eval_loss:  # Only plot if evaluation loss is logged
    plt.plot(steps, eval_loss[:len(steps)], label='Evaluation Loss', marker='x')

plt.title('Training and Evaluation Loss Over Steps')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.show()
