<h3>Import Necessary Libarary</h3>

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset, Dataset
import torch
from transformers import Trainer, TrainingArguments

<h3>Load Dataset</h3>

In [8]:
# Load the data from the file
file_path = './enhanced_fruits_dataset.txt'
with open(file_path, 'r') as file:
    data = file.readlines()

# Strip any leading/trailing whitespace characters
data = [line.strip() for line in data]

# Convert data to Hugging Face Dataset
dataset = Dataset.from_dict({'text': data})
dataset

Dataset({
    features: ['text'],
    num_rows: 66840
})

<h3>Tokenize Dataset</h3>

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokenized_output = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)
    tokenized_output['labels'] = tokenized_output['input_ids'].copy()  # Copy input_ids to labels
    return tokenized_output

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask','labels'])
tokenized_datasets

Map: 100%|██████████| 66840/66840 [00:21<00:00, 3140.92 examples/s]


Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 66840
})

<h3>Training dataset</h3>

In [10]:
# Split the dataset into train and test sets
train_size = int(0.9 * len(tokenized_datasets))
test_size = len(tokenized_datasets) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(tokenized_datasets, [train_size, test_size])

<h3>Load GPT-2 Model</h3>

In [11]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

<h3>Train Model</h3>

In [12]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,       # overwrite the content of the output directory
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    save_steps=5000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

trainer.train()


                                         
  0%|          | 0/37600 [08:46<?, ?it/s]             

{'loss': 9.6101, 'grad_norm': 206.12342834472656, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


                                         
  0%|          | 0/37600 [12:18<?, ?it/s]             

{'loss': 8.6267, 'grad_norm': 190.94349670410156, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


                                         
  0%|          | 0/37600 [15:48<?, ?it/s]             

{'loss': 6.5311, 'grad_norm': 168.7954864501953, 'learning_rate': 3e-06, 'epoch': 0.0}


                                         
  0%|          | 0/37600 [19:17<?, ?it/s]             

{'loss': 4.0098, 'grad_norm': 143.56442260742188, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}


                                         
  0%|          | 0/37600 [22:46<?, ?it/s]             

{'loss': 1.5725, 'grad_norm': 70.04573822021484, 'learning_rate': 5e-06, 'epoch': 0.01}


                                         
  0%|          | 0/37600 [26:15<?, ?it/s]             

{'loss': 0.4099, 'grad_norm': 4.841333866119385, 'learning_rate': 6e-06, 'epoch': 0.01}


                                         
  0%|          | 0/37600 [29:44<?, ?it/s]             

{'loss': 0.2202, 'grad_norm': 1.074854850769043, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}


                                         
  0%|          | 0/37600 [33:13<?, ?it/s]             

{'loss': 0.1785, 'grad_norm': 0.933830738067627, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


                                         
  0%|          | 0/37600 [36:42<?, ?it/s]             

{'loss': 0.154, 'grad_norm': 0.9514370560646057, 'learning_rate': 9e-06, 'epoch': 0.01}


                                         
  0%|          | 0/37600 [40:11<?, ?it/s]              

{'loss': 0.13, 'grad_norm': 0.8471193909645081, 'learning_rate': 1e-05, 'epoch': 0.01}


                                         
  0%|          | 0/37600 [43:39<?, ?it/s]              

{'loss': 0.1178, 'grad_norm': 0.7805205583572388, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


                                         
  0%|          | 0/37600 [47:08<?, ?it/s]              

{'loss': 0.1034, 'grad_norm': 0.71112060546875, 'learning_rate': 1.2e-05, 'epoch': 0.02}


                                         
  0%|          | 0/37600 [50:37<?, ?it/s]              

{'loss': 0.0888, 'grad_norm': 0.6452836394309998, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.02}


                                         
  0%|          | 0/37600 [54:06<?, ?it/s]              

{'loss': 0.0776, 'grad_norm': 0.80597323179245, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.02}


                                         
  0%|          | 0/37600 [57:35<?, ?it/s]              

{'loss': 0.0629, 'grad_norm': 0.6489502191543579, 'learning_rate': 1.5e-05, 'epoch': 0.02}


                                         
  0%|          | 0/37600 [1:01:05<?, ?it/s]            

{'loss': 0.0583, 'grad_norm': 0.5321118235588074, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.02}


                                           
  0%|          | 0/37600 [1:04:34<?, ?it/s]            

{'loss': 0.05, 'grad_norm': 0.617705225944519, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.02}


                                           
  0%|          | 0/37600 [1:08:02<?, ?it/s]              

{'loss': 0.0454, 'grad_norm': 0.5716949701309204, 'learning_rate': 1.8e-05, 'epoch': 0.02}


                                           
  0%|          | 0/37600 [1:11:31<?, ?it/s]              

{'loss': 0.0382, 'grad_norm': 0.478259414434433, 'learning_rate': 1.9e-05, 'epoch': 0.03}


                                           
  0%|          | 0/37600 [1:15:00<?, ?it/s]              

{'loss': 0.0343, 'grad_norm': 0.43901583552360535, 'learning_rate': 2e-05, 'epoch': 0.03}


                                           
  0%|          | 0/37600 [1:18:28<?, ?it/s]              

{'loss': 0.0323, 'grad_norm': 0.39512884616851807, 'learning_rate': 2.1e-05, 'epoch': 0.03}


                                           
  0%|          | 0/37600 [1:21:57<?, ?it/s]              

{'loss': 0.0279, 'grad_norm': 0.48113104701042175, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.03}


                                           
  0%|          | 0/37600 [1:25:26<?, ?it/s]              

{'loss': 0.0232, 'grad_norm': 0.3276899456977844, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.03}


                                           
  0%|          | 0/37600 [1:28:55<?, ?it/s]              

{'loss': 0.0241, 'grad_norm': 0.38810330629348755, 'learning_rate': 2.4e-05, 'epoch': 0.03}


                                           
  0%|          | 0/37600 [1:32:24<?, ?it/s]              

{'loss': 0.0207, 'grad_norm': 1.1391448974609375, 'learning_rate': 2.5e-05, 'epoch': 0.03}


                                           
  0%|          | 0/37600 [1:35:53<?, ?it/s]              

{'loss': 0.0203, 'grad_norm': 0.3417981266975403, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.03}


                                           
  0%|          | 0/37600 [1:39:21<?, ?it/s]              

{'loss': 0.019, 'grad_norm': 0.31653299927711487, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.04}


                                           
  0%|          | 0/37600 [1:42:50<?, ?it/s]              

{'loss': 0.0219, 'grad_norm': 0.2992197871208191, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.04}


                                           
  0%|          | 0/37600 [1:46:19<?, ?it/s]              

{'loss': 0.0166, 'grad_norm': 0.36533743143081665, 'learning_rate': 2.9e-05, 'epoch': 0.04}


                                           
  0%|          | 0/37600 [1:49:48<?, ?it/s]              

{'loss': 0.0162, 'grad_norm': 0.27695903182029724, 'learning_rate': 3e-05, 'epoch': 0.04}


                                           
  0%|          | 0/37600 [1:53:17<?, ?it/s]              

{'loss': 0.0157, 'grad_norm': 0.2932754158973694, 'learning_rate': 3.1e-05, 'epoch': 0.04}


                                           
  0%|          | 0/37600 [1:56:45<?, ?it/s]              

{'loss': 0.0146, 'grad_norm': 0.3958445191383362, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.04}


                                           
  0%|          | 0/37600 [2:00:15<?, ?it/s]              

{'loss': 0.015, 'grad_norm': 0.27946949005126953, 'learning_rate': 3.3e-05, 'epoch': 0.04}


                                           
  0%|          | 0/37600 [2:03:45<?, ?it/s]              

{'loss': 0.0153, 'grad_norm': 0.3047943413257599, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.05}


                                           
  0%|          | 0/37600 [2:07:14<?, ?it/s]              

{'loss': 0.0129, 'grad_norm': 0.23802140355110168, 'learning_rate': 3.5e-05, 'epoch': 0.05}


                                           
  0%|          | 0/37600 [2:10:43<?, ?it/s]              

{'loss': 0.0125, 'grad_norm': 0.2639912962913513, 'learning_rate': 3.6e-05, 'epoch': 0.05}


                                           
  0%|          | 0/37600 [2:14:12<?, ?it/s]              

{'loss': 0.0114, 'grad_norm': 0.2409505844116211, 'learning_rate': 3.7e-05, 'epoch': 0.05}


                                           
  0%|          | 0/37600 [2:17:40<?, ?it/s]              

{'loss': 0.0124, 'grad_norm': 0.26406624913215637, 'learning_rate': 3.8e-05, 'epoch': 0.05}


                                           
  0%|          | 0/37600 [2:21:10<?, ?it/s]              

{'loss': 0.013, 'grad_norm': 0.28359928727149963, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.05}


                                           
  0%|          | 0/37600 [2:24:39<?, ?it/s]              

{'loss': 0.0142, 'grad_norm': 0.21680131554603577, 'learning_rate': 4e-05, 'epoch': 0.05}


                                           
  0%|          | 0/37600 [2:28:08<?, ?it/s]              

{'loss': 0.0115, 'grad_norm': 0.24325032532215118, 'learning_rate': 4.1e-05, 'epoch': 0.05}


                                           
  0%|          | 0/37600 [2:31:37<?, ?it/s]              

{'loss': 0.0124, 'grad_norm': 0.26270654797554016, 'learning_rate': 4.2e-05, 'epoch': 0.06}


                                           
  0%|          | 0/37600 [2:35:06<?, ?it/s]              

{'loss': 0.0126, 'grad_norm': 0.330718994140625, 'learning_rate': 4.3e-05, 'epoch': 0.06}


                                           
  0%|          | 0/37600 [2:38:35<?, ?it/s]              

{'loss': 0.0128, 'grad_norm': 0.25506702065467834, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.06}


                                           
  0%|          | 0/37600 [2:42:04<?, ?it/s]              

{'loss': 0.0139, 'grad_norm': 0.262535035610199, 'learning_rate': 4.5e-05, 'epoch': 0.06}


                                           
  0%|          | 0/37600 [2:45:33<?, ?it/s]              

{'loss': 0.011, 'grad_norm': 0.33342695236206055, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.06}


                                           
  0%|          | 0/37600 [2:49:01<?, ?it/s]              

{'loss': 0.0121, 'grad_norm': 0.24772599339485168, 'learning_rate': 4.7e-05, 'epoch': 0.06}


                                           
  0%|          | 0/37600 [2:52:30<?, ?it/s]              

{'loss': 0.0109, 'grad_norm': 0.24833796918392181, 'learning_rate': 4.8e-05, 'epoch': 0.06}


                                           
  0%|          | 0/37600 [2:55:59<?, ?it/s]              

{'loss': 0.0129, 'grad_norm': 0.18661269545555115, 'learning_rate': 4.9e-05, 'epoch': 0.07}


                                           
  0%|          | 0/37600 [2:59:29<?, ?it/s]              

{'loss': 0.012, 'grad_norm': 0.3278915584087372, 'learning_rate': 5e-05, 'epoch': 0.07}


                                           
  0%|          | 0/37600 [3:02:58<?, ?it/s]              

{'loss': 0.0114, 'grad_norm': 0.2717214822769165, 'learning_rate': 4.9986522911051216e-05, 'epoch': 0.07}


                                           
  0%|          | 0/37600 [3:06:27<?, ?it/s]              

{'loss': 0.0112, 'grad_norm': 0.26836463809013367, 'learning_rate': 4.9973045822102424e-05, 'epoch': 0.07}


                                           
  0%|          | 0/37600 [3:09:56<?, ?it/s]              

{'loss': 0.0118, 'grad_norm': 0.28085100650787354, 'learning_rate': 4.995956873315364e-05, 'epoch': 0.07}


                                           
  0%|          | 0/37600 [3:13:25<?, ?it/s]              

{'loss': 0.0111, 'grad_norm': 0.207986518740654, 'learning_rate': 4.994609164420486e-05, 'epoch': 0.07}


                                           
  0%|          | 0/37600 [3:16:54<?, ?it/s]              

{'loss': 0.0121, 'grad_norm': 0.30539411306381226, 'learning_rate': 4.9932614555256066e-05, 'epoch': 0.07}


                                           
  0%|          | 0/37600 [3:20:23<?, ?it/s]              

{'loss': 0.0119, 'grad_norm': 0.23660589754581451, 'learning_rate': 4.991913746630728e-05, 'epoch': 0.07}


                                           
  0%|          | 0/37600 [3:23:52<?, ?it/s]              

{'loss': 0.0127, 'grad_norm': 0.3283609449863434, 'learning_rate': 4.9905660377358493e-05, 'epoch': 0.08}


                                           
  0%|          | 0/37600 [3:27:20<?, ?it/s]              

{'loss': 0.0108, 'grad_norm': 0.24148905277252197, 'learning_rate': 4.989218328840971e-05, 'epoch': 0.08}


                                           
  0%|          | 0/37600 [3:30:50<?, ?it/s]              

{'loss': 0.0102, 'grad_norm': 0.21266452968120575, 'learning_rate': 4.9878706199460915e-05, 'epoch': 0.08}


                                           
  0%|          | 0/37600 [3:34:18<?, ?it/s]              

{'loss': 0.0113, 'grad_norm': 0.32447129487991333, 'learning_rate': 4.986522911051213e-05, 'epoch': 0.08}


                                           
  0%|          | 0/37600 [3:37:47<?, ?it/s]              

{'loss': 0.0112, 'grad_norm': 0.19175350666046143, 'learning_rate': 4.985175202156335e-05, 'epoch': 0.08}


                                           
  0%|          | 0/37600 [3:41:16<?, ?it/s]              

{'loss': 0.0124, 'grad_norm': 0.24357996881008148, 'learning_rate': 4.9838274932614557e-05, 'epoch': 0.08}


                                           
  0%|          | 0/37600 [3:44:45<?, ?it/s]              

{'loss': 0.01, 'grad_norm': 0.27341368794441223, 'learning_rate': 4.982479784366577e-05, 'epoch': 0.08}


                                           
  0%|          | 0/37600 [3:48:14<?, ?it/s]              

{'loss': 0.0099, 'grad_norm': 0.2637673020362854, 'learning_rate': 4.9811320754716985e-05, 'epoch': 0.09}


                                           
  0%|          | 0/37600 [3:51:43<?, ?it/s]              

{'loss': 0.0112, 'grad_norm': 0.24688859283924103, 'learning_rate': 4.97978436657682e-05, 'epoch': 0.09}


                                           
  0%|          | 0/37600 [3:55:11<?, ?it/s]              

{'loss': 0.0119, 'grad_norm': 0.24887464940547943, 'learning_rate': 4.9784366576819406e-05, 'epoch': 0.09}


                                           
  0%|          | 0/37600 [3:58:41<?, ?it/s]              

{'loss': 0.0114, 'grad_norm': 0.2795853316783905, 'learning_rate': 4.977088948787062e-05, 'epoch': 0.09}


                                           
  0%|          | 0/37600 [4:02:11<?, ?it/s]              

{'loss': 0.0103, 'grad_norm': 0.3439836800098419, 'learning_rate': 4.975741239892184e-05, 'epoch': 0.09}


                                           
  0%|          | 0/37600 [4:05:40<?, ?it/s]              

{'loss': 0.0105, 'grad_norm': 0.21854571998119354, 'learning_rate': 4.974393530997305e-05, 'epoch': 0.09}


                                           
  0%|          | 0/37600 [4:09:09<?, ?it/s]              

{'loss': 0.0111, 'grad_norm': 0.26093438267707825, 'learning_rate': 4.973045822102426e-05, 'epoch': 0.09}


                                           
  0%|          | 0/37600 [4:12:38<?, ?it/s]              

{'loss': 0.0108, 'grad_norm': 0.22584618628025055, 'learning_rate': 4.9716981132075476e-05, 'epoch': 0.09}


                                           
  0%|          | 0/37600 [4:16:06<?, ?it/s]              

{'loss': 0.0109, 'grad_norm': 0.20570620894432068, 'learning_rate': 4.970350404312669e-05, 'epoch': 0.1}


                                           
  0%|          | 0/37600 [4:19:35<?, ?it/s]              

{'loss': 0.0095, 'grad_norm': 0.26018649339675903, 'learning_rate': 4.96900269541779e-05, 'epoch': 0.1}


                                           
  0%|          | 0/37600 [4:23:05<?, ?it/s]              

{'loss': 0.0099, 'grad_norm': 0.23684121668338776, 'learning_rate': 4.967654986522911e-05, 'epoch': 0.1}


                                           
  0%|          | 0/37600 [4:26:33<?, ?it/s]              

{'loss': 0.0103, 'grad_norm': 0.18666291236877441, 'learning_rate': 4.966307277628033e-05, 'epoch': 0.1}


                                           
  0%|          | 0/37600 [4:30:03<?, ?it/s]              

{'loss': 0.0097, 'grad_norm': 0.29719051718711853, 'learning_rate': 4.964959568733154e-05, 'epoch': 0.1}


                                           
  0%|          | 0/37600 [4:33:32<?, ?it/s]              

{'loss': 0.0103, 'grad_norm': 0.2901934087276459, 'learning_rate': 4.963611859838275e-05, 'epoch': 0.1}


                                           
  0%|          | 0/37600 [4:37:01<?, ?it/s]              

{'loss': 0.0105, 'grad_norm': 0.3216160237789154, 'learning_rate': 4.962264150943397e-05, 'epoch': 0.1}


                                           
  0%|          | 0/37600 [4:40:29<?, ?it/s]              

{'loss': 0.0104, 'grad_norm': 0.27402523159980774, 'learning_rate': 4.9609164420485174e-05, 'epoch': 0.11}


                                           
  0%|          | 0/37600 [4:43:58<?, ?it/s]              

{'loss': 0.0112, 'grad_norm': 0.22960643470287323, 'learning_rate': 4.959568733153639e-05, 'epoch': 0.11}


                                           
  0%|          | 0/37600 [4:47:28<?, ?it/s]              

{'loss': 0.0103, 'grad_norm': 0.23601935803890228, 'learning_rate': 4.95822102425876e-05, 'epoch': 0.11}


                                           
  0%|          | 0/37600 [4:50:57<?, ?it/s]              

{'loss': 0.0098, 'grad_norm': 0.22034308314323425, 'learning_rate': 4.9568733153638816e-05, 'epoch': 0.11}


                                           
  0%|          | 0/37600 [4:54:26<?, ?it/s]              

{'loss': 0.0104, 'grad_norm': 0.25359368324279785, 'learning_rate': 4.955525606469003e-05, 'epoch': 0.11}


                                           
  0%|          | 0/37600 [4:57:56<?, ?it/s]              

{'loss': 0.0104, 'grad_norm': 0.2611580193042755, 'learning_rate': 4.9541778975741244e-05, 'epoch': 0.11}


                                           
  0%|          | 0/37600 [5:01:26<?, ?it/s]              

{'loss': 0.0099, 'grad_norm': 0.24236297607421875, 'learning_rate': 4.952830188679246e-05, 'epoch': 0.11}


                                           
  0%|          | 0/37600 [5:04:55<?, ?it/s]              

{'loss': 0.0091, 'grad_norm': 0.2614918053150177, 'learning_rate': 4.9514824797843665e-05, 'epoch': 0.11}


                                           
  0%|          | 0/37600 [5:08:24<?, ?it/s]              

{'loss': 0.0109, 'grad_norm': 0.2240961492061615, 'learning_rate': 4.950134770889488e-05, 'epoch': 0.12}


                                           
  0%|          | 0/37600 [5:11:53<?, ?it/s]              

{'loss': 0.0095, 'grad_norm': 0.26368582248687744, 'learning_rate': 4.948787061994609e-05, 'epoch': 0.12}


                                           
  0%|          | 0/37600 [5:15:23<?, ?it/s]              

{'loss': 0.0101, 'grad_norm': 0.20671872794628143, 'learning_rate': 4.947439353099731e-05, 'epoch': 0.12}


                                           
  0%|          | 0/37600 [5:18:52<?, ?it/s]              

{'loss': 0.0096, 'grad_norm': 0.24008356034755707, 'learning_rate': 4.946091644204852e-05, 'epoch': 0.12}


                                           
  0%|          | 0/37600 [5:22:21<?, ?it/s]              

{'loss': 0.0106, 'grad_norm': 0.20262184739112854, 'learning_rate': 4.9447439353099735e-05, 'epoch': 0.12}


                                           
  0%|          | 0/37600 [5:25:50<?, ?it/s]              

{'loss': 0.0097, 'grad_norm': 0.2255057990550995, 'learning_rate': 4.943396226415095e-05, 'epoch': 0.12}


                                           
  0%|          | 0/37600 [5:29:20<?, ?it/s]              

{'loss': 0.0095, 'grad_norm': 0.20853090286254883, 'learning_rate': 4.9420485175202156e-05, 'epoch': 0.12}


                                           
  0%|          | 0/37600 [5:32:48<?, ?it/s]              

{'loss': 0.0097, 'grad_norm': 0.21973817050457, 'learning_rate': 4.940700808625337e-05, 'epoch': 0.12}


                                           
  0%|          | 0/37600 [5:36:17<?, ?it/s]              

{'loss': 0.0084, 'grad_norm': 0.16997967660427094, 'learning_rate': 4.9393530997304584e-05, 'epoch': 0.13}


                                           
  0%|          | 0/37600 [5:39:45<?, ?it/s]              

{'loss': 0.0114, 'grad_norm': 0.24174723029136658, 'learning_rate': 4.93800539083558e-05, 'epoch': 0.13}


                                           
  0%|          | 0/37600 [5:43:14<?, ?it/s]              

{'loss': 0.0091, 'grad_norm': 0.20367421209812164, 'learning_rate': 4.936657681940701e-05, 'epoch': 0.13}


                                           
  0%|          | 0/37600 [5:46:43<?, ?it/s]              

{'loss': 0.0109, 'grad_norm': 0.22865040600299835, 'learning_rate': 4.9353099730458226e-05, 'epoch': 0.13}


                                           
  0%|          | 0/37600 [5:50:12<?, ?it/s]              

{'loss': 0.0098, 'grad_norm': 0.22236189246177673, 'learning_rate': 4.933962264150943e-05, 'epoch': 0.13}


                                           
  0%|          | 0/37600 [5:53:41<?, ?it/s]               

{'loss': 0.0103, 'grad_norm': 0.2117130607366562, 'learning_rate': 4.932614555256065e-05, 'epoch': 0.13}


                                           
  0%|          | 0/37600 [5:57:09<?, ?it/s]               

{'loss': 0.0096, 'grad_norm': 0.26330792903900146, 'learning_rate': 4.931266846361186e-05, 'epoch': 0.13}


                                           
  0%|          | 0/37600 [6:00:40<?, ?it/s]               

{'loss': 0.0094, 'grad_norm': 0.30332258343696594, 'learning_rate': 4.9299191374663075e-05, 'epoch': 0.14}


                                           
  0%|          | 0/37600 [6:04:09<?, ?it/s]               

{'loss': 0.0102, 'grad_norm': 0.257124125957489, 'learning_rate': 4.928571428571429e-05, 'epoch': 0.14}


                                           
  0%|          | 0/37600 [6:07:38<?, ?it/s]               

{'loss': 0.0098, 'grad_norm': 0.2480974942445755, 'learning_rate': 4.92722371967655e-05, 'epoch': 0.14}


                                           
  0%|          | 0/37600 [6:11:07<?, ?it/s]               

{'loss': 0.0092, 'grad_norm': 0.18540263175964355, 'learning_rate': 4.925876010781672e-05, 'epoch': 0.14}


                                           
  0%|          | 0/37600 [6:14:38<?, ?it/s]               

{'loss': 0.0087, 'grad_norm': 0.16677720844745636, 'learning_rate': 4.9245283018867924e-05, 'epoch': 0.14}


                                           
  0%|          | 0/37600 [6:18:08<?, ?it/s]               

{'loss': 0.01, 'grad_norm': 0.16426518559455872, 'learning_rate': 4.923180592991914e-05, 'epoch': 0.14}


                                           
  0%|          | 0/37600 [6:21:37<?, ?it/s]               

{'loss': 0.0104, 'grad_norm': 0.25152143836021423, 'learning_rate': 4.921832884097035e-05, 'epoch': 0.14}


                                           
  0%|          | 0/37600 [6:25:06<?, ?it/s]               

{'loss': 0.01, 'grad_norm': 0.18300050497055054, 'learning_rate': 4.9204851752021566e-05, 'epoch': 0.14}


                                           
  0%|          | 0/37600 [6:28:36<?, ?it/s]               

{'loss': 0.0091, 'grad_norm': 0.2652618885040283, 'learning_rate': 4.919137466307278e-05, 'epoch': 0.15}


                                           
  0%|          | 0/37600 [6:32:05<?, ?it/s]               

{'loss': 0.0096, 'grad_norm': 0.33998674154281616, 'learning_rate': 4.9177897574123994e-05, 'epoch': 0.15}


                                           
  0%|          | 0/37600 [6:35:34<?, ?it/s]               

{'loss': 0.0083, 'grad_norm': 0.19014014303684235, 'learning_rate': 4.916442048517521e-05, 'epoch': 0.15}


                                           
  0%|          | 0/37600 [6:39:03<?, ?it/s]               

{'loss': 0.0095, 'grad_norm': 0.20550304651260376, 'learning_rate': 4.9150943396226415e-05, 'epoch': 0.15}


                                           
  0%|          | 0/37600 [6:42:34<?, ?it/s]               

{'loss': 0.0098, 'grad_norm': 0.18388409912586212, 'learning_rate': 4.913746630727763e-05, 'epoch': 0.15}


                                           
  0%|          | 0/37600 [6:46:04<?, ?it/s]               

{'loss': 0.0093, 'grad_norm': 0.20225279033184052, 'learning_rate': 4.912398921832884e-05, 'epoch': 0.15}


                                           
  0%|          | 0/37600 [6:49:36<?, ?it/s]               

{'loss': 0.0088, 'grad_norm': 0.2754502594470978, 'learning_rate': 4.911051212938006e-05, 'epoch': 0.15}


                                           
  0%|          | 0/37600 [6:53:10<?, ?it/s]               

{'loss': 0.0092, 'grad_norm': 0.20357465744018555, 'learning_rate': 4.909703504043127e-05, 'epoch': 0.16}


                                           
  0%|          | 0/37600 [6:56:42<?, ?it/s]               

{'loss': 0.0093, 'grad_norm': 0.17595022916793823, 'learning_rate': 4.9083557951482485e-05, 'epoch': 0.16}


                                           
  0%|          | 0/37600 [7:00:15<?, ?it/s]               

{'loss': 0.0103, 'grad_norm': 0.17028377950191498, 'learning_rate': 4.90700808625337e-05, 'epoch': 0.16}


                                           
  0%|          | 0/37600 [7:03:45<?, ?it/s]               

{'loss': 0.0083, 'grad_norm': 0.16573300957679749, 'learning_rate': 4.9056603773584906e-05, 'epoch': 0.16}


                                           
  0%|          | 0/37600 [7:07:14<?, ?it/s]               

{'loss': 0.0089, 'grad_norm': 0.28486138582229614, 'learning_rate': 4.904312668463612e-05, 'epoch': 0.16}


                                           
  0%|          | 0/37600 [7:10:42<?, ?it/s]               

{'loss': 0.0087, 'grad_norm': 0.20575107634067535, 'learning_rate': 4.9029649595687334e-05, 'epoch': 0.16}


                                           
  0%|          | 0/37600 [7:14:11<?, ?it/s]               

{'loss': 0.0083, 'grad_norm': 0.24734240770339966, 'learning_rate': 4.901617250673854e-05, 'epoch': 0.16}


                                           
  0%|          | 0/37600 [7:17:39<?, ?it/s]               

{'loss': 0.0079, 'grad_norm': 0.3169175088405609, 'learning_rate': 4.900269541778976e-05, 'epoch': 0.16}


                                           
  0%|          | 0/37600 [7:21:09<?, ?it/s]               

{'loss': 0.0095, 'grad_norm': 0.34236887097358704, 'learning_rate': 4.8989218328840976e-05, 'epoch': 0.17}


                                           
  0%|          | 0/37600 [7:24:38<?, ?it/s]               

{'loss': 0.0092, 'grad_norm': 0.2495356947183609, 'learning_rate': 4.8975741239892184e-05, 'epoch': 0.17}


                                           
  0%|          | 0/37600 [7:28:08<?, ?it/s]               

{'loss': 0.0092, 'grad_norm': 0.23314550518989563, 'learning_rate': 4.89622641509434e-05, 'epoch': 0.17}


                                           
  0%|          | 0/37600 [7:31:37<?, ?it/s]               

{'loss': 0.0093, 'grad_norm': 0.23455558717250824, 'learning_rate': 4.894878706199461e-05, 'epoch': 0.17}


                                           
  0%|          | 0/37600 [7:35:05<?, ?it/s]               

{'loss': 0.0083, 'grad_norm': 0.20581601560115814, 'learning_rate': 4.8935309973045826e-05, 'epoch': 0.17}


                                           
  0%|          | 0/37600 [7:38:35<?, ?it/s]               

{'loss': 0.0087, 'grad_norm': 0.24643658101558685, 'learning_rate': 4.892183288409703e-05, 'epoch': 0.17}


                                           
  0%|          | 0/37600 [7:42:03<?, ?it/s]               

{'loss': 0.0077, 'grad_norm': 0.17734822630882263, 'learning_rate': 4.8908355795148253e-05, 'epoch': 0.17}


                                           
  0%|          | 0/37600 [7:45:31<?, ?it/s]               

{'loss': 0.0078, 'grad_norm': 0.2258514016866684, 'learning_rate': 4.889487870619947e-05, 'epoch': 0.18}


                                           
  0%|          | 0/37600 [7:48:59<?, ?it/s]               

{'loss': 0.0087, 'grad_norm': 0.6870742440223694, 'learning_rate': 4.8881401617250675e-05, 'epoch': 0.18}


                                           
  0%|          | 0/37600 [7:52:27<?, ?it/s]               

{'loss': 0.0088, 'grad_norm': 0.21627871692180634, 'learning_rate': 4.886792452830189e-05, 'epoch': 0.18}


                                           
  0%|          | 0/37600 [7:55:54<?, ?it/s]               

{'loss': 0.0092, 'grad_norm': 0.21964409947395325, 'learning_rate': 4.88544474393531e-05, 'epoch': 0.18}


                                           
  0%|          | 0/37600 [7:59:23<?, ?it/s]               

{'loss': 0.0092, 'grad_norm': 0.201399028301239, 'learning_rate': 4.8840970350404317e-05, 'epoch': 0.18}


                                           
  0%|          | 0/37600 [8:02:51<?, ?it/s]               

{'loss': 0.0085, 'grad_norm': 0.21327070891857147, 'learning_rate': 4.8827493261455524e-05, 'epoch': 0.18}


                                           
  0%|          | 0/37600 [8:06:19<?, ?it/s]               

{'loss': 0.0086, 'grad_norm': 0.21303942799568176, 'learning_rate': 4.8814016172506745e-05, 'epoch': 0.18}


                                           
  0%|          | 0/37600 [8:09:48<?, ?it/s]               

{'loss': 0.0089, 'grad_norm': 0.2583727240562439, 'learning_rate': 4.880053908355796e-05, 'epoch': 0.18}


                                           
  0%|          | 0/37600 [8:13:16<?, ?it/s]               

{'loss': 0.0086, 'grad_norm': 0.2381521612405777, 'learning_rate': 4.8787061994609166e-05, 'epoch': 0.19}


                                           
  0%|          | 0/37600 [8:16:44<?, ?it/s]               

{'loss': 0.0083, 'grad_norm': 0.21443569660186768, 'learning_rate': 4.877358490566038e-05, 'epoch': 0.19}


                                           
  0%|          | 0/37600 [8:20:12<?, ?it/s]               

{'loss': 0.0083, 'grad_norm': 0.2650510370731354, 'learning_rate': 4.8760107816711594e-05, 'epoch': 0.19}


