In [1]:
import os
import sys
# Make the sibling ``../src`` directory importable from this notebook.
# abspath() resolves the relative path against the working directory at the
# moment this cell runs, so the entry stays valid after a later chdir.
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

In [2]:
%pwd

'a:\\git_hub_projects\\ml\\Text-Summarizer\\research'

In [3]:
# Step up one directory so the relative paths used below (config files,
# artifacts) are resolved from the parent of the notebook's start directory.
# NOTE(review): not idempotent - re-running this cell climbs one more level
# each time; restart the kernel before re-executing it.
os.chdir("../")

In [4]:
%pwd

'a:\\git_hub_projects\\ml\\Text-Summarizer'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the ``model_trainer`` section of the config file and the
    ``TrainingArguments`` section of the params file. ``frozen=True`` makes
    attribute assignment after construction raise an error. The annotations
    are not enforced at runtime, so values keep whatever type the YAML loader
    produced.
    """

    # Output directory: passed as ``output_dir`` to TrainingArguments and used
    # as the parent folder for the saved model and tokenizer.
    root_dir: Path
    # Location handed to ``datasets.load_from_disk``.
    data_path: Path
    # Identifier handed to ``from_pretrained`` for both tokenizer and model.
    # NOTE(review): annotated as Path, but presumably a hub model name or a
    # local directory string - confirm against config.yaml.
    model_ckpt: Path
    # The remaining fields are named after ``transformers.TrainingArguments``
    # keyword arguments and are filled from ``params.TrainingArguments``.
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project YAML files and builds per-stage configuration objects.

    The two files are parsed once in ``__init__`` and kept on ``self.config``
    and ``self.params``; both are accessed with attribute syntax below.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main config file
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Path of the hyperparameter file
                (defaults to ``PARAMS_FILE_PATH``).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the configuration for the model-training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: Paths taken from the ``model_trainer`` section
            of the config file and hyperparameters taken from the
            ``TrainingArguments`` section of the params file.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt = config.model_ckpt,
            num_train_epochs = params.num_train_epochs,
            warmup_steps = params.warmup_steps,
            per_device_train_batch_size = params.per_device_train_batch_size,
            weight_decay = params.weight_decay,
            logging_steps = params.logging_steps,
            evaluation_strategy = params.evaluation_strategy,
            # Must come from ``eval_steps``: copying ``evaluation_strategy``
            # here would store the strategy name where a step count belongs.
            eval_steps = params.eval_steps,
            save_steps = params.save_steps,
            gradient_accumulation_steps = params.gradient_accumulation_steps
        )

        return model_trainer_config

In [8]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2025-08-01 01:48:05,054: INFO: config: PyTorch version 2.4.1 available.]


In [9]:
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint on a dataset stored on disk and saves it."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the stage configuration for later use by ``train``."""
        self.config = config

    def train(self):
        """Run one fine-tuning pass and write the model and tokenizer to disk.

        Only ``model_ckpt``, ``data_path`` and ``root_dir`` are read from the
        configuration; every training hyperparameter is fixed in this method.
        The model is saved under ``<root_dir>/bart-samsum-model`` and the
        tokenizer under ``<root_dir>/tokenizer``.
        """
        cfg = self.config

        # Use the GPU when one is visible to torch, otherwise stay on the CPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)
        model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_ckpt)
        model = model.to(device)
        collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        # Dataset previously written to disk; its "train" and "validation"
        # splits are used below.
        dataset = load_from_disk(cfg.data_path)

        # Hyperparameters are fixed here and are not taken from the
        # configuration object.
        training_kwargs = {
            "output_dir": cfg.root_dir,
            "num_train_epochs": 1,
            "warmup_steps": 500,
            "per_device_train_batch_size": 1,
            "per_device_eval_batch_size": 1,
            "weight_decay": 0.01,
            "logging_steps": 10,
            "evaluation_strategy": 'steps',
            "eval_steps": 500,
            "save_steps": 1e6,
            "gradient_accumulation_steps": 16,
        }
        training_args = TrainingArguments(**training_kwargs)

        trainer = Trainer(
            model=model,
            args=training_args,
            tokenizer=tokenizer,
            data_collator=collator,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
        )
        trainer.train()

        # Persist the fine-tuned weights and the tokenizer in sibling folders.
        model_dir = os.path.join(cfg.root_dir, "bart-samsum-model")
        tokenizer_dir = os.path.join(cfg.root_dir, "tokenizer")
        model.save_pretrained(model_dir)
        tokenizer.save_pretrained(tokenizer_dir)

In [10]:
# Build the training-stage configuration, then run the trainer.
# Each object gets its own name so ``model_trainer_config`` keeps referring to
# the configuration rather than being rebound to the trainer instance.
# No try/except wrapper: a failure should surface with its original traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2025-08-01 01:48:05,372: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-08-01 01:48:05,387: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-01 01:48:05,388: INFO: common: created directory at: artifacts]
[2025-08-01 01:48:05,389: INFO: common: created directory at: artifacts/model_trainer]


  trainer = Trainer(model=model_bart, args=trainer_args,
  1%|          | 10/920 [02:26<3:37:30, 14.34s/it]

{'loss': 2.2268, 'grad_norm': 261.20556640625, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  2%|▏         | 20/920 [05:12<4:14:33, 16.97s/it]

{'loss': 1.9977, 'grad_norm': 173.16714477539062, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


  3%|▎         | 30/920 [07:57<4:11:47, 16.97s/it]

{'loss': 1.7606, 'grad_norm': 139.28480529785156, 'learning_rate': 3e-06, 'epoch': 0.03}


  4%|▍         | 40/920 [10:53<4:10:13, 17.06s/it]

{'loss': 1.6377, 'grad_norm': 128.56076049804688, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}


  5%|▌         | 50/920 [13:37<3:42:29, 15.34s/it]

{'loss': 1.5382, 'grad_norm': 136.98675537109375, 'learning_rate': 5e-06, 'epoch': 0.05}


  7%|▋         | 60/920 [16:28<4:08:46, 17.36s/it]

{'loss': 1.5139, 'grad_norm': 119.01597595214844, 'learning_rate': 6e-06, 'epoch': 0.07}


  8%|▊         | 70/920 [19:08<3:55:40, 16.64s/it]

{'loss': 1.4886, 'grad_norm': 129.48977661132812, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.08}


  9%|▊         | 80/920 [21:45<3:31:56, 15.14s/it]

{'loss': 1.4454, 'grad_norm': 114.64241790771484, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.09}


 10%|▉         | 90/920 [24:21<3:35:53, 15.61s/it]

{'loss': 1.4531, 'grad_norm': 109.32832336425781, 'learning_rate': 9e-06, 'epoch': 0.1}


 11%|█         | 100/920 [26:57<3:36:42, 15.86s/it]

{'loss': 1.4218, 'grad_norm': 128.1920928955078, 'learning_rate': 1e-05, 'epoch': 0.11}


 12%|█▏        | 110/920 [29:43<3:33:30, 15.82s/it]

{'loss': 1.3683, 'grad_norm': 129.83497619628906, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.12}


 13%|█▎        | 120/920 [32:06<3:05:53, 13.94s/it]

{'loss': 1.3822, 'grad_norm': 142.91539001464844, 'learning_rate': 1.2e-05, 'epoch': 0.13}


 14%|█▍        | 130/920 [34:36<3:22:20, 15.37s/it]

{'loss': 1.3829, 'grad_norm': 112.71172332763672, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.14}


 15%|█▌        | 140/920 [37:16<3:23:57, 15.69s/it]

{'loss': 1.3857, 'grad_norm': 118.32789611816406, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.15}


 16%|█▋        | 150/920 [39:53<3:20:19, 15.61s/it]

{'loss': 1.3901, 'grad_norm': 120.29092407226562, 'learning_rate': 1.5e-05, 'epoch': 0.16}


 17%|█▋        | 160/920 [42:36<3:33:54, 16.89s/it]

{'loss': 1.4088, 'grad_norm': 122.28169250488281, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.17}


 18%|█▊        | 170/920 [45:20<3:30:45, 16.86s/it]

{'loss': 1.3872, 'grad_norm': 97.52288055419922, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.18}


 20%|█▉        | 180/920 [47:54<3:17:33, 16.02s/it]

{'loss': 1.3398, 'grad_norm': 95.29590606689453, 'learning_rate': 1.8e-05, 'epoch': 0.2}


 21%|██        | 190/920 [50:41<3:27:02, 17.02s/it]

{'loss': 1.3317, 'grad_norm': 101.01409912109375, 'learning_rate': 1.9e-05, 'epoch': 0.21}


 22%|██▏       | 200/920 [53:21<3:11:47, 15.98s/it]

{'loss': 1.3785, 'grad_norm': 111.79354858398438, 'learning_rate': 2e-05, 'epoch': 0.22}


 23%|██▎       | 210/920 [56:06<3:04:46, 15.61s/it]

{'loss': 1.3433, 'grad_norm': 109.71098327636719, 'learning_rate': 2.1e-05, 'epoch': 0.23}


 24%|██▍       | 220/920 [58:47<3:05:42, 15.92s/it]

{'loss': 1.2965, 'grad_norm': 155.9686279296875, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.24}


 25%|██▌       | 230/920 [1:01:24<2:58:23, 15.51s/it]

{'loss': 1.3967, 'grad_norm': 105.34532928466797, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.25}


 26%|██▌       | 240/920 [1:03:51<2:43:08, 14.39s/it]

{'loss': 1.3223, 'grad_norm': 97.14237213134766, 'learning_rate': 2.4e-05, 'epoch': 0.26}


 27%|██▋       | 250/920 [1:06:28<2:54:20, 15.61s/it]

{'loss': 1.349, 'grad_norm': 112.81709289550781, 'learning_rate': 2.5e-05, 'epoch': 0.27}


 28%|██▊       | 260/920 [1:09:08<3:00:51, 16.44s/it]

{'loss': 1.4043, 'grad_norm': 140.54881286621094, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.28}


 29%|██▉       | 270/920 [1:11:45<2:53:22, 16.00s/it]

{'loss': 1.3075, 'grad_norm': 97.93314361572266, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.29}


 30%|███       | 280/920 [1:14:24<2:54:02, 16.32s/it]

{'loss': 1.3629, 'grad_norm': 150.92481994628906, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.3}


 32%|███▏      | 290/920 [1:17:12<2:53:11, 16.49s/it]

{'loss': 1.4393, 'grad_norm': 101.70155334472656, 'learning_rate': 2.9e-05, 'epoch': 0.31}


 33%|███▎      | 300/920 [1:19:47<2:43:33, 15.83s/it]

{'loss': 1.304, 'grad_norm': 106.16366577148438, 'learning_rate': 3e-05, 'epoch': 0.33}


 34%|███▎      | 310/920 [1:22:31<2:48:32, 16.58s/it]

{'loss': 1.433, 'grad_norm': 112.41183471679688, 'learning_rate': 3.1e-05, 'epoch': 0.34}


 35%|███▍      | 320/920 [1:25:08<2:41:29, 16.15s/it]

{'loss': 1.4812, 'grad_norm': 120.35001373291016, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.35}


 36%|███▌      | 330/920 [1:27:51<2:34:22, 15.70s/it]

{'loss': 1.4322, 'grad_norm': 116.94062805175781, 'learning_rate': 3.3e-05, 'epoch': 0.36}


 37%|███▋      | 340/920 [1:30:33<2:35:40, 16.10s/it]

{'loss': 1.3928, 'grad_norm': 119.70249938964844, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.37}


 38%|███▊      | 350/920 [1:33:21<2:33:31, 16.16s/it]

{'loss': 1.4181, 'grad_norm': 102.81529998779297, 'learning_rate': 3.5e-05, 'epoch': 0.38}


 39%|███▉      | 360/920 [1:36:02<2:27:32, 15.81s/it]

{'loss': 1.3592, 'grad_norm': 107.06684875488281, 'learning_rate': 3.6e-05, 'epoch': 0.39}


 40%|████      | 370/920 [1:38:39<2:18:49, 15.14s/it]

{'loss': 1.3724, 'grad_norm': 120.274658203125, 'learning_rate': 3.7e-05, 'epoch': 0.4}


 41%|████▏     | 380/920 [1:41:20<2:24:20, 16.04s/it]

{'loss': 1.3657, 'grad_norm': 100.95391845703125, 'learning_rate': 3.8e-05, 'epoch': 0.41}


 42%|████▏     | 390/920 [1:44:11<2:25:10, 16.44s/it]

{'loss': 1.4025, 'grad_norm': 99.29306030273438, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.42}


 43%|████▎     | 400/920 [1:46:55<2:15:40, 15.65s/it]

{'loss': 1.3851, 'grad_norm': 131.59877014160156, 'learning_rate': 4e-05, 'epoch': 0.43}


 45%|████▍     | 410/920 [1:49:38<2:21:11, 16.61s/it]

{'loss': 1.4228, 'grad_norm': 93.05517578125, 'learning_rate': 4.1e-05, 'epoch': 0.45}


 46%|████▌     | 420/920 [1:52:23<2:16:34, 16.39s/it]

{'loss': 1.3486, 'grad_norm': 149.3837127685547, 'learning_rate': 4.2e-05, 'epoch': 0.46}


 47%|████▋     | 430/920 [1:55:03<2:10:39, 16.00s/it]

{'loss': 1.4747, 'grad_norm': 130.4295654296875, 'learning_rate': 4.3e-05, 'epoch': 0.47}


 48%|████▊     | 440/920 [1:57:49<2:08:53, 16.11s/it]

{'loss': 1.3892, 'grad_norm': 130.28440856933594, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.48}


 49%|████▉     | 450/920 [2:00:22<1:58:02, 15.07s/it]

{'loss': 1.3601, 'grad_norm': 119.72161102294922, 'learning_rate': 4.5e-05, 'epoch': 0.49}


 50%|█████     | 460/920 [2:02:58<2:02:56, 16.03s/it]

{'loss': 1.4037, 'grad_norm': 103.25820922851562, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.5}


 51%|█████     | 470/920 [2:05:36<1:58:13, 15.76s/it]

{'loss': 1.4172, 'grad_norm': 112.58702087402344, 'learning_rate': 4.7e-05, 'epoch': 0.51}


 52%|█████▏    | 480/920 [2:08:24<2:00:07, 16.38s/it]

{'loss': 1.3716, 'grad_norm': 155.71847534179688, 'learning_rate': 4.8e-05, 'epoch': 0.52}


 53%|█████▎    | 490/920 [2:11:07<2:02:28, 17.09s/it]

{'loss': 1.3623, 'grad_norm': 120.30223846435547, 'learning_rate': 4.9e-05, 'epoch': 0.53}


 54%|█████▍    | 500/920 [2:13:49<1:51:50, 15.98s/it]

{'loss': 1.3982, 'grad_norm': 85.36396026611328, 'learning_rate': 5e-05, 'epoch': 0.54}


                                                     
 54%|█████▍    | 500/920 [2:17:39<1:51:50, 15.98s/it]

{'eval_loss': 1.4576992988586426, 'eval_runtime': 230.6254, 'eval_samples_per_second': 3.547, 'eval_steps_per_second': 3.547, 'epoch': 0.54}


 55%|█████▌    | 510/920 [2:20:27<2:17:48, 20.17s/it]

{'loss': 1.4237, 'grad_norm': 129.21652221679688, 'learning_rate': 4.880952380952381e-05, 'epoch': 0.55}


 57%|█████▋    | 520/920 [2:23:05<1:50:47, 16.62s/it]

{'loss': 1.3863, 'grad_norm': 91.52373504638672, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.56}


 58%|█████▊    | 530/920 [2:25:47<1:39:53, 15.37s/it]

{'loss': 1.417, 'grad_norm': 109.88860321044922, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.58}


 59%|█████▊    | 540/920 [2:28:14<1:32:23, 14.59s/it]

{'loss': 1.3338, 'grad_norm': 107.07011413574219, 'learning_rate': 4.523809523809524e-05, 'epoch': 0.59}


 60%|█████▉    | 550/920 [2:30:41<1:32:34, 15.01s/it]

{'loss': 1.4441, 'grad_norm': 110.66477966308594, 'learning_rate': 4.404761904761905e-05, 'epoch': 0.6}


 61%|██████    | 560/920 [2:33:05<1:26:25, 14.41s/it]

{'loss': 1.4602, 'grad_norm': 110.6759262084961, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.61}


 62%|██████▏   | 570/920 [2:35:32<1:26:16, 14.79s/it]

{'loss': 1.4599, 'grad_norm': 105.74591064453125, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.62}


 63%|██████▎   | 580/920 [2:38:08<1:30:54, 16.04s/it]

{'loss': 1.4121, 'grad_norm': 89.43244171142578, 'learning_rate': 4.047619047619048e-05, 'epoch': 0.63}


 64%|██████▍   | 590/920 [2:40:47<1:24:22, 15.34s/it]

{'loss': 1.3391, 'grad_norm': 100.53861999511719, 'learning_rate': 3.928571428571429e-05, 'epoch': 0.64}


 65%|██████▌   | 600/920 [2:43:29<1:31:23, 17.14s/it]

{'loss': 1.4404, 'grad_norm': 108.20314025878906, 'learning_rate': 3.809523809523809e-05, 'epoch': 0.65}


 66%|██████▋   | 610/920 [2:46:08<1:21:25, 15.76s/it]

{'loss': 1.3435, 'grad_norm': 97.06017303466797, 'learning_rate': 3.690476190476191e-05, 'epoch': 0.66}


 67%|██████▋   | 620/920 [2:48:50<1:19:49, 15.97s/it]

{'loss': 1.4499, 'grad_norm': 97.39641571044922, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.67}


 68%|██████▊   | 630/920 [2:51:39<1:19:36, 16.47s/it]

{'loss': 1.4036, 'grad_norm': 93.19788360595703, 'learning_rate': 3.4523809523809526e-05, 'epoch': 0.68}


 70%|██████▉   | 640/920 [2:54:16<1:12:53, 15.62s/it]

{'loss': 1.3615, 'grad_norm': 92.857177734375, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.7}


 71%|███████   | 650/920 [2:57:03<1:14:19, 16.52s/it]

{'loss': 1.3067, 'grad_norm': 90.82041931152344, 'learning_rate': 3.2142857142857144e-05, 'epoch': 0.71}


 72%|███████▏  | 660/920 [2:59:36<1:05:25, 15.10s/it]

{'loss': 1.3103, 'grad_norm': 84.65380096435547, 'learning_rate': 3.095238095238095e-05, 'epoch': 0.72}


 73%|███████▎  | 670/920 [3:02:27<1:09:45, 16.74s/it]

{'loss': 1.3727, 'grad_norm': 103.13590240478516, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.73}


 74%|███████▍  | 680/920 [3:05:15<1:05:44, 16.43s/it]

{'loss': 1.3278, 'grad_norm': 92.85095977783203, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.74}


 75%|███████▌  | 690/920 [3:07:38<51:10, 13.35s/it]  

{'loss': 1.2721, 'grad_norm': 79.08808898925781, 'learning_rate': 2.7380952380952383e-05, 'epoch': 0.75}


 76%|███████▌  | 700/920 [3:10:03<56:09, 15.32s/it]

{'loss': 1.3473, 'grad_norm': 113.37003326416016, 'learning_rate': 2.6190476190476192e-05, 'epoch': 0.76}


 77%|███████▋  | 710/920 [3:12:44<55:51, 15.96s/it]

{'loss': 1.3308, 'grad_norm': 89.27198028564453, 'learning_rate': 2.5e-05, 'epoch': 0.77}


 78%|███████▊  | 720/920 [3:15:26<56:16, 16.88s/it]

{'loss': 1.3025, 'grad_norm': 101.93942260742188, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.78}


 79%|███████▉  | 730/920 [3:18:05<50:27, 15.94s/it]

{'loss': 1.2959, 'grad_norm': 74.15125274658203, 'learning_rate': 2.261904761904762e-05, 'epoch': 0.79}


 80%|████████  | 740/920 [3:20:46<47:22, 15.79s/it]

{'loss': 1.3789, 'grad_norm': 127.92058563232422, 'learning_rate': 2.1428571428571428e-05, 'epoch': 0.8}


 82%|████████▏ | 750/920 [3:23:22<45:21, 16.01s/it]

{'loss': 1.3042, 'grad_norm': 85.34163665771484, 'learning_rate': 2.023809523809524e-05, 'epoch': 0.81}


 83%|████████▎ | 760/920 [3:26:01<41:39, 15.62s/it]

{'loss': 1.3301, 'grad_norm': 119.08715057373047, 'learning_rate': 1.9047619047619046e-05, 'epoch': 0.83}


 84%|████████▎ | 770/920 [3:28:40<40:42, 16.28s/it]

{'loss': 1.2594, 'grad_norm': 89.159423828125, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.84}


 85%|████████▍ | 780/920 [3:31:25<37:14, 15.96s/it]

{'loss': 1.3074, 'grad_norm': 108.12455749511719, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.85}


 86%|████████▌ | 790/920 [3:34:11<35:28, 16.37s/it]

{'loss': 1.267, 'grad_norm': 86.22676849365234, 'learning_rate': 1.5476190476190476e-05, 'epoch': 0.86}


 87%|████████▋ | 800/920 [3:36:51<32:30, 16.25s/it]

{'loss': 1.3339, 'grad_norm': 95.99288940429688, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.87}


 88%|████████▊ | 810/920 [3:39:31<28:50, 15.73s/it]

{'loss': 1.2543, 'grad_norm': 76.04400634765625, 'learning_rate': 1.3095238095238096e-05, 'epoch': 0.88}


 89%|████████▉ | 820/920 [3:42:09<27:20, 16.40s/it]

{'loss': 1.2757, 'grad_norm': 89.24215698242188, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.89}


 90%|█████████ | 830/920 [3:44:53<25:20, 16.89s/it]

{'loss': 1.3276, 'grad_norm': 210.90342712402344, 'learning_rate': 1.0714285714285714e-05, 'epoch': 0.9}


 91%|█████████▏| 840/920 [3:47:37<21:12, 15.91s/it]

{'loss': 1.2402, 'grad_norm': 69.00938415527344, 'learning_rate': 9.523809523809523e-06, 'epoch': 0.91}


 92%|█████████▏| 850/920 [3:50:15<17:48, 15.26s/it]

{'loss': 1.247, 'grad_norm': 94.08534240722656, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.92}


 93%|█████████▎| 860/920 [3:52:54<15:47, 15.80s/it]

{'loss': 1.2431, 'grad_norm': 97.56417083740234, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.93}


 95%|█████████▍| 870/920 [3:55:42<13:57, 16.75s/it]

{'loss': 1.3196, 'grad_norm': 79.95535278320312, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.94}


 96%|█████████▌| 880/920 [3:58:25<11:13, 16.85s/it]

{'loss': 1.2449, 'grad_norm': 74.3889389038086, 'learning_rate': 4.7619047619047615e-06, 'epoch': 0.96}


 97%|█████████▋| 890/920 [4:01:07<08:12, 16.41s/it]

{'loss': 1.2035, 'grad_norm': 105.87803649902344, 'learning_rate': 3.5714285714285714e-06, 'epoch': 0.97}


 98%|█████████▊| 900/920 [4:03:44<05:13, 15.66s/it]

{'loss': 1.3023, 'grad_norm': 96.20258331298828, 'learning_rate': 2.3809523809523808e-06, 'epoch': 0.98}


 99%|█████████▉| 910/920 [4:06:20<02:33, 15.34s/it]

{'loss': 1.2176, 'grad_norm': 138.64964294433594, 'learning_rate': 1.1904761904761904e-06, 'epoch': 0.99}




{'loss': 1.253, 'grad_norm': 107.64303588867188, 'learning_rate': 0.0, 'epoch': 1.0}


100%|██████████| 920/920 [4:09:08<00:00, 16.25s/it]


{'train_runtime': 14948.7719, 'train_samples_per_second': 0.985, 'train_steps_per_second': 0.062, 'train_loss': 1.3870256040407263, 'epoch': 1.0}
