In [1]:
import os

In [2]:
%pwd

'd:\\Git-Hub projects\\Text-Summarizer\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\Git-Hub projects\\Text-Summarizer'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    Instances are produced by ``ConfigurationManager.get_model_trainer_config``,
    which fills ``root_dir``, ``data_path`` and ``model_ckpt`` from the
    ``model_trainer`` section of the config YAML and the remaining fields from
    the ``TrainingArguments`` section of the params YAML.

    NOTE(review): ``ModelTrainer.train`` currently passes literal
    hyperparameter values to ``TrainingArguments``; of the fields below it only
    reads ``root_dir``, ``data_path`` and ``model_ckpt``.
    """

    # Output directory of the stage: used as the Trainer output_dir and as the
    # parent folder of the saved model and tokenizer.
    root_dir: Path
    # Location handed to datasets.load_from_disk.
    data_path: Path
    # Identifier handed to the from_pretrained loaders.
    # NOTE(review): annotated as Path, but the training log suggests this is a
    # Hub model id (e.g. "google/pegasus-cnn_dailymail") -- confirm whether str
    # would be the more accurate annotation.
    model_ckpt: Path
    # The fields below share their names with transformers.TrainingArguments
    # keyword arguments.
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    # Declared as float, unlike the other step counts; the literal used in
    # ModelTrainer.train is 1e6 -- presumably the YAML holds the same form.
    save_steps: float
    gradient_accumulation_steps: int

In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project YAML files and builds per-stage configuration objects.

    The constructor reads the main config file and the params file and makes
    sure the top-level artifacts directory exists.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Path of the hyperparameter YAML
                (defaults to ``PARAMS_FILE_PATH``).
        """
        # read_yaml results are accessed by attribute below
        # (e.g. ``self.config.artifacts_root``).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: paths taken from the ``model_trainer`` section
            of the config file and hyperparameters taken from the
            ``TrainingArguments`` section of the params file.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            # Fixed: this field used to be filled from
            # ``params.evaluation_strategy`` (a strategy string), which put a
            # str into an int field. Assumes the params file defines
            # ``TrainingArguments.eval_steps`` -- verify against params.yaml.
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

        return model_trainer_config

In [8]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2024-05-12 17:05:19,617: INFO: config: PyTorch version 2.3.0 available.]


In [9]:
class ModelTrainer:
    """Fine-tunes a pretrained seq2seq checkpoint on a dataset stored on disk."""

    def __init__(self, config: ModelTrainerConfig):
        # Stage settings; train() reads model_ckpt, data_path and root_dir.
        self.config = config

    def train(self):
        """Run one fine-tuning pass and persist the resulting model and tokenizer.

        Loads the tokenizer and model named by ``config.model_ckpt``, loads the
        dataset at ``config.data_path`` (its "train" and "validation" splits
        are used), trains with a Hugging Face ``Trainer``, then writes the
        model to ``<root_dir>/pegasus-samsum-model`` and the tokenizer to
        ``<root_dir>/tokenizer``.
        """
        cfg = self.config

        # Prefer the GPU when torch reports one, otherwise stay on the CPU.
        target_device = "cuda" if torch.cuda.is_available() else "cpu"

        tok = AutoTokenizer.from_pretrained(cfg.model_ckpt)
        seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_ckpt)
        seq2seq_model = seq2seq_model.to(target_device)
        collator = DataCollatorForSeq2Seq(tok, model=seq2seq_model)

        # Dataset previously saved with the datasets library.
        splits = load_from_disk(cfg.data_path)

        # NOTE(review): the hyperparameters are literals here; the matching
        # ModelTrainerConfig fields (num_train_epochs, warmup_steps, ...) are
        # not read by this method.
        hyperparams = dict(
            output_dir=cfg.root_dir,
            num_train_epochs=1,
            warmup_steps=500,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            weight_decay=0.01,
            logging_steps=10,
            evaluation_strategy='steps',
            eval_steps=500,
            save_steps=1e6,
            gradient_accumulation_steps=16,
        )
        training_args = TrainingArguments(**hyperparams)

        hf_trainer = Trainer(
            model=seq2seq_model,
            args=training_args,
            tokenizer=tok,
            data_collator=collator,
            train_dataset=splits["train"],
            eval_dataset=splits["validation"],
        )
        hf_trainer.train()

        # Persist the fine-tuned weights first, then the tokenizer, each in
        # its own folder under the stage's root directory.
        model_dir = os.path.join(cfg.root_dir, "pegasus-samsum-model")
        seq2seq_model.save_pretrained(model_dir)

        tokenizer_dir = os.path.join(cfg.root_dir, "tokenizer")
        tok.save_pretrained(tokenizer_dir)

In [10]:
# Run the model-training stage end to end: load the settings, build the
# trainer, and train. Exceptions propagate unchanged -- a wrapper that only
# catches and re-raises adds nothing but an extra traceback frame.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()

# The trainer gets its own name so `model_trainer_config` keeps referring to
# the settings object rather than being rebound to the ModelTrainer instance.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-05-12 17:05:20,513: INFO: common: yaml file: config\config.yaml loaded successfully]


[2024-05-12 17:05:20,530: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-12 17:05:20,533: INFO: common: created directory at: artifacts]
[2024-05-12 17:05:20,537: INFO: common: created directory at: artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 10/920 [17:23<25:34:40, 101.19s/it]

{'loss': 3.1995, 'grad_norm': 20.48751449584961, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  2%|▏         | 20/920 [35:05<27:19:53, 109.33s/it]

{'loss': 3.2262, 'grad_norm': 9.711017608642578, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


  3%|▎         | 30/920 [52:15<25:50:20, 104.52s/it]

{'loss': 2.9224, 'grad_norm': 11.304835319519043, 'learning_rate': 3e-06, 'epoch': 0.03}


  4%|▍         | 40/920 [1:09:23<23:51:26, 97.60s/it] 

{'loss': 2.853, 'grad_norm': 13.299054145812988, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}


  5%|▌         | 50/920 [1:26:56<24:11:25, 100.10s/it]

{'loss': 2.6911, 'grad_norm': 11.871971130371094, 'learning_rate': 5e-06, 'epoch': 0.05}


  7%|▋         | 60/920 [1:51:19<36:55:17, 154.56s/it]

{'loss': 2.729, 'grad_norm': 23.833356857299805, 'learning_rate': 6e-06, 'epoch': 0.07}


  8%|▊         | 70/920 [2:08:19<25:17:07, 107.09s/it]

{'loss': 2.6316, 'grad_norm': 10.161120414733887, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.08}


  9%|▊         | 80/920 [2:25:13<22:45:22, 97.53s/it] 

{'loss': 2.4347, 'grad_norm': 9.823800086975098, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.09}


 10%|▉         | 90/920 [2:41:39<22:45:14, 98.69s/it] 

{'loss': 2.5044, 'grad_norm': 7.497169494628906, 'learning_rate': 9e-06, 'epoch': 0.1}


 11%|█         | 100/920 [2:58:42<25:14:35, 110.82s/it]

{'loss': 2.467, 'grad_norm': 6.793668270111084, 'learning_rate': 1e-05, 'epoch': 0.11}


 12%|█▏        | 110/920 [3:20:36<28:37:24, 127.22s/it]

{'loss': 2.2325, 'grad_norm': 8.96264934539795, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.12}


 13%|█▎        | 120/920 [3:39:28<22:26:23, 100.98s/it]

{'loss': 2.1758, 'grad_norm': 6.3379011154174805, 'learning_rate': 1.2e-05, 'epoch': 0.13}


 14%|█▍        | 130/920 [3:57:01<23:28:31, 106.98s/it]

{'loss': 2.1534, 'grad_norm': 8.759004592895508, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.14}


 15%|█▌        | 140/920 [4:14:16<22:00:34, 101.58s/it]

{'loss': 2.1061, 'grad_norm': 7.473703384399414, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.15}


 16%|█▋        | 150/920 [4:31:07<21:30:22, 100.55s/it]

{'loss': 2.02, 'grad_norm': 8.633268356323242, 'learning_rate': 1.5e-05, 'epoch': 0.16}


 17%|█▋        | 160/920 [4:48:31<23:06:49, 109.49s/it]

{'loss': 1.9763, 'grad_norm': 32.81879806518555, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.17}


 18%|█▊        | 170/920 [5:08:28<26:22:12, 126.58s/it]

{'loss': 2.0109, 'grad_norm': 18.109634399414062, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.18}


 20%|█▉        | 180/920 [5:28:03<25:15:40, 122.89s/it]

{'loss': 2.0173, 'grad_norm': 7.700382709503174, 'learning_rate': 1.8e-05, 'epoch': 0.2}


 21%|██        | 190/920 [5:49:19<26:25:07, 130.28s/it]

{'loss': 1.8946, 'grad_norm': 4.851282119750977, 'learning_rate': 1.9e-05, 'epoch': 0.21}


 22%|██▏       | 200/920 [6:10:18<22:53:54, 114.49s/it]

{'loss': 1.9261, 'grad_norm': 10.21042251586914, 'learning_rate': 2e-05, 'epoch': 0.22}


 23%|██▎       | 210/920 [6:27:46<19:31:40, 99.01s/it] 

{'loss': 1.8865, 'grad_norm': 5.641373157501221, 'learning_rate': 2.1e-05, 'epoch': 0.23}


 24%|██▍       | 220/920 [6:44:32<19:23:39, 99.74s/it] 

{'loss': 1.79, 'grad_norm': 4.453788757324219, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.24}


 25%|██▌       | 230/920 [7:01:53<20:32:40, 107.19s/it]

{'loss': 1.8793, 'grad_norm': 5.052816390991211, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.25}


 26%|██▌       | 240/920 [7:23:51<22:15:56, 117.88s/it]

{'loss': 1.7813, 'grad_norm': 4.04083776473999, 'learning_rate': 2.4e-05, 'epoch': 0.26}


 27%|██▋       | 250/920 [7:41:24<18:38:43, 100.18s/it]

{'loss': 1.8354, 'grad_norm': 4.670814514160156, 'learning_rate': 2.5e-05, 'epoch': 0.27}


 28%|██▊       | 260/920 [7:59:14<21:03:30, 114.86s/it]

{'loss': 1.7816, 'grad_norm': 5.029804706573486, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.28}


 29%|██▉       | 270/920 [8:15:54<18:23:35, 101.87s/it]

{'loss': 1.7218, 'grad_norm': 5.134103298187256, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.29}


 30%|███       | 280/920 [8:32:43<18:30:49, 104.14s/it]

{'loss': 1.7512, 'grad_norm': 5.15428352355957, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.3}


 32%|███▏      | 290/920 [8:50:18<18:06:21, 103.46s/it]

{'loss': 1.8618, 'grad_norm': 5.409088134765625, 'learning_rate': 2.9e-05, 'epoch': 0.31}


 33%|███▎      | 300/920 [9:08:22<19:56:54, 115.83s/it]

{'loss': 1.6773, 'grad_norm': 5.940909385681152, 'learning_rate': 3e-05, 'epoch': 0.33}


 34%|███▎      | 310/920 [9:27:43<18:43:48, 110.54s/it]

{'loss': 1.8574, 'grad_norm': 6.4041619300842285, 'learning_rate': 3.1e-05, 'epoch': 0.34}


 35%|███▍      | 320/920 [9:45:33<19:28:02, 116.80s/it]

{'loss': 1.8925, 'grad_norm': 6.45241641998291, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.35}


 36%|███▌      | 330/920 [10:02:50<16:22:31, 99.92s/it] 

{'loss': 1.8061, 'grad_norm': 4.966202259063721, 'learning_rate': 3.3e-05, 'epoch': 0.36}


 37%|███▋      | 340/920 [10:20:07<16:50:59, 104.59s/it]

{'loss': 1.7383, 'grad_norm': 4.595344066619873, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.37}


 38%|███▊      | 350/920 [10:37:53<16:06:16, 101.71s/it]

{'loss': 1.7274, 'grad_norm': 8.334944725036621, 'learning_rate': 3.5e-05, 'epoch': 0.38}


 39%|███▉      | 360/920 [10:54:58<15:30:09, 99.66s/it] 

{'loss': 1.6552, 'grad_norm': 5.697684288024902, 'learning_rate': 3.6e-05, 'epoch': 0.39}


 40%|████      | 370/920 [11:11:24<14:32:57, 95.23s/it] 

{'loss': 1.6926, 'grad_norm': 5.300842761993408, 'learning_rate': 3.7e-05, 'epoch': 0.4}


 41%|████▏     | 380/920 [11:28:21<15:11:01, 101.23s/it]

{'loss': 1.6898, 'grad_norm': 4.844779968261719, 'learning_rate': 3.8e-05, 'epoch': 0.41}


 42%|████▏     | 390/920 [11:47:03<15:12:50, 103.34s/it]

{'loss': 1.7203, 'grad_norm': 3.819350481033325, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.42}


 43%|████▎     | 400/920 [12:05:53<16:36:32, 114.99s/it]

{'loss': 1.7116, 'grad_norm': 5.474185943603516, 'learning_rate': 4e-05, 'epoch': 0.43}


 45%|████▍     | 410/920 [12:25:13<15:28:09, 109.19s/it]

{'loss': 1.7211, 'grad_norm': 6.579054355621338, 'learning_rate': 4.1e-05, 'epoch': 0.45}


 46%|████▌     | 420/920 [12:42:41<14:28:26, 104.21s/it]

{'loss': 1.6462, 'grad_norm': 4.3287553787231445, 'learning_rate': 4.2e-05, 'epoch': 0.46}


 47%|████▋     | 430/920 [12:59:30<13:47:30, 101.33s/it]

{'loss': 1.7876, 'grad_norm': 5.7445969581604, 'learning_rate': 4.3e-05, 'epoch': 0.47}


 48%|████▊     | 440/920 [13:17:10<13:38:01, 102.25s/it]

{'loss': 1.7123, 'grad_norm': 5.857101917266846, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.48}


 49%|████▉     | 450/920 [13:33:19<12:28:06, 95.50s/it] 

{'loss': 1.6619, 'grad_norm': 4.295877933502197, 'learning_rate': 4.5e-05, 'epoch': 0.49}


 50%|█████     | 460/920 [13:49:41<13:01:01, 101.87s/it]

{'loss': 1.7152, 'grad_norm': 3.4269564151763916, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.5}


 51%|█████     | 470/920 [14:06:19<12:25:30, 99.40s/it] 

{'loss': 1.6674, 'grad_norm': 4.5818986892700195, 'learning_rate': 4.7e-05, 'epoch': 0.51}


 52%|█████▏    | 480/920 [14:24:09<12:40:13, 103.67s/it]

{'loss': 1.6377, 'grad_norm': 8.781657218933105, 'learning_rate': 4.8e-05, 'epoch': 0.52}


 53%|█████▎    | 490/920 [14:41:20<12:56:47, 108.39s/it]

{'loss': 1.6481, 'grad_norm': 4.209730625152588, 'learning_rate': 4.9e-05, 'epoch': 0.53}


 54%|█████▍    | 500/920 [14:58:17<11:47:22, 101.05s/it]

{'loss': 1.6579, 'grad_norm': 3.515462875366211, 'learning_rate': 5e-05, 'epoch': 0.54}


                                                        
 54%|█████▍    | 500/920 [15:23:41<11:47:22, 101.05s/it]

{'eval_loss': 1.4850271940231323, 'eval_runtime': 1524.2835, 'eval_samples_per_second': 0.537, 'eval_steps_per_second': 0.537, 'epoch': 0.54}


 55%|█████▌    | 510/920 [15:43:09<16:24:33, 144.08s/it]

{'loss': 1.7053, 'grad_norm': 4.268325328826904, 'learning_rate': 4.880952380952381e-05, 'epoch': 0.55}


 57%|█████▋    | 520/920 [16:03:48<15:16:36, 137.49s/it]

{'loss': 1.6668, 'grad_norm': 3.2583422660827637, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.56}


 58%|█████▊    | 530/920 [16:22:09<10:54:09, 100.64s/it]

{'loss': 1.6925, 'grad_norm': 3.8473622798919678, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.58}


 59%|█████▊    | 540/920 [16:38:53<10:33:45, 100.07s/it]

{'loss': 1.5671, 'grad_norm': 5.758591175079346, 'learning_rate': 4.523809523809524e-05, 'epoch': 0.59}


 60%|█████▉    | 550/920 [16:56:05<10:54:40, 106.16s/it]

{'loss': 1.6613, 'grad_norm': 3.9351396560668945, 'learning_rate': 4.404761904761905e-05, 'epoch': 0.6}


 61%|██████    | 560/920 [17:15:22<11:50:59, 118.50s/it]

{'loss': 1.6979, 'grad_norm': 5.0593581199646, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.61}


 62%|██████▏   | 570/920 [17:32:57<10:11:04, 104.75s/it]

{'loss': 1.7027, 'grad_norm': 7.10544490814209, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.62}


 63%|██████▎   | 580/920 [17:51:06<10:36:35, 112.34s/it]

{'loss': 1.6329, 'grad_norm': 3.5524168014526367, 'learning_rate': 4.047619047619048e-05, 'epoch': 0.63}


 64%|██████▍   | 590/920 [18:09:24<9:11:03, 100.19s/it] 

{'loss': 1.5351, 'grad_norm': 5.152397632598877, 'learning_rate': 3.928571428571429e-05, 'epoch': 0.64}


 65%|██████▌   | 600/920 [18:26:38<9:43:13, 109.35s/it]

{'loss': 1.6433, 'grad_norm': 4.005782604217529, 'learning_rate': 3.809523809523809e-05, 'epoch': 0.65}


 66%|██████▋   | 610/920 [18:43:25<8:40:03, 100.66s/it]

{'loss': 1.562, 'grad_norm': 5.927016735076904, 'learning_rate': 3.690476190476191e-05, 'epoch': 0.66}


 67%|██████▋   | 620/920 [19:00:42<8:28:14, 101.65s/it]

{'loss': 1.6833, 'grad_norm': 4.609201431274414, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.67}


 68%|██████▊   | 630/920 [19:19:37<9:25:27, 116.99s/it]

{'loss': 1.6545, 'grad_norm': 3.8150742053985596, 'learning_rate': 3.4523809523809526e-05, 'epoch': 0.68}


 70%|██████▉   | 640/920 [19:39:07<9:02:58, 116.35s/it]

{'loss': 1.6064, 'grad_norm': 4.858364582061768, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.7}


 71%|███████   | 650/920 [19:59:24<8:32:20, 113.85s/it]

{'loss': 1.5037, 'grad_norm': 6.13270902633667, 'learning_rate': 3.2142857142857144e-05, 'epoch': 0.71}


 72%|███████▏  | 660/920 [20:15:28<6:50:49, 94.80s/it] 

{'loss': 1.5693, 'grad_norm': 3.1517562866210938, 'learning_rate': 3.095238095238095e-05, 'epoch': 0.72}


 73%|███████▎  | 670/920 [20:33:40<7:21:21, 105.93s/it]

{'loss': 1.5813, 'grad_norm': 3.807197093963623, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.73}


 74%|███████▍  | 680/920 [20:51:34<7:02:54, 105.73s/it]

{'loss': 1.5956, 'grad_norm': 4.0849080085754395, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.74}


 75%|███████▌  | 690/920 [21:07:45<5:56:08, 92.90s/it] 

{'loss': 1.5508, 'grad_norm': 3.209458112716675, 'learning_rate': 2.7380952380952383e-05, 'epoch': 0.75}


 76%|███████▌  | 700/920 [21:27:52<7:45:00, 126.82s/it]

{'loss': 1.6422, 'grad_norm': 3.5065696239471436, 'learning_rate': 2.6190476190476192e-05, 'epoch': 0.76}


 77%|███████▋  | 710/920 [21:48:28<7:01:49, 120.52s/it]

{'loss': 1.5801, 'grad_norm': 3.103769302368164, 'learning_rate': 2.5e-05, 'epoch': 0.77}


 78%|███████▊  | 720/920 [22:05:50<6:14:23, 112.32s/it]

{'loss': 1.5655, 'grad_norm': 2.986144542694092, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.78}


 79%|███████▉  | 730/920 [24:00:45<32:15:49, 611.31s/it] 

{'loss': 1.5333, 'grad_norm': 2.987212657928467, 'learning_rate': 2.261904761904762e-05, 'epoch': 0.79}


 80%|████████  | 740/920 [24:17:59<5:45:50, 115.28s/it] 

{'loss': 1.6766, 'grad_norm': 3.3721554279327393, 'learning_rate': 2.1428571428571428e-05, 'epoch': 0.8}


 82%|████████▏ | 750/920 [24:34:22<4:47:23, 101.43s/it]

{'loss': 1.57, 'grad_norm': 5.035257339477539, 'learning_rate': 2.023809523809524e-05, 'epoch': 0.81}


 83%|████████▎ | 760/920 [24:51:28<4:28:20, 100.63s/it]

{'loss': 1.5955, 'grad_norm': 4.211121559143066, 'learning_rate': 1.9047619047619046e-05, 'epoch': 0.83}


 84%|████████▎ | 770/920 [25:08:28<4:20:48, 104.32s/it]

{'loss': 1.5476, 'grad_norm': 4.027068138122559, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.84}


 85%|████████▍ | 780/920 [25:28:54<4:43:27, 121.48s/it]

{'loss': 1.5764, 'grad_norm': 5.893372535705566, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.85}


 86%|████████▌ | 790/920 [25:48:50<3:58:45, 110.19s/it]

{'loss': 1.5637, 'grad_norm': 7.781467914581299, 'learning_rate': 1.5476190476190476e-05, 'epoch': 0.86}


 87%|████████▋ | 800/920 [26:05:46<3:27:11, 103.59s/it]

{'loss': 1.6632, 'grad_norm': 15.91942024230957, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.87}


 88%|████████▊ | 810/920 [26:22:58<3:04:15, 100.51s/it]

{'loss': 1.5361, 'grad_norm': 2.849755048751831, 'learning_rate': 1.3095238095238096e-05, 'epoch': 0.88}


 89%|████████▉ | 820/920 [26:39:36<2:51:11, 102.72s/it]

{'loss': 1.606, 'grad_norm': 98.86212158203125, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.89}


 90%|█████████ | 830/920 [26:59:31<3:11:57, 127.97s/it]

{'loss': 1.6891, 'grad_norm': 3.42179274559021, 'learning_rate': 1.0714285714285714e-05, 'epoch': 0.9}


 91%|█████████▏| 840/920 [27:20:24<2:38:58, 119.23s/it]

{'loss': 1.555, 'grad_norm': 3.936593532562256, 'learning_rate': 9.523809523809523e-06, 'epoch': 0.91}


 92%|█████████▏| 850/920 [27:36:56<1:53:02, 96.89s/it] 

{'loss': 1.5466, 'grad_norm': 6.291192054748535, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.92}


 93%|█████████▎| 860/920 [27:53:45<1:39:41, 99.69s/it] 

{'loss': 1.5571, 'grad_norm': 3.887887477874756, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.93}


 95%|█████████▍| 870/920 [28:11:38<1:28:33, 106.26s/it]

{'loss': 1.5839, 'grad_norm': 4.09218168258667, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.94}


 96%|█████████▌| 880/920 [28:30:52<1:23:51, 125.78s/it]

{'loss': 1.5369, 'grad_norm': 5.432347774505615, 'learning_rate': 4.7619047619047615e-06, 'epoch': 0.96}


 97%|█████████▋| 890/920 [28:51:26<1:02:33, 125.11s/it]

{'loss': 1.54, 'grad_norm': 4.5140299797058105, 'learning_rate': 3.5714285714285714e-06, 'epoch': 0.97}


 98%|█████████▊| 900/920 [29:10:48<36:37, 109.86s/it]  

{'loss': 1.597, 'grad_norm': 4.120645999908447, 'learning_rate': 2.3809523809523808e-06, 'epoch': 0.98}


 99%|█████████▉| 910/920 [29:27:19<16:18, 97.87s/it] 

{'loss': 1.5156, 'grad_norm': 4.252754211425781, 'learning_rate': 1.1904761904761904e-06, 'epoch': 0.99}


100%|██████████| 920/920 [29:44:30<00:00, 116.38s/it]


{'loss': 1.6163, 'grad_norm': 3.763716697692871, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 107070.3087, 'train_samples_per_second': 0.138, 'train_steps_per_second': 0.009, 'train_loss': 1.8227204094762388, 'epoch': 1.0}


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}
