In [1]:
import os

In [2]:
%pwd

'c:\\Users\\asus\\Desktop\\Text-Summarization-Case-Study\\research'

In [3]:
# Step up from the notebook folder to the project root, presumably so the
# relative paths used by later cells (config files, artifacts) resolve there.
# The move is guarded on the current folder name: an unconditional
# chdir("../") climbs one more level every time the cell is re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\asus\\Desktop\\Text-Summarization-Case-Study'

In [5]:
# Entity for model trainer
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Read-only settings for the model-training stage.

    The first three fields say where things live; the remaining fields are
    training hyperparameters. Instances are frozen, so attributes cannot be
    reassigned after construction.
    """

    # --- locations ---
    # Output directory for the training run; the trained model and tokenizer
    # are saved underneath it.
    root_dir: Path
    # Dataset location, read with `load_from_disk`.
    data_path: Path
    # Handed to `from_pretrained` for both the tokenizer and the model.
    # NOTE(review): annotated as Path, but the recorded run shows a Hub model
    # id ("google/pegasus-cnn_dailymail") being used here — confirm whether
    # `str` would be the more accurate annotation.
    model: Path

    # --- training hyperparameters ---
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: int
    gradient_accumulation_steps: int


In [7]:
# Configuration Manager for model trainer
from textsummarizer.constants import *
from textsummarizer.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML configuration and parameters and builds stage configs.

    Both files are parsed once, at construction time, with `read_yaml`; the
    parsed objects are kept on `self.config` and `self.params`.
    """

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the configuration YAML
                (defaults to `CONFIG_FILE_PATH`).
            params_filepath: Path of the parameters YAML
                (defaults to `PARAMS_FILE_PATH`).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-trainer settings from the loaded config and params.

        Paths come from the `model_trainer` section of the configuration;
        hyperparameters come from the `TrainingArguments` section of the
        parameters. The stage's `root_dir` is passed to `create_directories`
        before the settings object is built.

        Returns:
            A populated `ModelTrainerConfig`.
        """
        trainer_section = self.config.model_trainer
        training_params = self.params.TrainingArguments

        create_directories([trainer_section.root_dir])

        # Path-like settings first, then hyperparameters, mirroring the
        # field order of ModelTrainerConfig.
        field_values = {
            "root_dir": trainer_section.root_dir,
            "data_path": trainer_section.data_path,
            "model": trainer_section.model,
        }
        hyperparameter_names = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )
        for name in hyperparameter_names:
            field_values[name] = getattr(training_params, name)

        return ModelTrainerConfig(**field_values)

In [8]:
# Components for model trainer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch
import requests

class ModelTrainer:
    """Fine-tunes a seq2seq model on a dataset stored on disk and saves the result."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training settings for later use by `train`.

        Args:
            config: Paths and hyperparameters for the training run.
        """
        self.config = config

    def train(self):
        """Run fine-tuning and write the model and tokenizer under `root_dir`.

        Steps: load tokenizer and model from `config.model`, load the dataset
        from `config.data_path`, train with a `Trainer`, then save the model
        to `<root_dir>/pegasus-samsum-model` and the tokenizer to
        `<root_dir>/tokenizer`.
        """
        cfg = self.config

        # Use the GPU when one is visible to torch, otherwise fall back to CPU.
        target_device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(cfg.model)
        seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model).to(target_device)
        batch_collator = DataCollatorForSeq2Seq(tokenizer, model=seq2seq_model)

        dataset_splits = load_from_disk(cfg.data_path)

        training_arguments = TrainingArguments(
            output_dir=cfg.root_dir,
            evaluation_strategy=cfg.evaluation_strategy,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            # The evaluation batch size deliberately reuses the training value;
            # the config has no separate setting for it.
            per_device_eval_batch_size=cfg.per_device_train_batch_size,
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
            weight_decay=cfg.weight_decay,
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            logging_steps=cfg.logging_steps,
            eval_steps=cfg.eval_steps,
            save_steps=cfg.save_steps,
        )

        trainer = Trainer(
            model=seq2seq_model,
            args=training_arguments,
            data_collator=batch_collator,
            train_dataset=dataset_splits['train'],
            # NOTE(review): evaluation during training uses the 'test' split —
            # confirm this is intended rather than a validation split.
            eval_dataset=dataset_splits['test'],
            tokenizer=tokenizer,
        )
        trainer.train()

        model_output_dir = os.path.join(cfg.root_dir, "pegasus-samsum-model")
        seq2seq_model.save_pretrained(model_output_dir)

        tokenizer_output_dir = os.path.join(cfg.root_dir, "tokenizer")
        tokenizer.save_pretrained(tokenizer_output_dir)

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-28 18:54:03,139: INFO: config: PyTorch version 2.2.1 available.]


In [9]:
# Pipeline for model trainer
# Run the model-trainer stage end to end: load the configuration, build the
# trainer from it, and start fine-tuning.
# There is deliberately no try/except around these calls: a handler that only
# re-raises adds nothing, so any error propagates with its original traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-03-28 18:54:18,135: INFO: common: yaml_file: config\config.yaml loaded successfully]
[2024-03-28 18:54:18,135: INFO: common: yaml_file: params.yaml loaded successfully]
[2024-03-28 18:54:18,135: INFO: common: created directory at: artifacts]
[2024-03-28 18:54:18,135: INFO: common: created directory at: artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 14%|█▎        | 7/51 [15:04:42<93:49:29, 7676.59s/it] 

KeyboardInterrupt: 