In [2]:
import os 


In [3]:
from pathlib import Path
from dataclasses import dataclass

@dataclass(frozen=True)
class ModelTrainerConfig:
  """Immutable bundle of settings for the model-training stage.

  Field order is significant: it defines the positional order of the
  generated ``__init__``. The annotations are not enforced at runtime.
  """

  # --- locations / checkpoint ---
  root_dir: Path    # passed as output_dir and used as the save location
  data_path: Path   # dataset location handed to load_from_disk
  model_ckpt: Path  # checkpoint identifier handed to from_pretrained

  # --- values forwarded to TrainingArguments ---
  num_train_epochs: int
  warmup_steps: int
  per_device_train_batch_size: int
  weight_decay: float
  logging_steps: int
  evaluation_strategy: str
  eval_steps: int
  save_steps: int
  gradient_accumulation_steps: int
  


In [4]:
from textSummer.constants import *
from textSummer.utils.common import read_yaml, create_directories

helooo


In [5]:
from textSummer.constants import *
from textSummer.utils.common import read_yaml, create_directories
from textSummer.entity import DataIngestionConfig, DataValidationConfig, DataTransformationConfig
class ConfigurationManager:
  """Reads the project YAML files and builds one config object per stage.

  Every ``get_*_config`` method first passes the stage's ``root_dir`` to
  ``create_directories`` and then returns the populated config entity.
  """

  def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
    """Load both YAML files and ensure the artifacts root exists.

    Args:
      config_filepath: Path of the pipeline configuration YAML.
      params_filepath: Path of the training-parameters YAML.
    """
    self.config = read_yaml(config_filepath)
    self.params = read_yaml(params_filepath)
    create_directories([self.config.artifacts_root])

  def get_data_ingestion_config(self) -> DataIngestionConfig:
    """Return the settings found under ``data_ingestion``."""
    ingestion = self.config.data_ingestion
    create_directories([ingestion.root_dir])

    ingestion_config = DataIngestionConfig(
      root_dir=ingestion.root_dir,
      source_url=ingestion.source_url,
      local_data_file=ingestion.local_data_file,
      unzip_dir=ingestion.unzip_dir,
    )
    return ingestion_config

  def get_data_validation_config(self) -> DataValidationConfig:
    """Return the settings found under ``data_validation``."""
    validation = self.config.data_validation
    create_directories([validation.root_dir])

    validation_config = DataValidationConfig(
      root_dir=validation.root_dir,
      STATUS_FILE=validation.STATUS_FILE,
      ALL_REQUIRED_FILES=validation.ALL_REQUIRED_FILES,
    )
    return validation_config

  def get_data_transformation_config(self) -> DataTransformationConfig:
    """Return the settings found under ``data_transformation``."""
    transformation = self.config.data_transformation
    create_directories([transformation.root_dir])

    transformation_config = DataTransformationConfig(
      root_dir=transformation.root_dir,
      data_path=transformation.data_path,
      tokenizer_name=transformation.tokenizer_name,
    )
    return transformation_config

  def get_model_trainer_config(self) -> ModelTrainerConfig:
    """Combine ``model_trainer`` paths with the ``TrainingArguments`` params."""
    trainer_section = self.config.model_trainer
    training_args = self.params.TrainingArguments

    create_directories([trainer_section.root_dir])

    # Hyper-parameters copied one-to-one from the params file, in field order.
    hyperparameter_names = (
      "num_train_epochs",
      "warmup_steps",
      "per_device_train_batch_size",
      "weight_decay",
      "logging_steps",
      "evaluation_strategy",
      "eval_steps",
      "save_steps",
      "gradient_accumulation_steps",
    )
    return ModelTrainerConfig(
      root_dir=trainer_section.root_dir,
      data_path=trainer_section.data_path,
      model_ckpt=trainer_section.model_ckpt,
      **{name: getattr(training_args, name) for name in hyperparameter_names},
    )

In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_from_disk, load_dataset
import torch

[2024-05-24 12:19:41,174: INFO: config: PyTorch version 2.3.0 available.]


In [14]:
import threading 
import time

class ModelTrainer:
  """Fine-tunes a seq2seq checkpoint and saves the model and tokenizer.

  Args:
    config: ModelTrainerConfig supplying the checkpoint name, the dataset
      path, the output directory and the TrainingArguments values.
  """

  def __init__(
      self,
      config: ModelTrainerConfig
  ):
    self.config = config

  def train(self):
    """Run training to completion, then save the model and tokenizer.

    Loads the tokenizer and model from ``config.model_ckpt``, loads the
    dataset from ``config.data_path`` (its "train" and "validation" splits
    are used), trains with ``Trainer`` and finally writes the model and
    tokenizer below ``config.root_dir``.

    Training runs in the calling thread, so this method blocks until
    ``Trainer.train`` returns and the saved weights are the final ones.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(self.config.model_ckpt)
    tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
    model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
    seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model = model_pegasus)

    dataset_samsum_pt = load_from_disk(self.config.data_path)

    trainer_args = TrainingArguments(
      output_dir=self.config.root_dir,
      num_train_epochs=self.config.num_train_epochs,
      warmup_steps=self.config.warmup_steps,
      # The config has no separate eval batch size; the train value is reused.
      per_device_eval_batch_size=self.config.per_device_train_batch_size,
      per_device_train_batch_size=self.config.per_device_train_batch_size,
      weight_decay=self.config.weight_decay,
      logging_steps=self.config.logging_steps,
      evaluation_strategy=self.config.evaluation_strategy,
      eval_steps=self.config.eval_steps,
      # NOTE(review): hard-coded; self.config.save_steps is not used here —
      # confirm whether that is intentional.
      save_steps=1e6,
      gradient_accumulation_steps= self.config.gradient_accumulation_steps
    )

    trainer = Trainer(
      model = model_pegasus,
      args = trainer_args,
      train_dataset= dataset_samsum_pt["train"],
      tokenizer=tokenizer,
      data_collator=seq2seq_data_collator,
      eval_dataset=dataset_samsum_pt["validation"]
    )

    # Train synchronously: a Thread cannot be stopped by setting an attribute
    # on it, and saving while a background thread is still updating the
    # weights would race with the optimizer.
    trainer.train()

    model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
    tokenizer.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-tokenizer"))

In [15]:
# Build the trainer configuration from the YAML files and run training.
try:
  config_man = ConfigurationManager()
  model_trainer_config = config_man.get_model_trainer_config()
  model_trainer = ModelTrainer(model_trainer_config)
  model_trainer.train()
except Exception:
  # Bare re-raise: propagates the active exception without rebinding it.
  raise

[2024-05-24 12:41:11,275: INFO: common: yaml file: config/config.yaml loaded successfully]
hii {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_url': 'https://github.com/entbappy/Branching-Tutorial/raw/master/summarizer-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'STATUS_FILE': 'artifacts/data_validation/status.txt', 'ALL_REQUIRED_FILES': ['train', 'test', 'validation']}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_ingestion/samsum_dataset', 'tokenizer_name': 'google/pegasus-cnn_dailymail'}, 'model_trainer': {'root_dir': 'artifacts/model_trainer', 'data_path': 'artifacts/data_transformation/samsum_dataset', 'model_ckpt': 'google/pegasus-cnn_dailymail'}} <class 'box.config_box.ConfigBox'>
[2024-05-24 12:41:11,298: INFO: common: yaml file: params.yaml loaded suc

[2024-05-24 12:41:11,302: INFO: common: Created directory: artifacts/model_trainer]
google/pegasus-cnn_dailymail


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


KeyboardInterrupt: 