In [1]:
import os
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Text-Summarizer\\research'

In [2]:
os.chdir("../")

In [3]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\Text-Summarizer'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: Path
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int


In [5]:
from src.TextSummarizer.constants import *
from src.TextSummarizer.utils.common import read_yaml,create_directories


In [6]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config=self.config.model_trainer
        params=self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config=ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt = config.model_ckpt,
            num_train_epochs = params.num_train_epochs,
            warmup_steps = params.warmup_steps,
            per_device_train_batch_size = params.per_device_train_batch_size,
            weight_decay = params.weight_decay,
            logging_steps = params.logging_steps,
            evaluation_strategy = params.evaluation_strategy,
            eval_steps = params.evaluation_strategy,
            save_steps = params.save_steps,
            gradient_accumulation_steps = params.gradient_accumulation_steps
        )
        return model_trainer_config



In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
import torch
from src.TextSummarizer.logging import logger
from datasets import load_from_disk

In [14]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        logger.info("ModelTrainer initialized with configuration: %s", self.config)

    def train(self):
        # Setting up the device
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logger.info("Using device: %s", device)

        # Loading the tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        logger.info("Model and tokenizer loaded from checkpoint: %s", self.config.model_ckpt)

        # Setting up data collator
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
        logger.info("Data collator initialized.")

        # Loading the dataset
        dataset_samsum_pt = load_from_disk(self.config.data_path)
        logger.info("Dataset loaded from path: %s", self.config.data_path)

        # Training arguments
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir, num_train_epochs=1, warmup_steps=500,
            per_device_train_batch_size=1, per_device_eval_batch_size=1,
            weight_decay=0.01, logging_steps=10,
            evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
            gradient_accumulation_steps=16
        ) 
        logger.info("Training arguments set up with: %s", trainer_args)

        # Initialize Trainer
        trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["test"],
                  eval_dataset=dataset_samsum_pt["validation"])
        
        
        logger.info("Trainer initialized.")

        # Start training
        logger.info("Starting training...")
        trainer.train()
        logger.info("Training complete.")

        # Save the model and tokenizer
        model_save_path = os.path.join(self.config.root_dir, "pegasus-samsum-model")
        tokenizer_save_path = os.path.join(self.config.root_dir, "tokenizer")
        
        model_pegasus.save_pretrained(model_save_path)
        tokenizer.save_pretrained(tokenizer_save_path)
        
        logger.info("Model saved to %s", model_save_path)
        logger.info("Tokenizer saved to %s", tokenizer_save_path)

In [17]:
!pip install --upgrade accelerate
!pip install -y transformers accelerate
!pip install transformers accelerate




Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -y




In [18]:
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(model_trainer_config)
model_trainer.train()

[2024-11-07 19:50:44,288 - common.py - read_yaml - line 25 - INFO - YAML file: config\config.yaml loaded successfully]
[2024-11-07 19:50:44,290 - common.py - read_yaml - line 25 - INFO - YAML file: params.yaml loaded successfully]
[2024-11-07 19:50:44,296 - common.py - create_directories - line 46 - INFO - Directory: artifacts created successfully]
[2024-11-07 19:50:44,298 - common.py - create_directories - line 46 - INFO - Directory: artifacts/model_trainer created successfully]
[2024-11-07 19:50:44,298 - 281410720.py - __init__ - line 4 - INFO - ModelTrainer initialized with configuration: ModelTrainerConfig(root_dir='artifacts/model_trainer', data_path='artifacts/data_ingestion/samsum_dataset', model_ckpt='google/pegasus-cnn_dailymail', num_train_epochs=1, warmup_steps=500, per_device_train_batch_size=1, weight_decay=0.01, logging_steps=10, evaluation_strategy='steps', eval_steps='steps', save_steps='1e6', gradient_accumulation_steps=16)]
[2024-11-07 19:50:44,298 - 281410720.py - tr

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2024-11-07 19:51:23,895 - 281410720.py - train - line 14 - INFO - Model and tokenizer loaded from checkpoint: google/pegasus-cnn_dailymail]
[2024-11-07 19:51:23,895 - 281410720.py - train - line 18 - INFO - Data collator initialized.]
[2024-11-07 19:51:23,949 - 281410720.py - train - line 22 - INFO - Dataset loaded from path: artifacts/data_ingestion/samsum_dataset]


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`