In [1]:
pwd

'/home/dephinate/ASU/TextSummarization/research'

In [2]:
import os
os.chdir("../")

In [3]:
os.listdir()

['setup.py',
 'main.py',
 'DockerFile',
 'template.py',
 'params.yaml',
 'artifacts',
 'src',
 '.github',
 'logs',
 '.vscode',
 'README.md',
 'requirements.txt',
 'app.py',
 'config',
 'research',
 '.git',
 '.gitignore']

! pip install --upgrade accelerate
! pip uninstall -y transformers accelerate
! pip install transformers accelerate

Model Trainer
* update config.yaml
* create entity
* create method in config manager to read configurations for training and create reqiured directories
* create traininer component
* crete pipeline for training
* update main

Entity

In [4]:
import os
from dataclasses import dataclass
from pathlib import Path

In [5]:
@dataclass
class ModelTrainerConfig:
    root_dir: Path
    data_path: Path
    model_ckpt: str
    num_training_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: int
    logging_steps: int
    evaluation_strategy: str
    eval_steps: float
    save_steps: float
    gradient_accumulation_steps: int

Configuration Manager

In [6]:
# Data Ingestion
from TextSummarizer.constants import *
from TextSummarizer.utils.common import read_yaml, create_directories
from TextSummarizer.entity import DataIngestionConfig, DataValidationConfig, DataTransformationConfig


class ConfigurationManager:
    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            param_filepath=PARAMS_FILE_PATH) -> None:

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(param_filepath)

        create_directories([self.config.artifacts_root])

    # Method to read Data Ingestion Config

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir
        )

        return data_ingestion_config

    # Method to read Data Validation Config
    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation

        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir=config.root_dir,
            status_file=config.status_file,
            all_required_files=config.all_required_files,
            local_data_folder=config.local_data_folder
        )
        return data_validation_config

    # Method to read Data Transformation Configuration
    def get_data_transformation_config(self) -> DataIngestionConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            tokenizer_name=config.tokenizer_name
        )
        return data_transformation_config
    
    # Method to read model trainer configuration
    def get_model_trainer_config(self) -> ModelTrainerConfig: 
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt = config.model_ckpt,
            num_training_epochs= params.num_training_epochs,
            warmup_steps= params.warmup_steps,
            per_device_train_batch_size= params.per_device_train_batch_size,
            weight_decay= params.weight_decay,
            logging_steps= params.logging_steps,
            evaluation_strategy= params.evaluation_strategy,
            eval_steps= params.eval_steps,
            save_steps= params.save_steps,
            gradient_accumulation_steps= params.gradient_accumulation_steps

        )
        return model_trainer_config




Trainer
* Need a Collator to to btach data and do req preprocessing
* Need a TrainingArguments class object to define all the hyperparameters
* Need a trainer object

In [7]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2024-02-02 20:04:52,457,INFO,config,PyTorch version 2.1.2 available.]


In [8]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig) -> None:
        self.config = config
    
    def train(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer,model=model_pegasus)

        # load dataset
        dataset_samasum_tokenized = load_from_disk(self.config.data_path)

        # load arguments
        trainer_args = TrainingArguments(
            output_dir                  = self.config.root_dir,
            num_train_epochs            = self.config.num_training_epochs,
            warmup_steps                = self.config.warmup_steps,                             # uses a low learining rate for a set number of training examples
            per_device_train_batch_size = self.config.per_device_train_batch_size,
            per_device_eval_batch_size  = self.config.per_device_train_batch_size,
            weight_decay                = self.config.weight_decay,                             # weight decal for AdamW optimizer
            logging_steps               = self.config.logging_steps,
            evaluation_strategy         = self.config.evaluation_strategy,
            # eval_steps                  = self.config.eval_steps,
            # save_steps                  = self.config.save_steps,
            gradient_accumulation_steps = self.config.gradient_accumulation_steps,
            auto_find_batch_size        = True
        )

        trainer = Trainer(
            model=model_pegasus,
            args = trainer_args,
            tokenizer= tokenizer,
            data_collator= seq2seq_data_collator,
            train_dataset= dataset_samasum_tokenized["train"],
            eval_dataset= dataset_samasum_tokenized["validation"]

        )
        print(self.config.eval_steps)
        trainer.train()
        # Save Model
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir),"pegasus-samsum-model_finetuned")
        # Save Tokenizer
        tokenizer.save_pretrained(os.path.join(self.config.root_dir),"pegasus-samsum-tokenizer_finetuned")

Pipeline

In [9]:
torch.cuda.empty_cache()

In [10]:

class ClearCache:
    def __enter__(self):
        torch.cuda.empty_cache()

    def __exit__(self, exc_type, exc_val, exc_tb):
        torch.cuda.empty_cache()


Use the context manager

with ClearCache():

    # Define and train the PyTorch model
    ...

In [11]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    with ClearCache():
        model_trainer.train()
except Exception as e:
    raise e

[2024-02-02 20:04:52,703,INFO,common,yaml file config/config.yaml loaded successfully]
[2024-02-02 20:04:52,704,INFO,common,yaml file params.yaml loaded successfully]
[2024-02-02 20:04:52,704,INFO,common,created directory at: artifacts]
[2024-02-02 20:04:52,704,INFO,common,created directory at: artifacts/model_trainer]


  return self.fget.__get__(instance, owner)()
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.1


  0%|          | 0/920 [00:00<?, ?it/s]

RuntimeError: No executable batch size found, reached zero.