In [1]:
import os

# Show the kernel's working directory with an explicit stdlib call; the bare
# `pwd` automagic only works in single-line cells and when no variable
# named `pwd` shadows it.
os.getcwd()

'/home/ayush/Documents/AI/Projects/huggingface-transformer-project/research'

import os
from pathlib import Path

# Step up to the parent directory only while the kernel is still inside
# `research/`, so re-running this cell does not keep climbing the tree.
if Path.cwd().name == "research":
    os.chdir("..")

# Display the directory the remaining cells will run from.
os.getcwd()

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTrainerConfig:
    """Settings bundle consumed by the model-training stage.

    The first nine fields are training hyperparameters; the remaining five
    are filesystem locations. Field order is part of the interface because
    dataclasses accept positional construction.
    """

    # --- training hyperparameters ---
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    # A strategy name such as 'steps', not a number.
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

    # --- locations ---
    root_dir: Path      # output directory of the training stage
    model_ckpt: Path    # checkpoint the tokenizer and model are loaded from
    train_path: Path    # dataset split saved on disk (training)
    val_path: Path      # dataset split saved on disk (validation)
    test_path: Path     # dataset split saved on disk (test)

In [7]:
from src.constants import *
from src.utils.common import read_yaml_file,createDirs

In [None]:
class ConfigurationManager:
    """Reads the config and params YAML files and builds stage configs.

    Args:
        config_path: Location of the main configuration YAML file.
        param_path: Location of the hyperparameter YAML file.
    """

    def __init__(self,
                 config_path=CONFIG_FILE_PATH,
                 param_path=PARAMS_FILE_PATH):
        self.config = read_yaml_file(config_path)
        self.params = read_yaml_file(param_path)
        # Hand the top-level artifacts directory to createDirs before any
        # stage asks for its own sub-directory.
        createDirs([self.config.artifacts_root])

    def get_data_trainer_config(self) -> DataTrainerConfig:
        """Assemble the DataTrainerConfig for the training stage.

        Paths come from the `data_trainer` section of the config file and
        hyperparameters from the params file.

        Returns:
            A populated DataTrainerConfig.
        """
        config = self.config.data_trainer
        # The original body read a bare, undefined `params` name.
        # NOTE(review): this assumes the hyperparameters sit at the top level
        # of the params file; if they are nested under a section, select that
        # section here.
        params = self.params

        createDirs([config.root_dir])

        return DataTrainerConfig(
            root_dir=Path(config.root_dir),
            model_ckpt=Path(config.model_ckpt),
            train_path=Path(config.train_path),
            val_path=Path(config.val_path),
            test_path=Path(config.test_path),
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            # Was mistakenly populated from `evaluation_strategy`.
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
import torch
from datasets import load_from_disk

In [None]:
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint using the settings in a DataTrainerConfig.

    Args:
        config: Hyperparameters and paths for the training run.
    """

    def __init__(self, config: DataTrainerConfig):
        self.config = config

    def train(self):
        """Run training and save the resulting model and tokenizer.

        Loads the tokenizer and model from `config.model_ckpt`, trains on the
        split stored at `config.train_path`, evaluates on the split stored at
        `config.val_path`, and writes the model and tokenizer into
        sub-directories of `config.root_dir`.
        """
        cfg = self.config
        device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)
        model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_ckpt).to(device)
        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        # Load each dataset split individually. Training uses the train split;
        # the previous code loaded the test split here and never read
        # `train_path`.
        train_dataset = load_from_disk(cfg.train_path)
        val_dataset = load_from_disk(cfg.val_path)

        # Hyperparameters come from the config instead of hard-coded literals.
        trainer_args = TrainingArguments(
            # Passed as a plain string, the type TrainingArguments documents.
            output_dir=str(cfg.root_dir),
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            # The config has no eval batch size field; keep the previous value.
            per_device_eval_batch_size=1,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            evaluation_strategy=cfg.evaluation_strategy,
            eval_steps=cfg.eval_steps,
            save_steps=cfg.save_steps,
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        )

        trainer = Trainer(
            model=model,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        trainer.train()

        # Save model and tokenizer. Paths are built with pathlib because `os`
        # is not imported anywhere in the notebook.
        output_root = Path(cfg.root_dir)
        model.save_pretrained(str(output_root / "pegasus-samsum-model"))
        tokenizer.save_pretrained(str(output_root / "tokenizer"))


In [None]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
# Build the training config from the YAML files, then train with it.
config = ConfigurationManager()
model_trainer_config = config.get_data_trainer_config()
# Pass the populated instance; the class itself carries no values.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()