In [1]:
# Move the working directory up one level so that the `src.textSummarizer`
# imports below resolve (presumably to the project root — confirm the layout).
# NOTE(review): the path is relative, so re-running this cell climbs one more
# directory each time; run it exactly once per kernel session.
import os 
os.chdir("../")

In [2]:
from logging import getLogger
from pathlib import Path
from dataclasses import dataclass
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import torch
from datasets import load_from_disk
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml, create_directories

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Module-level logger; ModelTrainer uses it to report device, dataset sizes
# and training progress.
logger = getLogger(__name__)

In [None]:
@dataclass
class ModelTrainerConfig:
    """Settings consumed by the model-training stage.

    The first four fields are required and identify where artifacts live and
    which model/tokenizer to load. The remaining fields are training
    hyperparameters whose defaults are sized for a 4GB GPU.
    """

    # --- Required: locations and model identifiers ---
    root_dir: Path
    data_path: Path
    model_name: str
    tokenizer_name: str

    # --- Optional: training hyperparameters ---
    num_train_epochs: int = 3
    # Per-device batch sizes are kept small to fit in 4GB.
    per_device_train_batch_size: int = 2
    per_device_eval_batch_size: int = 2
    # Accumulating gradients simulates a larger effective batch.
    gradient_accumulation_steps: int = 4
    # Mixed precision reduces VRAM usage.
    fp16: bool = True
    # Saves memory at the cost of slower training.
    gradient_checkpointing: bool = True
    

In [None]:
class ConfigurationManager:
    """Reads the project's YAML files and builds stage-specific config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Parse both YAML files and hand the artifacts root to ``create_directories``.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the hyperparameter YAML.
        """
        config_yaml = Path(config_filepath)
        self.config = read_yaml(config_yaml)

        params_yaml = Path(params_filepath)
        self.params = read_yaml(params_yaml)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the training-stage settings from the parsed YAML data.

        Paths and model identifiers come from the ``model_trainer`` section of
        the config file; hyperparameters come from the
        ``Seq2SeqTrainingArguments`` section of the params file.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        hyperparams = self.params.Seq2SeqTrainingArguments
        stage = self.config.model_trainer

        # Hyperparameters are copied across one-to-one under the same names.
        copied_fields = (
            "num_train_epochs",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "gradient_accumulation_steps",
            "fp16",
            "gradient_checkpointing",
        )

        return ModelTrainerConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
            model_name=stage.model_name,
            tokenizer_name=stage.tokenizer_name,
            **{field_name: getattr(hyperparams, field_name) for field_name in copied_fields},
        )

In [None]:
class ModelTrainer:
    """Fine-tune a BART summarization model using memory-saving settings.

    The trainer loads the model, tokenizer and pre-tokenized datasets named in
    ``config``, runs a Hugging Face ``Seq2SeqTrainer`` and saves the result to
    ``<root_dir>/final_model``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the configuration and select the training device.

        Args:
            config: Paths, model identifiers and training hyperparameters.
        """
        self.config = config
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.device == "cuda":
            # Device properties can only be queried when a GPU is present;
            # calling this on a CPU-only machine raises.
            vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
            logger.info(f"Training on: {self.device} (VRAM: {vram_gb:.1f}GB)")
        else:
            logger.info(f"Training on: {self.device}")

    def get_model(self):
        """Load BART, optionally with gradient checkpointing for memory efficiency.

        Returns:
            The model, moved to ``self.device``.
        """
        model = BartForConditionalGeneration.from_pretrained(self.config.model_name)
        model = model.to(self.device)
        if self.config.gradient_checkpointing:
            # Use the model API rather than the deprecated config-level
            # `gradient_checkpointing` keyword of `from_pretrained`.
            model.gradient_checkpointing_enable()
            logger.info("🔵 Gradient Checkpointing enabled (slower, but saves VRAM)")
        return model

    def get_tokenizer(self):
        """Load the tokenizer named by ``config.tokenizer_name``."""
        return BartTokenizer.from_pretrained(self.config.tokenizer_name)

    def load_datasets(self):
        """Load the tokenized train/validation datasets from disk.

        Returns:
            A ``(train_dataset, val_dataset)`` tuple read from the
            ``transformed_train_data`` and ``transformed_validation_data``
            sub-directories of ``config.data_path``.
        """
        train_path = Path(self.config.data_path) / "transformed_train_data"
        val_path = Path(self.config.data_path) / "transformed_validation_data"
        train_dataset = load_from_disk(train_path)
        val_dataset = load_from_disk(val_path)
        logger.info(f"Train: {len(train_dataset)} samples | Val: {len(val_dataset)}")
        return train_dataset, val_dataset

    def train(self):
        """Run fine-tuning and save the model and tokenizer to ``final_model``."""
        model = self.get_model()
        tokenizer = self.get_tokenizer()
        train_dataset, val_dataset = self.load_datasets()

        # Mixed precision needs a CUDA device; fall back to full precision on
        # CPU instead of failing while building the training arguments.
        use_fp16 = bool(self.config.fp16) and self.device == "cuda"
        if self.config.fp16 and not use_fp16:
            logger.warning("fp16 requested but no CUDA device is available; training in full precision")

        training_args = Seq2SeqTrainingArguments(
            output_dir=str(self.config.root_dir),
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            num_train_epochs=self.config.num_train_epochs,
            fp16=use_fp16,
            # Keep the Trainer's own flag in sync with the model setting.
            gradient_checkpointing=bool(self.config.gradient_checkpointing),
            logging_steps=100,
            # NOTE(review): no eval_steps is given, so evaluation presumably
            # runs at the logging interval over the full validation set —
            # confirm this frequency is intended.
            eval_strategy="steps",
            save_total_limit=2,
            report_to="none",  # Disable WandB to save memory
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            processing_class=tokenizer,
            data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
        )

        logger.info("🚀 Starting training (VRAM optimized)")
        trainer.train()

        # Save the model and tokenizer side by side so they can be reloaded together.
        output_dir = Path(self.config.root_dir) / "final_model"
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        logger.info(f"✅ Model saved to {output_dir}")


In [7]:
# Run training: read the YAML configuration, build the trainer settings and
# start fine-tuning. The final statement launches the full training run, so
# this cell is long-running.
config_manager = ConfigurationManager()
train_config = config_manager.get_model_trainer_config()
ModelTrainer(train_config).train()

[2025-07-04 21:43:27,680: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-04 21:43:27,683: INFO: common: yaml file: config\params.yaml loaded successfully]
[2025-07-04 21:43:27,685: INFO: common: created directory at: artifacts]
[2025-07-04 21:43:27,716: INFO: 3410207037: Training on: cuda (VRAM: 4.0GB)]
[2025-07-04 21:43:30,701: INFO: 3410207037: 🔵 Gradient Checkpointing enabled (slower, but saves VRAM)]
[2025-07-04 21:43:32,726: INFO: 3410207037: Train: 287113 samples | Val: 13368]
[2025-07-04 21:43:32,846: INFO: 3410207037: 🚀 Starting training (VRAM optimized)]


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
100,3.221,1.185387


KeyboardInterrupt: 