In [1]:
import os

In [2]:
# Step up one directory so that relative paths used by later cells resolve
# against the parent of the notebook's folder (presumably the project root —
# TODO confirm). NOTE: the path is relative, so re-running this cell without
# restarting the kernel moves up one more level each time.
os.chdir("../")

In [3]:
# Display the current working directory to confirm where the chdir above landed.
%pwd

'/Users/dhyaneshanchula/Documents/Text-Summary-Generator'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data transformation stage.

    Attributes:
        root_dir: Directory under which the tokenized dataset is saved.
        data_path: Location of the raw dataset handed to the dataset loaders.
        tokenizer_path: Identifier or path handed to
            ``AutoTokenizer.from_pretrained``.
    """

    # Field order is part of the constructor signature; keep it stable.
    root_dir: Path
    data_path: Path
    tokenizer_path: Path

In [5]:
from textSummarizer.constant import *
from textSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Read the project's YAML settings and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Create the top-level artifacts folder before any stage writes into it.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data transformation stage.

        Reads the ``data_transformation`` section of the loaded configuration,
        creates the stage's ``root_dir`` and wraps each path entry in ``Path``.

        Returns:
            A ``DataTransformationConfig`` populated from that section.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            tokenizer_path=Path(section.tokenizer_path),
        )

In [7]:
import os
from textSummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
class DataTransformation:
    """Tokenize a dialogue/summary dataset and save the result to disk.

    The raw dataset must expose ``dialogue`` and ``summary`` columns; both are
    tokenized with the tokenizer loaded from ``config.tokenizer_path``.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the config and load the tokenizer.

        Args:
            config: Stage settings providing ``root_dir``, ``data_path`` and
                ``tokenizer_path``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path)

        # Match constants used in fine-tune script
        self.MAX_INPUT_LENGTH = 1024
        self.MAX_TARGET_LENGTH = 128

    def _preprocess(self, batch):
        """Tokenize one batch of examples.

        Args:
            batch: Mapping with a ``dialogue`` list and a ``summary`` list.

        Returns:
            The tokenizer output for the dialogues with an added ``labels``
            entry holding the tokenized summaries, where every pad token id is
            replaced by -100 so it is ignored by the loss.
        """
        model_inputs = self.tokenizer(
            batch["dialogue"],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
            padding="max_length",
        )

        labels = self.tokenizer(
            batch["summary"],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
            padding="max_length",
        )

        # Look the pad id up once instead of once per token.
        pad_id = self.tokenizer.pad_token_id
        model_inputs["labels"] = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in labels["input_ids"]
        ]
        return model_inputs

    def _load_raw_dataset(self):
        """Load the raw dataset from ``config.data_path``.

        Tries, in order: CSV split files (``train.csv``, ``validation.csv``,
        ``test.csv``) inside a local directory, a dataset previously saved to
        disk, and finally ``load_dataset`` with the path as dataset identifier.

        Returns:
            Whatever the first successful loader returns.

        Raises:
            Exception: Whatever the final ``load_dataset`` attempt raises; the
                earlier local-load failure is chained as its context.
        """
        data_path = self.config.data_path
        try:
            if os.path.isdir(data_path):
                candidates = {
                    split: os.path.join(data_path, f"{split}.csv")
                    for split in ("train", "validation", "test")
                }
                data_files = {
                    split: path
                    for split, path in candidates.items()
                    if os.path.exists(path)
                }
                if data_files:
                    return load_dataset("csv", data_files=data_files)
            # Either a directory without CSV splits or not a directory at all:
            # maybe it's a saved dataset directory.
            return load_from_disk(data_path)
        except Exception as exc:
            # Deliberate best-effort fallback, attempted exactly once and
            # logged so the underlying local failure is not lost.
            logger.warning(
                f"Could not load dataset locally from {data_path} ({exc!r}); "
                "falling back to load_dataset"
            )
            # load_dataset documents its path argument as a string.
            return load_dataset(os.fspath(data_path))

    def convert(self):
        """Load, tokenize and save the dataset.

        The tokenized dataset is written to ``<root_dir>/samsum_dataset`` and
        the raw columns are dropped from the saved result.
        """
        dataset_samsum = self._load_raw_dataset()

        # A multi-split dataset reports column names per split, a single
        # dataset reports a flat list. Do not assume a "train" split exists:
        # the CSV branch accepts any subset of the three split files.
        column_names = dataset_samsum.column_names
        if isinstance(column_names, dict):
            if "train" in column_names:
                column_names = column_names["train"]
            else:
                column_names = next(iter(column_names.values()), [])

        tokenized = dataset_samsum.map(
            self._preprocess,
            batched=True,
            remove_columns=column_names,
        )

        out_dir = os.path.join(self.config.root_dir, "samsum_dataset")
        tokenized.save_to_disk(out_dir)
        logger.info(f"Saved tokenized dataset to {out_dir}")

In [9]:
# Run the data transformation stage end to end: read the configuration, build
# the stage config, then tokenize and save the dataset. Any exception is left
# to propagate unchanged, so the traceback points at the real failure site.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()

[2026-01-31 13:05:43,876: INFO: common: yaml file: config/config.yaml loaded successfully]
[2026-01-31 13:05:43,877: INFO: common: yaml file: params.yaml loaded successfully]
[2026-01-31 13:05:43,877: INFO: common: Created directory at: artifacts]
[2026-01-31 13:05:43,878: INFO: common: Created directory at: artifacts/data_transformation]


Generating train split: 14731 examples [00:00, 100594.24 examples/s]
Generating validation split: 818 examples [00:00, 76851.10 examples/s]
Generating test split: 819 examples [00:00, 85444.74 examples/s]
Map: 100%|██████████| 14731/14731 [00:06<00:00, 2149.51 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 1882.46 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 2272.25 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14731/14731 [00:00<00:00, 314047.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 171418.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 189587.45 examples/s]

[2026-01-31 13:05:53,563: INFO: 1805077717: Saved tokenized dataset to artifacts/data_transformation/samsum_dataset]



