In [76]:
import os
import json
import torch
from pathlib import Path
from dataclasses import dataclass
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from Succinct.logging import logger

In [79]:
# Switch the working directory before the remaining cells run; the relative
# config/artifact paths in the logged output presumably rely on this — TODO confirm.
# NOTE(review): hard-coded, machine-specific path; the notebook will not run elsewhere as-is.
os.chdir("/11/Projects/Succinct")

In [81]:
%pwd

'c:\\11\\Projects\\Succinct'

In [82]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory the stage writes its output into (``samsum_dataset.pt`` is saved here).
    root_dir: Path
    # Directory searched for the ``train.json`` / ``val.json`` / ``test.json`` split files.
    data_path: Path
    # Value handed to ``AutoTokenizer.from_pretrained``.
    # NOTE(review): annotated as Path, but presumably a plain string coming from the
    # YAML config (model id or local directory) — confirm.
    tokeniser_name: Path


    

In [83]:
from Succinct.constants import *
from Succinct.utils.common import read_yaml, create_directories

In [84]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
    ):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Return the settings of the ``data_transformation`` section.

        The stage's ``root_dir`` is created as a side effect before the
        config object is built.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            data_path=stage.data_path,
            tokeniser_name=stage.tokeniser_name,
        )


In [89]:
import os
from Succinct.logging import logger
from transformers import AutoTokenizer


##### Extra: SummDataset class that wraps the records loaded from the JSON splits as a PyTorch Dataset


In [92]:
class SummDataset(Dataset):
    """Map-style dataset that tokenizes dialogue/summary records on access.

    Each record in ``data`` must provide a ``"dialogue"`` and a ``"summary"``
    entry. Nothing is tokenized up front: the tokenizer is called every time
    an item is requested.

    Args:
        data: Indexable collection of records (anything supporting ``len()``
            and ``data[idx]``).
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"`` and must return an object exposing
            ``input_ids`` and ``attention_mask``.
        max_input_len: Length the dialogue is padded/truncated to.
        max_output_len: Length the summary is padded/truncated to.
        label_pad_token_id: Optional value written into the padded positions
            of ``labels`` (e.g. ``-100``, the default ``ignore_index`` of
            ``torch.nn.CrossEntropyLoss``) so that padding does not
            contribute to the loss. ``None`` (default) keeps the tokenizer's
            own padding ids, which is the original behaviour.
    """

    def __init__(self, data, tokenizer, max_input_len=1024, max_output_len=128,
                 label_pad_token_id=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        self.label_pad_token_id = label_pad_token_id

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text``, padding/truncating it to exactly ``max_length``."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its model inputs.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (from the dialogue)
            and ``labels`` (token ids of the summary), each with the leading
            dimension returned by the tokenizer squeezed away.
        """
        example = self.data[idx]
        inputs = self._encode(example["dialogue"], self.max_input_len)
        targets = self._encode(example["summary"], self.max_output_len)

        labels = targets.input_ids.squeeze(0)

        # getattr: instances pickled before this attribute existed must keep working.
        label_pad = getattr(self, "label_pad_token_id", None)
        if label_pad is not None:
            # Use the attention mask rather than comparing with the pad id, so a
            # real token that shares the pad id is never overwritten.
            padding_positions = targets.attention_mask.squeeze(0) == 0
            labels = labels.masked_fill(padding_positions, label_pad)

        return {
            "input_ids": inputs.input_ids.squeeze(0),
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels
        }

In [93]:
class DataTransformation:
    """Loads the JSON dataset splits, wraps them as ``SummDataset`` objects and saves them."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage config and load the tokenizer named by ``config.tokeniser_name``."""
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokeniser_name)

    def load_data(self):
        """Load the JSON dataset splits found in ``config.data_path``.

        Looks for ``train.json``, ``val.json`` and ``test.json``. A missing
        file is logged as a warning and skipped.

        Returns:
            dict mapping each split name that was found to its parsed JSON content.
        """
        data_path = self.config.data_path
        datasets = {}
        for split in ["train", "val", "test"]:
            file_path = os.path.join(data_path, f"{split}.json")
            if not os.path.exists(file_path):
                logger.warning(f"{split}.json not found in {data_path}. Skipping this split.")
                continue
            with open(file_path, "r", encoding="utf-8") as f:
                datasets[split] = json.load(f)
        return datasets

    def convert(self):
        """Wrap every loaded split in a ``SummDataset`` and save the result as ``samsum_dataset.pt``.

        Tokenization is NOT performed here: ``SummDataset`` tokenizes lazily in
        ``__getitem__``, so the saved file holds the pickled dataset objects
        (raw records plus tokenizer), not token tensors.

        NOTE(review): because ``torch.save`` pickles these objects, loading the
        file requires the ``SummDataset`` class to be importable; recent PyTorch
        versions presumably also need ``weights_only=False`` in ``torch.load`` — confirm.

        Returns:
            dict mapping split name to its ``SummDataset``.
        """
        datasets_dict = self.load_data()
        if not datasets_dict:
            # Previously an empty run was indistinguishable from a successful one.
            logger.warning(
                f"No dataset splits were found in {self.config.data_path}; "
                "the saved dataset will be empty."
            )

        datasets_pt = {
            split: SummDataset(data, self.tokenizer)
            for split, data in datasets_dict.items()
        }

        # Do not rely on the caller having created the output directory.
        os.makedirs(self.config.root_dir, exist_ok=True)
        save_path = os.path.join(self.config.root_dir, "samsum_dataset.pt")
        torch.save(datasets_pt, save_path)
        logger.info(f"Processed dataset saved to {save_path}")

        return datasets_pt

In [91]:
# Run the data-transformation stage end to end: build the config, then convert and save.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
    logger.info("Data Transformation completed successfully.")
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise the
    # original exception untouched; a bare `raise` keeps its traceback as-is.
    logger.exception("Data Transformation failed.")
    raise

[2025-09-26 14:33:01,886: INFO: common: yaml file config\config.yaml loaded successfully]
[2025-09-26 14:33:01,888: INFO: common: yaml file params.yaml loaded successfully]
[2025-09-26 14:33:01,889: INFO: common: Created a directory: artifacts]
[2025-09-26 14:33:01,890: INFO: common: Created a directory: artifacts/data_transformation]




[2025-09-26 14:33:05,441: INFO: 2446992344: Processed dataset saved to artifacts/data_transformation\samsum_dataset.pt]
[2025-09-26 14:33:05,442: INFO: 2410692728: Data Transformation completed successfully.]
