In [3]:
# Scratch cell: presumably a quick check that the kernel is responsive — safe to delete.
1+1

2

In [6]:
# Show the kernel's current working directory (IPython automagic form of %pwd).
# The recorded output shows the notebook starts inside the project's research/ folder.
pwd


'/home/ayush/Documents/AI/Projects/huggingface-transformer-project/research'

In [7]:
# Move the kernel from research/ up to the project root. Later cells import from
# `src` and their logs show relative `artifacts/...` paths, so they appear to
# expect the project root as the working directory.
# The guard makes the cell idempotent: a bare `cd ..` climbs one more level every
# time the cell is re-run, silently breaking those imports and paths.
import os

if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

/home/ayush/Documents/AI/Projects/huggingface-transformer-project


## Config Setup

In [37]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings for the data-transformation (tokenization) stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the YAML config and consumed by
    ``DataTransformation``.
    """

    # Stage directory: created by ConfigurationManager, and the tokenized
    # datasets are saved beneath it.
    root_dir: Path
    # Data location taken from the config; not read by the code visible here.
    data_path: Path
    # Identifier handed to ``AutoTokenizer.from_pretrained``.
    tokenizer_name: str
    # CSV files loaded with ``pd.read_csv`` for the train / val / test splits.
    train_path: Path
    val_path: Path
    test_path: Path


In [38]:
from src.constants import *
from src.utils.common import read_yaml_file,createDirs

## Config Manager Update

In [39]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects.

    On construction both the config file and the params file are parsed and the
    ``artifacts_root`` directory named in the config is passed to ``createDirs``.
    """

    def __init__(self, config_path=CONFIG_FILE_PATH, param_path=PARAMS_FILE_PATH):
        """Parse the YAML files and set up the artifacts root.

        Args:
            config_path: Location of the main config YAML.
            param_path: Location of the params YAML.
        """
        self.config = read_yaml_file(config_path)
        self.params = read_yaml_file(param_path)

        artifacts_root = self.config.artifacts_root
        createDirs([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Passes the stage's ``root_dir`` to ``createDirs`` first, then converts
        the raw ``data_transformation`` entries into typed values.

        Returns:
            DataTransformationConfig with path entries wrapped in ``Path`` and
            the tokenizer name coerced to ``str``.
        """
        section = self.config.data_transformation
        createDirs([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            tokenizer_name=str(section.tokenizer_name),
            train_path=Path(section.train_path),
            val_path=Path(section.val_path),
            test_path=Path(section.test_path),
        )

In [40]:
import os
from src.logging import logger
from transformers import AutoTokenizer
from datasets import load_from_disk,Dataset
import pandas as pd

In [41]:
class DataTransformation:
    """Tokenizes the dialogue/summary CSV splits and saves them as datasets.

    Each split CSV is read with pandas, every row is tokenized into
    ``input_ids`` / ``attention_mask`` / ``labels``, and the result is written
    with ``Dataset.save_to_disk`` to ``<root_dir>/samsum-dataset/<split>``.
    """

    # Truncation limits (in tokens) for the dialogue and for the summary.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 1024
    # Folder under ``config.root_dir`` that receives the saved splits.
    OUTPUT_DIR_NAME = "samsum-dataset"

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage config and load the tokenizer it names.

        Args:
            config: Stage settings; ``tokenizer_name`` is passed to
                ``AutoTokenizer.from_pretrained``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def converting_into_s2s(self, row):
        """Tokenize one record into sequence-to-sequence features.

        Args:
            row: Mapping-like record (a pandas ``Series`` when called through
                ``DataFrame.apply``) with ``'dialogue'`` and ``'summary'``
                entries. Null values are treated as empty strings.

        Returns:
            A copy of ``row`` with ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` added. The record passed in is left untouched.
        """
        dialogue = str(row['dialogue']) if pd.notnull(row['dialogue']) else ""
        summary = str(row['summary']) if pd.notnull(row['summary']) else ""

        input_encoding = self.tokenizer(
            dialogue, max_length=self.MAX_INPUT_LENGTH, truncation=True
        )
        target_encoding = self.tokenizer(
            text_target=summary, max_length=self.MAX_TARGET_LENGTH, truncation=True
        )

        # pandas documents that functions mutating the object handed to
        # ``DataFrame.apply`` are unsupported, so extend a copy instead.
        encoded = row.copy()
        encoded['input_ids'] = input_encoding['input_ids']
        encoded['attention_mask'] = input_encoding['attention_mask']
        encoded['labels'] = target_encoding['input_ids']
        return encoded

    def _tokenize_frame(self, frame):
        """Tokenize every row of ``frame`` and wrap the result in a ``Dataset``."""
        tokenized = frame.apply(self.converting_into_s2s, axis=1).reset_index(drop=True)
        return Dataset.from_pandas(tokenized)

    def convert(self):
        """Read, tokenize and save the train / val / test splits.

        All CSVs are read before any tokenization starts, and all splits are
        tokenized before anything is written, so a missing or unreadable input
        fails the run before partial output is saved.
        """
        split_paths = {
            "train": self.config.train_path,
            "val": self.config.val_path,
            "test": self.config.test_path,
        }

        frames = {split: pd.read_csv(path) for split, path in split_paths.items()}
        datasets = {split: self._tokenize_frame(frame) for split, frame in frames.items()}

        output_root = os.path.join(self.config.root_dir, self.OUTPUT_DIR_NAME)
        for split, dataset in datasets.items():
            dataset.save_to_disk(os.path.join(output_root, split))


In [42]:
# Run the data-transformation stage end to end.
# Note: `config` here is the ConfigurationManager itself, not a stage config.
config = ConfigurationManager()
# Builds the typed stage config (also passes the stage's root_dir to createDirs).
data_tranformation_config = config.get_data_transformation_config()
# Constructing DataTransformation loads the tokenizer named in the config.
datatransformation=DataTransformation(config=data_tranformation_config)
# Reads the train/val/test CSVs, tokenizes them and saves the datasets under root_dir.
datatransformation.convert()

[2025-06-05 15:19:08,360: INFO: common: Created directory at: artifacts]
[2025-06-05 15:19:08,365: INFO: common: Created directory at: artifacts/data_transformation]


Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 150288.43 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 53442.28 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 49773.02 examples/s]
