## Directory

In [1]:
import os
# Move from the notebook's folder up to the project root so that the
# `src.textSummarizer` imports further down can be resolved from the current
# working directory.
# The step is guarded: once `src/textSummarizer` is visible from the current
# directory we are already at the root, so re-running this cell no longer
# climbs one directory further each time.
if not os.path.isdir(os.path.join("src", "textSummarizer")):
    os.chdir("../")
os.getcwd()

'c:\\Users\\Marina\\Desktop\\ML Operations\\0 - KrishNaik Course\\21_end_to_end_nlp_project_with_huggingface_and_transformers\\my_project'

## 1. Config.yaml

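Vamos adicionar ao `config.yaml` a seção `data_transformation`, definindo a pasta de artifacts desta etapa (`root_dir`), o caminho dos dados (`data_path`) e o nome do tokenizer (`tokenizer_name`).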

## 2. Params.yaml

## 3. Config entity

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data transformation stage."""

    # Directory under which this stage writes its output.
    root_dir: Path
    # Location of the dataset that is loaded from disk for tokenization.
    data_path: Path
    # Name passed to `AutoTokenizer.from_pretrained`.
    tokenizer_name: str


## 4. Configuration Manager

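Vamos criar umas constantes (`CONFIG_FILE_PATH` e `PARAMS_FILE_PATH`) em `src/textSummarizer/constants` e usá-las no Configuration Manager.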

In [3]:
from src.textSummarizer.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.textSummarizer.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects."""

    def __init__(self,
                 config_path=CONFIG_FILE_PATH,
                 params_path=PARAMS_FILE_PATH):
        """
        Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_path: Location of the configuration YAML (defaults to CONFIG_FILE_PATH).
            params_path: Location of the parameters YAML (defaults to PARAMS_FILE_PATH).
        """
        self.configurations = read_yaml(config_path)
        self.params = read_yaml(params_path)

        # Creates the artifacts root directory (e.g. /artifacts).
        create_directories([self.configurations.artifacts_root])

    # Each pipeline stage gets its own `get_<stage>_config` method that reads the
    # matching section of the configuration, as done below.

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the configuration object for the data transformation stage.

        Creates the stage's root directory (e.g. /artifacts/data_transformation)
        and copies the relevant keys of the `data_transformation` section into a
        `DataTransformationConfig`, so the returned object matches the declared
        return type instead of being the raw YAML section.

        Returns:
            DataTransformationConfig: `root_dir`, `data_path` and `tokenizer_name`
            exactly as they were read from the configuration file (no conversion
            is applied to the values).
        """
        section = self.configurations.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )


In [4]:
# Build the manager (reads the YAML files and creates the artifacts root) and
# fetch the settings of the data transformation stage.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()

[ 2024-11-12 14:25:48,771 ] - 28 summarizerlogger - INFO - yaml file: config\config.yaml loaded successfully
[ 2024-11-12 14:25:48,774 ] - 28 summarizerlogger - INFO - yaml file: params.yaml loaded successfully
[ 2024-11-12 14:25:48,776 ] - 46 summarizerlogger - INFO - created directory at: artifacts
[ 2024-11-12 14:25:48,780 ] - 46 summarizerlogger - INFO - created directory at: artifacts/data_transformation


## 5. Update the components - Data Ingestion, Data Transformation, Model Trainer

In [5]:
from transformers import AutoTokenizer
from datasets import load_from_disk

In [9]:
import os
from src.textSummarizer.logging import logger

class DataTransformation:
    """Tokenizes the dialogue/summary dataset and stores the result on disk."""

    # Truncation limits (in tokens) applied to the dialogues and to the summaries.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    def __init__(self, data_transformation_config: DataTransformationConfig):
        """
        Store the configuration and load the tokenizer it names.

        Args:
            data_transformation_config (DataTransformationConfig): Configuration object
            providing `root_dir`, `data_path` and `tokenizer_name`.
        """
        self.config = data_transformation_config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """
        Tokenize one batch of examples into model features.

        Args:
            example_batch (dict): Batch holding the source texts under 'dialogue'
            and the reference texts under 'summary'.

        Returns:
            dict: 'input_ids' and 'attention_mask' of the tokenized dialogues, plus
            'labels' holding the token IDs of the tokenized summaries.
        """
        # Tokenize the dialogues, truncating anything longer than the input limit.
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # Tokenize the summaries as targets. `text_target` is the replacement for the
        # deprecated `as_target_tokenizer()` context manager.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert(self):
        """
        Load the dataset, tokenize it and save the processed dataset.

        Steps:
        1. Load the dataset from `config.data_path`.
        2. Map `convert_examples_to_features` over it in batched mode.
        3. Save the result to `<config.root_dir>/samsum_dataset`.

        Raises:
            Exception: Any error raised while loading, mapping or saving is logged
            and re-raised unchanged.
        """
        try:
            logger.info("Loading dataset from disk...")
            dataset_samsum = load_from_disk(self.config.data_path)

            logger.info("Transforming dataset with tokenization...")
            # batched=True hands the tokenization function batches of examples
            # rather than one example at a time.
            dataset_samsum_pt = dataset_samsum.map(
                self.convert_examples_to_features, batched=True
            )

            # Make sure the output directory exists before saving into it.
            output_dir = os.path.join(self.config.root_dir, "samsum_dataset")
            os.makedirs(output_dir, exist_ok=True)

            logger.info(f"Saving processed dataset to {output_dir}...")
            dataset_samsum_pt.save_to_disk(output_dir)
            logger.info("Data transformation and saving complete.")

        except Exception as e:
            logger.error(f"Error in data transformation: {e}")
            # Bare `raise` re-raises the active exception with its original traceback.
            raise

    def initiate_data_transformation(self):
        """
        Entry point of the stage: runs `convert` and logs start, success or failure.

        Raises:
            Exception: Any error raised by `convert` is logged and re-raised unchanged.
        """
        try:
            logger.info("Initiating data transformation process...")
            self.convert()
            logger.info("Data transformation process completed successfully.")
        except Exception as e:
            logger.error(f"Data transformation process failed: {e}")
            raise


In [10]:
# Rebuild the configuration and construct the transformation component.
# Constructing the component loads the tokenizer named in the configuration.
configuration_manager_obj = ConfigurationManager()
data_transformation_config = configuration_manager_obj.get_data_transformation_config()

# NOTE(review): only the object is created here; `initiate_data_transformation()` is not
# called in the visible cells, so no dataset is tokenized or saved — confirm this is intended.
data_transformation_obj = DataTransformation(data_transformation_config)


[ 2024-11-12 15:00:56,998 ] - 28 summarizerlogger - INFO - yaml file: config\config.yaml loaded successfully
[ 2024-11-12 15:00:57,002 ] - 28 summarizerlogger - INFO - yaml file: params.yaml loaded successfully
[ 2024-11-12 15:00:57,005 ] - 46 summarizerlogger - INFO - created directory at: artifacts
[ 2024-11-12 15:00:57,009 ] - 46 summarizerlogger - INFO - created directory at: artifacts/data_transformation


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

## 6. Modularizar o Código

O `3.` vai para `src\textSummarizer\entity\__init__.py`

O `4.` vai para `src\textSummarizer\config\configuration.py`

O `5.` vai para `src\textSummarizer\components\data_transformation.py`

Modularizamos criando uma pipeline (classe) em `stage_2_data_transformation_pipeline.py`, com o que usamos para rodar o código

Jogar a Pipeline para `main.py`