In [1]:
import os

In [2]:
%pwd

'c:\\Users\\assi01\\Desktop\\projects\\AirTravel_Sentiment_Analysis\\research'

In [3]:
# Move the working directory one level up (from the notebook's folder to its
# parent) so that relative paths used by later cells resolve from there.
# NOTE: this is a relative chdir, so it is not idempotent -- re-running this
# cell without restarting the kernel moves up one more level each time.
os.chdir('../')

In [4]:
%pwd

'c:\\Users\\assi01\\Desktop\\projects\\AirTravel_Sentiment_Analysis'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TextProcessingConfig:
    """Settings for the text-processing (tokenization) stage.

    Instances are mutable: the dataclass is not frozen, so fields may be
    reassigned after construction.
    """

    # Directory holding this stage's artifacts.
    root_dir: Path

    # Paths for the tokenized train / test / validation splits.
    train_tokenized_data_path: Path
    test_tokenized_data_path: Path
    val_tokenized_data_path: Path

    # Paths of the train / test / validation input files.
    train_data_path: Path
    test_data_path: Path
    val_data_path: Path

    # Model identifier taken from the params file (MODEL_NAME).
    params_model_name: str
    # Name of the text column taken from the params file (TEXT_COL).
    params_text_col: str

In [6]:
from airTravelSentimentAnalysis.constants import *
from airTravelSentimentAnalysis.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's config and params YAML files and builds stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_text_processing_config(self) -> TextProcessingConfig:
        """Build the configuration for the text-processing stage.

        The tokenized-output paths and ``root_dir`` come from the
        ``text_processing`` section; the train/test/val input paths come from
        the ``data_preprocessing`` section. The stage's ``root_dir`` is
        created as a side effect.

        Returns:
            A populated ``TextProcessingConfig``.
        """
        text_cfg = self.config.text_processing
        # Read the split paths straight from their own section instead of
        # copying them into ``text_processing``: writing into the loaded
        # config would silently alter shared state seen by any later reader.
        split_cfg = self.config.data_preprocessing

        create_directories([text_cfg.root_dir])

        return TextProcessingConfig(
            root_dir=Path(text_cfg.root_dir),
            train_data_path=Path(split_cfg.train_data_path),
            test_data_path=Path(split_cfg.test_data_path),
            val_data_path=Path(split_cfg.val_data_path),
            train_tokenized_data_path=Path(text_cfg.train_tokenized_data_path),
            test_tokenized_data_path=Path(text_cfg.test_tokenized_data_path),
            val_tokenized_data_path=Path(text_cfg.val_tokenized_data_path),
            params_model_name=self.params.MODEL_NAME,
            params_text_col=self.params.TEXT_COL,
        )

In [17]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

class TextProcessing:
    """Loads the train/test/val CSV splits, wraps them in a ``DatasetDict``
    and tokenizes the configured text column."""

    def __init__(self, config: TextProcessingConfig):
        """Store the stage configuration.

        Args:
            config: Paths, model name and text column for this stage.
        """
        self.config = config
        # (model_name, tokenizer) pair, filled in on first use.
        self._tokenizer_cache = None

    def loadData(self):
        """Read the three CSV splits (UTF-8) into DataFrames.

        Returns:
            Tuple ``(train_df, test_df, val_df)``; the frames are also kept
            on the instance under the same names.
        """
        self.train_df = pd.read_csv(self.config.train_data_path, encoding='utf-8')
        self.test_df = pd.read_csv(self.config.test_data_path, encoding='utf-8')
        self.val_df = pd.read_csv(self.config.val_data_path, encoding='utf-8')
        return self.train_df, self.test_df, self.val_df

    def createHuggingFaceDataset(self):
        """Convert the loaded DataFrames into a ``DatasetDict``.

        Loads the CSV files first if ``loadData`` has not been called yet.

        Returns:
            ``DatasetDict`` with the keys ``"train"``, ``"test"`` and ``"val"``.
        """
        # Avoid an AttributeError when this is called on a fresh instance.
        if not all(hasattr(self, name) for name in ("train_df", "test_df", "val_df")):
            self.loadData()

        self.train_dataset = Dataset.from_pandas(self.train_df)
        self.test_dataset = Dataset.from_pandas(self.test_df)
        self.val_dataset = Dataset.from_pandas(self.val_df)
        self.ds = DatasetDict()
        self.ds["train"] = self.train_dataset
        self.ds["test"] = self.test_dataset
        self.ds["val"] = self.val_dataset
        return self.ds

    def _get_tokenizer(self):
        """Return the tokenizer for the configured model, loading it at most
        once per model name."""
        model_name = self.config.params_model_name
        cached = getattr(self, "_tokenizer_cache", None)
        # Reload only when nothing is cached or the configured model changed.
        if cached is None or cached[0] != model_name:
            cached = (model_name, AutoTokenizer.from_pretrained(model_name))
            self._tokenizer_cache = cached
        return cached[1]

    def tokenize_fn(self, batch):
        """Tokenize the configured text column of one batch.

        Args:
            batch: Mapping from column name to the batch's values.

        Returns:
            The tokenizer's output for the batch.
        """
        # Reuse the cached tokenizer: loading it here on every call would
        # repeat the same work once per mapped batch.
        tokenizer = self._get_tokenizer()
        # NOTE(review): padding=True presumably pads to the longest sequence
        # of each mapped batch, so row lengths may differ between batches --
        # confirm this is what downstream code expects.
        return tokenizer(batch[self.config.params_text_col], truncation=True, padding=True)

    def tokenizeData(self):
        """Tokenize every split and save the result under ``config.root_dir``.

        Builds the ``DatasetDict`` first if ``createHuggingFaceDataset`` has
        not been called yet.

        Returns:
            The tokenized ``DatasetDict``.
        """
        if not hasattr(self, "ds"):
            self.createHuggingFaceDataset()

        self.tokenized_datasets = self.ds.map(self.tokenize_fn, batched=True)
        # NOTE(review): the whole DatasetDict is written to root_dir; the
        # per-split *_tokenized_data_path settings are not used here.
        self.tokenized_datasets.save_to_disk(self.config.root_dir)
        return self.tokenized_datasets

In [19]:
from airTravelSentimentAnalysis import logger

# Run the text-processing stage end to end: load the CSV splits, build the
# DatasetDict, then tokenize and save it. Each step runs exactly once.
try:
    config = ConfigurationManager()
    text_processing_config = config.get_text_processing_config()
    text_processing = TextProcessing(config=text_processing_config)

    train_df, test_df, val_df = text_processing.loadData()
    print("Load after processed data successfully")
    print(train_df.head())
    logger.info("Load after processed data successfully \n %s", train_df.head())

    ds = text_processing.createHuggingFaceDataset()
    print("Hugging face dataset created successfully")
    print(ds)
    logger.info("Hugging face dataset created successfully \n %s", ds)

    ds = text_processing.tokenizeData()
    print("Tokenized dataset created successfully")
    print(ds)
    logger.info("Tokenized dataset created successfully \n %s", ds)
except Exception:
    # Record the failure with its traceback, then re-raise unchanged so the
    # cell still fails visibly (bare `raise` keeps the original traceback).
    logger.exception("Text processing stage failed")
    raise

[2025-05-22 22:34:17,374: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-22 22:34:17,376: INFO: common: yaml file content: {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://drive.google.com/file/d/1taIeW6BZHqucmJbccEo92qfzZt5X_RTQ/view?usp=sharing', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'prepare_base_model': {'root_dir': 'artifacts/prepare_base_model', 'base_model_path': 'artifacts/prepare_base_model/base_model.h5', 'updated_base_model_path': 'artifacts/prepare_base_model/base_model_updated.h5', 'base_tokenizer_path': 'artifacts/prepare_base_tokenizer/base_model.h5', 'updated_base_tokenizer_path': 'artifacts/prepare_base_tokenizer/base_model_updated.h5'}, 'data_preprocessing': {'root_dir': 'artifacts/data_preprocessing', 'raw_data_file': 'artifacts/data_ingestion/bitext-travel-llm-chatbot-training-dataset.csv', 'train_data_path': 'artifa

Map: 100%|██████████| 20260/20260 [00:20<00:00, 997.53 examples/s] 
Map: 100%|██████████| 5066/5066 [00:06<00:00, 752.76 examples/s] 
Map: 100%|██████████| 6332/6332 [00:06<00:00, 958.23 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 20260/20260 [00:00<00:00, 611923.55 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5066/5066 [00:00<00:00, 209057.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6332/6332 [00:00<00:00, 248850.61 examples/s]

Tokenized dataset created successfully
DatasetDict({
    train: Dataset({
        features: ['instruction', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 20260
    })
    test: Dataset({
        features: ['instruction', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 5066
    })
    val: Dataset({
        features: ['instruction', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 6332
    })
})
[2025-05-22 22:34:51,500: INFO: 1541279884: Tokenized dataset created successfully 
 DatasetDict({
    train: Dataset({
        features: ['instruction', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 20260
    })
    test: Dataset({
        features: ['instruction', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 5066
    })
    val: Dataset({
        features: ['instruction', 'intent', 'input_ids', 'attention_mask'],
        num_rows: 6332
    })
})]



