In [1]:
import os
# Move the working directory up one level so the relative paths used later
# (config and artifact locations) resolve from the parent folder —
# presumably the project root; confirm against the repository layout.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves up one more directory each time.
os.chdir('../')

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory created for this stage; the transformed dataset is saved under it.
    root_dir: Path
    # Line-delimited JSON file the raw data is read from.
    input_data_file: Path
    # Value passed to the tokenizer loader to select the tokenizer.
    tokenizer_name: Path


In [3]:
import pandas as pd
import numpy as np
import transformers
import datasets
from datasets import load_dataset,load_from_disk
from bpReviewClassifier.constants import *
from bpReviewClassifier.utils.common import read_yaml,create_directories

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class ConfigurationManager:
    """Reads the YAML config and params files and builds per-stage configs."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_transformation(self) -> DataTransformationConfig:
        """Build the data-transformation config from the loaded settings.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataTransformationConfig populated from the
            ``data_transformation`` section of the config.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])
        return DataTransformationConfig(
            root_dir=section.root_dir,
            input_data_file=section.input_data_file,
            # The config key is `tokenizer_filename`; the dataclass field is `tokenizer_name`.
            tokenizer_name=section.tokenizer_filename,
        )

In [5]:
import os
from bpReviewClassifier.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset,load_from_disk,Dataset,DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [6]:
class DataTransformation:
    """Prepares, tokenizes and saves the dataset described by the stage config."""

    def __init__(self, config: DataTransformationConfig):
        """Store the config and load the tokenizer it names."""
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def preparing_dataset(self):
        """Read the input file and return train/validation/test splits.

        Returns:
            A DatasetDict with 'train', 'validation' and 'test' entries
            (70% / 15% / 15% of the rows) in which 'rating' has been
            replaced by its one-hot encoded vector.
        """
        # Load the line-delimited JSON input into a pandas DataFrame.
        reviews = pd.read_json(self.config.input_data_file, lines=True)

        # Discard the columns that are not used downstream.
        unused_columns = [
            'title', 'images', 'asin', 'parent_asin',
            'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
        ]
        reviews = reviews.drop(columns=unused_columns)

        # One-hot encode the output column; each row stores its own vector.
        encoder = OneHotEncoder()
        one_hot = encoder.fit_transform(reviews[['rating']]).toarray()
        reviews['rating'] = list(one_hot)

        # 70% train; the remaining 30% is halved into test and validation.
        train_part, holdout = train_test_split(reviews, test_size=0.3, random_state=42)
        test_part, val_part = train_test_split(holdout, test_size=0.5, random_state=42)

        # Reset each split's index before conversion, then assemble the DatasetDict.
        splits = {'train': train_part, 'validation': val_part, 'test': test_part}
        return DatasetDict({
            split_name: Dataset.from_pandas(frame.reset_index(drop=True))
            for split_name, frame in splits.items()
        })

    def tokenize_fun(self, example_batch):
        """Tokenize the 'text' entries of a batch with truncation and padding enabled."""
        return self.tokenizer(example_batch['text'], truncation=True, padding=True)

    def convert(self):
        """Tokenize every split, drop the raw 'text' column and save the result.

        The dataset is written to ``<root_dir>/transformed_dataset``.
        """
        output_path = os.path.join(self.config.root_dir, "transformed_dataset")
        tokenized = self.preparing_dataset().map(self.tokenize_fun, batched=True)
        tokenized.remove_columns(['text']).save_to_disk(output_path)
    
    

        

In [7]:
# Run the data-transformation stage: load the configuration, build the
# transformer, then prepare, tokenize and save the dataset to disk.
# Any exception propagates unchanged so the cell fails loudly.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()


[2025-01-19 18:59:01,219 : INFO : common : yaml file config\config.yaml loaded successfully]
[2025-01-19 18:59:01,225 : INFO : common : yaml file params.yaml loaded successfully]
[2025-01-19 18:59:01,233 : INFO : common : artifacts Created successfully.]
[2025-01-19 18:59:01,236 : INFO : common : artifacts/data_transformation Created successfully.]


Map: 100%|██████████| 491069/491069 [04:31<00:00, 1809.74 examples/s]
Map: 100%|██████████| 105230/105230 [00:52<00:00, 2022.59 examples/s]
Map: 100%|██████████| 105229/105229 [00:44<00:00, 2351.94 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 491069/491069 [00:09<00:00, 54184.67 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 105230/105230 [00:05<00:00, 18063.86 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 105229/105229 [00:06<00:00, 16678.40 examples/s]
