In [1]:
import os

In [2]:
%pwd

'd:\\Sentence_Generation_Project\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\Sentence_Generation_Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path
from transformers import AutoTokenizer
import pandas as pd

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings for the data-transformation step.

    Instances are frozen, so attributes cannot be reassigned after creation.
    """

    # Directory associated with this step's outputs.
    root_dir: Path
    # Location of the CSV file that the step reads.
    data_path: Path
    # Model identifier handed to the tokenizer loader; stays a plain string.
    tokenizer_name: str

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Build a DataTransformationConfig by hand for interactive exploration.
# Both paths are relative, so they resolve against the current working
# directory (moved one level up by the os.chdir("../") cell earlier).
config = DataTransformationConfig(
    root_dir=Path('artifacts/data_transformation'),
    data_path=Path('artifacts/data_ingestion/commongen_lite_train.csv'),
    tokenizer_name='meta-llama/Llama-2-7b-chat-hf'  # model ID string later passed to AutoTokenizer.from_pretrained
)

In [7]:
# Read the CSV at config.data_path into a DataFrame (relative to the current working directory)
df = pd.read_csv(config.data_path)

In [8]:
# Load the tokenizer identified by config.tokenizer_name.
# NOTE(review): `tokenizer` is not referenced again in the visible cells — confirm it is still needed.
# NOTE(review): the Llama-2 repositories are presumably access-gated on the Hugging Face Hub,
# so this call may require an authenticated session — verify before a fresh run.
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)



In [9]:
from textgeneration.constants import *
from textgeneration.utils.common import read_yaml, create_directories

In [10]:
class ConfigurationManager:
    """Reads the project's YAML files and builds typed per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded configuration.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataTransformationConfig populated from the `data_transformation`
            section of the loaded configuration.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The dataclass declares both locations as Path; convert here so the
        # values match that annotation (and the manually built config, which
        # already uses Path) instead of leaking whatever type the YAML reader
        # produced.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            tokenizer_name=section.tokenizer_name,
        )

In [11]:
import os
from textgeneration.logging import logger
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset,load_from_disk
import torch
import pandas as pd

[2024-10-20 13:24:39,085: INFO: config: PyTorch version 2.4.1+cu118 available.]


In [12]:
class DataTransformation:
    """Cleans the `concept_set` column of a DataFrame and saves the result as CSV."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration; `config.root_dir` is where output is written."""
        self.config = config

    def clean_concept_set(self, df):
        """Strip tag suffixes from every concept set and write them to CSV.

        Each entry of `df['concept_set']` is either a list of concept strings
        or the string representation of such a list. For every concept only
        the text before the first underscore is kept (e.g. `dog_N` -> `dog`),
        and the concepts of one row are joined with ", ".

        Side effects:
            * adds/overwrites the `cleaned_concept_set` column on `df`;
            * writes `cleaned_concepts.csv` (single column, no index) into
              `self.config.root_dir`, creating the directory if needed;
            * prints the output location.

        Args:
            df: DataFrame containing a `concept_set` column.

        Raises:
            ValueError, SyntaxError: if a string entry is not a valid Python
                literal.
        """
        # Local import keeps this block self-contained; `ast` is stdlib.
        import ast

        def clean_single_concept_set(concept_set):
            """Return one row's concepts as a comma-separated string without tags."""
            if isinstance(concept_set, str):
                # The text comes from a data file, so parse it with
                # ast.literal_eval rather than eval(): literal_eval accepts
                # only Python literals and cannot execute arbitrary code.
                concepts = ast.literal_eval(concept_set)
            else:
                concepts = concept_set  # already a sequence of concept strings

            # NOTE(review): split('_')[0] keeps only the text before the FIRST
            # underscore, so a concept that itself contains an underscore
            # would be truncated — confirm the data never has such concepts.
            return ', '.join(concept.split('_')[0] for concept in concepts)

        df['cleaned_concept_set'] = df['concept_set'].apply(clean_single_concept_set)

        # Keep only the cleaned concepts for training
        formatted_data = df[['cleaned_concept_set']]

        # Write under the configured root directory instead of a hard-coded
        # location, so the configuration actually controls the output.
        output_path = Path(self.config.root_dir) / 'cleaned_concepts.csv'
        output_path.parent.mkdir(parents=True, exist_ok=True)
        formatted_data.to_csv(output_path, index=False)

        print("Cleaned data saved to:", output_path)

# Usage
# No try/except wrapper here: catching an exception only to re-raise it adds
# nothing, and letting failures propagate keeps the original traceback.
# The manager gets its own name so the DataTransformationConfig bound to
# `config` in the earlier cell is not overwritten (re-running the cells that
# read `config.data_path` / `config.tokenizer_name` keeps working).
config_manager = ConfigurationManager()
data_transformation_config = config_manager.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)

# Load the dataset again, this time from the YAML-driven configuration
df = pd.read_csv(data_transformation_config.data_path)

# Apply the cleaning process
data_transformation.clean_concept_set(df)


[2024-10-20 13:24:39,745: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-20 13:24:39,747: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-20 13:24:39,748: INFO: common: created directory at: artifacts]
[2024-10-20 13:24:39,749: INFO: common: created directory at: artifacts/data_transformation]
Cleaned data saved to: artifacts\data_transformation\cleaned_concepts.csv
