In [1]:
import os

In [2]:
os.chdir("../")

In [3]:
# GOING BACK TO RIGHT DIRECTORY
%pwd

'e:\\Projects\\E2E Emotion Detection from text\\Emotion-Detection-using-ML'

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path

In [5]:
from src.ML_emotion_detection.constants import *
from src.ML_emotion_detection.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
        )

        return data_transformation_config

In [None]:
from src.ML_emotion_detection import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from src.ML_emotion_detection.utils.common import save_bin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
tqdm.pandas()

In [8]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.tfidf = TfidfVectorizer(max_features=4000)

    def train_test_spliting(self, test_size=0.2):
        # Load data from parquet file
        data = pd.read_parquet('E:/Projects/E2E Emotion Detection from text/Emotion-Detection-using-ML/artifacts/data_ingestion/train-00000-of-00001.parquet')

        logger.info("Split data into training and test sets")
        X_train, X_test, y_train, y_test = train_test_split(data['text'],
                                                            data['label'],
                                                            test_size=test_size,
                                                            stratify=data['label'],
                                                            random_state=42)

        # Save y_train
        save_bin(y_train, os.path.join(self.config.root_dir, "y_train.joblib"))

        # Save y_test
        save_bin(y_test, os.path.join(self.config.root_dir, "y_test.joblib"))

        return X_train, X_test

    def preprocess(self, text, *args):
        # Preprocess the text (lowercasing, remove URLs, numbers, etc.)
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'#\w+', '', text)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text)

        # Tokenization and Lemmatization
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]

        return ' '.join(tokens)

    def pos_count_features(self, text, *args):
        # Tokenize the text
        tokens = word_tokenize(text)

        # Get POS tags for the tokens
        tagged_tokens = pos_tag(tokens)

        # Define POS counts for common POS tags
        pos_counts = {
            'ADJ': 0, 'ADP': 0, 'ADV': 0, 'AUX': 0, 'CCONJ': 0, 'DET': 0,
            'INTJ': 0, 'NOUN': 0, 'NUM': 0, 'PART': 0, 'PRON': 0, 'PROPN': 0,
            'PUNCT': 0, 'SCONJ': 0, 'SYM': 0, 'VERB': 0, 'X': 0
        }

        # Loop through tagged tokens and count POS tags
        for token, tag in tagged_tokens:
            if tag.startswith('JJ'):  # Adjective (JJ, JJR, JJS)
                pos_counts['ADJ'] += 1
            elif tag.startswith('RB'):  # Adverb (RB, RBR, RBS)
                pos_counts['ADV'] += 1
            elif tag.startswith('VB'):  # Verb (VB, VBD, VBG, VBN, VBP, VBZ)
                pos_counts['VERB'] += 1
            elif tag.startswith('NN'):  # Noun (NN, NNS, NNP, NNPS)
                pos_counts['NOUN'] += 1
            elif tag == 'IN':  # Preposition (IN)
                pos_counts['ADP'] += 1
            elif tag == 'DT':  # Determiner (DT)
                pos_counts['DET'] += 1
            elif tag == 'PRP' or tag == 'PRP$':  # Pronoun (PRP, PRP$)
                pos_counts['PRON'] += 1
            elif tag == 'TO':  # "to" (particle or infinitive marker)
                pos_counts['PART'] += 1
            elif tag == 'PDT':  # Predeterminer (PDT)
                pos_counts['DET'] += 1
            elif tag == 'CD':  # Cardinal number (CD)
                pos_counts['NUM'] += 1
            elif tag == 'CC':  # Coordinating conjunction (CC)
                pos_counts['CCONJ'] += 1
            elif tag == 'RP':  # Particle (RP)
                pos_counts['PART'] += 1
            elif tag == ',':  # Punctuation (comma, period, etc.)
                pos_counts['PUNCT'] += 1
            elif tag == 'SYM':  # Symbol (SYM)
                pos_counts['SYM'] += 1
            elif tag == 'EX':  # Existential there (EX)
                pos_counts['X'] += 1
            else:
                # For any unrecognized POS tags
                pos_counts['X'] += 1

        return pd.Series(pos_counts)

    def text_feature_extraction(self):


        text_column = 'text'
        numerical_columns = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
              'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
        label_columns = 'label'

        preprocessor = ColumnTransformer(
            transformers=[
                ('text', self.tfidf, text_column),  # Apply TF-IDF to the text column
                ('num', StandardScaler(), numerical_columns)  # Apply StandardScaler to numerical columns
            ],
        )

        return preprocessor

    def transform_and_save(self, preprocessor, X_train, X_test):
        # Fit and transform the training data
        X_train_processed = preprocessor.fit_transform(X_train)

        # Transform the test data using the fitted preprocessor
        X_test_processed = preprocessor.transform(X_test)

        # Save X_train
        save_bin(X_train_processed, os.path.join(self.config.root_dir, "X_train.joblib"))

        # Save X_test
        save_bin(X_test_processed, os.path.join(self.config.root_dir, "X_test.joblib"))
        
        logger.info(f"Training set shape after preprocessing: {X_train_processed.shape}")
        logger.info(f"Test set shape after preprocessing: {X_test_processed.shape}")

In [9]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()

    # Step 1: Initialize the DataTransformation class
    data_transformation = DataTransformation(config=data_transformation_config)

    # Step 2: Split the data into train and test
    Tr_X, Te_X = data_transformation.train_test_spliting()

    # Step 3: Preprocess and clean the text using progress_apply
    Tr_X = Tr_X.progress_apply(data_transformation.preprocess)
    Te_X = Te_X.progress_apply(data_transformation.preprocess)


    # Step 4: Extract pos count features
    Tr_pos_counts = Tr_X.progress_apply(data_transformation.pos_count_features)
    Te_pos_counts = Te_X.progress_apply(data_transformation.pos_count_features)

    # Step 4.1 concat preprocessed text and count of pos features
    Tr_X_intermediate = pd.concat([Tr_X, Tr_pos_counts], axis=1)
    Te_X_intermediate = pd.concat([Te_X, Te_pos_counts], axis=1)


    # # Step 5: Create the ColumnTransformer instance
    preprocessor = data_transformation.text_feature_extraction()

    # Step 6: Transform and save the train test independent columns
    data_transformation.transform_and_save(preprocessor, Tr_X_intermediate, Te_X_intermediate)
    
except Exception as e:
    raise e

[2024-11-19 21:17:41,240: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-11-19 21:17:41,248: INFO: common: yaml file: params.yaml loaded successfully]
[2024-11-19 21:17:41,253: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-11-19 21:17:41,259: INFO: common: created directory at: artifacts]
[2024-11-19 21:17:41,264: INFO: common: created directory at: artifacts/data_transformation]
[2024-11-19 21:17:42,077: INFO: 1974438582: Split data into training and test sets]
[2024-11-19 21:17:42,506: INFO: common: binary file saved at: artifacts/data_transformation\y_train.joblib]
[2024-11-19 21:17:42,623: INFO: common: binary file saved at: artifacts/data_transformation\y_test.joblib]


100%|██████████| 333447/333447 [02:43<00:00, 2043.10it/s]
100%|██████████| 83362/83362 [00:33<00:00, 2491.82it/s]
100%|██████████| 333447/333447 [13:10<00:00, 421.76it/s]
100%|██████████| 83362/83362 [03:42<00:00, 374.67it/s]


[2024-11-19 21:38:16,206: INFO: common: binary file saved at: artifacts/data_transformation\X_train.joblib]
[2024-11-19 21:38:16,737: INFO: common: binary file saved at: artifacts/data_transformation\X_test.joblib]
[2024-11-19 21:38:16,739: INFO: 1974438582: Training set shape after preprocessing: (333447, 4017)]
[2024-11-19 21:38:16,741: INFO: 1974438582: Test set shape after preprocessing: (83362, 4017)]
