In [72]:
import os

In [73]:
%pwd

'c:\\Users\\ajay\\Desktop\\myPortfolio\\CommentAnalysis'

In [41]:
os.chdir("../")

In [42]:
%pwd

'c:\\Users\\ajay\\Desktop\\myPortfolio\\CommentAnalysis'

In [80]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    train_file_path: Path
    test_file_path: Path
    DATA_transformation_DIR: Path
    transform_train_file: Path
    transform_test_file: Path
    x_train_file_path: Path
    y_train_file_path: Path
    transformer: Path
    max_features:int
    ngram_range:tuple

In [81]:
from src.CommentAnalysis.constants import *
from src.CommentAnalysis.utils.common import read_yaml, create_directories

In [82]:
config_filepath = read_yaml(CONFIG_FILE_PATH)
filepath = Path(config_filepath.data_Transformation.x_train_file_path)
filedir, filename = os.path.split(filepath)
print(filedir,"----",filename)


[2025-07-08 00:26:52,041: INFO: common: YAML file loaded successfully: config\config.yaml]
artifacts\data_Transformation\train ---- x_train.csv


In [83]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema=read_yaml(SCHEMA_FILE_PATH)
        create_directories([self.config.artifacts_root])
    
    def get_data_transformation_config(self) -> DataTransformationConfig:

        config = self.config.data_Transformation
        config2=self.config.data_validation

        trainfilepath = Path(config.transform_train_file)
        trainfiledir,train_filename = os.path.split(trainfilepath)

        testfilepath = Path(config.transform_test_file)
        testfiledir, filename = os.path.split(testfilepath)
        
        create_directories([config.DATA_transformation_DIR,trainfiledir,testfiledir])
        
        max_features = self.params['model_building']['max_features']
        ngram_range = tuple(self.params['model_building']['ngram_range'])
        data_transformation_config = DataTransformationConfig(
            DATA_transformation_DIR=config.DATA_transformation_DIR,
            train_file_path=config2.train_file_path,
            test_file_path=config2.test_file_path,
            transform_train_file=config.transform_train_file,
            transform_test_file=config.transform_test_file,
            x_train_file_path=config.x_train_file_path,
            y_train_file_path=config.y_train_file_path,
            transformer=config.transformer,
            max_features=max_features,
            ngram_range=ngram_range)

        return data_transformation_config
      

In [None]:
import nltk
import sys
from src.CommentAnalysis import logger
from src.CommentAnalysis.utils.common import read_yaml,preprocess_comment,read_data
from src.CommentAnalysis.constants import SCHEMA_FILE_PATH
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

import numpy as np
import scipy.sparse
nltk.download('wordnet')
import pandas as pd
nltk.download('stopwords')
class DataTransformation:
    def __init__(self,data_transformation_config):
        try:
            self.data_transformation_config=data_transformation_config
            self._schema_config = read_yaml(SCHEMA_FILE_PATH)
        except Exception as e:
            raise Exception(e, sys)
    
 
    def normalize_text(self,df):
        """Apply preprocessing to the text data in the dataframe."""
        try:
            df['clean_comment'] = df['clean_comment'].apply(preprocess_comment)
            df.dropna(inplace=True)
            logger.debug('Text normalization completed')
            return df
        except Exception as e:
            logger.error(f"Error during text normalization: {e}")
            raise
    def apply_tfidf(self,train_data: pd.DataFrame, max_features: int, ngram_range: tuple) -> tuple:
        """Apply TF-IDF with ngrams to the data."""
        try:
            vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)

            X_train = train_data['clean_comment'].values
            y_train = train_data['category'].values

            # Perform TF-IDF transformation
            X_train_tfidf = vectorizer.fit_transform(X_train)
            
            logger.debug(f"TF-IDF transformation complete. Train shape: {X_train_tfidf.shape}")

            # Save the vectorizer in the root directory
            with open(self.data_transformation_config.transformer, 'wb') as f:
                pickle.dump(vectorizer, f)

            logger.debug('TF-IDF applied with trigrams and data transformed')
            return X_train_tfidf, y_train
        except Exception as e:
            logger.error('Error during TF-IDF transformation: %s', e)
            raise


    def initiate_data_Transformation(self):
        logger.debug("Starting data preprocessing...")
        train_data = read_data(self.data_transformation_config.train_file_path)
        test_data = read_data(self.data_transformation_config.test_file_path)
        logger.debug('Data loaded successfully')

        train_processed_data = self.normalize_text(train_data)
        test_processed_data = self.normalize_text(test_data)

        transform_train_path = self.data_transformation_config.transform_train_file
        transform_test_path = self.data_transformation_config.transform_test_file

        train_processed_data.to_csv(transform_train_path, index=False)
        test_processed_data.to_csv(transform_test_path, index=False)

        X_train_tfidf, y_train = self.apply_tfidf(
            train_processed_data,
            max_features=self.data_transformation_config.max_features,
            ngram_range=self.data_transformation_config.ngram_range
        )

        # save X_train_tfidf (csr_matrix) properly
        scipy.sparse.save_npz(self.data_transformation_config.x_train_file_path, X_train_tfidf)
        logger.info(f"X_train TF-IDF matrix saved as sparse npz at: {self.data_transformation_config.x_train_file_path.replace('.csv', '.npz')}")

        # save y_train as csv
        pd.Series(y_train).to_csv(self.data_transformation_config.y_train_file_path, index=False)

        logger.info(f"Transformed Train file saved to: {transform_train_path}")
        logger.info(f"Transform Test file saved to: {transform_test_path}")
        logger.info(f"y Train file saved to: {self.data_transformation_config.y_train_file_path}")
        logger.info("****************************************************")


            




[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ajay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ajay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [108]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config=data_transformation_config)
    data_transformation.initiate_data_Transformation() 
   
except Exception as e:
    raise e

[2025-07-08 00:54:25,739: INFO: common: YAML file loaded successfully: config\config.yaml]
[2025-07-08 00:54:25,748: INFO: common: YAML file loaded successfully: params.yaml]
[2025-07-08 00:54:25,755: INFO: common: YAML file loaded successfully: config\schema.yaml]
[2025-07-08 00:54:25,759: INFO: common: Created directory: artifacts]
[2025-07-08 00:54:25,762: INFO: common: Created directory: artifacts/data_Transformation]
[2025-07-08 00:54:25,766: INFO: common: Created directory: artifacts\data_Transformation\train]
[2025-07-08 00:54:25,769: INFO: common: Created directory: artifacts\data_Transformation\test]
[2025-07-08 00:54:25,776: INFO: common: YAML file loaded successfully: config\schema.yaml]
[2025-07-08 00:54:26,962: ERROR: common: Error in preprocessing comment: 'float' object has no attribute 'lower']
[2025-07-08 00:54:26,969: ERROR: common: Error in preprocessing comment: 'float' object has no attribute 'lower']
[2025-07-08 00:54:27,100: ERROR: common: Error in preprocessing 