In [1]:
import os
# Step up one directory from wherever the kernel was started; the cell output
# shows the resulting working directory.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves up one more level each time.
os.chdir("../")
# IPython magic: echo the current working directory as the cell output.
%pwd

'd:\\AI\\NLP\\HandsOn\\sentiment-analysis'

In [2]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Union, Optional

@dataclass(frozen=True)
class FeatureTransformConfig:
    """Immutable settings for the feature-transformation stage.

    Instances are built by ``ConfigurationManager.get_feature_transform_config``
    and consumed by ``FeatureTransformer``. The class is frozen, so fields
    cannot be reassigned after construction.
    """
    # Base output directory; FeatureTransformer writes into a timestamped
    # sub-folder created beneath it.
    root_dir: Path
    # Full path of the CSV that FeatureTransformer loads with pandas.
    data_file_path: Path
    # File name of that CSV (the last component used when building data_file_path).
    data_file: Path
    # Passed through from the config; not read by the code in this notebook.
    features_dir: Path
    # Name of the DataFrame column holding the raw text to vectorize.
    text_column: str
    # Name of the label column; a falsy value disables stratification and
    # label encoding.
    sentiment_column: str
    # Forwarded to sklearn's train_test_split as ``train_size``.
    train_size: float
    # Forwarded to sklearn's train_test_split as ``random_state``.
    random_state: int
    vectorizer_type: str  # 'tfidf', 'bow', or 'word2vec'
    # Vocabulary cap and n-gram span; only used for the 'tfidf' and 'bow' vectorizers.
    max_features: int
    ngram_range: Tuple[int, int]
    # Optional Word2Vec settings; the keys read are 'vector_size', 'window',
    # 'min_count' and 'workers'.
    word2vec_params: Optional[Dict] = None

In [3]:
from SentiScope.constants import (CONFIG_FILE_PATH,
                                  PARAMS_FILE_PATH)
from SentiScope.utils.file_utils import (create_directories,
                                            get_size)
from SentiScope.utils.config_utils import (read_yaml,
                                           Settings,
                                           get_settings)

In [4]:
import json
class ConfigurationManager:
    """Load the project YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Read the config and params YAML files and create the artifacts root.

        Parameters:
        config_filepath: Path of the main configuration YAML file
        params_filepath: Path of the parameters YAML file
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_latest_report(self) -> Dict:
        """
        Load report.json from the newest data-profiling run folder.

        The newest run is the sub-directory whose *name* sorts last, which
        assumes the folders are named with sortable timestamps.

        Returns:
        Dict: Parsed contents of report.json

        Raises:
        FileNotFoundError: If there are no run folders, or the newest one
            has no report.json
        """
        # NOTE(review): "data_profileing" is the attribute name as read from the
        # loaded config; presumably it mirrors the YAML key, so it must not be
        # "corrected" here without changing the config file as well.
        config = self.config.data_profileing
        profiling_dir = Path(config.root_dir)

        timestamp_dirs = [d for d in profiling_dir.iterdir() if d.is_dir()]
        if not timestamp_dirs:
            raise FileNotFoundError("No timestamp folders found in data_profiling.")

        # Directory names are unique, so max() picks the same folder a
        # descending sort would, without sorting the whole list.
        latest_dir = max(timestamp_dirs, key=lambda d: d.name)
        report_path = latest_dir / "report.json"

        if not report_path.exists():
            raise FileNotFoundError(f"report.json not found in {latest_dir}.")

        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        with open(report_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def get_feature_transform_config(self) -> FeatureTransformConfig:
        """
        Build the configuration for the feature-transformation stage.

        The input CSV is located under the folder named after the timestamp
        recorded in the latest data-profiling report.

        Returns:
        FeatureTransformConfig: Settings for FeatureTransformer

        Raises:
        KeyError: If the latest report has no "timestamp" entry
        """
        config = self.config.feature_transformation
        report_data = self.get_latest_report()

        create_directories([config.root_dir])

        if "timestamp" not in report_data:
            raise KeyError("'timestamp' entry missing from the latest data-profiling report")
        timestamp = report_data["timestamp"]

        # <data_file_path>/<timestamp>/<data_file>
        data_file_path = Path(config.data_file_path).joinpath(f"{timestamp}", config.data_file)

        return FeatureTransformConfig(
            root_dir=config.root_dir,
            data_file=config.data_file,
            data_file_path=data_file_path,
            features_dir=config.features_dir,
            text_column=config.text_column,
            sentiment_column=config.sentiment_column,
            train_size=config.train_size,
            random_state=config.random_state,
            vectorizer_type=config.vectorizer_type,
            max_features=config.max_features,
            ngram_range=tuple(config.ngram_range),
            # word2vec settings are optional in the config
            word2vec_params=getattr(config, 'word2vec_params', None)
        )

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
import joblib
import json
from datetime import datetime
from SentiScope.logging import logger

In [6]:
class FeatureTransformer:
    """
    Split a text dataset and convert it into numerical feature arrays.

    The pipeline (see ``transform_and_save``) splits the loaded CSV into
    train/test sets, vectorizes the text column with TF-IDF, Bag-of-Words or
    averaged Word2Vec embeddings, label-encodes the sentiment column, and
    writes the arrays, fitted transformers and a metadata file into a
    timestamped folder under ``config.root_dir``.
    """

    def __init__(self, config: "FeatureTransformConfig"):
        """
        Initialize the FeatureTransformer with configuration settings.

        Loads the input CSV and creates the timestamped output directory.

        Parameters:
        config (FeatureTransformConfig): Configuration object containing transformation parameters
        """
        logger.info("Initializing FeatureTransformer...")
        self.config = config
        self.path = self.config.data_file_path
        self.df = pd.read_csv(self.path)

        # Create output directories
        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.output_dir = Path(self.config.root_dir) / self.timestamp
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Initialize encoders and vectorizers.
        # ``vectorizer`` holds the type name until _initialize_vectorizer()
        # replaces it with a fitted-able object (it stays a string for word2vec).
        self.label_encoder = LabelEncoder()
        self.vectorizer = self.config.vectorizer_type
        self.word2vec_model = None

        logger.info("FeatureTransformer initialized successfully.")

    def _split_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split the data into training and testing sets.

        The split is stratified on the sentiment column when one is configured
        and every class has at least two rows; otherwise it is unstratified.

        Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: (train_df, test_df)
        """
        logger.info("Splitting data into train and test sets...")

        stratify = None
        if self.config.sentiment_column:
            labels = self.df[self.config.sentiment_column]
            # A class with a single row cannot be placed in both splits.
            if labels.value_counts().min() < 2:
                logger.warning("Found class(es) with less than 2 samples. Disabling stratification.")
            else:
                stratify = labels

        train_df, test_df = train_test_split(
            self.df,
            train_size=self.config.train_size,
            random_state=self.config.random_state,
            stratify=stratify
        )

        logger.info(f"Train set size: {len(train_df)}, Test set size: {len(test_df)}")
        return train_df, test_df

    def _initialize_vectorizer(self):
        """
        Initialize the appropriate vectorizer based on configuration.

        Raises:
        ValueError: If ``config.vectorizer_type`` is not 'tfidf', 'bow' or 'word2vec'
        """
        logger.info(f"Initializing {self.config.vectorizer_type} vectorizer...")
        if self.config.vectorizer_type == 'tfidf':
            self.vectorizer = TfidfVectorizer(
                max_features=self.config.max_features,
                ngram_range=self.config.ngram_range
            )
        elif self.config.vectorizer_type == 'bow':
            self.vectorizer = CountVectorizer(
                max_features=self.config.max_features,
                ngram_range=self.config.ngram_range
            )
        elif self.config.vectorizer_type == 'word2vec':
            # Word2Vec will be initialized during transformation
            pass
        else:
            raise ValueError(f"Unsupported vectorizer type: {self.config.vectorizer_type}")

    def _prepare_text(self, frame: pd.DataFrame) -> pd.Series:
        """
        Return the configured text column as strings.

        Missing values (e.g. empty CSV cells read back as NaN) are replaced by
        an empty string so that tokenizing and vectorizing do not fail on them.
        """
        column = frame[self.config.text_column]
        missing = int(column.isna().sum())
        if missing:
            logger.warning(
                f"{missing} missing value(s) in '{self.config.text_column}' treated as empty text."
            )
        return column.fillna("").astype(str)

    @staticmethod
    def _mean_word_vector(tokens, word_vectors, vector_size: int) -> np.ndarray:
        """
        Average the vectors of the tokens that ``word_vectors`` knows.

        Parameters:
        tokens: Iterable of token strings for one document
        word_vectors: Mapping-like object supporting ``token in word_vectors``
            and ``word_vectors[token]``
        vector_size (int): Length of the vector returned when no token is known

        Returns:
        np.ndarray: Mean vector, or an all-zero vector of length ``vector_size``
            when the document has no known tokens
        """
        known = [word_vectors[token] for token in tokens if token in word_vectors]
        if not known:
            # np.mean of an empty list is a NaN scalar, which would make the
            # feature matrix ragged; use a fixed-size zero vector instead.
            return np.zeros(vector_size, dtype=np.float32)
        return np.mean(known, axis=0)

    def _transform_text_features(self, train_df: pd.DataFrame, test_df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """
        Transform text data into numerical features using the specified method.

        The vectorizer / Word2Vec model is fitted on the training text only and
        then applied to the test text; the fitted object is saved to the
        output directory.

        Returns:
        Tuple: (X_train, X_test)

        Raises:
        ValueError: If ``config.vectorizer_type`` is not supported
        """
        logger.info("Transforming text features...")

        train_text = self._prepare_text(train_df)
        test_text = self._prepare_text(test_df)

        if self.config.vectorizer_type in ['tfidf', 'bow']:
            # Transform using TF-IDF or Bag-of-Words
            X_train = self.vectorizer.fit_transform(train_text)
            X_test = self.vectorizer.transform(test_text)

            # Save vectorizer
            joblib.dump(self.vectorizer, self.output_dir / f'{self.config.vectorizer_type}_vectorizer.joblib')

        elif self.config.vectorizer_type == 'word2vec':
            # word2vec_params is optional; fall back to the defaults below.
            params = self.config.word2vec_params or {}

            # Whitespace-tokenize once and reuse for training and averaging.
            train_tokens = [document.split() for document in train_text]
            test_tokens = [document.split() for document in test_text]

            self.word2vec_model = Word2Vec(
                sentences=train_tokens,
                vector_size=params.get('vector_size', 100),
                window=params.get('window', 5),
                min_count=params.get('min_count', 1),
                workers=params.get('workers', 4)
            )

            # Transform texts to vectors by averaging word vectors
            word_vectors = self.word2vec_model.wv
            size = word_vectors.vector_size
            X_train = np.array([
                self._mean_word_vector(tokens, word_vectors, size) for tokens in train_tokens
            ])
            X_test = np.array([
                self._mean_word_vector(tokens, word_vectors, size) for tokens in test_tokens
            ])

            # Save Word2Vec model
            self.word2vec_model.save(str(self.output_dir / 'word2vec_model.model'))

        else:
            raise ValueError(f"Unsupported vectorizer type: {self.config.vectorizer_type}")

        logger.info("Text feature transformation completed.")
        return X_train, X_test

    def _transform_labels(self, train_df: pd.DataFrame, test_df: pd.DataFrame) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        """
        Transform labels using LabelEncoder.

        The encoder is fitted on the training labels, applied to the test
        labels, and saved to the output directory.

        Returns:
        Tuple: (y_train, y_test), or (None, None) when no sentiment column is configured
        """
        if not self.config.sentiment_column:
            logger.info("No sentiment column specified. Skipping label transformation.")
            return None, None

        logger.info("Transforming labels...")
        y_train = self.label_encoder.fit_transform(train_df[self.config.sentiment_column])
        y_test = self.label_encoder.transform(test_df[self.config.sentiment_column])

        # Save label encoder
        joblib.dump(self.label_encoder, self.output_dir / 'label_encoder.joblib')

        return y_train, y_test

    def transform_and_save(self) -> str:
        """
        Execute the complete transformation pipeline and save results.

        Returns:
        str: Path to the output directory

        Raises:
        Exception: Any error from the pipeline is logged and re-raised unchanged
        """
        try:
            logger.info("Starting feature transformation pipeline...")

            # Split data
            train_df, test_df = self._split_data()

            # Initialize vectorizer
            self._initialize_vectorizer()

            # Transform features
            X_train, X_test = self._transform_text_features(train_df, test_df)

            # Transform labels
            y_train, y_test = self._transform_labels(train_df, test_df)

            # Save transformed data
            # NOTE(review): for 'tfidf'/'bow' scikit-learn returns a SciPy sparse
            # matrix, which np.save stores as a pickled object array, so readers
            # need np.load(..., allow_pickle=True). Consider scipy.sparse.save_npz
            # if the downstream loaders can be changed at the same time.
            np.save(self.output_dir / 'X_train.npy', X_train)
            np.save(self.output_dir / 'X_test.npy', X_test)
            if y_train is not None and y_test is not None:
                np.save(self.output_dir / 'y_train.npy', y_train)
                np.save(self.output_dir / 'y_test.npy', y_test)

            # Save configuration and metadata
            metadata = {
                'timestamp': self.timestamp,
                'config': {
                    'vectorizer_type': self.config.vectorizer_type,
                    'max_features': self.config.max_features,
                    'ngram_range': self.config.ngram_range,
                    'train_size': self.config.train_size,
                    'random_state': self.config.random_state
                },
                'data_shapes': {
                    'X_train': X_train.shape,
                    'X_test': X_test.shape,
                    'y_train': y_train.shape if y_train is not None else None,
                    'y_test': y_test.shape if y_test is not None else None
                }
            }

            with open(self.output_dir / 'metadata.json', 'w') as f:
                json.dump(metadata, f, indent=4)

            logger.info(f"Feature transformation completed. Results saved to: {self.output_dir}")
            return str(self.output_dir)

        except Exception as e:
            logger.error(f"Error during feature transformation: {str(e)}")
            raise


In [7]:
# Run the feature-transformation stage end to end: load the configuration,
# build the transformer (which reads the input CSV), then transform and save.
# Any exception propagates as-is so the notebook shows the original traceback;
# a try/except that only re-raises would add nothing.
config = ConfigurationManager()
feature_transform_config = config.get_feature_transform_config()
transformer = FeatureTransformer(config=feature_transform_config)
output_path = transformer.transform_and_save()
logger.info(f"Feature transformation completed. Output saved at: {output_path}")

[2025-01-09 18:48:23,156: INFO: config_utils: yaml file: config\config.yaml loaded successfully]
[2025-01-09 18:48:23,158: INFO: config_utils: yaml file: params.yaml loaded successfully]
[2025-01-09 18:48:23,159: INFO: file_utils: created directory at: artifacts]
[2025-01-09 18:48:23,161: INFO: file_utils: created directory at: artifacts/feature_transformation]
[2025-01-09 18:48:23,162: INFO: 3257807759: Initializing FeatureTransformer...]
[2025-01-09 18:48:24,125: INFO: 3257807759: FeatureTransformer initialized successfully.]
[2025-01-09 18:48:24,126: INFO: 3257807759: Starting feature transformation pipeline...]
[2025-01-09 18:48:24,126: INFO: 3257807759: Splitting data into train and test sets...]
[2025-01-09 18:48:24,316: INFO: 3257807759: Train set size: 175435, Test set size: 43859]
[2025-01-09 18:48:24,317: INFO: 3257807759: Initializing bow vectorizer...]
[2025-01-09 18:48:24,317: INFO: 3257807759: Transforming text features...]
[2025-01-09 18:48:35,695: INFO: 3257807759: Text