In [1]:
import os
os.chdir("../")
%pwd

'd:\\AI\\NLP\\HandsOn\\sentiment-analysis'

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataProfilerConfig:
    root_dir: Path
    data_file: Path
    profile_folder: Path
    profile_file: Path
    text_column: str
    sentiment_column: str

In [3]:
from SentiScope.constants import (CONFIG_FILE_PATH,
                                  PARAMS_FILE_PATH)
from SentiScope.utils.file_utils import (create_directories,
                                            get_size)
from SentiScope.utils.config_utils import (read_yaml,
                                           Settings,
                                           get_settings)

In [19]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict
import json

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    def get_latest_report_profiler(self) -> Dict:
        """Locate the latest report.json file based on the timestamp folder."""
        config = self.config.data_ingestion
        profiling_dir = Path(config.root_dir)

        # Get all subdirectories in data_profiling
        timestamp_dirs = [d for d in profiling_dir.iterdir() if d.is_dir()]
        
        if not timestamp_dirs:
            raise FileNotFoundError("No timestamp folders found in data_profiling.")

        # Sort directories by name (assuming timestamp format)
        latest_dir = sorted(timestamp_dirs, key=lambda x: x.name, reverse=True)[0]
        report_path = latest_dir / "ingestion_metadata.json"

        if not report_path.exists():
            raise FileNotFoundError(f"report.json not found in {latest_dir}.")

        # Load the report.json file
        with open(report_path, "r") as f:
            report_data = json.load(f)

        return report_data



    

    def get_data_profiler_config(self) -> DataProfilerConfig:
        config = self.config.data_profileing
        report_data = self.get_latest_report_profiler()
        
        timestamp = report_data["timestamp"]
        # data_file_path = Path(str(config.data_file).format(timestamp=timestamp))
        data_file_path = Path(config.data_file).joinpath(f"{timestamp}","unzipped", "file.csv")
        create_directories([config.root_dir])

        data_profileing_config = DataProfilerConfig(
            root_dir=config.root_dir,
            data_file=data_file_path,
            profile_folder= config.profile_folder,
            profile_file= config.profile_file,
            text_column = config.text_column,
            sentiment_column = config.sentiment_column
        )

        return data_profileing_config

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
from wordcloud import WordCloud
from typing import Dict, List, Optional, Any
import json
import os
import re
import emoji
from datetime import datetime
from pathlib import Path
from SentiScope.logging import logger

class SentimentDataProfiler:
    def __init__(self, config: DataProfilerConfig):
        """
        Initialize the SentimentDataProfiler with a data path and column names.
        
        Parameters:
        data_path (str): Path to the CSV file containing sentiment data
        text_column (str): Name of the column containing text data
        sentiment_column (str, optional): Name of the column containing sentiment labels
        """
        # Convert string path to Path object
        logger.info("Initializing SentimentDataProfiler...")
        self.config = config
        self.path = self.config.data_file
        logger.info(f"Reading CSV file from path: {self.path}")
        self.df = self._read_csv_file(self.path)
        self.text_column = self.config.text_column
        self.sentiment_column = self.config.sentiment_column
        
        # Initialize NLTK components
        try:
            nltk.data.find('tokenizers/punkt')
            nltk.data.find('corpora/stopwords')
            logger.info("Required NLTK data found.")
        except LookupError:
            logger.info("Downloading required NLTK data...")
            nltk.download('punkt')
            nltk.download('stopwords')
            nltk.download('wordnet')
        
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        
        # Create output directory structure
        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.output_dir = Path(os.getcwd()) / self.config.profile_folder / self.timestamp
        self.output_dir.mkdir(parents=True, exist_ok=True)
        (self.output_dir / 'images').mkdir(exist_ok=True)
        logger.info(f"Output directory created at {self.output_dir}")
        
        # Validate and process
        self._validate_columns()
        logger.info("Columns validated successfully.")
        self.df['processed_text'] = self.df[self.text_column].apply(self._preprocess_text)
        logger.info("Text preprocessing completed.")

    def _read_csv_file(self, file_path: Path) -> pd.DataFrame:
        """
        Read and validate the CSV file.
        
        Parameters:
        file_path (Path): Path to the CSV file
        
        Returns:
        pd.DataFrame: The loaded DataFrame
        """
        logger.info(f"Reading CSV file from {file_path}")
        try:
            df = pd.read_csv(file_path)
            logger.info("Successfully read the CSV file.")
            return df
        except FileNotFoundError:
            logger.error(f"Error: File not found at {file_path}")
            raise
        except pd.errors.ParserError:
            logger.error("Error: There might be a parsing issue with the CSV file!")
            try:
                # Attempt to read with more flexible parsing
                df = pd.read_csv(file_path, dtype=str)
                return df
            except Exception as e:
                logger.error(f"Failed to fix parsing errors: {e}")
                raise

    def _validate_columns(self) -> None:
        """Validate that the specified columns exist in the DataFrame."""
        logger.info("Validating columns in the DataFrame.")
        if self.text_column not in self.df.columns:
            logger.error(f"Text column '{self.text_column}' not found in DataFrame.")
            raise ValueError(f"Text column '{self.text_column}' not found in DataFrame")
        if self.sentiment_column and self.sentiment_column not in self.df.columns:
            logger.error(f"Sentiment column '{self.sentiment_column}' not found in DataFrame.")
            raise ValueError(f"Sentiment column '{self.sentiment_column}' not found in DataFrame")

    def _preprocess_text(self, text: str) -> str:
        """Preprocess a single text string by removing links, emojis, converting to lowercase,
        tokenizing, removing stop words, stemming, and concatenating into a single string."""
        if not isinstance(text, str):
            logger.warning("Non-string text encountered during preprocessing.")
            return ""
        
        logger.debug(f"Preprocessing text: {text[:30]}...")
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # Remove emojis using emoji library
        text = emoji.replace_emoji(text, '')
        
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [self.stemmer.stem(token) for token in tokens
                if token.isalnum() and token not in self.stop_words]
        
        # Concatenate tokens into a single string with spaces between them
        processed_text = " ".join(tokens)
        
        logger.debug(f"Processed text: {processed_text[:50]}")
        return processed_text

    def _get_initial_statistics(self) -> Dict[str, Any]:
        """Generate basic statistics about the dataset."""
        logger.info("Generating initial dataset statistics.")
        stats = {
            'total_rows': len(self.df),
            'total_columns': len(self.df.columns),
            'dtypes': {k: str(v) for k, v in self.df.dtypes.to_dict().items()},  # Convert dtypes to strings for JSON
            'missing_values': self.df.isnull().sum().to_dict()
        }

        self.df['text_length'] = self.df[self.text_column].str.len()
        stats['text_length_stats'] = {
            'mean': int(self.df['text_length'].mean()),
            'median': int(self.df['text_length'].median()),
            'min': int(self.df['text_length'].min()),
            'max': int(self.df['text_length'].max())
        }
        logger.info("Initial statistics generated successfully.")
        return stats

    def _analyze_text_features(self) -> Dict[str, Any]:
        """Analyze text features and generate visualizations."""
        # Get all words from processed texts
        logger.info("Analyzing text features.")
        all_words = [word for text in self.df['processed_text'] for word in text]
        word_freq = Counter(all_words)
        logger.info("Text feature analysis completed.")
        vocab_stats = {
            'total_words': len(all_words),
            'unique_words': len(word_freq),
            'average_words_per_text': round(len(all_words) / len(self.df), 2),
            'most_common_words': dict(word_freq.most_common(20))
        }
        
        # Generate and save word cloud
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(all_words))
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Text Data')
        plt.tight_layout()
        plt.savefig(self.output_dir / 'images' / 'wordcloud.png')
        plt.close()
        logger.info(f"Word Cloud saved to: {self.output_dir}/'images'/'wordcloud.png'")
        return vocab_stats

    def _analyze_sentiment_distribution(self) -> Optional[Dict[str, Any]]:
        """Analyze sentiment distribution and generate visualization."""
        
        if not self.sentiment_column:
            logger.info("No sentiment column provided; skipping sentiment analysis.")
            return None
        logger.info("Analyzing sentiment distribution.")
        sentiment_stats = {
            'value_counts': self.df[self.sentiment_column].value_counts().to_dict(),
            'distribution_percentage': {k: round(v, 2) for k, v in 
                (self.df[self.sentiment_column].value_counts(normalize=True) * 100).to_dict().items()}
        }
        
        # Generate and save sentiment distribution plot
        plt.figure(figsize=(8, 6))
        sns.countplot(data=self.df, x=self.df[self.sentiment_column])
        plt.title('Sentiment Distribution')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(self.output_dir / 'images' / 'sentiment_distribution.png')
        plt.close()
        logger.info("Sentiment distribution analysis completed.")
        logger.info(f"Sentiment distribution plot saved to: {self.output_dir} / 'images' / 'sentiment_distribution.png' ")
        return sentiment_stats

    def save_dataframe(self, filename: str = "processed_data.csv") -> str:
        """
        Save the processed DataFrame to the output directory as a CSV file.
        
        Parameters:
        -----------
        filename : str, optional
            The name of the output file. Default is "processed_data.csv".
        
        Returns:
        --------
        str
            Path to the saved CSV file.
        """
        file_path = self.output_dir / filename
        try:
            self.df.to_csv(file_path, index=False)
            logger.info(f"DataFrame successfully saved to {file_path}")
            return str(file_path)
        except Exception as e:
            logger.error(f"Failed to save DataFrame: {e}")
            raise



    def generate_report(self) -> str:
        """
        Generate and save a comprehensive profile report.
        
        Returns:
        str: Path to the generated report directory
        """
        logger.info("Generating profile report.")
        # Generate report components
        report = {
            'timestamp': self.timestamp,
            'dataset_info': {
                'text_column': self.text_column,
                'sentiment_column': self.sentiment_column
            },
            'initial_statistics': self._get_initial_statistics(),
            'text_analysis': self._analyze_text_features()
        }
        
        if self.sentiment_column:
            report['sentiment_analysis'] = self._analyze_sentiment_distribution()
            
        # Save Processed data
        self.save_dataframe()
        
        # Save report as JSON
        report_path = self.output_dir / 'report.json'
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=4, ensure_ascii=False)
        logger.info(f"Report generated successfully at {report_path}.")
        # Generate a README with file descriptions
        readme_content = f"""Data Profile Report
        Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

        Files in this directory:
        1. report.json - Complete analysis report in JSON format
        2. images/wordcloud.png - Word cloud visualization of text data"""

        if self.sentiment_column:
            readme_content += "\n3. images/sentiment_distribution.png - Distribution of sentiment labels"

        with open(self.output_dir / 'README.txt', 'w') as f:
            f.write(readme_content)
        logger.info("README file created successfully.")
        return str(self.output_dir)

In [23]:
try:
    config = ConfigurationManager()
    data_profiler_config = config.get_data_profiler_config()
    profiler = SentimentDataProfiler(config=data_profiler_config)
    report_path = profiler.generate_report()
    logger.info(f"Report generated at: {report_path}")
except Exception as e:
    raise e

[2025-01-16 04:28:04,118: INFO: config_utils: yaml file: config\config.yaml loaded successfully]
[2025-01-16 04:28:04,120: INFO: config_utils: yaml file: params.yaml loaded successfully]
[2025-01-16 04:28:04,121: INFO: file_utils: created directory at: artifacts]
[2025-01-16 04:28:04,122: INFO: file_utils: created directory at: artifacts/data_profileing]
[2025-01-16 04:28:04,122: INFO: 1607497536: Initializing SentimentDataProfiler...]
[2025-01-16 04:28:04,123: INFO: 1607497536: Reading CSV file from path: artifacts\data_ingestion\20250114_001312\unzipped\file.csv]
[2025-01-16 04:28:04,124: INFO: 1607497536: Reading CSV file from artifacts\data_ingestion\20250114_001312\unzipped\file.csv]
[2025-01-16 04:28:04,773: INFO: 1607497536: Successfully read the CSV file.]
[2025-01-16 04:28:04,774: INFO: 1607497536: Required NLTK data found.]
[2025-01-16 04:28:04,776: INFO: 1607497536: Output directory created at d:\AI\NLP\HandsOn\sentiment-analysis\artifacts\data_profileing\20250116_042804]
[2