In [1]:
import os
# Move the working directory one level up; the captured log output further down
# shows relative paths such as "config\config.yaml" and "artifacts" being
# resolved after this change (presumably from the project root — confirm).
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart climbs one more directory level each time.
os.chdir("../")
# IPython line magic: echoes the resulting working directory as the cell output.
%pwd

'd:\\AI\\NLP\\HandsOn\\Text Summarization'

In [2]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

In [3]:
@dataclass(frozen=True)
class DataStandardizationConfig:
    """Immutable settings for the data standardization stage.

    Instances are built by ``ConfigurationManager.get_data_standardization_config``
    from the ``data_standardization`` section of the loaded configuration and
    consumed by ``DataStandardization``. Field order defines the generated
    ``__init__`` signature, so do not reorder the fields.
    """
    # Directory that is joined with each entry of ALL_REQUIRED_FILES to locate the input CSVs.
    input_file_directory: Path
    # Output directory; ConfigurationManager creates it before building this object.
    output_dir: Path
    # Destination handed directly to DataFrame.to_csv by DataStandardization.save_data.
    # NOTE(review): it is not joined with output_dir anywhere in this notebook — confirm it is a full path.
    output_file: Path
    # Input file names: index 0 is read as the main CSV, index 1 as the transcripts CSV.
    ALL_REQUIRED_FILES: list
    # Columns that get a "<column>_standardized" counterpart in process_dataframe.
    text_columns: list
    # Columns kept from the main CSV before it is merged with the transcripts.
    relevant_fields: list
    # Column used to merge main data with transcripts; dropped from the merged frame afterwards.
    merging_key: str
    # Directory passed to setup_nltk_environment and download_nltk_models.
    nltk_dir: Path

In [4]:
from TextSummarizer.constants import *
from TextSummarizer.utils.file_utils import *
from TextSummarizer.utils.config_utils import *
from TextSummarizer.utils.lib_utils import *

In [6]:

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects."""

    # Names copied, in this order, from the ``data_standardization`` config
    # section into ``DataStandardizationConfig``.
    _DATA_STANDARDIZATION_FIELDS = (
        "input_file_directory",
        "output_dir",
        "output_file",
        "ALL_REQUIRED_FILES",
        "text_columns",
        "relevant_fields",
        "merging_key",
        "nltk_dir",
    )

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_standardization_config(self) -> DataStandardizationConfig:
        """Build the configuration object for the data standardization stage.

        Creates the stage's output directory as a side effect.

        Returns:
            A ``DataStandardizationConfig`` populated from the
            ``data_standardization`` section of the loaded configuration.
        """
        section = self.config.data_standardization
        create_directories([section.output_dir])

        field_values = {
            field_name: getattr(section, field_name)
            for field_name in self._DATA_STANDARDIZATION_FIELDS
        }
        return DataStandardizationConfig(**field_values)
    


--------

In [9]:
import os
import pandas as pd
import re
import nltk
from typing import Set, Optional, List, Union
from  TextSummarizer.logging import logger


class DataStandardization:
    """Load, merge, standardize and persist the text data for the summarizer.

    The stage reads two CSV files named in the config, merges them on a shared
    key, adds a standardized copy of every configured text column (cleaning,
    optional stop-word removal, optional lemmatization) and writes the result
    to the configured output file.
    """

    # Lazily filled caches shared by all rows processed by one instance. They are
    # declared at class level so they also exist on instances that were created
    # without running __init__.
    _base_stop_words = None
    _lemmatizer = None

    def __init__(self,
                 config: "DataStandardizationConfig",
                 remove_stops: bool = True,
                 lemmatize: bool = True,
                 custom_stopwords: Optional[Set[str]] = None):
        """Store the settings, derive the input paths and prepare NLTK.

        Args:
            config: Stage configuration (paths, column names, merge key, NLTK dir).
            remove_stops: Whether ``standardize_text`` removes stop words.
            lemmatize: Whether ``standardize_text`` lemmatizes the words.
            custom_stopwords: Extra words to remove in addition to NLTK's
                English stop-word list.

        Raises:
            IndexError: If ``config.ALL_REQUIRED_FILES`` has fewer than two entries.
        """
        logger.info("Initializing DataStandardization with config")
        self.config = config
        self.remove_stops = remove_stops
        self.lemmatize = lemmatize
        self.custom_stopwords = custom_stopwords
        # Entry 0 is the main CSV, entry 1 the transcripts CSV.
        self.main_csv_path = os.path.join(self.config.input_file_directory, self.config.ALL_REQUIRED_FILES[0])
        self.transcripts_csv_path = os.path.join(self.config.input_file_directory, self.config.ALL_REQUIRED_FILES[1])
        self.text_columns = self.config.text_columns
        logger.debug(f"Set up paths - Main CSV: {self.main_csv_path}, Transcripts CSV: {self.transcripts_csv_path}")
        self.setup_nlp_utilities()

    def setup_nlp_utilities(self):
        """Prepare the NLTK environment and models under ``config.nltk_dir``.

        Delegates to the project helpers ``setup_nltk_environment`` and
        ``download_nltk_models``, which are defined outside this notebook.
        """
        logger.info("Setting up NLP utilities")
        setup_nltk_environment(self.config.nltk_dir)
        download_nltk_models(self.config.nltk_dir)
        logger.info("NLP utilities setup completed")

    def clean_text(self, text: str) -> str:
        """Lower-case ``text`` and keep only ASCII letters and whitespace.

        Removed characters are deleted rather than replaced by a space, so
        ``"hello,world"`` becomes ``"helloworld"``.

        Args:
            text: Raw cell value; non-string values are converted with ``str()``.

        Returns:
            The cleaned text with whitespace runs collapsed to single spaces,
            or ``""`` when ``text`` is NA according to ``pd.isna``.
        """
        logger.debug("Starting text cleaning process")
        if pd.isna(text):
            logger.warning("Received NA value for text cleaning")
            return ""

        # Convert to lowercase and string type
        text = str(text).lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        logger.debug("Text cleaning completed")
        return text

    def _load_base_stop_words(self):
        """Return NLTK's English stop words as a frozenset, loading them only once.

        The corpus is downloaded when NLTK reports it as missing.
        """
        if self._base_stop_words is None:
            from nltk.corpus import stopwords
            try:
                words = stopwords.words('english')
            except LookupError:
                logger.info("Downloading stopwords...")
                nltk.download('stopwords')
                words = stopwords.words('english')
            # Cache on the instance: reading the corpus for every row is wasted work.
            self._base_stop_words = frozenset(words)
        return self._base_stop_words

    def remove_stopwords(self, text: str, custom_stopwords: Optional[Set[str]] = None) -> str:
        """Drop English stop words (plus optional custom ones) from ``text``.

        Custom stop words are applied regardless of whether the NLTK corpus was
        already available or had to be downloaded first.

        Args:
            text: Whitespace-separated text, normally the output of ``clean_text``.
            custom_stopwords: Additional words to remove.

        Returns:
            The remaining words joined by single spaces; ``""`` for empty input.
        """
        logger.debug("Starting stopwords removal")
        if not text:
            logger.warning("Received empty text for stopwords removal")
            return ""

        stop_words = self._load_base_stop_words()
        if custom_stopwords:
            # Build a new set so the cached base list is never mutated.
            stop_words = stop_words | set(custom_stopwords)
            logger.debug(f"Added {len(custom_stopwords)} custom stopwords")

        words = text.split()
        filtered_words = [word for word in words if word not in stop_words]
        logger.debug(f"Removed {len(words) - len(filtered_words)} stopwords")
        return ' '.join(filtered_words)

    def _get_lemmatizer(self):
        """Return a cached ``WordNetLemmatizer`` whose corpus is known to be loadable.

        NLTK loads WordNet lazily on the first ``lemmatize`` call, not when the
        lemmatizer is constructed, so a probe call is used to surface a missing
        corpus here, where it can be downloaded and retried.
        """
        if self._lemmatizer is None:
            from nltk.stem import WordNetLemmatizer
            lemmatizer = WordNetLemmatizer()
            try:
                lemmatizer.lemmatize("probe")
            except LookupError:
                logger.info("Downloading required models for lemmatization...")
                nltk.download('wordnet')
                lemmatizer.lemmatize("probe")
            self._lemmatizer = lemmatizer
        return self._lemmatizer

    def lemmatize_text(self, text: str) -> str:
        """Lemmatize every whitespace-separated word of ``text``.

        Args:
            text: Whitespace-separated text.

        Returns:
            The lemmatized words joined by single spaces; ``""`` for empty input.
        """
        logger.debug("Starting text lemmatization")
        if not text:
            logger.warning("Received empty text for lemmatization")
            return ""

        lemmatizer = self._get_lemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text.split()]
        logger.debug("Lemmatization completed")
        return ' '.join(lemmatized_words)

    def standardize_text(self, text: str) -> str:
        """Run the full per-value pipeline: clean, then optionally filter and lemmatize.

        Args:
            text: Raw cell value.

        Returns:
            The standardized text (``""`` for NA input).
        """
        logger.debug("Starting text standardization")
        # Clean the text first
        text = self.clean_text(text)

        # Remove stopwords if requested
        if self.remove_stops:
            text = self.remove_stopwords(text, self.custom_stopwords)

        # Lemmatize if requested
        if self.lemmatize:
            text = self.lemmatize_text(text)

        logger.debug("Text standardization completed")
        return text

    def process_dataframe(self,
                          df: pd.DataFrame,
                          suffix: str = '_standardized') -> pd.DataFrame:
        """Add a standardized copy of every configured text column.

        Args:
            df: Input frame; it is not modified.
            suffix: Appended to each source column name to form the new column name.

        Returns:
            A copy of ``df`` with one extra ``<column><suffix>`` column per text column.

        Raises:
            ValueError: If a configured text column is missing from ``df``. The
                check runs before any column is processed, so no work is wasted.
        """
        logger.info(f"Processing DataFrame with {len(df)} rows")

        # Fail fast: validate every column before doing any expensive processing.
        for column in self.text_columns:
            if column not in df.columns:
                logger.error(f"Column '{column}' not found in DataFrame")
                raise ValueError(f"Column '{column}' not found in DataFrame")

        # Create a copy of the DataFrame to avoid modifying the original
        result_df = df.copy()

        # Process each text column
        for column in self.text_columns:
            new_column = f"{column}{suffix}"
            logger.info(f"Processing column: {column} -> {new_column}")

            # Apply standardization to the column
            result_df[new_column] = df[column].apply(self.standardize_text)
            logger.debug(f"Completed processing column: {column}")

        logger.info("DataFrame processing completed")
        return result_df

    def load_and_prepare_data(self) -> pd.DataFrame:
        """Read both input CSVs and merge them into one frame.

        The main CSV is reduced to ``config.relevant_fields`` and then merged
        with the transcripts on ``config.merging_key`` using an inner join, so
        rows without a match on both sides are dropped. The key column itself
        is removed from the result.

        Returns:
            The merged frame without the merge key column.
        """
        logger.info("Starting data loading and preparation")
        # Load main data
        logger.debug(f"Loading main data from {self.main_csv_path}")
        df_main = pd.read_csv(self.main_csv_path)
        logger.info(f"Loaded main data with {len(df_main)} rows")

        # Select relevant fields
        relevant_fields = self.config.relevant_fields
        df_main = df_main[relevant_fields]
        logger.debug(f"Selected {len(relevant_fields)} relevant fields")

        # Load transcripts
        logger.debug(f"Loading transcripts from {self.transcripts_csv_path}")
        df_transcripts = pd.read_csv(self.transcripts_csv_path)
        logger.info(f"Loaded transcripts with {len(df_transcripts)} rows")

        # Merge dataframes ("inner" is pandas' default, spelled out for the reader)
        merging_key = self.config.merging_key
        logger.debug(f"Merging dataframes on key: {merging_key}")
        merged_df = df_main.merge(df_transcripts, on=merging_key, how="inner").drop(merging_key, axis=1)
        logger.info(f"Merged DataFrame has {len(merged_df)} rows")

        return merged_df

    def save_data(self, data):
        """Write ``data`` as CSV (without the index) to ``config.output_file``.

        Args:
            data: Frame to persist; must provide ``to_csv``.

        Raises:
            Exception: Any error raised while writing is logged and re-raised.
        """
        logger.info(f"Saving standardized data to {self.config.output_file}")
        try:
            # Save the standardized data
            data.to_csv(self.config.output_file, index=False)
            logger.info("Data successfully saved")
        except Exception as e:
            logger.error(f"Error saving data: {str(e)}")
            raise

In [10]:
# Run the data standardization stage end to end: build the config, load and
# merge the CSVs, standardize the text columns, then persist the result.
try:
    standardization_config = ConfigurationManager().get_data_standardization_config()
    standardizer = DataStandardization(config=standardization_config, remove_stops=True,  lemmatize=True)
    df = standardizer.load_and_prepare_data()
    processed_df = standardizer.process_dataframe(df)
    standardizer.save_data(processed_df)
except Exception as e:
    # Log the failure, then re-raise with a bare `raise` so the original
    # traceback is propagated unchanged (no extra `raise e` frame).
    logger.error(f"Data standardization stage failed: {str(e)}")
    raise

[2024-12-08 02:28:09,447: INFO: config_utils: yaml file: config\config.yaml loaded successfully]
[2024-12-08 02:28:09,452: INFO: config_utils: yaml file: params.yaml loaded successfully]
[2024-12-08 02:28:09,453: INFO: file_utils: created directory at: artifacts]
[2024-12-08 02:28:09,454: INFO: file_utils: created directory at: artifacts/data_standardization]
[2024-12-08 02:28:09,455: INFO: 1274383872: Initializing DataStandardization with config]
[2024-12-08 02:28:09,456: INFO: 1274383872: Setting up NLP utilities]


[2024-12-08 02:28:11,746: INFO: lib_utils: Successfully downloaded 'punkt' to artifacts/models]
[2024-12-08 02:28:11,816: INFO: lib_utils: Successfully downloaded 'stopwords' to artifacts/models]
[2024-12-08 02:28:11,979: INFO: lib_utils: Successfully downloaded 'averaged_perceptron_tagger' to artifacts/models]
[2024-12-08 02:28:12,408: INFO: lib_utils: Successfully downloaded 'wordnet' to artifacts/models]
[2024-12-08 02:28:12,545: INFO: lib_utils: Successfully downloaded 'words' to artifacts/models]
[2024-12-08 02:28:12,545: INFO: 1274383872: NLP utilities setup completed]
[2024-12-08 02:28:12,546: INFO: 1274383872: Starting data loading and preparation]
[2024-12-08 02:28:12,915: INFO: 1274383872: Loaded main data with 2550 rows]
[2024-12-08 02:28:13,250: INFO: 1274383872: Loaded transcripts with 2467 rows]
[2024-12-08 02:28:13,254: INFO: 1274383872: Merged DataFrame has 2467 rows]
[2024-12-08 02:28:13,255: INFO: 1274383872: Processing DataFrame with 2467 rows]
[2024-12-08 02:28:13,2