In [73]:
import os
# Move the working directory up one level so the relative paths used by later
# cells resolve from there (the echoed output below shows where this landed).
# NOTE(review): re-running this cell moves up another level each time.
os.chdir('../')
# IPython magic: print the current working directory to confirm the change.
%pwd

'D:\\Python\\Industry level\\NLP'

In [74]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    from the ``data_validation`` section of the project configuration.
    """
    # Directory handed to create_directories() before this config is built.
    root_dir: Path
    # Path of the text file DataValidation.validate() writes its report to.
    status_file: str
    # CSV file DataValidation.validate() loads and checks.
    # NOTE(review): "datadata_file" looks like a typo for "data_file", but the
    # name must keep matching the configuration key read by ConfigurationManager.
    datadata_file: Path

In [75]:
from sentiment_analysis.constants import *
from sentiment_analysis.utils.common import read_yaml,create_directories

In [76]:
class ConfigurationManager:
    """Load the project configuration and build per-stage config objects.

    The constructor reads the configuration through ``read_yaml`` and passes
    the configured ``artifacts_root`` to ``create_directories``; each
    ``get_*_config`` method then extracts one section of that configuration.
    """

    def __init__(
            self,
            config_filepath = CONFIG_FILEPATH
    ):
        """Read the configuration file.

        Args:
            config_filepath: Location of the YAML configuration; defaults to
                the project-wide ``CONFIG_FILEPATH`` constant.
        """
        self.config = read_yaml(config_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Passes the stage's ``root_dir`` to ``create_directories`` before
        returning, so the directory is expected to exist afterwards.

        Returns:
            A ``DataValidationConfig`` populated from the ``data_validation``
            section of the loaded configuration.
        """
        section = self.config.data_validation

        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=Path(section.root_dir),
            status_file=section.status_file,
            # Wrapped in Path so the value matches the field's declared type,
            # consistent with how root_dir is handled above.
            datadata_file=Path(section.datadata_file),
        )

In [77]:
import pandas as pd
from sentiment_analysis.logger.logger_setup import logger
from sentiment_analysis.exception.customException import CustomException
import sys

In [78]:
class DataValidation:
    """Sanity-check the raw dataset CSV and record the outcome in a status file."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage settings.

        Args:
            config: Object exposing ``datadata_file`` (CSV to check) and
                ``status_file`` (report destination). The annotation is a
                string so the class does not depend on definition order.
        """
        self.config = config

    def validate(self):
        """Validate the dataset and write a short report to the status file.

        Hard failures (missing file, no data, missing required column) raise;
        missing values and duplicate rows are only logged and reported.

        Returns:
            True when every hard check passes.

        Raises:
            FileNotFoundError: The configured data file does not exist.
            ValueError: The file holds no data, or a required column
                (``review`` / ``sentiment``) is absent.
        """
        data_path = self.config.datadata_file
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"Data file not found: {data_path}")

        try:
            df = pd.read_csv(data_path)
        except pd.errors.EmptyDataError as err:
            # A zero-byte file makes pandas raise before df.empty can be
            # tested; report it with the same message as a header-only file.
            raise ValueError("Data file is empty") from err

        if df.empty:
            raise ValueError("Data file is empty")

        for col in ("review", "sentiment"):
            if col not in df.columns:
                raise ValueError(f"Missing required column: {col}")

        # Soft checks run once, after the schema check, rather than once per
        # required column as the previous in-loop placement caused.
        # Total number of null cells (previously a bool compared against 0).
        missing_values = int(df.isnull().sum().sum())
        if missing_values > 0:
            logger.info("Warning: Missing values found!")

        dup_count = int(df.duplicated().sum())
        if dup_count > 0:
            logger.info(f"Warning: {dup_count} duplicate rows found!")

        with open(self.config.status_file, "w", encoding="utf-8") as f:
            f.write("Data validation: PASSED\n")
            f.write(f"Missing values: {missing_values}\n")
            # Rows are only counted here, never dropped, so report them as
            # found rather than removed.
            f.write(f"Duplicates found: {dup_count}\n")

        logger.info("✅ Data validation passed.")
        return True
        

In [79]:
# Switch into the project checkout before running the stage below.
# NOTE(review): this is a hard-coded absolute Windows path, so the cell only
# works on the machine that has this exact directory.
os.chdir(r'D:\Python\Industry level\NLP\NLP-IMDB-sentiment-analysis-End-to-end')
# IPython magic: print the current working directory to confirm the change.
%pwd


'D:\\Python\\Industry level\\NLP\\NLP-IMDB-sentiment-analysis-End-to-end'

In [80]:
# Run the data-validation stage: load the configuration, build the stage
# settings, and validate the dataset.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config = data_validation_config)
    data_validation.validate()
except Exception as e:
    # validate() raises FileNotFoundError / ValueError, not CustomException,
    # so catching only CustomException let those failures bypass the wrapper.
    # Chaining with "from e" keeps the original traceback attached.
    raise CustomException(e, sys) from e