In [1]:
import os

In [2]:
os.chdir("../")

In [3]:
%pwd

'/Users/dhyaneshanchula/Documents/Text-Summary-Generator'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    ALL_REQUIRED_FILES: list
    ALL_REQUIRED_COLUMNS: list

In [5]:
from textSummarizer.constant import *
from textSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root]) 
    
    def get_data_validation_config(self) -> DataValidationConfig:
        config = self.config.data_validation

        os.makedirs(os.path.dirname(config.STATUS_FILE), exist_ok=True)

        data_validation_config = DataValidationConfig(
            root_dir=Path(config.root_dir),
            STATUS_FILE=config.STATUS_FILE,
            ALL_REQUIRED_FILES=config.ALL_REQUIRED_FILES,
            ALL_REQUIRED_COLUMNS=config.ALL_REQUIRED_COLUMNS,
        )

        return data_validation_config

In [7]:
import os
import pandas as pd
from textSummarizer.logging import logger

In [8]:
class DataValidation:
    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_all_files_exist(self) -> bool:
        try:
            all_files = os.listdir(self.config.root_dir)
            
            # Check if all required files exist
            status = all(file in all_files for file in self.config.ALL_REQUIRED_FILES)
            
            # Write status once
            with open(self.config.STATUS_FILE, "w") as f:
                f.write(f"File Existence Validation status: {status}\n")
            logger.info(f"Files Existence Validated with status {status}")
            
            return status
        except Exception as e:
            raise e
    
    def validate_all_columns(self) -> bool:
        try:
            all_files = os.listdir(self.config.root_dir)
            csv_files = [file for file in all_files if file.endswith('.csv')]
            
            if not csv_files:
                logger.warning("No CSV files found in the directory")
                return False
            
            all_columns_valid = True
            for csv_file in csv_files:
                file_path = os.path.join(self.config.root_dir, csv_file)
                df = pd.read_csv(file_path)
                
                missing_columns = [col for col in self.config.ALL_REQUIRED_COLUMNS if col not in df.columns]
                
                if missing_columns:
                    logger.error(f"File {csv_file} is missing columns: {missing_columns}")
                    all_columns_valid = False
                else:
                    logger.info(f"File {csv_file} contains all required columns")
            
            # Append status to file
            with open(self.config.STATUS_FILE, "a") as f:
                f.write(f"Column Validation status: {all_columns_valid}\n")
            
            return all_columns_valid
        except Exception as e:
            raise e
    
    def validate_missing_values(self) -> bool:
        try:
            all_files = os.listdir(self.config.root_dir)
            csv_files = [file for file in all_files if file.endswith('.csv')]
            
            if not csv_files:
                logger.warning("No CSV files found in the directory")
                return False
            
            no_missing_values = True
            for csv_file in csv_files:
                file_path = os.path.join(self.config.root_dir, csv_file)
                df = pd.read_csv(file_path)
                
                missing_values = df.isnull().sum()
                
                if missing_values.sum() > 0:
                    logger.warning(f"File {csv_file} contains missing values:\n{missing_values[missing_values > 0]}")
                    no_missing_values = False
                else:
                    logger.info(f"File {csv_file} contains no missing values")
            
            # Append status to file
            with open(self.config.STATUS_FILE, "a") as f:
                f.write(f"Missing Values Validation status: {no_missing_values}\n")
            
            return no_missing_values
        except Exception as e:
            raise e

In [9]:
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_all_files_exist()
    data_validation.validate_all_columns()
    data_validation.validate_missing_values()
except Exception as e:
    raise e

[2026-01-31 12:59:01,372: INFO: common: yaml file: config/config.yaml loaded successfully]
[2026-01-31 12:59:01,375: INFO: common: yaml file: params.yaml loaded successfully]
[2026-01-31 12:59:01,376: INFO: common: Created directory at: artifacts]
[2026-01-31 12:59:01,376: INFO: 3656818977: Files Existence Validated with status True]
[2026-01-31 12:59:01,434: INFO: 3656818977: File validation.csv contains all required columns]
[2026-01-31 12:59:01,478: INFO: 3656818977: File test.csv contains all required columns]
[2026-01-31 12:59:01,588: INFO: 3656818977: File train.csv contains all required columns]
[2026-01-31 12:59:01,597: INFO: 3656818977: File validation.csv contains no missing values]
[2026-01-31 12:59:01,605: INFO: 3656818977: File test.csv contains no missing values]
[2026-01-31 12:59:01,700: INFO: 3656818977: File train.csv contains no missing values]
