In [1]:
import os

# Move from the notebook's directory up to the project root so the relative
# "config/..." paths and the "src." package imports used below resolve.
# The step is guarded by the presence of the project's config file, so
# re-running this cell in the same kernel does not keep climbing the tree.
if not os.path.isfile(os.path.join("config", "config.yaml")):
    os.chdir("../")

In [2]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml, create_directories
from ensure import ensure_annotations
from pathlib import Path
import os
import logging
from dataclasses import dataclass
from typing import List
from datasets import DatasetDict
import pandas as pd 
logger = logging.getLogger(__name__)

[2025-07-03 22:06:24,854: INFO: config: PyTorch version 2.7.1+cu128 available.]
[2025-07-03 22:06:24,858: INFO: config: JAX version 0.4.30 available.]


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
@dataclass(frozen=True)
class DataValidationConfig:
    """Settings for the data validation stage.

    Frozen: attributes cannot be reassigned after construction. Instances are
    built by ``ConfigurationManager.get_data_validation_config`` from the
    ``data_validation`` section of the YAML config.
    """
    # Output directory of the validation stage (created by its consumers).
    root_dir: Path
    # Directory in which the required dataset files are looked up.
    dataset_dir: Path
    # Text file that receives the PASS/FAIL validation results.
    STATUS_FILE: Path
    # File names that must be present under dataset_dir.
    REQUIRED_FILES: list[str]
    # Column names that every required file must contain.
    DATA_FORMATS: list[str]

In [4]:
class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage config objects."""

    def __init__(
            self,
            config_filepath: str = "config/config.yaml",
            params_filepath: str = "config/params.yaml"):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        # read_yaml is handed Path objects, so wrap the incoming strings first.
        config_path = Path(config_filepath)
        self.config = read_yaml(config_path)

        params_path = Path(params_filepath)
        self.params = read_yaml(params_path)

        # Every stage writes below the artifacts root, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` from the ``data_validation`` section.

        The stage's root directory is created as a side effect.

        Returns:
            A ``DataValidationConfig`` whose path-like entries are wrapped in
            ``Path``; the two list entries are passed through as loaded.
        """
        section = self.config.data_validation

        # The stage directory must exist before anything is written into it.
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=Path(section.root_dir),
            dataset_dir=Path(section.dataset_dir),
            STATUS_FILE=Path(section.STATUS_FILE),
            REQUIRED_FILES=section.REQUIRED_FILES,
            DATA_FORMATS=section.DATA_FORMATS,
        )


In [5]:
class DataValidation:
    """Checks that the required dataset files exist and contain the required columns.

    Results are recorded in the status file named by ``config.STATUS_FILE``:
    ``validate_files_exist`` overwrites the file, while
    ``validate_file_formats`` and ``run_validation`` append to it.
    """

    def __init__(self, config: DataValidationConfig):
        """Store the config and make sure the stage's root directory exists.

        Args:
            config: Directories, status-file path, required file names and
                required column names for this validation run.
        """
        self.config = config
        create_directories([self.config.root_dir])
        self.status_file = Path(self.config.STATUS_FILE)
        logger.info("Initialized DataValidation with config: %s", self.config)

    def _write_status(self, mode: str, lines: list[str]) -> None:
        """Write ``lines`` to the status file opened with ``mode`` ('w' or 'a')."""
        with open(self.status_file, mode) as f:
            f.writelines(lines)

    @ensure_annotations
    def validate_files_exist(self) -> bool:
        """Check that every required file is present in the dataset directory.

        Overwrites the status file with a ``FILE_EXISTENCE_VALIDATION`` line
        and, on failure, a ``MISSING_FILES`` line listing the missing paths.

        Returns:
            True when no required file is missing, False otherwise.

        Raises:
            Exception: Any unexpected error is logged and re-raised.
        """
        try:
            dataset_dir = Path(self.config.dataset_dir)
            # Log the directory that is actually scanned.
            logger.info("Validating file existence in: %s", dataset_dir)

            missing_files = []
            for file in self.config.REQUIRED_FILES:
                file_path = dataset_dir / file
                # is_file() instead of exists(): a directory that happens to
                # carry the expected name is not a usable data file.
                if not file_path.is_file():
                    missing_files.append(str(file_path))
                    logger.warning("File missing: %s", file_path)

            validation_status = not missing_files

            status_lines = [
                f"FILE_EXISTENCE_VALIDATION: {'PASS' if validation_status else 'FAIL'}\n"
            ]
            if missing_files:
                status_lines.append(f"MISSING_FILES: {','.join(missing_files)}\n")
            self._write_status('w', status_lines)

            return validation_status

        except Exception as e:
            logger.error("Error during file existence validation: %s", str(e))
            raise

    @ensure_annotations
    def validate_file_formats(self) -> bool:
        """Check that every required file contains all required columns.

        Each file is loaded with pandas (CSV or JSON, chosen by extension) and
        its columns are compared against ``config.DATA_FORMATS``. A file that
        has an unsupported extension, cannot be read, or lacks a column is
        recorded as invalid. Appends a ``FORMAT_VALIDATION`` line and, on
        failure, an ``INVALID_FILES`` line to the status file.

        Returns:
            True when every file passed, False otherwise.

        Raises:
            Exception: Errors outside the per-file handling (for example when
                writing the status file) are logged and re-raised.
        """
        invalid_files = []

        try:
            logger.info("Validating file formats")

            for file in self.config.REQUIRED_FILES:
                file_path = Path(self.config.dataset_dir) / file

                try:
                    # Match the extension case-insensitively so names such as
                    # "train.CSV" are treated like "train.csv".
                    lowered_name = file.lower()
                    if lowered_name.endswith('.csv'):
                        df = pd.read_csv(file_path)
                    elif lowered_name.endswith('.json'):
                        df = pd.read_json(file_path)
                    else:
                        logger.error("Unsupported file format for file: %s", file)
                        invalid_files.append(f"{file} (error: Unsupported file format)")
                        continue

                    missing_cols = [col for col in self.config.DATA_FORMATS if col not in df.columns]

                    if missing_cols:
                        invalid_files.append(f"{file} (missing: {','.join(missing_cols)})")
                        logger.warning("Missing columns in %s: %s", file, missing_cols)

                except Exception as e:
                    # A file that cannot be read is reported as invalid rather
                    # than aborting the checks for the remaining files.
                    invalid_files.append(f"{file} (error: {str(e)})")
                    logger.error("Error reading %s: %s", file, str(e))

            validation_status = not invalid_files

            status_lines = [
                f"FORMAT_VALIDATION: {'PASS' if validation_status else 'FAIL'}\n"
            ]
            if invalid_files:
                status_lines.append(f"INVALID_FILES: {'; '.join(invalid_files)}\n")
            self._write_status('a', status_lines)

            return validation_status

        except Exception as e:
            logger.error("Error during format validation: %s", str(e))
            raise

    def run_validation(self) -> bool:
        """Run the existence check, then the format check, and record the result.

        The format check is skipped when the existence check fails. An
        ``OVERALL_VALIDATION`` line is appended to the status file.

        Returns:
            True only when both checks passed.
        """
        logger.info("Starting comprehensive data validation")

        # Short-circuit: formats are only inspected once all files are present.
        final_status = self.validate_files_exist() and self.validate_file_formats()

        self._write_status(
            'a',
            [f"OVERALL_VALIDATION: {'PASS' if final_status else 'FAIL'}\n"],
        )

        logger.info("Validation completed. Final status: %s", "PASS" if final_status else "FAIL")
        return final_status


In [6]:
# Load config/config.yaml and config/params.yaml (the constructor defaults)
# and create the artifacts root directory.
config_manager = ConfigurationManager()

# Build the validation-stage settings; this also creates the stage's root directory.
validation_config = config_manager.get_data_validation_config()

# Set up the validator that writes its results to the configured status file.
validator = DataValidation(validation_config)

# Run the existence and format checks; the returned bool is shown as the cell output.
validator.run_validation()

[2025-07-03 22:06:25,235: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-03 22:06:25,236: INFO: common: yaml file: config\params.yaml loaded successfully]
[2025-07-03 22:06:25,238: INFO: common: created directory at: artifacts]
[2025-07-03 22:06:25,239: INFO: common: created directory at: artifacts/data_validation]
[2025-07-03 22:06:25,240: INFO: common: created directory at: artifacts\data_validation]
[2025-07-03 22:06:25,241: INFO: 3208243731: Initialized DataValidation with config: DataValidationConfig(root_dir=WindowsPath('artifacts/data_validation'), dataset_dir=WindowsPath('artifacts/data_ingestion/cnn_dailymail_data'), STATUS_FILE=WindowsPath('artifacts/data_validation/status.txt'), REQUIRED_FILES=BoxList(['train.csv', 'validation.csv', 'test.csv']), DATA_FORMATS=BoxList(['article', 'highlights']))]
[2025-07-03 22:06:25,242: INFO: 3208243731: Starting comprehensive data validation]
[2025-07-03 22:06:25,242: INFO: 3208243731: Validating file existence i

True