In [1]:
import os
# Move the working directory one level up from wherever the kernel started,
# presumably so the relative paths used below (the YAML config files and the
# "artifacts" tree) resolve from the project root -- TODO confirm.
# NOTE(review): the target is relative to the *current* directory, so running
# this cell again without restarting the kernel climbs one more level.
os.chdir("../")
%pwd

'd:\\AI\\NLP\\HandsOn\\Text Summarization'

In [2]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage.

    Attributes:
        root_dir: Directory belonging to the validation stage; it is created
            by ``ConfigurationManager.get_data_validation_config``.
        STATUS_FILE: Path of the report file that ``DataValidation`` truncates
            on construction and appends JSON status objects to.
        ALL_REQUIRED_FILES: Names of the CSV files the validation looks for;
            each name is joined onto ``unzip_dir`` before being read.
        required_columns: Column names that are looked up in the headers of
            the required files.
        unzip_dir: Directory the required files are read from.
    """

    root_dir: Path
    STATUS_FILE: str
    # Element type spelled out to match ``required_columns`` below.
    ALL_REQUIRED_FILES: List[str]
    required_columns: List[str]
    unzip_dir: Path

In [3]:
from TextSummarizer.constants import *
from TextSummarizer.utils.file_utils import *
from TextSummarizer.utils.config_utils import *

In [4]:

class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        # read_yaml's result is accessed by attribute below, so it is
        # presumably an attribute-style mapping -- verify in config_utils.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` populated from the ``data_validation``
            section of the loaded configuration.
        """
        section = self.config.data_validation

        create_directories([section.root_dir])

        return DataValidationConfig(
            # The dataclass declares these two fields as Path; wrap the raw
            # configuration values so the declared type is what callers get.
            root_dir=Path(section.root_dir),
            STATUS_FILE=section.STATUS_FILE,
            ALL_REQUIRED_FILES=section.ALL_REQUIRED_FILES,
            required_columns=section.required_columns,
            unzip_dir=Path(section.unzip_dir),
        )

In [5]:
import os
from TextSummarizer.logging import logger
import pandas as pd
from datetime import datetime
import json

In [8]:
class DataValidation:
    """Checks that the ingested data has the files and columns that are required.

    Each check returns a status dict and also appends it to
    ``config.STATUS_FILE`` as an indented JSON object followed by a newline.
    The status file is therefore a sequence of JSON objects (one per check
    that ran), not a single JSON document.
    """

    def __init__(self, config: "DataValidationConfig"):
        """Store the configuration and reset the status file.

        Args:
            config: Settings providing ``STATUS_FILE``, ``ALL_REQUIRED_FILES``,
                ``required_columns`` and ``unzip_dir``.

        Raises:
            OSError: If the status file cannot be opened for writing.
        """
        self.config = config
        # Truncate (or create) the status file so that reports written by this
        # instance are not mixed with those of an earlier run.
        with open(self.config.STATUS_FILE, "w"):
            pass

    def _append_status(self, status: dict) -> None:
        """Append one status report to the status file as indented JSON."""
        with open(self.config.STATUS_FILE, "a") as f:
            json.dump(status, f, indent=4)
            f.write("\n")

    def validate_all_files_exist(self) -> dict:
        """Check that every required file is present in ``config.unzip_dir``.

        The lookup uses the same directory the column check reads from, so
        both checks agree on where the data lives. A missing directory is
        reported as "no file found" instead of raising.

        Returns:
            A dict with keys ``validation_passed`` (bool), ``required_files``
            (list of names), ``files_found`` (name -> bool) and ``timestamp``
            (ISO-8601 string from ``datetime.now()``).
        """
        data_dir = self.config.unzip_dir
        found_files = {
            required_file: os.path.isfile(os.path.join(data_dir, required_file))
            for required_file in self.config.ALL_REQUIRED_FILES
        }

        status = {
            "validation_passed": all(found_files.values()),
            "required_files": list(self.config.ALL_REQUIRED_FILES),
            "files_found": found_files,
            "timestamp": datetime.now().isoformat(),
        }
        logger.info(f"Validated file existence. Status: {status['validation_passed']}")
        self._append_status(status)
        return status

    def validate_all_required_fields_exist(self) -> dict:
        """Check that each required column appears in the required CSV files.

        A column counts as present when at least one required file has it;
        ``columns_found_in`` records exactly which files those are.
        NOTE(review): if every file is expected to carry every column, the
        pass condition needs tightening -- confirm the intended rule.

        Returns:
            A dict with keys ``validation_passed`` (bool), ``required_columns``
            (list of names), ``columns_found_in`` (column -> list of file
            names), ``missing_columns`` (column -> empty list, for columns
            found in no file) and ``timestamp`` (ISO-8601 string).

        Raises:
            FileNotFoundError: If a required file is absent from
                ``config.unzip_dir`` (raised by ``pd.read_csv``).
        """
        required_columns = list(self.config.required_columns)
        found_in_files = {col: [] for col in required_columns}

        for file in self.config.ALL_REQUIRED_FILES:
            file_path = os.path.join(self.config.unzip_dir, file)
            # Only the header row is needed to learn the column names, so do
            # not parse the data rows.
            header = pd.read_csv(file_path, nrows=0)
            for col in required_columns:
                if col in header.columns:
                    found_in_files[col].append(file)

        missing_columns = {
            col: files for col, files in found_in_files.items() if not files
        }

        status = {
            "validation_passed": len(missing_columns) == 0,
            "required_columns": required_columns,
            "columns_found_in": found_in_files,
            "missing_columns": missing_columns,
            "timestamp": datetime.now().isoformat(),
        }
        logger.info(f"Validated required fields existence. Status: {status['validation_passed']}")
        self._append_status(status)
        return status

    def validate(self) -> bool:
        """Run the file check, then the column check if the files are present.

        Returns:
            True only when both checks pass. The column check is skipped when
            a required file is missing, since it would fail to read that file.
        """
        files_status = self.validate_all_files_exist()
        if not files_status["validation_passed"]:
            return False

        fields_status = self.validate_all_required_fields_exist()
        return fields_status["validation_passed"]

In [9]:
# Run the data-validation stage: build its configuration, then check the
# required files and, if they are all present, their required columns.
# Errors are left to propagate unchanged; wrapping the calls in a handler
# that only re-raises added nothing.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
# validate() skips the column check when a file is missing, so that case ends
# as a failed validation rather than an error from reading a missing CSV.
validation_passed = data_validation.validate()

[2024-12-06 20:11:35,644: INFO: config_utils: yaml file: config\config.yaml loaded successfully]
[2024-12-06 20:11:35,646: INFO: config_utils: yaml file: params.yaml loaded successfully]
[2024-12-06 20:11:35,647: INFO: file_utils: created directory at: artifacts]
[2024-12-06 20:11:35,648: INFO: file_utils: created directory at: artifacts/data_validation]
[2024-12-06 20:11:35,649: INFO: 3474559529: Validated file existence. Status: True]
[2024-12-06 20:11:36,085: INFO: 3474559529: Validated required fields existence. Status: True]
