In [38]:
import os
# When the kernel was started from a directory whose path ends in "notebooks",
# move the working directory up one level (presumably to the project root so
# the `src` package imports below resolve — confirm against the repo layout).
path = os.getcwd()
launched_from_notebooks = path.endswith("notebooks")

if launched_from_notebooks:
    os.chdir("../")

In [39]:
# import dependencies
import pandas as pd 
from pathlib import Path
from src.Home_Premium_Prediction.utils import read_yaml, create_directories
from src.Home_Premium_Prediction.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH

In [41]:
class DataValidationConfig:
    """Plain container for the settings used by the data-validation step.

    Attributes:
        data_validation_dir: Directory associated with the validation step.
        train_data_path: Location of the CSV that gets validated.
        status_file: File that the validation result is written to.
        schema: Mapping of column name -> expected dtype string.
    """

    def __init__(
        self,
        data_validation_dir: Path,
        train_data_path: str,
        status_file: Path,
        schema: dict,
    ):
        # Values are stored exactly as given; no conversion or checking here.
        self.schema = schema
        self.status_file = status_file
        self.train_data_path = train_data_path
        self.data_validation_dir = data_validation_dir


class DataValidationConfigManager:
    """Reads the config and schema files and builds a DataValidationConfig."""

    def __init__(self, config_file=CONFIG_FILE_PATH, schema_file=SCHEMA_FILE_PATH):
        """Load both files through read_yaml.

        Args:
            config_file: Path handed to read_yaml for the pipeline config.
            schema_file: Path handed to read_yaml for the schema.
        """
        # Despite the names, these attributes hold whatever read_yaml returns
        # (indexed like a mapping below), not the file paths themselves.
        self.config_file = read_yaml(config_file)
        self.schema_file = read_yaml(schema_file)

    def get_data_validation_config(self) -> DataValidationConfig:
        """Create the validation directory and return the step's settings.

        Returns:
            A DataValidationConfig filled from the 'data_validation' section
            of the config and the 'COLUMNS' section of the schema.
        """
        settings = self.config_file['data_validation']
        output_dir = settings['data_validation_dir']

        # Make sure the output directory exists before handing out the config.
        create_directories([output_dir])

        schema = self.schema_file['COLUMNS']
        return DataValidationConfig(
            data_validation_dir=output_dir,
            train_data_path=settings['train_data_path'],
            status_file=settings['status_file'],
            schema=schema,
        )


class DataValidation:
    """Checks the training CSV against the expected column schema."""

    def __init__(self, config: DataValidationConfig):
        self.config = config

    def get_data_validation(self) -> bool:
        """Validate the training data and record the outcome.

        The CSV at ``config.train_data_path`` passes when every column named
        in ``config.schema`` is present and the string form of its pandas
        dtype equals the dtype given in the schema. The result is written to
        ``config.status_file`` (overwriting it) and also returned.

        Returns:
            True when all schema columns exist with matching dtypes,
            otherwise False.

        Raises:
            Any exception raised while reading the CSV, walking the schema or
            writing the status file propagates unchanged.
        """
        frame = pd.read_csv(self.config.train_data_path)
        schema = self.config.schema
        present = frame.columns

        # Schema columns that the CSV does not contain.
        missing = [name for name in schema if name not in present]

        # Columns that exist but whose dtype differs from the schema; absent
        # columns are already accounted for by `missing`.
        wrong_dtype = [
            name
            for name, wanted in schema.items()
            if name in present and str(frame[name].dtype) != wanted
        ]

        validation_status = not missing and not wrong_dtype

        # Persist the outcome for whoever inspects the status file later.
        with open(self.config.status_file, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status



if __name__ == "__main__":
    # Build the step configuration from the default config/schema locations.
    config_manager = DataValidationConfigManager()
    validation_config = config_manager.get_data_validation_config()

    # Run the validation and report the boolean outcome.
    validator = DataValidation(validation_config)
    status = validator.get_data_validation()
    print(f"Data validation passed? {status}")

created directory at: artifacts/data_validation
Data validation passed? True


In [None]:
# data validation done