In [1]:
import os

In [2]:
%pwd

'd:\\Projects\\ML Projects\\End-to-End Wine Quality\\End-to-End-ML-Project\\research'

In [3]:
# Step up to the project root only while still inside `research/`, so that
# re-running this cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'd:\\Projects\\ML Projects\\End-to-End Wine Quality\\End-to-End-ML-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    and read by ``DataValidation``.
    """
    # Directory for this stage; it is created when the config object is built.
    root_dir: Path
    # Path of the text file that ``DataValidation.validate`` overwrites with its verdict.
    STATUS_FILE: str
    # Location handed to ``pd.read_csv`` to load the data being validated.
    unzip_data_dir: Path
    # Expected schema: keys are compared with the column names, values with the dtype names.
    all_schema: dict

In [6]:
from MLProject.constants import *
from MLProject.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML files and build stage-specific config objects.

    Args:
        config_filepath: YAML file holding ``artifacts_root`` and the
            ``data_validation`` section.
        params_filepath: YAML file with parameters (loaded and stored, not
            otherwise used by this class).
        schema_filepath: YAML file whose ``COLUMNS`` entry describes the
            expected columns.

    Creating an instance also creates the ``artifacts_root`` directory.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Make sure the top-level artifacts directory exists before any stage writes to it.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the config for the data-validation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` whose path fields are ``Path`` objects,
            matching the dataclass annotations.
        """
        stage_cfg = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            # The values come straight from YAML; wrap them so the fields
            # really hold the Path type the dataclass declares.
            root_dir=Path(stage_cfg.root_dir),
            STATUS_FILE=stage_cfg.STATUS_FILE,
            unzip_data_dir=Path(stage_cfg.unzip_data_dir),
            all_schema=expected_columns,
        )

In [25]:
import pandas as pd

class DataValidation:
    """Check a CSV file against the expected column names and dtypes.

    The verdict is written to ``config.STATUS_FILE`` (overwriting it) and is
    also returned from :meth:`validate`.
    """

    # The annotation is quoted so this class can be defined even when the
    # config class is not in scope yet.
    def __init__(self, config: "DataValidationConfig"):
        self.config = config

    def _write_status(self, lines) -> None:
        """Overwrite the status file with ``lines`` (each already newline-terminated)."""
        # Explicit UTF-8: column names may contain non-ASCII characters, which
        # can fail to encode under a legacy locale default.
        with open(self.config.STATUS_FILE, 'w', encoding='utf-8') as f:
            f.writelines(lines)

    def validate(self) -> bool:
        """Validate the CSV at ``config.unzip_data_dir`` against ``config.all_schema``.

        Column names must equal the schema keys, in the same order. If they
        do, each column's pandas dtype name is compared case-insensitively
        with the schema value (spaces in the schema value are ignored).

        Returns:
            True when both checks pass, False otherwise. The reason for a
            failure is written to the status file.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)

        actual_columns = list(data.columns)
        expected_columns = list(self.config.all_schema.keys())

        if actual_columns != expected_columns:
            self._write_status([
                "Validation failed: Columns do not match expected schema.\n",
                f"Expected columns: {expected_columns}\n",
                f"Actual columns: {actual_columns}\n",
            ])
            return False

        # Columns are known to match the schema keys here, so the schema can
        # be walked directly.
        mismatched_dtypes = []
        for col, expected_dtype in self.config.all_schema.items():
            actual_dtype = str(data.dtypes[col])
            # str() guards against a schema value that YAML did not parse as a string.
            if actual_dtype.lower() != str(expected_dtype).lower().replace(" ", ""):
                mismatched_dtypes.append((col, actual_dtype, expected_dtype))

        if mismatched_dtypes:
            report = ["Validation failed: Data types do not match expected schema.\n"]
            report.extend(
                f"Column: {col}, Expected: {expected}, Actual: {actual}\n"
                for col, actual, expected in mismatched_dtypes
            )
            self._write_status(report)
            return False

        self._write_status(["Validation passed: All columns and data types are valid.\n"])
        return True


In [26]:
# Run the data-validation stage end to end. There is no try/except wrapper:
# catching an exception only to `raise e` it again changes nothing but the traceback.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
result = data_validation.validate()  # bool verdict; details are in the status file


[2025-04-25 17:37:06,310: INFO: common: yaml file: config\config.yaml loaded successfully.]
[2025-04-25 17:37:06,310: INFO: common: yaml file: params.yaml loaded successfully.]
[2025-04-25 17:37:06,315: INFO: common: yaml file: schema.yaml loaded successfully.]
[2025-04-25 17:37:06,317: INFO: common: Created directory at: artifacts]
[2025-04-25 17:37:06,317: INFO: common: Created directory at: artifacts/data_validation]
