In [1]:
import os

# Move from the notebook's folder up to the project root so that the relative
# paths used below ("artifacts/...") and the `src` package resolve.
# The guard keeps the cell idempotent: an unconditional chdir("../") climbs one
# directory further every time the cell is re-run.
# NOTE(review): assumes `src/` exists only at the project root and not next to
# this notebook — confirm against the repository layout.
if not os.path.isdir("src"):
    os.chdir("../")

# Pure-Python replacement for the shell-only `!pwd`.
os.getcwd()

/home/aditya/MLOPS/E2E-Data-Science-Project


In [2]:
import pandas as pd

# Load the ingested wine-quality CSV; the path is relative to the current
# working directory set in the first cell.
data = pd.read_csv("artifacts/data_ingestion/winequality-red.csv")

In [4]:
# Show the dtype pandas inferred for each column.
data.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [7]:
# Count missing values per column.
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [8]:
# (rows, columns) of the loaded frame.
data.shape

(1599, 12)

In [15]:
from pathlib import Path
from pydantic import BaseModel

class DataValidationConfig(BaseModel):
    """Settings consumed by the data-validation step."""

    # Directory for this stage's outputs; created before the config is built.
    root_dir: Path
    # Path of the text file the validation result is written to.
    STATUS_FILE: str
    # Path of the CSV file that is validated (despite the "dir" in the name,
    # it is passed straight to pandas.read_csv).
    unzip_data_dir: Path
    # Expected columns; only the keys are compared against the CSV header.
    all_schema: dict

In [16]:
from src.DataScienceWorkflow.constants import *
from src.DataScienceWorkflow.utils.common import read_yaml,create_directories
from box import Box

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the config, params and schema YAML files.

        The parsed contents are stored on ``self.config``, ``self.params`` and
        ``self.schema`` (in that loading order), after which the directory
        named by ``config.artifacts_root`` is created.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` filled from the ``data_validation``
            section of the config and the ``COLUMNS`` section of the schema.
        """
        stage_settings = self.config.data_validation
        expected_columns = self.schema.COLUMNS

        create_directories([stage_settings.root_dir])

        return DataValidationConfig(
            root_dir=stage_settings.root_dir,
            STATUS_FILE=stage_settings.STATUS_FILE,
            unzip_data_dir=stage_settings.unzip_data_dir,
            all_schema=expected_columns,
        )

In [None]:
# Load the YAML settings with the default file paths, then build the
# data-validation config (this also creates the stage's output directory).
cm = ConfigurationManager()
data_validation_config = cm.get_data_validation_config()

[2025-03-29 16:26:30,759: INFO: common : Yaml File: config/config.yaml loaded successfully]
[2025-03-29 16:26:30,760: INFO: common : Yaml File: params.yaml loaded successfully]
[2025-03-29 16:26:30,761: INFO: common : Yaml File: schema.yaml loaded successfully]
[2025-03-29 16:26:30,761: INFO: common : created directory at: artifacts]
[2025-03-29 16:26:30,762: INFO: common : created directory at: artifacts/data_validation]


In [33]:
from src.DataScienceWorkflow import logger

class DataValiadtion:
    """Checks that a CSV file has exactly the columns named in the schema.

    The class name is misspelled; it is kept because existing callers use it.
    The correctly spelled alias ``DataValidation`` is defined right after the
    class body.
    """

    # The annotation is a string so this cell can be defined before (or
    # without) the cell that declares DataValidationConfig.
    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object exposing ``unzip_data_dir`` (path of the CSV to
                check), ``all_schema`` (mapping whose keys are the expected
                column names) and ``STATUS_FILE`` (path of the report file).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the CSV header with the schema and record the outcome.

        The status file is overwritten with a ``Validation status: <bool>``
        line, followed by the missing and/or extra column names (sorted, so
        the report is identical from run to run) when there are any.

        Returns:
            True when the CSV columns and the schema keys are the same set,
            False otherwise.

        Raises:
            Any exception raised while reading the CSV or writing the status
            file propagates unchanged.
        """
        # Only the header is needed, so skip parsing the data rows.
        header = pd.read_csv(self.config.unzip_data_dir, nrows=0)
        actual_cols = set(header.columns)
        expected_cols = set(self.config.all_schema.keys())

        # key=str keeps sorting safe if schema keys are not all strings.
        missing_cols = sorted(expected_cols - actual_cols, key=str)
        extra_cols = sorted(actual_cols - expected_cols, key=str)

        validation_status = not (missing_cols or extra_cols)

        # Explicit encoding so the report does not depend on the platform default.
        with open(self.config.STATUS_FILE, 'w', encoding="utf-8") as f:
            f.write(f"Validation status: {validation_status}\n")
            if missing_cols:
                f.write(f"Missing Columns: {missing_cols}\n")
            if extra_cols:
                f.write(f"Extra Columns: {extra_cols}\n")

        return validation_status


# Correctly spelled alias; the original name stays available for existing code.
DataValidation = DataValiadtion


In [35]:
# Run the column check; the boolean result is shown as the cell output and is
# also written to the configured status file.
dv = DataValiadtion(data_validation_config)
dv.validate_all_columns()

True