In [1]:
import os

In [2]:
%pwd

'f:\\OneDrive - MSFT\\wine_quality_ML\\Machine-Learning-End-to-End-Project\\research'

In [3]:
# Move from the notebook's research/ folder up to the project root so that the
# `src` package and the relative config paths resolve. The guard keeps the cell
# safe to re-run: an unconditional chdir would climb one level higher on every
# execution.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'f:\\OneDrive - MSFT\\wine_quality_ML\\Machine-Learning-End-to-End-Project'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation step."""
    # Directory created for this step's outputs.
    root_dir: Path
    # Path of the text file the validation status is written to.
    STATUS_FILE: str
    # Path handed to pd.read_csv, i.e. the CSV file to validate.
    unzip_data_dir: Path
    # Mapping whose keys are the column names the data may contain.
    all_schema: dict


In [6]:
# configuration Manager
from src.MLProject_WineQT.constants.const import *
from src.MLProject_WineQT.utils.common import read_yaml, create_directories
import pandas as pd

In [7]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Attributes:
        config: Parsed main configuration file.
        params: Parsed parameters file.
        schema: Parsed schema file.

    The loaded objects are accessed by attribute (e.g. ``config.artifacts_root``),
    so ``read_yaml`` must return something that supports that -- presumably a
    ConfigBox; confirm in ``utils.common``.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Debugging aid: echo everything that was loaded.
        print(f"Loaded config: {self.config}")
        print(f"Loaded params: {self.params}")
        print(f"Loaded schema: {self.schema}")

        # Make sure the top-level artifacts directory exists.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation step.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            A DataValidationConfig filled from the ``data_validation`` section
            of the config and the ``COLUMNS`` section of the schema.
        """
        config = self.config.data_validation
        schema = self.schema.COLUMNS

        # Make sure the output directory of this step exists.
        create_directories([config.root_dir])

        # root_dir and unzip_data_dir are declared as Path on the dataclass, so
        # convert the raw config strings instead of passing them through as str.
        return DataValidationConfig(
            root_dir=Path(config.root_dir),
            STATUS_FILE=config.STATUS_FILE,
            unzip_data_dir=Path(config.unzip_data_dir),
            all_schema=schema,
        )
     

In [8]:
import os
import pandas as pd

class DataValidation:
    """Check that the ingested CSV only contains columns declared in the schema."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the settings used by the validation step.

        Args:
            config: Provides ``unzip_data_dir`` (the CSV to read), ``all_schema``
                (mapping keyed by allowed column name) and ``STATUS_FILE``
                (file the outcome is written to).
        """
        self.config = config

    def validation_all_columns(self) -> bool:
        """Validate every column of the CSV against the schema keys.

        The status is True only when *all* columns of the data appear in the
        schema. The result is written to the status file once, after every
        column has been checked, so an unknown column can no longer be masked
        by a valid column that happens to come after it.

        Returns:
            True if no column is missing from the schema, False otherwise.

        Raises:
            Whatever ``pd.read_csv`` or ``open`` raise, e.g. when a file cannot
            be read or written; errors are propagated unchanged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        all_schema = self.config.all_schema.keys()

        # Columns present in the data but not declared in the schema.
        unexpected_cols = [col for col in data.columns if col not in all_schema]
        validation_status = not unexpected_cols

        with open(self.config.STATUS_FILE, 'w') as file:
            file.write(f'data validation status {validation_status}')

        return validation_status
                    
           

In [9]:
try:
    # Load the YAML settings, derive the validation settings from them, and
    # run the column check on the ingested CSV.
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validation_all_columns()
except Exception:
    # Nothing is handled here; the error is passed straight on to the notebook.
    raise


[2024-10-05 03:59:37,097: INFO: common: yaml file:config\config.yaml load successfuly]
[2024-10-05 03:59:37,101: INFO: common: yaml file:params.yaml load successfuly]
[2024-10-05 03:59:37,112: INFO: common: yaml file:schema.yaml load successfuly]
Loaded config: {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/AliTheAnalyst01/Machine-Learning-End-to-End-Project/raw/refs/heads/main/WineQT.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion/unzipped_data'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'STATUS_FILE': 'artifacts/data_validation/status.txt', 'unzip_data_dir': 'artifacts/data_ingestion/unzipped_data/WineQT.csv'}}
Loaded params: {'key': 'val'}
Loaded schema: {'COLUMNS': {'fixed acidity': 'float64', 'volatile acidity': 'float64', 'citric acid': 'float64', 'residual sugar': 'float64', 'chlorides': 'float64', 'free sulfur dioxide': 'float64'

In [12]:
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
import os
import yaml  # Assuming you're using PyYAML for reading YAML files

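# File-path constants (CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH) are not defined in this cell;
# presumably they come from the earlier `constants.const` star import. Placeholder overrides, left commented out: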
#CONFIG_FILE_PATH = 'path/to/config.yaml'  # Adjust the path as necessary
#PARAMS_FILE_PATH = 'path/to/params.yaml'  # Adjust the path as necessary
#SCHEMA_FILE_PATH = 'path/to/schema.yaml'  # Adjust the path as necessary

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation step.

    Re-declares the class from the earlier cell with ``all_schema`` placed
    before ``unzip_data_dir``; the code in this cell builds it by keyword.
    """
    # Directory created for this step's outputs.
    root_dir: Path
    # Path of the text file the per-column validation report is written to.
    STATUS_FILE: str
    # Mapping whose keys are the column names the data may contain.
    all_schema: dict
    # Path handed to pd.read_csv, i.e. the CSV file to validate.
    unzip_data_dir: Path

def read_yaml(file_path: str):
    """Read a YAML file and return its parsed content.

    Args:
        file_path: Location of the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document (a dict for a
        mapping document; None for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on Windows)
    # would garble any non-ASCII text in the file.
    with open(file_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

def create_directories(dirs: list):
    """Ensure that every path in ``dirs`` exists as a directory.

    Missing parent directories are created as well, and directories that are
    already present are left untouched.

    Args:
        dirs: Directory paths to create.
    """
    for target in map(Path, dirs):
        target.mkdir(parents=True, exist_ok=True)

class ConfigurationManager:
    """Load the YAML settings files and hand out per-stage config objects.

    The parsed files are used as plain mappings (``self.config[...]``).
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Parse the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        # Files are read in the order config, params, schema.
        self.config, self.params, self.schema = (
            read_yaml(source)
            for source in (config_filepath, params_filepath, schema_filepath)
        )

        # Debugging aid: show what was loaded.
        print(f"Loaded config: {self.config}")
        print(f"Loaded params: {self.params}")
        print(f"Loaded schema: {self.schema}")

        artifacts_root = self.config['artifacts_root']
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the settings for the data-validation step.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            A DataValidationConfig built from the ``data_validation`` section of
            the config and the ``COLUMNS`` section of the schema.

        Raises:
            ValueError: If the schema file has no ``COLUMNS`` entry.
        """
        validation_section = self.config['data_validation']

        columns_schema = self.schema.get('COLUMNS')
        print(f"Loaded schema for validation: {columns_schema}")
        if columns_schema is None:
            raise ValueError("Schema (COLUMNS) is not defined in the schema file.")

        validation_root = validation_section['root_dir']
        create_directories([validation_root])

        return DataValidationConfig(
            root_dir=Path(validation_root),
            STATUS_FILE=validation_section['STATUS_FILE'],
            unzip_data_dir=Path(validation_section['unzip_data_dir']),
            all_schema=columns_schema,
        )

class DataValidation:
    """Validate the columns of the ingested CSV against the schema and write a report."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the settings used by the validation step.

        Args:
            config: Provides ``unzip_data_dir`` (the CSV to read), ``all_schema``
                (mapping keyed by allowed column name) and ``STATUS_FILE``
                (file the per-column report is written to).
        """
        self.config = config

    def validation_all_columns(self) -> bool:
        """Check every CSV column against the schema and write a per-column report.

        The report holds one "passed"/"failed" line per column, in column
        order, and is written once per run. Each run therefore replaces the
        previous report instead of appending to it, and a failing column no
        longer erases the lines recorded before it.

        Returns:
            True if every column of the data appears in the schema, else False.

        Raises:
            FileNotFoundError: If the CSV path does not exist.
            Exception: Any other error from reading the CSV or writing the
                report; the message is printed and the error re-raised.
        """
        try:
            if not os.path.exists(self.config.unzip_data_dir):
                raise FileNotFoundError(f"File {self.config.unzip_data_dir} does not exist.")

            data = pd.read_csv(self.config.unzip_data_dir)
            all_schema = self.config.all_schema.keys()

            validation_status = True
            report_lines = []
            for col in data.columns:
                if col in all_schema:
                    report_lines.append(f"Validation passed for column: {col}\n")
                else:
                    validation_status = False
                    report_lines.append(f"Validation failed for column: {col}\n")

            # Single write in 'w' mode: the file always reflects exactly this run.
            with open(self.config.STATUS_FILE, 'w') as f:
                f.writelines(report_lines)

            return validation_status
        except Exception as e:
            print(f"An error occurred during validation: {e}")
            raise

# Example usage: run the validation step end to end. Any exception is caught and
# reported as a one-line message, so a failure here does not stop the notebook.
try:
    config = ConfigurationManager()  # Initialize configuration manager
    data_validation_config = config.get_data_validation_config()  # Fetch data validation config
    data_validation = DataValidation(config=data_validation_config)  # Create DataValidation instance
    data_validation.validation_all_columns()  # Perform validation
except Exception as e:
    print(f"An error occurred: {e}")


An error occurred: [Errno 2] No such file or directory: 'path/to/config.yaml'
