In [1]:
import os
# Step one directory up from the notebook's location; later cells use paths
# such as "artifacts/..." that are relative to the new working directory.
# NOTE(review): "../" is relative to the *current* directory, so re-running
# this cell in the same kernel climbs one more level each time — confirm it
# is only executed once per session.
os.chdir("../")
# Shell escape: print the working directory to confirm where we ended up.
!pwd

/home/aditya/network_security


In [4]:
import pandas as pd

# Load the ingested CSV (path is relative to the working directory set in the
# first cell) and print the dtype pandas inferred for every column.
data = pd.read_csv("artifacts/data_ingestion/datafromDB.csv")
print(data.dtypes)

NumDots                               float64
SubdomainLevel                        float64
PathLevel                             float64
UrlLength                             float64
NumDash                               float64
NumDashInHostname                     float64
AtSymbol                              float64
TildeSymbol                           float64
NumUnderscore                         float64
NumPercent                            float64
NumQueryComponents                    float64
NumAmpersand                          float64
NumHash                               float64
NumNumericChars                       float64
NoHttps                               float64
RandomString                          float64
IpAddress                             float64
DomainInSubdomains                    float64
DomainInPaths                         float64
HttpsInHostname                       float64
HostnameLength                        float64
PathLength                        

In [6]:
from pathlib import Path
from pydantic import BaseModel

class DataValidationConfig(BaseModel):
    """Settings for the data-validation stage, validated by pydantic."""

    # Directory belonging to this stage; ConfigurationManager passes it to
    # create_directories before building this object.
    root_dir: Path
    # Location of the CSV whose columns are checked (handed to pd.read_csv).
    ingestion_file: str
    # File that the validation result is written to (opened in 'w' mode).
    STATUS_FILE: Path
    # Mapping whose keys are the expected column names.
    # NOTE(review): values are presumably the expected dtypes — they are not
    # read by the validation code in this notebook; confirm against schema.yaml.
    all_schema: dict

In [13]:
from src.NetworkSecurity.constants import *
from src.NetworkSecurity.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the config, params and schema YAML files.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.

        Side effects:
            Passes ``config.artifacts_root`` to ``create_directories``.
        """
        # Load order is config -> params -> schema.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Reads the ``data_validation`` section of the config and the ``COLUMNS``
        section of the schema, passes the stage's ``root_dir`` to
        ``create_directories``, and returns the populated
        ``DataValidationConfig``.

        Returns:
            DataValidationConfig with ``root_dir``, ``ingestion_file`` and
            ``STATUS_FILE`` taken from the config section and ``all_schema``
            set to the schema's ``COLUMNS`` entry.
        """
        # Resolve both sections before touching the filesystem, so a missing
        # section fails before any directory is created.
        validation_section = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            ingestion_file=validation_section.ingestion_file,
            STATUS_FILE=validation_section.STATUS_FILE,
            all_schema=column_schema,
        )

In [14]:
# Instantiate the manager with its default YAML paths (this reads the three
# YAML files and passes artifacts_root to create_directories), then derive
# the settings object for the data-validation stage.
cm = ConfigurationManager()
data_validation_config = cm.get_data_validation_config()

[2025-03-30 21:41:53,510: INFO: common : Yaml File: config/config.yaml loaded successfully]
[2025-03-30 21:41:53,511: INFO: common : Yaml File: params.yaml loaded successfully]
[2025-03-30 21:41:53,515: INFO: common : Yaml File: schema.yaml loaded successfully]
[2025-03-30 21:41:53,516: INFO: common : created directory at: artifacts]
[2025-03-30 21:41:53,516: INFO: common : created directory at: artifacts/data_validation]


In [17]:
from src.NetworkSecurity.logging.logger import logger

class DataValiadtion:
    """Checks that the ingested CSV has exactly the columns named in the schema.

    Only column *names* are compared; column dtypes are not inspected.

    NOTE: the class name is misspelled ("Valiadtion"). It is kept unchanged
    because existing callers instantiate it under this name.
    """

    # The annotation is quoted so the class can be defined even when
    # DataValidationConfig is not (yet) in scope, e.g. cells run out of order.
    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Object exposing the attributes this class reads:
                ``ingestion_file`` (CSV to check), ``STATUS_FILE`` (file the
                result is written to) and ``all_schema`` (mapping whose keys
                are the expected column names).
        """
        self.config = config

    @staticmethod
    def _format_columns(columns) -> str:
        """Render column names as ``{'a', 'b'}`` in sorted, reproducible order.

        The raw ``repr`` of a set of strings can change order between
        interpreter runs (string hash randomisation), which made the status
        file differ from run to run for identical inputs.
        """
        # key=str keeps sorting well-defined even if names are not all strings.
        ordered = sorted(columns, key=str)
        return "{" + ", ".join(repr(name) for name in ordered) + "}"

    def validate_all_columns(self) -> bool:
        """Compare the CSV's columns with the schema and record the outcome.

        Reads ``config.ingestion_file`` with pandas, computes which schema
        columns are missing from the file and which file columns are not in
        the schema, and overwrites ``config.STATUS_FILE`` with the result.

        Returns:
            True when the two sets of column names are identical, else False.

        Raises:
            Any exception raised by ``pd.read_csv`` or by opening/writing the
            status file propagates unchanged.
        """
        data = pd.read_csv(self.config.ingestion_file)

        actual_cols = set(data.columns)
        expected_cols = set(self.config.all_schema.keys())

        missing_cols = expected_cols - actual_cols
        extra_cols = actual_cols - expected_cols

        # Valid only when nothing is missing and nothing is unexpected.
        validation_status = not (missing_cols or extra_cols)

        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}\n")
            if missing_cols:
                f.write(f"Missing Columns: {self._format_columns(missing_cols)}\n")
            if extra_cols:
                f.write(f"Extra Columns: {self._format_columns(extra_cols)}\n")

        return validation_status


In [18]:
# Run the column-name check against the ingested CSV. The boolean result is
# shown as the cell output and is also written to the configured STATUS_FILE.
dv = DataValiadtion(data_validation_config)
dv.validate_all_columns()

True