In [2]:
# Move the kernel's working directory one level up so that relative paths
# used by later cells resolve from the parent directory.
# NOTE(review): this is a relative chdir, so it is not idempotent — re-running
# this cell without restarting the kernel moves up one more level each time.
import os
os.chdir('../')

In [13]:
%pwd

'e:\\Github repositories\\end-to-end-fake-news-detection'

In [18]:
from pathlib import Path
from dataclasses import dataclass

@dataclass
class DataValidationConfig:
    """Settings consumed by the data validation step.

    Instances are built by ``ConfigurationManager.get_datavalidation_config``
    and read by ``DataValidation``.
    """
    root_dir: Path  # directory created for the data validation stage
    unzip_data_path: Path  # CSV file whose columns are validated
    status_file_path: Path  # text file the validation result is written to
    all_schema: dict  # schema mapping; its keys are the expected column names
    

In [29]:
from FakeNewsDetection.utils.common import read_yaml, create_directories
from FakeNewsDetection.constants import *

class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects."""

    def __init__(self,
                 config_path= CONFIG_FILE,
                 parama_path= PARAMS_FILE,
                 schema_path= SCHEMA_FILE):
        """Read the three YAML files and create the artifact root directory.

        Args:
            config_path: Location of the main configuration YAML.
            parama_path: Location of the parameters YAML. The spelling of this
                parameter name is kept so existing keyword callers still work.
            schema_path: Location of the schema YAML.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(parama_path)
        self.schema = read_yaml(schema_path)

        create_directories([self.config.artifact_root])

    def get_datavalidation_config(self) -> DataValidationConfig:
        """Build the settings object for the data validation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataValidationConfig`` populated from the ``data_validation``
            section of the configuration and the ``COLUMNS`` section of the
            schema.
        """
        config = self.config.data_validation
        create_directories([config.root_dir])

        # The YAML loader yields plain values, while DataValidationConfig
        # declares these fields as Path; convert so the annotation holds.
        return DataValidationConfig(
            root_dir=Path(config.root_dir),
            unzip_data_path=Path(config.unzip_data_path),
            status_file_path=Path(config.status_file_path),
            all_schema=self.schema.COLUMNS
        )

In [34]:
import os
from FakeNewsDetection import logger
import pandas as pd


class DataValidation:
    """Check the columns of the configured CSV file against the schema.

    The outcome is written to ``config.status_file_path`` and also returned.
    """

    def __init__(self, config: DataValidationConfig):
        """Store the validation settings.

        Args:
            config: Paths of the data file and status file, plus the schema
                mapping whose keys are the expected column names.
        """
        self.config = config

    def validate_data_columns(self) -> bool:
        """Compare the CSV header with the schema and record the result.

        Validation passes only when every column found in the data is
        declared in the schema AND every schema column is present in the
        data. All offending columns are logged, not just the first one.

        Returns:
            True when validation passes, False otherwise.

        Raises:
            Exception: Any error raised while reading the data or writing the
                status file; it is logged and then re-raised unchanged.
        """
        try:
            # Only the header is needed, so reading a few rows is sufficient.
            data = pd.read_csv(self.config.unzip_data_path, nrows=5)
            data_cols = list(data.columns)
            schema_cols = list(self.config.all_schema.keys())

            unexpected_cols = [col for col in data_cols if col not in schema_cols]
            missing_cols = [col for col in schema_cols if col not in data_cols]

            for col in unexpected_cols:
                logger.error(f"Column {col} is not in schema")
            for col in missing_cols:
                logger.error(f"Column {col} is missing from data")

            val_status = not unexpected_cols and not missing_cols

            with open(self.config.status_file_path, "w") as f:
                f.write(f"Validation Status: {val_status}")

            if val_status:
                logger.info("Data Validation Passed")

            return val_status
        except Exception as e:
            logger.error(f"Error in data validation: {str(e)}")
            # A bare raise keeps the original exception type, message and
            # traceback instead of replacing them with an empty Exception.
            raise




In [36]:
# Run the data validation stage: load the configuration, build the stage
# settings, then validate the dataset's columns.

if __name__ == '__main__':
    try:
        config_mgr = ConfigurationManager()
        data_validation_config = config_mgr.get_datavalidation_config()
        data_validation = DataValidation(config= data_validation_config)
        data_validation.validate_data_columns()
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback, so
        # the failure stays diagnosable even when the exception message is
        # empty. The error is still not re-raised, as before.
        logger.exception(e)
        

[2024-11-21 11:50:28,682] [INFO] [common.py:26] [Loaded yaml file from config\config.yaml]
[2024-11-21 11:50:28,685] [INFO] [common.py:26] [Loaded yaml file from params.yaml]
[2024-11-21 11:50:28,690] [INFO] [common.py:26] [Loaded yaml file from schema.yaml]
[2024-11-21 11:50:28,693] [INFO] [common.py:48] [created directory at: artifacts]
[2024-11-21 11:50:28,698] [INFO] [common.py:48] [created directory at: artifacts/data_validation]
[2024-11-21 11:50:28,708] [INFO] [1404676303.py:30] [Data Validation Passed]
