In [1]:
import os
# Switch to the project root so the relative paths used further down
# (e.g. config/config.yaml, Artifacts/...) resolve; presumably this is also what
# makes the `src.*` imports below importable -- confirm against the kernel's sys.path.
# Set RECSYS_PROJECT_ROOT to run the notebook outside the original Azure ML
# workspace; when it is unset the original hard-coded location is used.
os.chdir(os.environ.get(
    'RECSYS_PROJECT_ROOT',
    '/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system',
))

from dataclasses import dataclass
from pathlib import Path
from src.utils.commons import read_yaml,create_directories
from src.cloud_storage.azure_blob_storage import AzureDatastore
import os
import sys
from datetime import datetime
from src.logger import logging
import pandas  as pd
from src.constants import *
import time
from datetime import datetime
from src.utils.commons import unzip_files
from glob import glob
from pyspark.sql.functions import col
from pyspark.sql.session import SparkSession
from glob import glob
import json

[2024-12-07 15:19:00,165 ] 161 numexpr.utils - INFO - NumExpr defaulting to 4 threads.


In [2]:
@dataclass
class DataValidationConfig:
    """Settings bundle handed to the data-validation stage.

    The dataclass does not enforce the annotations at runtime.
    NOTE(review): the path fields are annotated as ``Path``, but the driver cell
    concatenates ``data_source`` with a string, so the values loaded from YAML
    are presumably plain strings -- confirm.
    """

    # Working directory of the validation stage (created by ConfigurationManager).
    root_dir: Path
    # Directory that is searched for the ``*.csv`` files to validate.
    data_source: Path
    # File the per-file validation results are written to.
    status_file: Path
    # Names of the critical columns that are checked for null values.
    columns: list
    # Mapping of expected column name -> expected data type.
    all_schema: dict

In [3]:
# configuration manager

class ConfigurationManager:
    """Reads the project's YAML files and assembles per-stage config objects."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Load the config, schema and params YAML files.

        Each path is converted with ``str()`` before it is passed to ``read_yaml``.
        The loaded objects are accessed by attribute in the methods below.
        """
        config_location = str(config_filepath)
        self.config = read_yaml(config_location)

        schema_location = str(schema_filepath)
        self.schema = read_yaml(schema_location)

        params_location = str(params_filepath)
        self.params = read_yaml(params_location)

    def get_data_validation_config(self):
        """Build the ``DataValidationConfig`` for the validation stage.

        Reads the ``DATA_VALIDATION`` section of the config file and the
        ``COLUMNS`` section of the schema file, and passes the stage's
        ``root_dir`` to ``create_directories`` before returning.
        """
        validation_section = self.config.DATA_VALIDATION
        column_schema = self.schema.COLUMNS

        # The stage directory is created as a side effect of asking for the config.
        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            data_source=validation_section.data_source,
            status_file=validation_section.status_file,
            columns=validation_section.critical_columns,
            all_schema=column_schema,
        )


In [34]:
class DataValidation:
    """Column, data-type and null checks for a single ingested DataFrame.

    Every ``validate_*`` method returns ``True`` when the check passes and
    ``False`` when it does not; details of a failure are written to the log.
    """

    def __init__(self, config=None):
        """Store the validation settings.

        Args:
            config: object exposing ``all_schema`` (mapping of column name to
                expected data type) and ``columns`` (names of the columns that
                must not contain nulls), e.g. a ``DataValidationConfig``.
                When omitted, the settings are built with
                ``ConfigurationManager().get_data_validation_config()``.
        """
        # The default used to be the ConfigurationManager class object itself,
        # which has neither `all_schema` nor `columns`, so an instance created
        # without arguments failed on first use. Resolve a real config instead.
        if config is None:
            config = ConfigurationManager().get_data_validation_config()

        self.config = config

    def validate_all_columns(self, data):
        """Check that the data has exactly the columns named in the schema.

        Args:
            data: DataFrame-like object with a ``columns`` attribute.

        Returns:
            bool: ``True`` when there are no missing and no extra columns.
            ``False`` otherwise, including when the check itself fails with an
            exception (the error is logged, not raised).
        """
        try:
            actual_columns = list(data.columns)
            expected_columns = list(self.config.all_schema.keys())

            missing_columns = [name for name in expected_columns if name not in actual_columns]
            extra_columns = [name for name in actual_columns if name not in expected_columns]

            if missing_columns or extra_columns:
                logging.info(f"Missing columns: {missing_columns}, Extra columns: {extra_columns}")
                return False

            return True

        except Exception as e:
            # Keep the best-effort behaviour (no raise) but report a real boolean:
            # previously this path fell through and returned None.
            logging.error(f'data validation failed : {str(e)}')
            return False

    def validate_datatypes(self, data):
        """Compare the data types of the columns against the schema.

        Only schema columns that are present in the data are compared; absent
        columns are the concern of ``validate_all_columns``.

        Args:
            data: DataFrame-like object whose ``dtypes`` can be passed to
                ``dict()`` (e.g. the list of ``(name, type)`` pairs of a Spark
                DataFrame).

        Returns:
            bool: ``True`` when every compared column has the expected type.
        """
        # Build the lookup once instead of once per schema column.
        actual_types = dict(data.dtypes)

        type_mismatches = {
            name: (expected_type, actual_types[name])
            for name, expected_type in self.config.all_schema.items()
            if name in actual_types and actual_types[name] != expected_type
        }

        # Log the full set once rather than the growing dict after every mismatch.
        if type_mismatches:
            logging.info(f'type validation : {type_mismatches}')

        return not type_mismatches

    def validate_missing_values(self, data):
        """Check the configured critical columns for null values.

        Args:
            data: Spark DataFrame (``filter`` / ``count`` are used).

        Returns:
            bool: ``True`` when every critical column exists and has no nulls.
        """
        # Compared case-insensitively so that only columns which cannot be
        # resolved under Spark's default (case-insensitive) settings are flagged.
        present_columns = {str(name).lower() for name in data.columns}

        # column name -> number of nulls, or a note when the column is absent
        missing_values = {}

        for column_name in (self.config.columns or []):
            if str(column_name).lower() not in present_columns:
                # A critical column that is not there at all fails the check
                # instead of raising from the Spark query below.
                missing_values[column_name] = 'column not present'
                continue

            null_count = data.filter(col(column_name).isNull()).count()
            if null_count > 0:
                missing_values[column_name] = null_count

        if missing_values:
            logging.info(f"Missing values found in columns: {missing_values}")

        return not missing_values
                


        




In [35]:
# Validate every ingested CSV file and record one result entry per file.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)

    spark = SparkSession.builder.appName('recommendation_system').getOrCreate()

    # sorted(): glob order depends on the file system; a fixed order keeps the
    # report stable between runs.
    data_path = sorted(glob(os.path.join(str(data_validation_config.data_source), '*.csv')))

    if not data_path:
        # Without this the cell finishes silently although nothing was checked.
        logging.warning(f'no csv files found in {data_validation_config.data_source}')

    # One dict per file. The whole list is rewritten after each file so that
    # (a) the status file is always a single valid JSON document,
    # (b) re-running the cell replaces the report instead of appending duplicates,
    # (c) results collected so far survive a failure on a later file.
    validation_report = []

    for path in data_path :
        # Identify the file so the pass/fail line below can be attributed to it.
        logging.info(f'validating {path}')

        # inferSchema=True lets Spark derive the column types that are compared
        # against the schema in validate_datatypes.
        data = spark.read.csv(path,header=True,inferSchema=True)

        # bool(): guarantees true/false in the report even if a check returns None.
        column_validation_status = bool(data_validation.validate_all_columns(data))
        type_validation_status = bool(data_validation.validate_datatypes(data))
        missing_value_status = bool(data_validation.validate_missing_values(data))

        validation_status = {
            'filename' : path,
            'validate columns' : column_validation_status,
            'type validation' : type_validation_status,
            'missing values' : missing_value_status
        }
        validation_report.append(validation_status)

        with open(data_validation_config.status_file,'w') as f:
            json.dump(validation_report, f,indent=4)
            f.write('\n')

        overall_validation_status = (column_validation_status and type_validation_status and missing_value_status)

        if overall_validation_status :
            logging.info('data validation completed susscessfully')
        else :
            logging.info('data validation failed , check validation status')
except Exception as e:
    logging.error(f' data validation failed {e}')
    # bare raise keeps the original traceback
    raise

    




[2024-12-07 16:30:38,768 ] 38 root - INFO - Yaml file:  config/config.yaml loaded suscessfully
[2024-12-07 16:30:38,799 ] 38 root - INFO - Yaml file:  schema.yaml loaded suscessfully
[2024-12-07 16:30:38,824 ] 38 root - INFO - Yaml file:  params.yaml loaded suscessfully
[2024-12-07 16:30:38,825 ] 61 root - INFO - File directory create at : Artifacts/data_validation


[Stage 1325:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               (0 + 4) / 4]



[2024-12-07 16:31:11,265 ] 33 root - INFO - data validation failed , check validation status


[Stage 1378:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               (0 + 4) / 5]

[2024-12-07 16:31:52,259 ] 33 root - INFO - data validation failed , check validation status


[Stage 1407:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               (0 + 4) / 4]

[2024-12-07 16:32:28,640 ] 33 root - INFO - data validation failed , check validation status


[Stage 1436:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               (0 + 4) / 4]

[2024-12-07 16:33:05,416 ] 33 root - INFO - data validation failed , check validation status


[Stage 1465:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               (0 + 4) / 4]

[2024-12-07 16:33:43,072 ] 33 root - INFO - data validation failed , check validation status


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [13]:
 # Scratch cell: list the CSV files directly under the configured data_source directory.
 data_path = glob(data_validation_config.data_source + '/*.csv')

In [14]:
# Display the matched file paths.
data_path

['Artifacts/ingested_data/2019-Dec.csv',
 'Artifacts/ingested_data/2019-Nov.csv',
 'Artifacts/ingested_data/2019-Oct.csv',
 'Artifacts/ingested_data/2020-Feb.csv',
 'Artifacts/ingested_data/2020-Jan.csv']