In [6]:
import os

os.chdir('C:/Users/giddy/Documents/RECOMMENDATION_SYSTEM')

from dataclasses import dataclass
from pathlib import Path
from src.utils.commons import read_yaml,create_directories
import os
import sys
from datetime import datetime
from src.logger import logging
import pandas  as pd
from src.constants import *
import time
from datetime import datetime
from src.utils.commons import unzip_files
from glob import glob
import great_expectations as gx
import great_expectations.expectations as gxe
from great_expectations.exceptions import GreatExpectationsError
from src.utils.commons import save_json

In [7]:
# enttity

@dataclass
class DatavalidationConfig:
    status_file : Path
    source_folder : str
    data_directory : str
    data_source_name : str
    asset_name : str
    batch_definition_name : str
    expectation_suite_name : str
    validation_definition_name : str



In [8]:
# configuration manager

class ConfigurationManager:

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):

        # initialiazing filepath

        self.config = read_yaml(str(config_filepath))
        self.schema = read_yaml(str(schema_filepath))
        self.params = read_yaml(str(params_filepath))

    def get_data_validation_config(self):

        config = self.config.DATA_VALIDATION

        create_directories([config.root_dir])

        data_validation_config = DatavalidationConfig(
                        status_file = config.status_file,

                        source_folder = config.source_folder,
                        data_directory = config.data_directory,
                        data_source_name = config.data_source_name,
                        asset_name = config.asset_name,
                        batch_definition_name = config.batch_definition_name,
                        expectation_suite_name = config.expectation_suite_name,
                        validation_definition_name = config.validation_definition_name
        )

        return data_validation_config



In [13]:
class DataValidation:

    def __init__(self, config=DatavalidationConfig):

        self.config = config
        self.context = gx.get_context(mode='file')
        

    def validation_setup(self):
        try:
            logging.info('setting up validation')

            # Check if the data source already exists
            try:
                data_source = self.context.data_sources.get(self.config.data_source_name)
                logging.info(f'Data source "{self.config.data_source_name}" already exists.')
            except KeyError:
                # If the data source doesn't exist, create it
                data_source = self.context.data_sources.add_spark_filesystem(
                    name=self.config.data_source_name, base_directory=self.config.source_folder
                )
                logging.info(f'Data source "{self.config.data_source_name}" added.')

            # Check if the asset exists or create it
            try:
                data_asset = data_source.get_asset(name=self.config.asset_name)
                logging.info(f'Asset "{self.config.asset_name}" already exists.')
            except LookupError:
                # If asset doesn't exist, create it
                data_asset = data_source.add_directory_csv_asset(
                    name=self.config.asset_name, data_directory=self.config.data_directory, header=True
                )
                logging.info(f'Asset "{self.config.asset_name}" added.')

            # Add or get the batch definition
            try:
                batch_definition = data_asset.get_batch_definition(self.config.batch_definition_name)
                logging.info(f'Data source "{self.config.batch_definition_name}" already exists.')
            except KeyError:
                    batch_definition = data_asset.add_batch_definition_whole_directory(
                        self.config.batch_definition_name
                    )
                    logging.info(f'Batch definition "{self.config.batch_definition_name}" added.')

        except (Exception, GreatExpectationsError) as e:
            logging.error(f'Error occurred: {e}')
            # Fetch existing batch definition if there's an error
            data_source = self.context.data_sources.get(self.config.data_source_name)
            data_asset = data_source.get_asset(name=self.config.asset_name)
            batch_definition = data_asset.get_batch_definition(self.config.batch_definition_name)

        # Return the batch definition
        return batch_definition

    def expectation_suite(self):
        try:
            # Check if the expectation suite already exists
            try:
                expectation_suite = self.context.suites.get(self.config.expectation_suite_name)
                logging.info(f'Expectation suite "{self.config.expectation_suite_name}" already exists.')
            except Exception:
                # If the expectation suite doesn't exist, create it
                expectation_suite = self.context.suites.add(
                    gx.ExpectationSuite(name=self.config.expectation_suite_name)
                )
                logging.info(f'Expectation suite "{self.config.expectation_suite_name}" created.')
        except:
            # If an error occurs (e.g., the suite exists but could not be fetched), delete and recreate
            self.context.suites.delete(name=self.config.expectation_suite_name)
            expectation_suite = self.context.suites.add(
                gx.ExpectationSuite(name=self.config.expectation_suite_name)
            )
            logging.info(f'Expectation suite "{self.config.expectation_suite_name}" recreated.')

        # Define the expectations
        expectations = [
            *[gxe.ExpectColumnToExist(column=x, column_index=0) for x in ['price','product_id','user_id','category_id', 'event_time',
                                                                          'event_type','category_code', 'brand','user_session']],

            gxe.ExpectColumnDistinctValuesToBeInSet(column='event_type', value_set=['cart','remove_from_cart','purchase','view']),

            gxe.ExpectCompoundColumnsToBeUnique(column_list=['price','product_id','user_id','category_id', 'event_time','event_type',
                                                             'category_code', 'brand','user_session']),

            *[gxe.ExpectColumnValuesToNotBeNull(column=x) for x in ['user_id','event_time','product_id','category_id','event_type','user_session']],

            *[gxe.ExpectColumnValuesToBeOfType(column=x, type_='StringType') for x in ['user_id','product_id','category_id','user_session','event_type']],

            gxe.ExpectColumnValuesToBeInTypeList(column='price', type_list=['DoubleType','FloatType','IntegerType']),
            gxe.ExpectColumnValuesToBeOfType(column='event_time', type_='TimestampType')
        ]

        # Add expectations to the suite
        for expectation in expectations:
            expectation_suite.add_expectation(expectation)

        return expectation_suite

    def validate(self):
        batch_definition = self.validation_setup()
        # Validate Batch using Expectation Suite.
        expectation_suite = self.expectation_suite()

        try:
            # Check if validation definition already exists
            validation_definition = self.context.validation_definitions.get(self.config.validation_definition_name)
            logging.info(f'Validation definition "{self.config.validation_definition_name}" already exists.')
        except Exception:
            # If the validation definition doesn't exist, create it
            validation_definition = gx.ValidationDefinition(
                data=batch_definition, suite=expectation_suite, name=self.config.validation_definition_name
            )
            logging.info(f'Validation definition "{self.config.validation_definition_name}" created.')

        validation_results = validation_definition.run().to_json_dict()

        save_json(filepath=Path(self.config.status_file),data=validation_results)
       

In [14]:
man = ConfigurationManager()
config = man.get_data_validation_config()
val = DataValidation(config=config)

vali = val.validate()

[2025-01-07 10:50:36,645 ] 39 root - INFO - Yaml file:  config\config.yaml loaded suscessfully
[2025-01-07 10:50:36,661 ] 39 root - INFO - Yaml file:  schema.yaml loaded suscessfully
[2025-01-07 10:50:36,663 ] 39 root - INFO - Yaml file:  params.yaml loaded suscessfully
[2025-01-07 10:50:36,663 ] 62 root - INFO - File directory create at : Artifacts/data_validation
[2025-01-07 10:50:36,833 ] 209 great_expectations.data_context.data_context.file_data_context - INFO - FileDataContext loading fluent config
[2025-01-07 10:50:36,846 ] 191 great_expectations.datasource.fluent.config - INFO - Loading 'datasources' ->
[{'assets': [...],
  'base_directory': 'Artifacts',
  'id': 'a253eceb-4c83-44a3-87b2-abbe8991ee57',
  'name': 'ingested data local file_system',
  'type': 'spark_filesystem'}]
[2025-01-07 10:50:36,923 ] 2437 great_expectations.data_context.data_context.abstract_data_context - INFO - Loaded 'ingested data local file_system' from fluent config
[2025-01-07 10:50:36,923 ] 17 root - I

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
Calculating Metrics: 100%|██████████| 48/48 [01:00<00:00,  1.27s/it]

[2025-01-07 10:51:40,217 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,223 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,226 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,233 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,243 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,249 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field





[2025-01-07 10:51:40,259 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,267 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,268 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,278 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,286 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,286 ] 1213 great_expectations.expectations.expectation - INFO - _get_default_value called with key "table", but it is not a known field
[2025-01-07 10:51:40,303 ] 1213 great_expectations.expecta

In [None]:
print(vali)

None
