In [None]:
import great_expectations as gx
from great_expectations.exceptions import DataContextError
import pandas as pd
import dotenv
import os

# Environment: read the Postgres connection settings from .env / the process environment.
dotenv.load_dotenv()
POSTGRES_USER = os.environ.get('POSTGRES_USER')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT_CONTAINER')
POSTGRES_HOST = os.environ.get('POSTGRES_HOST')

POSTGRES_URI = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/postgres"


# Data Context
data_context_path = '.'
context = gx.data_context.FileDataContext.create(project_root_dir=data_context_path)

# Datasource - engine
datasource_name = "postgres_src"
try:
    datasource = context.sources.add_postgres(name=datasource_name, connection_string=POSTGRES_URI)
except DataContextError:
    # Registration was rejected by the context (presumably because the datasource
    # already exists from an earlier run). Reuse the stored one instead of silently
    # leaving `datasource` undefined, which made the next statement raise NameError.
    datasource = context.get_datasource(datasource_name)

# Data Asset - connection
asset_name = 'listings_asset_11'
asset_table_name = "g1_listings"  # SQL table
# Only add the table asset when it is not registered yet, so the cell can be re-run.
if asset_name in [asset_obj.name for asset_obj in datasource.assets]:
    table_asset = datasource.get_asset(asset_name)
else:
    table_asset = datasource.add_table_asset(name=asset_name, table_name=asset_table_name, schema_name='raw')
# add_query_asset
# (name=asset_name, table_name=asset_table_name)
data_asset = table_asset  # name kept in case other cells reference it
batch_request = table_asset.build_batch_request()

# Suite
context.add_or_update_expectation_suite("my_expectation_suite")

In [4]:
import great_expectations as gx
import pandas as pd
import dotenv
import os


def load_postgres_instance_datasource_asset(context):
    """Return a batch request for the ``raw.g1_listings`` Postgres table.

    Connection settings are read from the environment (after loading ``.env``).
    The ``postgres_src`` datasource and its ``listings_asset`` table asset are
    reused when they are already registered on ``context`` and created otherwise,
    so the function can be called repeatedly against the same context.

    Args:
        context: Great Expectations data context exposing ``datasources`` and
            ``sources``.

    Returns:
        The result of ``build_batch_request()`` on the table asset.
    """
    # Environment
    dotenv.load_dotenv()
    POSTGRES_USER = os.environ.get('POSTGRES_USER')
    POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD')
    POSTGRES_PORT = os.environ.get('POSTGRES_PORT_CONTAINER')
    POSTGRES_HOST = os.environ.get('POSTGRES_HOST')

    POSTGRES_URI = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/postgres"

    # Datasource - engine
    datasource_name = "postgres_src"
    # Look up the datasource this function manages (previously it probed the
    # unrelated 'airbnb' name, so an existing 'postgres_src' was never reused).
    datasource = context.datasources.get(datasource_name, None)
    if datasource is None:
        datasource = context.sources.add_postgres(name=datasource_name, connection_string=POSTGRES_URI)

    # Data Asset - connection
    asset_name = 'listings_asset'
    asset_table_name = "g1_listings"  # SQL table
    registered_asset_names = [asset_obj.name for asset_obj in datasource.assets]
    if asset_name in registered_asset_names:
        # Fetch the asset that was just found by name (not a hard-coded 'reviews').
        table_asset = datasource.get_asset(asset_name)
    else:
        table_asset = datasource.add_table_asset(name=asset_name, table_name=asset_table_name, schema_name='raw')
        # add_query_asset

    return table_asset.build_batch_request()



def load_csv_datasource_asset_raw(context, datasource_name, asset_name, layer_name='raw'):
    """Return a batch request for the gzipped CSV ``<asset_name>.csv.gz``.

    The pandas-filesystem datasource (rooted at ``./data``) and the CSV asset are
    reused when already registered on ``context`` and created otherwise.

    Args:
        context: Great Expectations data context exposing ``datasources`` and
            ``sources``.
        datasource_name: Name of the pandas filesystem datasource.
        asset_name: Asset name; also the file stem matched by the batching regex.
        layer_name: Accepted for interface compatibility; not used by this function.

    Returns:
        The result of ``build_batch_request()`` on the CSV asset.
    """
    # Raw string: '\.' in a normal literal is an invalid escape sequence.
    file_data_regex = asset_name + r'\.csv\.gz'

    # Datasource - engine
    datasource = context.datasources.get(datasource_name, None)
    if datasource is None:
        datasource = context.sources.add_pandas_filesystem(datasource_name, base_directory='./data')

    # Data Asset - connection
    registered_asset_names = [asset_obj.name for asset_obj in datasource.assets]
    if asset_name in registered_asset_names:
        # Return the requested asset; this used to return 'reviews' unconditionally.
        table_asset = datasource.get_asset(asset_name)
    else:
        table_asset = datasource.add_csv_asset(asset_name, batching_regex=file_data_regex)

    return table_asset.build_batch_request()

In [9]:
# RAW-layer monitoring
def suite_monitoring_execution(asset_name, datasource_name='airbnb', data_context_path = '.'):
    """Run a checkpoint validating one raw CSV asset against its suite.

    The suite and the checkpoint are both named ``raw_<asset_name>`` and are
    created or updated on every call.

    Args:
        asset_name: Asset (file stem) to validate, e.g. ``'reviews'``.
        datasource_name: Name of the pandas filesystem datasource to use.
        data_context_path: Project root directory of the file data context.

    Returns:
        The result object returned by ``checkpoint.run``.
    """
    # The arguments are honoured as passed; they used to be overwritten here
    # with the same values as the defaults, which silently ignored the caller.
    suite_name = f'raw_{asset_name}'

    # Data Context
    context = gx.data_context.FileDataContext.create(project_root_dir=data_context_path)

    # Batch request
    batch_request = load_csv_datasource_asset_raw(context=context, datasource_name=datasource_name, asset_name=asset_name, layer_name='raw')

    # Suite
    context.add_or_update_expectation_suite(suite_name)

    # Checkpoint Validation
    checkpoint = context.add_or_update_checkpoint(
        name=f"{suite_name}",
        validations=[{
            "batch_request": batch_request,
            "expectation_suite_name": suite_name,
            }])

    checkpoint_result = checkpoint.run(run_name=suite_name)
    return checkpoint_result

In [12]:
# Run the RAW-layer checkpoint for each asset. Both results are assigned here so
# that cells displaying them work after Restart Kernel -> Run All.
checkpoint_result_raw_reviews = suite_monitoring_execution(asset_name='reviews')
checkpoint_result_raw_listings = suite_monitoring_execution(asset_name='listings')

Calculating Metrics: 0it [00:00, ?it/s]

In [11]:
# Display the checkpoint result of the raw "reviews" run. The name must already
# have been assigned by an earlier cell in the running kernel.
checkpoint_result_raw_reviews

{
  "run_id": {
    "run_name": "raw_reviews",
    "run_time": "2023-10-26T13:11:02.835010-03:00"
  },
  "run_results": {
    "ValidationResultIdentifier::raw_reviews/raw_reviews/20231026T161102.835010Z/airbnb-reviews": {
      "validation_result": {
        "success": true,
        "results": [],
        "evaluation_parameters": {},
        "statistics": {
          "evaluated_expectations": 0,
          "successful_expectations": 0,
          "unsuccessful_expectations": 0,
          "success_percent": null
        },
        "meta": {
          "great_expectations_version": "0.17.23",
          "expectation_suite_name": "raw_reviews",
          "run_id": {
            "run_name": "raw_reviews",
            "run_time": "2023-10-26T13:11:02.835010-03:00"
          },
          "batch_spec": {
            "path": "data\\reviews.csv.gz",
            "reader_method": "read_csv",
            "reader_options": {}
          },
          "batch_markers": {
            "ge_load_time": "202310

In [None]:
# Data Context
data_context_path = '.'
context = gx.data_context.FileDataContext.create(project_root_dir=data_context_path)

# Batch request
# batch_request = load_postgres_instance_datasource_asset(context=context)
batch_request_raw_reviews = load_csv_datasource_asset_raw(context=context, datasource_name='airbnb', asset_name='reviews', layer_name='raw')
# batch_request_raw_listings = load_csv_datasource_asset_raw(context=context, datasource_name='airbnb', asset_name='listings', layer_name='raw')
# batch_request_raw_calendar = load_csv_datasource_asset_raw(context=context, datasource_name='airbnb', asset_name='calendar', layer_name='raw')

# Suite
context.add_or_update_expectation_suite("my_expectation_suite")


# Validator
# Use the batch request built in this cell. The previous `batch_request` name is
# not defined here and only resolved through leftover kernel state.
validator = context.get_validator(
    batch_request=batch_request_raw_reviews,
    expectation_suite_name="my_expectation_suite",
)
validator.head()

# validator.expect_column_values_to_not_be_null(column="vendor_id")
# validator.save_expectation_suite(discard_failed_expectations=False)