In [None]:
# PROFILING
#
# Profiles a CSV file using the Pandas engine.
# Builds an Expectation Suite from the CSV data, which makes it easier to analyse the expected data quality.
#
# A set of Batches, or samples of the full dataset, can be used as input.

In [1]:
import pandas as pd
import great_expectations as gx

In [32]:
# ARGUMENTS
# Leave exactly one group of four assignments uncommented to select the dataset.
# file_data_regex is later passed as `batching_regex` when the CSV asset is added.
# The patterns are raw strings: `\.` inside a normal string literal is an invalid
# escape sequence (a warning on current Python versions), while the raw form yields
# the same characters without the warning.

# file_data_regex = r'listings\.csv\.gz'
# asset_name = 'listings'
# expectation_suite_name_str = 'raw_listings'
# run_name = 'raw.listings'

file_data_regex = r'reviews\.csv\.gz'
asset_name = 'reviews'
expectation_suite_name_str = 'raw_reviews'
run_name = 'raw.reviews'

# file_data_regex = r'calendar\.csv\.gz'
# asset_name = 'calendar'
# expectation_suite_name_str = 'raw_calendar'
# run_name = 'raw.calendar'

In [None]:
# ARGUMENTS
# file_data_regex, asset_name, expectation_suite_name_str and run_name are expected
# to be defined upstream (see the arguments cell).

# Context: file-backed Great Expectations project rooted at the current directory.
# NOTE(review): presumably creates the project scaffold when missing and loads the
# existing one otherwise - confirm for the installed GX version.
context = gx.data_context.FileDataContext.create('.')

# Data Source: reuse the 'airbnb' datasource when the context already knows it,
# otherwise register a pandas filesystem datasource that reads from ./data.
datasource = context.datasources.get('airbnb', None)
if datasource is None:
    datasource = context.sources.add_pandas_filesystem('airbnb', base_directory='./data')

# Data Asset: look for an asset already registered under this name before adding it,
# so that re-running the cell does not try to register the same asset a second time.
table_asset = next(
    (asset_obj for asset_obj in datasource.assets if asset_obj.name == asset_name),
    None,
)
if table_asset is None:
    table_asset = datasource.add_csv_asset(asset_name, batching_regex=file_data_regex)

# Batch Request covering the files matched by the asset.
batch_request = table_asset.build_batch_request()

# Name under which the suite is stored. The suite itself is intentionally NOT created
# here: the profiling step saves it with add_or_update_expectation_suite, and an
# unconditional add_expectation_suite is expected to fail on a second run once the
# suite already exists.
expectation_suite_name = expectation_suite_name_str


# ===================================================================================
# Run Onboarding Data Assistant
# Profile the batch request with the onboarding assistant; no column is excluded.
exclude_column_names = []
profiling_options = {
    "batch_request": batch_request,
    "exclude_column_names": exclude_column_names,
}
data_assistant_result = context.assistants.onboarding.run(**profiling_options)

# Build the Expectation Suite from the assistant result, under the configured name.
expectation_suite = data_assistant_result.get_expectation_suite(
    expectation_suite_name=expectation_suite_name
)

# Persist the suite in the context (add it, or update the stored one).
context.add_or_update_expectation_suite(expectation_suite=expectation_suite)



# ====================================================================================
# Testing Suite [Optional]
# Register (or update) a checkpoint named after the suite that validates the same
# batch request against that suite, then run it once under `run_name`.
checkpoint = context.add_or_update_checkpoint(
    name=expectation_suite_name,  # already a str; no f-string wrapper needed
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": expectation_suite_name,
        },
    ],
)

checkpoint_result = checkpoint.run(run_name=run_name)

# Stop the notebook here, with a readable message, when validation did not succeed.
assert checkpoint_result["success"] is True, (
    f"Checkpoint '{expectation_suite_name}' (run '{run_name}') did not validate successfully"
)

In [None]:
# The suite built from data_assistant_result is stored under gx\expectations\<expectation_suite_name>.json (e.g. gx\expectations\raw_reviews.json)

In [47]:
# List the names of all data assets registered on the 'airbnb' datasource.
# Indexing (rather than .get(..., None)) makes a missing datasource fail with a
# KeyError naming 'airbnb' instead of an AttributeError on None. The previous
# stand-alone `.assets[0].name` expression was dropped: it was not the last
# expression of the cell, so its value was never displayed.
[asset_obj.name for asset_obj in context.datasources['airbnb'].assets]

['reviews', 'listings', 'calendar']