In [122]:
"""Example: How to create an Expectation Suite with the Missingness Data Assistant

--documentation--
    https://docs.greatexpectations.io/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_missingness_data_assistant
"""
import great_expectations as gx

# Obtain the Data Context; the later cells reach datasources, data assistants,
# Expectation Suites, Checkpoints and Data Docs through this object.
context = gx.get_context()

In [123]:
# Read data in to get validator
#
# `add_or_update_pandas_filesystem` is used instead of `add_pandas_filesystem` so
# that this cell follows the same `add_or_update_*` convention as the Expectation
# Suite and Checkpoint cells in this notebook, and so that re-running it does not
# depend on whether a datasource with this name was already registered.
datasource = context.sources.add_or_update_pandas_filesystem(
    name="taxi_multi_batch_datasource",  # custom name to assign to new datasource, can be used to retrieve datasource later
    base_directory="../../tests/test_sets/taxi_yellow_tripdata_samples/",  # replace with your data directory
)

# The regex selects files named like `yellow_tripdata_sample_2019-01.csv` and
# captures `year` and `month` as named groups. The recorded run of this notebook
# reports 36 batches in "multi_batch" mode for the resulting validator.
validator = datasource.read_csv(
    asset_name="all_years",  # custom name to assign to the asset, can be used to retrieve asset later
    batching_regex=r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv",
)


In [124]:

# Run the Missingness Assistant

# Columns the assistant is told to skip.
# NOTE(review): the expectations recorded further down in this notebook include
# `pickup_location_id` and `dropoff_location_id`, so "PULocationID" and
# "DOLocationID" apparently do not match this dataset's column headers (the same
# may hold for "VendorID" and "RatecodeID") -- confirm against the CSV header.
exclude_column_names = [
    "VendorID",
    "pickup_datetime",
    "dropoff_datetime",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "congestion_surcharge",
]

# Run the assistant against the validator created above; the result object is
# what the following cells inspect, plot, and turn into an Expectation Suite.
data_assistant_result = context.assistants.missingness.run(
    validator=validator,
    exclude_column_names=exclude_column_names,
    
)





Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/180 [00:00<?, ?it/s]

In [125]:
# View generated expectations
# In the recorded run this lists one `expect_column_values_to_not_be_null` entry
# per profiled column, each with its `mostly` value.

data_assistant_result.show_expectations_by_expectation_type()


[ { 'expect_column_values_to_not_be_null': { 'column': 'trip_distance',
                                             'domain': 'column',
                                             'mostly': 1.0}},
  { 'expect_column_values_to_not_be_null': { 'column': 'pickup_location_id',
                                             'domain': 'column',
                                             'mostly': 1.0}},
  { 'expect_column_values_to_not_be_null': { 'column': 'dropoff_location_id',
                                             'domain': 'column',
                                             'mostly': 1.0}},
  { 'expect_column_values_to_not_be_null': { 'column': 'total_amount',
                                             'domain': 'column',
                                             'mostly': 1.0}}]


In [126]:

# Save your Expectation Suite

# Get the expectation suite from the Data Assistant Result.
# The suite is built from the expectations held by the Data Assistant Result and
# is given the name passed below.
expectation_suite = data_assistant_result.get_expectation_suite(
    expectation_suite_name="my_custom_expectation_suite_name"  # Your custom name here
)

# Add the expectation suite to your Data Context.
# This call is the last expression of the cell; the recorded output beneath it is
# the suite rendered as JSON.
context.add_or_update_expectation_suite(expectation_suite=expectation_suite)


{
  "expectations": [
    {
      "expectation_type": "expect_column_values_to_not_be_null",
      "meta": {
        "profiler_details": {
          "metric_configuration": {
            "metric_name": "column_values.nonnull.unexpected_count",
            "domain_kwargs": {
              "column": "trip_distance"
            },
            "metric_value_kwargs": null
          },
          "num_batches": 36,
          "mode": "multi_batch"
        }
      },
      "kwargs": {
        "column": "trip_distance",
        "mostly": 1.0
      }
    },
    {
      "expectation_type": "expect_column_values_to_not_be_null",
      "meta": {
        "profiler_details": {
          "metric_configuration": {
            "metric_name": "column_values.nonnull.unexpected_count",
            "domain_kwargs": {
              "column": "pickup_location_id"
            },
            "metric_value_kwargs": null
          },
          "num_batches": 36,
          "mode": "multi_batch"
        }
      },
 

In [127]:

# Point the validator at the generated suite so that the Checkpoint below
# validates against it.
validator.expectation_suite = expectation_suite

# Smoke-test the new Expectation Suite by running it through a Checkpoint.
checkpoint = context.add_or_update_checkpoint(
    validator=validator,
    name="yellow_tripdata_sample_all_years_checkpoint",
)
checkpoint_result = checkpoint.run(run_name="my_run_name")

# Fail the cell unless the run reports overall success.
assert checkpoint_result["success"] is True

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

In [128]:
# To view the metrics that were computed, you can use the `plot_metrics` method:
# (in the recorded run this reports how many metrics were calculated and renders
# an interactive widget with a plot-type dropdown)

data_assistant_result.plot_metrics()

64 Metrics calculated, 16 Metric plots implemented
Use DataAssistantResult.metrics_by_domain to show all calculated Metrics


interactive(children=(Dropdown(description='Select Plot Type: ', layout=Layout(margin='0px', width='max-conten…



In [129]:
# Build the Data Docs; the recorded output maps the site name (`local_site`) to
# the URL of its generated index page.
context.build_data_docs()


{'local_site': 'file:///var/folders/9w/qsyh30616nq8frx877b3pyzm0000gn/T/tmpcga59abh/index.html'}

In [None]:

# Open the built Data Docs for viewing (presumably in the default browser --
# this cell has no execution count, i.e. it was not run in the recorded session).
context.open_data_docs()
