Module imports

In [1]:
import great_expectations as gx
import pandas as pd

Disable GX analytics (sending analytics requires internet access)

In [2]:
# ENV_CONFIG comes from GX's analytics configuration module; setting this flag to
# False switches analytics off for the objects created in this notebook.
# NOTE(review): presumably this has to run before the context is created — confirm.
from great_expectations.analytics.config import ENV_CONFIG
ENV_CONFIG.gx_analytics_enabled = False

Create a pandas DataFrame from a CSV file hosted on GitHub

In [3]:
# Yellow-taxi trip sample (2019-01) from the gx_tutorials repository.
TAXI_SAMPLE_URL = (
    "https://raw.githubusercontent.com/great-expectations/gx_tutorials/"
    "main/data/yellow_tripdata_sample_2019-01.csv"
)

# Read the whole CSV into memory; this is the frame that gets validated later on.
raw_df = pd.read_csv(TAXI_SAMPLE_URL)

# Preview the first five rows (the default for head()).
raw_df.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-15 03:36:12,2019-01-15 03:42:19,1,1.0,1,N,230,48,1,6.5,0.5,0.5,1.95,0.0,0.3,9.75,
1,1,2019-01-25 18:20:32,2019-01-25 18:26:55,1,0.8,1,N,112,112,1,6.0,1.0,0.5,1.55,0.0,0.3,9.35,0.0
2,1,2019-01-05 06:47:31,2019-01-05 06:52:19,1,1.1,1,N,107,4,2,6.0,0.0,0.5,0.0,0.0,0.3,6.8,
3,1,2019-01-09 15:08:02,2019-01-09 15:20:17,1,2.5,1,N,143,158,1,11.0,0.0,0.5,3.0,0.0,0.3,14.8,
4,1,2019-01-25 18:49:51,2019-01-25 18:56:44,1,0.8,1,N,246,90,1,6.5,1.0,0.5,1.65,0.0,0.3,9.95,0.0


Create a GX context

In [4]:
# Create the GX context: the data source, the Expectation Suite and the Validation
# Definition in the following cells are all registered through this object.
# NOTE(review): called without arguments, so the kind of context returned is left to
# GX's defaults — confirm that is what is wanted when the notebook is re-run.
context = gx.get_context()

Connect to the data: create a Data Source, a Data Asset, and a Batch Definition (a description of how data should be retrieved - in our case, by reading a whole DataFrame object), then use it to get a Batch

In [5]:
# The DataFrame is handed over at runtime through these parameters: once here to
# fetch a Batch, and again later when the Validation Definition is run.
batch_parameters = dict(dataframe=raw_df)

# Data Source -> Data Asset -> Batch Definition, each registered under its own name.
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    "batch definition"
)

# Fetch a Batch from the definition using the in-memory DataFrame.
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

Create an Expectation Suite (an object that contains one or more Expectations about the data)

In [6]:
# Register an empty suite with the context; Expectations are attached to it below.
suite = context.suites.add(gx.ExpectationSuite(name="expectations"))

# Passenger counts must fall within 1..6 (both bounds inclusive by default).
passenger_count_in_range = gx.expectations.ExpectColumnValuesToBeBetween(
    column="passenger_count",
    min_value=1,
    max_value=6,
)

# Fares must not be negative; no upper bound is imposed.
fare_is_non_negative = gx.expectations.ExpectColumnValuesToBeBetween(
    column="fare_amount",
    min_value=0,
)

suite.add_expectation(passenger_count_in_range)
# Kept as the last expression so the cell displays the Expectation that was added.
suite.add_expectation(fare_is_non_negative)

ExpectColumnValuesToBeBetween(id='019aa911-70b3-463e-b1c4-26a76e7c29a8', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='fare_amount', mostly=1, row_condition=None, condition_parser=None, min_value=0.0, max_value=None, strict_min=False, strict_max=False)

Create a Validation Definition that connects the Batch Definition (the data to validate) with the Expectation Suite

In [7]:
# Pair the Batch Definition (which data to fetch) with the Expectation Suite
# (which checks to apply), then register the pairing with the context.
unsaved_validation_definition = gx.ValidationDefinition(
    name="validation definition",
    data=batch_definition,
    suite=suite,
)
validation_definition = context.validation_definitions.add(
    unsaved_validation_definition
)

Run the Validation Definition and display the results

In [8]:
# Run the Validation Definition against the DataFrame supplied via batch_parameters.
validation_results = validation_definition.run(batch_parameters=batch_parameters)

# Printing renders the results as JSON: an overall "success" flag followed by one
# entry per Expectation under "results".
print(validation_results)

Calculating Metrics: 100%|██████████| 17/17 [00:00<00:00, 1067.35it/s]

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "pandas-pd dataframe asset",
          "column": "passenger_count",
          "min_value": 1.0,
          "max_value": 6.0
        },
        "meta": {},
        "id": "28e9ab2f-cf56-4184-a8c5-4c9efd6fc145"
      },
      "result": {
        "element_count": 10000,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": false,
 


