In [0]:
%pip install great_expectations

Collecting great_expectations
  Obtaining dependency information for great_expectations from https://files.pythonhosted.org/packages/0a/09/0d14dfdada5d9710b26cc1220b5b85b0eaa2ad24938ec4cec6d36412e52c/great_expectations-1.4.6-py3-none-any.whl.metadata
  Downloading great_expectations-1.4.6-py3-none-any.whl.metadata (8.8 kB)
Collecting altair<5.0.0,>=4.2.1 (from great_expectations)
  Obtaining dependency information for altair<5.0.0,>=4.2.1 from https://files.pythonhosted.org/packages/18/62/47452306e84d4d2e67f9c559380aeb230f5e6ca84fafb428dd36b96a99ba/altair-4.2.2-py3-none-any.whl.metadata
  Downloading altair-4.2.2-py3-none-any.whl.metadata (13 kB)
Collecting jinja2>=3 (from great_expectations)
  Obtaining dependency information for jinja2>=3 from https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl.metadata
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting jsonschema>=2.5.1 (from g

In [0]:
%sh
# Show what currently exists under /dbfs/tmp.
ls /dbfs/tmp

pytest_results.xml


In [0]:
%sh
# Create the working directory for the Great Expectations input files.
# -p makes the cell safe to re-run: it does not fail when the directory
# already exists.
mkdir -p /dbfs/tmp/great_expectations

In [0]:
%sh
# Copy transactions.csv from the current working directory into the working directory.
# NOTE(review): the notebook later reads orders.csv from this same directory, but
# no visible cell copies it there -- confirm how orders.csv is staged so that a
# fresh run does not depend on leftovers from an earlier session.
cp transactions.csv /dbfs/tmp/great_expectations

In [0]:
%sh
# Check which input files are present in the working directory.
ls /dbfs/tmp/great_expectations

orders.csv
transactions.csv


In [0]:
import great_expectations as ge

# Obtain the Great Expectations data context that every later cell registers
# data sources on.
context = ge.get_context()

# Report the concrete context class that was returned.
print(context.__class__.__name__)

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpht17wm4i' for ephemeral docs site


EphemeralDataContext


In [0]:
# Load the orders file: the first row provides the column names and Spark
# infers the column types from the data.
orders_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/tmp/great_expectations/orders.csv")
)

# Preview the first five rows.
orders_df.show(5)

+--------------+-----------+-----------+---------+------------+-------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|      order_id|customer_id|  ship_mode|vendor_id|order_status|order_purchase_date|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------+-----------+-----------+---------+------------+-------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|CA-2014-101147|       NULL|First Class|    VEN02|   delivered|   16-12-2017 22:28|       2017-12-16|            12-19-17 8:32 PM|             12-27-17 6:03 PM|                   2018-01-18|
|CA-2014-101476|   SD-20485|First Class|    VEN04|   delivered|   23-01-2017 13:40|       2017-01-25|            1-26-17 11:26 AM|              1-30-17 8:42 AM|                   2017-02-28|
|CA-2014-101602|       NULL|First Class|    V

In [0]:
# Names under which the objects below are registered.
data_source_name = "sales_department"
data_asset_name = "raw_orders"
batch_definition_name = "my_batch_definition"

# Build the chain: Spark data source -> dataframe asset -> whole-dataframe
# batch definition.
# NOTE(review): add_spark presumably raises if a data source with this name is
# already registered on the context, so re-running this cell in the same
# session may fail -- TODO confirm.
data_source = context.data_sources.add_spark(name=data_source_name)
data_asset = data_source.add_dataframe_asset(name=data_asset_name)
batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

# The dataframe itself is handed over later, when a batch is requested.
batch_parameters = dict(dataframe=orders_df)

In [0]:
# Data-quality rules for the raw orders data. Each object below is only a rule
# definition; it is evaluated when it is passed to batch.validate().

# Every order_id must appear at most once.
no_duplicate_orders = ge.expectations.ExpectColumnValuesToBeUnique(
    column="order_id"
)

# order_id must always be populated.
no_null_order_id = ge.expectations.ExpectColumnValuesToNotBeNull(
    column="order_id"
)

# customer_id must always be populated.
no_null_customer_id = ge.expectations.ExpectColumnValuesToNotBeNull(
    column="customer_id"
)

# order_id must look like CA-2014-108189: two capitals, four digits, six digits.
# The pattern is a raw string so that "\d" reaches the regex engine verbatim;
# in a normal string literal "\d" is an invalid escape sequence and Python
# emits a warning for it.
order_id_format = ge.expectations.ExpectColumnValuesToMatchRegex(
    column="order_id",
    regex=r"^[A-Z]{2}-\d{4}-\d{6}$",
)

# Delivery to the customer must not precede the purchase; rows where both
# values are missing are skipped.
# NOTE(review): the dataframe preview shows these two columns as text in
# different formats (e.g. "16-12-2017 22:28" vs "12-27-17 6:03 PM"). Comparing
# them as-is is presumably not a chronological comparison -- consider parsing
# both columns to timestamps before validating. TODO confirm.
date_logic_validation = ge.expectations.ExpectColumnPairValuesAToBeGreaterThanB(
    column_A="order_delivered_customer_date",
    column_B="order_purchase_date",
    or_equal=True,
    ignore_row_if="both_values_are_missing",
)

# order_status must be one of the known lifecycle states.
valid_status = ge.expectations.ExpectColumnValuesToBeInSet(
    column="order_status",
    value_set=["pending", "confirmed", "shipped", "delivered", "cancelled"],
)

  regex="^[A-Z]{2}-\d{4}-\d{6}$"  # Format: CA-2014-108189


In [0]:
# Try out a single expectation: request a batch for the orders dataframe,
# validate the order_id uniqueness rule against it, and print the full result.
batch = batch_definition.get_batch(batch_parameters=batch_parameters)
validation_results = batch.validate(no_duplicate_orders)
print(validation_results)

  self.comm = Comm(**args)


Calculating Metrics:   0%|          | 0/12 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "type": "expect_column_values_to_be_unique",
    "kwargs": {
      "batch_id": "sales_department-raw_orders",
      "column": "order_id"
    },
    "meta": {}
  },
  "result": {
    "element_count": 5015,
    "unexpected_count": 9,
    "unexpected_percent": 0.1794616151545364,
    "partial_unexpected_list": [
      "CA-2014-102652",
      "CA-2014-102652",
      "CA-2014-102652",
      "CA-2014-103219",
      "CA-2014-103219",
      "CA-2014-103219",
      "CA-2014-103527",
      "CA-2014-103527",
      "CA-2014-103527"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.1794616151545364,
    "unexpected_percent_nonmissing": 0.1794616151545364,
    "partial_unexpected_counts": [
      {
        "value": "CA-2014-102652",
        "count": 3
      },
      {
        "value": "CA-2014-103219",
        "count": 3
      },
      {
        "value": "CA-2014-103527",
        "count": 3
      }
    ]
 

In [0]:
# Run every expectation against the orders data and collect the results.
expectations = [
    no_duplicate_orders,
    no_null_order_id,
    no_null_customer_id,
    order_id_format,
    date_logic_validation,
    valid_status,
]

# The batch depends only on batch_parameters, not on the expectation, so it is
# requested once up front instead of once per loop iteration.
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

# One validation result per expectation, in the same order as `expectations`.
report = []
for expectation in expectations:
    validation_results = batch.validate(expectation)
    report.append(validation_results)

  self.comm = Comm(**args)


Calculating Metrics:   0%|          | 0/12 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]

In [0]:
# Dump every collected validation result in full (one entry per expectation).
print(report)

[{
  "success": false,
  "expectation_config": {
    "type": "expect_column_values_to_be_unique",
    "kwargs": {
      "batch_id": "sales_department-raw_orders",
      "column": "order_id"
    },
    "meta": {}
  },
  "result": {
    "element_count": 5015,
    "unexpected_count": 9,
    "unexpected_percent": 0.1794616151545364,
    "partial_unexpected_list": [
      "CA-2014-102652",
      "CA-2014-102652",
      "CA-2014-102652",
      "CA-2014-103219",
      "CA-2014-103219",
      "CA-2014-103219",
      "CA-2014-103527",
      "CA-2014-103527",
      "CA-2014-103527"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.1794616151545364,
    "unexpected_percent_nonmissing": 0.1794616151545364,
    "partial_unexpected_counts": [
      {
        "value": "CA-2014-102652",
        "count": 3
      },
      {
        "value": "CA-2014-103219",
        "count": 3
      },
      {
        "value": "CA-2014-103527",
        "count": 3
      }
    ]
