# Great Expectations Configuration for MY_TABLE_NAME
Define the expectations to test against your table.

### Imports and Installs

In [0]:
%pip install pyyaml -i https://artifactory.healthpartners.com/artifactory/api/pypi/python-hosted-remote/simple
%pip install great-expectations -i https://artifactory.healthpartners.com/artifactory/api/pypi/python-hosted-remote/simple

In [0]:
from Databricks.SharedModules.gx_tool import SimpleGXTool
from Databricks.SharedModules.general import get_catalog_suffix

### Target Environment

In [0]:
# Environment whose catalog suffix is resolved further down. It is pinned to
# "dev" here; the widget lookup is kept (disabled) for parameterized runs.
# target_environment = dbutils.widgets.get('target_environment')
target_environment = 'dev'

### Create Expectation Suites
Create a new expectation suite for your table. You can have as many expectation suites per table as you want, but you generally want one expectation suite per alert level that you plan to use. For example, if you want your job to fail on every expectation violation, you only need one expectation suite at alert level 'error'. If you also want a warning expectation suite, you'd have two suites: one at 'error' and one at 'warning'. Your new expectation suite(s) will be named: {catalog}\_{schema}\_{table}\_expectation\_suite\_{alert\_level}.

You can find your new expectation suite(s) in the Great Expectations Cloud UI: https://app.greatexpectations.io/organizations/healthpartners/expectation-suites.

In [0]:
# PyYAML is already installed by the %pip cell under "Imports and Installs",
# so the redundant `!pip install pyyaml` shell call was dropped: `!pip` runs in
# a subshell and is not guaranteed to target this notebook's Python environment.
import pandas as pd
# NOTE(review): these wildcard imports place every pyspark.sql.functions name
# in the notebook namespace, shadowing builtins such as sum/min/max/round.
# They are kept because other cells may rely on the names they provide.
from pyspark.sql.types import *
from pyspark.sql.functions import *
import yaml
import json

# Load the table's index.yaml; the path is relative to the working directory.
with open('index.yaml', 'r', encoding='utf-8') as f:
    index_yaml = yaml.safe_load(f)

# safe_load returns None for an empty file; fail here with a clear message
# rather than with a TypeError when the keys are read.
if not isinstance(index_yaml, dict):
    raise ValueError(
        "index.yaml must contain a top-level mapping, got {kind}".format(
            kind=type(index_yaml).__name__
        )
    )

In [0]:
# Catalog zone only -- leave off the environment suffix (_dev / _npd).
zone = 'standardized'
# Schema and table name are read from the parsed index.yaml.
schema, table = index_yaml['schema'], index_yaml['table']['name']

In [0]:
# Build the GX helper for this table. The catalog name is the zone followed by
# the suffix that get_catalog_suffix returns for the target environment.
gx_tool = SimpleGXTool(
    catalog=f'{zone}{get_catalog_suffix(target_environment)}',
    schema=schema,
    table=table,
)

In [0]:
# Create one suite per alert level. Each handle gets its own name so the
# 'error' suite is not lost when the 'warning' suite is created.
error_expectation_suite = gx_tool.add_expectation_suite(alert_level='error')
warning_expectation_suite = gx_tool.add_expectation_suite(alert_level='warning')
# Backward-compatible alias: as before, `expectation_suite` ends up holding the
# most recently created ('warning') suite.
expectation_suite = warning_expectation_suite

In [0]:
# The names are a bit cumbersome, so store them in variables for easy access
# when creating expectations.
def _expectation_suite_name(alert_level):
    """Return the suite name for ``alert_level``.

    The name has the form
    ``{zone}{suffix}_{schema}_{table}_expectation_suite_{alert_level}``, where
    ``suffix`` is ``get_catalog_suffix(target_environment)``. ``zone``,
    ``schema``, ``table`` and ``target_environment`` are read from the notebook
    namespace at call time.
    """
    return '{zone}{suffix}_{schema}_{table}_expectation_suite_{alert_level}'.format(
        zone=zone,
        suffix=get_catalog_suffix(target_environment),
        schema=schema,
        table=table,
        alert_level=alert_level,
    )


error_suite_name = _expectation_suite_name('error')
warning_suite_name = _expectation_suite_name('warning')

# Show both names (previously only the error suite name was printed).
print(error_suite_name)
print(warning_suite_name)

### Add Expectations
You can find expectations in the GX Expectation Gallery (https://greatexpectations.io/expectations/). Take a look at the expect_column_distinct_values_to_be_in_set expectation here: https://greatexpectations.io/expectations/expect_column_distinct_values_to_be_in_set. Note how the args are added to the expectation_config_kwargs in the example below. The optional keyword args can be added in the same way; they were left out of the example.

In [0]:
# ---------------------------------------------------------------------------
# Error-level expectations: a violation fails the job.
# ---------------------------------------------------------------------------
# Template showing how the expectation's args go into expectation_config_kwargs:
# gx_tool.add_expectation(
#     expectation_suite_name=error_suite_name,
#     expectation_type='expect_column_distinct_values_to_be_in_set',
#     expectation_config_kwargs={'column': 'race_length',
#                                'value_set': ['100m', '5000m', '10000m', 'marathon']},
# )

# Require PAID_PERIOD_ID to be non-null.
gx_tool.add_expectation(
    expectation_suite_name=error_suite_name,
    expectation_type='expect_column_values_to_not_be_null',
    expectation_config_kwargs={'column': 'PAID_PERIOD_ID'},
)

# ---------------------------------------------------------------------------
# Warning-level expectations: a violation notifies the team.
# ---------------------------------------------------------------------------
# Template (disabled):
# gx_tool.add_expectation(
#     expectation_suite_name=warning_suite_name,
#     expectation_type='expect_table_columns_to_match_set',
#     expectation_config_kwargs={'column_set': ['race_length', 'record_holder'],
#                                'exact_match': True},
# )