# Great Expectations Configuration 

Before running this notebook, make sure your project root has a directory structure similar to the following:

<pre>
├── conf
├── data
└── gx
    ├── checkpoints
    ├── expectations
    ├── great_expectations.yml
</pre>

In [1]:
from pathlib import Path
import os
import pandas as pd

# Move the working directory one level up, to the project root folder.
# NOTE: the path is relative to the current working directory, so re-running
# this cell in the same kernel moves up one more level each time.
os.chdir(str(Path("../")))

# Load the Kedro IPython extension, then run its `%reload_kedro` magic.
# Later cells use a `catalog` name that is never assigned in this notebook —
# presumably it is injected by this step (TODO confirm).
%load_ext kedro.extras.extensions.ipython
%reload_kedro

In [2]:
import great_expectations as ge

# Name of the dataset the expectation suite is built for; it is the key passed
# to `catalog.load(...)` further down.
dataset = "olist_order_items_dataset"

# Metadata passed as the `meta` argument of the expectations added below.
metadata = {"source": dataset, "layer": "raw"}

# Suite name in the form "<dataset>.<layer>.suite". It is derived from the
# values above so that changing the dataset or the layer cannot leave a stale,
# hand-typed suite name behind.
name_suite = f"{metadata['source']}.{metadata['layer']}.suite"

In [8]:
# Load the dataset identified by `dataset` through `catalog`.
data = catalog.load(dataset)

In [10]:
# Obtain the Great Expectations context object.
context = ge.get_context()
# Load the dataset again (same call as the earlier loading cell).
data = catalog.load(dataset)
# Build the Great Expectations dataset from the loaded frame; the
# `expect_*` methods used later in the notebook are called on this object.
df = ge.from_pandas(data)

In [11]:
# Exploratory: list every attribute name available on `df`. The output is long.
dir(df)


[1m[[0m
    [32m'T'[0m,
    [32m'_AXIS_LEN'[0m,
    [32m'_AXIS_ORDERS'[0m,
    [32m'_AXIS_TO_AXIS_NUMBER'[0m,
    [32m'_HANDLED_TYPES'[0m,
    [32m'__abs__'[0m,
    [32m'__add__'[0m,
    [32m'__and__'[0m,
    [32m'__annotations__'[0m,
    [32m'__array__'[0m,
    [32m'__array_priority__'[0m,
    [32m'__array_ufunc__'[0m,
    [32m'__array_wrap__'[0m,
    [32m'__bool__'[0m,
    [32m'__class__'[0m,
    [32m'__contains__'[0m,
    [32m'__copy__'[0m,
    [32m'__dataframe__'[0m,
    [32m'__deepcopy__'[0m,
    [32m'__delattr__'[0m,
    [32m'__delitem__'[0m,
    [32m'__dict__'[0m,
    [32m'__dir__'[0m,
    [32m'__divmod__'[0m,
    [32m'__doc__'[0m,
    [32m'__eq__'[0m,
    [32m'__finalize__'[0m,
    [32m'__floordiv__'[0m,
    [32m'__format__'[0m,
    [32m'__ge__'[0m,
    [32m'__getattr__'[0m,
    [32m'__getattribute__'[0m,
    [32m'__getitem__'[0m,
    [32m'__getstate__'[0m,
    [32m'__gt__'[0m,
    [32m'__hash__'[0m,
   

In [15]:
# Display the loaded frame (the notebook renders a truncated first/last rows view).
data

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.90,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.90,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.00,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.90,18.14
...,...,...,...,...,...,...,...
112645,fffc94f6ce00a00581880bf54a75a037,1,4aa6014eceb682077f9dc4bffebc05b0,b8bc237ba3788b23da09c0f1f3a3288c,2018-05-02 04:11:01,299.99,43.41
112646,fffcd46ef2263f404302a634eb57f7eb,1,32e07fd915822b0765e448c4dd74c828,f3c38ab652836d21de61fb8314b69182,2018-07-20 04:31:48,350.00,36.53
112647,fffce4705a9662cd70adb13d4a31832d,1,72a30483855e2eafc67aee5dc2560482,c3cfdc648177fdbbbb35635a37472c53,2017-10-30 17:14:25,99.90,16.95
112648,fffe18544ffabc95dfada21779c9644f,1,9c422a519119dcad7575db5af1ba540e,2b3e4a2a3ea8e01938cabda2a3e5cc79,2017-08-21 00:04:32,55.99,8.72


In [14]:
# Mean of the `price` column over the loaded data.
data['price'].mean()

[1;36m120.65373901464716[0m

In [16]:
# Columns that additionally get a mean-range expectation when the suite is built.
numerical_cols = ['price', 'freight_value']

## Layers
- raw
- intermediate
- final

In [17]:
# Build and save the expectation suite for `dataset`.
# Start from a fresh context and a freshly loaded frame so the cell can be
# re-run on its own.
context = ge.get_context()
data = catalog.load(dataset)
df = ge.from_pandas(data)

# Add some sample validations, column by column.
for col in df:
    df.expect_column_to_exist(col, meta=metadata)
    df.expect_column_values_to_not_be_null(col, meta=metadata)

    if col in numerical_cols:
        # Compute the reference mean once and reuse it for both bounds: the
        # accepted range is 90%..110% of the mean observed in the current data.
        col_mean = data[col].mean()
        df.expect_column_mean_to_be_between(
            col,
            min_value=col_mean * 0.9,
            max_value=col_mean * 1.1,
            # Tag this expectation with the same source/layer metadata as the
            # other expectations so every entry in the suite is traceable.
            meta=metadata,
        )

# Get the suite definition, keeping expectations that failed on the current
# data (discard_failed_expectations=False).
suite = df.get_expectation_suite(
    discard_failed_expectations=False
)

# Name the suite before saving it.
suite.expectation_suite_name = name_suite

# Save the suite through the context. This is the last expression of the
# cell, so its return value is what the notebook displays.
context.add_or_update_expectation_suite(
    expectation_suite=suite
)


[1m{[0m
  [32m"expectation_suite_name"[0m: [32m"olist_order_items_dataset.raw.suite"[0m,
  [32m"ge_cloud_id"[0m: null,
  [32m"expectations"[0m: [1m[[0m
    [1m{[0m
      [32m"expectation_type"[0m: [32m"expect_column_to_exist"[0m,
      [32m"kwargs"[0m: [1m{[0m
        [32m"column"[0m: [32m"order_id"[0m
      [1m}[0m,
      [32m"meta"[0m: [1m{[0m
        [32m"source"[0m: [32m"olist_order_items_dataset"[0m,
        [32m"layer"[0m: [32m"raw"[0m
      [1m}[0m
    [1m}[0m,
    [1m{[0m
      [32m"expectation_type"[0m: [32m"expect_column_values_to_not_be_null"[0m,
      [32m"kwargs"[0m: [1m{[0m
        [32m"column"[0m: [32m"order_id"[0m
      [1m}[0m,
      [32m"meta"[0m: [1m{[0m
        [32m"source"[0m: [32m"olist_order_items_dataset"[0m,
        [32m"layer"[0m: [32m"raw"[0m
      [1m}[0m
    [1m}[0m,
    [1m{[0m
      [32m"expectation_type"[0m: [32m"expect_column_to_exist"[0m,
      [32m"kwargs"[0m: [1m{[

## Create Checkpoint

- https://docs.greatexpectations.io/docs/reference/checkpoints_and_actions
- https://docs.greatexpectations.io/docs/guides/validation/checkpoints/how_to_create_a_new_checkpoint
- https://docs.greatexpectations.io/docs/guides/validation/checkpoints/how_to_configure_a_new_checkpoint_using_test_yaml_config

In [8]:
# Show the data asset names currently available in the context.
context.get_available_data_asset_names()

[1m{[0m[32m'pandas_dataframe'[0m: [1m{[0m[32m'default_runtime_data_connector_name'[0m: [1m[[0m[1m][0m[1m}[0m[1m}[0m

In [9]:
# Show the names of the expectation suites known to the context.
context.list_expectation_suite_names()

[1m[[0m[32m'olist_order_items_dataset.intermediate.suite'[0m[1m][0m