In [49]:
import great_expectations as gx
import pandas as pd
from great_expectations.data_context import FileDataContext
from great_expectations.dataset.pandas_dataset import PandasDataset

## Create data context

In [50]:
context = FileDataContext(project_root_dir="../services")

## Create data source

In [51]:
ds1 = context.sources.add_or_update_pandas(name="my_pandas_ds")
ds1

PandasDatasource(type='pandas', name='my_pandas_ds', id=None, assets=[])

## Create data assets

In [52]:
da1 = ds1.add_csv_asset(
    name = "asset01",
    filepath_or_buffer="../data/raw/Equity_Apartments_Data.csv"
)

In [53]:
batch_request = da1.build_batch_request()
batches = da1.get_batch_list_from_batch_request(batch_request)
for batch in batches:
    print(batch.batch_spec)

{'reader_method': 'read_csv', 'reader_options': {'filepath_or_buffer': PosixPath('../data/raw/Equity_Apartments_Data.csv')}}


## Create expectations

In [54]:
context.add_or_update_expectation_suite("my_expectation_suite")

{
  "expectation_suite_name": "my_expectation_suite",
  "ge_cloud_id": null,
  "expectations": [],
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.18.15"
  }
}

In [55]:
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="my_expectation_suite"
)

In [56]:
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,Price,Beds,Baths,sq.ft,Floor,Move_in_date,building_id,unit_id,URL,...,Fireplace,City_Skyline,Kitchen_Island,Stainless_Appliances,Renovated,Office_Space,Days_Till_Available,Day_of_the_week_recorded,Unique_ID,Estiamted_Vacancy
0,1,2377,0,1.0,523,5,2021-09-02,1,0507,https://www.equityapartments.com/washington-dc...,...,0.0,0.0,0.0,1.0,1.0,0.0,47.0,Wednesday,0105071210MassApartments,0.020833
1,2,2816,1,1.0,713,2,2021-09-13,1,0204,https://www.equityapartments.com/washington-dc...,...,0.0,0.0,0.0,1.0,1.0,0.0,58.0,Wednesday,0102041210MassApartments,0.020833
2,3,3811,2,2.0,1252,6,2021-10-08,1,0608,https://www.equityapartments.com/washington-dc...,...,1.0,0.0,0.0,0.0,0.0,0.0,83.0,Wednesday,0106081210MassApartments,0.020833
3,4,1549,0,1.0,456,2,2021-09-17,1,256 \r\n,https://www.equityapartments.com/washington-dc...,...,0.0,0.0,0.0,0.0,0.0,0.0,62.0,Wednesday,1256\r\n1500MassApartments,0.003597
4,5,1753,1,1.0,580,3,2021-10-05,1,337 \r\n,https://www.equityapartments.com/washington-dc...,...,0.0,0.0,0.0,0.0,0.0,0.0,80.0,Wednesday,1337\r\n1500MassApartments,0.003597


#### Expectation for column Price


In [57]:
validator.expect_column_values_to_be_between(column="Price", min_value=0)
validator.expect_column_values_to_not_be_null(column="Price")
validator.expect_column_values_to_be_of_type(column="Price", type_="int")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

#### Expectation for column Beds

In [58]:
validator.expect_column_values_to_be_between(column="Beds", min_value=0, max_value=10)
validator.expect_column_values_to_not_be_null(column="Beds")
validator.expect_column_values_to_match_regex(column="Beds", regex="^\d+$")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 62810,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

#### Expectation for column Baths

In [59]:
validator.expect_column_values_to_be_between(column="Baths", min_value=0, max_value=10)
validator.expect_column_values_to_not_be_null(column="Baths")
validator.expect_column_values_to_match_regex(column="Baths", regex="^\d+$")


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 62810,
    "unexpected_count": 62810,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      1.0,
      1.0,
      2.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      2.0,
      1.0,
      1.0,
      2.0,
      1.0,
      1.0,
      2.0
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

#### Expectation for column sq.ft

In [60]:
validator.expect_column_values_to_be_between(column="sq.ft", min_value=0)
validator.expect_column_values_to_not_be_null(column="sq.ft")
validator.expect_column_values_to_match_regex(column="sq.ft", regex="^\d+(\.\d{1})?$")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 62810,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

#### Expectation for column Floor

In [61]:
validator.expect_column_values_to_be_between(column="Floor", min_value=0, max_value=100)
validator.expect_column_values_to_not_be_null(column="Floor")
validator.expect_column_values_to_match_regex(column="Floor", regex="^\d+$")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 62810,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

#### Expectation for column Move_in_date

In [62]:
validator.expect_column_values_to_match_strftime_format(column="Move_in_date", strftime_format="%Y-%m-%d")
validator.expect_column_values_to_not_be_null(column="Move_in_date")
validator.expect_column_values_to_be_between(
    column="Move_in_date",
    min_value="2021-06-25",
    max_value="2021-07-17"
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 62810,
    "unexpected_count": 39033,
    "unexpected_percent": 62.9341201509142,
    "partial_unexpected_list": [
      "2021-09-02",
      "2021-09-13",
      "2021-10-08",
      "2021-09-17",
      "2021-10-05",
      "2021-08-05",
      "2021-09-14",
      "2021-08-19",
      "2021-09-29",
      "2021-08-31",
      "2021-08-04",
      "2021-09-15",
      "2021-07-21",
      "2021-08-05",
      "2021-07-30",
      "2021-09-16",
      "2021-08-05",
      "2021-08-10",
      "2021-08-17",
      "2021-08-28"
    ],
    "missing_count": 788,
    "missing_percent": 1.2545772966088202,
    "unexpected_percent_total": 62.14456296768031,
    "unexpected_percent_nonmissing": 62.9341201509142
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

#### Expectation for column unit_id

In [63]:
validator.expect_column_values_to_match_regex(column="unit_id", regex="^[a-zA-Z0-9]+$")
validator.expect_column_values_to_not_be_null(column="unit_id")
validator.expect_column_values_to_be_unique(column="unit_id")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 62810,
    "unexpected_count": 61921,
    "unexpected_percent": 99.94028212660189,
    "partial_unexpected_list": [
      "0507",
      "0204",
      "0608",
      "256 \r\n",
      "337 \r\n",
      "423 \r\n",
      "512 \r\n",
      "812 \r\n",
      "802 \r\n",
      "0101",
      "0306",
      "0426",
      "0831",
      "0702",
      "126 \r\n",
      "131 \r\n",
      "508 \r\n",
      "1109",
      "801 \r\n",
      "817 \r\n"
    ],
    "missing_count": 852,
    "missing_percent": 1.3564718993790799,
    "unexpected_percent_total": 98.58462028339436,
    "unexpected_percent_nonmissing": 99.94028212660189
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

#### Save expectation suit

In [64]:
validator.save_expectation_suite(discard_failed_expectations=False)
context.build_data_docs()
context.open_data_docs()