=================================================

Milestone 3

Name  : Destriana Ramadani
Batch : FTDS-028-RMT

This program validates the cleaned dataset and checks its data quality using Great Expectations.

=================================================

In [1]:
# Initialise a file-based Great Expectations data context rooted in the
# current working directory ('./').
from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

In [2]:
# Register a pandas Datasource. The name must be unique among Datasources;
# add_or_update_* keeps this cell re-runnable after the datasource has been
# saved to the project config by an earlier run (same pattern as
# add_or_update_expectation_suite used for the suite).
datasource_name = 'datasource'
datasources = context.sources.add_or_update_pandas(name=datasource_name)

# Register the cleaned CSV as a data asset.
# A forward slash avoids the invalid '\P' escape sequence and keeps the
# relative path usable on Windows, Linux and macOS alike.
asset_name = 'house-price'
path_to_data = 'dags/P2M3_destriana_ramadani_data_clean.csv'
asset = datasources.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator will use to load this asset
batch_request = asset.build_batch_request()

In [3]:
# Create the expectation suite that will collect the checks below
expectation_suite_name = 'expectation-price-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Tie the batch request and the suite together through a validator
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Preview the first rows to confirm the data was loaded
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,house_id,suburb,address,rooms,type,price,method,sellerg,date,distance,...,bathroom,car,landsize,buildingarea,yearbuilt,councilarea,lattitude,longtitude,regionname,propertycount
0,1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,2016-02-04,2.5,...,1,0,156.0,79.0,1900,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
1,2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,2017-03-04,2.5,...,2,0,134.0,150.0,1900,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
2,4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,2016-06-04,2.5,...,1,2,120.0,142.0,2014,Yarra,-37.8072,144.9941,Northern Metropolitan,4019
3,6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,2016-05-07,2.5,...,2,0,245.0,210.0,1910,Yarra,-37.8024,144.9993,Northern Metropolitan,4019
4,7,Abbotsford,98 Charles St,2,h,1636000.0,S,Nelson,2016-10-08,2.5,...,1,2,256.0,107.0,1890,Yarra,-37.806,144.9954,Northern Metropolitan,4019


In [4]:
# Expectation 1: every value in column `house_id` must be unique
validator.expect_column_values_to_be_unique(column='house_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 6196,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [5]:
# Expectation 2: values in column `rooms` must lie between 1 and 10
validator.expect_column_values_to_be_between(
    column='rooms',
    min_value=1,
    max_value=10,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 6196,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# Expectation 3: every value in column `bathroom` must be one of 1, 2, ..., 8
validator.expect_column_values_to_be_in_set(
    column='bathroom',
    value_set=list(range(1, 9)),
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 6196,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 4: column `price` must have an integer or float type
# NOTE(review): 'integer' may not be a type name the pandas engine recognises
# (numpy/pandas spell it 'int' / 'int64'); the recorded run passed on
# 'float64' only — confirm before relying on this for integer columns.
validator.expect_column_values_to_be_in_type_list(
    column='price',
    type_list=['integer', 'float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 5: column `date` must hold date strings in '%Y-%m-%d' format
validator.expect_column_values_to_match_strftime_format(
    column='date',
    strftime_format='%Y-%m-%d',
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 6196,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Expectation 6: the distinct values of column `type` must equal the set
# {'h', 'u', 't'}
validator.expect_column_distinct_values_to_equal_set(
    column='type',
    value_set=['h', 'u', 't'],
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "result": {
    "observed_value": [
      "h",
      "t",
      "u"
    ],
    "details": {
      "value_counts": [
        {
          "value": "h",
          "count": 4088
        },
        {
          "value": "t",
          "count": 602
        },
        {
          "value": "u",
          "count": 1506
        }
      ]
    }
  },
  "meta": {},
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation 7: every entry in column `type` must be a string of length 1
validator.expect_column_value_lengths_to_equal(column='type', value=1)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 6196,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}