# Great Expectation

---

Notebook ini digunakan untuk proses evaluasi great expectations terhadap clean data

---

In [1]:
# Import libraries

import pandas as pd
import great_expectations as ge
from great_expectations.checkpoint import SimpleCheckpoint

In [2]:
# Load GX context
context = ge.get_context()

# Add datasource
datasource = context.sources.add_or_update_pandas(name="shoes_datasource")

# Load CSV and make asset
csv_file = "./dataset/data_clean_raw_reviews.csv"  
asset = datasource.add_csv_asset(name="shoes_asset", filepath_or_buffer=csv_file)

# Batch Request
batch_request = asset.build_batch_request()

# Expectation Suite
suite_name = "shoes_suite"
suite = context.add_or_update_expectation_suite(expectation_suite_name=suite_name)

# Create Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name)

# checking expectations

In [3]:
# No missing value

validator.expect_column_values_to_not_be_null('product')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 609,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [4]:
# in between min and max value

validator.expect_column_values_to_be_between('rating', min_value=1, max_value=5)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 609,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [5]:
# rating must exist

validator.expect_column_to_exist(column='rating')

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# brand must be in the set

valid_brand = ['Adidas', 'Nike', 'Reebok', 'New Balance', 'PUMA']
validator.expect_column_values_to_be_in_set('brand', valid_brand)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 609,
    "unexpected_count": 128,
    "unexpected_percent": 21.01806239737274,
    "partial_unexpected_list": [
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma",
      "Puma"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 21.01806239737274,
    "unexpected_percent_nonmissing": 21.01806239737274
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# price must be numerical

validator.expect_column_values_to_be_in_type_list('price', ['integer', 'float'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# regex must be matched

validator.expect_column_values_to_match_regex(
    'product',
    r'(?i)^(Adidas|New Balance|Nike|PUMA|Reebok)$'
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 609,
    "unexpected_count": 608,
    "unexpected_percent": 99.83579638752053,
    "partial_unexpected_list": [
      "Adidas Fortarun 2.0 Cloudfoam Lace",
      "Adidas Men's Lightblaze",
      "Adidas Men's Lightshift",
      "Adidas Men's Run Falcon 5",
      "Adidas Men's X_PLR Path",
      "Adidas Unisex-Adult Ultraboost 1.0",
      "Adidas Women's Ultrarun 5",
      "Adidas Womens Swift Run 22",
      "NIKE Men's Air Flight Lite Mid",
      "NIKE Men's Backboard II Mid Basketball",
      "NIKE Men's Low",
      "NIKE Men's RunAllDay",
      "NIKE Men's Sport",
      "NIKE Women's Tanjun",
      "NIKE Women's W Zoom Bella 6 Trainers",
      "NIKE Women's WMNS Zoomx Ultrafly",
      "New Balance Baby-Boy's 530 Bungee",
      "New Balance Boy's 574 V1 Lace-up",
      "New Balance Girl's",
      "New Balance Logic Composite Toe for Men - Non-Slip"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percen

In [9]:
# maximum value of rating

validator.expect_column_max_to_be_between('rating', 5)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 5.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# saving report

In [10]:
# Save into Expectation Suite
validator.save_expectation_suite(discard_failed_expectations=False)

In [11]:
# Create a Checkpoint
checkpoint = SimpleCheckpoint(
    name="shoes_checkpoint",
    data_context=context,
    validator=validator
)

result = checkpoint.run()
print(result)

Calculating Metrics:   0%|          | 0/28 [00:00<?, ?it/s]

{
  "run_id": {
    "run_name": null,
    "run_time": "2025-10-09T20:54:51.890690+07:00"
  },
  "run_results": {
    "ValidationResultIdentifier::shoes_suite/__none__/20251009T135451.890690Z/shoes_datasource-shoes_asset": {
      "validation_result": {
        "success": false,
        "results": [
          {
            "success": true,
            "expectation_config": {
              "expectation_type": "expect_column_values_to_not_be_null",
              "kwargs": {
                "column": "product",
                "batch_id": "shoes_datasource-shoes_asset"
              },
              "meta": {}
            },
            "result": {
              "element_count": 609,
              "unexpected_count": 0,
              "unexpected_percent": 0.0,
              "partial_unexpected_list": [],
              "partial_unexpected_counts": [],
              "partial_unexpected_index_list": []
            },
            "meta": {},
            "exception_info": {
              "raise