# 1. Instantiate Data Context

In [1]:
# Create the Great Expectations FileDataContext rooted at the current working directory

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(
    project_root_dir='./',
)

# 2. Connect to a datasource

In [None]:
# Register a pandas datasource. The name must be unique among the datasources
# known to this data context.
datasource_name = 'csv-projekfinal'
datasource = context.sources.add_pandas(datasource_name)

# Register the cleaned CSV as a data asset of that datasource.
# Forward slashes are used on purpose: the previous '.\dags\...' literal contained
# the invalid escape sequence '\d' (a SyntaxWarning on Python 3.12+) and could only
# be resolved on Windows; './dags/...' resolves on both Windows and POSIX systems.
asset_name = 'komoditas-pangan'
path_to_data = './dags/data_clean_interpolasi.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used later to obtain a validator for this asset
batch_request = asset.build_batch_request()

# 3. Create an Expectation Suite

In [3]:
# Create the expectation suite (or update it if the name is already registered)
expectation_suite_name = 'expectation-projekfinal-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Obtain a validator that pairs the batch request with the suite above
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Preview the first rows of the batch as a quick sanity check
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,date,komoditas,provinsi,harga
0,2021-02-18,Beras Medium,Jawa Timur,10600.0
1,2021-02-19,Beras Medium,Jawa Timur,10594.736842
2,2021-02-20,Beras Medium,Jawa Timur,10589.473684
3,2021-02-21,Beras Medium,Jawa Timur,10584.210526
4,2021-02-22,Beras Medium,Jawa Timur,10578.947368


# Expectations
### 1. To be Unique

Dalam dataset ini, kolom tanggal dan komoditas tidak bersifat unik karena masing-masing mewakili satu dimensi informasi. Kolom tanggal mencerminkan waktu pencatatan harga, sehingga wajar jika banyak entri memiliki tanggal yang sama, terutama ketika data dikumpulkan dari berbagai provinsi dan komoditas. Demikian pula, komoditas hanya menunjukkan jenis barang, bukan identitas unik untuk tiap baris data. Oleh karena itu, keunikan baris tidak bisa dilihat dari satu kolom saja, melainkan dari kombinasi beberapa kolom seperti komoditas, provinsi, dan tanggal.

Oleh karena itu, perlu dibuat kolom baru yang menggabungkan kolom komoditas, provinsi, dan tanggal untuk memastikan bahwa dataset ini benar-benar tidak memiliki baris duplikat.

In [4]:
# Work on a copy: adding a column directly to the DataFrame held by the
# validator's active batch would silently change the data that every later
# expectation on `validator` runs against.
df = validator.active_batch.data.dataframe.copy()

# Add a `unique_key` column built from the combination of komoditas, provinsi and date.
# Every part is cast to str so the three components are handled the same way.
df['unique_key'] = (
    df['komoditas'].astype(str) + "_" +
    df['provinsi'].astype(str) + "_" +
    df['date'].astype(str)
)

# Build a separate validator from the augmented DataFrame
temp_validator = context.sources.pandas_default.read_dataframe(df)

# Run the expectation to confirm that every row is unique on the composite key
temp_validator.expect_column_values_to_be_unique('unique_key')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 37440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

### 2. To be not null

In [5]:
# Not-null check: column `harga` must not contain missing values

validator.expect_column_values_to_not_be_null(column='harga')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 37440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "success": true
}

In [6]:
# Not-null check: column `komoditas` must not contain missing values

validator.expect_column_values_to_not_be_null(column='komoditas')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 37440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "success": true
}

In [7]:
# Not-null check: column `date` must not contain missing values

validator.expect_column_values_to_not_be_null(column='date')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 37440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "success": true
}

In [8]:
# Not-null check: column `provinsi` must not contain missing values

validator.expect_column_values_to_not_be_null(column='provinsi')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 37440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "success": true
}

### 3. To be in set

In [9]:
# Every `provinsi` value must be one of the six provinces listed here
validator.expect_column_values_to_be_in_set(
    column='provinsi',
    value_set=[
        'Banten',
        'DKI Jakarta',
        'Jawa Barat',
        'Jawa Tengah',
        'Jawa Timur',
        'D.I Yogyakarta',
    ],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 37440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

### 4. To match strftime format

In [10]:
# Every `date` value must be formatted as year-month-day (e.g. 2021-02-18)
validator.expect_column_values_to_match_strftime_format(
    column='date',
    strftime_format="%Y-%m-%d",
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 37440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

### 5. To be between

In [None]:
# `harga` must fall within 7000..52000; the bounds sit slightly outside the
# observed price range (about 7,480 to 50,000) described in the note below.
validator.expect_column_values_to_be_between(column='harga', min_value=7000, max_value=52000)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 37440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

Berdasarkan distribusi statistik, rentang harga komoditas berkisar antara 7.480 hingga 50.000. Oleh karena itu, batas validasi ditetapkan dari 7.000 hingga 52.000 untuk mencakup nilai sah dan mendeteksi anomali ekstrem tanpa memicu false positive.

### 6. To equal set

In [13]:
# The distinct `komoditas` values must be exactly these four commodities
validator.expect_column_distinct_values_to_equal_set(
    column='komoditas',
    value_set=['Beras Medium', 'Daging Ayam Ras', 'Telur Ayam Ras', 'Minyak Goreng Kemasan'],
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": [
      "Beras Medium",
      "Daging Ayam Ras",
      "Minyak Goreng Kemasan",
      "Telur Ayam Ras"
    ],
    "details": {
      "value_counts": [
        {
          "value": "Beras Medium",
          "count": 9360
        },
        {
          "value": "Daging Ayam Ras",
          "count": 9360
        },
        {
          "value": "Minyak Goreng Kemasan",
          "count": 9360
        },
        {
          "value": "Telur Ayam Ras",
          "count": 9360
        }
      ]
    }
  },
  "success": true
}