In [1]:
import great_expectations as gx
from great_expectations.data_context import FileDataContext
import pandas as pd
import os

# Initialize the DataContext: construct a FileDataContext only when the
# project's gx folder does not exist yet, then load the context.
project_root = "../services"
if not os.path.exists(f"{project_root}/gx"):
    FileDataContext(project_root_dir=project_root)
context = gx.get_context(project_root_dir=project_root)

# Add or update the pandas datasource and attach the sample CSV as an asset.
ds = context.sources.add_or_update_pandas(name="pandas_datasource1")
da1 = ds.add_csv_asset(name="csv_file1", filepath_or_buffer="../data/dvc/sample.csv")

# Batch request for the CSV asset.
batch_request = da1.build_batch_request()

# Create (or replace) the expectation suite that will hold the checks.
suite_name = "initial_feature_validation"
context.add_or_update_expectation_suite(suite_name)

# Validator tying the batch to the suite; expectations are declared on it
# in the following cell.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

In [2]:
# Define expectations for the (text, label) dataset on the validator.
feature_columns = ("text", "label")

# Both columns must be present ...
for column_name in feature_columns:
    validator.expect_column_to_exist(column=column_name)

# ... and must not contain nulls.
for column_name in feature_columns:
    validator.expect_column_values_to_not_be_null(column=column_name)

# Column types.
validator.expect_column_values_to_be_of_type(column="text", type_="str")
validator.expect_column_values_to_be_of_type(column="label", type_="int")

# No empty strings allowed.
validator.expect_column_value_lengths_to_be_between(column="text", min_value=1)

# Labels are restricted to the three known classes.
validator.expect_column_values_to_be_in_set(column="label", value_set=[0, 1, 2])

# Ensure the table has rows.
validator.expect_table_row_count_to_be_between(min_value=1)

# Exact column layout, in order.
validator.expect_table_columns_to_match_ordered_list(column_list=["text", "label"])

# Optional: only if each text is expected to be unique.
# Kept as the last statement so its result is rendered as the cell output.
validator.expect_column_values_to_be_unique(column="text")

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 5027,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [3]:
# Save the expectations declared on the validator to its expectation suite.
# discard_failed_expectations=False keeps every declared expectation in the
# saved suite, including any that did not pass against the current batch.
validator.save_expectation_suite(discard_failed_expectations=False)

In [4]:
# Define (or update) the checkpoint that pairs the batch request with the
# expectation suite. This cell only registers it; the checkpoint is run in
# a later cell.
validation_spec = {
    "batch_request": batch_request,
    "expectation_suite_name": suite_name,
}
checkpoint = context.add_or_update_checkpoint(
    name="initial_data_validation_checkpoint",
    validations=[validation_spec],
)

In [5]:
# Run the checkpoint by name and report the outcome of every expectation.
results = context.run_checkpoint(checkpoint_name="initial_data_validation_checkpoint")

# Overall verdict first, then one line per expectation; the raw result
# payload is printed only for expectations that failed.
print("Validation success:", results.success)
for run in results["run_results"].values():
    for outcome in run["validation_result"]["results"]:
        expectation = outcome["expectation_config"]["expectation_type"]
        passed = outcome["success"]
        status = "SUCCESS" if passed else "FAILURE"
        print(f"Expectation {expectation}: {status}")
        if not passed:
            print(f"Details: {outcome['result']}")

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Validation success: True
Expectation expect_column_to_exist: SUCCESS
Expectation expect_column_values_to_not_be_null: SUCCESS
Expectation expect_column_values_to_be_of_type: SUCCESS
Expectation expect_column_value_lengths_to_be_between: SUCCESS
Expectation expect_column_values_to_be_unique: SUCCESS
Expectation expect_column_to_exist: SUCCESS
Expectation expect_column_values_to_not_be_null: SUCCESS
Expectation expect_column_values_to_be_of_type: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_table_row_count_to_be_between: SUCCESS
Expectation expect_table_columns_to_match_ordered_list: SUCCESS


In [3]:
import great_expectations as gx
from great_expectations.data_context import FileDataContext
import pandas as pd
import os

# Re-create the context from the project directory and run the checkpoint
# by name, without relying on objects built in earlier cells.
context = gx.get_context(project_root_dir="../services")
results = context.run_checkpoint(checkpoint_name="initial_data_validation_checkpoint")

# Print detailed validation results: overall verdict, then one line per
# expectation, with the result payload shown only for failures.
print("Validation success:", results.success)
for run in results["run_results"].values():
    expectation_outcomes = run["validation_result"]["results"]
    for outcome in expectation_outcomes:
        expectation = outcome["expectation_config"]["expectation_type"]
        if outcome["success"]:
            print(f"Expectation {expectation}: SUCCESS")
        else:
            print(f"Expectation {expectation}: FAILURE")
            print(f"Details: {outcome['result']}")

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Validation success: True
Expectation expect_column_to_exist: SUCCESS
Expectation expect_column_values_to_not_be_null: SUCCESS
Expectation expect_column_values_to_be_of_type: SUCCESS
Expectation expect_column_value_lengths_to_be_between: SUCCESS
Expectation expect_column_values_to_be_unique: SUCCESS
Expectation expect_column_to_exist: SUCCESS
Expectation expect_column_values_to_not_be_null: SUCCESS
Expectation expect_column_values_to_be_of_type: SUCCESS
Expectation expect_column_values_to_be_in_set: SUCCESS
Expectation expect_table_row_count_to_be_between: SUCCESS
Expectation expect_table_columns_to_match_ordered_list: SUCCESS
