In [1]:
import great_expectations as ge
import pandas as pd

# Load datasets
tweets = pd.read_csv('../test_text.txt', sep='\t', header=None, names=['tweet'])
labels = pd.read_csv('../test_labels.txt', sep='\t', header=None, names=['label'])

# Combine into a single DataFrame
data = pd.concat([tweets, labels], axis=1)

In [2]:
print(data.describe())

              label
count  12284.000000
mean       0.869993
std        0.706985
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        2.000000


In [3]:
# Display information about data types and missing values
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12284 entries, 0 to 12283
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   12032 non-null  object
 1   label   12284 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 192.1+ KB
None


In [21]:
import great_expectations as ge

# Convert to a Great Expectations dataset
data_ge = ge.dataset.PandasDataset(data)

# Setting up expectations
data_ge.expect_column_values_to_be_in_set("label", [0, 1, 2], mostly=0.7) 
data_ge.expect_column_values_to_not_be_null("tweet")
data_ge.expect_column_values_to_be_of_type("tweet", "object")  
data_ge.expect_column_values_to_match_regex("tweet", r".+",mostly=0.9)  
data_ge.expect_column_values_to_not_be_null("label")
data_ge.expect_column_values_to_be_of_type("label", "int64") 

# Saving the expectation suite
data_ge.save_expectation_suite("data_expectations.json")

# Validate the dataset
results = data_ge.validate()
print(results)

# Handling validation results
if not results["success"]:
    print("Data validation failed.")
else:
    print("Data validation passed.")



{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "column": "label",
          "value_set": [
            0,
            1,
            2
          ],
          "mostly": 0.7,
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "element_count": 12284,
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "