In [7]:
import great_expectations as ge
import pandas as pd

tweets = pd.read_csv('../test_text.txt', sep='\t', header=None, names=['tweet'])
labels = pd.read_csv('../test_labels.txt', sep='\t', header=None, names=['label'])

data = pd.concat([tweets, labels], axis=1)

In [8]:
print(data.describe())

              label
count  12284.000000
mean       0.869993
std        0.706985
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        2.000000


In [9]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12284 entries, 0 to 12283
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   12032 non-null  object
 1   label   12284 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 192.1+ KB
None


In [10]:
import great_expectations as ge

data_ge = ge.dataset.PandasDataset(data)

data_ge.expect_column_values_to_be_in_set("label", [0, 1, 2])
data_ge.expect_column_values_to_not_be_null("tweet")
data_ge.expect_column_values_to_be_of_type("tweet", "object")
data_ge.expect_column_values_to_match_regex("tweet", r".+")
data_ge.expect_column_values_to_not_be_null("label")
data_ge.expect_column_values_to_be_of_type("label", "int64")

data_ge.save_expectation_suite("data_expectations.json")

results = data_ge.validate()
print(results)

if not results["success"]:
    print("Data validation failed.")
else:
    print("Data validation passed.")



{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "column": "label",
          "value_set": [
            0,
            1,
            2
          ],
          "mostly": 0.7,
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "element_count": 12284,
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        

In [11]:
for result in results['results']:
        if not result['success']:
            expectation_type = result['expectation_config']['expectation_type']
           
            if expectation_type == 'expect_column_values_to_not_be_null':
                column = result['expectation_config']['kwargs']['column']
                if data[column].dtype == 'float64' or data[column].dtype == 'int64':
                    data[column].fillna(data[column].mean(), inplace=True)
                else:
                    data[column].fillna('UNKNOWN', inplace=True)
            
            elif expectation_type == 'expect_column_values_to_be_of_type':
                column = result['expectation_config']['kwargs']['column']
                desired_type = result['expectation_config']['kwargs']['type_']
                try:
                    if desired_type == 'int64':
                        data[column] = data[column].astype('int64')
                    elif desired_type == 'float':
                        data[column] = data[column].astype('float')
                except Exception as e:
                    print(f"Error converting {column} to {desired_type}: {e}")
            




In [13]:
revalidation_results = data_ge.validate()
print(revalidation_results)
if not revalidation_results["success"]:
    print("Data validation failed.")
else:
    print("Data validation passed.")

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "column": "label",
          "value_set": [
            0,
            1,
            2
          ],
          "mostly": 0.7,
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "element_count": 12284,
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "