In [37]:
import great_expectations as ge
import pandas as pd


def clean_data(data, results):
    for result in results['results']:
        if not result['success']:
            expectation_type = result['expectation_config']['expectation_type']
            column = result['expectation_config']['kwargs']['column']

            if expectation_type == 'expect_column_values_to_not_be_null':
                if data[column].dtype == 'float64' or data[column].dtype == 'int64':
                    data[column].fillna(data[column].mean(), inplace=True)
                else:
                    data[column].fillna('UNKNOWN', inplace=True)
            
            elif expectation_type == 'expect_column_values_to_be_of_type':
                desired_type = result['expectation_config']['kwargs']['type_']
                try:
                    if desired_type == 'int64':
                        data[column] = data[column].astype('int64')
                    elif desired_type == 'float':
                        data[column] = data[column].astype('float')
                except Exception as e:
                    print(f"Error converting {column} to {desired_type}: {e}")

In [38]:
import great_expectations as ge
    
def validate_and_clean_tweets(tweets):
    tweets_ge = ge.dataset.PandasDataset(tweets)
    
    # Define expectations for tweets
    tweets_ge.expect_column_values_to_not_be_null("tweet")
    tweets_ge.expect_column_values_to_be_of_type("tweet", "object")
    tweets_ge.expect_column_values_to_match_regex("tweet", r".+")
    
    # Initial validation
    results = tweets_ge.validate()

    # Always define revalidation_results before using it
    revalidation_results = results  # Assuming initial results as default

    if not results['success']:
        clean_data(tweets, results)  # Assuming clean_data function modifies 'tweets' in-place
        tweets_ge = ge.dataset.PandasDataset(tweets)  # Reload dataset for revalidation
        revalidation_results = tweets_ge.validate()  # Revalidate after cleaning

    print(revalidation_results)
    return tweets




In [39]:
def validate_and_clean_labels(labels):
    labels_ge = ge.dataset.PandasDataset(labels)
    
    # Define expectations for labels
    labels_ge.expect_column_values_to_be_in_set("label", [0, 1, 2])
    labels_ge.expect_column_values_to_not_be_null("label")
    labels_ge.expect_column_values_to_be_of_type("label", "int64")
    
    # Initial validation
    results = labels_ge.validate()
    if not results['success']:
        clean_data(labels, results)
        labels_ge = ge.dataset.PandasDataset(labels)  
        revalidation_results = labels_ge.validate()
        
    print(revalidation_results)
    return labels


In [41]:
tweets = pd.read_csv('../test_text.txt', sep='\t', header=None, names=['tweet'])
labels = pd.read_csv('../test_labels.txt', sep='\t', header=None, names=['label'])

validated_tweets = validate_and_clean_tweets(tweets)
validated_labels = validate_and_clean_labels(labels)

validated_tweets.to_csv('validated_tweets.txt', sep='\t', header=True, index=False)
validated_labels.to_csv('validated_labels.txt', sep='\t', header=True, index=False)

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "tweet",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "element_count": 12032,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "column": "tweet",
          "type_": "object",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "observed_value": "object_"
      },
      

UnboundLocalError: local variable 'revalidation_results' referenced before assignment