# Great Expectations Data Validation

In [1]:
import pandas as pd
import great_expectations as ge

## 1. Importing data

In [8]:
# Source annotation file and the number of leading records kept for this walkthrough.
data_url = 'https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json'
sample_size = 100

# pd.read_json fetches and parses the remote JSON; only the first rows are retained.
data = pd.read_json(data_url).head(sample_size)
print(data)

                              image  \
0   flickr30k-images/1007129816.jpg   
1   flickr30k-images/1009434119.jpg   
2    flickr30k-images/101362133.jpg   
3    flickr30k-images/102617084.jpg   
4     flickr30k-images/10287332.jpg   
..                              ...   
95    flickr30k-images/18638572.jpg   
96  flickr30k-images/1874530310.jpg   
97  flickr30k-images/1881494074.jpg   
98  flickr30k-images/1916798494.jpg   
99  flickr30k-images/1921102799.jpg   

                                              caption  
0   [The man with pierced ears is wearing glasses ...  
1   [A black and white dog is running in a grassy ...  
2   [A young female student performing a downward ...  
3   [Five snowmobile riders all wearing helmets an...  
4   [Two men sitting on the roof of a house while ...  
..                                                ...  
95  [A young boy wearing a white tunic, white pant...  
96  [Three construction workers are digging a whol...  
97  [A woman takes a pictur

## 2. Convert it to a Pandas DataFrame

In [16]:
# `data` already comes from pd.read_json(...).head(100); this re-wraps it
# under the name `df`, which the later cells use.
df = pd.DataFrame(data=data)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,image,caption
0,flickr30k-images/1007129816.jpg,[The man with pierced ears is wearing glasses ...
1,flickr30k-images/1009434119.jpg,[A black and white dog is running in a grassy ...
2,flickr30k-images/101362133.jpg,[A young female student performing a downward ...
3,flickr30k-images/102617084.jpg,[Five snowmobile riders all wearing helmets an...
4,flickr30k-images/10287332.jpg,[Two men sitting on the roof of a house while ...


## 3. Convert the DataFrame to a Great Expectations dataset

In [19]:
# Print a schema summary (columns, non-null counts, dtypes) before wrapping the frame.
df.info()
# Wrap the frame in a Great Expectations dataset; the expect_* calls in later
# cells are made on this object.
dataset = ge.from_pandas(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   image    100 non-null    object
 1   caption  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [20]:
# Show which class ge.from_pandas returned.
type(dataset)

great_expectations.dataset.pandas_dataset.PandasDataset

## 4. Validate the Structure of Each Record
### Ensure that each record in your dataset contains an "image" field and a "caption" field, and that the "caption" field is a list.

In [25]:
# Both fields must be present in every record.
for required_column in ("image", "caption"):
    dataset.expect_column_to_exist(required_column)

# Each caption entry must be a Python list. This is the cell's final
# expression, so it is the only result the notebook displays.
dataset.expect_column_values_to_be_of_type("caption", "list")

{
  "success": true,
  "result": {
    "element_count": 100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 5. Check Image Filenames
### Verify that the image filenames follow the expected pattern, i.e., they are located in the "flickr30k-images" folder and have a ".jpg" extension.

In [27]:
# Paths must start with the flickr30k-images/ folder and end in .jpg.
image_path_pattern = r'^flickr30k-images\/.+\.jpg$'
dataset.expect_column_values_to_match_regex(column='image', regex=image_path_pattern)

{
  "success": true,
  "result": {
    "element_count": 100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 6. Check for unique images and captions

In [38]:
# No image path may appear more than once.
dataset.expect_column_values_to_be_unique(column="image")

# Same check for the caption column; only this last result is displayed.
# NOTE(review): caption values are lists - confirm the uniqueness check
# handles list-valued cells as intended.
dataset.expect_column_values_to_be_unique(column="caption")

{
  "success": true,
  "result": {
    "element_count": 100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## 7. Validate Captions for Non-Emptiness
### Check that each caption list has at least one caption and doesn't exceed a maximum of 5 captions

In [40]:
def expect_list_of_strings_to_not_be_empty(column_values, min_captions=1, max_captions=5):
    """Check that every entry is a well-formed, non-empty list of captions.

    An entry passes only if it is a list (or tuple) holding between
    ``min_captions`` and ``max_captions`` items, each of which is a string
    with at least one non-whitespace character.

    Args:
        column_values: Iterable of caption lists, one per record.
        min_captions: Minimum number of captions required per record.
        max_captions: Maximum number of captions allowed per record;
            ``None`` disables the upper bound.

    Returns:
        dict: ``{"success": bool}`` - True only when every entry passes.
        An empty ``column_values`` yields True (nothing to violate).
    """
    def _is_valid(captions):
        # Non-list entries (None, NaN, bare strings) are failures, not errors.
        if not isinstance(captions, (list, tuple)):
            return False
        # Explicit length check: all() over an empty list is vacuously True,
        # which previously let records with no captions pass.
        if len(captions) < min_captions:
            return False
        if max_captions is not None and len(captions) > max_captions:
            return False
        return all(isinstance(caption, str) and caption.strip() for caption in captions)

    return {
        "success": all(_is_valid(captions) for captions in column_values)
    }

# Run validation for the defined expectations
results = dataset.validate()
print(results)

# The caption check above is a plain function, not an expectation on the
# dataset, so validate() never runs it. Call it explicitly on the caption
# column so this section actually validates the captions.
caption_check = expect_list_of_strings_to_not_be_empty(dataset["caption"])
print(caption_check)

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_to_exist",
        "kwargs": {
          "column": "image",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_match_regex",
        "kwargs": {
          "column": "image",
          "regex": "^flickr30k-images\\/.+\\.jpg$",
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "element_count": 100,
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent