In [8]:
pip install great_expectations

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import great_expectations as gx
import pandas as pd

In [6]:
# Load the January 2019 yellow-taxi trip sample hosted in the GX tutorials repository.
trips_csv_url = "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv"
df = pd.read_csv(trips_csv_url)
# Leave the frame as the cell's last expression so the notebook renders it.
df

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-15 03:36:12,2019-01-15 03:42:19,1,1.00,1,N,230,48,1,6.5,0.5,0.5,1.95,0.0,0.3,9.75,
1,1,2019-01-25 18:20:32,2019-01-25 18:26:55,1,0.80,1,N,112,112,1,6.0,1.0,0.5,1.55,0.0,0.3,9.35,0.0
2,1,2019-01-05 06:47:31,2019-01-05 06:52:19,1,1.10,1,N,107,4,2,6.0,0.0,0.5,0.00,0.0,0.3,6.80,
3,1,2019-01-09 15:08:02,2019-01-09 15:20:17,1,2.50,1,N,143,158,1,11.0,0.0,0.5,3.00,0.0,0.3,14.80,
4,1,2019-01-25 18:49:51,2019-01-25 18:56:44,1,0.80,1,N,246,90,1,6.5,1.0,0.5,1.65,0.0,0.3,9.95,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,2019-01-02 07:48:44,2019-01-02 08:00:13,6,1.07,1,N,50,161,2,8.5,0.0,0.5,0.00,0.0,0.3,9.30,
9996,2,2019-01-16 19:06:45,2019-01-16 19:10:05,6,0.35,1,N,234,234,1,4.0,1.0,0.5,1.16,0.0,0.3,6.96,
9997,2,2019-01-02 09:10:44,2019-01-02 09:36:46,6,4.12,1,N,50,236,1,20.0,0.0,0.5,6.24,0.0,0.3,27.04,
9998,2,2019-01-03 13:28:36,2019-01-03 13:36:42,6,1.17,1,N,137,234,1,7.0,0.0,0.5,0.90,0.0,0.3,8.70,


# Using Great Expectations

In [5]:
## Example Setup
# Minimal end-to-end flow: context -> data source -> asset -> batch -> one expectation -> validate.
# Relies on `df` loaded by the earlier read_csv cell.

# Obtain a Great Expectations context; every object below is registered through it.
context = gx.get_context()
# Register a pandas data source named "pandas" and, on it, a dataframe asset.
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

# Define a batch covering the whole dataframe, then bind the in-memory `df` to it.
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

# Expectation under test: passenger_count values lie between 1 and 6.
# NOTE(review): bounds are presumably inclusive (library default) — confirm against the GX docs.
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="passenger_count", min_value=1, max_value=6
)

# Validate the batch against the single expectation; the result is stored but not displayed here.
validation_result = batch.validate(expectation)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

# **To-do:**

1.   Fetch any dataset from an online source. I recommend using the Reddit API used in assignment #1.
2.   Run 5 expectations of your choice to validate the dataset. They should cover row, column, multi-column, table, and distribution operations.
3.   Submit the notebook file on LMS before Monday 11:55 PM.
4.   Mention your group number in the name of the file.



In [None]:
import great_expectations as gx
import pandas as pd

# Fetch dataset: the yellow-taxi trip sample from the GX tutorials repository.
# Both timestamp columns are parsed up front so the pickup/dropoff comparison
# below is a real datetime comparison rather than a string comparison.
df = pd.read_csv(
    "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
)

# Setup Great Expectations using the GX 1.x API (same flow as the example cell above).
context = gx.get_context()

# Wrap the dataframe in a batch. Names are distinct from the example cell's so
# its variables are not overwritten.
trips_source = context.data_sources.add_pandas("taxi_pandas")
trips_asset = trips_source.add_dataframe_asset(name="taxi trips dataframe")
trips_batch_definition = trips_asset.add_batch_definition_whole_dataframe(
    "taxi trips whole dataframe"
)
trips_batch = trips_batch_definition.get_batch(batch_parameters={"dataframe": df})

# Create the expectation suite and register it with the context. In GX 1.x the
# context stores a registered suite, so no separate save call is needed.
suite_name = "my_expectation_suite"
suite = context.suites.add(gx.ExpectationSuite(name=suite_name))

# Define Expectations

# Row expectation: at least 10,000 rows (min_value is the lower bound; no upper bound).
suite.add_expectation(
    gx.expectations.ExpectTableRowCountToBeBetween(min_value=10000)
)

# Column-level expectation: passenger count range.
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="passenger_count", min_value=1, max_value=6
    )
)

# Multi-column expectation: pickup must not be later than dropoff.
# or_equal=True tolerates zero-duration trips.
suite.add_expectation(
    gx.expectations.ExpectColumnPairValuesAToBeGreaterThanB(
        column_A="dropoff_datetime", column_B="pickup_datetime", or_equal=True
    )
)

# Table expectation: the column set matches the expected columns.
# NOTE(review): the expected set is derived from df itself, so this check cannot
# fail on this frame; replace with a fixed schema list once one is agreed.
suite.add_expectation(
    gx.expectations.ExpectTableColumnsToMatchSet(column_set=list(df.columns))
)

# Distribution-related expectation: trip distance range.
# NOTE(review): this is a per-value range check standing in for a distribution
# check; consider a mean/median/quantile expectation once bounds are agreed.
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="trip_distance", min_value=0.1, max_value=50
    )
)

# Validate the data against the whole suite.
validation_result = trips_batch.validate(suite)

# Print validation results
print(validation_result)
