### Task 1: Validate Data with a Custom Expectation in Great Expectations
**Description**: Create a custom expectation and validate data with Great Expectations.

**Load a sample DataFrame**

data = {
'age': [25, 30, 35, 40, 45],
'income': [50000, 60000, 75000, None, 100000]
}

In [1]:
import pandas as pd
import great_expectations as ge

# Step 1: Sample data
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000]
}
df = pd.DataFrame(data)

# Step 2: Create an in-memory context (EphemeralContext)
context = ge.data_context.DataContext.create_ephemeral_context()

# Step 3: Create a new expectation suite
suite_name = "custom_suite"
suite = context.create_expectation_suite(suite_name=suite_name, overwrite_existing=True)

# Step 4: Create a Validator for the DataFrame
validator = context.create_validator(
    batch_kwargs={"batch_data": df},
    expectation_suite=suite
)

# Step 5: Add Expectations
# Expect the 'age' column to not have any null values
validator.expect_column_values_to_not_be_null("age")

# Expect the 'income' column to not have any null values
validator.expect_column_values_to_not_be_null("income")

# Step 6: Validate and get the results
results = validator.validate()

# Step 7: Print the validation results
if results["success"]:
    print("✅ Validation Passed")
else:
    print("❌ Validation Failed")

# Optionally, print details of the validation results
for result in results["results"]:
    print(f" - Expectation: {result['expectation_config']['expectation_type']}")
    print(f"   Column: {result['expectation_config']['kwargs']['column']}")
    print(f"   Success: {result['success']}")


AttributeError: module 'great_expectations.data_context' has no attribute 'DataContext'

### Task 2: Implement a Basic Alert System for Data Quality Drops
**Description**: Set up a basic alert system that triggers when data quality drops.

In [None]:
# Write your code from here
import pandas as pd

# Sample data
data = {
    'name': ['Alice', 'Bob', None, 'David', 'Eva'],
    'age': [25, None, 35, 40, None],
    'income': [50000, 60000, 75000, None, 100000]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Function to calculate DQI (valid entries / total entries) for each column
def calculate_dqi(df):
    dqi_scores = {}
    for col in df.columns:
        total = len(df[col])
        valid = df[col].notnull().sum()
        dqi = valid / total if total > 0 else 0
        dqi_scores[col] = round(dqi, 2)
    return dqi_scores

# Set threshold for alert (e.g., 0.8 means 80% data completeness required)
DQI_THRESHOLD = 0.8

# Calculate DQI for each column
dqi_scores = calculate_dqi(df)

# Check for alerts
alerts = []
for col, dqi in dqi_scores.items():
    if dqi < DQI_THRESHOLD:
        alerts.append(f"⚠️ Alert: Data Quality for '{col}' dropped to {dqi*100:.0f}%")

# Print results
print("📊 Data Quality Index (DQI) Scores:")
for col, score in dqi_scores.items():
    print(f"  - {col}: {score*100:.0f}%")

# Print alerts if any
if alerts:
    print("\n🚨 Alerts:")
    for alert in alerts:
        print(alert)
else:
    print("\n✅ All columns meet the data quality threshold.")


### Task 3: Real-time Data Quality Monitoring with Python and Great Expectations
**Description**: Implement a system that monitors data quality in real-time.

In [2]:
# Write your code from here
import pandas as pd
import time
import great_expectations as gx
from great_expectations.validator.validator import Validator
from great_expectations.expectations.core import ExpectColumnValuesToNotBeNull

# Simulated data batches (as if coming from a stream)
data_batches = [
    pd.DataFrame({'id': [1, 2, 3], 'age': [25, 30, 35]}),
    pd.DataFrame({'id': [4, 5, 6], 'age': [None, 28, None]}),
    pd.DataFrame({'id': [7, 8, 9], 'age': [40, 45, 50]}),
]

# Initialize Great Expectations Data Context in-memory
context = gx.get_context(mode="ephemeral")

# Create an expectation suite
suite_name = "real_time_data_quality_suite"
context.add_expectation_suite(expectation_suite_name=suite_name)

# Add expectation (e.g., age column should not have null values)
context.save_expectation_suite(
    expectation_suite=context.get_expectation_suite(suite_name)
)

# Real-time validation loop
for i, batch in enumerate(data_batches):
    print(f"\n📥 Processing batch {i + 1}")
    
    validator: Validator = Validator(df=batch, expectation_suite=context.get_expectation_suite(suite_name))

    # Apply expectation
    validator.expect_column_values_to_not_be_null("age")
    
    # Run validation
    results = validator.validate()
    
    # Display summary
    success = results["success"]
    if success:
        print("✅ Data Quality Check Passed")
    else:
        print("❌ Data Quality Issue Detected")
        failed_expectations = results["results"]
        for exp in failed_expectations:
            if not exp["success"]:
                print(f"  - Failed: {exp['expectation_config']['expectation_type']}")


AttributeError: 'EphemeralDataContext' object has no attribute 'add_expectation_suite'