### Task 1: Understanding and Defining Data Quality Metrics
**Description**: Learn how to define basic data quality metrics such as completeness, validity, and uniqueness for a simple dataset.

**Steps**:
1. Dataset: Use a CSV with columns like Name , Email , Age .
2. Metric Definitions:
    - Completeness: Percentage of non-null values.
    - Validity: % of email fields containing @ .
    - Uniqueness: Count distinct entries in the Email column.

In [1]:
# Write your code from here
import pandas as pd

def calculate_completeness(series):
    return series.count() / len(series) if len(series) > 0 else 0.0

def calculate_validity_email(series):
    valid_count = series.astype(str).str.contains('@').sum()
    return valid_count / len(series) if len(series) > 0 else 0.0

def calculate_uniqueness(series):
    return series.nunique()

data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', None],
        'Email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david.example', 'eve@example.com', ''],
        'Age': [25, 30, 22, None, 28, 31]}
df = pd.DataFrame(data)

completeness_name = calculate_completeness(df['Name'])
completeness_email = calculate_completeness(df['Email'])
completeness_age = calculate_completeness(df['Age'])

validity_email = calculate_validity_email(df['Email'])

uniqueness_email = calculate_uniqueness(df['Email'])

print(f"Completeness - Name: {completeness_name:.2f}")
print(f"Completeness - Email: {completeness_email:.2f}")
print(f"Completeness - Age: {completeness_age:.2f}")
print(f"Validity - Email: {validity_email:.2f}")
print(f"Uniqueness - Email: {uniqueness_email}")

Completeness - Name: 0.83
Completeness - Email: 1.00
Completeness - Age: 0.83
Validity - Email: 0.67
Uniqueness - Email: 6


### Task 2: Calculating Data Quality Score
**Description**: Aggregate multiple metrics to calculate an overall data quality score.

**Steps**:
1. Formula: Simple average of all metrics defined in Task 1.

In [2]:
# Write your code from here
import pandas as pd

def calculate_completeness(series):
    return series.count() / len(series) if len(series) > 0 else 0.0

def calculate_validity_email(series):
    valid_count = series.astype(str).str.contains('@').sum()
    return valid_count / len(series) if len(series) > 0 else 0.0

def calculate_uniqueness(series):
    return series.nunique() / len(series) if len(series) > 0 else 0.0

data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', None],
        'Email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david.example', 'eve@example.com', ''],
        'Age': [25, 30, 22, None, 28, 31]}
df = pd.DataFrame(data)

completeness_name = calculate_completeness(df['Name'])
completeness_email = calculate_completeness(df['Email'])
completeness_age = calculate_completeness(df['Age'])

validity_email = calculate_validity_email(df['Email'])

uniqueness_email = calculate_uniqueness(df['Email'])

data_quality_score = (completeness_name + completeness_email + completeness_age + validity_email + uniqueness_email) / 5

print(f"Completeness - Name: {completeness_name:.2f}")
print(f"Completeness - Email: {completeness_email:.2f}")
print(f"Completeness - Age: {completeness_age:.2f}")
print(f"Validity - Email: {validity_email:.2f}")
print(f"Uniqueness - Email: {uniqueness_email:.2f}")
print(f"Overall Data Quality Score: {data_quality_score:.2f}")

Completeness - Name: 0.83
Completeness - Email: 1.00
Completeness - Age: 0.83
Validity - Email: 0.67
Uniqueness - Email: 1.00
Overall Data Quality Score: 0.87


### Task 3: Creating Expectations for a CSV
**Description**: Develop basic data quality expectations using Great Expectations.

**Steps**:
1. Expectation Suite
2. Define Expectations for Completeness

In [3]:
import great_expectations as ge
from great_expectations.expectations.core import ExpectationConfiguration
from great_expectations.expectations.dataset import Dataset

# Step 1: Initialize the context
context = ge.data_context.DataContext("/path/to/your/great_expectations/directory")  # Replace with the actual path

# Step 2: Create an expectation suite (if it doesn't exist)
suite_name = "data_quality_suite"
suite = context.create_expectation_suite(suite_name, overwrite_existing=True)

# Step 3: Define the expectations
expectation_1 = ExpectationConfiguration(
    expectation_type="expect_column_values_to_not_be_null", 
    kwargs={"column": "customer_id"}
)

expectation_2 = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_set", 
    kwargs={"column": "status", "value_set": ["active", "inactive"]}
)

# Add expectations to the suite
suite.add_expectation(expectation_1)
suite.add_expectation(expectation_2)

# Step 4: Save the expectation suite
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=suite_name)

print(f"Expectation Suite '{suite_name}' has been created and saved.")


ImportError: cannot import name 'ExpectationConfiguration' from 'great_expectations.expectations.core' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/expectations/core/__init__.py)

### Task 4: Running and Validating Expectations
**Description**: Run the created expectations and generate an output report.

**Steps**:
1. Validate
2. Generate HTML Report

In [None]:
# Write your code from here
import great_expectations as ge
from great_expectations.core import ExpectationConfiguration
import os

# Initialize the context
context = ge.get_context()

# Step 1: Define the batch kwargs to load data from a CSV file
batch_kwargs = {
    "datasource": "my_csv_datasource",  # This should be configured in your Great Expectations project
    "data_connector": "default_inferred_data_connector_name",  # Default connector for CSV
    "data_asset_name": "your_file.csv",  # Replace with your actual CSV file name
}

# Step 2: Get a batch of data using the batch kwargs
batch = context.get_batch_list(batch_request=batch_kwargs)[0]

# Step 3: Load the Expectation Suite that was previously created
expectation_suite_name = "my_csv_completeness_suite"
suite = context.get_expectation_suite(expectation_suite_name)

# Step 4: Create a Validator
validator = context.create_validator(
    batch_kwargs=batch_kwargs,
    expectation_suite=suite
)

# Step 5: Run the validation and store results
validation_result = validator.validate()

# Step 6: Generate the HTML Report for validation results
validation_result_html = validation_result.to_json_dict()

# Save the validation results as an HTML file
output_dir = "ge_output_reports"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "validation_report.html")

with open(output_file, "w") as f:
    f.write(validation_result_html['expectations_result']['expectations'][0]['result']['details']['formatted'])

print(f"HTML report saved at: {output_file}")



### Task 5: Automating Data Quality Score Calculation
**Description**: Automate the data quality score via a script that integrates with Great
Expectations.

In [None]:
# Write your code from here
import great_expectations as ge
import pandas as pd

# Step 1: Initialize Great Expectations Context
context = ge.get_context()

# Step 2: Define the Data Source and Batch Kwargs (CSV)
batch_kwargs = {
    "datasource": "my_csv_datasource",  # Replace with the name of your data source
    "data_connector": "default_inferred_data_connector_name",
    "data_asset_name": "your_file.csv",  # Replace with your actual CSV file name
}

# Step 3: Load the Data
df = pd.read_csv("your_file.csv")  # Read your CSV file into a pandas DataFrame
batch = context.get_batch_list(batch_request=batch_kwargs)[0]  # Get the batch of data

# Step 4: Create Expectation Suite (or load existing one)
suite_name = "data_quality_suite"
suite = context.create_expectation_suite(suite_name, overwrite_existing=True)

# Add expectations for data quality:
suite.add_expectation(ge.core.ExpectationConfiguration(
    expectation_type="expect_column_values_to_not_be_null",
    kwargs={"column": "customer_id"}  # Example: Ensure 'customer_id' is not null
))

suite.add_expectation(ge.core.ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_set",
    kwargs={"column": "status", "value_set": ["active", "inactive"]}  # Example: 'status' should be either 'active' or 'inactive'
))

# Add more expectations as necessary
# e.g., Expecting values to be greater than a threshold for the 'age' column
suite.add_expectation(ge.core.ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_greater_than",
    kwargs={"column": "age", "value": 18}  # Expecting 'age' to be greater than 18
))

# Step 5: Save the Expectation Suite
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=suite_name)

# Step 6: Validate the Data Against the Expectations
validator = context.create_validator(
    batch_kwargs=batch_kwargs,
    expectation_suite=suite
)

validation_result = validator.validate()

# Step 7: Calculate the Data Quality Score
def calculate_dqi(validation_result):
    total_expectations = len(validation_result['expectations_result']['expectations'])
    passed_expectations = sum(1 for exp in validation_result['expectations_result']['expectations'] if exp['success'])
    
    if total_expectations > 0:
        dqi = (passed_expectations / total_expectations) * 100
    else:
        dqi = 0  # No expectations means a score of 0
    return dqi

# Calculate the Data Quality Index (DQI)
dqi = calculate_dqi(validation_result)

# Step 8: Output the Data Quality Score
print(f"Data Quality Score (DQI): {dqi:.2f}%")

# Optional: Save the DQI in a log or dashboard
# Log the score or send it to a monitoring dashboard for real-time updates.



### Task 6: Leveraging Data Quality Metrics for Automated Data Cleaning
**Description**: Implement a system where if data quality metrics fall below a threshold,
automated data cleaning scripts are triggered.

**Steps**:
1. Define Cleaning Logic
2. Integrate with Great Expectations:
    - Use an action within the Great Expectations action list that only triggers if quality score is below a threshold, automating the cleaning.

In [None]:
# Write your code from here
import pandas as pd
import great_expectations as ge

# Step 1: Initialize Great Expectations Context
context = ge.get_context()

# Step 2: Load Data from CSV (or any source)
df = pd.read_csv("your_file.csv")  # Replace with your actual CSV file

# Step 3: Define Data Quality Threshold
DQI_THRESHOLD = 85  # The threshold below which cleaning is triggered

# Step 4: Create or Load Expectation Suite
suite_name = "data_quality_suite"
suite = context.create_expectation_suite(suite_name, overwrite_existing=True)

# Add expectations to check for missing values in key columns
suite.add_expectation(ge.core.ExpectationConfiguration(
    expectation_type="expect_column_values_to_not_be_null",
    kwargs={"column": "customer_id"}
))

suite.add_expectation(ge.core.ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_set",
    kwargs={"column": "status", "value_set": ["active", "inactive"]}
))

# Add more expectations (for example: age > 18)
suite.add_expectation(ge.core.ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_greater_than",
    kwargs={"column": "age", "value": 18}
))

# Save Expectation Suite
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=suite_name)

# Step 5: Run Validation on Data
validator = context.create_validator(
    batch_kwargs={"datasource": "my_csv_datasource", "data_connector": "default_inferred_data_connector_name", "data_asset_name": "your_file.csv"},
    expectation_suite=suite
)

validation_result = validator.validate()

# Step 6: Calculate Data Quality Score (DQI)
def calculate_dqi(validation_result):
    total_expectations = len(validation_result['expectations_result']['expectations'])
    passed_expectations = sum(1 for exp in validation_result['expectations_result']['expectations'] if exp['success'])
    
    if total_expectations > 0:
        dqi = (passed_expectations / total_expectations) * 100
    else:
        dqi = 0  # If no expectations, score is 0
    
    return dqi

# Calculate DQI
dqi = calculate_dqi(validation_result)

# Step 7: Trigger Automated Cleaning if DQI is Below Threshold
if dqi < DQI_THRESHOLD:
    print(f"Data Quality Score (DQI): {dqi:.2f}% is below the threshold. Triggering cleaning scripts...")
    # Call the data cleaning function if DQI is below threshold
    df_cleaned = clean_data(df)  # Function to clean the data
    df_cleaned.to_csv("your_file_cleaned.csv", index=False)  # Save the cleaned data
else:
    print(f"Data Quality Score (DQI): {dqi:.2f}% is above the threshold. No cleaning needed.")

# Step 8: Define Cleaning Logic
def clean_data(df):
    """Define your data cleaning steps here."""
    
    # Example: Handle missing values
    df['customer_id'].fillna('Unknown', inplace=True)  # Fill missing customer IDs
    df['status'].fillna('inactive', inplace=True)  # Fill missing status
    
    # Example: Remove duplicates
    df.drop_duplicates(inplace=True)
    
    # Example: Handle outliers in age (e.g., cap ages above 100)
    df['age'] = df['age'].apply(lambda x: 100 if x > 100 else x)
    
    # Add any additional cleaning logic as needed
    return df

