In [4]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.

import great_expectations as ge
from great_expectations.core.batch import Batch

# Initialize a new Great Expectations context
context = ge.data_context.DataContext()

# Create a basic DataFrame using pandas (assuming you already have pandas installed)
import pandas as pd

# Example dataset
data = {
    'age': [25, 30, 35, 40, None, 50],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
    'salary': [50000, 55000, 60000, 65000, 70000, None]
}

df = pd.DataFrame(data)

# Convert pandas DataFrame into a Great Expectations dataset
df_ge = ge.from_pandas(df)

# Create an expectation suite for this dataset
suite = context.create_expectation_suite(
    "my_suite", overwrite_existing=True
)

# Add expectations to the suite
df_ge.expect_column_values_to_be_in_set('name', ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'])
df_ge.expect_column_values_to_be_in_range('age', 18, 100)
df_ge.expect_column_values_to_not_be_null('salary')

# Validate the data
validation_results = df_ge.validate()

# Print out validation results
print(validation_results)





# 20. Testing for Expectation:
# - Create expectations such as “column values must fall within a certain range.”
# Check that the 'age' column values are between 18 and 100
df_ge.expect_column_values_to_be_in_range('age', 18, 100)

# Check that 'salary' is a positive value
df_ge.expect_column_values_to_be_in_range('salary', 0, None)

# Validate the data
validation_results = df_ge.validate()

# Output results
print(validation_results)






# 21. Generating Data Docs:
# - Automatically generate data quality documentation.
context.build_data_docs()

# Serve the docs using the built-in server
context.open_data_docs()








AttributeError: module 'great_expectations.data_context' has no attribute 'DataContext'

In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.

import pandas as pd

# Sample data
data = {
    'id': [1, 2, 3, 4, 5],
    'email': ['test@example.com', 'invalid_email', 'user@domain.org', 'hello@world.net', None],
    'price': [10.5, None, 15.5, 12.0, 20.0],
}

df = pd.DataFrame(data)

# Profiling: Check for missing values, data types, and unique values
profiling_report = {
    'missing_values': df.isnull().sum(),
    'data_types': df.dtypes,
    'unique_values': df.nunique()
}

print(profiling_report)







# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.

# Task 1: Check for missing values
missing_values = df.isnull().sum()

# Task 2: Check for duplicates
duplicates = df[df.duplicated()]

# Task 3: Identify outliers in the 'price' column (simple method using IQR)
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR))]

print(f"Missing values:\n{missing_values}")
print(f"Duplicate records:\n{duplicates}")
print(f"Outliers in 'price' column:\n{outliers}")






# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.
import re

# Rule 1: Check for valid email format using regex
def is_valid_email(email):
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w{2,}$'
    return bool(re.match(pattern, email))

# Apply rule to the 'email' column
df['email_valid'] = df['email'].apply(is_valid_email)

# Rule 2: Check for duplicates in the 'id' column
duplicates_by_id = df[df.duplicated(subset='id')]

# Display the results
print(f"Valid emails:\n{df[['email', 'email_valid']]}")
print(f"Duplicate rows based on 'id':\n{duplicates_by_id}")








