## Using AI for Anomalies Detection in Data Quality
**Description**: Implement an AI-based approach to detect anomalies in data quality.

**Steps**:
1. Use an Anomaly Detection Algorithm:
    - Use sklearn's Isolation Forest for anomaly detection.

**Example data:**

data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

2. Integrate with Great Expectations:
    - Generate alerts if anomalies are detected:

In [1]:
# Write your code from here
import numpy as np
from sklearn.ensemble import IsolationForest
import great_expectations as gx
from great_expectations.core.expectation_configuration import ExpectationConfiguration

# Step 1: Use an Anomaly Detection Algorithm (Isolation Forest)

def detect_data_quality_anomalies(data):
    """
    Detects anomalies in the given data using Isolation Forest.

    Args:
        data (np.array): A 2D numpy array where each row represents a data point
                         and columns represent features. Missing values (None)
                         should be handled appropriately before passing.

    Returns:
        np.array: An array of predictions where 1 indicates inlier and -1 indicates outlier.
                  Returns None if the input data is empty or has insufficient samples.
    """
    if data is None or len(data) < 2:  # Isolation Forest needs at least 2 samples
        print("Warning: Insufficient data to perform anomaly detection.")
        return None

    # Handle None values by replacing them with NaN, Isolation Forest handles NaN
    processed_data = np.array([[val if val is not None else np.nan for val in row] for row in data], dtype=float)

    # Remove rows with NaN in all features (otherwise Isolation Forest might fail)
    valid_rows_mask = ~np.all(np.isnan(processed_data), axis=1)
    valid_data = processed_data[valid_rows_mask]
    original_indices = np.where(valid_rows_mask)[0]

    if len(valid_data) < 2:
        print("Warning: Insufficient valid data points for anomaly detection after handling missing values.")
        return None

    model = IsolationForest(random_state=42)
    model.fit(valid_data)
    predictions = model.predict(valid_data)

    # Reconstruct predictions array to match the original data size, marking rows with all NaNs as inliers (no anomaly)
    full_predictions = np.ones(len(data), dtype=int)
    full_predictions[original_indices] = predictions

    return full_predictions

# Example data
data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

# Detect anomalies
anomaly_predictions = detect_data_quality_anomalies(data)

if anomaly_predictions is not None:
    print("Anomaly Predictions (1: inlier, -1: outlier):")
    for i, prediction in enumerate(anomaly_predictions):
        print(f"Data Point {i+1}: {data[i]}, Prediction: {prediction}")

    # Step 2: Integrate with Great Expectations

    # Create a Great Expectations Data Context (replace with your actual context setup)
    context = gx.DataContext()

    # Assume your data is in a Pandas DataFrame for easier integration with GE
    import pandas as pd
    df = pd.DataFrame(data, columns=['age', 'salary'])

    # Create a Great Expectations Datasource and DataConnector (if you haven't already)
    # This example assumes an in-memory Pandas DataFrame
    datasource_name = "my_pandas_datasource"
    if datasource_name not in context.list_datasources()["name"]:
        context.add_pandas(name=datasource_name, batch_kwargs_list=[{"df": df}])

    batch_kwargs = {"datasource": datasource_name, "pandas_kwargs": {}}
    batch = context.get_batch(batch_kwargs=batch_kwargs)

    expectation_suite_name = "anomaly_detection_suite"
    suite = context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name, overwrite_existing=True
    )

    if anomaly_predictions is not None:
        anomalous_indices = np.where(anomaly_predictions == -1)[0]

        if len(anomalous_indices) > 0:
            anomalous_data = df.iloc[anomalous_indices].to_dict(orient='records')
            print("\nAnomalies Detected by Isolation Forest:")
            for anomaly in anomalous_data:
                print(anomaly)

            # Generate a Great Expectations Expectation to check for anomalies
            suite.add_expectation(
                ExpectationConfiguration(
                    expectation_type="expect_column_values_to_not_be_in_set",
                    kwargs={
                        "column": "index",  # We'll use the index to flag anomalous rows
                        "value_set": list(anomalous_indices),
                        "mostly": 1.0,  # All these indices should be anomalous
                    },
                    meta={
                        "notes": {
                            "format": "markdown",
                            "content": "## Anomaly Detection Alert\n\n"
                            "The following data points were identified as anomalies by the Isolation Forest algorithm.",
                        }
                    }
                )
            )

            # Save the Expectation Suite
            context.save_expectation_suite(expectation_suite=suite)

            # Create a Checkpoint to run the Expectation Suite
            checkpoint_name = "anomaly_detection_checkpoint"
            checkpoint_config = {
                "name": checkpoint_name,
                "config_version": 1.0,
                "class_name": "SimpleCheckpoint",
                "run_name_template": "%Y%m%d-%H%M%S-anomaly-detection",
                "expectation_suite_name": expectation_suite_name,
                "batch_request": {
                    "datasource_name": datasource_name,
                    "batch_kwargs": batch_kwargs,
                },
                "action_list": [
                    {
                        "name": "store_validation_result",
                        "action": {"class_name": "StoreValidationResultAction"},
                    },
                    {
                        "name": "store_evaluation_params",
                        "action": {"class_name": "StoreEvaluationParametersAction"},
                    },
                    {
                        "name": "send_slack_notification_on_validation_result",
                        "action": {
                            "class_name": "SlackNotificationAction",
                            "slack_webhook": "YOUR_SLACK_WEBHOOK_URL",  # Replace with your Slack webhook URL
                            "only_on": "failure",  # Only send alert if anomalies are found (validation fails)
                            "message": "Data Quality Anomaly Detection Alert!",
                            "notify_on": "failure",
                            "renderer": {
                                "class_name": "ValidationResultsTableRenderer",
                                "styling": {
                                    "default": {"styles": {"font-size": "0.8em"}},
                                    "header": {"styles": {"font-weight": "bold"}},
                                    "cell": {"styles": {"padding": "5px"}},
                                },
                            },
                        },
                    },
                    # Add other actions like storing to database, etc.
                ],
            }
            context.add_checkpoint(**checkpoint_config)

            # Run the Checkpoint
            results = context.run_checkpoint(checkpoint_name=checkpoint_name)

            if not results["success"]:
                print("\nData Quality Anomaly Alert triggered!")
            else:
                print("\nNo data quality anomalies detected by Isolation Forest according to the Great Expectations Checkpoint.")

        else:
            print("\nNo anomalies detected by Isolation Forest.")

else:
    print("Anomaly detection could not be performed due to insufficient data.")

ModuleNotFoundError: No module named 'great_expectations.core.expectation_configuration'