```
=================================================

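This program is designed to automate the transformation and loading of data from a CSV file into PostgreSQL, and subsequently into Elasticsearch.  
The dataset used contains information related to a marketing campaign.  
This notebook itself performs the data-validation step: it checks the cleaned dataset (`data_clean.csv`) against a Great Expectations suite.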

The primary goal of this program is to establish a clean and structured data foundation, enabling further analysis using tools such as Kibana for data visualization.  
Ultimately, this supports better decision-making within the context of marketing strategies.
=================================================
```

In [None]:
import pandas as pd
import great_expectations as ge
from great_expectations.data_context import FileDataContext
from uuid import uuid4

# Initialise a file-backed Great Expectations project rooted at the current
# directory so suites, checkpoints and Data Docs are persisted on disk.
context = FileDataContext.create(project_root_dir='./')

# Register the pandas datasource idempotently. A plain add_pandas() fails
# when a datasource named "customer_data" is already stored in the project
# (for example when this cell is re-run); add_or_update_pandas() overwrites
# the stored definition instead, in line with the add_or_update_* calls used
# below for the expectation suite and the checkpoint.
datasource = context.sources.add_or_update_pandas(name="customer_data")

# Expose the cleaned CSV file as a data asset of that datasource.
asset_name = "data_clean"
data_path = "data_clean.csv"
asset = datasource.add_csv_asset(name=asset_name, filepath_or_buffer=data_path)

# A batch request with no extra options selects the asset as a whole.
batch_request = asset.build_batch_request()

# Create the expectation suite, or overwrite it if it already exists.
expectation_suite_name = "customer_segmentation_suite"
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# The validator binds the batch to the suite; every expect_* call made on it
# in the following cells is evaluated against the data and recorded in the
# suite.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name
)



In [2]:
# Expectation 1: every value in the "id" column is unique.
# Duplicate customer rows would distort any per-customer segmentation.
validator.expect_column_values_to_be_unique(
    column="id",
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 2240,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [3]:
# Expectation 2: "recency" values lie within the closed range [0, 100].
# Guards against out-of-range customer-activity recency values.
recency_bounds = {"min_value": 0, "max_value": 100}
validator.expect_column_values_to_be_between(column="recency", **recency_bounds)



Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 2240,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [4]:
# Expectation 3: "education" only contains the known category labels.
# Keeps education levels consistent for demographic breakdowns.
allowed_education_levels = ["Basic", "Graduation", "2n Cycle", "Master", "PhD"]
validator.expect_column_values_to_be_in_set(column="education", value_set=allowed_education_levels)



Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 2240,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [5]:
# Expectation 4: the "income" column has dtype float64.
# Income has to be numeric so financial aggregations work on it.
validator.expect_column_values_to_be_in_type_list(
    column="income",
    type_list=["float64"],
)



Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# Expectation 5: "year_birth" contains no null values.
# A birth year is needed on every row for age-based segmentation.
validator.expect_column_values_to_not_be_null(
    column="year_birth",
)



Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 2240,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 6: the mean of "mnt_wines" falls between 100 and 1000.
# Sanity check on the average amount spent on wine.
wine_mean_bounds = {"min_value": 100, "max_value": 1000}
validator.expect_column_mean_to_be_between(column="mnt_wines", **wine_mean_bounds)



Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 303.9357142857143
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 7: the median of "income" falls between 20000 and 100000.
# Checks that the income distribution is plausible for customer profiling.
validator.expect_column_median_to_be_between(
    column="income",
    min_value=20000,
    max_value=100000,
)



Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 51741.5
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Persist the suite built above; expectations that failed are kept rather
# than dropped (discard_failed_expectations=False).
validator.save_expectation_suite(discard_failed_expectations=False)

# Register the checkpoint (or overwrite an existing one of the same name),
# bound to the validator configured earlier.
checkpoint_name = "customer_data_checkpoint"
checkpoint = context.add_or_update_checkpoint(name=checkpoint_name, validator=validator)

# Execute the checkpoint and keep the result object for inspection.
checkpoint_result = checkpoint.run()

# Render the Data Docs site from the stored results, then open it.
context.build_data_docs()
context.open_data_docs()

Calculating Metrics:   0%|          | 0/31 [00:00<?, ?it/s]