# **I. INTRODUCTION**

- Batch : RMT-050

- Objective : This notebook validates the cleaned dataset with `Great Expectations` after the data cleaning step.

# **II. IMPORT LIBRARIES**

In [1]:
from great_expectations.data_context import FileDataContext

    You are using a Python version 3.9 past its end of life. Google will update
    google-auth with critical bug fixes on a best-effort basis, but not
    with any other fixes or features. Please upgrade your Python version,
    and then update google-auth.
    
    You are using a Python version 3.9 past its end of life. Google will update
    google-auth with critical bug fixes on a best-effort basis, but not
    with any other fixes or features. Please upgrade your Python version,
    and then update google-auth.
    


# **III. CONNECT TO DATASOURCE**

In [2]:
# Initialise a file-backed Great Expectations context rooted at the current directory
project_root_dir = './'
context = FileDataContext.create(project_root_dir=project_root_dir)

In [3]:
# Register the pandas Datasource. The name must be unique between Datasources,
# so `add_or_update_pandas` is used instead of `add_pandas`: the context is
# file-backed, and re-running this notebook would otherwise fail because a
# Datasource called 'clean-data' was already persisted by the previous run.
datasource_name = 'clean-data'
datasource = context.sources.add_or_update_pandas(name=datasource_name)

# Register the cleaned CSV as a data asset.
# The path is relative instead of an absolute path on one machine, so the
# notebook runs on any checkout of the project.
# NOTE(review): assumes the notebook is executed from the `GX/` folder that sits
# next to `Data_Scraped/` (as implied by the data-docs location) — adjust if not.
asset_name = "samsung-data"
path_to_data = "../Data_Scraped/clean.csv"
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator will consume
batch_request = asset.build_batch_request()

# **IV. EXPECTATION SUITE**

In [4]:
# Create (or overwrite) the expectation suite that will collect the rules below
expectation_suite_name = 'expectation-clean-samsung'
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Bind the batch request to that suite through a validator
validator_kwargs = {
    "batch_request": batch_request,
    "expectation_suite_name": expectation_suite_name,
}
validator = context.get_validator(**validator_kwargs)

# Preview the first rows to confirm the validator loaded the data
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,title,price,battery,description,reviews,ratings,url,img_url,brand,...,cpu_speed,storage,screen_size,resolution,refresh_rate,model_name,weight_gr,rating_distribution,overall_rating,language
0,0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...",I’ve used Samsung’s Ultra and Note series for ...,5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
1,0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...",This phone is worth every penny! I am upgradin...,5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
2,0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...",I've been a lifelong Samsung user. I've had th...,5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
3,0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...","So far, the item is great and very practical. ...",5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en
4,0,"Samsung Galaxy Z Fold7 Cell Phone, 512GB AI Sm...",2119.99,4400,"Expand what’s possible with Galaxy Z Fold7, th...",Switched from iOS to android with this purchas...,5.0,https://www.amazon.com/Samsung-Smartphone-Unlo...,https://m.media-amazon.com/images/I/41yEZrL-vj...,Samsung,...,4.7,512,8.0,1968 x 2184,120,Galaxy Z Fold7,215.46,80%,4.5,en


## Expectations

In [5]:
# Expectation 1 : Column `Unnamed: 0` acts as a "product id" and must contain
# exactly the 20 distinct values 0..19.
# `expect_column_distinct_values_to_equal_set` is used because the `..._to_be_in_set`
# variant only checks that the observed values are a subset of the set, so it
# would still pass when some product ids are missing from the data.
expected_product_ids = list(range(20))

validator.expect_column_distinct_values_to_equal_set("Unnamed: 0", value_set=expected_product_ids)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      0,
      1,
      2,
      3,
      4,
      5,
      6,
      7,
      8,
      9,
      10,
      11,
      12,
      13,
      14,
      15,
      16,
      17,
      18,
      19
    ],
    "details": {
      "value_counts": [
        {
          "value": 0,
          "count": 49
        },
        {
          "value": 1,
          "count": 26
        },
        {
          "value": 2,
          "count": 48
        },
        {
          "value": 3,
          "count": 28
        },
        {
          "value": 4,
          "count": 13
        },
        {
          "value": 5,
          "count": 33
        },
        {
          "value": 6,
          "count": 39
        },
        {
          "value": 7,
          "count": 38
        },
        {
          "value": 8,
          "count": 49
        },
        {
          "value": 9,
          "count": 45
        },
        {
          "value": 10,
          "count": 32

In [6]:
# Expectation 2 : every row must have a value in `reviews` (no nulls allowed)

validator.expect_column_values_to_not_be_null(column="reviews")

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 817,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 3 : every row must have a value in `ratings` (no nulls allowed)

validator.expect_column_values_to_not_be_null(column="ratings")

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 817,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 4 : every `battery` value must lie in the 0 - 10,000 mAh range

battery_range = {"min_value": 0, "max_value": 10000}
validator.expect_column_values_to_be_between(column='battery', **battery_range)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 817,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Expectation 5 : Column `ratings` must hold numerical values.
# The accepted types are spelled as NumPy/pandas dtype names ('int', 'int64',
# 'float', 'float64'), matching the observed dtype reported for this column
# ("float64").
# NOTE(review): the previous entry 'integer' is not a NumPy dtype string, so it
# presumably never matched an integer-typed column — confirm against the
# Great Expectations version in use.
numeric_types = ['int', 'int64', 'float', 'float64']

validator.expect_column_values_to_be_in_type_list('ratings', numeric_types)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation 6 : the largest value in `ram` must fall between 0 and 20 (GB)
ram_max_bounds = {"min_value": 0, "max_value": 20}
validator.expect_column_max_to_be_between(column='ram', **ram_max_bounds)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 12
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# Expectation 7 : `language` may only contain English ("en") rows, giving a clean input for data modeling

allowed_languages = ["en"]
validator.expect_column_distinct_values_to_be_in_set(column="language", value_set=allowed_languages)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "en"
    ],
    "details": {
      "value_counts": [
        {
          "value": "en",
          "count": 817
        }
      ]
    }
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# Persist every expectation defined on the validator into the suite,
# keeping expectations that failed as well

discard_failed_expectations = False
validator.save_expectation_suite(discard_failed_expectations=discard_failed_expectations)

## Checkpoint

In [13]:
# Register (or overwrite) a checkpoint that is bound to the validator

checkpoint_name = 'checkpoint'
checkpoint = context.add_or_update_checkpoint(name=checkpoint_name, validator=validator)

In [14]:
# Run the checkpoint and keep its result object

checkpoint_result = checkpoint.run()

# Display the overall pass/fail flag so a failed validation is visible in the
# notebook instead of being silently stored in an unused variable.
checkpoint_result.success

Calculating Metrics:   0%|          | 0/23 [00:00<?, ?it/s]

## Build Data Docs

In [15]:
# Render the Data Docs site and show where each site's index page was written

data_docs_sites = context.build_data_docs()
data_docs_sites

{'local_site': 'file:///Users/Shared/Workspaces/FTDS_2025/Fase_2/Final_Project/p2-ftds-final-project-ftds-050-rmt-group-001/GX/gx/uncommitted/data_docs/local_site/index.html'}