In [1]:
# Cell 1 — environment setup
import sys, os
project_root = os.path.abspath("..")   # one level up from notebooks/
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

import pandas as pd
print("Project root:", project_root)
print("SRC path appended:", src_path)


Project root: /home/cmogbo/Documents/jupyter_test/RESEARCH_DATA_ANALYST_PORTOFOLIO/cloud-data-ai-portfolio/project-01-serverless-data-qc
SRC path appended: /home/cmogbo/Documents/jupyter_test/RESEARCH_DATA_ANALYST_PORTOFOLIO/cloud-data-ai-portfolio/project-01-serverless-data-qc/src


In [2]:
# Cell 2 — sample dataframe for prototyping
df = pd.DataFrame([
    {"id": 1, "name": "Alice", "age": 30, "email": "a@x.com", "signup_date": "2020-01-01"},
    {"id": 2, "name": "Bob", "age": "twenty", "email": None, "signup_date": "2021-02-02"},
])
df


Unnamed: 0,id,name,age,email,signup_date
0,1,Alice,30,a@x.com,2020-01-01
1,2,Bob,twenty,,2021-02-02


In [3]:
# Cell 3 — prototype validator (quick, iterative)

# Import the pandas library, which is necessary for handling DataFrame structures
# and using its specific utility functions like pd.isna().
import pandas as pd

# --- Mock DataFrame Setup (Added for runnable code) ---
# NOTE: The original script referenced a global 'df', so we create a mock
# DataFrame here to make the example fully runnable and testable.
data = {
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", None],
    "age": [30, "twenty", 150]
}
df = pd.DataFrame(data)
# df.iloc[0] is {'id': 1, 'name': 'Alice', 'age': 30} (Expected: No issues)
# df.iloc[1] is {'id': 2, 'name': 'Bob', 'age': 'twenty'} (Expected: 'age' not_integer)
# df.iloc[2] is {'id': 3, 'name': None, 'age': 150} (Expected: 'name' missing_required, 'age' out_of_range)
# ----------------------------------------------------


def validate_row(row):
    """
    Validates a single row (Pandas Series or dictionary-like object) for
    missing required fields and age data type/range constraints.

    Args:
        row (pd.Series or dict): A single record to validate.

    Returns:
        list: A list of dictionaries, where each dictionary represents a
              data quality issue found in the row.
    """
    # Initialize an empty list to store all validation issues found in the row.
    issues = []

    # Define the list of columns that must be present and non-null in every row.
    required = ["id", "name", "age"]

    # 1. Missing Required Field Check
    for col in required:
        # Check if the column key is missing from the row OR if the value
        # associated with the column is considered NaN (Not a Number) by pandas,
        # which covers None, np.nan, and pd.NA.
        # .get(col, None) safely retrieves the value, defaulting to None if the key doesn't exist.
        if col not in row or pd.isna(row.get(col, None)):
            # If an issue is found, append a detailed dictionary describing it.
            issues.append({
                "column": col,
                "issue": "missing_required",
                "value": row.get(col, None) # The value that was found (often None or NaN)
            })

    # 2. Age-Specific Checks (Type and Range)
    try:
        # Retrieve the age value.
        age_val = row.get("age", None)

        # Attempt to convert the age value to an integer. This will fail
        # if the value is non-numeric (e.g., "twenty").
        age_int = int(age_val)

        # Check if the converted integer age is outside a reasonable range (0 to 120).
        if age_int < 0 or age_int > 120:
            # If out of range, record the specific issue.
            issues.append({
                "column": "age",
                "issue": "age_out_of_range",
                "value": age_int
            })

    # This catches exceptions like ValueError (if int(age_val) fails because it's non-numeric)
    # or TypeError (if age_val is None and fails conversion, though pd.isna should catch missing values).
    except Exception:
        # If the conversion to integer fails, record the "not_integer" issue.
        # This includes cases where age is non-numeric or missing (though missing is often caught above).
        issues.append({
            "column": "age",
            "issue": "not_integer",
            "value": row.get("age", None) # Shows the original value that caused the error
        })
# in notebook cell
from importlib import reload
import src.validator as validator
reload(validator)
df = pd.read_csv("../samples/good.csv")
report = validator.validate_dataframe(df)
report

    # Return the complete list of issues. If the list is empty, the row is valid.
    return issues

# quick tests
# Use the iloc accessor to pass a single row (as a Series) to the validator.
print("Row 0 issues (Expected: []):", validate_row(df.iloc[0]))
print("Row 1 issues (Expected: 'age' not_integer):", validate_row(df.iloc[1]))
print("Row 2 issues (Expected: 'name' missing, 'age' out_of_range):", validate_row(df.iloc[2]))

Row 0 issues (Expected: []): []
Row 1 issues (Expected: 'age' not_integer): [{'column': 'age', 'issue': 'not_integer', 'value': 'twenty'}]
Row 2 issues (Expected: 'name' missing, 'age' out_of_range): [{'column': 'name', 'issue': 'missing_required', 'value': None}, {'column': 'age', 'issue': 'age_out_of_range', 'value': 150}]


In [8]:
import os
import sys

# 1. Get the path of the current notebook (the directory it's in).
notebook_dir = os.getcwd()

# 2. Go up one level to find the project root (where src/ is located).
project_root = os.path.join(notebook_dir, '..')

# 3. Add the project root to the Python search path.
sys.path.insert(0, project_root)

# Now, imports should work!

import src.handler as handler_mod
import src.utils as utils_mod

# monkeypatch read_s3_object to read file bytes
orig_read = utils_mod.read_s3_object
orig_write = utils_mod.write_s3_json

utils_mod.read_s3_object = lambda b,k: open("samples/good.csv","rb").read()
written = {}
def fake_write(b,k,p): written[k]=p
utils_mod.write_s3_json = fake_write

event = {"Records":[{"s3":{"bucket":{"name":"local-bucket"},"object":{"key":"good.csv"}}}]}
res = handler_mod.handler(event, None)
print(res)
print("written keys:", written.keys())

# restore
utils_mod.read_s3_object = orig_read
utils_mod.write_s3_json = orig_write


Error reading S3 object good.csv from local-bucket: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied


ClientError: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied

In [9]:
import os
import sys
import json
import io
import pandas as pd
from botocore.exceptions import ClientError

# --- Setup to fix module imports (remains the same) ---
# 1. Get the path of the current notebook (the directory it's in).
notebook_dir = os.getcwd()

# 2. Go up one level to find the project root (where src/ is located).
project_root = os.path.join(notebook_dir, '..')

# 3. Add the project root to the Python search path.
sys.path.insert(0, project_root)

# Now, imports should work!
import src.handler as handler_mod
import src.utils as utils_mod

# ----------------------------------------------------------------------
# --- Monkeypatching for Local Testing ---
# Goal: Replace the AWS-dependent S3 functions with local file access/mocking
# to test the handler logic without needing live AWS credentials.
# ----------------------------------------------------------------------

# 1. Backup original functions
orig_read = utils_mod.read_s3_object
orig_write = utils_mod.write_s3_json

# 2. Mock storage for written reports
written_reports = {}

# 3. Replace read_s3_object to read a local sample file (e.g., samples/good.csv)
# NOTE: The lambda function doesn't need to check the bucket/key; it just reads the mock file.
# We are assuming 'samples/good.csv' exists in the project root structure.
def fake_read(bucket, key):
    # This reads the CSV file from your local disk and returns its bytes, 
    # perfectly simulating what the real S3 call would return.
    try:
        # Assuming the samples folder is in the project root directory
        file_path = os.path.join(project_root, "samples", "good.csv")
        with open(file_path, "rb") as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: Sample file not found at {file_path}")
        # To mimic the real function, we raise an error if the mock file is missing.
        raise

utils_mod.read_s3_object = fake_read

# 4. Replace write_s3_json to save the report to the local 'written_reports' dictionary
def fake_write(bucket, key, payload, acl=None):
    # Saves the report payload (Python dict) to our local mock storage.
    written_reports[key] = payload

utils_mod.write_s3_json = fake_write


# ----------------------------------------------------------------------
# --- Test Execution ---
# ----------------------------------------------------------------------

# Mock the S3 event payload.
event = {"Records":[
    {"s3":{
        "bucket":{"name":"local-bucket"},
        "object":{"key":"good.csv"} # This key will be used to generate the report path
    }}
]}

# Run the handler (Lambda execution simulation)
# It will use fake_read (Step 3) and fake_write (Step 4).
try:
    print("--- Starting handler execution with mock S3 I/O ---")
    res = handler_mod.handler(event, None)
    
    # Check results
    print(f"\nHandler Response: {res}")
    print(f"Report Key Written: {list(written_reports.keys())[0]}")
    
    # You can inspect the generated report here
    # report_content = written_reports[list(written_reports.keys())[0]]
    # print(json.dumps(report_content, indent=2))

except Exception as e:
    print(f"\n--- Handler execution failed ---")
    # If the handler fails, we print the error here instead of relying on the traceback.
    print(f"Error: {e}")

# ----------------------------------------------------------------------
# --- Cleanup ---
# ----------------------------------------------------------------------

# Restore the original AWS-dependent functions so other tests don't break.
utils_mod.read_s3_object = orig_read
utils_mod.write_s3_json = orig_write

print("\nCleanup complete: Original S3 functions restored.")

--- Starting handler execution with mock S3 I/O ---
Error reading S3 object good.csv from local-bucket: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied

--- Handler execution failed ---
Error: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied

Cleanup complete: Original S3 functions restored.


In [11]:
import os
import sys
import json
import io
import pandas as pd
from botocore.exceptions import ClientError

# --- Setup to fix module imports (remains the same) ---
# 1. Get the path of the current notebook (the directory it's in).
notebook_dir = os.getcwd()

# 2. Go up one level to find the project root (where src/ is located).
project_root = os.path.join(notebook_dir, '..')

# 3. Add the project root to the Python search path.
sys.path.insert(0, project_root)

# Now, imports should work!
import src.handler as handler_mod
import src.utils as utils_mod

# ----------------------------------------------------------------------
# --- Monkeypatching for Local Testing ---
# ----------------------------------------------------------------------

# 1. Backup original functions (from the utils module)
orig_read = utils_mod.read_s3_object
orig_write = utils_mod.write_s3_json

# 2. Mock storage for written reports
written_reports = {}

# 3. Define the fake read function
def fake_read(bucket, key):
    # This reads the CSV file from your local disk and returns its bytes.
    try:
        # Assuming the samples folder is in the project root directory
        file_path = os.path.join(project_root, "samples", "good.csv")
        with open(file_path, "rb") as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: Sample file not found at {file_path}")
        raise

# 4. Define the fake write function
def fake_write(bucket, key, payload, acl=None):
    # Saves the report payload (Python dict) to our local mock storage.
    written_reports[key] = payload

# 5. !!! CRITICAL FIX: PATCH BOTH MODULES !!!
# We patch both handler_mod and utils_mod to ensure the functions are replaced 
# where they are defined AND where they are called.
utils_mod.read_s3_object = fake_read
utils_mod.write_s3_json = fake_write

# Since handler.py imported these functions by name, we MUST patch them there too.
handler_mod.read_s3_object = fake_read
handler_mod.write_s3_json = fake_write


# ----------------------------------------------------------------------
# --- Test Execution ---
# ----------------------------------------------------------------------

# Mock the S3 event payload.
event = {"Records":[
    {"s3":{
        "bucket":{"name":"local-bucket"},
        "object":{"key":"good.csv"} # This key will be used to generate the report path
    }}
]}

# Run the handler (Lambda execution simulation)
try:
    print("--- Starting handler execution with mock S3 I/O ---")
    res = handler_mod.handler(event, None)
    
    # Check results
    print(f"\nHandler Response: {res}")
    print(f"Report Key Written: {list(written_reports.keys())[0]}")
    
    # Print the report content to verify the validation worked
    report_key = list(written_reports.keys())[0]
    report_content = written_reports[report_key]
    print("\n--- Validation Report Content (First 1000 chars) ---")
    print(json.dumps(report_content, indent=2)[:2000] + "...")

except Exception as e:
    print(f"\n--- Handler execution failed ---\nError: {e}")

# ----------------------------------------------------------------------
# --- Cleanup ---\n# ----------------------------------------------------------------------

# Restore the original AWS-dependent functions so other tests don't break.
utils_mod.read_s3_object = orig_read
utils_mod.write_s3_json = orig_write

# Also restore the handler module's original references
handler_mod.read_s3_object = orig_read
handler_mod.write_s3_json = orig_write

print("\nCleanup complete: Original S3 functions restored.")

--- Starting handler execution with mock S3 I/O ---

Handler Response: {'statusCode': 200, 'body': '{"report_key": "reports/good.csv.qc.json"}'}
Report Key Written: reports/good.csv.qc.json

--- Validation Report Content (First 1000 chars) ---
{
  "num_rows": 2,
  "num_columns": 5,
  "columns_present": [
    "id",
    "name",
    "age",
    "email",
    "signup_date"
  ],
  "row_issues": {},
  "summary": {
    "columns": {
      "id": {
        "non_null_count": 2,
        "unique": 2,
        "dtype": "int64"
      },
      "name": {
        "non_null_count": 2,
        "unique": 2,
        "dtype": "object"
      },
      "age": {
        "non_null_count": 2,
        "unique": 2,
        "dtype": "int64"
      },
      "email": {
        "non_null_count": 2,
        "unique": 2,
        "dtype": "object"
      },
      "signup_date": {
        "non_null_count": 2,
        "unique": 2,
        "dtype": "object"
      }
    }
  },
  "missing_required_columns": [],
  "total_issues": 0
}