In [1]:
# Cell 1 — environment setup
# Makes the project's src/ directory importable from this notebook.
import os
import sys

import pandas as pd

# The project root is one level up from notebooks/.
project_root = os.path.abspath("..")
src_path = os.path.join(project_root, "src")

# Guarded append: re-running this cell must not add a duplicate entry.
if src_path not in sys.path:
    sys.path.append(src_path)

# Echo the resolved locations so a wrong working directory is easy to spot.
print("Project root:", project_root)
print("SRC path appended:", src_path)


Project root: /home/cmogbo/Documents/jupyter_test/RESEARCH_DATA_ANALYST_PORTOFOLIO/cloud-data-ai-portfolio/project-01-serverless-data-qc
SRC path appended: /home/cmogbo/Documents/jupyter_test/RESEARCH_DATA_ANALYST_PORTOFOLIO/cloud-data-ai-portfolio/project-01-serverless-data-qc/src


In [2]:
# Cell 2 — sample dataframe for prototyping
# Two hand-written records: the first is clean, the second carries a
# non-numeric age ("twenty") and a null email.
sample_rows = [
    {
        "id": 1,
        "name": "Alice",
        "age": 30,
        "email": "a@x.com",
        "signup_date": "2020-01-01",
    },
    {
        "id": 2,
        "name": "Bob",
        "age": "twenty",
        "email": None,
        "signup_date": "2021-02-02",
    },
]
df = pd.DataFrame(sample_rows)

# Bare last expression so the notebook renders the frame.
df


Unnamed: 0,id,name,age,email,signup_date
0,1,Alice,30,a@x.com,2020-01-01
1,2,Bob,twenty,,2021-02-02


In [3]:
# Cell 3 — prototype validator (quick, iterative)

# Import the pandas library, which is necessary for handling DataFrame structures
# and using its specific utility functions like pd.isna().
import pandas as pd

# --- Mock DataFrame setup (keeps this cell runnable on its own) ---
# The validator's quick tests read a module-level `df`, so a small mock frame
# is built here. NOTE: this rebinds `df`, replacing the frame from Cell 2.
#
# Expected validation outcome per row:
#   position 0 -> id=1, name='Alice', age=30        : no issues
#   position 1 -> id=2, name='Bob',   age='twenty'  : 'age' not_integer
#   position 2 -> id=3, name=None,    age=150       : 'name' missing_required,
#                                                     'age' out_of_range
data = {
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", None],
    "age": [30, "twenty", 150],
}
# from_dict with its default orient treats each key as a column, which is
# the same construction as calling pd.DataFrame(data) directly.
df = pd.DataFrame.from_dict(data)
# -------------------------------------------------------------------


def _is_missing(row, col):
    """
    Report whether `col` is absent from `row` or holds a null scalar.

    Args:
        row (pd.Series or dict): The record being validated.
        col (str): The column name to look up.

    Returns:
        bool: True when the key is not in `row`, or when its value is a
              scalar that pd.isna() treats as null (None, NaN, pd.NA, NaT).
    """
    if col not in row:
        return True
    value = row.get(col, None)
    # pd.isna() returns an element-wise array for list-like values, and an
    # array has no single truth value. A list-like cell is a present value,
    # so only scalars are tested for null-ness.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _as_integer(value):
    """
    Convert `value` to an int, or return None when it is not a whole number.

    Args:
        value: The raw cell value (int, float, str, ...).

    Returns:
        int or None: The integer value, or None if `value` is a fractional or
                     infinite float, a non-numeric string, or a type that
                     int() rejects.
    """
    # int() truncates floats silently (int(30.7) == 30), which would let a
    # fractional age pass as an integer. Whole floats such as 30.0 are kept
    # because pandas upcasts integer columns containing nulls to float.
    if isinstance(value, float) and not value.is_integer():
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def validate_row(row, required=("id", "name", "age"), age_min=0, age_max=120):
    """
    Validates a single row (Pandas Series or dictionary-like object) for
    missing required fields and age data type/range constraints.

    Args:
        row (pd.Series or dict): A single record to validate. It must support
            `in` for key membership and `.get(key, default)`.
        required (iterable of str): Columns that must be present and non-null.
            Defaults to ("id", "name", "age").
        age_min (int): Smallest accepted age, inclusive. Defaults to 0.
        age_max (int): Largest accepted age, inclusive. Defaults to 120.

    Returns:
        list: A list of dictionaries, where each dictionary represents a
              data quality issue found in the row, with the keys "column",
              "issue" and "value". An empty list means the row is valid.
              Possible "issue" values are "missing_required", "not_integer"
              and "age_out_of_range".
    """
    issues = []

    # 1. Missing Required Field Check
    for col in required:
        if _is_missing(row, col):
            issues.append({
                "column": col,
                "issue": "missing_required",
                "value": row.get(col, None)  # The value found (None or NaN)
            })

    # 2. Age-Specific Checks (Type and Range)
    # An absent age has no type or range to check. Stopping here also avoids
    # reporting the same missing value a second time as "not_integer".
    if _is_missing(row, "age"):
        return issues

    age_val = row.get("age", None)
    age_int = _as_integer(age_val)

    if age_int is None:
        issues.append({
            "column": "age",
            "issue": "not_integer",
            "value": age_val  # The original value that failed conversion
        })
    elif age_int < age_min or age_int > age_max:
        issues.append({
            "column": "age",
            "issue": "age_out_of_range",
            "value": age_int
        })

    return issues

# quick tests
# Each entry pairs a row position in `df` with the label printed next to the
# validator's result; the label states the outcome we expect to see.
quick_checks = (
    (0, "Row 0 issues (Expected: []):"),
    (1, "Row 1 issues (Expected: 'age' not_integer):"),
    (2, "Row 2 issues (Expected: 'name' missing, 'age' out_of_range):"),
)
for position, label in quick_checks:
    # .iloc[position] hands the validator a single row as a Series.
    print(label, validate_row(df.iloc[position]))

Row 0 issues (Expected: []): []
Row 1 issues (Expected: 'age' not_integer): [{'column': 'age', 'issue': 'not_integer', 'value': 'twenty'}]
Row 2 issues (Expected: 'name' missing, 'age' out_of_range): [{'column': 'name', 'issue': 'missing_required', 'value': None}, {'column': 'age', 'issue': 'age_out_of_range', 'value': 150}]
