# Numeric Generalization Playground

This notebook exercises `pamola_core.anonymization.generalization.numeric_op.NumericGeneralizationOperation` with a small synthetic dataset, making it easy to validate binning, rounding, and custom range strategies.

## How to use

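1. Run the environment/setup cell below to locate the project root, add it to `sys.path`, and create the demo dataset.
2. Execute the helper setup cell to import the project modules and define the reusable helpers (`NotebookReporter` and `run_numeric_generalization`).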
3. Run any of the strategy sections (binning, rounding, range) to generate outputs inside `docs/common/examples/notebook_artifacts/numeric_generalization` (relative to the project root). Each run writes to its own `<strategy>_<timestamp>` subfolder.

Feel free to tweak the parameters and re-run cells to explore different behaviors.

In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Keep rendered tables short and fix the legacy global NumPy seed so the
# synthetic demo data is the same on every fresh run.
pd.options.display.max_rows = 10
np.random.seed(seed=7)

def find_project_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` that contains ``pamola_core``.

    Args:
        start: Directory from which the upward search begins; it is resolved
            to an absolute path first.

    Returns:
        The first directory (``start`` itself, then each parent in turn) that
        has an entry named ``pamola_core``.

    Raises:
        RuntimeError: If neither ``start`` nor any of its parents contains
            ``pamola_core``.
    """
    resolved = start.resolve()
    # Walk from the starting directory outwards and stop at the first match.
    root = next(
        (
            folder
            for folder in (resolved, *resolved.parents)
            if (folder / "pamola_core").exists()
        ),
        None,
    )
    if root is None:
        raise RuntimeError("Run this notebook from inside the PAMOLA repository.")
    return root

# Resolve the repository root relative to wherever the notebook was launched.
PROJECT_ROOT = find_project_root(Path.cwd())

# Put the root at the front of the import path, but only once, so re-running
# this cell does not add duplicate entries.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")

# Synthetic 50-row dataset. The random columns are drawn in the order
# annual_income -> credit_score -> tenure_years, which matters for
# reproducibility under a fixed seed, and are then clipped to fixed bounds.
demo_df = pd.DataFrame(
    {
        "employee_id": np.arange(1, 51),
        "annual_income": np.random.normal(95000, 15000, 50).round(2),
        "credit_score": np.random.normal(690, 55, 50).round(0),
        "tenure_years": np.random.gamma(shape=2.5, scale=2.0, size=50).round(1),
    }
).assign(
    annual_income=lambda frame: frame["annual_income"].clip(lower=35000, upper=180000),
    credit_score=lambda frame: frame["credit_score"].clip(lower=520, upper=850),
    tenure_years=lambda frame: frame["tenure_years"].clip(lower=0, upper=20),
)

demo_df.head()

In [None]:
from datetime import datetime
from typing import Any, Dict, Optional

from pamola_core.anonymization.generalization.numeric_op import NumericGeneralizationOperation
from pamola_core.utils.ops.op_data_source import DataSource
from pamola_core.utils.ops.op_result import OperationStatus

# Every run in this notebook writes under this folder; create it up front so
# the helper below can add per-run subdirectories.
ARTIFACT_ROOT = PROJECT_ROOT.joinpath(
    "docs", "common", "examples", "notebook_artifacts", "numeric_generalization"
)
ARTIFACT_ROOT.mkdir(parents=True, exist_ok=True)

class NotebookReporter:
    """In-memory reporter handed to ``NumericGeneralizationOperation.execute``.

    It only records what it is told, in two lists that can be inspected after
    a run: ``operations`` and ``artifacts``.
    """

    def __init__(self):
        # Each list holds one plain dict per recorded call, in call order.
        self.operations: list[dict[str, Any]] = []
        self.artifacts: list[dict[str, Any]] = []

    def add_operation(self, description: str, details: Optional[Dict[str, Any]] = None) -> None:
        """Append an operation record; a missing or empty ``details`` is stored as ``{}``."""
        record = {
            "description": description,
            "details": details if details else {},
        }
        self.operations.append(record)

    def register_artifact(self, name: str, path: str, artifact_type: str = "file") -> None:
        """Append an artifact record with its name, path and type."""
        record = {"name": name, "path": path, "type": artifact_type}
        self.artifacts.append(record)

def run_numeric_generalization(
    strategy: str,
    *,
    mode: str = "ENRICH",
    source_df: Optional[pd.DataFrame] = None,
    task_suffix: Optional[str] = None,
    field_name: str = "annual_income",
    **operation_kwargs,
) -> Dict[str, Any]:
    """Execute NumericGeneralizationOperation and load the saved output for inspection.

    Args:
        strategy: Strategy name forwarded to the operation (the notebook uses
            ``"binning"``, ``"rounding"`` and ``"range"``).
        mode: Operation mode forwarded to the operation.
        source_df: Optional input frame. When omitted, the notebook-level
            ``demo_df`` is used. The input is deep-copied before the run.
        task_suffix: Suffix for the per-run folder name; defaults to the
            current timestamp formatted as ``%Y%m%d_%H%M%S``.
        field_name: Column to generalize. Defaults to ``"annual_income"``,
            the column every demo cell in this notebook inspects.
        **operation_kwargs: Extra keyword arguments forwarded unchanged to
            ``NumericGeneralizationOperation``.

    Returns:
        A dict with keys ``"result"``, ``"output"`` (DataFrame read from the
        first output artifact, or ``None`` if there is no such file),
        ``"artifact_path"``, ``"task_dir"``, ``"reporter"`` and ``"operation"``.

    Raises:
        RuntimeError: If the operation status is not ``OperationStatus.SUCCESS``.
    """
    task_suffix = task_suffix or datetime.now().strftime("%Y%m%d_%H%M%S")
    task_dir = ARTIFACT_ROOT / f"{strategy}_{task_suffix}"
    task_dir.mkdir(parents=True, exist_ok=True)

    reporter = NotebookReporter()
    # Do not use `source_df or demo_df`: the truth value of a DataFrame is
    # ambiguous and pandas raises ValueError, so any caller-supplied frame
    # would crash here. Compare against None explicitly instead.
    base_df = demo_df if source_df is None else source_df
    working_df = base_df.copy(deep=True)
    data_source = DataSource(dataframes={"main": working_df})

    operation = NumericGeneralizationOperation(
        field_name=field_name,
        strategy=strategy,
        mode=mode,
        generate_visualization=False,
        use_cache=False,
        save_output=True,
        **operation_kwargs,
    )

    result = operation.execute(
        data_source=data_source,
        task_dir=task_dir,
        reporter=reporter,
    )

    if result.status != OperationStatus.SUCCESS:
        raise RuntimeError(f"Operation failed: {result.error_message}")

    # Artifacts are looked up by the operation's own output format.
    # NOTE(review): the file is always parsed with pd.read_csv below; confirm
    # that output_format is CSV for the configurations used in this notebook.
    output_artifacts = result.get_artifacts_by_type(operation.output_format)
    artifact_path = Path(output_artifacts[0].path) if output_artifacts else None
    output_df = pd.read_csv(artifact_path) if artifact_path and artifact_path.exists() else None

    return {
        "result": result,
        "output": output_df,
        "artifact_path": artifact_path,
        "task_dir": task_dir,
        "reporter": reporter,
        "operation": operation,
    }


## Binning strategy

Equal-width binning condenses numeric values into interpretable ranges.

In [14]:
# Five equal-width bins over `annual_income`; show the original values next to
# the generalized `_annual_income` column from the saved output.
binning_run = run_numeric_generalization(
    "binning",
    bin_count=5,
    binning_method="equal_width",
)
binning_run["output"].loc[:, ["annual_income", "_annual_income"]].head(10)

jsonschema package not installed. Schema validation skipped.


Unnamed: 0,annual_income,_annual_income
0,120357.89,115106.20-128713.93
1,88010.94,87890.73-101498.47
2,95492.3,87890.73-101498.47
3,101112.74,87890.73-101498.47
4,83166.15,74283.00-87890.73
5,95030.98,87890.73-101498.47
6,94986.64,87890.73-101498.47
7,68679.14,60675.27-74283.00
8,110264.87,101498.47-115106.20
9,104007.48,101498.47-115106.20


## Rounding strategy

Rounding to the nearest thousand reduces precision without removing trend information.

In [None]:
# precision=-3 is the setting this section uses for "nearest thousand".
rounding_run = run_numeric_generalization("rounding", precision=-3)
print(f"Saved output: {rounding_run['artifact_path']}")
rounding_run["output"].loc[:, ["annual_income", "_annual_income"]].head(10)

## Range strategy

Custom ranges provide explicit labels for policy-specific intervals (e.g., income brackets).

In [None]:
# Explicit (lower, upper) income brackets passed to the "range" strategy.
range_run = run_numeric_generalization(
    "range",
    range_limits=[(40000, 60000), (60000, 90000), (90000, 120000), (120000, 200000)],
)
print(f"Saved output: {range_run['artifact_path']}")
range_run["output"].loc[:, ["annual_income", "_annual_income"]].head(10)