# Numeric Generalization Playground

This notebook exercises `pamola_core.anonymization.generalization.numeric_op.NumericGeneralizationOperation` with a small synthetic dataset, making it easy to validate binning, rounding, and custom range strategies.

## How to use

1. Run the environment/setup cell below to import project modules and create the demo dataset.
2. Execute the helper setup cell to register reusable utilities.
3. Run any of the strategy sections (binning, rounding, range) to generate outputs inside `docs/common/examples/notebook_artifacts/numeric_generalization`.

Feel free to tweak the parameters and re-run cells to explore different behaviors.

In [1]:
python --version

NameError: name 'python' is not defined

In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

pd.set_option("display.max_rows", 10)
np.random.seed(7)

def find_project_root(start: Path) -> Path:
    start = start.resolve()
    for candidate in [start, *start.parents]:
        if (candidate / "pamola_core").exists():
            return candidate
    raise RuntimeError("Run this notebook from inside the PAMOLA repository.")

PROJECT_ROOT = find_project_root(Path.cwd())
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")

demo_df = pd.DataFrame({
    "employee_id": np.arange(1, 51),
    "annual_income": np.random.normal(95000, 15000, 50).round(2),
    "credit_score": np.random.normal(690, 55, 50).round(0),
    "tenure_years": np.random.gamma(shape=2.5, scale=2.0, size=50).round(1),
})
demo_df["annual_income"] = demo_df["annual_income"].clip(lower=35000, upper=180000)
demo_df["credit_score"] = demo_df["credit_score"].clip(lower=520, upper=850)
demo_df["tenure_years"] = demo_df["tenure_years"].clip(lower=0, upper=20)

demo_df.head()

In [None]:

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, Optional
import tempfile

from pamola_core.anonymization.generalization.numeric_op import NumericGeneralizationOperation
from pamola_core.utils.ops.op_data_source import DataSource
from pamola_core.utils.ops.op_result import OperationStatus


@dataclass
class ReporterSession:
    """Encapsulates debug information for a single execution session."""

    name: str
    created_at: datetime
    operations: list[dict[str, Any]] = field(default_factory=list)
    artifacts: list[dict[str, Any]] = field(default_factory=list)
    debug_notes: list[str] = field(default_factory=list)

    def log_operation(self, description: str, details: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        entry = {
            "description": description,
            "details": details or {},
            "timestamp": datetime.now().isoformat(),
        }
        self.operations.append(entry)
        return entry

    def log_artifact(self, name: str, path: str, artifact_type: str = "file") -> Dict[str, Any]:
        entry = {
            "name": name,
            "path": path,
            "type": artifact_type,
            "timestamp": datetime.now().isoformat(),
        }
        self.artifacts.append(entry)
        return entry

    def add_note(self, message: str) -> str:
        note = f"{datetime.now().isoformat()} - {message}"
        self.debug_notes.append(note)
        return note


class NotebookReporter:
    """Debug-friendly reporter that groups events into named sessions."""

    def __init__(self):
        self.sessions: Dict[str, ReporterSession] = {}
        self._session_counter = 0
        self.active_session: ReporterSession = self.start_session()

    def start_session(self, name: Optional[str] = None) -> ReporterSession:
        if name and name in self.sessions:
            raise ValueError(f"Session '{name}' already exists.")
        if not name:
            self._session_counter += 1
            name = f"session_{self._session_counter}"
        session = ReporterSession(name=name, created_at=datetime.now())
        self.sessions[name] = session
        self.active_session = session
        return session

    def use_session(self, name: str) -> ReporterSession:
        """Switch to an existing session or create it if missing."""
        if name in self.sessions:
            self.active_session = self.sessions[name]
        else:
            self.active_session = self.start_session(name)
        return self.active_session

    @property
    def operations(self):
        return self.active_session.operations

    @property
    def artifacts(self):
        return self.active_session.artifacts

    def add_operation(self, description: str, details: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        return self.active_session.log_operation(description, details)

    def register_artifact(self, name: str, path: str, artifact_type: str = "file") -> Dict[str, Any]:
        return self.active_session.log_artifact(name, path, artifact_type)

    def add_debug_note(self, message: str) -> str:
        return self.active_session.add_note(message)

    def summary(self) -> Dict[str, Dict[str, Any]]:
        return {
            name: {
                "operations": len(session.operations),
                "artifacts": len(session.artifacts),
                "notes": session.debug_notes,
            }
            for name, session in self.sessions.items()
        }


def _build_preview_df(operation: NumericGeneralizationOperation, df: pd.DataFrame, mode: str, strategy: str, params: Dict[str, Any]) -> pd.DataFrame:
    preview_kwargs = {
        "field_name": operation.field_name,
        "output_field_name": operation.output_field_name if mode == "ENRICH" else operation.field_name,
        "mode": mode,
        "strategy": strategy,
    }
    preview_kwargs.update(params)
    return NumericGeneralizationOperation.process_batch(df.copy(deep=True), **preview_kwargs)


def run_numeric_generalization(
    strategy: str,
    *,
    mode: str = "ENRICH",
    source_df: Optional[pd.DataFrame] = None,
    task_suffix: Optional[str] = None,
    **operation_kwargs,
) -> Dict[str, Any]:
    """Execute NumericGeneralizationOperation and return in-memory results."""
    reporter = NotebookReporter()
    session_name = f"{strategy}_{task_suffix or datetime.now().strftime('%Y%m%d_%H%M%S')}"
    reporter.use_session(session_name)
    reporter.add_debug_note(f"Running {strategy} strategy with params: {operation_kwargs}")

    working_df = (source_df or demo_df).copy(deep=True)
    data_source = DataSource(dataframes={"main": working_df})

    operation = NumericGeneralizationOperation(
        field_name="annual_income",
        strategy=strategy,
        mode=mode,
        generate_visualization=False,
        use_cache=False,
        save_output=False,
        **operation_kwargs,
    )

    with tempfile.TemporaryDirectory(prefix=f"numeric_generalization_{strategy}_") as tmp_dir:
        task_dir = Path(tmp_dir)
        result = operation.execute(
            data_source=data_source,
            task_dir=task_dir,
            reporter=reporter,
        )

        if result.status != OperationStatus.SUCCESS:
            raise RuntimeError(f"Operation failed: {result.error_message}")

        preview_df = _build_preview_df(operation, working_df, mode, strategy, operation_kwargs)

    print(f"[{session_name}] status={result.status.value}, operations_logged={len(reporter.operations)}")

    return {
        "result": result,
        "preview_df": preview_df,
        "reporter": reporter,
        "reporter_session": reporter.active_session,
        "operation": operation,
    }



## Binning strategy

Equal-width binning condenses numeric values into interpretable ranges.

In [None]:

binning_run = run_numeric_generalization(
    strategy="binning",
    bin_count=5,
    binning_method="equal_width",
)
binning_cols = ["annual_income", binning_run["operation"].output_field_name]
binning_run["preview_df"][binning_cols].head(10)



## Rounding strategy

Rounding to the nearest thousand reduces precision without removing trend information.

In [None]:

rounding_run = run_numeric_generalization(
    strategy="rounding",
    precision=-3,
)
rounding_cols = ["annual_income", rounding_run["operation"].output_field_name]
rounding_run["preview_df"][rounding_cols].head(10)



## Range strategy

Custom ranges provide explicit labels for policy-specific intervals (e.g., income brackets).

In [None]:

range_run = run_numeric_generalization(
    strategy="range",
    range_limits=[
        (40000, 60000),
        (60000, 90000),
        (90000, 120000),
        (120000, 200000),
    ],
)
range_cols = ["annual_income", range_run["operation"].output_field_name]
range_run["preview_df"][range_cols].head(10)

