# Datetime Generalization Playground

Interactively exercise `DateTimeGeneralizationOperation` with synthetic event logs to verify rounding, binning, and component strategies.


## How to use

1. Run the setup cell to configure imports and build the demo dataset.
2. Execute the helper cell to register reporter utilities and the wrapper function.
3. Experiment with the strategy cells or tweak parameters to inspect other generalization behaviors without writing artifacts to disk.


In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Display tuning: show at most 12 rows per rendered frame, never hide columns.
pd.options.display.max_rows = 12
pd.options.display.max_columns = None
# Seed NumPy's global RNG so the random draws in this notebook are repeatable.
np.random.seed(21)


def find_project_root(start: Path) -> Path:
    """Locate the repository root by walking upward from *start*.

    The root is the nearest directory (``start`` itself included) that
    contains an entry named ``pamola_core``.

    Args:
        start: Directory where the upward search begins; it is resolved
            to an absolute path first.

    Returns:
        The first directory on the way up that contains ``pamola_core``.

    Raises:
        RuntimeError: If neither ``start`` nor any of its parents contains
            ``pamola_core``.
    """
    origin = start.resolve()
    # Lazily scan from the innermost directory outward; the first hit wins.
    matches = (
        folder
        for folder in (origin, *origin.parents)
        if (folder / "pamola_core").exists()
    )
    root = next(matches, None)
    if root is None:
        raise RuntimeError("Run this notebook inside the PAMOLA repository.")
    return root

# Resolve the repository root from the current working directory and put it
# at the front of sys.path (once) so the `pamola_core` package is importable.
PROJECT_ROOT = find_project_root(Path.cwd())
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

print(f"Project root: {PROJECT_ROOT}")

# Synthetic event log: 120 base timestamps spaced 12 hours apart, each pushed
# forward by a random whole number of minutes in [0, 360).
# The lowercase "h" alias is used on purpose: pandas 2.2 deprecated the
# uppercase "H" spelling and emits a FutureWarning for it.
base_timestamps = pd.date_range("2024-01-01", periods=120, freq="12h")
random_offsets = np.random.randint(0, 6 * 60, size=len(base_timestamps))

# The random draws happen in a fixed order (offsets, region, channel), so the
# frame is reproducible for a given NumPy seed.
# NOTE(review): the region code "NA" is an ordinary string here, but pandas'
# CSV reader treats the text "NA" as missing by default - keep that in mind if
# this frame is ever round-tripped through CSV.
demo_df = pd.DataFrame({
    "event_id": np.arange(1, len(base_timestamps) + 1),
    "event_timestamp": base_timestamps + pd.to_timedelta(random_offsets, unit="m"),
    "region": np.random.choice(["NA", "EU", "APAC", "LATAM"], size=len(base_timestamps)),
    "channel": np.random.choice(["web", "mobile", "email"], size=len(base_timestamps)),
})

# Last expression of the cell: renders the first five rows as the cell output.
demo_df.head()


Project root: /root/PAMOLA


  base_timestamps = pd.date_range("2024-01-01", periods=120, freq="12H")


Unnamed: 0,event_id,event_timestamp,region,channel
0,1,2024-01-01 03:27:00,EU,email
1,2,2024-01-01 17:12:00,EU,email
2,3,2024-01-02 04:20:00,LATAM,web
3,4,2024-01-02 12:48:00,EU,mobile
4,5,2024-01-03 02:00:00,NA,email


In [28]:
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
import tempfile

from pamola_core.anonymization.generalization.datetime_op import DateTimeGeneralizationOperation
from pamola_core.utils.ops.op_data_source import DataSource
from pamola_core.utils.ops.op_result import OperationStatus


@dataclass
class ReporterSession:
    """In-memory log of a single notebook run.

    Entries are only appended to the lists held on the instance; this class
    itself performs no I/O.
    """

    # Label identifying the session.
    name: str
    # Creation timestamp supplied by whoever builds the session.
    created_at: datetime
    # Operation records, in the order they were logged.
    operations: list[dict[str, Any]] = field(default_factory=list)
    # Artifact records; no method of this class writes to it.
    artifacts: list[dict[str, Any]] = field(default_factory=list)
    # Free-form, timestamp-prefixed debug lines.
    debug_notes: list[str] = field(default_factory=list)

    def log_operation(self, description: str, details: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Record an operation and return the stored entry.

        Args:
            description: Short text describing the operation.
            details: Optional extra data; a falsy value is stored as ``{}``.

        Returns:
            The dict appended to ``operations``, with the keys
            ``description``, ``details`` and ``timestamp`` (ISO-8601 string
            of the current local time).
        """
        record: Dict[str, Any] = {}
        record['description'] = description
        record['details'] = details if details else {}
        record['timestamp'] = datetime.now().isoformat()
        self.operations.append(record)
        return record

    def add_note(self, message: str) -> str:
        """Store *message* prefixed with the current ISO timestamp.

        Returns:
            The exact line appended to ``debug_notes``.
        """
        stamp = datetime.now().isoformat()
        line = f"{stamp} - {message}"
        self.debug_notes.append(line)
        return line


class NotebookReporter:
    """Reporter stand-in that files every logged event under a named session.

    A default, auto-named session is opened on construction, so the reporter
    can be used immediately. More sessions can be opened or re-selected by
    name; operations and notes always go to the currently active one.
    """

    def __init__(self):
        # Every session opened so far, keyed by its name.
        self.sessions: Dict[str, ReporterSession] = {}
        # Feeds the auto-generated "session_<n>" names.
        self._session_counter = 0
        # start_session() both registers the session and marks it active.
        self.active_session: ReporterSession = self.start_session()

    def start_session(self, name: Optional[str] = None) -> ReporterSession:
        """Create a new session, make it active and return it.

        Args:
            name: Desired session name. When falsy, the next
                ``session_<n>`` name is generated.

        Raises:
            ValueError: If an explicit ``name`` is already registered.
        """
        if not name:
            self._session_counter += 1
            name = f"session_{self._session_counter}"
        elif name in self.sessions:
            raise ValueError(f"Session '{name}' already exists")
        created = ReporterSession(name=name, created_at=datetime.now())
        self.sessions[name] = created
        self.active_session = created
        return created

    def use_session(self, name: str) -> ReporterSession:
        """Activate the session called *name*, creating it if it is unknown."""
        session = self.sessions.get(name)
        if session is None:
            session = self.start_session(name)
        self.active_session = session
        return session

    def add_operation(self, description: str, details: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Log an operation on the active session and return its entry."""
        return self.active_session.log_operation(description, details)

    def add_debug_note(self, message: str) -> str:
        """Attach a debug note to the active session and return the stored line."""
        return self.active_session.add_note(message)

    def summary(self) -> Dict[str, Dict[str, Any]]:
        """Return, per session name, the operation count and the notes list.

        The ``notes`` value is the session's own list object, not a copy.
        """
        overview: Dict[str, Dict[str, Any]] = {}
        for label, session in self.sessions.items():
            overview[label] = {
                'operations': len(session.operations),
                'notes': session.debug_notes,
            }
        return overview


def _build_preview_df(
    operation: DateTimeGeneralizationOperation,
    df: pd.DataFrame,
    mode: str,
    strategy: str,
    params: Dict[str, Any],
) -> pd.DataFrame:
    """Produce a preview frame by calling ``process_batch`` on a copy of *df*.

    Args:
        operation: Configured operation; only its ``field_name`` and
            ``output_field_name`` attributes are read.
        df: Input frame. It is deep-copied before being handed over, so the
            caller's object is not the one passed to ``process_batch``.
        mode: When equal to ``'ENRICH'`` the operation's output field name is
            used as target; otherwise the source field name is reused.
        strategy: Strategy name forwarded unchanged.
        params: Extra keyword arguments; they override the base arguments on
            key collisions.

    Returns:
        Whatever ``DateTimeGeneralizationOperation.process_batch`` returns.
    """
    source_field = operation.field_name
    if mode == 'ENRICH':
        target_field = operation.output_field_name
    else:
        target_field = operation.field_name

    batch_kwargs = {
        'field_name': source_field,
        'output_field_name': target_field,
        'mode': mode,
        'strategy': strategy,
        **params,
    }
    frame_copy = df.copy(deep=True)
    return DateTimeGeneralizationOperation.process_batch(frame_copy, **batch_kwargs)


def run_datetime_generalization(
    strategy: str,
    *,
    mode: str = 'ENRICH',
    source_df: Optional[pd.DataFrame] = None,
    session_label: Optional[str] = None,
    **operation_kwargs,
) -> Dict[str, Any]:
    """Run a datetime generalization on the ``event_timestamp`` column.

    A fresh ``NotebookReporter`` is created for every call, the operation is
    executed against a deep copy of the input frame inside a temporary task
    directory, and a preview frame is built with the same parameters.

    Args:
        strategy: Strategy name passed to ``DateTimeGeneralizationOperation``.
        mode: Operation mode; defaults to ``'ENRICH'``.
        source_df: Frame to process. When ``None`` the notebook-level
            ``demo_df`` is used.
        session_label: Reporter session name. When falsy, a name of the form
            ``<strategy>_<YYYYmmdd_HHMMSS>`` is generated.
        **operation_kwargs: Strategy-specific options forwarded both to the
            operation constructor and to the preview step.

    Returns:
        A dict with the keys ``result``, ``preview_df``, ``reporter``,
        ``reporter_session`` and ``operation``.

    Raises:
        RuntimeError: If the operation's status is not
            ``OperationStatus.SUCCESS``.
    """
    reporter = NotebookReporter()
    session_name = session_label or f"{strategy}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    reporter.use_session(session_name)
    reporter.add_debug_note(f"Running {strategy} with params: {operation_kwargs}")

    # Compare against None explicitly: `source_df or demo_df` asks for the
    # truth value of a DataFrame, which pandas rejects with a ValueError, so
    # any caller-supplied frame used to crash here.
    base_df = demo_df if source_df is None else source_df
    working_df = base_df.copy(deep=True)
    data_source = DataSource(dataframes={'main': working_df})

    operation = DateTimeGeneralizationOperation(
        field_name='event_timestamp',
        strategy=strategy,
        mode=mode,
        generate_visualization=False,
        use_cache=False,
        save_output=False,
        **operation_kwargs,
    )

    # The task directory is temporary and is removed when the block exits.
    with tempfile.TemporaryDirectory(prefix=f"datetime_generalization_{strategy}_") as tmp_dir:
        result = operation.execute(
            data_source=data_source,
            task_dir=Path(tmp_dir),
            reporter=reporter,
        )

        if result.status != OperationStatus.SUCCESS:
            raise RuntimeError(f"Operation failed: {result.error_message}")

        preview_df = _build_preview_df(operation, working_df, mode, strategy, operation_kwargs)

    print(f"[{session_name}] status={result.status.value}; operations_logged={len(reporter.active_session.operations)}")

    return {
        'result': result,
        'preview_df': preview_df,
        'reporter': reporter,
        'reporter_session': reporter.active_session,
        'operation': operation,
    }

## Rounding strategy

Round timestamps down to the start of their month, removing intra-month precision while preserving seasonal trends.


In [29]:
# Month-level rounding with the default mode (ENRICH).
rounding_run = run_datetime_generalization("rounding", rounding_unit="month")
# Show the source column next to the column the operation wrote to.
rounding_cols = [
    "event_timestamp",
    rounding_run["operation"].output_field_name,
]
rounding_run["preview_df"].loc[:, rounding_cols].head(12)


[rounding_20251108_182905] status=success; operations_logged=3


Unnamed: 0,event_timestamp,_event_timestamp
0,2024-01-01 03:27:00,2024-01-01
1,2024-01-01 17:12:00,2024-01-01
2,2024-01-02 04:20:00,2024-01-01
3,2024-01-02 12:48:00,2024-01-01
4,2024-01-03 02:00:00,2024-01-01
5,2024-01-03 15:08:00,2024-01-01
6,2024-01-04 01:38:00,2024-01-01
7,2024-01-04 16:01:00,2024-01-01
8,2024-01-05 03:22:00,2024-01-01
9,2024-01-05 17:28:00,2024-01-01


## Binning strategy

Group timestamps into fixed 7-day bins (labelled by day range, e.g. `Day 0-6`) to align with weekly reporting intervals.


In [30]:
# Day-range binning with a 7-day interval, default mode (ENRICH).
binning_run = run_datetime_generalization(
    "binning", bin_type="day_range", interval_size=7, interval_unit="days"
)
# Show the source column next to the column the operation wrote to.
binning_cols = [
    "event_timestamp",
    binning_run["operation"].output_field_name,
]
binning_run["preview_df"].loc[:, binning_cols].head(12)


[binning_20251108_182913] status=success; operations_logged=3


Unnamed: 0,event_timestamp,_event_timestamp
0,2024-01-01 03:27:00,Day 0-6
1,2024-01-01 17:12:00,Day 0-6
2,2024-01-02 04:20:00,Day 0-6
3,2024-01-02 12:48:00,Day 0-6
4,2024-01-03 02:00:00,Day 0-6
5,2024-01-03 15:08:00,Day 0-6
6,2024-01-04 01:38:00,Day 0-6
7,2024-01-04 16:01:00,Day 0-6
8,2024-01-05 03:22:00,Day 0-6
9,2024-01-05 17:28:00,Day 0-6


## Component strategy

Retain only selected components (year, month, weekday) to encode coarse trends while dropping exact timestamps.


In [31]:
# Component strategy keeping year, month and weekday, default mode (ENRICH).
# NOTE(review): the recorded output for this cell reads like "2024-01-Monday",
# without the literal "M"/"W" markers from the template below - confirm that
# `strftime_output_format` is actually honored by the operation.
component_run = run_datetime_generalization(
    "component",
    keep_components=["year", "month", "weekday"],
    strftime_output_format="{year}-M{month}-W{weekday}",
)
# Show the source column next to the column the operation wrote to.
component_cols = [
    "event_timestamp",
    component_run["operation"].output_field_name,
]
component_run["preview_df"].loc[:, component_cols].head(12)


[component_20251108_182918] status=success; operations_logged=3


Unnamed: 0,event_timestamp,_event_timestamp
0,2024-01-01 03:27:00,2024-01-Monday
1,2024-01-01 17:12:00,2024-01-Monday
2,2024-01-02 04:20:00,2024-01-Tuesday
3,2024-01-02 12:48:00,2024-01-Tuesday
4,2024-01-03 02:00:00,2024-01-Wednesday
5,2024-01-03 15:08:00,2024-01-Wednesday
6,2024-01-04 01:38:00,2024-01-Thursday
7,2024-01-04 16:01:00,2024-01-Thursday
8,2024-01-05 03:22:00,2024-01-Friday
9,2024-01-05 17:28:00,2024-01-Friday
