# 02 · Test Cleaning Strategies

Use this notebook to prototype different `DataCleanerConfig` combinations before wiring them into automated jobs. Each run reuses the production cleaner so results mirror what would happen in `scripts/data-cleaner.py`.


## Notebook goals

- Quickly compare multiple cleaning strategies without editing production code
- Capture row-retention and missing-value metrics per strategy
- Surface trade-offs (strict vs. relaxed) for stakeholders
- Provide reusable helper functions for future experiments


In [None]:
from __future__ import annotations

from pathlib import Path
import importlib.util
import sys
from dataclasses import dataclass

import dask.dataframe as dd
import matplotlib.pyplot as plt
from IPython.display import display
import pandas as pd
import seaborn as sns

# Plot styling and pandas display settings for the whole notebook.
# Order matters here: the seaborn theme is applied after the matplotlib style.
plt.style.use("seaborn-v0_8")
sns.set_theme(style="whitegrid")
pd.set_option("display.max_columns", 50)

# The project root is taken to be two levels above the current working
# directory, so the kernel must be started from a folder nested two deep under
# the project root for the paths below to resolve.
PROJECT_ROOT = Path.cwd().resolve().parents[1]
DATA_PATH = PROJECT_ROOT / "data" / "national_water_plan.csv"
SCRIPTS_DIR = PROJECT_ROOT / "scripts"


def load_module(module_name: str, file_path: Path):
    """Import a Python source file by path and register it under ``module_name``.

    The project scripts have hyphenated file names (e.g. ``data-loader.py``)
    that a regular ``import`` statement cannot reference, so they are loaded
    through ``importlib`` instead. A module that is already registered in
    ``sys.modules`` is returned as-is, so re-running the cell does not
    re-execute the script.

    Args:
        module_name: Name to register the module under in ``sys.modules``.
        file_path: Location of the source file to execute.

    Returns:
        The loaded (or previously cached) module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``file_path``.
        Exception: Whatever the script itself raises while executing; in that
            case the module is removed from ``sys.modules`` again.
    """
    if module_name in sys.modules:
        return sys.modules[module_name]

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {file_path}")

    module = importlib.util.module_from_spec(spec)
    # Register before executing so code that looks itself up in sys.modules
    # while being imported can find the module.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind: otherwise the next
        # call would hit the cache check above and return the broken module.
        sys.modules.pop(module_name, None)
        raise
    return module


# The script file names contain hyphens, so they cannot be imported with a
# regular `import` statement; load them by path and alias the classes used
# throughout the notebook.
data_loader = load_module("project_data_loader", SCRIPTS_DIR / "data-loader.py")
DataLoader = data_loader.DataLoader
DataConfig = data_loader.DataConfig

data_cleaner = load_module("project_data_cleaner", SCRIPTS_DIR / "data-cleaner.py")
DataCleanerConfig = data_cleaner.DataCleanerConfig
WaterDataCleaner = data_cleaner.WaterDataCleaner


## 1. Load data & build a working sample

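The sample keeps experiments fast while still flowing through the exact same cleaning methods. Note that it is simply the first `SAMPLE_ROWS` rows of the loaded data rather than a random draw, so treat the resulting metrics as indicative rather than representative of the full dataset.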


In [None]:
# Load the raw dataset through the project loader, which returns the data
# together with an exploration report.
data_config = DataConfig(filepath=str(DATA_PATH))
loader = DataLoader(data_config)
raw_ddf, exploration_report = loader.load_and_explore_data()

# Materialise the first SAMPLE_ROWS rows as the in-memory working sample.
# NOTE(review): if raw_ddf is a Dask DataFrame, head() only reads the first
# partition by default and may return fewer than SAMPLE_ROWS rows — compare the
# printed sample size against SAMPLE_ROWS (npartitions=-1 would scan them all).
SAMPLE_ROWS = 20_000
sample_pdf = raw_ddf.head(SAMPLE_ROWS, compute=True)
print(f"Exploration rows: {exploration_report.metadata.rows:,}")
print(f"Sample size: {len(sample_pdf):,}")


In [None]:
# Alias the report type from the cleaner module so it can be used in annotations.
CleaningReport = data_cleaner.CleaningReport

@dataclass
class StrategyResult:
    """Everything produced by one cleaning run, kept together for comparison."""

    # Short identifier of the strategy (used as the label in tables and plots).
    name: str
    # Human-readable explanation of what the strategy changes.
    description: str
    # The exact configuration the cleaner was run with.
    config: DataCleanerConfig
    # Report returned by the cleaner for this run.
    report: CleaningReport
    # Cleaned data returned by the cleaner for this run.
    cleaned_ddf: "dd.DataFrame"


def run_cleaning_strategy(name: str, description: str, overrides: dict) -> StrategyResult:
    """Clean a fresh copy of the working sample under one configuration.

    The notebook's baseline settings are combined with ``overrides`` (which
    win on conflict), a cleaner is built from the resulting config, and it is
    run against a copy of ``sample_pdf`` so every strategy starts from the
    same untouched input.

    Args:
        name: Short identifier stored on the result.
        description: Human-readable summary stored on the result.
        overrides: ``DataCleanerConfig`` keyword arguments that replace or
            extend the baseline settings.

    Returns:
        A ``StrategyResult`` bundling the config, the cleaning report and the
        cleaned data.
    """
    baseline = {
        "strict_mode": True,
        "remove_duplicates": True,
        "remove_outliers": True,
        "remove_invalid_spill_years": True,
        "remove_invalid_text_values": True,
        "create_backup": False,
        "save_cleaning_report": False,
        "output_directory": "/tmp",
    }
    # Later entries win, so overrides take precedence over the baseline.
    config = DataCleanerConfig(**{**baseline, **overrides})
    cleaned_ddf, report = WaterDataCleaner(config).clean_data(sample_pdf.copy(), output_dir=None)
    return StrategyResult(
        name=name,
        description=description,
        config=config,
        report=report,
        cleaned_ddf=cleaned_ddf,
    )


## 2. Define strategy grid

Tweak just a handful of parameters per strategy so we can reason about the outcome of each change.


In [None]:
# Each entry names a strategy, explains it, and lists the DataCleanerConfig
# keyword overrides that are layered on top of the baseline settings defined in
# run_cleaning_strategy.
strategies = [
    # Baseline: no overrides, so the baseline settings apply unchanged.
    {
        "name": "strict_default",
        "description": "Baseline configuration with strict column requirements and duplicate removal.",
        "overrides": {},
    },
    # Switches off text-value removal and sets a missing-value threshold.
    {
        "name": "relaxed_text",
        "description": "Allow rows that miss optional text fields while keeping numeric validation strict.",
        "overrides": {
            "remove_invalid_text_values": False,
            "missing_value_threshold": 0.4,
        },
    },
    # Fills missing values with 0 and switches off spill-year removal.
    # NOTE(review): confirm the cleaner still honours min_valid_spill_years
    # when remove_invalid_spill_years is False.
    {
        "name": "spill_imputation",
        "description": "Fill missing spill values with zeroes and keep rows even if only two years are available.",
        "overrides": {
            "fill_missing_values": True,
            "fill_value": 0,
            "remove_invalid_spill_years": False,
            "min_valid_spill_years": 2,
        },
    },
    # Tighter outlier threshold plus a higher minimum of valid spill years.
    {
        "name": "aggressive_outliers",
        "description": "Tighten outlier filtering to 2σ and demand four valid spill years.",
        "overrides": {
            "outlier_std_threshold": 2.0,
            "min_valid_spill_years": 4,
        },
    },
]

# Echo the grid as the cell output.
strategies


In [None]:
# Run every strategy in the grid, collecting the results in definition order
# and printing a one-line progress message per strategy.
results: list[StrategyResult] = []
for strat in strategies:
    result = run_cleaning_strategy(
        name=strat["name"],
        description=strat["description"],
        overrides=strat["overrides"],
    )
    retained_rows = result.report.cleaned_shape[0]
    print(f"✓ {strat['name']} completed — rows retained: {retained_rows:,}")
    results.append(result)


## 3. Compare outcomes

Aggregate the metrics that matter most (row retention, missing %, duplicates removed) and visualize trade-offs.


In [None]:
# Flatten each strategy's report into one row of headline metrics.
summary_records = []
for result in results:
    # A report without quality metrics yields empty (None) metric columns
    # instead of failing on `.get`.
    metrics = result.report.quality_metrics or {}
    summary_records.append(
        {
            "strategy": result.name,
            "description": result.description,
            "rows_in": result.report.original_shape[0],
            "rows_out": result.report.cleaned_shape[0],
            "rows_retained_pct": metrics.get("rows_retained_percent"),
            "initial_missing_pct": metrics.get("initial_missing_percent"),
            "final_missing_pct": metrics.get("final_missing_percent"),
            "duplicates_removed": metrics.get("duplicate_reduction"),
        }
    )

# One row per strategy, in the order the strategies were run.
summary_df = pd.DataFrame(summary_records)
summary_df


In [None]:
# Side-by-side bar charts: share of rows kept and share of values still missing.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.barplot(data=summary_df, x="strategy", y="rows_retained_pct", ax=axes[0], color="#3182bd")
axes[0].set_title("Row retention %")
axes[0].set_ylim(0, 105)  # a little headroom above 100 %

sns.barplot(data=summary_df, x="strategy", y="final_missing_pct", ax=axes[1], color="#e6550d")
axes[1].set_title("Final missing %")
# Leave 20 % headroom above the tallest bar. If the maximum is 0 or missing, the
# scaled limit would be 0 or NaN (identical or invalid axis limits), so fall
# back to a small fixed range instead.
max_missing = pd.to_numeric(summary_df["final_missing_pct"], errors="coerce").max()
upper_limit = max_missing * 1.2 if pd.notna(max_missing) and max_missing > 0 else 1.0
axes[1].set_ylim(0, upper_limit)

for ax in axes:
    ax.set_xlabel("Strategy")
    ax.tick_params(axis="x", rotation=30)

plt.tight_layout()
plt.show()


### Removal breakdown per strategy

Inspect which rules are driving the drop in rows to tune thresholds precisely.


In [None]:
# Long-format records: one row per (strategy, removal reason) pair. A missing
# breakdown is treated as "nothing removed".
removal_records = [
    {
        "strategy": result.name,
        "reason": reason,
        "rows_removed": count,
    }
    for result in results
    for reason, count in (result.report.removal_breakdown or {}).items()
]

if not removal_records:
    # No strategy reported any removals.
    breakdown_df = pd.DataFrame()
else:
    # Reasons as rows, strategies as columns.
    wide_df = pd.DataFrame(removal_records).pivot_table(
        index="reason", columns="strategy", values="rows_removed", fill_value=0
    )
    # Use the summary table's strategy order; strategies without any recorded
    # removals become all-zero columns.
    wide_df = wide_df.reindex(columns=summary_df["strategy"], fill_value=0)
    breakdown_df = wide_df.sort_index(ascending=False)

breakdown_df


### Peek at cleaned sample for a chosen strategy

Use this helper to inspect the first `n` rows of the cleaned Dask DataFrame for a given strategy (converted to pandas for convenience).


In [None]:
def preview_strategy(strategy_name: str, n: int = 5):
    """Display the first ``n`` cleaned rows of an already-executed strategy.

    Args:
        strategy_name: ``name`` of one of the entries in ``results``.
        n: Number of rows to show.

    Raises:
        ValueError: If no executed strategy has the given name.
    """
    match = next((r for r in results if r.name == strategy_name), None)
    if match is None:
        raise ValueError(f"Strategy '{strategy_name}' not found. Choose from {[r.name for r in results]}")
    # By default Dask's head() only reads the first partition, which can hold
    # fewer than n rows once cleaning has dropped data; npartitions=-1 lets it
    # draw from all partitions so n rows are shown whenever they exist.
    preview_df = match.cleaned_ddf.head(n, npartitions=-1, compute=True)
    display(preview_df)


preview_strategy("strict_default")


## Takeaways

- Use the summary table to document how each config impacts retention vs. quality.
- Feed promising overrides back into `DataCleanerConfig` defaults or environment-specific settings.
- Commit strategy descriptions so future analysts understand _why_ thresholds changed.
