In [None]:
from typing import Any

from phaseedge.storage import store

def delete_ckpts_with_cation_counts(
    *,
    wl_key: str | None = None,
    dry_run: bool = True,
    limit: int | None = None,
) -> dict[str, Any]:
    """
    Delete documents from `wang_landau_ckpt` that have a non-empty `cation_counts` array.

    The matching documents are first listed (sorted by wl_key, step_end) and a
    short preview is printed. When a deletion is requested, exactly the listed
    documents are removed, addressed by `_id`, so `limit` bounds the deletion
    as well as the preview and nothing is deleted that was not counted.

    Args:
        wl_key: if provided, restrict deletion to this WL chain key. An empty
            string is treated the same as None (no restriction).
        dry_run: if True, only list what would be deleted (no deletion).
        limit: if provided, cap the number of documents considered (sorted by
            wl_key, step_end). The value is passed to the cursor's `limit()`
            as-is; NOTE(review): PyMongo documents a limit of 0 as "no limit" —
            confirm that is acceptable for callers.

    Returns:
        A summary dict with 'matched' (number of documents listed), 'deleted'
        (number actually removed; 0 on a dry run) and 'sample_ids' (string
        form of up to the first 10 matched `_id` values).
    """
    coll = store.db_rw()["wang_landau_ckpt"]

    # NOTE(review): `$elemMatch: {}` is used here as the "non-empty array" test.
    # Confirm it matches the element type actually stored in `cation_counts`;
    # `{"cation_counts.0": {"$exists": True}}` is the type-agnostic alternative.
    criteria: dict[str, Any] = {"cation_counts": {"$elemMatch": {}}}
    if wl_key:
        criteria["wl_key"] = wl_key

    projection: dict[str, int] = {"_id": 1, "wl_key": 1, "step_end": 1}
    cursor = coll.find(criteria, projection).sort([("wl_key", 1), ("step_end", 1)])
    if limit is not None:
        cursor = cursor.limit(int(limit))

    # Keep the exact ids that were previewed; they are the deletion targets.
    matched_ids = [doc["_id"] for doc in cursor]
    sample_ids = [str(oid) for oid in matched_ids[:10]]
    matched = len(matched_ids)

    print(f"Matched {matched} document(s) with non-empty cation_counts"
          f"{' for wl_key='+wl_key if wl_key else ''}.")
    if matched:
        print("First few _id values:")
        for i, oid in enumerate(sample_ids, 1):
            print(f"{i:5d}  {oid}")

    if dry_run:
        print("Dry run: no deletions performed.")
        return {"matched": matched, "deleted": 0, "sample_ids": sample_ids}

    if matched == 0:
        # A real deletion was requested but nothing matched; do not call this
        # a dry run.
        print("No matching documents: nothing to delete.")
        return {"matched": matched, "deleted": 0, "sample_ids": sample_ids}

    # Perform deletion by _id rather than by re-running `criteria`: this honours
    # `limit` and cannot remove documents that started matching after the
    # preview query. Batches keep each `$in` filter to a bounded size.
    batch_size = 10_000
    deleted = 0
    for start in range(0, matched, batch_size):
        batch = matched_ids[start:start + batch_size]
        result = coll.delete_many({"_id": {"$in": batch}})
        deleted += int(result.deleted_count)

    print(f"Deleted {deleted} document(s).")
    return {"matched": matched, "deleted": deleted, "sample_ids": sample_ids}

# EXAMPLES:
# Only example 1 is active. Examples 2 and 3 are left commented out because
# they pass dry_run=False and therefore perform real deletions.
#
# 1) Preview everything that would be deleted (no deletion). The call is a bare
#    expression, so the summary dict it returns is also this cell's result:
delete_ckpts_with_cation_counts(dry_run=True)

# 2) Actually delete across ALL wl_key chains (destructive):
# delete_ckpts_with_cation_counts(dry_run=False)

# 3) Only delete within a specific chain (destructive):
# delete_ckpts_with_cation_counts(wl_key="my_wl_chain_key", dry_run=False)


Matched 0 document(s) with non-empty cation_counts.
Dry run: no deletions performed.
Matched 0 document(s) with non-empty cation_counts.
Dry run: no deletions performed.


{'matched': 0, 'deleted': 0, 'sample_ids': []}