In [None]:
from phaseedge.storage.store import get_jobstore
from phaseedge.jobs.store_ce_model import lookup_ce_by_key
from phaseedge.storage.cetrainref_dataset import Dataset

def delete_phaseedge_entries(ce_key: str, dry_run: bool = True) -> None:
    """
    Remove the job-store documents tied to one cluster-expansion model.

    The targets are the CEModelDoc stored under ``ce_key``, the
    CETrainRef_dataset document named by that model's ``dataset_key``, and
    the relaxation-task documents whose metadata matches each training
    reference of that dataset.  Match counts are always printed first; with
    dry_run=True the function stops after that report.

    Parameters
    ----------
    ce_key : str
        Cluster-expansion key whose documents should be removed.
    dry_run : bool
        If True (default), only report counts.  If False, perform deletions.

    Raises
    ------
    ValueError
        If no CEModelDoc is found for ``ce_key``.
    """
    store = get_jobstore()

    # The dataset key is not passed in; it is read from the CE model document.
    model_doc = lookup_ce_by_key(ce_key)
    if model_doc is None:
        raise ValueError(f"No CEModelDoc found for ce_key={ce_key!r}.")

    dataset_key = model_doc["dataset_key"]
    dataset = Dataset.from_key(dataset_key)

    # One relaxation-task query per distinct (occ_key, calc settings)
    # combination.  A dict keyed by that combination drops repeats while
    # keeping first-seen order.
    relax_queries: dict[tuple, dict] = {}
    for ref in dataset.train_refs:
        spec = ref.calc_spec
        signature = (
            ref.occ_key,
            spec.calculator,
            spec.relax_type.value,
            spec.spin_type.value,
            spec.frozen_sublattices,
        )
        if signature in relax_queries:
            continue
        occ_key, calculator, relax_type, spin_type, frozen = signature
        relax_queries[signature] = {
            "metadata.occ_key": occ_key,
            "metadata.calculator": calculator,
            "metadata.relax_type": relax_type,
            "metadata.spin_type": spin_type,
            "metadata.frozen_sublattices": frozen,
        }

    # (label, query) pairs: the model doc, the dataset doc, then every
    # relaxation-task query.
    criteria_info: list[tuple[str, dict]] = [
        ("CEModelDoc", {"output.kind": "CEModelDoc", "output.ce_key": ce_key}),
        ("CETrainRef_dataset", {"output.kind": "CETrainRef_dataset", "output.dataset_key": dataset_key}),
    ]
    criteria_info.extend(("relax_task", query) for query in relax_queries.values())

    # Always show what each query matches before touching anything.
    for label, criteria in criteria_info:
        matched = store.count(criteria)
        print(f"{label} -> {matched} document(s) match criteria {criteria}")

    if dry_run:
        print("Dry run only; no deletions performed.")
        return

    # Delete query by query, re-counting afterwards to report what is left.
    for label, criteria in criteria_info:
        before = store.count(criteria)
        if not before:
            print(f"{label}: nothing to delete.")
            continue
        store.docs_store.remove_docs(criteria)
        after = store.count(criteria)
        print(f"{label}: deleted {before - after}, remaining {after}")

    print("Deletion complete.")

# Dry-run preview for this ce_key: with dry_run=True the function only reports
# match counts and deletes nothing.
delete_phaseedge_entries(ce_key="cc4deb3c2ff88cfef40a9a8bb7954c1239dfb5310d4a00c0cdc9b5c556c8527d", dry_run=True)

ValueError: No CEModelDoc found for ce_key='cc4deb3c2ff88cfef40a9a8bb7954c1239dfb5310d4a00c0cdc9b5c556c8527d'.

In [None]:
from typing import Any
from phaseedge.storage.store import get_jobstore
from phaseedge.schemas.calc_spec import CalcSpec


def debug_energy_lookup(
    *,
    occ_key: str,
    calc_spec: CalcSpec,
    limit: int = 5,
) -> None:
    """
    Inspect why lookup_total_energy_eV is (not) finding an existing relaxation.

    Prints how many job-store documents match the lookup criteria built from
    ``occ_key`` and ``calc_spec``, then prints selected metadata fields for
    the matching documents.

    Parameters
    ----------
    occ_key : str
        Value matched against ``metadata.occ_key``.
    calc_spec : CalcSpec
        Supplies the calculator, relax_type, spin_type and frozen_sublattices
        values used in the query.
    limit : int
        Maximum number of matching documents to print (default 5).
    """
    js = get_jobstore()

    def _spin_value(spin: Any) -> Any:
        # Handle Enum[str] vs plain str
        return getattr(spin, "value", spin)

    # Full criteria (what lookup_total_energy_eV is using now)
    full_criteria: dict[str, Any] = {
        "metadata.occ_key": occ_key,
        "metadata.calculator": calc_spec.calculator,
        "metadata.relax_type": calc_spec.relax_type.value,
        "metadata.spin_type": _spin_value(calc_spec.spin_type),
        # Only the presence of a force threshold is required here, not a
        # specific value.
        "metadata.max_force_eV_per_A": {"$exists": True},
        "metadata.frozen_sublattices": calc_spec.frozen_sublattices,
        "output.output.energy": {"$exists": True},
    }

    print("Counts:")
    print(f"  full (with spin + max_force): {js.count(full_criteria)}")

    # This header must be an f-string; without the prefix it printed the
    # literal text "{limit}".
    print(f"\nDocs matching core criteria (up to {limit}):")
    for i, doc in enumerate(
        js.docs_store.query(
            criteria=full_criteria,
            # Only _id and metadata are printed below, so the "output"
            # sub-document is not fetched.
            properties={"_id": 1, "metadata": 1},
        )
    ):
        if i >= limit:
            break
        md = doc.get("metadata", {})
        print(
            f"  _id={doc.get('_id')} | "
            f"occ_key={str(md.get('occ_key'))[:12]}... | "
            f"calculator={md.get('calculator')} | "
            f"relax_type={md.get('relax_type')} | "
            f"spin_type={md.get('spin_type', '<missing>')} | "
            f"maxF={md.get('max_force_eV_per_A', '<missing>')} | "
            f"frozen_sublattices={md.get('frozen_sublattices')!r} | "
        )


from phaseedge.schemas.calc_spec import RelaxType, SpinType

# Calculation settings to probe; debug_energy_lookup uses the calculator,
# relax_type, spin_type and frozen_sublattices fields in its query.
calc_spec = CalcSpec(
    calculator="MACE-MPA-0",
    relax_type=RelaxType.FULL,
    frozen_sublattices="",
    max_force_eV_per_A=0.1,
    spin_type=SpinType.NONMAGNETIC,
)

# Print the match count and matching documents' metadata for one occ_key
# under the settings above.
debug_energy_lookup(
    occ_key="28cd7f89f6fdfbcf382a6d01c6b11582c1f22f334c194049f3c8d02830f3df31",
    calc_spec=calc_spec,
)

Counts:
  full (with spin + max_force): 1

Docs matching core criteria (up to {limit}):
  _id=690c2f5c53e6a4f48f43530a | occ_key=28cd7f89f6fd... | calculator=vasp-mp-24 | relax_type=full | spin_type=nonmagnetic | maxF=0.02 | frozen_sublattices='' | energy={'@module': 'pymatgen.core.structure', '@class': 'Structure', 'charge': 0.0, 'lattice': {'matrix': [[0.0, 8.076881686140267, 8.076881686140267], [8.076881686140267, -0.0, 8.076881686140267], [8.076881686140267, 8.076881686140267, -0.0]], 'pbc': [True, True, True], 'a': 11.422435622222437, 'b': 11.422435622222437, 'c': 11.422435622222437, 'alpha': 59.99999999999999, 'beta': 59.99999999999999, 'gamma': 59.99999999999999, 'volume': 1053.80719443729}, 'properties': {}, 'sites': [{'species': [{'element': 'Al', 'occu': 1}], 'abc': [0.0625, 0.3125, 0.3125], 'properties': {}, 'label': 'Al', 'xyz': [5.048051053837667, 3.0288306323025997, 3.0288306323025997]}, {'species': [{'element': 'Al', 'occu': 1}], 'abc': [0.0625, 0.3125, 0.8125], 'propert

In [8]:
from typing import Any

from phaseedge.storage.store import get_jobstore
from phaseedge.schemas.calc_spec import CalcSpec, CalcType, RelaxType, SpinType


def delete_duplicate_relax_docs_for_calc_spec(
    *,
    calc_spec: CalcSpec,
    dry_run: bool = True,
) -> int:
    """
    De-duplicate relaxation documents that match a CalcSpec.

    All documents matching the spec are grouped by ``metadata.occ_key``.
    Within each group the first document returned by the query is kept and
    every other one is marked for deletion.  Documents without an occ_key are
    ignored.  A one-line summary is always printed.

    Parameters
    ----------
    calc_spec : CalcSpec
        Calculation specification whose matching relaxation docs should be
        de-duplicated.
    dry_run : bool
        If True (default), only report how many docs would be deleted. If
        False, perform the deletions.

    Returns
    -------
    int
        Number of documents that would be / were deleted (i.e., the number of
        duplicates beyond one per occ_key).
    """
    store = get_jobstore()

    # spin_type may be an Enum or already a plain string.
    spin = getattr(calc_spec.spin_type, "value", calc_spec.spin_type)

    # Same fields as the energy lookup, minus occ_key (the grouping key) and
    # with the force threshold pinned to the spec's exact value.
    query: dict[str, Any] = {
        "metadata.calculator": calc_spec.calculator,
        "metadata.relax_type": calc_spec.relax_type.value,
        "metadata.spin_type": spin,
        "metadata.frozen_sublattices": calc_spec.frozen_sublattices,
        "metadata.max_force_eV_per_A": calc_spec.max_force_eV_per_A,
        "output.output.energy": {"$exists": True},
    }
    if calc_spec.calculator_info["calc_type"] == CalcType.MACE_MPA_0:
        # For this calculator type, only force-converged runs are considered.
        query["output.is_force_converged"] = True

    # occ_key -> _ids of every matching document, in query order.
    ids_by_occ: dict[str, list[Any]] = {}
    matches = store.docs_store.query(
        criteria=query,
        properties={"_id": 1, "metadata": 1},
    )
    for record in matches:
        occ = record.get("metadata", {}).get("occ_key")
        if occ is not None:
            ids_by_occ.setdefault(str(occ), []).append(record["_id"])

    group_sizes = [len(ids) for ids in ids_by_occ.values()]
    total_docs = sum(group_sizes)
    num_with_dupes = sum(size > 1 for size in group_sizes)

    # Everything after the first _id of each group is surplus.
    surplus_ids: list[Any] = [
        doc_id for ids in ids_by_occ.values() for doc_id in ids[1:]
    ]
    num_to_delete = len(surplus_ids)

    print(
        f"[duplicate cleanup] calc_spec={calc_spec} -> "
        f"{total_docs} matching docs, "
        f"{num_with_dupes} occ_key(s) with duplicates, "
        f"{num_to_delete} doc(s) marked for deletion."
    )

    if dry_run:
        print("[duplicate cleanup] Dry run only; no deletions performed.")
        return num_to_delete

    if num_to_delete == 0:
        print("[duplicate cleanup] Nothing to delete.")
        return 0

    # Single removal call covering every surplus document.
    store.docs_store.remove_docs({"_id": {"$in": surplus_ids}})
    print(f"[duplicate cleanup] Deleted {num_to_delete} duplicate document(s).")
    return num_to_delete


# Settings whose matching relaxation docs are checked for per-occ_key
# duplicates.
calc_spec = CalcSpec(
    calculator=CalcType.MACE_MPA_0,
    relax_type=RelaxType.FULL,
    frozen_sublattices="",
    max_force_eV_per_A=0.02,
    spin_type=SpinType.NONMAGNETIC,
)

# Dry run: just see how many duplicates exist for this CalcSpec; nothing is
# deleted and the returned value is the number of docs that would be removed.
n_dupes = delete_duplicate_relax_docs_for_calc_spec(calc_spec=calc_spec, dry_run=True)

# Actual deletion (left commented out so it is only run deliberately):
# n_deleted = delete_duplicate_relax_docs_for_calc_spec(calc_spec=calc_spec, dry_run=False)


[duplicate cleanup] calc_spec=CalcSpec(calculator=<CalcType.MACE_MPA_0: 'MACE-MPA-0'>, relax_type=<RelaxType.FULL: 'full'>, spin_type=<SpinType.NONMAGNETIC: 'nonmagnetic'>, max_force_eV_per_A=0.02, frozen_sublattices='') -> 11830 matching docs, 0 occ_key(s) with duplicates, 0 doc(s) marked for deletion.
[duplicate cleanup] Dry run only; no deletions performed.
