## Deleak a given set of systems
> In this example we deleak the PoseBusters benchmark set.

In [16]:
from pathlib import Path
import pandas as pd
from dataclasses import dataclass, field
from plinder.data.splits import deleak_specific_test_systems, get_default_config
from plinder.data.common.constants import BASE_DIR
from cloudpathlib import GSPath

# "examples" folder, located three directory levels above BASE_DIR; the
# benchmark id list read further down lives under its "data" subfolder.
EXAMPLE_FOLDER = BASE_DIR.parent.parent.parent / "examples"
# Alternative (disabled): use a local copy of the data instead of the bucket.
# data_dir = Path("/Users/yusuf/plinder_local_data/2024-06/v2")
# Data root passed to the deleaking call below: a Google Cloud Storage path.
DATA_DIR = GSPath("gs://plinder/2024-06/v2")

@dataclass
class TestCriteria:
    """Threshold values used as the proto-test criteria of a ``SplitConfig``.

    Every field has a default, so ``TestCriteria()`` yields the full default
    set. The fields fall into three groups by name prefix: ``max_entry_*``,
    ``ligand_*`` and a parallel ``pocket_*`` set. Field order defines the
    positional signature of the generated ``__init__``, so keep it stable.

    NOTE(review): the units and exact meaning of each threshold are not
    visible in this file -- confirm against the code that consumes them.
    """

    # --- entry-level limits ---
    max_entry_resolution: float = 3.5
    max_entry_r: float = 0.4
    max_entry_rfree: float = 0.45
    max_entry_r_minus_rfree: float = 0.05
    # --- ligand-level limits ---
    ligand_max_num_unresolved_heavy_atoms: int = 0
    ligand_max_alt_count: int = 1
    ligand_min_average_occupancy: float = 0.8
    ligand_min_average_rscc: float = 0.8
    ligand_max_average_rsr: float = 0.3
    # declared float, default written as the int literal 0
    ligand_max_percent_outliers_clashes: float = 0
    # --- pocket-level limits (same set of names as the ligand group) ---
    pocket_max_num_unresolved_heavy_atoms: int = 0
    pocket_max_alt_count: int = 1
    pocket_min_average_occupancy: float = 0.8
    pocket_min_average_rscc: float = 0.8
    pocket_max_average_rsr: float = 0.3
    # declared int here, whereas the ligand counterpart is declared float
    pocket_max_percent_outliers_clashes: int = 100


@dataclass
class GraphConfig:
    """One leakage-graph setting: a metric name, an edge threshold and a depth.

    All three fields are required and are accepted positionally in the order
    ``(metric, threshold, depth)``.
    """

    # name of the similarity metric, e.g. "pli_qcov" or "pocket_qcov"
    metric: str
    threshold: int  # edges above this threshold are kept
    depth: int  # neighbors at this depth are counted as leakage

@dataclass
class SplitConfig:
    """Configuration values for a split; every field has a default.

    The fields come in two parallel groups, ``test_*`` and ``val_*``, preceded
    by the proto-test criteria and the list of graph settings. Field order
    defines the positional signature of the generated ``__init__``, so keep
    it stable.
    """

    # fresh TestCriteria instance per SplitConfig
    proto_test_criteria: TestCriteria = field(default_factory=TestCriteria)
    # fresh list per SplitConfig, so instances never share the same list object
    graph_configs: list[GraphConfig] = field(
        default_factory=lambda: [
            GraphConfig(metric="pli_qcov", threshold=30, depth=1),
            GraphConfig(metric="pocket_qcov", threshold=50, depth=1),
        ]
    )

    # ---- test set ----
    test_cluster_cluster: str = "communities"
    # metric used when sampling representatives from each component
    test_cluster_metric: str = "pli_qcov"
    # threshold used when sampling representatives from each component
    test_cluster_threshold: int = 50
    # "directed" setting used when sampling representatives from each component
    test_cluster_directed: bool = False
    cluster_column: str = "cluster"
    # upper bound on representatives taken from each community
    num_test_representatives: int = 3
    # test systems should not be singletons
    min_test_cluster_size: int = 5
    # test/val should not sit in overly large communities, nor cause too many
    # train cases to be removed
    max_test_leakage_count: int = 300
    mms_unique_quality_count: int = 3
    test_fraction: float = 0.01

    # ---- validation set ----
    val_cluster_cluster: str = "components"
    # metric used for splitting train and val
    val_cluster_metric: str = "pocket_qcov"
    # threshold used for splitting train and val
    val_cluster_threshold: int = 50
    # "directed" setting used for splitting train and val
    val_cluster_directed: bool = False
    val_fraction: float = 0.01
    # val systems should not be singletons
    min_val_cluster_size: int = 5
    # upper bound on val representatives taken from each community
    num_val_representatives: int = 3

# Start from the library's default configuration.
cfg = get_default_config()

# The id file is read without a header row; column 0 holds the ids.
posebusters_pdbid_ccd_code = pd.read_csv(
    EXAMPLE_FOLDER / "data/posebusters_benchmark_set_ids.txt",
    header=None,
)[0].to_list()

# NOTE(review): "posebusters_test" looks like the label given to this test
# set, and id_is_pdb_ccd_codes=True presumably marks the ids as
# PDB-id + CCD-code pairs rather than system ids -- confirm against the
# deleak_specific_test_systems signature.
deleaked_posebusters = deleak_specific_test_systems(
    DATA_DIR,
    cfg,
    posebusters_pdbid_ccd_code,
    "posebusters_test",
    id_is_pdb_ccd_codes=True,
)


In [19]:
deleaked_posebusters

Unnamed: 0,system_id,uniqueness,split,cluster,cluster_for_val_split
0,3grt__1__1.A__1.B,3grt__A__B_c239137,train,c45,c0
388467,3t60__1__1.C__1.H,3t60__C__H_c13371,train,c154,c82
388466,3t60__1__1.B_1.C__1.F,3t60__B_C__F_c365465,train,c154,c82
388465,3t60__1__1.A__1.D,3t60__A__D_c13371,train,c154,c82
388464,1t6z__1__1.B__1.D,1t6z__B__D_c118538,train,c1412,c1100
...,...,...,...,...,...
490041,5nff__9__1.I__1.KA,5nff__I__KA_c82803,train,c447,c0
590871,8gk3__9__1.I__1.OA_1.PA_1.QA,8gk3__I__OA_PA_QA_c109707,removed,c9,c0
314020,4kxf__9__3.H__3.R,4kxf__H__R_c113350,train,c663,c488
314018,4kxf__9__2.A__2.I,4kxf__A__I_c113351,train,c663,c488
