From 0a00d11c5a70d5c46b2c29cc93b511490b2f8611 Mon Sep 17 00:00:00 2001 From: sfluegel Date: Thu, 31 Jul 2025 18:20:17 +0200 Subject: [PATCH 1/4] use ratio parameter to add external data --- chebai/preprocessing/datasets/chebi.py | 60 ++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index 1df144d9..ed658180 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -702,18 +702,24 @@ class ChEBIOverXPartial(ChEBIOverX): top_class_id (int): The ID of the top class from which to extract subclasses. """ - def __init__(self, top_class_id: int, **kwargs): + def __init__(self, top_class_id: int, external_data_ratio: float, **kwargs): """ Initializes the ChEBIOverXPartial dataset. Args: top_class_id (int): The ID of the top class from which to extract subclasses. **kwargs: Additional keyword arguments passed to the superclass initializer. + external_data_ratio (float): How much external data (i.e., samples where top_class_id + is no positive label) to include in the dataset. 0 means no external data, 1 means + the maximum amount (i.e., the complete ChEBI dataset). """ if "top_class_id" not in kwargs: kwargs["top_class_id"] = top_class_id + if "external_data_ratio" not in kwargs: + kwargs["external_data_ratio"] = external_data_ratio self.top_class_id: int = top_class_id + self.external_data_ratio: float = external_data_ratio super().__init__(**kwargs) @property @@ -727,7 +733,7 @@ def processed_dir_main(self) -> str: return os.path.join( self.base_dir, self._name, - f"partial_{self.top_class_id}", + f"partial_{self.top_class_id}_ext_ratio_{self.external_data_ratio:.2f}", "processed", ) @@ -746,9 +752,53 @@ def _extract_class_hierarchy(self, chebi_path: str) -> nx.DiGraph: descendants of the top class ID. """ g = super()._extract_class_hierarchy(chebi_path) - g = g.subgraph(list(g.successors(self.top_class_id)) + [self.top_class_id]) + top_class_successors = list(g.successors(self.top_class_id)) + [ + self.top_class_id + ] + external_nodes = list(set(n for n in g.nodes if n not in top_class_successors)) + if 0 < self.external_data_ratio < 1: + n_external_nodes = int( + len(top_class_successors) + * self.external_data_ratio + / (1 - self.external_data_ratio) + ) + print( + f"Extracting {n_external_nodes} external nodes from the ChEBI dataset (ratio: {self.external_data_ratio:.2f})" + ) + external_nodes = external_nodes[: int(n_external_nodes)] + elif self.external_data_ratio == 0: + external_nodes = [] + + g = g.subgraph(top_class_successors + external_nodes) + print( + f"Subgraph contains {len(g.nodes)} nodes, of which {len(top_class_successors)} are subclasses of the top class ID {self.top_class_id}." + ) return g + def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List: + """Only selects classes that meet the threshold AND are subclasses of the top class ID (including itself).""" + smiles = nx.get_node_attributes(g, "smiles") + nodes = list( + sorted( + { + node + for node in g.nodes + if sum( + 1 if smiles[s] is not None else 0 for s in g.successors(node) + ) + >= self.THRESHOLD + and ( + self.top_class_id in g.predecessors(node) + or node == self.top_class_id + ) + } + ) + ) + filename = "classes.txt" + with open(os.path.join(self.processed_dir_main, filename), "wt") as fout: + fout.writelines(str(node) + "\n" for node in nodes) + return nodes + class ChEBIOver50Partial(ChEBIOverXPartial, ChEBIOver50): """ @@ -1473,3 +1523,7 @@ def term_callback(doc: fastobo.term.TermFrame) -> Union[Dict, bool]: ] JCI_500_COLUMNS_INT = [int(n.split(":")[-1]) for n in JCI_500_COLUMNS] + +if __name__ == "__main__": + # get arguments from command line + pass From f7c4eb719db63a4dfa54c99186f6c25619f328fe Mon Sep 17 00:00:00 2001 From: sfluegel Date: Fri, 1 Aug 2025 10:20:51 +0200 Subject: [PATCH 2/4] remove non-positive filter (allow samples with no positive labels) --- chebai/preprocessing/datasets/chebi.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index ed658180..9fa1c1c7 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -301,9 +301,7 @@ def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame: data = pd.DataFrame(data) data = data[~data["SMILES"].isnull()] data = data[[name not in CHEBI_BLACKLIST for name, _ in data.iterrows()]] - # This filters the DataFrame to include only the rows where at least one value in the row from 4th column - # onwards is True/non-zero. - data = data[data.iloc[:, self._LABELS_START_IDX :].any(axis=1)] + return data # ------------------------------ Phase: Setup data ----------------------------------- @@ -892,7 +890,7 @@ def term_callback(doc: fastobo.term.TermFrame) -> Union[Dict, bool]: atom_index = ( - "\*", + r"\*", "H", "He", "Li", @@ -1525,5 +1523,13 @@ def term_callback(doc: fastobo.term.TermFrame) -> Union[Dict, bool]: JCI_500_COLUMNS_INT = [int(n.split(":")[-1]) for n in JCI_500_COLUMNS] if __name__ == "__main__": - # get arguments from command line - pass + data_module_05 = ChEBIOver50Partial( + chebi_version=241, + splits_file_path=os.path.join( + "data", "chebi_v241", "ChEBI50", "splits_80_10_10.csv" + ), + top_class_id=22712, + external_data_ratio=0.5, + ) + data_module_05.prepare_data() + data_module_05.setup() From 3702b5a824178a95df20b8bfda1a817a75792017 Mon Sep 17 00:00:00 2001 From: sfluegel Date: Fri, 1 Aug 2025 14:24:46 +0200 Subject: [PATCH 3/4] fix unit tests --- tests/unit/dataset_classes/testChebiOverXPartial.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py index 76584ebf..232d97da 100644 --- a/tests/unit/dataset_classes/testChebiOverXPartial.py +++ b/tests/unit/dataset_classes/testChebiOverXPartial.py @@ -16,7 +16,9 @@ def setUpClass(cls, mock_makedirs) -> None: """ Set up the ChEBIOverXPartial instance with a mock processed directory path and a test graph. """ - cls.chebi_extractor = ChEBIOverXPartial(top_class_id=11111, chebi_version=231) + cls.chebi_extractor = ChEBIOverXPartial( + top_class_id=11111, external_data_ratio=0.5, chebi_version=231 + ) cls.test_graph = ChebiMockOntology.get_transitively_closed_graph() @patch( From a47a6752645c6bc80634e1201109b4b444e4892c Mon Sep 17 00:00:00 2001 From: sfluegel Date: Fri, 1 Aug 2025 14:29:06 +0200 Subject: [PATCH 4/4] set external data ratio to 0 for test --- tests/unit/dataset_classes/testChebiOverXPartial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py index 232d97da..0e263335 100644 --- a/tests/unit/dataset_classes/testChebiOverXPartial.py +++ b/tests/unit/dataset_classes/testChebiOverXPartial.py @@ -17,7 +17,7 @@ def setUpClass(cls, mock_makedirs) -> None: Set up the ChEBIOverXPartial instance with a mock processed directory path and a test graph. """ cls.chebi_extractor = ChEBIOverXPartial( - top_class_id=11111, external_data_ratio=0.5, chebi_version=231 + top_class_id=11111, external_data_ratio=0, chebi_version=231 ) cls.test_graph = ChebiMockOntology.get_transitively_closed_graph()