From d0416928fed4622640a0c8437ff9cd227bb6f4c6 Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 7 Feb 2024 09:30:31 +0000 Subject: [PATCH 01/30] Removed production name attribute --- .../production/metadata/api/exceptions.py | 6 +++- .../metadata/api/hive/dataset_factory.py | 36 +++++++++++++++++++ .../production/metadata/api/models/dataset.py | 5 ++- 3 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 src/ensembl/production/metadata/api/hive/dataset_factory.py diff --git a/src/ensembl/production/metadata/api/exceptions.py b/src/ensembl/production/metadata/api/exceptions.py index 4cff5b80..d4770c0a 100644 --- a/src/ensembl/production/metadata/api/exceptions.py +++ b/src/ensembl/production/metadata/api/exceptions.py @@ -47,4 +47,8 @@ class UpdateBackCoreException(UpdaterException, RuntimeError): class TypeNotFoundException(UpdaterException, RuntimeError): """Dataset Type not found""" - pass \ No newline at end of file + pass + +class DatasetFactoryException(Exception, RuntimeError): + """An error occured while using dataset factory""" + pass diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py new file mode 100644 index 00000000..2c007c08 --- /dev/null +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -0,0 +1,36 @@ +from ensembl.production.metadata.api.exceptions import * + +class DatasetFactory(): + """ + A class used to interact with the dataset REST endpoint and to add data. + + ... + + Attributes + ---------- + genome_uuid = uuid + + Methods + ------- + get_child_datasets() + """ + def __init__(self, genome_uuid=None,dataset_type=None,dataset_uuid=None): + if genome_uuid == None and dataset_uuid == None: + raise DatasetFactoryException("genome_uuid + datset.type or dataset_uuid are required") + + def get_child_datasets(self): + #Function to get all of the possible children datasets that are not constrained + + def create_child_datasets(self): + #Recursive function to create all the child datasets that it can. Breaks when no more datasets are created + print "Not Implemented" + + def create_dataset(self): + print "Not Implemented" + + def update_dataset_status(self): + print "Not Implemented" + + def update_dataset_attributes(self): + print "Not Implemented" + diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 17c91a29..17ceafc4 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -9,7 +9,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index +from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index, JSON from sqlalchemy.dialects.mysql import DATETIME from sqlalchemy.orm import relationship from sqlalchemy.sql import func @@ -99,6 +99,9 @@ class DatasetType(LoadAble, Base): topic = Column(String(32), nullable=False) description = Column(String(255)) details_uri = Column(String(255)) + parent = Column(String(128), default=None) + depends_on = Column(String(128), default=None) + filter_on = Column(JSON, default=None) # One to many relationships # dataset_type_id to dataset datasets = relationship('Dataset', back_populates='dataset_type') From 2ec24caedee871880811b424f207a54d5420c82f Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 8 Feb 2024 09:37:56 +0000 Subject: [PATCH 02/30] Initial function definitions --- .../metadata/api/hive/dataset_factory.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 2c007c08..cc4b91b4 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -2,7 +2,7 @@ class DatasetFactory(): """ - A class used to interact with the dataset REST endpoint and to add data. + A class used to interact with the ensembl_genome_metadata to modify dataset and dataset attribute table. ... @@ -14,23 +14,26 @@ class DatasetFactory(): ------- get_child_datasets() """ - def __init__(self, genome_uuid=None,dataset_type=None,dataset_uuid=None): - if genome_uuid == None and dataset_uuid == None: - raise DatasetFactoryException("genome_uuid + datset.type or dataset_uuid are required") - - def get_child_datasets(self): + # def __init__(self): + # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome + def get_child_datasets(self, dataset_uuid=None): #Function to get all of the possible children datasets that are not constrained - - def create_child_datasets(self): + #Only returns children of dataset_uuid if specified + child_datasets = [] + return child_datasets + def create_child_datasets(self, dataset_uuid=None, dataset_type=None): #Recursive function to create all the child datasets that it can. 
Breaks when no more datasets are created - print "Not Implemented" - - def create_dataset(self): - print "Not Implemented" - - def update_dataset_status(self): - print "Not Implemented" - - def update_dataset_attributes(self): - print "Not Implemented" - + #Only returns children of dataset_uuid if specified + #Should be limited to a single type if dataset_uuid is not specified + child_datasets = self.get_child_datasets() + return child_datasets + + def create_dataset(self,genome_uuid, datasource, dataset_type, dataset_attributes): + dataset_uuid = '' + return dataset_uuid + def update_dataset_status(self,dataset_uuid,status): + return dataset_uuid,status + + def update_dataset_attributes(self,dataset_uuid, dataset_attributes): + datset_attribute_indicies = [] + return dataset_uuid,datset_attribute_indicies From b5143fb8a52a148a0bb032bc786519f4e58ca3fe Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 8 Feb 2024 09:55:17 +0000 Subject: [PATCH 03/30] Added session --- .../production/metadata/api/hive/dataset_factory.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index cc4b91b4..741db35e 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -1,3 +1,5 @@ +from ensembl.database import DBConnection + from ensembl.production.metadata.api.exceptions import * class DatasetFactory(): @@ -14,7 +16,15 @@ class DatasetFactory(): ------- get_child_datasets() """ - # def __init__(self): + def __init__(self,session=None,metadata_uri=None): + if session is None: + if metadata_uri is None: + raise DatasetFactoryException("session or metadata_uri are required") + self.session = DBConnection(metadata_uri).session_scope() + self.session_source = "new" + else: + self.session=session + self.session_source = "import" # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome def get_child_datasets(self, dataset_uuid=None): #Function to get all of the possible children datasets that are not constrained From 713e0d211408d65a4593de4c3dd724989edbf9f7 Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 8 Feb 2024 10:20:52 +0000 Subject: [PATCH 04/30] update_dataset_status finished and dataset.status definitions improved --- .../metadata/api/hive/dataset_factory.py | 23 +++++++++++++++++-- .../production/metadata/api/models/dataset.py | 2 +- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 741db35e..39859209 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -1,6 +1,8 @@ from ensembl.database import DBConnection from ensembl.production.metadata.api.exceptions import * +from ensembl.production.metadata.api.models import Dataset + class DatasetFactory(): """ @@ -23,7 +25,7 @@ def __init__(self,session=None,metadata_uri=None): self.session = DBConnection(metadata_uri).session_scope() self.session_source = "new" else: - self.session=session + self.session = session self.session_source = "import" # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome def get_child_datasets(self, dataset_uuid=None): @@ -41,9 +43,26 @@ def create_child_datasets(self, 
dataset_uuid=None, dataset_type=None): def create_dataset(self,genome_uuid, datasource, dataset_type, dataset_attributes): dataset_uuid = '' return dataset_uuid - def update_dataset_status(self,dataset_uuid,status): + def update_dataset_status(self,dataset_uuid,status=None): + dataset=self.get_dataset(dataset_uuid) + if status is None: + old_status = dataset.status + if old_status == 'Released': + raise DatasetFactoryException("Unable to change status of Released dataset") + elif old_status == 'Submitted': + status = 'Processing' + elif old_status == 'Processing': + status = 'Processed' + elif old_status == 'Processed': + status = 'Released' + dataset.status = status + #TODO: Check if I have to close the session here. return dataset_uuid,status def update_dataset_attributes(self,dataset_uuid, dataset_attributes): datset_attribute_indicies = [] return dataset_uuid,datset_attribute_indicies + + def get_dataset(self, dataset_uuid): + dataset = self.session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + return dataset \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 17ceafc4..463b6f29 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -44,7 +44,7 @@ class Dataset(LoadAble, Base): created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) - status = Column(Enum('Submitted', 'Progressing', 'Processed'), server_default=text("'Submitted'")) + status = Column(Enum('Submitted', 'Processing', 'Processed', 'Released'), server_default=text("'Submitted'")) # One to many relationships # dataset_id to dataset attribute and genome dataset From 16e7f6015e236f1063d9a37c0f17f55a3e14f6cd Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 8 Feb 2024 15:14:17 +0000 Subject: [PATCH 05/30] added tests for update status and improved session logic --- .../production/metadata/api/exceptions.py | 2 +- .../metadata/api/hive/dataset_factory.py | 44 ++++++++++++----- .../api/sample/ensembl_metadata/table.sql | 2 +- src/tests/test_dataset_factory.py | 48 +++++++++++++++++++ 4 files changed, 81 insertions(+), 15 deletions(-) create mode 100644 src/tests/test_dataset_factory.py diff --git a/src/ensembl/production/metadata/api/exceptions.py b/src/ensembl/production/metadata/api/exceptions.py index d4770c0a..715a564c 100644 --- a/src/ensembl/production/metadata/api/exceptions.py +++ b/src/ensembl/production/metadata/api/exceptions.py @@ -49,6 +49,6 @@ class TypeNotFoundException(UpdaterException, RuntimeError): """Dataset Type not found""" pass -class DatasetFactoryException(Exception, RuntimeError): +class DatasetFactoryException(Exception): """An error occured while using dataset factory""" pass diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 39859209..a3c578dc 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -19,14 +19,15 @@ class DatasetFactory(): get_child_datasets() """ def __init__(self,session=None,metadata_uri=None): - if session is None: + if session: + self.session = session + self.owns_session = False + else: if metadata_uri is None: raise 
DatasetFactoryException("session or metadata_uri are required") - self.session = DBConnection(metadata_uri).session_scope() - self.session_source = "new" - else: - self.session = session - self.session_source = "import" + self.owns_session = True + self.metadata_db = DBConnection(metadata_uri) + # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome def get_child_datasets(self, dataset_uuid=None): #Function to get all of the possible children datasets that are not constrained @@ -43,8 +44,9 @@ def create_child_datasets(self, dataset_uuid=None, dataset_type=None): def create_dataset(self,genome_uuid, datasource, dataset_type, dataset_attributes): dataset_uuid = '' return dataset_uuid - def update_dataset_status(self,dataset_uuid,status=None): - dataset=self.get_dataset(dataset_uuid) + + def _update_status(self, dataset, status=None): + valid_statuses = ['Submitted', 'Processing', 'Processed', 'Released'] if status is None: old_status = dataset.status if old_status == 'Released': @@ -55,14 +57,30 @@ def update_dataset_status(self,dataset_uuid,status=None): status = 'Processed' elif old_status == 'Processed': status = 'Released' + if status not in valid_statuses: + raise DatasetFactoryException(f"Unable to change status to {status} as this is not valid. Please use " + f"one of :{valid_statuses}") dataset.status = status - #TODO: Check if I have to close the session here. - return dataset_uuid,status + return status + + def update_dataset_status(self, dataset_uuid, status=None): + if self.owns_session: + with self.metadata_db.session_scope() as session: + dataset = self.get_dataset(session, dataset_uuid) + updated_status = self._update_status(dataset, status) + else: + dataset = self.get_dataset(self.session, dataset_uuid) + updated_status = self._update_status(dataset, status) + + return dataset_uuid, updated_status def update_dataset_attributes(self,dataset_uuid, dataset_attributes): datset_attribute_indicies = [] return dataset_uuid,datset_attribute_indicies - def get_dataset(self, dataset_uuid): - dataset = self.session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - return dataset \ No newline at end of file + def get_dataset(self, session, dataset_uuid): + return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + + def close_session(self): + if self.owns_session and self.session: + self.session.close() \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql index 859e0f14..eccde2bb 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql @@ -98,7 +98,7 @@ CREATE TABLE dataset created datetime(6) not null, dataset_source_id int not null, label varchar(128) not null, - status enum ('Submitted', 'Progressing', 'Processed') default 'Submitted' null, + status enum ('Submitted', 'Processing', 'Processed', 'Released') default 'Submitted' null, constraint dataset_uuid unique (dataset_uuid), constraint dataset_dataset_source_id_fd96f115_fk_dataset_s diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py new file mode 100644 index 00000000..77c10eed --- /dev/null +++ b/src/tests/test_dataset_factory.py @@ -0,0 +1,48 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pathlib import Path +from unittest import mock +from unittest.mock import Mock, patch + +import pytest +import re + +import sqlalchemy +from ensembl.database import UnitTestDB, DBConnection + +from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory +from ensembl.production.metadata.api.models import Dataset + +db_directory = Path(__file__).parent / 'databases' +db_directory = db_directory.resolve() + + +@pytest.mark.parametrize("multi_dbs", [[{'src': 'ensembl_metadata'}, {'src': 'ncbi_taxonomy'}]],indirect=True) +class TestDatasetFactory: + dbc = None # type: UnitTestDB + + def test_update_dataset_status(self, multi_dbs): + dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) + test_uuid = '385f1ec2-bd06-40ce-873a-98e199f10534' + dataset_factory.update_dataset_status(test_uuid) + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() + assert dataset.status == 'Processing' + dataset_factory = DatasetFactory(session=session) + dataset_factory.update_dataset_status(test_uuid) + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() + assert dataset.status == 'Processed' + + + + From a408ff60df781d7072af8a960d389cc3d505416d Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 8 Feb 2024 23:23:43 +0000 Subject: [PATCH 06/30] update_dataset_attributes implemented with tests --- .../metadata/api/hive/dataset_factory.py | 41 ++++++++++++++----- .../sample/ensembl_metadata/dataset_type.txt | 21 ++++++---- .../api/sample/ensembl_metadata/table.sql | 5 ++- .../production/metadata/updater/base.py | 17 +++++++- src/tests/test_dataset_factory.py | 30 +++++++++++++- 5 files changed, 94 insertions(+), 20 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index a3c578dc..3e9d988e 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -1,7 +1,7 @@ from ensembl.database import DBConnection from ensembl.production.metadata.api.exceptions import * -from ensembl.production.metadata.api.models import Dataset +from ensembl.production.metadata.api.models import Dataset, Attribute, DatasetAttribute class DatasetFactory(): @@ -41,9 +41,10 @@ def create_child_datasets(self, dataset_uuid=None, dataset_type=None): child_datasets = self.get_child_datasets() return child_datasets - def create_dataset(self,genome_uuid, datasource, dataset_type, dataset_attributes): - dataset_uuid = '' - return dataset_uuid + #TODO: + # def create_dataset(self,genome_uuid, datasource, dataset_type, dataset_attributes): + # dataset_uuid = '' + # return dataset_uuid def _update_status(self, dataset, status=None): valid_statuses = ['Submitted', 'Processing', 'Processed', 'Released'] @@ -74,13 +75,33 @@ 
def update_dataset_status(self, dataset_uuid, status=None): return dataset_uuid, updated_status - def update_dataset_attributes(self,dataset_uuid, dataset_attributes): - datset_attribute_indicies = [] - return dataset_uuid,datset_attribute_indicies + def update_dataset_attributes(self, dataset_uuid, attribut_dict): + if not isinstance(attribut_dict, dict): + raise TypeError("attribut_dict must be a dictionary") + if self.owns_session: + with self.metadata_db.session_scope() as session: + dataset = self.get_dataset(session, dataset_uuid) + dataset_attributes = update_attributes(dataset, attribut_dict, session) + return dataset_attributes + else: + dataset = self.get_dataset(self.session, dataset_uuid) + dataset_attributes = update_attributes(dataset, attribut_dict, self.session) + return dataset_attributes def get_dataset(self, session, dataset_uuid): return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - def close_session(self): - if self.owns_session and self.session: - self.session.close() \ No newline at end of file + +#This is a direct copy of Marc's code in the core updater in an unmerged branch. I am not sure where we should keep it. +def update_attributes(dataset, attributes, session): + dataset_attributes = [] + for attribute, value in attributes.items(): + meta_attribute = session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + if meta_attribute is None: + raise UpdaterException(f"{attribute} does not exist. Add it to the database and reload.") + dataset_attributes.append(DatasetAttribute( + value=value, + dataset=dataset, + attribute=meta_attribute, + )) + return dataset_attributes \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt index de9b64ef..6874578f 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt @@ -1,7 +1,14 @@ -1 assembly Genomic assembly Core Annotation Compilation of sequences for a genome \N \N -2 genebuild Genomic Build Core Annotation Genomic annotations for an assembly \N \N -3 variation Variations Variation Annotation \N \N -4 evidence Variation Evidence Variation Annotation \N \N -5 regulation_build Regulations Regulatory Annotation \N \N -6 homologies Comparative homologies Comparative Annotation \N \N -7 regulatory_features Regulations Regulatory Annotation \N \N +1 assembly Genomic assembly Core Annotation Compilation of sequences for a genome \N \N \N \N +2 genebuild Genomic Build Core Annotation Genomic annotations for an assembly \N \N \N \N +3 variation Variation Data Variation Annotation Short variant data for rattus_norvegicus \N \N \N \N +4 evidence Variation Evidence Variation Annotation Variation Annotation \N \N \N \N +5 regulation_build Regulations Regulatory Annotation Regulatory Annotation \N \N \N \N +6 homologies Comparative homologies Comparative Annotation Comparative Annotation \N genebuild \N \N +7 regulatory_features Regulatory Annotation Core Annotation Regulatory annotation for an assembly \N \N \N \N +8 xref External References Core Annotation Data obtained from external sources and their links \N genebuild \N \N +9 protein_features Protein Features Core Annotation Annotation for proteins from external sources \N xref \N \N +10 appris APPRIS Core Annotation Data obtained from APPRIS \N xref protein_features \N +11 goa Gene 
Ontology Annotation Dumps Comparative Annotation Ontology Dumps \N homologies \N \N +12 gpad Gene Ontology Annotation Loading Comparative Annotation Ontology Loading \N goa \N \N +13 gene_name Gene Name Projection Comparative Annotation Gene Name Projection and Update \N gpad;homologies \N \N +14 ftp_dumps FTP Dumps Core Annotation FTP Dumps \N protein_features;appris;gene_name \N \N \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql index eccde2bb..7ecbfaa2 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql @@ -84,7 +84,10 @@ CREATE TABLE dataset_type label varchar(128) not null, topic varchar(32) not null, description varchar(255) null, - details_uri varchar(255) null + details_uri varchar(255) null, + parent varchar(255) null, + depends_on varchar(255) null, + filter_on JSON null ); CREATE TABLE dataset diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index d68245b9..7697e157 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -14,7 +14,9 @@ from sqlalchemy.engine import make_url from ensembl.core.models import Meta -from ensembl.production.metadata.api.models import DatasetSource + +from ensembl.production.metadata.api.exceptions import UpdaterException +from ensembl.production.metadata.api.models import DatasetSource, Attribute from ensembl.database import DBConnection from ensembl.production.metadata.api.models import EnsemblRelease @@ -54,3 +56,16 @@ def get_or_new_source(self, meta_session, db_type): ) meta_session.add(dataset_source) # Only add a new DatasetSource to the session if it doesn't exist return dataset_source + + def update_attributes(self, dataset, attributes, session): + genebuild_dataset_attributes = [] + for attribute, value in attributes.items(): + meta_attribute = session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + if meta_attribute is None: + raise UpdaterException(f"{attribute} does not exist. 
Add it to the database and reload.") + genebuild_dataset_attributes.append(DatasetAttribute( + value=value, + dataset=dataset, + attribute=meta_attribute, + )) + return genebuild_dataset_attributes \ No newline at end of file diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index 77c10eed..e5b3471d 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -20,7 +20,7 @@ from ensembl.database import UnitTestDB, DBConnection from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory -from ensembl.production.metadata.api.models import Dataset +from ensembl.production.metadata.api.models import Dataset, DatasetAttribute, Attribute db_directory = Path(__file__).parent / 'databases' db_directory = db_directory.resolve() @@ -43,6 +43,34 @@ def test_update_dataset_status(self, multi_dbs): dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() assert dataset.status == 'Processed' + def test_update_dataset_attributes(self, multi_dbs): + dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) + test_uuid = '385f1ec2-bd06-40ce-873a-98e199f10534' + test_attributes = {"contig_n50" : "test1", "total_genome_length": "test2"} + # def update_dataset_attributes(self,dataset_uuid, attribut_dict): + dataset_factory.update_dataset_attributes(test_uuid, test_attributes) + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() + dataset_attribute = session.query(DatasetAttribute) \ + .join(Attribute, DatasetAttribute.attribute_id == Attribute.attribute_id) \ + .filter(DatasetAttribute.dataset_id == dataset.dataset_id, + Attribute.name == 'contig_n50', + DatasetAttribute.value == 'test1') \ + .one_or_none() + assert dataset_attribute is not None + dataset_factory = DatasetFactory(session=session) + test_attributes = {"gc_percentage": "test3", "longest_gene_length": "test4"} + dataset_factory.update_dataset_attributes(test_uuid, test_attributes) + session.commit() + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() + test_attribute = session.query(DatasetAttribute) \ + .join(Attribute, DatasetAttribute.attribute_id == Attribute.attribute_id) \ + .filter(DatasetAttribute.dataset_id == dataset.dataset_id, + Attribute.name == 'longest_gene_length', + DatasetAttribute.value == 'test4') \ + .all() + assert test_attribute is not None From 2bc60586ac376c7ec51886874e88d29f6f73d9cd Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 14 Feb 2024 10:14:42 +0000 Subject: [PATCH 07/30] added tests for update status and improved session logic --- .../metadata/api/hive/dataset_factory.py | 71 +++++++++++++++---- .../api/sample/ensembl_metadata/table.sql | 3 +- src/tests/test_dataset_factory.py | 54 +++++++++++--- 3 files changed, 103 insertions(+), 25 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 3e9d988e..5ac9b395 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -1,7 +1,11 @@ from ensembl.database import DBConnection from ensembl.production.metadata.api.exceptions import * -from ensembl.production.metadata.api.models import Dataset, Attribute, DatasetAttribute +from ensembl.production.metadata.api.models import Dataset, 
Attribute, DatasetAttribute, Genome, GenomeDataset, \ + DatasetType +from sqlalchemy.sql import func +import datetime +import uuid class DatasetFactory(): @@ -18,7 +22,8 @@ class DatasetFactory(): ------- get_child_datasets() """ - def __init__(self,session=None,metadata_uri=None): + + def __init__(self, session=None, metadata_uri=None): if session: self.session = session self.owns_session = False @@ -30,21 +35,60 @@ def __init__(self,session=None,metadata_uri=None): # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome def get_child_datasets(self, dataset_uuid=None): - #Function to get all of the possible children datasets that are not constrained - #Only returns children of dataset_uuid if specified + # Function to get all of the possible children datasets that are not constrained + # Only returns children of dataset_uuid if specified child_datasets = [] return child_datasets + def create_child_datasets(self, dataset_uuid=None, dataset_type=None): - #Recursive function to create all the child datasets that it can. Breaks when no more datasets are created - #Only returns children of dataset_uuid if specified - #Should be limited to a single type if dataset_uuid is not specified + # Recursive function to create all the child datasets that it can. Breaks when no more datasets are created + # Only returns children of dataset_uuid if specified + # Should be limited to a single type if dataset_uuid is not specified child_datasets = self.get_child_datasets() return child_datasets - #TODO: - # def create_dataset(self,genome_uuid, datasource, dataset_type, dataset_attributes): - # dataset_uuid = '' - # return dataset_uuid + def get_parent_datasets(self, dataset_uuid): + # Function to return all of the parent datasets. Usually only one will be returned. + #Unlike previous functions a dataset_uuid is required. + #If there is no parent it will return top_level and itself. + parent_uuid = [] + parent_type = [] + if self.owns_session: + with self.metadata_db.session_scope() as session: + dataset = self.get_dataset(session, dataset_uuid) + dataset_type = session.query(DatasetType).filter( + DatasetType.dataset_type_id == dataset.dataset_type_id).one() + if dataset_type.parent is None: + return ['dateset_uuid'], ['top_level'] + parent_dataset_types = dataset_type.parent.split(';') + # loop over datesets that have the same genome and contain one of the parent types. 
+ else: + dataset = self.get_dataset(self.session, dataset_uuid) + return parent_uuid, parent_type + + + def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, + version): + new_dataset = Dataset( + dataset_uuid=str(uuid.uuid4()), + dataset_type=dataset_type, # Must be an object returned from the current session + name=name, + version=version, + label=label, + created=func.now(), + dataset_source=dataset_source, # Must + status='Submitted', + ) + genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).one() + new_genome_dataset = GenomeDataset( + genome=genome, + dataset=new_dataset, + is_current=False, + ) + new_dataset_attributes = update_attributes(new_dataset, dataset_attributes, session) + session.add(new_genome_dataset) + dataset_uuid = new_dataset.dataset_uuid + return dataset_uuid, new_dataset_attributes, new_genome_dataset def _update_status(self, dataset, status=None): valid_statuses = ['Submitted', 'Processing', 'Processed', 'Released'] @@ -72,7 +116,6 @@ def update_dataset_status(self, dataset_uuid, status=None): else: dataset = self.get_dataset(self.session, dataset_uuid) updated_status = self._update_status(dataset, status) - return dataset_uuid, updated_status def update_dataset_attributes(self, dataset_uuid, attribut_dict): @@ -92,7 +135,7 @@ def get_dataset(self, session, dataset_uuid): return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() -#This is a direct copy of Marc's code in the core updater in an unmerged branch. I am not sure where we should keep it. +# This is a direct copy of Marc's code in the core updater in an unmerged branch. I am not sure where we should keep it. def update_attributes(dataset, attributes, session): dataset_attributes = [] for attribute, value in attributes.items(): @@ -104,4 +147,4 @@ def update_attributes(dataset, attributes, session): dataset=dataset, attribute=meta_attribute, )) - return dataset_attributes \ No newline at end of file + return dataset_attributes diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql index 21bd3bb2..8d86fcb8 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql @@ -94,8 +94,9 @@ CREATE TABLE dataset name varchar(128) not null, version varchar(128) null, created datetime(6) not null, - dataset_source_id int not null, label varchar(128) not null, + dataset_source_id int not null, + dataset_type_id int not null, status enum ('Submitted', 'Processing', 'Processed', 'Released') default 'Submitted' null, constraint dataset_uuid unique (dataset_uuid), diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index e5b3471d..e6ad405c 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -20,19 +20,19 @@ from ensembl.database import UnitTestDB, DBConnection from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory -from ensembl.production.metadata.api.models import Dataset, DatasetAttribute, Attribute +from ensembl.production.metadata.api.models import Dataset, DatasetAttribute, Attribute, DatasetSource, DatasetType db_directory = Path(__file__).parent / 'databases' db_directory = db_directory.resolve() -@pytest.mark.parametrize("multi_dbs", [[{'src': 'ensembl_metadata'}, {'src': 'ncbi_taxonomy'}]],indirect=True) 
+@pytest.mark.parametrize("multi_dbs", [[{'src': 'ensembl_metadata'}, {'src': 'ncbi_taxonomy'}]], indirect=True) class TestDatasetFactory: dbc = None # type: UnitTestDB def test_update_dataset_status(self, multi_dbs): dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - test_uuid = '385f1ec2-bd06-40ce-873a-98e199f10534' + test_uuid = 'fc5d3e13-340c-4e2a-9f49-256fc319331e' dataset_factory.update_dataset_status(test_uuid) metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) with metadata_db.session_scope() as session: @@ -45,8 +45,8 @@ def test_update_dataset_status(self, multi_dbs): def test_update_dataset_attributes(self, multi_dbs): dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - test_uuid = '385f1ec2-bd06-40ce-873a-98e199f10534' - test_attributes = {"contig_n50" : "test1", "total_genome_length": "test2"} + test_uuid = 'fc5d3e13-340c-4e2a-9f49-256fc319331e' + test_attributes = {"assembly.contig_n50": "test1", "assembly.total_genome_length": "test2"} # def update_dataset_attributes(self,dataset_uuid, attribut_dict): dataset_factory.update_dataset_attributes(test_uuid, test_attributes) metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) @@ -55,22 +55,56 @@ def test_update_dataset_attributes(self, multi_dbs): dataset_attribute = session.query(DatasetAttribute) \ .join(Attribute, DatasetAttribute.attribute_id == Attribute.attribute_id) \ .filter(DatasetAttribute.dataset_id == dataset.dataset_id, - Attribute.name == 'contig_n50', + Attribute.name == 'assembly.contig_n50', DatasetAttribute.value == 'test1') \ .one_or_none() assert dataset_attribute is not None dataset_factory = DatasetFactory(session=session) - test_attributes = {"gc_percentage": "test3", "longest_gene_length": "test4"} + test_attributes = {"assembly.gc_percentage": "test3", "genebuild.nc_longest_gene_length": "test4"} dataset_factory.update_dataset_attributes(test_uuid, test_attributes) session.commit() dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() test_attribute = session.query(DatasetAttribute) \ .join(Attribute, DatasetAttribute.attribute_id == Attribute.attribute_id) \ .filter(DatasetAttribute.dataset_id == dataset.dataset_id, - Attribute.name == 'longest_gene_length', + Attribute.name == 'genebuild.nc_longest_gene_length', DatasetAttribute.value == 'test4') \ .all() assert test_attribute is not None - - + def test_create_dataset(self, multi_dbs): + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with (metadata_db.session_scope() as session): + test_attributes = {"assembly.contig_n50": "test1", "assembly.total_genome_length": "test2"} + test_genome_uuid = '48b1b849-3b73-4242-ae83-af2290aeb071' + test_dataset_source = session.query(DatasetSource).filter( + DatasetSource.name == 'mus_musculus_nodshiltj_core_110_1').one() + test_dataset_type = session.query(DatasetType).filter(DatasetType.name == 'regulatory_features').one() + test_name = 'test_name' + test_label = 'test_label' + test_version = 'test_version' + dataset_factory = DatasetFactory(session=session) + dataset_uuid, new_dataset_attributes, new_genome_dataset = dataset_factory.create_dataset(session, + test_genome_uuid, + test_dataset_source, + test_dataset_type, + test_attributes, + test_name, + test_label, + test_version) + session.commit() + created_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + assert created_dataset.name == test_name + assert created_dataset.label == 
test_label + assert created_dataset.version == test_version + assert test_dataset_source == session.query(DatasetSource).filter( + DatasetSource.dataset_source_id == created_dataset.dataset_source_id).one() + assert test_dataset_type == session.query(DatasetType).filter( + DatasetType.dataset_type_id == created_dataset.dataset_type_id).one() + test_attribute = session.query(DatasetAttribute) \ + .join(Attribute, DatasetAttribute.attribute_id == Attribute.attribute_id) \ + .filter(DatasetAttribute.dataset_id == created_dataset.dataset_id, + Attribute.name == 'genebuild.nc_longest_gene_length', + DatasetAttribute.value == 'test4') \ + .all() + assert test_attribute is not None From 110e5d77979d2a353c2778fb0225ed3473c7910f Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 14 Feb 2024 14:36:40 +0000 Subject: [PATCH 08/30] added get_parent_dataset to dataset_factory.py --- .../metadata/api/hive/dataset_factory.py | 77 +++++++++++++++---- 1 file changed, 60 insertions(+), 17 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 5ac9b395..ad62868d 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -34,36 +34,79 @@ def __init__(self, session=None, metadata_uri=None): self.metadata_db = DBConnection(metadata_uri) # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome - def get_child_datasets(self, dataset_uuid=None): + def get_child_datasets(self, dataset_uuid=None,constrained=True): # Function to get all of the possible children datasets that are not constrained # Only returns children of dataset_uuid if specified child_datasets = [] return child_datasets - def create_child_datasets(self, dataset_uuid=None, dataset_type=None): - # Recursive function to create all the child datasets that it can. 
Breaks when no more datasets are created - # Only returns children of dataset_uuid if specified - # Should be limited to a single type if dataset_uuid is not specified - child_datasets = self.get_child_datasets() + def create_child_datasets(self, dataset_uuid=None, dataset_type=None, constrained=True): + child_datasets = [] + + with self.session as session: + # Identify parent dataset types + parent_dataset_types = set() + if dataset_uuid: + parent_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).first() + if parent_dataset: + parent_dataset_types.add(parent_dataset.dataset_type.name) + elif dataset_type: + parent_dataset_types.add(dataset_type) + else: + raise ValueError("Either dataset_uuid or dataset_type must be provided") + + potential_child_types = session.query(DatasetType).filter( + DatasetType.parent.in_(parent_dataset_types)).all() + + for child_type in potential_child_types: + # Check if dependencies are 'Processed' + dependencies = child_type.depends_on.split(';') if child_type.depends_on else [] + all_dependencies_processed = all( + session.query(Dataset).filter(Dataset.dataset_type.has(name=dep), + Dataset.status == 'Processed').count() > 0 + for dep in dependencies + ) + if all_dependencies_processed: + # Create child dataset + new_dataset = self.create_dataset(session,genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, + version) + session.add(new_dataset) + child_datasets.append(new_dataset) + + return child_datasets + return child_datasets def get_parent_datasets(self, dataset_uuid): - # Function to return all of the parent datasets. Usually only one will be returned. - #Unlike previous functions a dataset_uuid is required. - #If there is no parent it will return top_level and itself. parent_uuid = [] parent_type = [] + + def query_parent_datasets(session): + dataset = self.get_dataset(session, dataset_uuid) + dataset_type = session.query(DatasetType).filter( + DatasetType.dataset_type_id == dataset.dataset_type_id).one() + if dataset_type.parent is None: + return ['dataset_uuid'], ['top_level'] + parent_dataset_types = dataset_type.parent.split(';') + genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) + if not genome_id: + raise ValueError("No associated Genome found for the given dataset UUID") + + related_genome_datasets = session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( + GenomeDataset.genome_id == genome_id, + DatasetType.name.in_(parent_dataset_types) + ).all() + + for gd in related_genome_datasets: + parent_uuid.append(gd.dataset.dataset_uuid) + parent_type.append(gd.dataset.dataset_type.name) + if self.owns_session: with self.metadata_db.session_scope() as session: - dataset = self.get_dataset(session, dataset_uuid) - dataset_type = session.query(DatasetType).filter( - DatasetType.dataset_type_id == dataset.dataset_type_id).one() - if dataset_type.parent is None: - return ['dateset_uuid'], ['top_level'] - parent_dataset_types = dataset_type.parent.split(';') - # loop over datesets that have the same genome and contain one of the parent types. 
+ query_parent_datasets(session) else: - dataset = self.get_dataset(self.session, dataset_uuid) + query_parent_datasets(self.session) + return parent_uuid, parent_type From 4ce9dac429510de52f391851994bbef45563413f Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 15 Feb 2024 10:24:31 +0000 Subject: [PATCH 09/30] Added child update and tests to dataset_factory.py Added updater_utils.py and moved standalone function --- .../metadata/api/hive/dataset_factory.py | 252 ++++++++++++++---- .../api/sample/ensembl_metadata/dataset.txt | 2 +- .../production/metadata/updater/base.py | 13 - .../production/metadata/updater/core.py | 3 +- .../metadata/updater/updater_utils.py | 27 ++ src/tests/test_dataset_factory.py | 38 ++- 6 files changed, 258 insertions(+), 77 deletions(-) create mode 100644 src/ensembl/production/metadata/updater/updater_utils.py diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index ad62868d..18c33823 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -1,29 +1,64 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from ensembl.database import DBConnection from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.api.models import Dataset, Attribute, DatasetAttribute, Genome, GenomeDataset, \ - DatasetType + DatasetType, DatasetSource from sqlalchemy.sql import func -import datetime import uuid - +from ensembl.production.metadata.updater.updater_utils import update_attributes class DatasetFactory(): """ - A class used to interact with the ensembl_genome_metadata to modify dataset and dataset attribute table. - - ... + A class for interacting with the ensembl_genome_metadata database, specifically for modifying the dataset and + dataset attribute tables. Attributes ---------- - genome_uuid = uuid + session : SQLAlchemy session + An active SQLAlchemy session for database operations. If not provided, a new session is created using metadata_uri. + metadata_db : DBConnection + A database connection object used when a new session is created. + owns_session : bool + Flag to indicate whether this class instance owns the session (True) or if it was provided by the user (False). Methods ------- - get_child_datasets() + create_child_datasets(dataset_uuid, parent_type, child_type, dataset_attributes, genome_uuid): + Creates child datasets based on various parameters like dataset_uuid, parent_type, child_type, etc. + get_parent_datasets(dataset_uuid): + Retrieves parent datasets for a given dataset UUID. + create_dataset(session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, version): + Creates a new dataset and associates it with a genome. + update_dataset_status(dataset_uuid, status): + Updates the status of a dataset identified by its UUID. 
+ update_dataset_attributes(dataset_uuid, attribut_dict): + Updates the attributes of a dataset identified by its UUID. + get_dataset(session, dataset_uuid): + Retrieves a dataset by its UUID. """ - def __init__(self, session=None, metadata_uri=None): + """ + Initializes the DatasetFactory instance. + + Parameters: + session (SQLAlchemy session, optional): An active database session. + metadata_uri (str, optional): URI for the metadata database. + + Raises: + DatasetFactoryException: If neither session nor metadata_uri is provided. + """ if session: self.session = session self.owns_session = False @@ -34,50 +69,120 @@ def __init__(self, session=None, metadata_uri=None): self.metadata_db = DBConnection(metadata_uri) # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome - def get_child_datasets(self, dataset_uuid=None,constrained=True): - # Function to get all of the possible children datasets that are not constrained - # Only returns children of dataset_uuid if specified - child_datasets = [] - return child_datasets + def create_child_datasets(self, dataset_uuid=None, parent_type=None, child_type=None, dataset_attributes={}, + genome_uuid=None): + """ + Creates child datasets based on the provided parameters. Child datasets are created based on the type of parent + dataset, child dataset type, or associated genome UUID. The method enforces rules to prevent conflict in parameters. - def create_child_datasets(self, dataset_uuid=None, dataset_type=None, constrained=True): - child_datasets = [] + Parameters: + dataset_uuid (str, optional): UUID of the parent dataset. + parent_type (str, optional): Type of the parent dataset. + child_type (str, optional): Type of the child dataset to be created. + dataset_attributes (dict, optional): Attributes to be assigned to the child dataset. + genome_uuid (str, optional): UUID of the genome associated with the datasets. - with self.session as session: - # Identify parent dataset types + Returns: + list: UUIDs of the created child datasets. + """ + if dataset_uuid and genome_uuid: + raise ValueError("Please only provide genome_uuid or dataset_uuid") + if parent_type and child_type: + raise ValueError("Please only provide child_type or parent_type") + def fetch_parent_datasets(session): parent_dataset_types = set() + potential_parent_datasets = [] if dataset_uuid: parent_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).first() if parent_dataset: parent_dataset_types.add(parent_dataset.dataset_type.name) - elif dataset_type: - parent_dataset_types.add(dataset_type) + potential_parent_datasets.append(parent_dataset) + elif parent_type: + parent_dataset_types.add(parent_type) + potential_parent_datasets = session.query(Dataset).filter( + Dataset.dataset_type.has(name=parent_type), + Dataset.status != 'Released' + ).all() + elif genome_uuid: + genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).first() + if not genome: + raise ValueError("No genome found with the provided genome_uuid") + if not parent_type and not child_type: + raise ValueError("Genome_uuid requires either child type or parent type.") + if child_type: + #Alwalys go for the first one as dependencies will check the rest later. 
+ new_type = session.query(DatasetType).filter(DatasetType.name == child_type).one() + parent_dataset_types.add(new_type.parent.split(';')[0]) + for genome_dataset in genome.genome_datasets: + if genome_dataset.dataset.status != 'Released' and genome_dataset.dataset.dataset_type.name in parent_dataset_types: + potential_parent_datasets.append(genome_dataset.dataset) else: - raise ValueError("Either dataset_uuid or dataset_type must be provided") - - potential_child_types = session.query(DatasetType).filter( - DatasetType.parent.in_(parent_dataset_types)).all() - - for child_type in potential_child_types: - # Check if dependencies are 'Processed' - dependencies = child_type.depends_on.split(';') if child_type.depends_on else [] - all_dependencies_processed = all( - session.query(Dataset).filter(Dataset.dataset_type.has(name=dep), - Dataset.status == 'Processed').count() > 0 - for dep in dependencies - ) - if all_dependencies_processed: - # Create child dataset - new_dataset = self.create_dataset(session,genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, - version) - session.add(new_dataset) - child_datasets.append(new_dataset) - - return child_datasets - - return child_datasets + raise ValueError("Either dataset_uuid, parent_type, or genome_uuid must be provided") + return parent_dataset_types, potential_parent_datasets + + def process_datasets(session, parent_dataset_types, potential_parent_datasets): + + child_datasets = [] + if child_type: + potential_child_types = [session.query(DatasetType).filter(DatasetType.name == child_type).first()] + else: + potential_child_types = session.query(DatasetType).filter( + DatasetType.parent.in_(parent_dataset_types)).all() + + for parent_dataset in potential_parent_datasets: + # I thought this was a good idea, but we would need different logic + # if parent_dataset.status == 'Processed': + for child_dataset_type in potential_child_types: + if check_existing_and_dependencies(session, parent_dataset, child_dataset_type): + parent_genome_uuid = parent_dataset.genome_datasets[0].genome.genome_uuid + parent_dataset_source = parent_dataset.dataset_source + new_dataset_uuid, new_dataset_attributes, new_genome_dataset = self.create_dataset( + session, parent_genome_uuid, parent_dataset_source, child_dataset_type, + dataset_attributes, child_dataset_type.name, child_dataset_type.name, None + ) + child_datasets.append(new_dataset_uuid) + return child_datasets + + def check_existing_and_dependencies(session, parent_dataset, child_dataset_type): + existing_datasets = session.query(Dataset).filter( + Dataset.dataset_type == child_dataset_type, + Dataset.genome_datasets.any(genome_id=parent_dataset.genome_datasets[0].genome_id), + Dataset.status.in_(['Submitted', 'Processing', 'Processed']) + ).count() + + if existing_datasets > 0: + return False # Skip if a similar dataset already exists + + dependencies = child_dataset_type.depends_on.split(';') if child_dataset_type.depends_on else [] + return all( + session.query(Dataset).filter( + Dataset.dataset_type.has(name=dep), + Dataset.status == 'Processed', + Dataset.genome_datasets.any(genome_id=parent_dataset.genome_datasets[0].genome_id) + ).count() > 0 for dep in dependencies + ) + + if self.owns_session: + with self.metadata_db.session_scope() as session: + parent_dataset_types, potential_parent_datasets = fetch_parent_datasets(session) + return process_datasets(session, parent_dataset_types, potential_parent_datasets) + else: + session = self.session + parent_dataset_types, 
potential_parent_datasets = fetch_parent_datasets(session) + return process_datasets(session, parent_dataset_types, potential_parent_datasets) + def get_parent_datasets(self, dataset_uuid): + """ + Retrieves the parent datasets of a specified dataset. If the dataset does not have a parent, + it returns the dataset itself and marks it as 'top_level'. + + Parameters: + dataset_uuid (str): UUID of the dataset for which the parent datasets are to be found. + + Returns: + tuple: Two lists containing UUIDs and types of the parent datasets. + """ parent_uuid = [] parent_type = [] @@ -112,6 +217,22 @@ def query_parent_datasets(session): def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, version): + """ + Creates a new dataset record and associates it with a specific genome. The new dataset is added to the database session. + + Parameters: + session (SQLAlchemy session): An active database session. + genome_uuid (str): UUID of the genome to associate the dataset with. + dataset_source (DatasetSource): The source of the dataset. + dataset_type (DatasetType): The type of the dataset. + dataset_attributes (dict): Attributes to assign to the dataset. + name (str): Name of the dataset. + label (str): Label for the dataset. + version (str): Version of the dataset. + + Returns: + tuple: Dataset UUID, dataset attributes, and the new genome-dataset association. + """ new_dataset = Dataset( dataset_uuid=str(uuid.uuid4()), dataset_type=dataset_type, # Must be an object returned from the current session @@ -134,6 +255,7 @@ def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dat return dataset_uuid, new_dataset_attributes, new_genome_dataset def _update_status(self, dataset, status=None): + valid_statuses = ['Submitted', 'Processing', 'Processed', 'Released'] if status is None: old_status = dataset.status @@ -152,6 +274,17 @@ def _update_status(self, dataset, status=None): return status def update_dataset_status(self, dataset_uuid, status=None): + """ + Updates the status of a dataset identified by its UUID. The status is updated to the next logical state unless + a specific state is provided. + + Parameters: + dataset_uuid (str): UUID of the dataset to update. + status (str, optional): The new status to set for the dataset. If not provided, status is advanced to the next logical state. + + Returns: + tuple: Dataset UUID and the updated status. + """ if self.owns_session: with self.metadata_db.session_scope() as session: dataset = self.get_dataset(session, dataset_uuid) @@ -162,6 +295,16 @@ def update_dataset_status(self, dataset_uuid, status=None): return dataset_uuid, updated_status def update_dataset_attributes(self, dataset_uuid, attribut_dict): + """ + Updates the attributes of a dataset identified by its UUID. The attributes to be updated are provided as a dictionary. + + Parameters: + dataset_uuid (str): UUID of the dataset to update. + attribute_dict (dict): A dictionary containing attribute names and their new values. + + Returns: + list: Updated dataset attributes. + """ if not isinstance(attribut_dict, dict): raise TypeError("attribut_dict must be a dictionary") if self.owns_session: @@ -175,19 +318,18 @@ def update_dataset_attributes(self, dataset_uuid, attribut_dict): return dataset_attributes def get_dataset(self, session, dataset_uuid): + """ + Retrieves a dataset by its UUID using an active database session. + + Parameters: + session (SQLAlchemy session): An active database session. 
+ dataset_uuid (str): UUID of the dataset to retrieve. + + Returns: + Dataset: The dataset object corresponding to the provided UUID. + """ return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() # This is a direct copy of Marc's code in the core updater in an unmerged branch. I am not sure where we should keep it. -def update_attributes(dataset, attributes, session): - dataset_attributes = [] - for attribute, value in attributes.items(): - meta_attribute = session.query(Attribute).filter(Attribute.name == attribute).one_or_none() - if meta_attribute is None: - raise UpdaterException(f"{attribute} does not exist. Add it to the database and reload.") - dataset_attributes.append(DatasetAttribute( - value=value, - dataset=dataset, - attribute=meta_attribute, - )) - return dataset_attributes + diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt index f299eba7..aa18a33b 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt @@ -310,7 +310,7 @@ 356 90ba6c03-5161-4f9a-911c-1961b9c0470d genebuild ENS01 2023-09-22 15:06:46.000000 GCA_018472825.1_ENS01 188 2 Submitted 357 4519fdf3-8b4e-463b-9822-69c45ee408da assembly \N 2023-09-22 15:06:46.000000 GCA_000001735.1 190 1 Submitted 359 de92123a-22ca-407f-9954-d8c0f8b17f64 assembly \N 2023-09-22 15:06:48.000000 GCA_018503575.1 192 1 Submitted -361 e95e194c-52ad-4b1e-94d4-1d5c0a03e9e3 assembly \N 2023-09-22 15:06:48.000000 GCA_018466835.1 193 1 Submitted +361 e95e194c-52adc-4b1e-94d4-1d5c0a03e9e3 assembly \N 2023-09-22 15:06:48.000000 GCA_018466835.1 193 1 Submitted 363 2b5664b7-6b42-4a18-9128-3019d631b836 assembly \N 2023-09-22 15:06:48.000000 GCA_905237065.2 191 1 Submitted 364 2ffbdc3f-1c68-42b1-b99f-4449c5914ec5 genebuild ENS01 2023-09-22 15:06:48.000000 GCA_905237065.2_ENS01 191 2 Submitted 365 b8fa1a4e-6d40-4540-a022-8846abee284c assembly \N 2023-09-22 15:06:49.000000 GCA_018472765.1 194 1 Submitted diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index 39f6cef0..400cdc15 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -55,16 +55,3 @@ def get_or_new_source(self, meta_session, db_type): ) meta_session.add(dataset_source) # Only add a new DatasetSource to the session if it doesn't exist return dataset_source - - def update_attributes(self, dataset, attributes, session): - genebuild_dataset_attributes = [] - for attribute, value in attributes.items(): - meta_attribute = session.query(Attribute).filter(Attribute.name == attribute).one_or_none() - if meta_attribute is None: - raise UpdaterException(f"{attribute} does not exist. 
Add it to the database and reload.") - genebuild_dataset_attributes.append(DatasetAttribute( - value=value, - dataset=dataset, - attribute=meta_attribute, - )) - return genebuild_dataset_attributes diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index d3f485dc..f6726abf 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -28,6 +28,7 @@ from ensembl.ncbi_taxonomy.models import NCBITaxaName import logging from ensembl.production.metadata.api.exceptions import * +from ensembl.production.metadata.updater.updater_utils import update_attributes class CoreMetaUpdater(BaseMetaUpdater): @@ -591,5 +592,5 @@ def get_or_new_genebuild(self, species_id, meta_session, source=None, existing=F genebuild_dataset.version = genebuild_version attributes = self.get_meta_list_from_prefix_meta_key(species_id, "genebuild.") - genebuild_dataset_attributes = self.update_attributes(genebuild_dataset, attributes, meta_session) + genebuild_dataset_attributes = update_attributes(genebuild_dataset, attributes, meta_session) return genebuild_dataset, genebuild_dataset_attributes diff --git a/src/ensembl/production/metadata/updater/updater_utils.py b/src/ensembl/production/metadata/updater/updater_utils.py new file mode 100644 index 00000000..230605c7 --- /dev/null +++ b/src/ensembl/production/metadata/updater/updater_utils.py @@ -0,0 +1,27 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ensembl.production.metadata.api.exceptions import UpdaterException +from ensembl.production.metadata.api.models import Attribute, DatasetAttribute + + +def update_attributes(dataset, attributes, session): + dataset_attributes = [] + for attribute, value in attributes.items(): + meta_attribute = session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + if meta_attribute is None: + raise UpdaterException(f"{attribute} does not exist. Add it to the database and reload.") + dataset_attributes.append(DatasetAttribute( + value=value, + dataset=dataset, + attribute=meta_attribute, + )) + return dataset_attributes \ No newline at end of file diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index e6ad405c..ec6e02eb 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -10,15 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
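Editor's note: the update_attributes helper added in updater_utils.py above is now the single place where dataset attributes are written, shared by the updater and the dataset factory. A minimal usage sketch, assuming a reachable metadata database and an existing dataset; the connection URI and dataset UUID below are placeholders, not values from this patch series:

    from ensembl.database import DBConnection
    from ensembl.production.metadata.api.models import Dataset
    from ensembl.production.metadata.updater.updater_utils import update_attributes

    metadata_db = DBConnection("mysql://user:pass@host/ensembl_genome_metadata")  # placeholder URI
    with metadata_db.session_scope() as session:
        dataset = session.query(Dataset).filter(Dataset.dataset_uuid == "<dataset-uuid>").one()
        # Every key must already exist in the attribute table; unknown keys raise UpdaterException
        new_attributes = update_attributes(dataset, {"genebuild.start_date": "2024-02"}, session)
        session.add_all(new_attributes)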
from pathlib import Path -from unittest import mock -from unittest.mock import Mock, patch - import pytest -import re - -import sqlalchemy from ensembl.database import UnitTestDB, DBConnection - from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory from ensembl.production.metadata.api.models import Dataset, DatasetAttribute, Attribute, DatasetSource, DatasetType @@ -108,3 +101,34 @@ def test_create_dataset(self, multi_dbs): DatasetAttribute.value == 'test4') \ .all() assert test_attribute is not None + + def test_create_child_datasets_get_parent(self, multi_dbs): + # Tests for individual calling via dataset_uuid or genome_uuid + dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) + test_uuid = '90ba6c03-5161-4f9a-911c-1961b9c0470d' + data = dataset_factory.create_child_datasets(dataset_uuid=test_uuid) + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'xref').one() + assert dataset.status == 'Submitted' + dataset_factory = DatasetFactory(session=session) + dataset_factory.update_dataset_status(dataset.dataset_uuid, 'Processed') + session.commit() + parent, parent_type = dataset_factory.get_parent_datasets(dataset.dataset_uuid) + assert parent[0] == test_uuid + assert parent_type[0] == 'genebuild' + dataset_factory.create_child_datasets(genome_uuid='9cc516a8-529e-4919-a429-0d7032e295c9', + child_type='protein_features') + # dataset_factory.create_child_datasets(dataset_uuid=data[0], + # child_type='protein_features') + session.commit() + new_dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'protein_features').one() + assert new_dataset.status == 'Submitted' + + # Tests for bulk calling. + dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) + dataset_factory.create_child_datasets(parent_type='genebuild') + metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'xref').all() + assert len(dataset) == 240 From 264ec1b8525c5d9993720cbef5ded9dc54885543 Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 15 Feb 2024 10:29:09 +0000 Subject: [PATCH 10/30] Minor code cleanup --- src/ensembl/production/metadata/api/hive/dataset_factory.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 18c33823..94010f7e 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -69,6 +69,7 @@ def __init__(self, session=None, metadata_uri=None): self.metadata_db = DBConnection(metadata_uri) # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome + # Actually it is probably acceptable now, as we only consider unreleased datasets. def create_child_datasets(self, dataset_uuid=None, parent_type=None, child_type=None, dataset_attributes={}, genome_uuid=None): """ @@ -331,5 +332,3 @@ def get_dataset(self, session, dataset_uuid): return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() -# This is a direct copy of Marc's code in the core updater in an unmerged branch. I am not sure where we should keep it. 
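Editor's note: condensed from the new test above, a sketch of how the factory is expected to be driven at this point in the series. The genebuild dataset UUID comes from the repository's sample data, the connection URI is a placeholder, and the child UUID stands in for whatever create_child_datasets returns:

    from ensembl.database import DBConnection
    from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory

    factory = DatasetFactory(metadata_uri="mysql://user:pass@host/ensembl_metadata")  # placeholder URI
    # Create the unconstrained children (e.g. xref) of an existing genebuild dataset
    factory.create_child_datasets(dataset_uuid="90ba6c03-5161-4f9a-911c-1961b9c0470d")

    metadata_db = DBConnection("mysql://user:pass@host/ensembl_metadata")
    with metadata_db.session_scope() as session:
        session_factory = DatasetFactory(session=session)
        # Advance a child dataset, then walk back up to its genebuild parent
        session_factory.update_dataset_status("<child-dataset-uuid>", "Processed")
        parent_uuids, parent_types = session_factory.get_parent_datasets("<child-dataset-uuid>")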
- From 42f2f13b6d30033391d00e54e03ecddf30b0eabf Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 20 Feb 2024 15:31:55 +0000 Subject: [PATCH 11/30] Dataset Status enum added dataset_factory.py WIP --- .../metadata/api/hive/dataset_factory.py | 103 +++++++++++------- .../production/metadata/api/models/dataset.py | 21 +++- src/tests/test_dataset_factory.py | 2 +- 3 files changed, 81 insertions(+), 45 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 94010f7e..fe074bb5 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -14,12 +14,13 @@ from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.api.models import Dataset, Attribute, DatasetAttribute, Genome, GenomeDataset, \ - DatasetType, DatasetSource + DatasetType, DatasetSource, DatasetStatus from sqlalchemy.sql import func import uuid from ensembl.production.metadata.updater.updater_utils import update_attributes -class DatasetFactory(): + +class DatasetFactory: """ A class for interacting with the ensembl_genome_metadata database, specifically for modifying the dataset and dataset attribute tables. @@ -48,6 +49,7 @@ class DatasetFactory(): get_dataset(session, dataset_uuid): Retrieves a dataset by its UUID. """ + def __init__(self, session=None, metadata_uri=None): """ Initializes the DatasetFactory instance. @@ -90,6 +92,7 @@ def create_child_datasets(self, dataset_uuid=None, parent_type=None, child_type= raise ValueError("Please only provide genome_uuid or dataset_uuid") if parent_type and child_type: raise ValueError("Please only provide child_type or parent_type") + def fetch_parent_datasets(session): parent_dataset_types = set() potential_parent_datasets = [] @@ -111,7 +114,7 @@ def fetch_parent_datasets(session): if not parent_type and not child_type: raise ValueError("Genome_uuid requires either child type or parent type.") if child_type: - #Alwalys go for the first one as dependencies will check the rest later. + # Alwalys go for the first one as dependencies will check the rest later. new_type = session.query(DatasetType).filter(DatasetType.name == child_type).one() parent_dataset_types.add(new_type.parent.split(';')[0]) for genome_dataset in genome.genome_datasets: @@ -148,7 +151,7 @@ def check_existing_and_dependencies(session, parent_dataset, child_dataset_type) existing_datasets = session.query(Dataset).filter( Dataset.dataset_type == child_dataset_type, Dataset.genome_datasets.any(genome_id=parent_dataset.genome_datasets[0].genome_id), - Dataset.status.in_(['Submitted', 'Processing', 'Processed']) + Dataset.status.in_([DatasetStatus.SUBMITTED, DatasetStatus.PROCESSING, DatasetStatus.PROCESSED]) ).count() if existing_datasets > 0: @@ -172,7 +175,6 @@ def check_existing_and_dependencies(session, parent_dataset, child_dataset_type) parent_dataset_types, potential_parent_datasets = fetch_parent_datasets(session) return process_datasets(session, parent_dataset_types, potential_parent_datasets) - def get_parent_datasets(self, dataset_uuid): """ Retrieves the parent datasets of a specified dataset. If the dataset does not have a parent, @@ -184,15 +186,14 @@ def get_parent_datasets(self, dataset_uuid): Returns: tuple: Two lists containing UUIDs and types of the parent datasets. 
""" - parent_uuid = [] - parent_type = [] + parent_uuid = '' - def query_parent_datasets(session): + def query_parent_datasets(session, dataset_uuid): dataset = self.get_dataset(session, dataset_uuid) dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == dataset.dataset_type_id).one() if dataset_type.parent is None: - return ['dataset_uuid'], ['top_level'] + return None parent_dataset_types = dataset_type.parent.split(';') genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) if not genome_id: @@ -205,17 +206,26 @@ def query_parent_datasets(session): for gd in related_genome_datasets: parent_uuid.append(gd.dataset.dataset_uuid) - parent_type.append(gd.dataset.dataset_type.name) - if self.owns_session: with self.metadata_db.session_scope() as session: query_parent_datasets(session) else: query_parent_datasets(self.session) - return parent_uuid, parent_type + return parent_uuid + def fetch_top_level_parent(self, session, dataset_uuid): + def get_top_level_parent(self, dataset_uuid, session=None, metadata_uri=None): + if session: + top_uuid = self._update_status(dataset_uuid) + elif metadata_uri: + metadata_db = DBConnection(metadata_uri) + with metadata_db.session_scope() as session: + top_uuid = self._update_status(dataset_uuid) + else: + raise DatasetFactoryException("session or metadata_uri are required") + return top_uuid def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, version): """ @@ -255,26 +265,40 @@ def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dat dataset_uuid = new_dataset.dataset_uuid return dataset_uuid, new_dataset_attributes, new_genome_dataset - def _update_status(self, dataset, status=None): - - valid_statuses = ['Submitted', 'Processing', 'Processed', 'Released'] - if status is None: - old_status = dataset.status - if old_status == 'Released': - raise DatasetFactoryException("Unable to change status of Released dataset") - elif old_status == 'Submitted': - status = 'Processing' - elif old_status == 'Processing': - status = 'Processed' - elif old_status == 'Processed': - status = 'Released' - if status not in valid_statuses: - raise DatasetFactoryException(f"Unable to change status to {status} as this is not valid. Please use " - f"one of :{valid_statuses}") + + def _check_childrens_status(self, dataset): + #returns the lowest possible status (ex submitted) for all datasets that are children. + + def _update_status(self, session, dataset_uuid, status): + + #sbumitted to processing update all parents, all the way to top level. + #processing to processed. Check children. Don't update if they are still processing. Update. Recursively call this on parent + #Processed to Released. Only accept top level. Check that all assembly and genebuild datsets (all the way down) are processed. + # Then convert all to released. #Add a blocker and warning in here. + + #First Check if children have + if status == DatasetStatus.SUBMITTED: + parent_uuid = self.get_parent_dataset(session,dataset_uuid) + if parent_uuid != None + self._update_status(session, parent_uuid, Dataset.SUBMITTED) + #update all the way back. + elif status == DatasetStatus.PROCESSING: + #Update all the way back + elif status == DatasetStatus.PROCESSED: + #attempt update. See above + elif status == DatasetStatus.RELEASED: + #Update if all datasets in it's chain are processed, all genebuild and assembly are processed. Else return error. 
+ + parent_datasets = dataset.status = status - return status - def update_dataset_status(self, dataset_uuid, status=None): + + + + return status, parent_status + + def update_dataset_status(self, dataset_uuid, status=None, session=None, metadata_uri=None): + # TODO: Check parent for progress and update parent if child """ Updates the status of a dataset identified by its UUID. The status is updated to the next logical state unless a specific state is provided. @@ -286,14 +310,17 @@ def update_dataset_status(self, dataset_uuid, status=None): Returns: tuple: Dataset UUID and the updated status. """ - if self.owns_session: - with self.metadata_db.session_scope() as session: - dataset = self.get_dataset(session, dataset_uuid) - updated_status = self._update_status(dataset, status) + if session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + updated_status, parent_status = self._update_status(dataset, status) + elif metadata_uri: + metadata_db = DBConnection(metadata_uri) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + updated_status, parent_status = self._update_status(dataset, status) else: - dataset = self.get_dataset(self.session, dataset_uuid) - updated_status = self._update_status(dataset, status) - return dataset_uuid, updated_status + raise DatasetFactoryException("session or metadata_uri are required") + return dataset_uuid, updated_status, parent_status def update_dataset_attributes(self, dataset_uuid, attribut_dict): """ @@ -330,5 +357,3 @@ def get_dataset(self, session, dataset_uuid): Dataset: The dataset object corresponding to the provided UUID. """ return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - - diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index c87981ac..b2dc79ff 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -9,6 +9,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
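Editor's note: the hunk below adds the DatasetStatus enum that the work-in-progress factory code above starts to use. A standalone sketch of the linear lifecycle those comments describe (Submitted -> Processing -> Processed -> Released, with Released terminal); the next_status helper is illustrative and not part of the patch:

    import enum

    class DatasetStatus(enum.Enum):
        SUBMITTED = 'Submitted'
        PROCESSING = 'Processing'
        PROCESSED = 'Processed'
        RELEASED = 'Released'

    _ORDER = [DatasetStatus.SUBMITTED, DatasetStatus.PROCESSING,
              DatasetStatus.PROCESSED, DatasetStatus.RELEASED]

    def next_status(current: DatasetStatus) -> DatasetStatus:
        # Released is terminal; every other status advances one step
        if current is DatasetStatus.RELEASED:
            raise ValueError("Unable to change status of a Released dataset")
        return _ORDER[_ORDER.index(current) + 1]

    assert next_status(DatasetStatus.SUBMITTED) is DatasetStatus.PROCESSING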
+import enum + from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index, JSON from sqlalchemy.dialects.mysql import DATETIME from sqlalchemy.orm import relationship @@ -21,6 +23,7 @@ + class Attribute(LoadAble, Base): __tablename__ = 'attribute' @@ -35,6 +38,12 @@ class Attribute(LoadAble, Base): # many to one relationships # none +class DatasetStatus(enum.Enum): + SUBMITTED = 'Submitted' + PROCESSING = 'Processing' + PROCESSED = 'Processed' + RELEASED = 'Released' + class Dataset(LoadAble, Base): __tablename__ = 'dataset' @@ -46,11 +55,12 @@ class Dataset(LoadAble, Base): created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) - status = Column(Enum('Submitted', 'Processing', 'Processed', 'Released'), server_default=text("'Submitted'")) + status = Column(Enum(DatasetStatus), server_default=DatasetStatus.SUBMITTED) # One to many relationships # dataset_id to dataset attribute and genome dataset - dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', cascade="all, delete, delete-orphan") + dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', + cascade="all, delete, delete-orphan") genome_datasets = relationship("GenomeDataset", back_populates='dataset', cascade="all, delete, delete-orphan") # many to one relationships # dataset_type_id to dataset_type @@ -66,12 +76,14 @@ def genebuild_version(self): return next( (att.value for att in self.dataset_attributes if att.attribute.name == 'genebuild.last_geneset_update'), - next((att.value for att in self.dataset_attributes if att.attribute.name == 'genebuild.start_date'), None)) + next((att.value for att in self.dataset_attributes if att.attribute.name == 'genebuild.start_date'), + None)) else: # return Related genebuild version logger.debug(F"Related datasets! 
: {self.genome_datasets.datasets}") genebuild_ds = next( - (dataset for dataset in self.genome_datasets.datasets if dataset.dataset_type.name == 'genebuild'), None) + (dataset for dataset in self.genome_datasets.datasets if dataset.dataset_type.name == 'genebuild'), + None) if genebuild_ds: return genebuild_ds.genebuild_version else: @@ -128,4 +140,3 @@ class DatasetType(LoadAble, Base): datasets = relationship('Dataset', back_populates='dataset_type') # many to one relationships # none - diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index ec6e02eb..63c712b0 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -67,7 +67,7 @@ def test_update_dataset_attributes(self, multi_dbs): def test_create_dataset(self, multi_dbs): metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) - with (metadata_db.session_scope() as session): + with metadata_db.session_scope() as session: test_attributes = {"assembly.contig_n50": "test1", "assembly.total_genome_length": "test2"} test_genome_uuid = '48b1b849-3b73-4242-ae83-af2290aeb071' test_dataset_source = session.query(DatasetSource).filter( From c07296097e68da08971ec79597a9922482789af2 Mon Sep 17 00:00:00 2001 From: Marc Chakiachvili Date: Tue, 20 Feb 2024 15:46:09 +0000 Subject: [PATCH 12/30] Updated Dataset.py --- src/ensembl/production/metadata/api/models/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index b2dc79ff..14aec3b0 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -22,8 +22,6 @@ from ensembl.production.metadata.api.models.base import Base, LoadAble - - class Attribute(LoadAble, Base): __tablename__ = 'attribute' @@ -38,12 +36,14 @@ class Attribute(LoadAble, Base): # many to one relationships # none + class DatasetStatus(enum.Enum): SUBMITTED = 'Submitted' PROCESSING = 'Processing' PROCESSED = 'Processed' RELEASED = 'Released' + class Dataset(LoadAble, Base): __tablename__ = 'dataset' @@ -55,7 +55,7 @@ class Dataset(LoadAble, Base): created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) - status = Column(Enum(DatasetStatus), server_default=DatasetStatus.SUBMITTED) + status = Column('status', Enum(DatasetStatus), default=DatasetStatus.SUBMITTED) # One to many relationships # dataset_id to dataset attribute and genome dataset From 92eae493d244fa35527ba4981593544debc81f0d Mon Sep 17 00:00:00 2001 From: danielp Date: Fri, 23 Feb 2024 02:05:11 +0000 Subject: [PATCH 13/30] Refactored dataset factory to handle new logic --- .../metadata/api/hive/dataset_factory.py | 359 +++++++++--------- 1 file changed, 174 insertions(+), 185 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index fe074bb5..bc8480cc 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -50,182 +50,41 @@ class DatasetFactory: Retrieves a dataset by its UUID. """ - def __init__(self, session=None, metadata_uri=None): - """ - Initializes the DatasetFactory instance. 
- Parameters: - session (SQLAlchemy session, optional): An active database session. - metadata_uri (str, optional): URI for the metadata database. - Raises: - DatasetFactoryException: If neither session nor metadata_uri is provided. - """ - if session: - self.session = session - self.owns_session = False - else: - if metadata_uri is None: - raise DatasetFactoryException("session or metadata_uri are required") - self.owns_session = True - self.metadata_db = DBConnection(metadata_uri) - - # #TODO: Determine how to implement genome_uuid when we can have multiples of each dataset type per genome - # Actually it is probably acceptable now, as we only consider unreleased datasets. - def create_child_datasets(self, dataset_uuid=None, parent_type=None, child_type=None, dataset_attributes={}, - genome_uuid=None): - """ - Creates child datasets based on the provided parameters. Child datasets are created based on the type of parent - dataset, child dataset type, or associated genome UUID. The method enforces rules to prevent conflict in parameters. + def create_all_child_datasets(self, session, dataset_uuid): + # Retrieve the top-level dataset + top_level_dataset = self.get_dataset(session, dataset_uuid) + self._create_child_datasets_recursive(session, top_level_dataset) - Parameters: - dataset_uuid (str, optional): UUID of the parent dataset. - parent_type (str, optional): Type of the parent dataset. - child_type (str, optional): Type of the child dataset to be created. - dataset_attributes (dict, optional): Attributes to be assigned to the child dataset. - genome_uuid (str, optional): UUID of the genome associated with the datasets. + def _create_child_datasets_recursive(self, session, parent_dataset): + parent_dataset_type = session.query(DatasetType).filter( + DatasetType.dataset_type_id == parent_dataset.dataset_type_id).one() - Returns: - list: UUIDs of the created child datasets. - """ - if dataset_uuid and genome_uuid: - raise ValueError("Please only provide genome_uuid or dataset_uuid") - if parent_type and child_type: - raise ValueError("Please only provide child_type or parent_type") - - def fetch_parent_datasets(session): - parent_dataset_types = set() - potential_parent_datasets = [] - if dataset_uuid: - parent_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).first() - if parent_dataset: - parent_dataset_types.add(parent_dataset.dataset_type.name) - potential_parent_datasets.append(parent_dataset) - elif parent_type: - parent_dataset_types.add(parent_type) - potential_parent_datasets = session.query(Dataset).filter( - Dataset.dataset_type.has(name=parent_type), - Dataset.status != 'Released' - ).all() - elif genome_uuid: - genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).first() - if not genome: - raise ValueError("No genome found with the provided genome_uuid") - if not parent_type and not child_type: - raise ValueError("Genome_uuid requires either child type or parent type.") - if child_type: - # Alwalys go for the first one as dependencies will check the rest later. 
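Editor's note: the create_all_child_datasets / _create_child_datasets_recursive pair added in this patch replaces the branching create_child_datasets logic being removed here. A self-contained sketch of the traversal idea, using a hypothetical child-to-parent type table rather than the real dataset_type rows:

    # child type -> parent type; illustrative hierarchy only
    PARENT_OF = {"xref": "genebuild", "protein_features": "xref"}

    def create_all_children(parent_type, create=print):
        # Create a dataset for every type whose parent is the one just handled, then recurse
        for child, parent in PARENT_OF.items():
            if parent == parent_type:
                create(f"creating '{child}' dataset")
                create_all_children(child, create)

    create_all_children("genebuild")
    # -> creating 'xref' dataset
    # -> creating 'protein_features' dataset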
- new_type = session.query(DatasetType).filter(DatasetType.name == child_type).one() - parent_dataset_types.add(new_type.parent.split(';')[0]) - for genome_dataset in genome.genome_datasets: - if genome_dataset.dataset.status != 'Released' and genome_dataset.dataset.dataset_type.name in parent_dataset_types: - potential_parent_datasets.append(genome_dataset.dataset) - else: - raise ValueError("Either dataset_uuid, parent_type, or genome_uuid must be provided") - return parent_dataset_types, potential_parent_datasets - - def process_datasets(session, parent_dataset_types, potential_parent_datasets): - - child_datasets = [] - if child_type: - potential_child_types = [session.query(DatasetType).filter(DatasetType.name == child_type).first()] - else: - potential_child_types = session.query(DatasetType).filter( - DatasetType.parent.in_(parent_dataset_types)).all() - - for parent_dataset in potential_parent_datasets: - # I thought this was a good idea, but we would need different logic - # if parent_dataset.status == 'Processed': - for child_dataset_type in potential_child_types: - if check_existing_and_dependencies(session, parent_dataset, child_dataset_type): - parent_genome_uuid = parent_dataset.genome_datasets[0].genome.genome_uuid - parent_dataset_source = parent_dataset.dataset_source - new_dataset_uuid, new_dataset_attributes, new_genome_dataset = self.create_dataset( - session, parent_genome_uuid, parent_dataset_source, child_dataset_type, - dataset_attributes, child_dataset_type.name, child_dataset_type.name, None - ) - child_datasets.append(new_dataset_uuid) - return child_datasets - - def check_existing_and_dependencies(session, parent_dataset, child_dataset_type): - existing_datasets = session.query(Dataset).filter( - Dataset.dataset_type == child_dataset_type, - Dataset.genome_datasets.any(genome_id=parent_dataset.genome_datasets[0].genome_id), - Dataset.status.in_([DatasetStatus.SUBMITTED, DatasetStatus.PROCESSING, DatasetStatus.PROCESSED]) - ).count() - - if existing_datasets > 0: - return False # Skip if a similar dataset already exists - - dependencies = child_dataset_type.depends_on.split(';') if child_dataset_type.depends_on else [] - return all( - session.query(Dataset).filter( - Dataset.dataset_type.has(name=dep), - Dataset.status == 'Processed', - Dataset.genome_datasets.any(genome_id=parent_dataset.genome_datasets[0].genome_id) - ).count() > 0 for dep in dependencies - ) + # Find child dataset types for the parent dataset type + child_dataset_types = session.query(DatasetType).filter( + DatasetType.parent == parent_dataset_type.name).all() - if self.owns_session: - with self.metadata_db.session_scope() as session: - parent_dataset_types, potential_parent_datasets = fetch_parent_datasets(session) - return process_datasets(session, parent_dataset_types, potential_parent_datasets) - else: - session = self.session - parent_dataset_types, potential_parent_datasets = fetch_parent_datasets(session) - return process_datasets(session, parent_dataset_types, potential_parent_datasets) + for child_type in child_dataset_types: + # Example placeholders for dataset properties + genome_uuid = parent_dataset.genome_datasets.genome_id + dataset_source = parent_dataset.source + dataset_type = child_type + dataset_attributes = {} # Populate with appropriate attributes + name = dataset_type.name + label = f"Child of {parent_dataset.name}" + version = None - def get_parent_datasets(self, dataset_uuid): - """ - Retrieves the parent datasets of a specified dataset. 
If the dataset does not have a parent, - it returns the dataset itself and marks it as 'top_level'. + # Create the child dataset + child_dataset_uuid = self.create_dataset(session, genome_uuid, dataset_source, dataset_type, + dataset_attributes, name, label, version) - Parameters: - dataset_uuid (str): UUID of the dataset for which the parent datasets are to be found. + # Recursively create children of this new child dataset + child_dataset = self.get_dataset(session, child_dataset_uuid) + self._create_child_datasets_recursive(session, child_dataset) - Returns: - tuple: Two lists containing UUIDs and types of the parent datasets. - """ - parent_uuid = '' - - def query_parent_datasets(session, dataset_uuid): - dataset = self.get_dataset(session, dataset_uuid) - dataset_type = session.query(DatasetType).filter( - DatasetType.dataset_type_id == dataset.dataset_type_id).one() - if dataset_type.parent is None: - return None - parent_dataset_types = dataset_type.parent.split(';') - genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) - if not genome_id: - raise ValueError("No associated Genome found for the given dataset UUID") - - related_genome_datasets = session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( - GenomeDataset.genome_id == genome_id, - DatasetType.name.in_(parent_dataset_types) - ).all() - - for gd in related_genome_datasets: - parent_uuid.append(gd.dataset.dataset_uuid) - if self.owns_session: - with self.metadata_db.session_scope() as session: - query_parent_datasets(session) - else: - query_parent_datasets(self.session) - return parent_uuid - def fetch_top_level_parent(self, session, dataset_uuid): - - def get_top_level_parent(self, dataset_uuid, session=None, metadata_uri=None): - if session: - top_uuid = self._update_status(dataset_uuid) - elif metadata_uri: - metadata_db = DBConnection(metadata_uri) - with metadata_db.session_scope() as session: - top_uuid = self._update_status(dataset_uuid) - else: - raise DatasetFactoryException("session or metadata_uri are required") - return top_uuid def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, version): """ @@ -265,37 +124,167 @@ def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dat dataset_uuid = new_dataset.dataset_uuid return dataset_uuid, new_dataset_attributes, new_genome_dataset + def _query_parent_datasets(self, session, dataset_uuid): + dataset = self.get_dataset(session, dataset_uuid) + dataset_type = session.query(DatasetType).filter( + DatasetType.dataset_type_id == dataset.dataset_type_id).one() + if dataset_type.parent is None: + return None + parent_dataset_types = dataset_type.parent.split(';') + genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) + if not genome_id: + raise ValueError("No associated Genome found for the given dataset UUID") + + parent_genome_dataset = session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( + GenomeDataset.genome_id == genome_id, + DatasetType.name.in_(parent_dataset_types) + ).one() + parent_uuid = parent_genome_dataset.dataset.dataset_uuid + parent_status = parent_genome_dataset.dataset.status + return parent_uuid, parent_status - def _check_childrens_status(self, dataset): - #returns the lowest possible status (ex submitted) for all datasets that are children. + + + def get_parent_datasets(self, dataset_uuid, session=None, metadata_uri=None): + """ + Retrieves the parent datasets of a specified dataset. 
If the dataset does not have a parent, + it returns the dataset itself and marks it as 'top_level'. + + Parameters: + dataset_uuid (str): UUID of the dataset for which the parent datasets are to be found. + + Returns: + tuple: Two lists containing UUIDs and types of the parent datasets. + """ + parent_uuid = '' + if session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + return self.query_parent_datasets(self.session, dataset_uuid) + elif metadata_uri: + metadata_db = DBConnection(metadata_uri) + with metadata_db.session_scope() as session: + return self.query_parent_datasets(self.session, dataset_uuid) + else: + raise DatasetFactoryException("session or metadata_uri are required") + + def _query_top_level_parent(self, session, dataset_uuid): + current_uuid = dataset_uuid + while True: + parent_data = self._query_parent_datasets(session, current_uuid) + if parent_data is None: + return current_uuid + current_uuid = parent_data[0] + + + def _query_related_genome_by_type(self, session, dataset_uuid, dataset_type): + dataset = self.get_dataset(session, dataset_uuid) + genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) + if not genome_id: + raise ValueError("No associated Genome found for the given dataset UUID") + related_genome_dataset = session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( + GenomeDataset.genome_id == genome_id, + DatasetType.name == dataset_type + ).one() + related_uuid = related_genome_dataset.dataset.dataset_uuid + related_status = related_genome_dataset.dataset.status + return related_uuid, related_status + + def _query_child_datasets(self, session, dataset_uuid): + parent_dataset = self.get_dataset(session, dataset_uuid) + parent_dataset_type = session.query(DatasetType).filter( + DatasetType.dataset_type_id == parent_dataset.dataset_type_id).one() + child_dataset_types = session.query(DatasetType).filter( + DatasetType.parent == parent_dataset_type.name).all() + if not child_dataset_types: + return [] # Return an empty list if no child types are found + #This will break if we have multiple genome datasets for a single dataset, which is not currently the case. + genome_id = parent_dataset.genome_datasets.genome_id + if not genome_id: + raise ValueError("No associated Genome found for the given parent dataset UUID") + + child_datasets = [] + for child_type in child_dataset_types: + child_datasets.extend(session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( + GenomeDataset.genome_id == genome_id, + DatasetType.dataset_type_id == child_type.dataset_type_id + ).all()) + + child_data = [(ds.dataset.dataset_uuid, ds.dataset.status) for ds in child_datasets] + + return child_data + + def _query_all_child_datasets(self, session, parent_dataset_uuid): + # This method returns the child datasets for a given dataset + child_datasets = self._query_child_datasets(session, parent_dataset_uuid) + + all_child_datasets = [] + for child_uuid, child_status in child_datasets: + all_child_datasets.append((child_uuid, child_status)) + sub_children = self._query_all_child_datasets(session, child_uuid) + all_child_datasets.extend(sub_children) + + return all_child_datasets def _update_status(self, session, dataset_uuid, status): - #sbumitted to processing update all parents, all the way to top level. - #processing to processed. Check children. Don't update if they are still processing. Update. Recursively call this on parent #Processed to Released. Only accept top level. 
Check that all assembly and genebuild datsets (all the way down) are processed. # Then convert all to released. #Add a blocker and warning in here. - - #First Check if children have + current_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() if status == DatasetStatus.SUBMITTED: - parent_uuid = self.get_parent_dataset(session,dataset_uuid) - if parent_uuid != None - self._update_status(session, parent_uuid, Dataset.SUBMITTED) - #update all the way back. - elif status == DatasetStatus.PROCESSING: - #Update all the way back - elif status == DatasetStatus.PROCESSED: - #attempt update. See above - elif status == DatasetStatus.RELEASED: - #Update if all datasets in it's chain are processed, all genebuild and assembly are processed. Else return error. + #Update to SUBMITTED and all parents. + #Do not touch the children. + #This should only be called in times of strife and error. + current_dataset.status = DatasetStatus.SUBMITTED + parent_uuid, parent_status = self._query_parent_datasets(session,dataset_uuid) + if parent_uuid is not None: + self._update_status(session, parent_uuid, DatasetStatus.SUBMITTED) - parent_datasets = - dataset.status = status + elif status == DatasetStatus.PROCESSING: + #Update to PROCESSING and all parents. + #Do not touch the children. + current_dataset.status = DatasetStatus.PROCESSING + parent_uuid, parent_status = self._query_parent_datasets(session,dataset_uuid) + if parent_uuid is not None: + self._update_status(session, parent_uuid, DatasetStatus.PROCESSING) + elif status == DatasetStatus.PROCESSED: + #Get children + children_uuid = self._query_child_datasets(session, dataset_uuid) + new_status = DatasetStatus.PROCESSED + #Check to see if any are still processing or submitted + for child, child_status in children_uuid: + if child_status == DatasetStatus.PROCESSING or child_status == DatasetStatus.SUBMITTED: + new_status = DatasetStatus.PROCESSING + #Update current dataset if all the children are updated. + if new_status == DatasetStatus.PROCESSED: + current_dataset.status = DatasetStatus.PROCESSED + #Check if parent needs to be updated + parent_uuid = self._query_parent_datasets(session,dataset_uuid) + if parent_uuid is not None: + self._update_status(session,parent_uuid,DatasetStatus.PROCESSED) + elif status == DatasetStatus.RELEASED: + #Get current datasets chain top level. + top_level_uuid = self._query_top_level_parent(dataset_uuid) + #Check that all children and sub children etc + top_level_children = self._query_all_child_datasets(top_level_uuid) + genebuild_uuid = self._query_related_genome_by_type(session, dataset_uuid, "genebuild") + top_level_children.extend(self._query_all_child_datasets(genebuild_uuid)) + assembly_uuid = self._query_related_genome_by_type(session, dataset_uuid, "assembly") + top_level_children.extend(self._query_all_child_datasets(assembly_uuid)) + + # Update if all datasets in it's chain are processed, all genebuild and assembly are processed. Else return error. + for child_uuid, child_status in top_level_children: + if child_status is not DatasetStatus.RELEASED or child_status is not DatasetStatus.PROCESSED: + raise DatasetFactoryException(f"Dataset {child_uuid} is not released or processed. 
It is {child_status}") + top_level_children = self._query_all_child_datasets(top_level_uuid) + for child_uuid, child_status in top_level_children: + child_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == child_uuid).one() + child_dataset.status = DatasetStatus.RELEASED + else: + raise DatasetFactoryException(f"Dataset status: {status} is not a vallid status") - return status, parent_status def update_dataset_status(self, dataset_uuid, status=None, session=None, metadata_uri=None): # TODO: Check parent for progress and update parent if child From 11635252c7977fdeafcec94bbc628eb10a9c7eaf Mon Sep 17 00:00:00 2001 From: danielp Date: Mon, 26 Feb 2024 15:11:30 +0000 Subject: [PATCH 14/30] Refactored dataset factory to handle new logic --- .../metadata/api/hive/dataset_factory.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index bc8480cc..80a9ca56 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -225,8 +225,20 @@ def _query_all_child_datasets(self, session, parent_dataset_uuid): all_child_datasets.extend(sub_children) return all_child_datasets - def _update_status(self, session, dataset_uuid, status): + def _query_depends_on(self,session, dataset_uuid): + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one_or_none() + dataset_type = dataset.dataset_type + dependent_types = dataset_type.depends_on.split(',') if dataset_type.depends_on else [] + dependent_datasets_info = [] + for dtype in dependent_types: + new_uuid, new_status = self._query_related_genome_by_type(session,dataset_uuid,dtype) + dependent_datasets_info.append((new_uuid, new_status)) + return dependent_datasets_info + + + def _update_status(self, session, dataset_uuid, status): + #TODO: Return UUID, status #Processed to Released. Only accept top level. Check that all assembly and genebuild datsets (all the way down) are processed. # Then convert all to released. #Add a blocker and warning in here. current_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() @@ -242,6 +254,10 @@ def _update_status(self, session, dataset_uuid, status): elif status == DatasetStatus.PROCESSING: #Update to PROCESSING and all parents. #Do not touch the children. + + #TODO:Add check the depending + + current_dataset.status = DatasetStatus.PROCESSING parent_uuid, parent_status = self._query_parent_datasets(session,dataset_uuid) if parent_uuid is not None: @@ -286,7 +302,7 @@ def _update_status(self, session, dataset_uuid, status): - def update_dataset_status(self, dataset_uuid, status=None, session=None, metadata_uri=None): + def update_dataset_status(self, dataset_uuid, status, session=None, metadata_uri=None): # TODO: Check parent for progress and update parent if child """ Updates the status of a dataset identified by its UUID. 
The status is updated to the next logical state unless From 46836f986bad74a210d548ab9dbb49c1b8b1a078 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 27 Feb 2024 11:41:48 +0000 Subject: [PATCH 15/30] Refactored dataset factory and cleaned methods --- .../metadata/api/hive/dataset_factory.py | 338 ++++++++---------- 1 file changed, 148 insertions(+), 190 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 80a9ca56..46fb0132 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -13,50 +13,104 @@ from ensembl.database import DBConnection from ensembl.production.metadata.api.exceptions import * -from ensembl.production.metadata.api.models import Dataset, Attribute, DatasetAttribute, Genome, GenomeDataset, \ - DatasetType, DatasetSource, DatasetStatus +from ensembl.production.metadata.api.models import Dataset, Genome, GenomeDataset, \ + DatasetType, DatasetStatus from sqlalchemy.sql import func import uuid from ensembl.production.metadata.updater.updater_utils import update_attributes class DatasetFactory: - """ - A class for interacting with the ensembl_genome_metadata database, specifically for modifying the dataset and - dataset attribute tables. - - Attributes - ---------- - session : SQLAlchemy session - An active SQLAlchemy session for database operations. If not provided, a new session is created using metadata_uri. - metadata_db : DBConnection - A database connection object used when a new session is created. - owns_session : bool - Flag to indicate whether this class instance owns the session (True) or if it was provided by the user (False). - - Methods - ------- - create_child_datasets(dataset_uuid, parent_type, child_type, dataset_attributes, genome_uuid): - Creates child datasets based on various parameters like dataset_uuid, parent_type, child_type, etc. - get_parent_datasets(dataset_uuid): - Retrieves parent datasets for a given dataset UUID. - create_dataset(session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, version): - Creates a new dataset and associates it with a genome. - update_dataset_status(dataset_uuid, status): - Updates the status of a dataset identified by its UUID. - update_dataset_attributes(dataset_uuid, attribut_dict): - Updates the attributes of a dataset identified by its UUID. - get_dataset(session, dataset_uuid): - Retrieves a dataset by its UUID. 
- """ - - def create_all_child_datasets(self, session, dataset_uuid): # Retrieve the top-level dataset - top_level_dataset = self.get_dataset(session, dataset_uuid) + top_level_dataset = self._get_dataset(session, dataset_uuid) self._create_child_datasets_recursive(session, top_level_dataset) + def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, + version): + new_dataset = Dataset( + dataset_uuid=str(uuid.uuid4()), + dataset_type=dataset_type, # Must be an object returned from the current session + name=name, + version=version, + label=label, + created=func.now(), + dataset_source=dataset_source, # Must + status='Submitted', + ) + genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).one() + new_genome_dataset = GenomeDataset( + genome=genome, + dataset=new_dataset, + is_current=False, + ) + new_dataset_attributes = update_attributes(new_dataset, dataset_attributes, session) + session.add(new_genome_dataset) + dataset_uuid = new_dataset.dataset_uuid + return dataset_uuid, new_dataset_attributes, new_genome_dataset + + def get_parent_datasets(self, dataset_uuid, session=None, metadata_uri=None): + if session: + return self._query_parent_datasets(session, dataset_uuid) + elif metadata_uri: + metadata_db = DBConnection(metadata_uri) + with metadata_db.session_scope() as session: + return self._query_parent_datasets(session, dataset_uuid) + else: + raise DatasetFactoryException("session or metadata_uri are required") + + def update_dataset_status(self, dataset_uuid, status, **kwargs): + updated_datasets = [(dataset_uuid, status)] + session = kwargs.get('session') + metadata_uri = kwargs.get('metadata_uri') + attribut_dict = kwargs.get('attribut_dict') + if session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + updated_datasets = self._update_status(dataset, status) + if attribut_dict: + updated_datasets = self._update_status(dataset, status) + elif metadata_uri: + metadata_db = DBConnection(metadata_uri) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + if attribut_dict: + updated_datasets = self._update_status(dataset, status) + else: + raise DatasetFactoryException("session or metadata_uri are required") + return updated_datasets + + def update_dataset_attributes(self, dataset_uuid, attribut_dict, **kwargs): + session = kwargs.get('session') + metadata_uri = kwargs.get('metadata_uri') + + if not isinstance(attribut_dict, dict): + raise TypeError("attribut_dict must be a dictionary") + if session: + dataset = self._get_dataset(session, dataset_uuid) + dataset_attributes = update_attributes(dataset, attribut_dict, session) + return dataset_attributes + else: + metadata_db = DBConnection(metadata_uri) + with metadata_db.session_scope() as session: + dataset = self._get_dataset(session, dataset_uuid) + dataset_attributes = update_attributes(dataset, attribut_dict, session) + return dataset_attributes + + def get_genomes_by_status_and_type(self, status, type, **kwargs): + + session = kwargs.get('session') + metadata_uri = kwargs.get('metadata_uri') + + if session: + genome_data = self._query_genomes_by_status_and_type(session, status, type) + return genome_data + else: + metadata_db = DBConnection(metadata_uri) + with metadata_db.session_scope() as session: + genome_data = self._query_genomes_by_status_and_type(session, status, type) + return genome_data + def 
_create_child_datasets_recursive(self, session, parent_dataset): parent_dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == parent_dataset.dataset_type_id).one() @@ -80,52 +134,11 @@ def _create_child_datasets_recursive(self, session, parent_dataset): dataset_attributes, name, label, version) # Recursively create children of this new child dataset - child_dataset = self.get_dataset(session, child_dataset_uuid) + child_dataset = self._get_dataset(session, child_dataset_uuid) self._create_child_datasets_recursive(session, child_dataset) - - - def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, - version): - """ - Creates a new dataset record and associates it with a specific genome. The new dataset is added to the database session. - - Parameters: - session (SQLAlchemy session): An active database session. - genome_uuid (str): UUID of the genome to associate the dataset with. - dataset_source (DatasetSource): The source of the dataset. - dataset_type (DatasetType): The type of the dataset. - dataset_attributes (dict): Attributes to assign to the dataset. - name (str): Name of the dataset. - label (str): Label for the dataset. - version (str): Version of the dataset. - - Returns: - tuple: Dataset UUID, dataset attributes, and the new genome-dataset association. - """ - new_dataset = Dataset( - dataset_uuid=str(uuid.uuid4()), - dataset_type=dataset_type, # Must be an object returned from the current session - name=name, - version=version, - label=label, - created=func.now(), - dataset_source=dataset_source, # Must - status='Submitted', - ) - genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).one() - new_genome_dataset = GenomeDataset( - genome=genome, - dataset=new_dataset, - is_current=False, - ) - new_dataset_attributes = update_attributes(new_dataset, dataset_attributes, session) - session.add(new_genome_dataset) - dataset_uuid = new_dataset.dataset_uuid - return dataset_uuid, new_dataset_attributes, new_genome_dataset - def _query_parent_datasets(self, session, dataset_uuid): - dataset = self.get_dataset(session, dataset_uuid) + dataset = self._get_dataset(session, dataset_uuid) dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == dataset.dataset_type_id).one() if dataset_type.parent is None: @@ -143,31 +156,6 @@ def _query_parent_datasets(self, session, dataset_uuid): parent_status = parent_genome_dataset.dataset.status return parent_uuid, parent_status - - - - def get_parent_datasets(self, dataset_uuid, session=None, metadata_uri=None): - """ - Retrieves the parent datasets of a specified dataset. If the dataset does not have a parent, - it returns the dataset itself and marks it as 'top_level'. - - Parameters: - dataset_uuid (str): UUID of the dataset for which the parent datasets are to be found. - - Returns: - tuple: Two lists containing UUIDs and types of the parent datasets. 
- """ - parent_uuid = '' - if session: - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - return self.query_parent_datasets(self.session, dataset_uuid) - elif metadata_uri: - metadata_db = DBConnection(metadata_uri) - with metadata_db.session_scope() as session: - return self.query_parent_datasets(self.session, dataset_uuid) - else: - raise DatasetFactoryException("session or metadata_uri are required") - def _query_top_level_parent(self, session, dataset_uuid): current_uuid = dataset_uuid while True: @@ -176,9 +164,8 @@ def _query_top_level_parent(self, session, dataset_uuid): return current_uuid current_uuid = parent_data[0] - def _query_related_genome_by_type(self, session, dataset_uuid, dataset_type): - dataset = self.get_dataset(session, dataset_uuid) + dataset = self._get_dataset(session, dataset_uuid) genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) if not genome_id: raise ValueError("No associated Genome found for the given dataset UUID") @@ -191,14 +178,14 @@ def _query_related_genome_by_type(self, session, dataset_uuid, dataset_type): return related_uuid, related_status def _query_child_datasets(self, session, dataset_uuid): - parent_dataset = self.get_dataset(session, dataset_uuid) + parent_dataset = self._get_dataset(session, dataset_uuid) parent_dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == parent_dataset.dataset_type_id).one() child_dataset_types = session.query(DatasetType).filter( DatasetType.parent == parent_dataset_type.name).all() if not child_dataset_types: return [] # Return an empty list if no child types are found - #This will break if we have multiple genome datasets for a single dataset, which is not currently the case. + # This will break if we have multiple genome datasets for a single dataset, which is not currently the case. genome_id = parent_dataset.genome_datasets.genome_id if not genome_id: raise ValueError("No associated Genome found for the given parent dataset UUID") @@ -226,63 +213,66 @@ def _query_all_child_datasets(self, session, parent_dataset_uuid): return all_child_datasets - def _query_depends_on(self,session, dataset_uuid): + def _query_depends_on(self, session, dataset_uuid): dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one_or_none() dataset_type = dataset.dataset_type dependent_types = dataset_type.depends_on.split(',') if dataset_type.depends_on else [] dependent_datasets_info = [] for dtype in dependent_types: - new_uuid, new_status = self._query_related_genome_by_type(session,dataset_uuid,dtype) + new_uuid, new_status = self._query_related_genome_by_type(session, dataset_uuid, dtype) dependent_datasets_info.append((new_uuid, new_status)) return dependent_datasets_info - def _update_status(self, session, dataset_uuid, status): - #TODO: Return UUID, status - #Processed to Released. Only accept top level. Check that all assembly and genebuild datsets (all the way down) are processed. + updated_datasets = [] + # Processed to Released. Only accept top level. Check that all assembly and genebuild datsets (all the way down) are processed. # Then convert all to released. #Add a blocker and warning in here. current_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() if status == DatasetStatus.SUBMITTED: - #Update to SUBMITTED and all parents. - #Do not touch the children. - #This should only be called in times of strife and error. + # Update to SUBMITTED and all parents. 
+ # Do not touch the children. + # This should only be called in times of strife and error. current_dataset.status = DatasetStatus.SUBMITTED - parent_uuid, parent_status = self._query_parent_datasets(session,dataset_uuid) + parent_uuid, parent_status = self._query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: self._update_status(session, parent_uuid, DatasetStatus.SUBMITTED) elif status == DatasetStatus.PROCESSING: - #Update to PROCESSING and all parents. - #Do not touch the children. - - #TODO:Add check the depending - - + # Update to PROCESSING and all parents. + # Do not touch the children. + + # Check the dependents + dependents = self._query_depends_on(session, dataset_uuid) + for uuid, dep_status in dependents: + if dep_status != DatasetStatus.PROCESSED or dep_status != DatasetStatus.RELEASED: + return dataset_uuid, status current_dataset.status = DatasetStatus.PROCESSING - parent_uuid, parent_status = self._query_parent_datasets(session,dataset_uuid) + parent_uuid, parent_status = self._query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: self._update_status(session, parent_uuid, DatasetStatus.PROCESSING) elif status == DatasetStatus.PROCESSED: - #Get children + # Get children children_uuid = self._query_child_datasets(session, dataset_uuid) new_status = DatasetStatus.PROCESSED - #Check to see if any are still processing or submitted + # Check to see if any are still processing or submitted for child, child_status in children_uuid: - if child_status == DatasetStatus.PROCESSING or child_status == DatasetStatus.SUBMITTED: + # Not positive on the buisness rule here. Should we limit processed to the parents that have all children finished? + # if child_status == DatasetStatus.PROCESSING or child_status == DatasetStatus.SUBMITTED: + if child_status == DatasetStatus.PROCESSING: new_status = DatasetStatus.PROCESSING - #Update current dataset if all the children are updated. + # Update current dataset if all the children are updated. if new_status == DatasetStatus.PROCESSED: current_dataset.status = DatasetStatus.PROCESSED - #Check if parent needs to be updated - parent_uuid = self._query_parent_datasets(session,dataset_uuid) + # Check if parent needs to be updated + parent_uuid = self._query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: - self._update_status(session,parent_uuid,DatasetStatus.PROCESSED) + self._update_status(session, parent_uuid, DatasetStatus.PROCESSED) elif status == DatasetStatus.RELEASED: - #Get current datasets chain top level. + # Get current datasets chain top level. top_level_uuid = self._query_top_level_parent(dataset_uuid) - #Check that all children and sub children etc + # Check that all children and sub children etc top_level_children = self._query_all_child_datasets(top_level_uuid) genebuild_uuid = self._query_related_genome_by_type(session, dataset_uuid, "genebuild") top_level_children.extend(self._query_all_child_datasets(genebuild_uuid)) @@ -292,7 +282,8 @@ def _update_status(self, session, dataset_uuid, status): # Update if all datasets in it's chain are processed, all genebuild and assembly are processed. Else return error. for child_uuid, child_status in top_level_children: if child_status is not DatasetStatus.RELEASED or child_status is not DatasetStatus.PROCESSED: - raise DatasetFactoryException(f"Dataset {child_uuid} is not released or processed. It is {child_status}") + raise DatasetFactoryException( + f"Dataset {child_uuid} is not released or processed. 
It is {child_status}") top_level_children = self._query_all_child_datasets(top_level_uuid) for child_uuid, child_status in top_level_children: child_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == child_uuid).one() @@ -300,65 +291,32 @@ def _update_status(self, session, dataset_uuid, status): else: raise DatasetFactoryException(f"Dataset status: {status} is not a vallid status") + updated_datasets.append((current_dataset.dataset_uuid, current_dataset.status)) + return updated_datasets + def _get_dataset(self, session, dataset_uuid): + return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - def update_dataset_status(self, dataset_uuid, status, session=None, metadata_uri=None): - # TODO: Check parent for progress and update parent if child - """ - Updates the status of a dataset identified by its UUID. The status is updated to the next logical state unless - a specific state is provided. - - Parameters: - dataset_uuid (str): UUID of the dataset to update. - status (str, optional): The new status to set for the dataset. If not provided, status is advanced to the next logical state. - - Returns: - tuple: Dataset UUID and the updated status. - """ - if session: - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - updated_status, parent_status = self._update_status(dataset, status) - elif metadata_uri: - metadata_db = DBConnection(metadata_uri) - with metadata_db.session_scope() as session: - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - updated_status, parent_status = self._update_status(dataset, status) - else: - raise DatasetFactoryException("session or metadata_uri are required") - return dataset_uuid, updated_status, parent_status - - def update_dataset_attributes(self, dataset_uuid, attribut_dict): - """ - Updates the attributes of a dataset identified by its UUID. The attributes to be updated are provided as a dictionary. - - Parameters: - dataset_uuid (str): UUID of the dataset to update. - attribute_dict (dict): A dictionary containing attribute names and their new values. - - Returns: - list: Updated dataset attributes. - """ - if not isinstance(attribut_dict, dict): - raise TypeError("attribut_dict must be a dictionary") - if self.owns_session: - with self.metadata_db.session_scope() as session: - dataset = self.get_dataset(session, dataset_uuid) - dataset_attributes = update_attributes(dataset, attribut_dict, session) - return dataset_attributes - else: - dataset = self.get_dataset(self.session, dataset_uuid) - dataset_attributes = update_attributes(dataset, attribut_dict, self.session) - return dataset_attributes - - def get_dataset(self, session, dataset_uuid): - """ - Retrieves a dataset by its UUID using an active database session. - - Parameters: - session (SQLAlchemy session): An active database session. - dataset_uuid (str): UUID of the dataset to retrieve. 
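For completeness, a usage sketch of the attribute update path at this revision. The constructor argument, connection URI and attribute values are placeholders, and the dataset UUID is the one used in the test suite, so treat this as illustrative rather than canonical.

from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory

# Assumes the constructor of this revision, which accepts a metadata_uri (placeholder below).
factory = DatasetFactory(metadata_uri="mysql://user:pass@host:3306/ensembl_genome_metadata")

# Keys follow the attribute table naming; the values here are placeholders.
attributes = {
    "assembly.contig_n50": "51842",
    "assembly.total_genome_length": "2745186602",
}
factory.update_dataset_attributes("fc5d3e13-340c-4e2a-9f49-256fc319331e", attributes)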
+ def _query_genomes_by_status_and_type(self, session, status, type): + if session is None: + raise ValueError("Session is not provided") + + # Filter by Dataset status and DatasetType name + query = session.query( + Genome.genome_uuid, + Genome.production_name, + Dataset.dataset_uuid + ).join( + GenomeDataset, Genome.genome_id == GenomeDataset.genome_id + ).join( + Dataset, GenomeDataset.dataset_id == Dataset.dataset_id + ).join( + DatasetType, Dataset.dataset_type_id == DatasetType.dataset_type_id + ).filter( + Dataset.status == status, + DatasetType.name == type + ) - Returns: - Dataset: The dataset object corresponding to the provided UUID. - """ - return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + # Execute query and fetch results + results = query.all() + return results From 97d8d1d1191a52cb610f066134bb21e27d0e0813 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 27 Feb 2024 13:37:40 +0000 Subject: [PATCH 16/30] Updated DatasetStatus Enum to string. --- .../metadata/api/hive/dataset_factory.py | 65 ++++++++++--------- .../production/metadata/api/models/dataset.py | 10 +-- .../sample/ensembl_metadata/dataset_type.txt | 30 +++++---- 3 files changed, 59 insertions(+), 46 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 46fb0132..32d86354 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -25,7 +25,7 @@ class DatasetFactory: def create_all_child_datasets(self, session, dataset_uuid): # Retrieve the top-level dataset top_level_dataset = self._get_dataset(session, dataset_uuid) - self._create_child_datasets_recursive(session, top_level_dataset) + self.__create_child_datasets_recursive(session, top_level_dataset) def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, version): @@ -50,13 +50,15 @@ def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dat dataset_uuid = new_dataset.dataset_uuid return dataset_uuid, new_dataset_attributes, new_genome_dataset - def get_parent_datasets(self, dataset_uuid, session=None, metadata_uri=None): + def get_parent_datasets(self, dataset_uuid, **kwargs): + session = kwargs.get('session') + metadata_uri = kwargs.get('metadata_uri') if session: - return self._query_parent_datasets(session, dataset_uuid) + return self.__query_parent_datasets(session, dataset_uuid) elif metadata_uri: metadata_db = DBConnection(metadata_uri) with metadata_db.session_scope() as session: - return self._query_parent_datasets(session, dataset_uuid) + return self.__query_parent_datasets(session, dataset_uuid) else: raise DatasetFactoryException("session or metadata_uri are required") @@ -64,17 +66,17 @@ def update_dataset_status(self, dataset_uuid, status, **kwargs): updated_datasets = [(dataset_uuid, status)] session = kwargs.get('session') metadata_uri = kwargs.get('metadata_uri') - attribut_dict = kwargs.get('attribut_dict') + attribute_dict = kwargs.get('attribut_dict') if session: dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() updated_datasets = self._update_status(dataset, status) - if attribut_dict: + if attribute_dict: updated_datasets = self._update_status(dataset, status) elif metadata_uri: metadata_db = DBConnection(metadata_uri) with metadata_db.session_scope() as session: dataset = session.query(Dataset).filter(Dataset.dataset_uuid 
== dataset_uuid).one() - if attribut_dict: + if attribute_dict: updated_datasets = self._update_status(dataset, status) else: raise DatasetFactoryException("session or metadata_uri are required") @@ -83,7 +85,6 @@ def update_dataset_status(self, dataset_uuid, status, **kwargs): def update_dataset_attributes(self, dataset_uuid, attribut_dict, **kwargs): session = kwargs.get('session') metadata_uri = kwargs.get('metadata_uri') - if not isinstance(attribut_dict, dict): raise TypeError("attribut_dict must be a dictionary") if session: @@ -98,10 +99,8 @@ def update_dataset_attributes(self, dataset_uuid, attribut_dict, **kwargs): return dataset_attributes def get_genomes_by_status_and_type(self, status, type, **kwargs): - session = kwargs.get('session') metadata_uri = kwargs.get('metadata_uri') - if session: genome_data = self._query_genomes_by_status_and_type(session, status, type) return genome_data @@ -111,13 +110,13 @@ def get_genomes_by_status_and_type(self, status, type, **kwargs): genome_data = self._query_genomes_by_status_and_type(session, status, type) return genome_data - def _create_child_datasets_recursive(self, session, parent_dataset): + def __create_child_datasets_recursive(self, session, parent_dataset): parent_dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == parent_dataset.dataset_type_id).one() # Find child dataset types for the parent dataset type child_dataset_types = session.query(DatasetType).filter( - DatasetType.parent == parent_dataset_type.name).all() + DatasetType.parent == parent_dataset_type.id).all() for child_type in child_dataset_types: # Example placeholders for dataset properties @@ -135,44 +134,50 @@ def _create_child_datasets_recursive(self, session, parent_dataset): # Recursively create children of this new child dataset child_dataset = self._get_dataset(session, child_dataset_uuid) - self._create_child_datasets_recursive(session, child_dataset) + self.__create_child_datasets_recursive(session, child_dataset) - def _query_parent_datasets(self, session, dataset_uuid): + def __query_parent_datasets(self, session, dataset_uuid): dataset = self._get_dataset(session, dataset_uuid) dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == dataset.dataset_type_id).one() if dataset_type.parent is None: return None - parent_dataset_types = dataset_type.parent.split(';') + parent_dataset_type = dataset_type.parent genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) if not genome_id: raise ValueError("No associated Genome found for the given dataset UUID") parent_genome_dataset = session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( GenomeDataset.genome_id == genome_id, - DatasetType.name.in_(parent_dataset_types) - ).one() + DatasetType.id == parent_dataset_type).one() parent_uuid = parent_genome_dataset.dataset.dataset_uuid parent_status = parent_genome_dataset.dataset.status return parent_uuid, parent_status - def _query_top_level_parent(self, session, dataset_uuid): + def __query_top_level_parent(self, session, dataset_uuid): current_uuid = dataset_uuid while True: - parent_data = self._query_parent_datasets(session, current_uuid) + parent_data = self.__query_parent_datasets(session, current_uuid) if parent_data is None: return current_uuid current_uuid = parent_data[0] - def _query_related_genome_by_type(self, session, dataset_uuid, dataset_type): + def __query_related_genome_by_type(self, session, dataset_uuid, dataset_type): dataset = self._get_dataset(session, 
dataset_uuid) genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) if not genome_id: raise ValueError("No associated Genome found for the given dataset UUID") + + # Determine if dataset_type is an ID or a name + if isinstance(dataset_type, int) or (isinstance(dataset_type, str) and dataset_type.isdigit()): + # dataset_type is treated as an ID + filter_condition = (GenomeDataset.genome_id == genome_id, Dataset.dataset_type_id == dataset_type) + else: + # dataset_type is treated as a name + filter_condition = (GenomeDataset.genome_id == genome_id, DatasetType.name == dataset_type) + related_genome_dataset = session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( - GenomeDataset.genome_id == genome_id, - DatasetType.name == dataset_type - ).one() + *filter_condition).one() related_uuid = related_genome_dataset.dataset.dataset_uuid related_status = related_genome_dataset.dataset.status return related_uuid, related_status @@ -219,7 +224,7 @@ def _query_depends_on(self, session, dataset_uuid): dependent_types = dataset_type.depends_on.split(',') if dataset_type.depends_on else [] dependent_datasets_info = [] for dtype in dependent_types: - new_uuid, new_status = self._query_related_genome_by_type(session, dataset_uuid, dtype) + new_uuid, new_status = self.__query_related_genome_by_type(session, dataset_uuid, dtype) dependent_datasets_info.append((new_uuid, new_status)) return dependent_datasets_info @@ -233,7 +238,7 @@ def _update_status(self, session, dataset_uuid, status): # Do not touch the children. # This should only be called in times of strife and error. current_dataset.status = DatasetStatus.SUBMITTED - parent_uuid, parent_status = self._query_parent_datasets(session, dataset_uuid) + parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: self._update_status(session, parent_uuid, DatasetStatus.SUBMITTED) @@ -247,7 +252,7 @@ def _update_status(self, session, dataset_uuid, status): if dep_status != DatasetStatus.PROCESSED or dep_status != DatasetStatus.RELEASED: return dataset_uuid, status current_dataset.status = DatasetStatus.PROCESSING - parent_uuid, parent_status = self._query_parent_datasets(session, dataset_uuid) + parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: self._update_status(session, parent_uuid, DatasetStatus.PROCESSING) @@ -265,18 +270,18 @@ def _update_status(self, session, dataset_uuid, status): if new_status == DatasetStatus.PROCESSED: current_dataset.status = DatasetStatus.PROCESSED # Check if parent needs to be updated - parent_uuid = self._query_parent_datasets(session, dataset_uuid) + parent_uuid = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: self._update_status(session, parent_uuid, DatasetStatus.PROCESSED) elif status == DatasetStatus.RELEASED: # Get current datasets chain top level. 
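__query_related_genome_by_type accepts either a numeric dataset_type_id or a dataset type name and builds its filter accordingly. The dispatch reduces to the check below, shown as plain Python so it can be read in isolation; the helper name and return values are illustrative.

def type_filter_kind(dataset_type):
    # Mirrors the isinstance test above: ints and digit-only strings are treated
    # as dataset_type_id values, anything else as a DatasetType.name.
    if isinstance(dataset_type, int) or (isinstance(dataset_type, str) and dataset_type.isdigit()):
        return "filter by dataset_type_id"
    return "filter by DatasetType.name"

assert type_filter_kind(2) == "filter by dataset_type_id"
assert type_filter_kind("2") == "filter by dataset_type_id"
assert type_filter_kind("genebuild") == "filter by DatasetType.name"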
- top_level_uuid = self._query_top_level_parent(dataset_uuid) + top_level_uuid = self.__query_top_level_parent(dataset_uuid) # Check that all children and sub children etc top_level_children = self._query_all_child_datasets(top_level_uuid) - genebuild_uuid = self._query_related_genome_by_type(session, dataset_uuid, "genebuild") + genebuild_uuid = self.__query_related_genome_by_type(session, dataset_uuid, "genebuild") top_level_children.extend(self._query_all_child_datasets(genebuild_uuid)) - assembly_uuid = self._query_related_genome_by_type(session, dataset_uuid, "assembly") + assembly_uuid = self.__query_related_genome_by_type(session, dataset_uuid, "assembly") top_level_children.extend(self._query_all_child_datasets(assembly_uuid)) # Update if all datasets in it's chain are processed, all genebuild and assembly are processed. Else return error. diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 14aec3b0..7874fdb5 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -38,10 +38,10 @@ class Attribute(LoadAble, Base): class DatasetStatus(enum.Enum): - SUBMITTED = 'Submitted' - PROCESSING = 'Processing' - PROCESSED = 'Processed' - RELEASED = 'Released' + Submitted = 'Submitted' + Processing = 'Processing' + Processed = 'Processed' + Released = 'Released' class Dataset(LoadAble, Base): @@ -55,7 +55,7 @@ class Dataset(LoadAble, Base): created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) - status = Column('status', Enum(DatasetStatus), default=DatasetStatus.SUBMITTED) + status = Column('status', Enum(DatasetStatus), default=DatasetStatus.Submitted) # One to many relationships # dataset_id to dataset attribute and genome dataset diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt index dcc1eb8c..b613a798 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt @@ -1,14 +1,22 @@ 1 assembly Genomic assembly Core Annotation Compilation of sequences for a genome \N \N \N \N 2 genebuild Genomic Build Core Annotation Genomic annotations for an assembly \N \N \N \N -3 variation Variation Data Variation Annotation Short variant data for rattus_norvegicus \N \N \N \N -4 evidence Variation Evidence Variation Annotation Variation Annotation \N \N \N \N -5 regulation_build Regulations Regulatory Annotation Regulatory Annotation \N \N \N \N -6 homologies Comparative homologies Comparative Annotation Comparative Annotation \N genebuild \N \N +3 variation mRatBN7.2 Variation Annotation Short variant data for rattus_norvegicus \N \N \N \N +4 evidence Variation Evidence Variation Annotation \N \N \N \N \N +5 regulation_build Regulations Regulatory Annotation \N \N \N \N \N +6 homologies Comparative homologies Comparative Annotation \N \N \N \N \N 7 regulatory_features Regulatory Annotation Regulatory Annotation Regulatory annotation for an assembly \N \N \N \N -8 xref External References Core Annotation Data obtained from external sources and their links \N genebuild \N \N -9 protein_features Protein Features Core Annotation Annotation for 
proteins from external sources \N xref \N \N -10 appris APPRIS Core Annotation Data obtained from APPRIS \N xref protein_features \N -11 goa Gene Ontology Annotation Dumps Comparative Annotation Ontology Dumps \N homologies \N \N -12 gpad Gene Ontology Annotation Loading Comparative Annotation Ontology Loading \N goa \N \N -13 gene_name Gene Name Projection Comparative Annotation Gene Name Projection and Update \N gpad;homologies \N \N -14 ftp_dumps FTP Dumps Core Annotation FTP Dumps \N protein_features;appris;gene_name \N \N \ No newline at end of file +8 xrefs External References Production Compute Xref genome annotation for Genebuild \N 2 \N \N +9 protein_features Protein Features Production Compute Interpro scan run against proteins \N 2 8 \N +10 alpha_fold AlphaFold Production Compute AlphaFold compute against proteins \N 2 9 \N +11 checksums Checkums compute Production Compute Compute DNA sequences checksums \N 2 \N \N +12 refget_load Refget Loading Production Compute Refeget database provisioning \N 2 11 \N +13 compara_load Compara Data Loading Production Release Preparation Load MongoDB homologies \N 6 15 \N +14 search_dumps Data dumps for THOAS Production Release Preparation Dumps flat file for THOAS loading \N 2 1,8,9 \N +15 compara_compute Compute homologie database Production Compute Compute genome homologies database \N 6 \N \N +16 ftp_dumps FTP File dumps Production Release Preparation Dumps all FTP File format from genebuild \N 2 1,8,9 \N +17 compara_dumps Homologies file dumps Production Compute Dumped homologies tsv files \N 6 15 \N +18 blast Blast file dumps Production Compute Dumps blast indexed files \N 2 \N \N +20 variation_track Variation Track API update Production Release Preparation Load Variation Track API \N 3 \N \N +21 genome_browser_track Genebuild Track API update Production Release Preparation Load Genebuild track API \N 2 \N \N +22 regulation_track Regulation Track API update Production Release Preparation Load Regulation Track API \N 7 \N \N +23 thoas_load Thoas Loading Production Release Preparation Load MongoDB THOAS collection \N 2 11,12,14 \N \ No newline at end of file From 1dadb4512c7a7c8225407ed36cfe38134aac6934 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 27 Feb 2024 13:44:09 +0000 Subject: [PATCH 17/30] Updated version --- VERSION | 2 +- .../production/metadata/api/hive/dataset_factory.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/VERSION b/VERSION index 359a5b95..10bf840e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.0 \ No newline at end of file +2.0.1 \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 32d86354..1bbdc939 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -170,10 +170,8 @@ def __query_related_genome_by_type(self, session, dataset_uuid, dataset_type): # Determine if dataset_type is an ID or a name if isinstance(dataset_type, int) or (isinstance(dataset_type, str) and dataset_type.isdigit()): - # dataset_type is treated as an ID filter_condition = (GenomeDataset.genome_id == genome_id, Dataset.dataset_type_id == dataset_type) else: - # dataset_type is treated as a name filter_condition = (GenomeDataset.genome_id == genome_id, DatasetType.name == dataset_type) related_genome_dataset = session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( @@ -182,12 +180,12 @@ def 
__query_related_genome_by_type(self, session, dataset_uuid, dataset_type): related_status = related_genome_dataset.dataset.status return related_uuid, related_status - def _query_child_datasets(self, session, dataset_uuid): + def __query_child_datasets(self, session, dataset_uuid): parent_dataset = self._get_dataset(session, dataset_uuid) parent_dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == parent_dataset.dataset_type_id).one() child_dataset_types = session.query(DatasetType).filter( - DatasetType.parent == parent_dataset_type.name).all() + DatasetType.parent == parent_dataset_type.dataset_type_id).all() if not child_dataset_types: return [] # Return an empty list if no child types are found # This will break if we have multiple genome datasets for a single dataset, which is not currently the case. @@ -208,7 +206,7 @@ def _query_child_datasets(self, session, dataset_uuid): def _query_all_child_datasets(self, session, parent_dataset_uuid): # This method returns the child datasets for a given dataset - child_datasets = self._query_child_datasets(session, parent_dataset_uuid) + child_datasets = self.__query_child_datasets(session, parent_dataset_uuid) all_child_datasets = [] for child_uuid, child_status in child_datasets: @@ -258,7 +256,7 @@ def _update_status(self, session, dataset_uuid, status): elif status == DatasetStatus.PROCESSED: # Get children - children_uuid = self._query_child_datasets(session, dataset_uuid) + children_uuid = self.__query_child_datasets(session, dataset_uuid) new_status = DatasetStatus.PROCESSED # Check to see if any are still processing or submitted for child, child_status in children_uuid: From b201dad0a6667c8a8b0ee5a71e23b2508801ee93 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 27 Feb 2024 13:58:08 +0000 Subject: [PATCH 18/30] Fixed dataset factory to use names rather than int --- .../metadata/api/hive/dataset_factory.py | 85 +++++++++---------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 1bbdc939..5e827f0b 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -24,7 +24,7 @@ class DatasetFactory: def create_all_child_datasets(self, session, dataset_uuid): # Retrieve the top-level dataset - top_level_dataset = self._get_dataset(session, dataset_uuid) + top_level_dataset = self.__get_dataset(session, dataset_uuid) self.__create_child_datasets_recursive(session, top_level_dataset) def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, @@ -69,15 +69,15 @@ def update_dataset_status(self, dataset_uuid, status, **kwargs): attribute_dict = kwargs.get('attribut_dict') if session: dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - updated_datasets = self._update_status(dataset, status) + updated_datasets = self.__update_status(dataset, status) if attribute_dict: - updated_datasets = self._update_status(dataset, status) + updated_datasets = self.__update_status(dataset, status) elif metadata_uri: metadata_db = DBConnection(metadata_uri) with metadata_db.session_scope() as session: dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() if attribute_dict: - updated_datasets = self._update_status(dataset, status) + updated_datasets = self.__update_status(dataset, status) else: raise 
DatasetFactoryException("session or metadata_uri are required") return updated_datasets @@ -88,13 +88,13 @@ def update_dataset_attributes(self, dataset_uuid, attribut_dict, **kwargs): if not isinstance(attribut_dict, dict): raise TypeError("attribut_dict must be a dictionary") if session: - dataset = self._get_dataset(session, dataset_uuid) + dataset = self.__get_dataset(session, dataset_uuid) dataset_attributes = update_attributes(dataset, attribut_dict, session) return dataset_attributes else: metadata_db = DBConnection(metadata_uri) with metadata_db.session_scope() as session: - dataset = self._get_dataset(session, dataset_uuid) + dataset = self.__get_dataset(session, dataset_uuid) dataset_attributes = update_attributes(dataset, attribut_dict, session) return dataset_attributes @@ -102,12 +102,12 @@ def get_genomes_by_status_and_type(self, status, type, **kwargs): session = kwargs.get('session') metadata_uri = kwargs.get('metadata_uri') if session: - genome_data = self._query_genomes_by_status_and_type(session, status, type) + genome_data = self.__query_genomes_by_status_and_type(session, status, type) return genome_data else: metadata_db = DBConnection(metadata_uri) with metadata_db.session_scope() as session: - genome_data = self._query_genomes_by_status_and_type(session, status, type) + genome_data = self.__query_genomes_by_status_and_type(session, status, type) return genome_data def __create_child_datasets_recursive(self, session, parent_dataset): @@ -116,7 +116,7 @@ def __create_child_datasets_recursive(self, session, parent_dataset): # Find child dataset types for the parent dataset type child_dataset_types = session.query(DatasetType).filter( - DatasetType.parent == parent_dataset_type.id).all() + DatasetType.parent == parent_dataset_type.dataset_type_id).all() for child_type in child_dataset_types: # Example placeholders for dataset properties @@ -133,11 +133,11 @@ def __create_child_datasets_recursive(self, session, parent_dataset): dataset_attributes, name, label, version) # Recursively create children of this new child dataset - child_dataset = self._get_dataset(session, child_dataset_uuid) + child_dataset = self.__get_dataset(session, child_dataset_uuid) self.__create_child_datasets_recursive(session, child_dataset) def __query_parent_datasets(self, session, dataset_uuid): - dataset = self._get_dataset(session, dataset_uuid) + dataset = self.__get_dataset(session, dataset_uuid) dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == dataset.dataset_type_id).one() if dataset_type.parent is None: @@ -149,7 +149,7 @@ def __query_parent_datasets(self, session, dataset_uuid): parent_genome_dataset = session.query(GenomeDataset).join(Dataset).join(DatasetType).filter( GenomeDataset.genome_id == genome_id, - DatasetType.id == parent_dataset_type).one() + DatasetType.dataset_type_id == parent_dataset_type).one() parent_uuid = parent_genome_dataset.dataset.dataset_uuid parent_status = parent_genome_dataset.dataset.status return parent_uuid, parent_status @@ -163,7 +163,7 @@ def __query_top_level_parent(self, session, dataset_uuid): current_uuid = parent_data[0] def __query_related_genome_by_type(self, session, dataset_uuid, dataset_type): - dataset = self._get_dataset(session, dataset_uuid) + dataset = self.__get_dataset(session, dataset_uuid) genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) if not genome_id: raise ValueError("No associated Genome found for the given dataset UUID") @@ -181,7 +181,7 @@ def 
__query_related_genome_by_type(self, session, dataset_uuid, dataset_type): return related_uuid, related_status def __query_child_datasets(self, session, dataset_uuid): - parent_dataset = self._get_dataset(session, dataset_uuid) + parent_dataset = self.__get_dataset(session, dataset_uuid) parent_dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == parent_dataset.dataset_type_id).one() child_dataset_types = session.query(DatasetType).filter( @@ -204,19 +204,19 @@ def __query_child_datasets(self, session, dataset_uuid): return child_data - def _query_all_child_datasets(self, session, parent_dataset_uuid): + def __query_all_child_datasets(self, session, parent_dataset_uuid): # This method returns the child datasets for a given dataset child_datasets = self.__query_child_datasets(session, parent_dataset_uuid) all_child_datasets = [] for child_uuid, child_status in child_datasets: all_child_datasets.append((child_uuid, child_status)) - sub_children = self._query_all_child_datasets(session, child_uuid) + sub_children = self.__query_all_child_datasets(session, child_uuid) all_child_datasets.extend(sub_children) return all_child_datasets - def _query_depends_on(self, session, dataset_uuid): + def __query_depends_on(self, session, dataset_uuid): dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one_or_none() dataset_type = dataset.dataset_type dependent_types = dataset_type.depends_on.split(',') if dataset_type.depends_on else [] @@ -226,81 +226,80 @@ def _query_depends_on(self, session, dataset_uuid): dependent_datasets_info.append((new_uuid, new_status)) return dependent_datasets_info - def _update_status(self, session, dataset_uuid, status): + def __update_status(self, session, dataset_uuid, status): updated_datasets = [] # Processed to Released. Only accept top level. Check that all assembly and genebuild datsets (all the way down) are processed. # Then convert all to released. #Add a blocker and warning in here. current_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - if status == DatasetStatus.SUBMITTED: + if status == DatasetStatus.Submitted: # Update to SUBMITTED and all parents. # Do not touch the children. # This should only be called in times of strife and error. - current_dataset.status = DatasetStatus.SUBMITTED + current_dataset.status = DatasetStatus.Submitted parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: - self._update_status(session, parent_uuid, DatasetStatus.SUBMITTED) + self.__update_status(session, parent_uuid, DatasetStatus.Submitted) - elif status == DatasetStatus.PROCESSING: + elif status == DatasetStatus.Processing: # Update to PROCESSING and all parents. # Do not touch the children. 
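All of the public methods above share the same calling convention: pass an open SQLAlchemy session when the caller already manages the transaction, or a metadata_uri when the factory should open its own session scope. A short usage sketch with placeholder URI and UUID values:

from ensembl.database import DBConnection
from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory
from ensembl.production.metadata.api.models import DatasetStatus

factory = DatasetFactory()
uri = "mysql://user:pass@host:3306/ensembl_genome_metadata"  # placeholder

# Stand-alone call: the factory opens and scopes its own session from the URI.
factory.update_dataset_status("some-dataset-uuid", DatasetStatus.Processing, metadata_uri=uri)

# Caller-managed session: reuse an existing connection and commit explicitly.
metadata_db = DBConnection(uri)
with metadata_db.session_scope() as session:
    factory.update_dataset_status("some-dataset-uuid", DatasetStatus.Processing, session=session)
    session.commit()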
# Check the dependents - dependents = self._query_depends_on(session, dataset_uuid) + dependents = self.__query_depends_on(session, dataset_uuid) for uuid, dep_status in dependents: - if dep_status != DatasetStatus.PROCESSED or dep_status != DatasetStatus.RELEASED: + if dep_status != DatasetStatus.Processed or dep_status != DatasetStatus.Released: return dataset_uuid, status - current_dataset.status = DatasetStatus.PROCESSING + current_dataset.status = DatasetStatus.Processing parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: - self._update_status(session, parent_uuid, DatasetStatus.PROCESSING) + self.__update_status(session, parent_uuid, DatasetStatus.Processing) - elif status == DatasetStatus.PROCESSED: + elif status == DatasetStatus.Processed: # Get children children_uuid = self.__query_child_datasets(session, dataset_uuid) - new_status = DatasetStatus.PROCESSED + new_status = DatasetStatus.Processed # Check to see if any are still processing or submitted for child, child_status in children_uuid: # Not positive on the buisness rule here. Should we limit processed to the parents that have all children finished? # if child_status == DatasetStatus.PROCESSING or child_status == DatasetStatus.SUBMITTED: - if child_status == DatasetStatus.PROCESSING: - new_status = DatasetStatus.PROCESSING + if child_status == DatasetStatus.Processing: + new_status = DatasetStatus.Processing # Update current dataset if all the children are updated. - if new_status == DatasetStatus.PROCESSED: - current_dataset.status = DatasetStatus.PROCESSED + if new_status == DatasetStatus.Processed: + current_dataset.status = DatasetStatus.Processed # Check if parent needs to be updated parent_uuid = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: - self._update_status(session, parent_uuid, DatasetStatus.PROCESSED) + self.__update_status(session, parent_uuid, DatasetStatus.Processed) - elif status == DatasetStatus.RELEASED: + elif status == DatasetStatus.Released: # Get current datasets chain top level. top_level_uuid = self.__query_top_level_parent(dataset_uuid) # Check that all children and sub children etc - top_level_children = self._query_all_child_datasets(top_level_uuid) + top_level_children = self.__query_all_child_datasets(top_level_uuid) genebuild_uuid = self.__query_related_genome_by_type(session, dataset_uuid, "genebuild") - top_level_children.extend(self._query_all_child_datasets(genebuild_uuid)) + top_level_children.extend(self.__query_all_child_datasets(genebuild_uuid)) assembly_uuid = self.__query_related_genome_by_type(session, dataset_uuid, "assembly") - top_level_children.extend(self._query_all_child_datasets(assembly_uuid)) + top_level_children.extend(self.__query_all_child_datasets(assembly_uuid)) # Update if all datasets in it's chain are processed, all genebuild and assembly are processed. Else return error. for child_uuid, child_status in top_level_children: - if child_status is not DatasetStatus.RELEASED or child_status is not DatasetStatus.PROCESSED: + if child_status is not DatasetStatus.Released or child_status is not DatasetStatus.Processed: raise DatasetFactoryException( f"Dataset {child_uuid} is not released or processed. 
It is {child_status}") - top_level_children = self._query_all_child_datasets(top_level_uuid) + top_level_children = self.__query_all_child_datasets(top_level_uuid) for child_uuid, child_status in top_level_children: child_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == child_uuid).one() - child_dataset.status = DatasetStatus.RELEASED + child_dataset.status = DatasetStatus.Released else: raise DatasetFactoryException(f"Dataset status: {status} is not a vallid status") - updated_datasets.append((current_dataset.dataset_uuid, current_dataset.status)) return updated_datasets - def _get_dataset(self, session, dataset_uuid): + def __get_dataset(self, session, dataset_uuid): return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - def _query_genomes_by_status_and_type(self, session, status, type): + def __query_genomes_by_status_and_type(self, session, status, type): if session is None: raise ValueError("Session is not provided") From 91b18d930802675434caad642bba5ece3316bee6 Mon Sep 17 00:00:00 2001 From: Marc Chakiachvili Date: Tue, 27 Feb 2024 15:52:17 +0000 Subject: [PATCH 19/30] Forced urllib3 to downgrade to 1.26.* --- requirements.in | 4 +++- requirements.txt | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requirements.in b/requirements.in index a399c8ff..9338ae0c 100644 --- a/requirements.in +++ b/requirements.in @@ -1,5 +1,7 @@ ensembl-py@git+https://github.com/Ensembl/ensembl-py.git@1.2.2 grpcio grpcio-tools -sqlalchemy +sqlalchemy<=2.0 types-pymysql +urllib3~=1.26.15 + diff --git a/requirements.txt b/requirements.txt index 27afc60a..21160d9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -57,8 +57,10 @@ tomli==2.0.1 # via pytest types-pymysql==1.1.0.1 # via -r requirements.in -urllib3==2.1.0 - # via requests +urllib3==1.26.18 + # via + # -r requirements.in + # requests # The following packages are considered to be unsafe in a requirements file: # setuptools From 128f87d39965283e500bdafc516d2911989f251b Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 27 Feb 2024 16:08:51 +0000 Subject: [PATCH 20/30] Fixed dataset_factory.py tests --- .../metadata/api/hive/dataset_factory.py | 15 ++- src/tests/test_dataset_factory.py | 114 ++++++++++-------- 2 files changed, 76 insertions(+), 53 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 5e827f0b..76e4b6c6 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -14,7 +14,7 @@ from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.api.models import Dataset, Genome, GenomeDataset, \ - DatasetType, DatasetStatus + DatasetType, DatasetStatus, DatasetSource from sqlalchemy.sql import func import uuid from ensembl.production.metadata.updater.updater_utils import update_attributes @@ -24,6 +24,7 @@ class DatasetFactory: def create_all_child_datasets(self, session, dataset_uuid): # Retrieve the top-level dataset + #Will not work on datasets that are tied to multiple genomes! 
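__query_genomes_by_status_and_type above joins Genome to GenomeDataset, Dataset and DatasetType and returns (genome_uuid, production_name, dataset_uuid) rows. A usage sketch of the public wrapper, for example to list every genome whose genebuild dataset is still Submitted; the URI is a placeholder.

from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory
from ensembl.production.metadata.api.models import DatasetStatus

factory = DatasetFactory()
rows = factory.get_genomes_by_status_and_type(
    DatasetStatus.Submitted, "genebuild",
    metadata_uri="mysql://user:pass@host:3306/ensembl_genome_metadata",  # placeholder
)
for genome_uuid, production_name, dataset_uuid in rows:
    print(genome_uuid, production_name, dataset_uuid)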
top_level_dataset = self.__get_dataset(session, dataset_uuid) self.__create_child_datasets_recursive(session, top_level_dataset) @@ -120,8 +121,12 @@ def __create_child_datasets_recursive(self, session, parent_dataset): for child_type in child_dataset_types: # Example placeholders for dataset properties - genome_uuid = parent_dataset.genome_datasets.genome_id - dataset_source = parent_dataset.source + if len(parent_dataset.genome_datasets) > 1: + raise ValueError("More than one genome linked to a genome_dataset") + + # Get the first genome's UUID + genome_uuid = parent_dataset.genome_datasets[0].genome.genome_uuid + dataset_source = parent_dataset.dataset_source dataset_type = child_type dataset_attributes = {} # Populate with appropriate attributes name = dataset_type.name @@ -129,9 +134,9 @@ def __create_child_datasets_recursive(self, session, parent_dataset): version = None # Create the child dataset - child_dataset_uuid = self.create_dataset(session, genome_uuid, dataset_source, dataset_type, + child_dataset_uuid, new_dataset_attributes, new_genome_dataset = self.create_dataset(session, genome_uuid, dataset_source, dataset_type, dataset_attributes, name, label, version) - + session.commit() # Recursively create children of this new child dataset child_dataset = self.__get_dataset(session, child_dataset_uuid) self.__create_child_datasets_recursive(session, child_dataset) diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index 63c712b0..f8d2fa49 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -13,7 +13,8 @@ import pytest from ensembl.database import UnitTestDB, DBConnection from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory -from ensembl.production.metadata.api.models import Dataset, DatasetAttribute, Attribute, DatasetSource, DatasetType +from ensembl.production.metadata.api.models import (Dataset, DatasetAttribute, Attribute, DatasetSource, DatasetType, + DatasetStatus) db_directory = Path(__file__).parent / 'databases' db_directory = db_directory.resolve() @@ -23,25 +24,12 @@ class TestDatasetFactory: dbc = None # type: UnitTestDB - def test_update_dataset_status(self, multi_dbs): - dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - test_uuid = 'fc5d3e13-340c-4e2a-9f49-256fc319331e' - dataset_factory.update_dataset_status(test_uuid) - metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) - with metadata_db.session_scope() as session: - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() - assert dataset.status == 'Processing' - dataset_factory = DatasetFactory(session=session) - dataset_factory.update_dataset_status(test_uuid) - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() - assert dataset.status == 'Processed' - def test_update_dataset_attributes(self, multi_dbs): - dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) + #Test that the dataset attribute creation works fine and that the dataset_factory works with a session or a url + dataset_factory = DatasetFactory() test_uuid = 'fc5d3e13-340c-4e2a-9f49-256fc319331e' test_attributes = {"assembly.contig_n50": "test1", "assembly.total_genome_length": "test2"} - # def update_dataset_attributes(self,dataset_uuid, attribut_dict): - dataset_factory.update_dataset_attributes(test_uuid, test_attributes) + dataset_factory.update_dataset_attributes(test_uuid, test_attributes, 
metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) with metadata_db.session_scope() as session: dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() @@ -52,9 +40,9 @@ def test_update_dataset_attributes(self, multi_dbs): DatasetAttribute.value == 'test1') \ .one_or_none() assert dataset_attribute is not None - dataset_factory = DatasetFactory(session=session) + dataset_factory = DatasetFactory() test_attributes = {"assembly.gc_percentage": "test3", "genebuild.nc_longest_gene_length": "test4"} - dataset_factory.update_dataset_attributes(test_uuid, test_attributes) + dataset_factory.update_dataset_attributes(test_uuid, test_attributes,session=session) session.commit() dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() test_attribute = session.query(DatasetAttribute) \ @@ -65,6 +53,7 @@ def test_update_dataset_attributes(self, multi_dbs): .all() assert test_attribute is not None + def test_create_dataset(self, multi_dbs): metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) with metadata_db.session_scope() as session: @@ -76,7 +65,7 @@ def test_create_dataset(self, multi_dbs): test_name = 'test_name' test_label = 'test_label' test_version = 'test_version' - dataset_factory = DatasetFactory(session=session) + dataset_factory = DatasetFactory() dataset_uuid, new_dataset_attributes, new_genome_dataset = dataset_factory.create_dataset(session, test_genome_uuid, test_dataset_source, @@ -101,34 +90,63 @@ def test_create_dataset(self, multi_dbs): DatasetAttribute.value == 'test4') \ .all() assert test_attribute is not None - - def test_create_child_datasets_get_parent(self, multi_dbs): - # Tests for individual calling via dataset_uuid or genome_uuid - dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - test_uuid = '90ba6c03-5161-4f9a-911c-1961b9c0470d' - data = dataset_factory.create_child_datasets(dataset_uuid=test_uuid) + def test_create_genebuild_children(self, multi_dbs): metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) with metadata_db.session_scope() as session: - dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'xref').one() - assert dataset.status == 'Submitted' - dataset_factory = DatasetFactory(session=session) - dataset_factory.update_dataset_status(dataset.dataset_uuid, 'Processed') - session.commit() - parent, parent_type = dataset_factory.get_parent_datasets(dataset.dataset_uuid) - assert parent[0] == test_uuid - assert parent_type[0] == 'genebuild' - dataset_factory.create_child_datasets(genome_uuid='9cc516a8-529e-4919-a429-0d7032e295c9', - child_type='protein_features') - # dataset_factory.create_child_datasets(dataset_uuid=data[0], - # child_type='protein_features') - session.commit() - new_dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'protein_features').one() - assert new_dataset.status == 'Submitted' + genebuild_uuid = 'cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2' + assembly_uuid = '02104faf-3fee-4f28-b53c-605843dac941' + dataset_factory = DatasetFactory() + dataset_factory.create_all_child_datasets(session, genebuild_uuid) + data = session.query(Dataset).join(DatasetType).filter( + DatasetType.name == 'thoas_load').one() + data.status == DatasetStatus.Submitted + #test get parent + test_parent, test_status = dataset_factory.get_parent_datasets(data.dataset_uuid, session=session) + assert test_parent == genebuild_uuid - # Tests for bulk 
calling. - dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - dataset_factory.create_child_datasets(parent_type='genebuild') - metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) - with metadata_db.session_scope() as session: - dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'xref').all() - assert len(dataset) == 240 + # def test_update_dataset_status(self, multi_dbs): + # dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) + # test_uuid = 'fc5d3e13-340c-4e2a-9f49-256fc319331e' + # dataset_factory.update_dataset_status(test_uuid) + # metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + # with metadata_db.session_scope() as session: + # dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() + # assert dataset.status == 'Processing' + # dataset_factory = DatasetFactory(session=session) + # dataset_factory.update_dataset_status(test_uuid) + # dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() + # assert dataset.status == 'Processed' + + + + # + # def test_create_child_datasets_get_parent(self, multi_dbs): + # # Tests for individual calling via dataset_uuid or genome_uuid + # dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) + # test_uuid = '90ba6c03-5161-4f9a-911c-1961b9c0470d' + # data = dataset_factory.create_child_datasets(dataset_uuid=test_uuid) + # metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + # with metadata_db.session_scope() as session: + # dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'xref').one() + # assert dataset.status == 'Submitted' + # dataset_factory = DatasetFactory(session=session) + # dataset_factory.update_dataset_status(dataset.dataset_uuid, 'Processed') + # session.commit() + # parent, parent_type = dataset_factory.get_parent_datasets(dataset.dataset_uuid) + # assert parent[0] == test_uuid + # assert parent_type[0] == 'genebuild' + # dataset_factory.create_child_datasets(genome_uuid='9cc516a8-529e-4919-a429-0d7032e295c9', + # child_type='protein_features') + # # dataset_factory.create_child_datasets(dataset_uuid=data[0], + # # child_type='protein_features') + # session.commit() + # new_dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'protein_features').one() + # assert new_dataset.status == 'Submitted' + # + # # Tests for bulk calling. 
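The hierarchy exercised by the test above comes from the reworked dataset_type sample table, where parent now holds the parent dataset_type_id and depends_on a comma-separated list of dataset_type_ids. The sketch below walks a small in-memory copy of a few of those rows to show how genebuild (id 2) fans out to its children and how thoas_load declares its dependencies; it is illustrative only and is not the factory's traversal code.

# (dataset_type_id, name, parent_id, depends_on_ids), copied from a few sample rows.
ROWS = [
    (2,  "genebuild",        None, []),
    (8,  "xrefs",            2,    []),
    (9,  "protein_features", 2,    [8]),
    (11, "checksums",        2,    []),
    (12, "refget_load",      2,    [11]),
    (14, "search_dumps",     2,    [1, 8, 9]),
    (23, "thoas_load",       2,    [11, 12, 14]),
]

def children_of(type_id):
    # Equivalent in spirit to filtering DatasetType.parent == parent dataset_type_id.
    return [name for _id, name, parent, _deps in ROWS if parent == type_id]

print(children_of(2))
# ['xrefs', 'protein_features', 'checksums', 'refget_load', 'search_dumps', 'thoas_load']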
+ # dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) + # dataset_factory.create_child_datasets(parent_type='genebuild') + # metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + # with metadata_db.session_scope() as session: + # dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'xref').all() + # assert len(dataset) == 240 From 89c3a845e6cc82697404a4fbcf413913e06ac0d1 Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 28 Feb 2024 12:09:54 +0000 Subject: [PATCH 21/30] Fixed dataset_factory.py tests --- .../metadata/api/hive/dataset_factory.py | 66 ++++---- .../production/metadata/api/models/dataset.py | 5 +- src/tests/test_dataset_factory.py | 143 +++++++++++------- 3 files changed, 128 insertions(+), 86 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 76e4b6c6..239beedd 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -24,7 +24,8 @@ class DatasetFactory: def create_all_child_datasets(self, session, dataset_uuid): # Retrieve the top-level dataset - #Will not work on datasets that are tied to multiple genomes! + # Will not work on datasets that are tied to multiple genomes! + # !!!! WILL CREATE THE DATASETS EVEN IF THEY ALREADY EXIST top_level_dataset = self.__get_dataset(session, dataset_uuid) self.__create_child_datasets_recursive(session, top_level_dataset) @@ -69,16 +70,15 @@ def update_dataset_status(self, dataset_uuid, status, **kwargs): metadata_uri = kwargs.get('metadata_uri') attribute_dict = kwargs.get('attribut_dict') if session: - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - updated_datasets = self.__update_status(dataset, status) + updated_datasets = self.__update_status(session, dataset_uuid, status) if attribute_dict: - updated_datasets = self.__update_status(dataset, status) + updated_datasets = self.update_dataset_attributes(dataset_uuid, attribute_dict, session=session) elif metadata_uri: metadata_db = DBConnection(metadata_uri) with metadata_db.session_scope() as session: - dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + updated_datasets = self.__update_status(session, dataset_uuid, status) if attribute_dict: - updated_datasets = self.__update_status(dataset, status) + updated_datasets = self.update_dataset_attributes(dataset_uuid, attribute_dict, session=session) else: raise DatasetFactoryException("session or metadata_uri are required") return updated_datasets @@ -134,8 +134,11 @@ def __create_child_datasets_recursive(self, session, parent_dataset): version = None # Create the child dataset - child_dataset_uuid, new_dataset_attributes, new_genome_dataset = self.create_dataset(session, genome_uuid, dataset_source, dataset_type, - dataset_attributes, name, label, version) + child_dataset_uuid, new_dataset_attributes, new_genome_dataset = self.create_dataset(session, genome_uuid, + dataset_source, + dataset_type, + dataset_attributes, + name, label, version) session.commit() # Recursively create children of this new child dataset child_dataset = self.__get_dataset(session, child_dataset_uuid) @@ -146,7 +149,7 @@ def __query_parent_datasets(self, session, dataset_uuid): dataset_type = session.query(DatasetType).filter( DatasetType.dataset_type_id == dataset.dataset_type_id).one() if dataset_type.parent is None: - return None + 
return None, None parent_dataset_type = dataset_type.parent genome_id = next((gd.genome_id for gd in dataset.genome_datasets), None) if not genome_id: @@ -162,10 +165,10 @@ def __query_parent_datasets(self, session, dataset_uuid): def __query_top_level_parent(self, session, dataset_uuid): current_uuid = dataset_uuid while True: - parent_data = self.__query_parent_datasets(session, current_uuid) + parent_data, parent_status = self.__query_parent_datasets(session, current_uuid) if parent_data is None: return current_uuid - current_uuid = parent_data[0] + current_uuid = parent_data def __query_related_genome_by_type(self, session, dataset_uuid, dataset_type): dataset = self.__get_dataset(session, dataset_uuid) @@ -194,7 +197,7 @@ def __query_child_datasets(self, session, dataset_uuid): if not child_dataset_types: return [] # Return an empty list if no child types are found # This will break if we have multiple genome datasets for a single dataset, which is not currently the case. - genome_id = parent_dataset.genome_datasets.genome_id + genome_id = parent_dataset.genome_datasets[0].genome_id if not genome_id: raise ValueError("No associated Genome found for the given parent dataset UUID") @@ -232,10 +235,10 @@ def __query_depends_on(self, session, dataset_uuid): return dependent_datasets_info def __update_status(self, session, dataset_uuid, status): - updated_datasets = [] # Processed to Released. Only accept top level. Check that all assembly and genebuild datsets (all the way down) are processed. # Then convert all to released. #Add a blocker and warning in here. current_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() + updated_datasets = (dataset_uuid, current_dataset.status) if status == DatasetStatus.Submitted: # Update to SUBMITTED and all parents. # Do not touch the children. @@ -248,57 +251,60 @@ def __update_status(self, session, dataset_uuid, status): elif status == DatasetStatus.Processing: # Update to PROCESSING and all parents. # Do not touch the children. - + if current_dataset.status == DatasetStatus.Released: + return updated_datasets # Check the dependents dependents = self.__query_depends_on(session, dataset_uuid) for uuid, dep_status in dependents: if dep_status != DatasetStatus.Processed or dep_status != DatasetStatus.Released: - return dataset_uuid, status + return updated_datasets current_dataset.status = DatasetStatus.Processing parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: self.__update_status(session, parent_uuid, DatasetStatus.Processing) elif status == DatasetStatus.Processed: + if current_dataset.status == DatasetStatus.Released: + return updated_datasets # Get children children_uuid = self.__query_child_datasets(session, dataset_uuid) new_status = DatasetStatus.Processed # Check to see if any are still processing or submitted for child, child_status in children_uuid: - # Not positive on the buisness rule here. Should we limit processed to the parents that have all children finished? - # if child_status == DatasetStatus.PROCESSING or child_status == DatasetStatus.SUBMITTED: - if child_status == DatasetStatus.Processing: + if child_status == DatasetStatus.Processing or child_status == DatasetStatus.Submitted: new_status = DatasetStatus.Processing # Update current dataset if all the children are updated. 
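The Processed roll-up above only flips a dataset once none of its direct children are still Submitted or Processing. A compact restatement of that rule, with a hypothetical helper name and the (uuid, status) tuple shape returned by __query_child_datasets:

from ensembl.production.metadata.api.models import DatasetStatus

def all_children_finished(children):
    # children: list of (dataset_uuid, DatasetStatus) tuples. The parent may only
    # move to Processed when no child is still Submitted or Processing.
    return not any(status in (DatasetStatus.Submitted, DatasetStatus.Processing)
                   for _uuid, status in children)

assert all_children_finished([("xrefs-uuid", DatasetStatus.Processed),
                              ("checksums-uuid", DatasetStatus.Processing)]) is False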
if new_status == DatasetStatus.Processed: current_dataset.status = DatasetStatus.Processed # Check if parent needs to be updated - parent_uuid = self.__query_parent_datasets(session, dataset_uuid) + parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: self.__update_status(session, parent_uuid, DatasetStatus.Processed) elif status == DatasetStatus.Released: # Get current datasets chain top level. - top_level_uuid = self.__query_top_level_parent(dataset_uuid) + top_level_uuid = self.__query_top_level_parent(session, dataset_uuid) # Check that all children and sub children etc - top_level_children = self.__query_all_child_datasets(top_level_uuid) - genebuild_uuid = self.__query_related_genome_by_type(session, dataset_uuid, "genebuild") - top_level_children.extend(self.__query_all_child_datasets(genebuild_uuid)) - assembly_uuid = self.__query_related_genome_by_type(session, dataset_uuid, "assembly") - top_level_children.extend(self.__query_all_child_datasets(assembly_uuid)) + top_level_children = self.__query_all_child_datasets(session, top_level_uuid) + genebuild_uuid, genebuild_status = self.__query_related_genome_by_type(session, dataset_uuid, "genebuild") + top_level_children.extend(self.__query_all_child_datasets(session, genebuild_uuid)) + assembly_uuid, assembly_status = self.__query_related_genome_by_type(session, dataset_uuid, "assembly") + top_level_children.extend(self.__query_all_child_datasets(session, assembly_uuid)) # Update if all datasets in it's chain are processed, all genebuild and assembly are processed. Else return error. for child_uuid, child_status in top_level_children: - if child_status is not DatasetStatus.Released or child_status is not DatasetStatus.Processed: + if child_status != DatasetStatus.Released and child_status != DatasetStatus.Processed: + child_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == child_uuid).one() raise DatasetFactoryException( f"Dataset {child_uuid} is not released or processed. 
It is {child_status}") - top_level_children = self.__query_all_child_datasets(top_level_uuid) + top_level_children = self.__query_all_child_datasets(session, top_level_uuid) for child_uuid, child_status in top_level_children: child_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == child_uuid).one() child_dataset.status = DatasetStatus.Released + current_dataset.status = DatasetStatus.Released else: raise DatasetFactoryException(f"Dataset status: {status} is not a vallid status") - updated_datasets.append((current_dataset.dataset_uuid, current_dataset.status)) + updated_datasets = (current_dataset.dataset_uuid, current_dataset.status) return updated_datasets def __get_dataset(self, session, dataset_uuid): @@ -322,8 +328,8 @@ def __query_genomes_by_status_and_type(self, session, status, type): ).filter( Dataset.status == status, DatasetType.name == type - ) + ).all() # Execute query and fetch results - results = query.all() + results = query return results diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 52f9ee1e..20ef52de 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -57,8 +57,9 @@ class Dataset(LoadAble, Base): created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) - status = Column('status', Enum(DatasetStatus), default=DatasetStatus.Submitted) - + status = Column('status', Enum(DatasetStatus, + values_callable=lambda x: [str(status_enum.value) for status_enum in DatasetStatus]), + default=DatasetStatus.Submitted) # One to many relationships # dataset_id to dataset attribute and genome dataset dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index f8d2fa49..537e73a3 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -14,23 +14,28 @@ from ensembl.database import UnitTestDB, DBConnection from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory from ensembl.production.metadata.api.models import (Dataset, DatasetAttribute, Attribute, DatasetSource, DatasetType, - DatasetStatus) + DatasetStatus, GenomeDataset, Genome) db_directory = Path(__file__).parent / 'databases' db_directory = db_directory.resolve() +sample_path = Path(__file__).parent.parent / "ensembl" / "production" / "metadata" / "api" / "sample" -@pytest.mark.parametrize("multi_dbs", [[{'src': 'ensembl_metadata'}, {'src': 'ncbi_taxonomy'}]], indirect=True) + +@pytest.mark.parametrize("multi_dbs", [[{'src': sample_path / 'ensembl_genome_metadata'}, + {'src': sample_path / 'ncbi_taxonomy'}, + ]], indirect=True) class TestDatasetFactory: dbc = None # type: UnitTestDB def test_update_dataset_attributes(self, multi_dbs): - #Test that the dataset attribute creation works fine and that the dataset_factory works with a session or a url + # Test that the dataset attribute creation works fine and that the dataset_factory works with a session or a url dataset_factory = DatasetFactory() test_uuid = 'fc5d3e13-340c-4e2a-9f49-256fc319331e' test_attributes = {"assembly.contig_n50": "test1", "assembly.total_genome_length": "test2"} - dataset_factory.update_dataset_attributes(test_uuid, test_attributes, 
metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + dataset_factory.update_dataset_attributes(test_uuid, test_attributes, + metadata_uri=multi_dbs['ensembl_genome_metadata'].dbc.url) + metadata_db = DBConnection(multi_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() dataset_attribute = session.query(DatasetAttribute) \ @@ -42,7 +47,7 @@ def test_update_dataset_attributes(self, multi_dbs): assert dataset_attribute is not None dataset_factory = DatasetFactory() test_attributes = {"assembly.gc_percentage": "test3", "genebuild.nc_longest_gene_length": "test4"} - dataset_factory.update_dataset_attributes(test_uuid, test_attributes,session=session) + dataset_factory.update_dataset_attributes(test_uuid, test_attributes, session=session) session.commit() dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() test_attribute = session.query(DatasetAttribute) \ @@ -53,9 +58,8 @@ def test_update_dataset_attributes(self, multi_dbs): .all() assert test_attribute is not None - def test_create_dataset(self, multi_dbs): - metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + metadata_db = DBConnection(multi_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: test_attributes = {"assembly.contig_n50": "test1", "assembly.total_genome_length": "test2"} test_genome_uuid = '48b1b849-3b73-4242-ae83-af2290aeb071' @@ -90,8 +94,9 @@ def test_create_dataset(self, multi_dbs): DatasetAttribute.value == 'test4') \ .all() assert test_attribute is not None + def test_create_genebuild_children(self, multi_dbs): - metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) + metadata_db = DBConnection(multi_dbs['ensembl_genome_metadata'].dbc.url) with metadata_db.session_scope() as session: genebuild_uuid = 'cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2' assembly_uuid = '02104faf-3fee-4f28-b53c-605843dac941' @@ -100,53 +105,83 @@ def test_create_genebuild_children(self, multi_dbs): data = session.query(Dataset).join(DatasetType).filter( DatasetType.name == 'thoas_load').one() data.status == DatasetStatus.Submitted - #test get parent + # test get parent test_parent, test_status = dataset_factory.get_parent_datasets(data.dataset_uuid, session=session) assert test_parent == genebuild_uuid - # def test_update_dataset_status(self, multi_dbs): - # dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - # test_uuid = 'fc5d3e13-340c-4e2a-9f49-256fc319331e' - # dataset_factory.update_dataset_status(test_uuid) - # metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) - # with metadata_db.session_scope() as session: - # dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() - # assert dataset.status == 'Processing' - # dataset_factory = DatasetFactory(session=session) - # dataset_factory.update_dataset_status(test_uuid) - # dataset = session.query(Dataset).filter(Dataset.dataset_uuid == test_uuid).one() - # assert dataset.status == 'Processed' - + def test_update_dataset_status(self, multi_dbs): + metadata_db = DBConnection(multi_dbs['ensembl_genome_metadata'].dbc.url) + with metadata_db.session_scope() as session: + genebuild_uuid = 'cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2' + dataset_factory = DatasetFactory() + genebuild_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genebuild_uuid).one() + # Get 
the genome for this one + genome_uuid = genebuild_dataset.genome_datasets[0].genome.genome_uuid + # Check that xref is made + xref_uuid = session.query(Dataset.dataset_uuid) \ + .join(GenomeDataset, GenomeDataset.dataset_id == Dataset.dataset_id) \ + .join(Genome, Genome.genome_id == GenomeDataset.genome_id) \ + .join(DatasetType, DatasetType.dataset_type_id == Dataset.dataset_type_id) \ + .filter(Genome.genome_uuid == genome_uuid) \ + .filter(DatasetType.name == "xrefs").one() + protfeat_uuid = session.query(Dataset.dataset_uuid) \ + .join(GenomeDataset, GenomeDataset.dataset_id == Dataset.dataset_id) \ + .join(Genome, Genome.genome_id == GenomeDataset.genome_id) \ + .join(DatasetType, DatasetType.dataset_type_id == Dataset.dataset_type_id) \ + .filter(Genome.genome_uuid == genome_uuid) \ + .filter(DatasetType.name == "protein_features").one() + protfeat_uuid = protfeat_uuid[0] + xref_uuid = xref_uuid[0] + # Processing + # Fail to update protein_features + temp, failed_status = dataset_factory.update_dataset_status(protfeat_uuid, DatasetStatus.Processing, + session=session) + session.commit() + failed_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == protfeat_uuid).one() + assert failed_status == DatasetStatus.Submitted + assert failed_status_check[0] == DatasetStatus.Submitted + # succeed on xref + temp, succeed_status = dataset_factory.update_dataset_status(xref_uuid, DatasetStatus.Processing, + session=session) + session.commit() + succeed_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == xref_uuid).one() + genebuild_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == genebuild_uuid).one() + assert succeed_status == DatasetStatus.Processing + assert succeed_status_check[0] == DatasetStatus.Processing + assert genebuild_status_check[0] == DatasetStatus.Processing + # Processed + # Fail to update genebuild + temp, failed_status = dataset_factory.update_dataset_status(genebuild_uuid, DatasetStatus.Processed, + session=session) + session.commit() + genebuild_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == genebuild_uuid).one() + assert failed_status == DatasetStatus.Processing + assert genebuild_status_check[0] == DatasetStatus.Processing + # Change all the children + child_dataset_uuids = session.query(Dataset.dataset_uuid) \ + .join(GenomeDataset, GenomeDataset.dataset_id == Dataset.dataset_id) \ + .join(Genome, Genome.genome_id == GenomeDataset.genome_id) \ + .join(DatasetType, DatasetType.dataset_type_id == Dataset.dataset_type_id) \ + .filter(Genome.genome_uuid == genome_uuid) \ + .filter(DatasetType.name != "genebuild").all() + for temp_uuid in child_dataset_uuids: + temp_uuid = temp_uuid[0] + dataset_factory.update_dataset_status(temp_uuid, DatasetStatus.Processed, session=session) + session.commit() + genebuild_status_check = session.query(Dataset.status).filter( + Dataset.dataset_uuid == genebuild_uuid).one() + assert genebuild_status_check[0] == DatasetStatus.Processed + dataset_factory.update_dataset_status(genebuild_uuid, DatasetStatus.Released, session=session) + session.commit() + genebuild_status_check = session.query(Dataset.status).filter( + Dataset.dataset_uuid == genebuild_uuid).one() + assert genebuild_status_check[0] == DatasetStatus.Released + protfeat_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == protfeat_uuid).one() + assert protfeat_status_check[0] == DatasetStatus.Released - # - # def test_create_child_datasets_get_parent(self, 
multi_dbs): - # # Tests for individual calling via dataset_uuid or genome_uuid - # dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - # test_uuid = '90ba6c03-5161-4f9a-911c-1961b9c0470d' - # data = dataset_factory.create_child_datasets(dataset_uuid=test_uuid) - # metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) - # with metadata_db.session_scope() as session: - # dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'xref').one() - # assert dataset.status == 'Submitted' - # dataset_factory = DatasetFactory(session=session) - # dataset_factory.update_dataset_status(dataset.dataset_uuid, 'Processed') - # session.commit() - # parent, parent_type = dataset_factory.get_parent_datasets(dataset.dataset_uuid) - # assert parent[0] == test_uuid - # assert parent_type[0] == 'genebuild' - # dataset_factory.create_child_datasets(genome_uuid='9cc516a8-529e-4919-a429-0d7032e295c9', - # child_type='protein_features') - # # dataset_factory.create_child_datasets(dataset_uuid=data[0], - # # child_type='protein_features') - # session.commit() - # new_dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'protein_features').one() - # assert new_dataset.status == 'Submitted' - # - # # Tests for bulk calling. - # dataset_factory = DatasetFactory(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url) - # dataset_factory.create_child_datasets(parent_type='genebuild') - # metadata_db = DBConnection(multi_dbs['ensembl_metadata'].dbc.url) - # with metadata_db.session_scope() as session: - # dataset = session.query(Dataset).join(DatasetType).filter(DatasetType.name == 'xref').all() - # assert len(dataset) == 240 + # Check for submitted change + dataset_factory.update_dataset_status(protfeat_uuid, DatasetStatus.Submitted, session=session) + session.commit() + submitted_status = session.query(Dataset.status).filter(Dataset.dataset_uuid == protfeat_uuid).one() + assert submitted_status[0] == DatasetStatus.Submitted From b84dc67e08f07f050221067863a358ee297293f2 Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 28 Feb 2024 14:29:49 +0000 Subject: [PATCH 22/30] Minor updates of dataset_factory.py --- .../metadata/api/hive/dataset_factory.py | 44 ++++++++++--------- .../metadata/updater/updater_utils.py | 1 + 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/hive/dataset_factory.py index 239beedd..ff8c6a28 100644 --- a/src/ensembl/production/metadata/api/hive/dataset_factory.py +++ b/src/ensembl/production/metadata/api/hive/dataset_factory.py @@ -10,18 +10,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import uuid + from ensembl.database import DBConnection +from sqlalchemy.sql import func from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.api.models import Dataset, Genome, GenomeDataset, \ - DatasetType, DatasetStatus, DatasetSource -from sqlalchemy.sql import func -import uuid + DatasetType, DatasetStatus from ensembl.production.metadata.updater.updater_utils import update_attributes class DatasetFactory: - + # TODO: Multiple genomes for a single dataset are not incoporated def create_all_child_datasets(self, session, dataset_uuid): # Retrieve the top-level dataset # Will not work on datasets that are tied to multiple genomes! 
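        # A rough sketch of the intended flow, assuming the private helpers defined
        # later in this file (the exact lookup of child dataset types is an assumption):
        #
        #     top_level = self.__get_dataset(session, dataset_uuid)
        #     self.__create_child_datasets_recursive(session, top_level)
        #
        # Each recursion step queries the child dataset types of the current dataset,
        # calls create_dataset() for any child that does not yet exist, and then
        # recurses into the newly created children. In the tests below, running this on
        # a 'genebuild' dataset creates its whole chain (xrefs, protein_features,
        # genome_browser_track, ...) in the 'Submitted' state.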
@@ -84,6 +85,7 @@ def update_dataset_status(self, dataset_uuid, status, **kwargs): return updated_datasets def update_dataset_attributes(self, dataset_uuid, attribut_dict, **kwargs): + #TODO ADD DELETE opiton to kwargs to redo dataset_attributes. session = kwargs.get('session') metadata_uri = kwargs.get('metadata_uri') if not isinstance(attribut_dict, dict): @@ -99,16 +101,16 @@ def update_dataset_attributes(self, dataset_uuid, attribut_dict, **kwargs): dataset_attributes = update_attributes(dataset, attribut_dict, session) return dataset_attributes - def get_genomes_by_status_and_type(self, status, type, **kwargs): + def get_genomes_by_status_and_type(self, status, dataset_type, **kwargs): session = kwargs.get('session') metadata_uri = kwargs.get('metadata_uri') if session: - genome_data = self.__query_genomes_by_status_and_type(session, status, type) + genome_data = self.__query_genomes_by_status_and_type(session, status, dataset_type) return genome_data else: metadata_db = DBConnection(metadata_uri) with metadata_db.session_scope() as session: - genome_data = self.__query_genomes_by_status_and_type(session, status, type) + genome_data = self.__query_genomes_by_status_and_type(session, status, dataset_type) return genome_data def __create_child_datasets_recursive(self, session, parent_dataset): @@ -239,6 +241,7 @@ def __update_status(self, session, dataset_uuid, status): # Then convert all to released. #Add a blocker and warning in here. current_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() updated_datasets = (dataset_uuid, current_dataset.status) + #if released if status == DatasetStatus.Submitted: # Update to SUBMITTED and all parents. # Do not touch the children. @@ -251,12 +254,12 @@ def __update_status(self, session, dataset_uuid, status): elif status == DatasetStatus.Processing: # Update to PROCESSING and all parents. # Do not touch the children. - if current_dataset.status == DatasetStatus.Released: + if current_dataset.status == DatasetStatus.Released: #and it is not top level. return updated_datasets # Check the dependents dependents = self.__query_depends_on(session, dataset_uuid) for uuid, dep_status in dependents: - if dep_status != DatasetStatus.Processed or dep_status != DatasetStatus.Released: + if dep_status in (DatasetStatus.Processed, DatasetStatus.Released): return updated_datasets current_dataset.status = DatasetStatus.Processing parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) @@ -264,24 +267,23 @@ def __update_status(self, session, dataset_uuid, status): self.__update_status(session, parent_uuid, DatasetStatus.Processing) elif status == DatasetStatus.Processed: - if current_dataset.status == DatasetStatus.Released: + if current_dataset.status == DatasetStatus.Released: #and it is not top level. return updated_datasets # Get children children_uuid = self.__query_child_datasets(session, dataset_uuid) - new_status = DatasetStatus.Processed # Check to see if any are still processing or submitted for child, child_status in children_uuid: - if child_status == DatasetStatus.Processing or child_status == DatasetStatus.Submitted: - new_status = DatasetStatus.Processing + if child_status in (DatasetStatus.Processing, DatasetStatus.Submitted): + return updated_datasets # Update current dataset if all the children are updated. 
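            # Taken together, the cascade implemented here is: 'Submitted' and
            # 'Processing' only bubble up to parent datasets and never touch children;
            # 'Processing' is additionally meant to require every dataset listed in
            # depends_on to already be Processed or Released; 'Processed' is applied
            # only once no child is still Submitted or Processing, and then bubbles up;
            # 'Released' checks the whole chain (plus the related genebuild and assembly
            # datasets) and either releases everything or raises DatasetFactoryException.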
- if new_status == DatasetStatus.Processed: - current_dataset.status = DatasetStatus.Processed - # Check if parent needs to be updated - parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) - if parent_uuid is not None: - self.__update_status(session, parent_uuid, DatasetStatus.Processed) + current_dataset.status = DatasetStatus.Processed + # Check if parent needs to be updated + parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) + if parent_uuid is not None: + self.__update_status(session, parent_uuid, DatasetStatus.Processed) elif status == DatasetStatus.Released: + #TODO: Check that you are top level. Then check all children are ready to release. # Get current datasets chain top level. top_level_uuid = self.__query_top_level_parent(session, dataset_uuid) # Check that all children and sub children etc @@ -310,7 +312,7 @@ def __update_status(self, session, dataset_uuid, status): def __get_dataset(self, session, dataset_uuid): return session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() - def __query_genomes_by_status_and_type(self, session, status, type): + def __query_genomes_by_status_and_type(self, session, status, dataset_type): if session is None: raise ValueError("Session is not provided") @@ -327,7 +329,7 @@ def __query_genomes_by_status_and_type(self, session, status, type): DatasetType, Dataset.dataset_type_id == DatasetType.dataset_type_id ).filter( Dataset.status == status, - DatasetType.name == type + DatasetType.name == dataset_type ).all() # Execute query and fetch results diff --git a/src/ensembl/production/metadata/updater/updater_utils.py b/src/ensembl/production/metadata/updater/updater_utils.py index 230605c7..543c10cd 100644 --- a/src/ensembl/production/metadata/updater/updater_utils.py +++ b/src/ensembl/production/metadata/updater/updater_utils.py @@ -14,6 +14,7 @@ def update_attributes(dataset, attributes, session): + # TODO If attributes already exist, update them. Add option to replace all. 
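    # Each name/value pair is looked up in the Attribute table by name and a
    # DatasetAttribute row linking the dataset to that value is added; the new
    # DatasetAttribute objects are collected in the list below and returned.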
dataset_attributes = [] for attribute, value in attributes.items(): meta_attribute = session.query(Attribute).filter(Attribute.name == attribute).one_or_none() From 2d51ac4fdf3b589630d615e249c3f24aa7dce78b Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 28 Feb 2024 14:32:24 +0000 Subject: [PATCH 23/30] Renamed and moved files --- .../production/metadata/api/{hive => factories}/__init__.py | 0 .../api/{hive/dataset_factory.py => factories/datasets.py} | 0 src/tests/test_dataset_factory.py | 4 +++- 3 files changed, 3 insertions(+), 1 deletion(-) rename src/ensembl/production/metadata/api/{hive => factories}/__init__.py (100%) rename src/ensembl/production/metadata/api/{hive/dataset_factory.py => factories/datasets.py} (100%) diff --git a/src/ensembl/production/metadata/api/hive/__init__.py b/src/ensembl/production/metadata/api/factories/__init__.py similarity index 100% rename from src/ensembl/production/metadata/api/hive/__init__.py rename to src/ensembl/production/metadata/api/factories/__init__.py diff --git a/src/ensembl/production/metadata/api/hive/dataset_factory.py b/src/ensembl/production/metadata/api/factories/datasets.py similarity index 100% rename from src/ensembl/production/metadata/api/hive/dataset_factory.py rename to src/ensembl/production/metadata/api/factories/datasets.py diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index 537e73a3..daf4f2b6 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -10,9 +10,11 @@ # See the License for the specific language governing permissions and # limitations under the License. from pathlib import Path + import pytest from ensembl.database import UnitTestDB, DBConnection -from ensembl.production.metadata.api.hive.dataset_factory import DatasetFactory +from ensembl.production.metadata.api.factories.dataset_factory import DatasetFactory + from ensembl.production.metadata.api.models import (Dataset, DatasetAttribute, Attribute, DatasetSource, DatasetType, DatasetStatus, GenomeDataset, Genome) From 8c5fe454d382a7fb41cbc140d97d36051b23f7ed Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 29 Feb 2024 09:19:15 +0000 Subject: [PATCH 24/30] Fixed Reference --- src/tests/test_dataset_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index daf4f2b6..3d9424fe 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -13,8 +13,8 @@ import pytest from ensembl.database import UnitTestDB, DBConnection -from ensembl.production.metadata.api.factories.dataset_factory import DatasetFactory +from ensembl.production.metadata.api.factories.datasets import DatasetFactory from ensembl.production.metadata.api.models import (Dataset, DatasetAttribute, Attribute, DatasetSource, DatasetType, DatasetStatus, GenomeDataset, Genome) From 71e11c28c892164af60c834d0917ca2688efa899 Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 29 Feb 2024 09:57:39 +0000 Subject: [PATCH 25/30] Fixed logic error --- src/ensembl/production/metadata/api/factories/datasets.py | 5 +++-- src/tests/test_dataset_factory.py | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py index ff8c6a28..bee49549 100644 --- a/src/ensembl/production/metadata/api/factories/datasets.py +++ b/src/ensembl/production/metadata/api/factories/datasets.py @@ -40,7 +40,7 @@ def 
create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dat label=label, created=func.now(), dataset_source=dataset_source, # Must - status='Submitted', + status=DatasetStatus.Submitted, ) genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).one() new_genome_dataset = GenomeDataset( @@ -146,6 +146,7 @@ def __create_child_datasets_recursive(self, session, parent_dataset): child_dataset = self.__get_dataset(session, child_dataset_uuid) self.__create_child_datasets_recursive(session, child_dataset) + def __query_parent_datasets(self, session, dataset_uuid): dataset = self.__get_dataset(session, dataset_uuid) dataset_type = session.query(DatasetType).filter( @@ -259,7 +260,7 @@ def __update_status(self, session, dataset_uuid, status): # Check the dependents dependents = self.__query_depends_on(session, dataset_uuid) for uuid, dep_status in dependents: - if dep_status in (DatasetStatus.Processed, DatasetStatus.Released): + if dep_status not in (DatasetStatus.Processed, DatasetStatus.Released): return updated_datasets current_dataset.status = DatasetStatus.Processing parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index 3d9424fe..4b6b9079 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -102,11 +102,14 @@ def test_create_genebuild_children(self, multi_dbs): with metadata_db.session_scope() as session: genebuild_uuid = 'cc3c7f95-b5dc-4cc1-aa15-2817c89bd1e2' assembly_uuid = '02104faf-3fee-4f28-b53c-605843dac941' + dataset_factory = DatasetFactory() + dataset_factory.create_all_child_datasets(session, genebuild_uuid) + session.commit() data = session.query(Dataset).join(DatasetType).filter( - DatasetType.name == 'thoas_load').one() - data.status == DatasetStatus.Submitted + DatasetType.name == 'genome_browser_track').one() + assert data.status == DatasetStatus.Submitted # test get parent test_parent, test_status = dataset_factory.get_parent_datasets(data.dataset_uuid, session=session) assert test_parent == genebuild_uuid From 35e469b770911280b32fe09f284496c2a3c3baf9 Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 29 Feb 2024 12:22:06 +0000 Subject: [PATCH 26/30] Reverted complex logic in status. --- src/ensembl/production/metadata/api/models/dataset.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 20ef52de..2ed53682 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -9,15 +9,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import datetime import enum +import logging +import uuid from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index, JSON from sqlalchemy.dialects.mysql import DATETIME from sqlalchemy.orm import relationship from sqlalchemy.sql import func -import datetime -import uuid -import logging from ensembl.production.metadata.api.exceptions import MissingMetaException from ensembl.production.metadata.api.models.base import Base, LoadAble @@ -57,9 +57,8 @@ class Dataset(LoadAble, Base): created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) - status = Column('status', Enum(DatasetStatus, - values_callable=lambda x: [str(status_enum.value) for status_enum in DatasetStatus]), - default=DatasetStatus.Submitted) + status = Column('status', Enum(DatasetStatus), default=DatasetStatus.Submitted) + # One to many relationships # dataset_id to dataset attribute and genome dataset dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', From c99a8f7adda8749e91da04731ae9b669da14b46d Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 29 Feb 2024 13:39:15 +0000 Subject: [PATCH 27/30] Revert to string enum. --- .../metadata/api/factories/datasets.py | 38 +++++++++---------- .../production/metadata/api/models/dataset.py | 10 +---- .../ensembl_genome_metadata/dataset.txt | 2 +- src/tests/test_dataset_factory.py | 38 +++++++++---------- 4 files changed, 40 insertions(+), 48 deletions(-) diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py index bee49549..f4b50517 100644 --- a/src/ensembl/production/metadata/api/factories/datasets.py +++ b/src/ensembl/production/metadata/api/factories/datasets.py @@ -17,7 +17,7 @@ from ensembl.production.metadata.api.exceptions import * from ensembl.production.metadata.api.models import Dataset, Genome, GenomeDataset, \ - DatasetType, DatasetStatus + DatasetType from ensembl.production.metadata.updater.updater_utils import update_attributes @@ -40,7 +40,7 @@ def create_dataset(self, session, genome_uuid, dataset_source, dataset_type, dat label=label, created=func.now(), dataset_source=dataset_source, # Must - status=DatasetStatus.Submitted, + status="Submitted", ) genome = session.query(Genome).filter(Genome.genome_uuid == genome_uuid).one() new_genome_dataset = GenomeDataset( @@ -243,47 +243,47 @@ def __update_status(self, session, dataset_uuid, status): current_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == dataset_uuid).one() updated_datasets = (dataset_uuid, current_dataset.status) #if released - if status == DatasetStatus.Submitted: + if status == "Submitted": # Update to SUBMITTED and all parents. # Do not touch the children. # This should only be called in times of strife and error. - current_dataset.status = DatasetStatus.Submitted + current_dataset.status = "Submitted" parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: - self.__update_status(session, parent_uuid, DatasetStatus.Submitted) + self.__update_status(session, parent_uuid, "Submitted") - elif status == DatasetStatus.Processing: + elif status == "Processing": # Update to PROCESSING and all parents. # Do not touch the children. - if current_dataset.status == DatasetStatus.Released: #and it is not top level. 
+ if current_dataset.status == "Released": # and it is not top level. return updated_datasets # Check the dependents dependents = self.__query_depends_on(session, dataset_uuid) for uuid, dep_status in dependents: - if dep_status not in (DatasetStatus.Processed, DatasetStatus.Released): + if dep_status not in ("Processed", "Released"): return updated_datasets - current_dataset.status = DatasetStatus.Processing + current_dataset.status = "Processing" parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: - self.__update_status(session, parent_uuid, DatasetStatus.Processing) + self.__update_status(session, parent_uuid, "Processing") - elif status == DatasetStatus.Processed: - if current_dataset.status == DatasetStatus.Released: #and it is not top level. + elif status == "Processed": + if current_dataset.status == "Released": #and it is not top level. return updated_datasets # Get children children_uuid = self.__query_child_datasets(session, dataset_uuid) # Check to see if any are still processing or submitted for child, child_status in children_uuid: - if child_status in (DatasetStatus.Processing, DatasetStatus.Submitted): + if child_status in ("Processing", "Submitted"): return updated_datasets # Update current dataset if all the children are updated. - current_dataset.status = DatasetStatus.Processed + current_dataset.status = "Processed" # Check if parent needs to be updated parent_uuid, parent_status = self.__query_parent_datasets(session, dataset_uuid) if parent_uuid is not None: - self.__update_status(session, parent_uuid, DatasetStatus.Processed) + self.__update_status(session, parent_uuid, "Processed") - elif status == DatasetStatus.Released: + elif status == "Released": #TODO: Check that you are top level. Then check all children are ready to release. # Get current datasets chain top level. top_level_uuid = self.__query_top_level_parent(session, dataset_uuid) @@ -296,15 +296,15 @@ def __update_status(self, session, dataset_uuid, status): # Update if all datasets in it's chain are processed, all genebuild and assembly are processed. Else return error. for child_uuid, child_status in top_level_children: - if child_status != DatasetStatus.Released and child_status != DatasetStatus.Processed: + if child_status != "Released" and child_status != "Processed": child_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == child_uuid).one() raise DatasetFactoryException( f"Dataset {child_uuid} is not released or processed. It is {child_status}") top_level_children = self.__query_all_child_datasets(session, top_level_uuid) for child_uuid, child_status in top_level_children: child_dataset = session.query(Dataset).filter(Dataset.dataset_uuid == child_uuid).one() - child_dataset.status = DatasetStatus.Released - current_dataset.status = DatasetStatus.Released + child_dataset.status = "Released" + current_dataset.status = "Released" else: raise DatasetFactoryException(f"Dataset status: {status} is not a vallid status") updated_datasets = (current_dataset.dataset_uuid, current_dataset.status) diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 2ed53682..58017aa6 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -10,7 +10,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import datetime -import enum import logging import uuid @@ -39,13 +38,6 @@ class Attribute(LoadAble, Base): # none -class DatasetStatus(enum.Enum): - Submitted = 'Submitted' - Processing = 'Processing' - Processed = 'Processed' - Released = 'Released' - - class Dataset(LoadAble, Base): __tablename__ = 'dataset' @@ -57,7 +49,7 @@ class Dataset(LoadAble, Base): created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) - status = Column('status', Enum(DatasetStatus), default=DatasetStatus.Submitted) + status = Column(Enum('Submitted', 'Processing', 'Processed', 'Released'), server_default=text("'Submitted'")) # One to many relationships # dataset_id to dataset attribute and genome dataset diff --git a/src/ensembl/production/metadata/api/sample/ensembl_genome_metadata/dataset.txt b/src/ensembl/production/metadata/api/sample/ensembl_genome_metadata/dataset.txt index aa18a33b..f299eba7 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_genome_metadata/dataset.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_genome_metadata/dataset.txt @@ -310,7 +310,7 @@ 356 90ba6c03-5161-4f9a-911c-1961b9c0470d genebuild ENS01 2023-09-22 15:06:46.000000 GCA_018472825.1_ENS01 188 2 Submitted 357 4519fdf3-8b4e-463b-9822-69c45ee408da assembly \N 2023-09-22 15:06:46.000000 GCA_000001735.1 190 1 Submitted 359 de92123a-22ca-407f-9954-d8c0f8b17f64 assembly \N 2023-09-22 15:06:48.000000 GCA_018503575.1 192 1 Submitted -361 e95e194c-52adc-4b1e-94d4-1d5c0a03e9e3 assembly \N 2023-09-22 15:06:48.000000 GCA_018466835.1 193 1 Submitted +361 e95e194c-52ad-4b1e-94d4-1d5c0a03e9e3 assembly \N 2023-09-22 15:06:48.000000 GCA_018466835.1 193 1 Submitted 363 2b5664b7-6b42-4a18-9128-3019d631b836 assembly \N 2023-09-22 15:06:48.000000 GCA_905237065.2 191 1 Submitted 364 2ffbdc3f-1c68-42b1-b99f-4449c5914ec5 genebuild ENS01 2023-09-22 15:06:48.000000 GCA_905237065.2_ENS01 191 2 Submitted 365 b8fa1a4e-6d40-4540-a022-8846abee284c assembly \N 2023-09-22 15:06:49.000000 GCA_018472765.1 194 1 Submitted diff --git a/src/tests/test_dataset_factory.py b/src/tests/test_dataset_factory.py index 4b6b9079..f47d8354 100644 --- a/src/tests/test_dataset_factory.py +++ b/src/tests/test_dataset_factory.py @@ -16,7 +16,7 @@ from ensembl.production.metadata.api.factories.datasets import DatasetFactory from ensembl.production.metadata.api.models import (Dataset, DatasetAttribute, Attribute, DatasetSource, DatasetType, - DatasetStatus, GenomeDataset, Genome) + GenomeDataset, Genome) db_directory = Path(__file__).parent / 'databases' db_directory = db_directory.resolve() @@ -109,7 +109,7 @@ def test_create_genebuild_children(self, multi_dbs): session.commit() data = session.query(Dataset).join(DatasetType).filter( DatasetType.name == 'genome_browser_track').one() - assert data.status == DatasetStatus.Submitted + assert data.status == "Submitted" # test get parent test_parent, test_status = dataset_factory.get_parent_datasets(data.dataset_uuid, session=session) assert test_parent == genebuild_uuid @@ -139,30 +139,30 @@ def test_update_dataset_status(self, multi_dbs): xref_uuid = xref_uuid[0] # Processing # Fail to update protein_features - temp, failed_status = dataset_factory.update_dataset_status(protfeat_uuid, DatasetStatus.Processing, + temp, failed_status = dataset_factory.update_dataset_status(protfeat_uuid, "Processing", session=session) 
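            # The update_dataset_status call on protein_features above is expected to be
            # a no-op: moving a dataset to Processing first checks its depends_on
            # entries, and protein_features presumably depends on xrefs, which is still
            # Submitted at this point, so the returned status stays Submitted.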
session.commit() failed_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == protfeat_uuid).one() - assert failed_status == DatasetStatus.Submitted - assert failed_status_check[0] == DatasetStatus.Submitted + assert failed_status == "Submitted" + assert failed_status_check[0] == "Submitted" # succeed on xref - temp, succeed_status = dataset_factory.update_dataset_status(xref_uuid, DatasetStatus.Processing, + temp, succeed_status = dataset_factory.update_dataset_status(xref_uuid, "Processing", session=session) session.commit() succeed_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == xref_uuid).one() genebuild_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == genebuild_uuid).one() - assert succeed_status == DatasetStatus.Processing - assert succeed_status_check[0] == DatasetStatus.Processing - assert genebuild_status_check[0] == DatasetStatus.Processing + assert succeed_status == "Processing" + assert succeed_status_check[0] == "Processing" + assert genebuild_status_check[0] == "Processing" # Processed # Fail to update genebuild - temp, failed_status = dataset_factory.update_dataset_status(genebuild_uuid, DatasetStatus.Processed, + temp, failed_status = dataset_factory.update_dataset_status(genebuild_uuid, "Processed", session=session) session.commit() genebuild_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == genebuild_uuid).one() - assert failed_status == DatasetStatus.Processing - assert genebuild_status_check[0] == DatasetStatus.Processing + assert failed_status == "Processing" + assert genebuild_status_check[0] == "Processing" # Change all the children child_dataset_uuids = session.query(Dataset.dataset_uuid) \ .join(GenomeDataset, GenomeDataset.dataset_id == Dataset.dataset_id) \ @@ -172,21 +172,21 @@ def test_update_dataset_status(self, multi_dbs): .filter(DatasetType.name != "genebuild").all() for temp_uuid in child_dataset_uuids: temp_uuid = temp_uuid[0] - dataset_factory.update_dataset_status(temp_uuid, DatasetStatus.Processed, session=session) + dataset_factory.update_dataset_status(temp_uuid, "Processed", session=session) session.commit() genebuild_status_check = session.query(Dataset.status).filter( Dataset.dataset_uuid == genebuild_uuid).one() - assert genebuild_status_check[0] == DatasetStatus.Processed - dataset_factory.update_dataset_status(genebuild_uuid, DatasetStatus.Released, session=session) + assert genebuild_status_check[0] == "Processed" + dataset_factory.update_dataset_status(genebuild_uuid, "Released", session=session) session.commit() genebuild_status_check = session.query(Dataset.status).filter( Dataset.dataset_uuid == genebuild_uuid).one() - assert genebuild_status_check[0] == DatasetStatus.Released + assert genebuild_status_check[0] == "Released" protfeat_status_check = session.query(Dataset.status).filter(Dataset.dataset_uuid == protfeat_uuid).one() - assert protfeat_status_check[0] == DatasetStatus.Released + assert protfeat_status_check[0] == "Released" # Check for submitted change - dataset_factory.update_dataset_status(protfeat_uuid, DatasetStatus.Submitted, session=session) + dataset_factory.update_dataset_status(protfeat_uuid, "Submitted", session=session) session.commit() submitted_status = session.query(Dataset.status).filter(Dataset.dataset_uuid == protfeat_uuid).one() - assert submitted_status[0] == DatasetStatus.Submitted + assert submitted_status[0] == "Submitted" From 5a3beb82e2a6d20d8faa4c678d2e85b3b57c1d4f Mon Sep 17 00:00:00 2001 From: danielp 
Date: Fri, 1 Mar 2024 16:14:18 +0000 Subject: [PATCH 28/30] Minor fixes --- .../production/metadata/api/factories/datasets.py | 12 ++++++------ .../production/metadata/api/models/dataset.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py index f4b50517..36057299 100644 --- a/src/ensembl/production/metadata/api/factories/datasets.py +++ b/src/ensembl/production/metadata/api/factories/datasets.py @@ -69,7 +69,7 @@ def update_dataset_status(self, dataset_uuid, status, **kwargs): updated_datasets = [(dataset_uuid, status)] session = kwargs.get('session') metadata_uri = kwargs.get('metadata_uri') - attribute_dict = kwargs.get('attribut_dict') + attribute_dict = kwargs.get('attribute_dict') if session: updated_datasets = self.__update_status(session, dataset_uuid, status) if attribute_dict: @@ -84,21 +84,21 @@ def update_dataset_status(self, dataset_uuid, status, **kwargs): raise DatasetFactoryException("session or metadata_uri are required") return updated_datasets - def update_dataset_attributes(self, dataset_uuid, attribut_dict, **kwargs): + def update_dataset_attributes(self, dataset_uuid, attribute_dict, **kwargs): #TODO ADD DELETE opiton to kwargs to redo dataset_attributes. session = kwargs.get('session') metadata_uri = kwargs.get('metadata_uri') - if not isinstance(attribut_dict, dict): - raise TypeError("attribut_dict must be a dictionary") + if not isinstance(attribute_dict, dict): + raise TypeError("attribute_dict must be a dictionary") if session: dataset = self.__get_dataset(session, dataset_uuid) - dataset_attributes = update_attributes(dataset, attribut_dict, session) + dataset_attributes = update_attributes(dataset, attribute_dict, session) return dataset_attributes else: metadata_db = DBConnection(metadata_uri) with metadata_db.session_scope() as session: dataset = self.__get_dataset(session, dataset_uuid) - dataset_attributes = update_attributes(dataset, attribut_dict, session) + dataset_attributes = update_attributes(dataset, attribute_dict, session) return dataset_attributes def get_genomes_by_status_and_type(self, status, dataset_type, **kwargs): diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 58017aa6..1009561d 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -49,7 +49,7 @@ class Dataset(LoadAble, Base): created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow) dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) label = Column(String(128), nullable=False) - status = Column(Enum('Submitted', 'Processing', 'Processed', 'Released'), server_default=text("'Submitted'")) + status = Column(Enum('Submitted', 'Processing', 'Processed', 'Released'), server_default=text('Submitted')) # One to many relationships # dataset_id to dataset attribute and genome dataset From 0614c2ab7d64ea297ccf524837faa16e0bf7cc4a Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Sun, 3 Mar 2024 17:22:21 +0000 Subject: [PATCH 29/30] genome factory for new metadata db and test cases adde --- .../metadata/api/factories/genome.py | 237 ++++++++++++++++++ src/tests/test_genome_factory.py | 194 ++++++++++++++ 2 files changed, 431 insertions(+) create mode 100644 src/ensembl/production/metadata/api/factories/genome.py create mode 
100644 src/tests/test_genome_factory.py diff --git a/src/ensembl/production/metadata/api/factories/genome.py b/src/ensembl/production/metadata/api/factories/genome.py new file mode 100644 index 00000000..3566a83c --- /dev/null +++ b/src/ensembl/production/metadata/api/factories/genome.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +''' +Fetch Genome Info From New Metadata Database +''' + +import argparse +import json +import logging +import re +from dataclasses import dataclass, field +from ensembl.database import DBConnection +from ensembl.production.metadata.api.factories.datasets import DatasetFactory +from ensembl.production.metadata.api.models.dataset import DatasetType, Dataset, DatasetSource +from ensembl.production.metadata.api.models.genome import Genome, GenomeDataset +from ensembl.production.metadata.api.models.organism import Organism, OrganismGroup, OrganismGroupMember +from sqlalchemy import select, text +from typing import List + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +@dataclass +class GenomeInputFilters: + + metadata_db_uri: str + genome_uuid: List[str] = field(default_factory=list) + dataset_uuid: List[str] = field(default_factory=list) + division: List[str] = field(default_factory=list) + dataset_type: str = "assembly" + species: List[str] = field(default_factory=list) + antispecies: List[str] = field(default_factory=list) + dataset_status: List[str] = field(default_factory=lambda: ["Submitted"]) + batch_size: int = 50 + page: int = 1 + organism_group_type: str = "DIVISION" + run_all: int = 0 + update_dataset_status: str = "" + update_dataset_attribute: dict = field(default_factory=lambda: {}) + columns: List = field(default_factory=lambda: [Genome.genome_uuid, + Genome.production_name.label('species'), + Dataset.dataset_uuid, + Dataset.status.label('dataset_status'), + DatasetSource.name.label('dataset_source'), + DatasetType.name.label('dataset_type'), + ]) +@dataclass +class GenomeFactory: + @staticmethod + def _apply_filters(query, filters): + + query = query.filter(OrganismGroup.type == filters.organism_group_type) + + if filters.run_all: + filters.division = [ + 'EnsemblBacteria', + 'EnsemblVertebrates', + 'EnsemblPlants', + 'EnsemblProtists', + 'EnsemblMetazoa', + 'EnsemblFungi', + ] + + if filters.genome_uuid: + query = query.filter(Genome.genome_uuid.in_(filters.genome_uuid)) + + if filters.dataset_uuid: + query = query.filter(Dataset.dataset_uuid.in_(filters.dataset_uuid)) + + if filters.division: + ensembl_divisions = filters.division + + if filters.organism_group_type == 'DIVISION': + pattern = re.compile(r'^(ensembl)?', re.IGNORECASE) + ensembl_divisions = ['Ensembl' + pattern.sub('', d).capitalize() for d in ensembl_divisions if d] + + query = query.filter(OrganismGroup.name.in_(ensembl_divisions)) + + if 
filters.species: + species = set(filters.species) - set(filters.antispecies) + + if species: + query = query.filter(Genome.production_name.in_(filters.species)) + else: + query = query.filter(~Genome.production_name.in_(filters.antispecies)) + + elif filters.antispecies: + query = query.filter(~Genome.production_name.in_(filters.antispecies)) + + if filters.dataset_type: + query = query.filter(Genome.genome_datasets.any(DatasetType.name.in_([filters.dataset_type]))) + + if filters.dataset_status: + query = query.filter(Dataset.status.in_(filters.dataset_status)) + + if filters.batch_size: + filters.page = filters.page if filters.page > 0 else 1 + query = query.offset((filters.page - 1) * filters.batch_size).limit(filters.batch_size) + + return query + + def _build_query(self, filters): + query = select(filters.columns) \ + .select_from(Genome) \ + .join(Genome.organism) \ + .join(Organism.organism_group_members) \ + .join(OrganismGroupMember.organism_group) \ + .outerjoin(Genome.genome_datasets) \ + .join(GenomeDataset.dataset) \ + .join(Dataset.dataset_source) \ + .join(Dataset.dataset_type) \ + .group_by(Genome.genome_id) \ + .order_by(Genome.genome_uuid) + + return self._apply_filters(query, filters) + + def get_genomes(self, **filters: GenomeInputFilters): + + filters = GenomeInputFilters(**filters) + logger.info(f'Get Genomes with filters {filters}') + + with DBConnection(filters.metadata_db_uri).session_scope() as session: + query = self._build_query(filters) + logger.info(f'Executing SQL query: {query}') + for genome in session.execute(query).fetchall(): + genome_info = genome._asdict() + dataset_uuid = genome_info.get('dataset_uuid', None) + + # TODO: below code required with implementation of datasetstatus enum class in dataset models + # #convert status enum object to string value + # dataset_status = genome_info.get('dataset_status', None) + # if dataset_status and isinstance(dataset_status, DatasetStatus) : + # genome_info['dataset_status'] = dataset_status.value + + if not dataset_uuid: + logger.warning( + f"No dataset uuid found for genome {genome_info} skipping this genome " + ) + continue + + if filters.update_dataset_status: + _, status = DatasetFactory().update_dataset_status(dataset_uuid, filters.update_dataset_status, + session=session) + if filters.update_dataset_status == status: + + logger.info( + f"Updated Dataset status for dataset uuid: {dataset_uuid} from {filters.update_dataset_status} to {status} for genome {genome_info['genome_uuid']}" + ) + genome_info['updated_dataset_status'] = status + + else: + logger.warning( + f"Cannot update status for dataset uuid: {dataset_uuid} {filters.update_dataset_status} to {status} for genome {genome['genome_uuid']}" + ) + genome_info['updated_dataset_status'] = None + + yield genome_info + + + +def main(): + parser = argparse.ArgumentParser( + prog='genome.py', + description='Fetch Ensembl genome info from the new metadata database' + ) + parser.add_argument('--genome_uuid', type=str, nargs='*', default=[], required=False, + help='List of genome UUIDs to filter the query. Default is an empty list.') + parser.add_argument('--dataset_uuid', type=str, nargs='*', default=[], required=False, + help='List of dataset UUIDs to filter the query. Default is an empty list.') + parser.add_argument('--organism_group_type', type=str, default='DIVISION', required=False, + help='Organism group type to filter the query. 
Default is "DIVISION"') + parser.add_argument('--division', type=str, nargs='*', default=[], required=False, + help='List of organism group names to filter the query. Default is an empty list.') + parser.add_argument('--dataset_type', type=str, default="assembly", required=False, + help='List of dataset types to filter the query. Default is an empty list.') + parser.add_argument('--species', type=str, nargs='*', default=[], required=False, + help='List of Species Production names to filter the query. Default is an empty list.') + parser.add_argument('--antispecies', type=str, nargs='*', default=[], required=False, + help='List of Species Production names to exclude from the query. Default is an empty list.') + parser.add_argument('--dataset_status', nargs='*', default=["Submitted"], + choices=['Submitted', 'Processing', 'Processed', 'Released'], required=False, + help='List of dataset statuses to filter the query. Default is an empty list.') + parser.add_argument('--update_dataset_status', type=str, default="", required=False, + choices=['Submitted', 'Processing', 'Processed', 'Released', ''], + help='Update the status of the selected datasets to the specified value. ') + parser.add_argument('--batch_size', type=int, default=50, required=False, + help='Number of results to retrieve per batch. Default is 50.') + parser.add_argument('--page', default=1, required=False, + type=lambda x: int(x) if int(x) > 0 else argparse.ArgumentTypeError("{x} is not a positive integer"), + help='The page number for pagination. Default is 1.') + parser.add_argument('--metadata_db_uri', type=str, required=True, + help='metadata db mysql uri, ex: mysql://ensro@localhost:3366/ensembl_genome_metadata') + parser.add_argument('--output', type=str, required=True, help='output file ex: genome_info.json') + + args = parser.parse_args() + + meta_details = re.match(r"mysql:\/\/.*:?(.*?)@(.*?):\d+\/(.*)", args.metadata_db_uri) + with open(args.output, 'w') as json_output: + logger.info(f'Connecting Metadata Database with host:{meta_details.group(2)} & dbname:{meta_details.group(3)}') + + genome_fetcher = GenomeFactory() + + logger.info(f'Writing Results to {args.output}') + for genome in genome_fetcher.get_genomes( + metadata_db_uri=args.metadata_db_uri, + update_dataset_status=args.update_dataset_status, + genome_uuid=args.genome_uuid, + dataset_uuid=args.dataset_uuid, + organism_group_type=args.organism_group_type, + division=args.division, + dataset_type=args.dataset_type, + species=args.species, + antispecies=args.antispecies, + batch_size=args.batch_size, + dataset_status=args.dataset_status, + ) or []: + json.dump(genome, json_output) + json_output.write("\n") + + logger.info(f'Completed !') + + +if __name__ == "__main__": + logger.info('Fetching Genome Information From New Metadata Database') + main() diff --git a/src/tests/test_genome_factory.py b/src/tests/test_genome_factory.py new file mode 100644 index 00000000..7c2ca05e --- /dev/null +++ b/src/tests/test_genome_factory.py @@ -0,0 +1,194 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pathlib import Path + +import pytest +from ensembl.database import UnitTestDB, DBConnection +from ensembl.production.metadata.api.factories.genome import GenomeFactory, GenomeInputFilters +from sqlalchemy import func +from ensembl.production.metadata.api.models import Dataset, Genome + +db_directory = Path(__file__).parent / 'databases' +db_directory = db_directory.resolve() + +sample_path = Path(__file__).parent.parent / "ensembl" / "production" / "metadata" / "api" / "sample" + + + +@pytest.mark.parametrize("multi_dbs", [[{'src': sample_path / 'ensembl_genome_metadata'}, + {'src': sample_path / 'ncbi_taxonomy'}, + ]], indirect=True) +@pytest.fixture(scope="class") +def metadata_db(multi_dbs): + return DBConnection(multi_dbs['ensembl_genome_metadata'].dbc.url) + + +@pytest.fixture(scope="class") +def genome_factory(): + return GenomeFactory() + +@pytest.mark.parametrize("multi_dbs", [[{'src': sample_path / 'ensembl_genome_metadata'}, + {'src': sample_path / 'ncbi_taxonomy'}, + ]], indirect=True) +@pytest.fixture(scope="function") +def genome_filters(multi_dbs): + return { + 'genome_uuid': [], + 'dataset_uuid': [], + 'division': [], + 'dataset_type' : 'assembly', + 'species' : [], + 'antispecies': [], + 'dataset_status': ["Submitted"], + 'batch_size': 50, + 'organism_group_type': "DIVISION", + 'metadata_db_uri': multi_dbs['ensembl_genome_metadata'].dbc.url + } + + +@pytest.fixture(scope="function") +def expected_columns(): + return ['genome_uuid', + 'production_name', + 'dataset_uuid', + 'dataset_status', + 'dataset_source', + 'dataset_type', + ] + + +@pytest.mark.parametrize("multi_dbs", [[{'src': sample_path / 'ensembl_genome_metadata'}, + {'src': sample_path / 'ncbi_taxonomy'}, + ]], indirect=True) +class TestGenomeFactory: + dbc = None # type: UnitTestDB + + def test_input_filters_type(self, multi_dbs, metadata_db, genome_factory, genome_filters): + filters = GenomeInputFilters(**genome_filters) + assert isinstance(filters.genome_uuid, list) + assert isinstance(filters.dataset_uuid, list) + assert isinstance(filters.division, list) + assert isinstance(filters.dataset_type, str) + assert isinstance(filters.species, list) + assert isinstance(filters.antispecies, list) + assert isinstance(filters.dataset_status, list) + assert isinstance(filters.batch_size, int) + assert isinstance(filters.organism_group_type, str) + assert isinstance(filters.update_dataset_status, str) + + def test_fetch_genomes_by_default_params(self, multi_dbs, metadata_db, genome_factory, genome_filters): + + # fetch genome using genomefacotry with default filters + fetched_genome_factory_count = len([ genome for genome in genome_factory.get_genomes(**genome_filters)]) + assert fetched_genome_factory_count == genome_filters['batch_size'] + + def test_fetch_genomes_by_batch_size_10_40_all(self, multi_dbs, metadata_db, genome_factory, genome_filters): + + # fetch genome using genomefacotry with batchsize 10 + genome_filters['batch_size'] = 10 + fetched_genome_factory_count = len([ genome for genome in genome_factory.get_genomes(**genome_filters)]) + assert fetched_genome_factory_count == 
genome_filters['batch_size'] + + genome_filters['batch_size'] = 40 + fetched_genome_factory_count = len([ genome for genome in genome_factory.get_genomes(**genome_filters)]) + assert fetched_genome_factory_count == genome_filters['batch_size'] + + genome_filters['batch_size'] = 0 # fetch all genomes with dataset assembly + fetched_genome_factory_count = len([genome for genome in genome_factory.get_genomes(**genome_filters)]) + with metadata_db.session_scope() as session: + genome_count = session.query(Genome).count() + assert fetched_genome_factory_count == genome_count + + def test_fetch_genomes_by_genome_uuid(self, multi_dbs, metadata_db, genome_factory, genome_filters): + + # fetch genome using genomefacotry with default filters + genome_filters['genome_uuid'] = ['a73351f7-93e7-11ec-a39d-005056b38ce3'] + genome_factory_result = next(genome_factory.get_genomes(**genome_filters)) + + with metadata_db.session_scope() as session: + genome = session.query(Genome).filter(Genome.genome_uuid==genome_filters['genome_uuid'] ).one() + assert genome_factory_result['genome_uuid'] == genome_filters['genome_uuid'][0] + assert genome.genome_uuid == genome_filters['genome_uuid'][0] + assert genome.genome_uuid == genome_factory_result['genome_uuid'] + assert genome.production_name == genome_factory_result['species'] + + def test_fetch_genomes_by_dataset_uuid(self, multi_dbs, metadata_db, genome_factory, genome_filters): + + genome_filters['dataset_uuid'] = ['02104faf-3fee-4f28-b53c-605843dac941'] + # fetch genome using genomefacotry with dataset uuid + genome_factory_result = next(genome_factory.get_genomes(**genome_filters)) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genome_filters['dataset_uuid']).one() + assert genome_factory_result['dataset_uuid'] == genome_filters['dataset_uuid'][0] + assert dataset.dataset_uuid == genome_filters['dataset_uuid'][0] + + def test_fetch_genomes_by_default_status_submitted(self, multi_dbs, metadata_db, genome_factory, genome_filters): + + genome_filters['dataset_uuid'] = ['02104faf-3fee-4f28-b53c-605843dac941'] + genome_filters['dataset_status'] = [] + # fetch genome using genomefacotry with dataset uuid + genome_factory_result = next(genome_factory.get_genomes(**genome_filters)) + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genome_filters['dataset_uuid']).one() + assert genome_factory_result['dataset_uuid'] == genome_filters['dataset_uuid'][0] + assert dataset.dataset_uuid == genome_filters['dataset_uuid'][0] + assert dataset.status == genome_factory_result['dataset_status'] + + def test_update_dataset_status_submitted_processing_processed_released(self, multi_dbs, metadata_db, genome_factory, genome_filters): + + # fetch genome using genomefacotry with dataset uuid + genome_filters['genome_uuid'] = [] + genome_filters['dataset_uuid'] = ['02104faf-3fee-4f28-b53c-605843dac941'] + + # update dataset status to processing + genome_filters['update_dataset_status'] = 'Processing' + + # fetch genomes by status submitted and update to processing + genome_factory_result = [ genome for genome in genome_factory.get_genomes(**genome_filters)][0] + + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genome_filters['dataset_uuid']).one() + assert genome_factory_result['updated_dataset_status'] == dataset.status + + # update dataset status to processed + 
genome_filters['update_dataset_status'] = 'Processed' + genome_filters['dataset_status'] = ['Processing'] + + # fetch genomes by status processing and update to processed + genome_factory_result = [genome for genome in genome_factory.get_genomes(**genome_filters)][0] + + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genome_filters['dataset_uuid']).one() + assert genome_factory_result['updated_dataset_status'] == dataset.status + + # update dataset status to released + genome_filters['update_dataset_status'] = 'Released' + genome_filters['dataset_status'] = ['Processed'] + + # fetch genomes by status processed and update to released + genome_factory_result = [genome for genome in genome_factory.get_genomes(**genome_filters)][0] + + with metadata_db.session_scope() as session: + dataset = session.query(Dataset).filter(Dataset.dataset_uuid == genome_filters['dataset_uuid']).one() + assert genome_factory_result['updated_dataset_status'] == dataset.status + + def test_expected_columns(self, multi_dbs, genome_factory, genome_filters, expected_columns): + # fetch genomes with default filters + returned_columns = list(next(genome_factory.get_genomes(**genome_filters)).keys()) + assert returned_columns.sort() == expected_columns.sort() + + def test_expected_columns_on_update_status(self, multi_dbs, genome_factory, expected_columns, genome_filters): + genome_filters['dataset_uuid'] = ['06b4892b-8e34-49bc-be84-8126e5a7cf93'] + genome_filters['update_dataset_status'] = 'Processing' + expected_columns.append('updated_dataset_status') + returned_columns = list(next(genome_factory.get_genomes(**genome_filters)).keys()) + assert returned_columns.sort() == expected_columns.sort() \ No newline at end of file From cb8298651637a7f1248ef5b53fa48ecea7236bf7 Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Sun, 3 Mar 2024 18:20:43 +0000 Subject: [PATCH 30/30] fix: ONLY_FULL_GROUP_BY will work for any mysql client version --- src/ensembl/production/metadata/api/factories/genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ensembl/production/metadata/api/factories/genome.py b/src/ensembl/production/metadata/api/factories/genome.py index 3566a83c..fb0cdef2 100644 --- a/src/ensembl/production/metadata/api/factories/genome.py +++ b/src/ensembl/production/metadata/api/factories/genome.py @@ -121,7 +121,7 @@ def _build_query(self, filters): .join(GenomeDataset.dataset) \ .join(Dataset.dataset_source) \ .join(Dataset.dataset_type) \ - .group_by(Genome.genome_id) \ + .group_by(Genome.genome_id, Dataset.dataset_id) \ .order_by(Genome.genome_uuid) return self._apply_filters(query, filters)
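For reference, a minimal sketch of how the GenomeFactory introduced in PATCH 29 can be driven from Python rather than via the genome.py command line; the connection URI is a placeholder and the keyword arguments mirror the GenomeInputFilters fields above:

from ensembl.production.metadata.api.factories.genome import GenomeFactory

# Placeholder URI; point this at a real ensembl_genome_metadata instance.
METADATA_URI = "mysql://ensro@localhost:3306/ensembl_genome_metadata"

factory = GenomeFactory()
# Fetch the first batch of genomes whose assembly dataset is still Submitted,
# flagging each matching dataset as Processing while iterating.
for genome in factory.get_genomes(
        metadata_db_uri=METADATA_URI,
        dataset_type="assembly",
        dataset_status=["Submitted"],
        update_dataset_status="Processing",
        batch_size=10):
    print(genome["genome_uuid"], genome.get("updated_dataset_status"))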