From ad54727dae4568f48319443e2074bcb5d0d123eb Mon Sep 17 00:00:00 2001 From: vinay Date: Fri, 9 May 2025 18:28:34 +0100 Subject: [PATCH 1/7] Change Genome Factory to DIVISION agnostic --- .../metadata/api/factories/genomes.py | 50 +++--- src/tests/test_genome_factory.py | 142 +++++++++++++++++- 2 files changed, 162 insertions(+), 30 deletions(-) diff --git a/src/ensembl/production/metadata/api/factories/genomes.py b/src/ensembl/production/metadata/api/factories/genomes.py index 9f7f4cf6..597800fb 100644 --- a/src/ensembl/production/metadata/api/factories/genomes.py +++ b/src/ensembl/production/metadata/api/factories/genomes.py @@ -44,6 +44,7 @@ class GenomeInputFilters: species: List[str] = field(default_factory=list) antispecies: List[str] = field(default_factory=list) dataset_status: List[str] = field(default_factory=lambda: ["Submitted"]) + dataset_version: str = '' release_id: int = 0 batch_size: int = 50 page: int = 1 @@ -54,6 +55,7 @@ class GenomeInputFilters: columns: List = field(default_factory=lambda: [Genome.genome_uuid.label('genome_uuid'), Genome.production_name.label('species'), Dataset.dataset_uuid.label('dataset_uuid'), + Dataset.version.label('dataset_version'), Dataset.status.label('dataset_status'), DatasetSource.name.label('dataset_source'), DatasetType.name.label('dataset_type') @@ -65,9 +67,6 @@ class GenomeFactory: @staticmethod def _apply_filters(query, filters): - if filters.organism_group_type: - query = query.filter(OrganismGroup.type == filters.organism_group_type) - if filters.run_all: filters.division = [ 'EnsemblBacteria', @@ -78,21 +77,28 @@ def _apply_filters(query, filters): 'EnsemblFungi', ] - if filters.genome_uuid: - query = query.filter(Genome.genome_uuid.in_(filters.genome_uuid)) + if filters.organism_group_type or any( + [i.element.table.name in ['organism_group', 'organism_group_member'] for i in filters.columns]): + query = query.outerjoin(Organism.organism_group_members).outerjoin(OrganismGroupMember.organism_group) - if 
filters.dataset_uuid: - query = query.filter(Dataset.dataset_uuid.in_(filters.dataset_uuid)) + if filters.organism_group_type: + query = query.filter(OrganismGroup.type == filters.organism_group_type) - if filters.division: + if filters.division and filters.organism_group_type: ensembl_divisions = filters.division - if filters.organism_group_type == 'DIVISION': + if filters.organism_group_type == 'Division': pattern = re.compile(r'^(ensembl)?', re.IGNORECASE) ensembl_divisions = ['Ensembl' + pattern.sub('', d).capitalize() for d in ensembl_divisions if d] query = query.filter(OrganismGroup.name.in_(ensembl_divisions)) + if filters.genome_uuid: + query = query.filter(Genome.genome_uuid.in_(filters.genome_uuid)) + + if filters.dataset_uuid: + query = query.filter(Dataset.dataset_uuid.in_(filters.dataset_uuid)) + if filters.species: species = set(filters.species) - set(filters.antispecies) @@ -106,8 +112,7 @@ def _apply_filters(query, filters): if filters.release_id: query = query.join(Genome.genome_releases) - query = query.filter(GenomeDataset.release_id==filters.release_id) - query = query.filter(GenomeRelease.release_id==filters.release_id) + query = query.filter(GenomeRelease.release_id == filters.release_id) if filters.dataset_type: query = query.filter(Genome.genome_datasets.any(DatasetType.name.in_([filters.dataset_type]))) @@ -115,6 +120,9 @@ def _apply_filters(query, filters): if filters.dataset_status: query = query.filter(Dataset.status.in_(filters.dataset_status)) + if filters.dataset_version: + query = query.filter(Dataset.version == filters.dataset_version) + if filters.batch_size: filters.page = filters.page if filters.page > 0 else 1 query = query.offset((filters.page - 1) * filters.batch_size).limit(filters.batch_size) @@ -126,8 +134,6 @@ def _build_query(self, filters): .select_from(Genome) \ .join(Genome.assembly) \ .join(Genome.organism) \ - .join(Organism.organism_group_members) \ - .join(OrganismGroupMember.organism_group) \ 
.join(Genome.genome_datasets) \ .join(GenomeDataset.dataset) \ .join(Dataset.dataset_source) \ @@ -154,13 +160,7 @@ def get_genomes(self, **filters: GenomeInputFilters): if dataset_status and isinstance(dataset_status, DatasetStatus): genome_info['dataset_status'] = dataset_status.value - if not dataset_uuid: - logger.warning( - f"No dataset uuid found for genome {genome_info} skipping this genome " - ) - continue - - if filters.update_dataset_status: + if filters.update_dataset_status and dataset_uuid: _, status = DatasetFactory(filters.metadata_db_uri) \ .update_dataset_status(dataset_uuid, filters.update_dataset_status, @@ -174,7 +174,7 @@ def get_genomes(self, **filters: GenomeInputFilters): else: logger.warning( - f"Cannot update status for dataset uuid: {dataset_uuid} " + f"Cannot update status for dataset uuid: {dataset_uuid} , ensure the column with dataset_uuid declared " f"{filters.update_dataset_status} to {status} for genome {genome_info['genome_uuid']}" ) genome_info['updated_dataset_status'] = None @@ -191,8 +191,8 @@ def main(): help='List of genome UUIDs to filter the query. Default is an empty list.') parser.add_argument('--dataset_uuid', type=str, nargs='*', default=[], required=False, help='List of dataset UUIDs to filter the query. Default is an empty list.') - parser.add_argument('--organism_group_type', type=str, default='DIVISION', required=False, - help='Organism group type to filter the query. Default is "DIVISION"') + parser.add_argument('--organism_group_type', type=str, required=False, default="Division", + help='Organism group type to filter the query. ex: "DIVISION", check organism_group table for more types') parser.add_argument('--division', type=str, nargs='*', default=[], required=False, help='List of organism group names to filter the query. 
Default is an empty list.') parser.add_argument('--dataset_type', type=str, default="assembly", required=False, @@ -202,10 +202,12 @@ def main(): parser.add_argument('--antispecies', type=str, nargs='*', default=[], required=False, help='List of Species Production names to exclude from the query. Default is an empty list.') parser.add_argument('--release_id', type=int, default=0, required=False, - help='Genome_dataset release_id to filter the query. Default is 0 (no filter).') + help='Genome_dataset release_id to filter the query for released genomes and datasets. Default is 0 (no filter).') parser.add_argument('--dataset_status', nargs='*', default=["Submitted"], choices=['Submitted', 'Processing', 'Processed', 'Released'], required=False, help='List of dataset statuses to filter the query. Default is an empty list.') + parser.add_argument('--dataset_version', type=str, required=False, + help='Filter the query by dataset version') parser.add_argument('--update_dataset_status', type=str, default="", required=False, choices=['Submitted', 'Processing', 'Processed', 'Released', ''], help='Update the status of the selected datasets to the specified value. 
') diff --git a/src/tests/test_genome_factory.py b/src/tests/test_genome_factory.py index 88c5634c..7e6bb22b 100644 --- a/src/tests/test_genome_factory.py +++ b/src/tests/test_genome_factory.py @@ -13,19 +13,27 @@ import pytest from ensembl.utils.database import UnitTestDB, DBConnection - +from collections import namedtuple from ensembl.production.metadata.api.exceptions import DatasetFactoryException from ensembl.production.metadata.api.factories.genomes import GenomeInputFilters -from ensembl.production.metadata.api.models import Dataset, Genome, DatasetStatus +from ensembl.production.metadata.api.models import Dataset, Genome, DatasetStatus, OrganismGroup, OrganismGroupMember, \ + Organism import logging +from ensembl.production.metadata.scripts.organism_to_organismgroup import process_genomes + logger = logging.getLogger(__name__) +Args = namedtuple('Args', [ + 'metadata_db_uri', 'core_server_uri', 'organism_group_type', + 'organism_group_name', 'genome_uuid', 'release_id', 'remove', 'raise_error' +]) + @pytest.mark.parametrize("test_dbs", [[{'src': Path(__file__).parent / "databases/ensembl_genome_metadata"}, - {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, - ]], indirect=True) + {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, + ]], indirect=True) @pytest.fixture(scope="function") def genome_filters(test_dbs): return { @@ -54,8 +62,8 @@ def expected_columns(): @pytest.mark.parametrize("test_dbs", [[{'src': Path(__file__).parent / "databases/ensembl_genome_metadata"}, - {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, - ]], indirect=True) + {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, + ]], indirect=True) class TestGenomeFactory: dbc: UnitTestDB = None @@ -202,3 +210,125 @@ def test_expected_columns_on_update_status(self, genome_factory, expected_column expected_columns.append('updated_dataset_status') returned_columns = list(next(genome_factory.get_genomes(**genome_filters)).keys()) assert 
returned_columns.sort() == expected_columns.sort() + + @pytest.mark.parametrize( + "genome_uuid, organism_group_type, organism_group_name", + [ + ("a73351f7-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblBacteria"), + ("a73351f7-93e7-11ec-a39d-005056b38ce3", "Internal", "Populars"), + ("65d4f21f-695a-4ed0-be67-5732a551fea4", "Division", "EnsemblVertebrates"), + ("a73357ab-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblPlants"), + ("a733574a-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblFungi"), + ("a73356e1-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblProtists") + ] + ) + def test_fetch_genomes_by_division_type_organism_group_type(self, test_dbs, genome_factory, genome_filters, + genome_uuid, organism_group_type, organism_group_name): + genome_filters['genome_uuid'] = [genome_uuid] + genome_filters['columns'] = [Genome.genome_uuid.label('genome_uuid'), + OrganismGroup.name.label('organism_group_name'), + OrganismGroup.type.label('organism_group_type'), + ] + genome_filters['dataset_status'] = ['Released'] + genome_filters['division'] = [organism_group_name] + genome_filters['organism_group_type'] = organism_group_type + + genome_factory_result = next(genome_factory.get_genomes(**genome_filters)) + + assert genome_factory_result['genome_uuid'] == genome_uuid + assert genome_factory_result['organism_group_name'] == organism_group_name + assert genome_factory_result['organism_group_type'] == organism_group_type + + @pytest.mark.parametrize( + "genome_uuid, organism_group_type, organism_group_name", + [ + ("a73351f7-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblBacteria"), + ("a73351f7-93e7-11ec-a39d-005056b38ce3", "Internal", "Populars"), + ("65d4f21f-695a-4ed0-be67-5732a551fea4", "Division", "EnsemblVertebrates"), + ("a73357ab-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblPlants"), + ("a733574a-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblFungi"), + ("a73356e1-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblProtists") + ] + ) + def 
test_fetch_genomes_by_division_type_organism_group_type(self, test_dbs, genome_factory, genome_filters, + genome_uuid, organism_group_type, organism_group_name): + genome_filters['genome_uuid'] = [genome_uuid] + genome_filters['columns'] = [Genome.genome_uuid.label('genome_uuid'), + OrganismGroup.name.label('organism_group_name'), + OrganismGroup.type.label('organism_group_type'), + ] + genome_filters['dataset_status'] = ['Released'] + genome_filters['division'] = [organism_group_name] + genome_filters['organism_group_type'] = organism_group_type + + genome_factory_result = next(genome_factory.get_genomes(**genome_filters)) + + assert genome_factory_result['genome_uuid'] == genome_uuid + assert genome_factory_result['organism_group_name'] == organism_group_name + assert genome_factory_result['organism_group_type'] == organism_group_type + + @pytest.mark.parametrize( + "genome_uuid, organism_group_type, organism_group_name", + [ + ("63b4ffbf-0147-4aa7-b0af-7575bb822740", "Division", "EnsemblVertebrates"), + ] + ) + def test_fetch_genomes_not_assigned_to_organism_group_type(self, test_dbs, genome_factory, genome_filters, + genome_uuid, organism_group_type, organism_group_name): + + #remove genome from organism group and make sure it is not attached to any other organism group + metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) + args = Args( + metadata_db_uri=test_dbs['ensembl_genome_metadata'].dbc.url, + core_server_uri=None, + organism_group_type=organism_group_type, + organism_group_name=organism_group_name, + genome_uuid=[genome_uuid], + release_id=[], + remove=True, + raise_error=False + ) + + metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) + + with (metadata_db.session_scope() as session): + organism_group = session.query(OrganismGroup).filter( + OrganismGroup.name == args.organism_group_name, + OrganismGroup.type == args.organism_group_type + ).one_or_none() + + organism_group_id = organism_group.organism_group_id 
if organism_group else None + assert organism_group_id is not None + + # remove organism from organism group + process_genomes(session, args, organism_group_id=organism_group_id) + session.commit() + + # Check if the organism group removed + query = ( + session.query(Genome, Organism, OrganismGroup).join(Organism, Organism.organism_id == Genome.organism_id + ).join(OrganismGroupMember, + OrganismGroupMember.organism_id == Organism.organism_id + ).join(OrganismGroup, + OrganismGroup.organism_group_id == OrganismGroupMember.organism_group_id + ).filter( + Genome.genome_uuid.in_([args.genome_uuid]), + OrganismGroup.name == args.organism_group_name, + ) + ) + + assert query.count() == 0, "Organism group member should be removed and not assigned to any other organism group" + + genome_filters['genome_uuid'] = [genome_uuid] + genome_filters['columns'] = [Genome.genome_uuid.label('genome_uuid'), + OrganismGroup.name.label('organism_group_name'), + OrganismGroup.type.label('organism_group_type'), + ] + genome_filters['dataset_status'] = ['Processed'] + genome_filters['division'] = [] + genome_filters['organism_group_type'] = '' + genome_factory_result = next(genome_factory.get_genomes(**genome_filters)) + + assert genome_factory_result['genome_uuid'] == genome_uuid, "Genome UUID should match" + assert genome_factory_result['organism_group_name'] is None, "Organism group name should be None" + assert genome_factory_result['organism_group_type'] is None, "Organism group type should be None" From 6e79593e2e1912c01b10e322247699a6a8b0b609 Mon Sep 17 00:00:00 2001 From: vinay Date: Fri, 9 May 2025 18:29:45 +0100 Subject: [PATCH 2/7] Change Genome Factory to DIVISION agnostic --- genome_info.json | 0 src/ensembl/production/metadata/api/factories/genome_info.json | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 genome_info.json create mode 100644 src/ensembl/production/metadata/api/factories/genome_info.json diff --git a/genome_info.json b/genome_info.json 
new file mode 100644 index 00000000..e69de29b diff --git a/src/ensembl/production/metadata/api/factories/genome_info.json b/src/ensembl/production/metadata/api/factories/genome_info.json new file mode 100644 index 00000000..e69de29b From 3ea162250cc83a75c06bfd52bf1118b697bf955a Mon Sep 17 00:00:00 2001 From: vinay Date: Sat, 10 May 2025 08:53:02 +0100 Subject: [PATCH 3/7] Add Group by for genome uuid and dataset uuid --- src/ensembl/production/metadata/api/factories/genomes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ensembl/production/metadata/api/factories/genomes.py b/src/ensembl/production/metadata/api/factories/genomes.py index 597800fb..5b70f7b0 100644 --- a/src/ensembl/production/metadata/api/factories/genomes.py +++ b/src/ensembl/production/metadata/api/factories/genomes.py @@ -126,6 +126,15 @@ def _apply_filters(query, filters): if filters.batch_size: filters.page = filters.page if filters.page > 0 else 1 query = query.offset((filters.page - 1) * filters.batch_size).limit(filters.batch_size) + + # check if dataset/genome uuid in column list if not add it as we group by genome uuid and dataset uuid + if not any([i.element.table.name == 'dataset' and i.element.name == 'dataset_uuid' + for i in filters.columns]): + filters.columns.append(Dataset.dataset_uuid.label('dataset_uuid')) + if not any([i.element.table.name == 'genome' and i.element.name == 'genome_uuid' + for i in filters.columns]): + filters.columns.append(Genome.genome_uuid.label('genome_uuid')) + logger.debug(f"Filter Query {query}") return query From b36a3112685de0ad77b9a9645bbee08931a31933 Mon Sep 17 00:00:00 2001 From: vinay Date: Sat, 10 May 2025 09:07:42 +0100 Subject: [PATCH 4/7] Remove group by --- src/ensembl/production/metadata/api/factories/genomes.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ensembl/production/metadata/api/factories/genomes.py b/src/ensembl/production/metadata/api/factories/genomes.py index 5b70f7b0..366ecbda 
100644 --- a/src/ensembl/production/metadata/api/factories/genomes.py +++ b/src/ensembl/production/metadata/api/factories/genomes.py @@ -146,9 +146,10 @@ def _build_query(self, filters): .join(Genome.genome_datasets) \ .join(GenomeDataset.dataset) \ .join(Dataset.dataset_source) \ - .join(Dataset.dataset_type) \ - .group_by(Genome.genome_id, Dataset.dataset_id) \ - .order_by(Genome.genome_uuid) + .join(Dataset.dataset_type) + #\ + # .group_by(Genome.genome_id, Dataset.dataset_id) \ + # .order_by(Genome.genome_uuid) return self._apply_filters(query, filters) From 2c047d0c0e5a70f495ae7a3143bb5f4eceb31fe4 Mon Sep 17 00:00:00 2001 From: vinay Date: Tue, 17 Jun 2025 18:26:18 +0100 Subject: [PATCH 5/7] Support Multi species --- .../metadata/scripts/organism_to_organismgroup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ensembl/production/metadata/scripts/organism_to_organismgroup.py b/src/ensembl/production/metadata/scripts/organism_to_organismgroup.py index c85b1203..ff6bfe99 100644 --- a/src/ensembl/production/metadata/scripts/organism_to_organismgroup.py +++ b/src/ensembl/production/metadata/scripts/organism_to_organismgroup.py @@ -19,12 +19,15 @@ ) -def fetch_division_name(core_db_uri: str) -> str: +def fetch_division_name(core_db_uri: str, production_name: str) -> str: """ Fetch the division name from the core database. 
""" with DBConnection(core_db_uri).session_scope() as session: - query = session.query(Meta).filter(Meta.meta_key == 'species.division').one_or_none() + query = session.query(Meta.species_id).filter(Meta.meta_key == 'species.production_name', + Meta.meta_value == production_name).one_or_none() + query = session.query(Meta).filter(Meta.meta_key == 'species.division', + Meta.species_id == query.species_id).one_or_none() return query.meta_value if query else None @@ -92,7 +95,7 @@ def process_genomes(session, args, organism_group_id: int = None): for genome, dataset_source in query.all(): logging.info(f"Processing genome {genome.genome_uuid} for organism {genome.organism_id}") if not (args.organism_group_type and args.organism_group_name) and args.core_server_uri: - division_name = fetch_division_name(os.path.join(args.core_server_uri, dataset_source.name)) + division_name = fetch_division_name(os.path.join(args.core_server_uri, dataset_source.name), genome.production_name) if division_name: organism_group = session.query(OrganismGroup).filter(OrganismGroup.name == division_name).one_or_none() if organism_group: From 4b9cc1fad35a2e928554d65ed9bf418f3dd89681 Mon Sep 17 00:00:00 2001 From: vinay Date: Tue, 17 Jun 2025 18:31:58 +0100 Subject: [PATCH 6/7] Fix conflicts with main --- src/tests/test_genome_factory.py | 142 ++----------------------------- 1 file changed, 6 insertions(+), 136 deletions(-) diff --git a/src/tests/test_genome_factory.py b/src/tests/test_genome_factory.py index 7e6bb22b..88c5634c 100644 --- a/src/tests/test_genome_factory.py +++ b/src/tests/test_genome_factory.py @@ -13,27 +13,19 @@ import pytest from ensembl.utils.database import UnitTestDB, DBConnection -from collections import namedtuple + from ensembl.production.metadata.api.exceptions import DatasetFactoryException from ensembl.production.metadata.api.factories.genomes import GenomeInputFilters -from ensembl.production.metadata.api.models import Dataset, Genome, DatasetStatus, OrganismGroup, 
OrganismGroupMember, \ - Organism +from ensembl.production.metadata.api.models import Dataset, Genome, DatasetStatus import logging -from ensembl.production.metadata.scripts.organism_to_organismgroup import process_genomes - logger = logging.getLogger(__name__) -Args = namedtuple('Args', [ - 'metadata_db_uri', 'core_server_uri', 'organism_group_type', - 'organism_group_name', 'genome_uuid', 'release_id', 'remove', 'raise_error' -]) - @pytest.mark.parametrize("test_dbs", [[{'src': Path(__file__).parent / "databases/ensembl_genome_metadata"}, - {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, - ]], indirect=True) + {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, + ]], indirect=True) @pytest.fixture(scope="function") def genome_filters(test_dbs): return { @@ -62,8 +54,8 @@ def expected_columns(): @pytest.mark.parametrize("test_dbs", [[{'src': Path(__file__).parent / "databases/ensembl_genome_metadata"}, - {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, - ]], indirect=True) + {'src': Path(__file__).parent / "databases/ncbi_taxonomy"}, + ]], indirect=True) class TestGenomeFactory: dbc: UnitTestDB = None @@ -210,125 +202,3 @@ def test_expected_columns_on_update_status(self, genome_factory, expected_column expected_columns.append('updated_dataset_status') returned_columns = list(next(genome_factory.get_genomes(**genome_filters)).keys()) assert returned_columns.sort() == expected_columns.sort() - - @pytest.mark.parametrize( - "genome_uuid, organism_group_type, organism_group_name", - [ - ("a73351f7-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblBacteria"), - ("a73351f7-93e7-11ec-a39d-005056b38ce3", "Internal", "Populars"), - ("65d4f21f-695a-4ed0-be67-5732a551fea4", "Division", "EnsemblVertebrates"), - ("a73357ab-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblPlants"), - ("a733574a-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblFungi"), - ("a73356e1-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblProtists") - ] - ) - def 
test_fetch_genomes_by_division_type_organism_group_type(self, test_dbs, genome_factory, genome_filters, - genome_uuid, organism_group_type, organism_group_name): - genome_filters['genome_uuid'] = [genome_uuid] - genome_filters['columns'] = [Genome.genome_uuid.label('genome_uuid'), - OrganismGroup.name.label('organism_group_name'), - OrganismGroup.type.label('organism_group_type'), - ] - genome_filters['dataset_status'] = ['Released'] - genome_filters['division'] = [organism_group_name] - genome_filters['organism_group_type'] = organism_group_type - - genome_factory_result = next(genome_factory.get_genomes(**genome_filters)) - - assert genome_factory_result['genome_uuid'] == genome_uuid - assert genome_factory_result['organism_group_name'] == organism_group_name - assert genome_factory_result['organism_group_type'] == organism_group_type - - @pytest.mark.parametrize( - "genome_uuid, organism_group_type, organism_group_name", - [ - ("a73351f7-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblBacteria"), - ("a73351f7-93e7-11ec-a39d-005056b38ce3", "Internal", "Populars"), - ("65d4f21f-695a-4ed0-be67-5732a551fea4", "Division", "EnsemblVertebrates"), - ("a73357ab-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblPlants"), - ("a733574a-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblFungi"), - ("a73356e1-93e7-11ec-a39d-005056b38ce3", "Division", "EnsemblProtists") - ] - ) - def test_fetch_genomes_by_division_type_organism_group_type(self, test_dbs, genome_factory, genome_filters, - genome_uuid, organism_group_type, organism_group_name): - genome_filters['genome_uuid'] = [genome_uuid] - genome_filters['columns'] = [Genome.genome_uuid.label('genome_uuid'), - OrganismGroup.name.label('organism_group_name'), - OrganismGroup.type.label('organism_group_type'), - ] - genome_filters['dataset_status'] = ['Released'] - genome_filters['division'] = [organism_group_name] - genome_filters['organism_group_type'] = organism_group_type - - genome_factory_result = 
next(genome_factory.get_genomes(**genome_filters)) - - assert genome_factory_result['genome_uuid'] == genome_uuid - assert genome_factory_result['organism_group_name'] == organism_group_name - assert genome_factory_result['organism_group_type'] == organism_group_type - - @pytest.mark.parametrize( - "genome_uuid, organism_group_type, organism_group_name", - [ - ("63b4ffbf-0147-4aa7-b0af-7575bb822740", "Division", "EnsemblVertebrates"), - ] - ) - def test_fetch_genomes_not_assigned_to_organism_group_type(self, test_dbs, genome_factory, genome_filters, - genome_uuid, organism_group_type, organism_group_name): - - #remove genome from organism group and make sure it is not attached to any other organism group - metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) - args = Args( - metadata_db_uri=test_dbs['ensembl_genome_metadata'].dbc.url, - core_server_uri=None, - organism_group_type=organism_group_type, - organism_group_name=organism_group_name, - genome_uuid=[genome_uuid], - release_id=[], - remove=True, - raise_error=False - ) - - metadata_db = DBConnection(test_dbs['ensembl_genome_metadata'].dbc.url) - - with (metadata_db.session_scope() as session): - organism_group = session.query(OrganismGroup).filter( - OrganismGroup.name == args.organism_group_name, - OrganismGroup.type == args.organism_group_type - ).one_or_none() - - organism_group_id = organism_group.organism_group_id if organism_group else None - assert organism_group_id is not None - - # remove organism from organism group - process_genomes(session, args, organism_group_id=organism_group_id) - session.commit() - - # Check if the organism group removed - query = ( - session.query(Genome, Organism, OrganismGroup).join(Organism, Organism.organism_id == Genome.organism_id - ).join(OrganismGroupMember, - OrganismGroupMember.organism_id == Organism.organism_id - ).join(OrganismGroup, - OrganismGroup.organism_group_id == OrganismGroupMember.organism_group_id - ).filter( - 
Genome.genome_uuid.in_([args.genome_uuid]), - OrganismGroup.name == args.organism_group_name, - ) - ) - - assert query.count() == 0, "Organism group member should be removed and not assigned to any other organism group" - - genome_filters['genome_uuid'] = [genome_uuid] - genome_filters['columns'] = [Genome.genome_uuid.label('genome_uuid'), - OrganismGroup.name.label('organism_group_name'), - OrganismGroup.type.label('organism_group_type'), - ] - genome_filters['dataset_status'] = ['Processed'] - genome_filters['division'] = [] - genome_filters['organism_group_type'] = '' - genome_factory_result = next(genome_factory.get_genomes(**genome_filters)) - - assert genome_factory_result['genome_uuid'] == genome_uuid, "Genome UUID should match" - assert genome_factory_result['organism_group_name'] is None, "Organism group name should be None" - assert genome_factory_result['organism_group_type'] is None, "Organism group type should be None" From 74246f2143ba70ee0cb68269a6f3b317aec633fc Mon Sep 17 00:00:00 2001 From: vinay Date: Tue, 17 Jun 2025 18:35:50 +0100 Subject: [PATCH 7/7] Remove unwanted files --- src/ensembl/production/metadata/api/factories/genome_info.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/ensembl/production/metadata/api/factories/genome_info.json diff --git a/src/ensembl/production/metadata/api/factories/genome_info.json b/src/ensembl/production/metadata/api/factories/genome_info.json deleted file mode 100644 index e69de29b..00000000