From 996a9ee0343ae3f47c754e46afe7a5c54fee3aba Mon Sep 17 00:00:00 2001 From: Bilal Date: Mon, 18 Sep 2023 13:47:53 +0100 Subject: [PATCH 1/6] fetch unreleased data only if required internally (rest of the changes will be on gRPC service) --- src/ensembl/production/metadata/api/genome.py | 17 +++++++------- src/tests/test_api.py | 23 ++++++++++++++++++- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 5e5c92d2..5e6960f2 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -146,7 +146,11 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as # Apply additional filters based on the provided parameters if unreleased_only: - genome_select = genome_select.filter(Genome.genome_releases is None) + # this filter will get all Genome entries where there's no associated GenomeRelease + # the tilde (~) symbol is used for negation. + genome_select = genome_select.filter(~Genome.genome_releases.any()) + # since we are getting only unreleased data + current_only = False # These options are in order of decreasing specificity, # and thus the ones later in the list can be redundant. 
@@ -206,14 +210,9 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as if release_type is not None: genome_select = genome_select.filter(EnsemblRelease.release_type == release_type) - # get current_only only if release_version is Not specified - if release_version is None or release_version == 0: - # grab released and unreleased genomes if release_version is not specified - pass - elif release_version is not None and release_version > 0: + if release_version is not None and release_version > 0: genome_select = genome_select.filter(EnsemblRelease.version <= release_version) elif current_only: - # current_only will be executed only if none of the condition above are met genome_select = genome_select.filter(GenomeRelease.is_current == 1) # print(f"genome_select query ====> {str(genome_select)}") @@ -468,7 +467,9 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= genome_select = genome_select.filter(Dataset.dataset_uuid.in_(dataset_uuid)) if unreleased_datasets: - genome_select = genome_select.filter(GenomeDataset.ensembl_release is None) + # this filter will get all GenomeDataset entries where there's no associated EnsemblRelease + # the tilde (~) symbol is used for negation. 
+ genome_select = genome_select.filter(~GenomeDataset.ensembl_release.any()) if dataset_name is not None and "all" not in dataset_name: genome_select = genome_select.filter(DatasetType.name.in_(dataset_name)) diff --git a/src/tests/test_api.py b/src/tests/test_api.py index bbae36ae..3d1f2020 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -258,7 +258,28 @@ def test_fetch_genome_uuid(self, multi_dbs, ensembl_name, assembly_name, use_def test = conn.fetch_genomes( ensembl_name=ensembl_name, assembly_name=assembly_name, - use_default_assembly=use_default_assembly + use_default_assembly=use_default_assembly, + unreleased_only=False, + current_only=False + ) + assert len(test) == 1 + assert test[0].Genome.genome_uuid == expected_output + + @pytest.mark.parametrize( + "ensembl_name, assembly_name, use_default_assembly, expected_output", + [ + ("homo_sapiens", "GRCh38.p13", False, "a7335667-93e7-11ec-a39d-005056b38ce3"), + ("homo_sapiens", "GRCh38", True, "a7335667-93e7-11ec-a39d-005056b38ce3"), + ] + ) + def test_fetch_genome_uuid_is_current(self, multi_dbs, ensembl_name, assembly_name, use_default_assembly, expected_output): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes( + ensembl_name=ensembl_name, + assembly_name=assembly_name, + use_default_assembly=use_default_assembly, + unreleased_only=False ) assert len(test) == 1 assert test[0].Genome.genome_uuid == expected_output From c40893df5443389bc1c7ae63c48387a22d385b64 Mon Sep 17 00:00:00 2001 From: Bilal Date: Mon, 18 Sep 2023 14:34:46 +0100 Subject: [PATCH 2/6] fix minor test --- src/tests/test_updater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index ad33a7f8..46d32aba 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -71,7 +71,7 @@ def test_update_assembly(self, multi_dbs): 
taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') assert test_collect[1].Organism.scientific_name == 'carol_jabberwocky' - assert test_collect[1].Assembly.accession == 'weird02' + assert test_collect[1].Assembly.accession == 'weird01' # def test_update_geneset(self, multi_dbs): From c2aab5d74bb6a1210ac644482d4874f9907e32d5 Mon Sep 17 00:00:00 2001 From: Bilal Date: Wed, 20 Sep 2023 13:59:40 +0100 Subject: [PATCH 3/6] :construction: [WIP] replace unreleased_only with allow_unreleased and changes the logic in fetch_genomes() --- src/ensembl/production/metadata/api/genome.py | 112 +++++++++--------- .../api/sample/ensembl_metadata/assembly.txt | 1 + .../ensembl_metadata/assembly_sequence.txt | 3 +- .../api/sample/ensembl_metadata/attribute.txt | 3 +- .../api/sample/ensembl_metadata/dataset.txt | 1 + .../ensembl_metadata/dataset_attribute.txt | 1 + .../ensembl_metadata/dataset_source.txt | 1 + .../sample/ensembl_metadata/dataset_type.txt | 1 + .../api/sample/ensembl_metadata/genome.txt | 1 + .../ensembl_metadata/genome_dataset.txt | 1 + .../api/sample/ensembl_metadata/organism.txt | 1 + .../organism_group_member.txt | 3 +- src/tests/test_api.py | 24 +++- src/tests/test_updater.py | 2 +- 14 files changed, 94 insertions(+), 61 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 5e6960f2..6a1b6b66 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -78,7 +78,7 @@ def fetch_taxonomy_ids(self, taxonomy_names): def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, assembly_accession=None, assembly_name=None, use_default_assembly=False, ensembl_name=None, taxonomy_id=None, - group=None, group_type=None, unreleased_only=False, site_name=None, release_type=None, + group=None, group_type=None, allow_unreleased=False, site_name=None, release_type=None, 
release_version=None, current_only=True): """ Fetches genome information based on the specified parameters. @@ -94,7 +94,7 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as taxonomy_id (Union[int, List[int]]): The taxonomy ID(s) of the organism(s) to fetch. group (Union[str, List[str]]): The name(s) of the organism group(s) to filter by. group_type (Union[str, List[str]]): The type(s) of the organism group(s) to filter by. - unreleased_only (bool): Whether to fetch only genomes that have not been released (default: False). + allow_unreleased (bool): Whether to fetch unreleased genomes too or not (default: False). site_name (str): The name of the Ensembl site to filter by. release_type (str): The type of the Ensembl release to filter by. release_version (int): The maximum version of the Ensembl release to filter by. @@ -145,15 +145,6 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as .filter(OrganismGroup.type.in_(group_type)).filter(OrganismGroup.name.in_(group)) # Apply additional filters based on the provided parameters - if unreleased_only: - # this filter will get all Genome entries where there's no associated GenomeRelease - # the tilde (~) symbol is used for negation. - genome_select = genome_select.filter(~Genome.genome_releases.any()) - # since we are getting only unreleased data - current_only = False - - # These options are in order of decreasing specificity, - # and thus the ones later in the list can be redundant. 
if genome_id is not None: genome_select = genome_select.filter(Genome.genome_id.in_(genome_id)) @@ -186,78 +177,87 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as if taxonomy_id is not None: genome_select = genome_select.filter(Organism.taxonomy_id.in_(taxonomy_id)) - # Check if genome is released - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - # copy genome_select as we don't want to include GenomeDataset - # because it results in multiple row for a given genome (genome can have many datasets) - check_query = genome_select - prep_query = check_query.add_columns(GenomeDataset) \ - .join(GenomeDataset, Genome.genome_id == GenomeDataset.genome_id) \ - .filter(GenomeDataset.release_id.isnot(None)) - is_genome_released = session.execute(prep_query).first() + if not allow_unreleased: + # Check if genome is released + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + # copy genome_select as we don't want to include GenomeDataset + # because it results in multiple row for a given genome (genome can have many datasets) + check_query = genome_select + prep_query = check_query.add_columns(GenomeDataset) \ + .join(GenomeDataset, Genome.genome_id == GenomeDataset.genome_id) \ + .filter(GenomeDataset.release_id.isnot(None)) + is_genome_released = session.execute(prep_query).first() + + if is_genome_released: + # Include release related info if released_only is True + genome_select = genome_select.add_columns(GenomeRelease, EnsemblRelease, EnsemblSite) \ + .join(GenomeRelease, Genome.genome_id == GenomeRelease.genome_id) \ + .join(EnsemblRelease, GenomeRelease.release_id == EnsemblRelease.release_id) \ + .join(EnsemblSite, EnsemblSite.site_id == EnsemblRelease.site_id) - if is_genome_released: - # Include release related info if released_only is True - genome_select = genome_select.add_columns(GenomeRelease, EnsemblRelease, EnsemblSite) \ - .join(GenomeRelease, 
Genome.genome_id == GenomeRelease.genome_id) \ - .join(EnsemblRelease, GenomeRelease.release_id == EnsemblRelease.release_id) \ - .join(EnsemblSite, EnsemblSite.site_id == EnsemblRelease.site_id) + if release_version is not None and release_version > 0: + # if release is specified + genome_select = genome_select.filter(EnsemblRelease.version <= release_version) + elif current_only: + # else get current only + genome_select = genome_select.filter(GenomeRelease.is_current == 1) - if site_name is not None: - genome_select = genome_select.add_columns(EnsemblSite).filter(EnsemblSite.name == site_name) + if site_name is not None: + genome_select = genome_select.add_columns(EnsemblSite).filter(EnsemblSite.name == site_name) - if release_type is not None: - genome_select = genome_select.filter(EnsemblRelease.release_type == release_type) + if release_type is not None: + genome_select = genome_select.filter(EnsemblRelease.release_type == release_type) - if release_version is not None and release_version > 0: - genome_select = genome_select.filter(EnsemblRelease.version <= release_version) - elif current_only: - genome_select = genome_select.filter(GenomeRelease.is_current == 1) + else: + # since both allow_unreleased and is_genome_released are False + # don't include unreleased Genomes + # for some reason this breaks test_updater tests + genome_select = genome_select.filter(Genome.genome_releases.any()) # print(f"genome_select query ====> {str(genome_select)}") with self.metadata_db.session_scope() as session: session.expire_on_commit = False return session.execute(genome_select.order_by("ensembl_name")).all() - def fetch_genomes_by_genome_uuid(self, genome_uuid, unreleased_only=False, site_name=None, release_type=None, + def fetch_genomes_by_genome_uuid(self, genome_uuid, allow_unreleased=False, site_name=None, release_type=None, release_version=None, current_only=True): return self.fetch_genomes( genome_uuid=genome_uuid, - unreleased_only=unreleased_only, + 
allow_unreleased=allow_unreleased, site_name=site_name, release_type=release_type, release_version=release_version, current_only=current_only, ) - def fetch_genomes_by_assembly_accession(self, assembly_accession, unreleased_only=False, site_name=None, + def fetch_genomes_by_assembly_accession(self, assembly_accession, allow_unreleased=False, site_name=None, release_type=None, release_version=None, current_only=True): return self.fetch_genomes( assembly_accession=assembly_accession, - unreleased_only=unreleased_only, + allow_unreleased=allow_unreleased, site_name=site_name, release_type=release_type, release_version=release_version, current_only=current_only, ) - def fetch_genomes_by_ensembl_name(self, ensembl_name, unreleased_only=False, site_name=None, release_type=None, + def fetch_genomes_by_ensembl_name(self, ensembl_name, allow_unreleased=False, site_name=None, release_type=None, release_version=None, current_only=True): return self.fetch_genomes( ensembl_name=ensembl_name, - unreleased_only=unreleased_only, + allow_unreleased=allow_unreleased, site_name=site_name, release_type=release_type, release_version=release_version, current_only=current_only, ) - def fetch_genomes_by_taxonomy_id(self, taxonomy_id, unreleased_only=False, site_name=None, release_type=None, + def fetch_genomes_by_taxonomy_id(self, taxonomy_id, allow_unreleased=False, site_name=None, release_type=None, release_version=None, current_only=True): return self.fetch_genomes( taxonomy_id=taxonomy_id, - unreleased_only=unreleased_only, + allow_unreleased=allow_unreleased, site_name=site_name, release_type=release_type, release_version=release_version, @@ -267,7 +267,7 @@ def fetch_genomes_by_taxonomy_id(self, taxonomy_id, unreleased_only=False, site_ def fetch_genomes_by_scientific_name( self, scientific_name, - unreleased_only=False, + allow_unreleased=False, site_name=None, release_type=None, release_version=None, @@ -277,7 +277,7 @@ def fetch_genomes_by_scientific_name( return 
self.fetch_genomes_by_taxonomy_id( taxonomy_ids, - unreleased_only=unreleased_only, + allow_unreleased=allow_unreleased, site_name=site_name, release_type=release_type, release_version=release_version, @@ -469,9 +469,16 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= if unreleased_datasets: # this filter will get all GenomeDataset entries where there's no associated EnsemblRelease # the tilde (~) symbol is used for negation. - genome_select = genome_select.filter(~GenomeDataset.ensembl_release.any()) - - if dataset_name is not None and "all" not in dataset_name: + genome_select = genome_select.filter(~GenomeDataset.ensembl_release.has()) + + if "all" in dataset_name: + # TODO: fetch the list dynamically from the DB + dataset_type_names = [ + 'assembly', 'genebuild', 'variation', 'evidence', + 'regulation_build', 'homologies', 'regulatory_features' + ] + genome_select = genome_select.filter(DatasetType.name.in_(dataset_type_names)) + else: genome_select = genome_select.filter(DatasetType.name.in_(dataset_name)) if dataset_source is not None: @@ -486,13 +493,12 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= # is not bound to a Session; attribute refresh operation cannot proceed # (Background on this error at: https://sqlalche.me/e/14/bhk3) session.expire_on_commit = False - # copy genome_select as we don't want to include GenomeDataset - # because it results in multiple row for a given genome (genome can have many datasets) - prep_query = genome_select.filter(GenomeDataset.ensembl_release is not None) + # Check if GenomeDataset HAS an ensembl_release + prep_query = genome_select.filter(GenomeDataset.ensembl_release.has()) is_genome_released = session.execute(prep_query).first() if is_genome_released: - # Include release related info if released_only is True + # Include release related info genome_select = genome_select.add_columns(EnsemblRelease) \ .join(EnsemblRelease, GenomeDataset.release_id 
== EnsemblRelease.release_id) @@ -539,7 +545,7 @@ def fetch_genomes_info( genomes = self.fetch_genomes( genome_id=genome_id, genome_uuid=genome_uuid, - unreleased_only=unreleased_genomes, + allow_unreleased=unreleased_genomes, ensembl_name=ensembl_name, group=group, group_type=group_type, diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt index bdd05df1..889af4d0 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt @@ -6,3 +6,4 @@ 6 7e8ed3a8-d724-4cba-92e1-e968719b7a18 \N GCA_000146045.2 chromosome R64-1-1 \N R64-1-1 \N 2023-05-12 13:32:46 R64-1-1 \N 0 \N 7 f7de35c9-e0e8-4e81-b186-2962098d6361 \N GCA_000002985.3 chromosome WBcel235 \N WBcel235 \N 2023-05-12 13:32:52 WBcel235 \N 0 \N 8 eeaaa233-151c-4848-8b85-a05a9993101e \N GCA_000001499.28 chromosome GRCh38 t2t \N GRCh38 t2t \N 2023-09-07 14:30:58 GRCh38_t2t \N 1 \N +9 34372aad-5bb1-4304-8a13-28cb4afc601e \N GCA_000001735.1 chromosome TAIR10 \N TAIR10 \N 2023-08-18 12:22:34 TAIR10 \N 1 \N diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly_sequence.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly_sequence.txt index fa7a99c8..e39b129a 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly_sequence.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly_sequence.txt @@ -2021,4 +2021,5 @@ 2021 IV 7 BX284604.4 1 17493829 \N SO:0000738 \N \N 2022 V 7 BX284605.5 1 20924180 \N SO:0000738 \N \N 2023 X 7 BX284606.5 1 17718942 \N SO:0000738 \N \N -2024 MtDNA 7 X54252.1 1 13794 \N SO:0000737 \N \N \ No newline at end of file +2024 MtDNA 7 X54252.1 1 13794 \N SO:0000737 \N \N +2025 Mt 6 Mt 1 366924 \N SO:0000737 \N \N \ No newline at end of file diff --git 
a/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt index 20eeffd7..3ff5c650 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt @@ -69,4 +69,5 @@ 69 genebuild.last_geneset_update last_geneset_update last_geneset_update string 70 genebuild.version version version string 71 sample.gene_param sample.gene_param sample.gene_param string -72 sample.location_param sample.location_param sample.location_param string \ No newline at end of file +72 sample.location_param sample.location_param sample.location_param string +73 assembly.date assembly.date assembly.date string \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt index 5ae528a0..adba2146 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset.txt @@ -30,4 +30,5 @@ 42 ea044d8e-33f1-4c9f-9b9f-8c0bd1dcf642 6 homologies \N 2023-06-02 13:32:52 11 Manual Add Submitted 44 feaa37ea-4217-4d9d-afca-599bdae11b36 2 genebuild \N 2023-09-12 13:32:52.00 7 2023-05 Submitted 45 feaa37ea-4217-4d9d-afca-600bdae11b36 1 asssembly \N 2023-09-12 13:32:52.00 7 2023-05 Submitted +46 385f1ec2-bd06-40ce-873a-98e199f10505 1 asssembly \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_attribute.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_attribute.txt index 188c3b52..d0b4c4fe 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_attribute.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_attribute.txt @@ -398,3 +398,4 @@ 339 57.7 56 
38 399 17461 55 42 400 87.4 56 42 +401 2008-04 73 46 diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_source.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_source.txt index 72ea933a..7d869c08 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_source.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_source.txt @@ -8,3 +8,4 @@ 10 variation gathered_data_1 11 compara gathered_data_2 12 regulation gathered_data_3 +13 core arabidopsis_thaliana_core_57_110_11 diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt index d93d7437..de9b64ef 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/dataset_type.txt @@ -4,3 +4,4 @@ 4 evidence Variation Evidence Variation Annotation \N \N 5 regulation_build Regulations Regulatory Annotation \N \N 6 homologies Comparative homologies Comparative Annotation \N \N +7 regulatory_features Regulations Regulatory Annotation \N \N diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt index df30f6e6..9dbe31e4 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt @@ -6,4 +6,5 @@ 6 a733574a-93e7-11ec-a39d-005056b38ce3 6 5 2023-05-12 13:32:46 7 a733550b-93e7-11ec-a39d-005056b38ce3 7 6 2023-05-12 13:32:52 8 a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58 +9 90720316-006c-470b-a7dd-82d28f952264 9 8 2023-08-18 12:22:34 diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt index 
a17cd5b7..49ae11da 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_dataset.txt @@ -30,3 +30,4 @@ 56 42 7 1 1 57 44 7 \N 0 58 45 7 \N 0 +59 46 9 \N 0 diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt index 27fcb47f..0fec9adb 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt @@ -5,3 +5,4 @@ 5 0dc46f87-0b61-403a-8cd3-86b7e0cce8f0 559292 4932 Saccharomyces cerevisiae S288C Saccharomyces cerevisiae S288c Saccharomyces_cerevisiae saccharomyces_cerevisiae \N 6 0f4aad7b-db15-4a72-af1e-82bbae54226 6239 6239 Caenorhabditis elegans (PRJNA13758) N2 Caenorhabditis elegans Caenorhabditis_elegans caenorhabditis_elegans \N 7 dbbsaf09-2db8-429b-a407-c15a4ca2876d 9606 9606 Human T2T \N Homo sapiens Homo_sapiens homo_sapiens_t2t \N +8 02b934c5-83af-4b3c-9fc1-5a0f01823396 3702 3702 thale-cress \N Arabidopsis thaliana arabidopsis_thaliana arabidopsis_thaliana \N diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt index ddf92f9e..a462a349 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt @@ -9,4 +9,5 @@ 9 0 3 7 3 10 0 4 7 4 11 0 5 7 5 -12 0 6 7 6 \ No newline at end of file +12 0 6 7 6 +13 0 8 4 1 \ No newline at end of file diff --git a/src/tests/test_api.py b/src/tests/test_api.py index 3d1f2020..c51de97d 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -43,7 +43,7 @@ def fetch_with_all_args_no_conflict(self, multi_dbs): ensembl_name="caenorhabditis_elegans", 
taxonomy_id="6239", group="EnsemblMetazoa", - unreleased_only=False, + allow_unreleased=False, site_name="Ensembl", release_type="integrated", release_version="108.0", @@ -60,7 +60,7 @@ def fetch_with_all_args_conflict(self, multi_dbs): ensembl_name="caenorhabditis_elegans", taxonomy_id="9606", # Conflicting taxonomy_id group="EnsemblBacteria", # Conflicting group - unreleased_only=False, + allow_unreleased=False, site_name="Ensembl", release_type="integrated", release_version="108.0", @@ -259,7 +259,7 @@ def test_fetch_genome_uuid(self, multi_dbs, ensembl_name, assembly_name, use_def ensembl_name=ensembl_name, assembly_name=assembly_name, use_default_assembly=use_default_assembly, - unreleased_only=False, + allow_unreleased=False, current_only=False ) assert len(test) == 1 @@ -279,7 +279,7 @@ def test_fetch_genome_uuid_is_current(self, multi_dbs, ensembl_name, assembly_na ensembl_name=ensembl_name, assembly_name=assembly_name, use_default_assembly=use_default_assembly, - unreleased_only=False + allow_unreleased=False ) assert len(test) == 1 assert test[0].Genome.genome_uuid == expected_output @@ -312,3 +312,19 @@ def test_popular_species(self, multi_dbs): for data in test[1:]: # All others have only one genome in test DB assert data[5] == 1 + + # def test_fetch_genomes_info(self, multi_dbs): + # conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + # taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + # test = conn.fetch_genomes_info( + # unreleased_genomes=True, + # # unreleased_datasets=True, + # # dataset_name="all" + # ) + # print(f"len(test) ===> {len(list(test))}") + # # print(f"type(test) ===> {type(test)}") + # for test_res in test: + # print(f"test_res ===> {test_res}") + # # print(f"Genome.genome_uuid ===> {test_res[0]['genome'].Genome.genome_uuid}") + + diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 46d32aba..ad33a7f8 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -71,7 +71,7 @@ 
def test_update_assembly(self, multi_dbs): taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') assert test_collect[1].Organism.scientific_name == 'carol_jabberwocky' - assert test_collect[1].Assembly.accession == 'weird01' + assert test_collect[1].Assembly.accession == 'weird02' # def test_update_geneset(self, multi_dbs): From 55938163eddc9a5ae83ae103480e2ba2688f04d5 Mon Sep 17 00:00:00 2001 From: Bilal Date: Wed, 20 Sep 2023 22:14:12 +0100 Subject: [PATCH 4/6] update fetch_genome_datasets() fetching releases logic --- src/ensembl/production/metadata/api/genome.py | 62 ++++++++++--------- src/tests/test_api.py | 33 ++++++---- 2 files changed, 53 insertions(+), 42 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 6a1b6b66..8e176249 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -199,8 +199,9 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as if release_version is not None and release_version > 0: # if release is specified genome_select = genome_select.filter(EnsemblRelease.version <= release_version) - elif current_only: - # else get current only + current_only = False + + if current_only: genome_select = genome_select.filter(GenomeRelease.is_current == 1) if site_name is not None: @@ -212,7 +213,7 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as else: # since both allow_unreleased and is_genome_released are False # don't include unreleased Genomes - # for some reason this breaks test_updater tests + # TODO: for some reason this breaks test_updater tests genome_select = genome_select.filter(Genome.genome_releases.any()) # print(f"genome_select query ====> {str(genome_select)}") @@ -388,7 +389,7 @@ def fetch_sequences_by_assembly_accession( assembly_accession=assembly_accession, 
chromosomal_only=chromosomal_only ) - def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid=None, unreleased_datasets=False, + def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid=None, allow_unreleased=False, dataset_uuid=None, dataset_name=None, dataset_source=None, dataset_type=None, release_version=None): """ @@ -398,7 +399,7 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= genome_id (int or list or None): Genome ID(s) to filter by. genome_uuid (str or list or None): Genome UUID(s) to filter by. organism_uuid (str or list or None): Organism UUID(s) to filter by. - unreleased_datasets (bool): Flag indicating whether to fetch only unreleased datasets. + allow_unreleased (bool): Flag indicating whether to allowing fetching unreleased datasets too or not. dataset_uuid (str or list or None): Dataset UUID(s) to filter by. dataset_name (str or None): Dataset name to filter by, default is 'assembly'. dataset_source (str or None): Dataset source to filter by. @@ -466,11 +467,6 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= if dataset_uuid is not None: genome_select = genome_select.filter(Dataset.dataset_uuid.in_(dataset_uuid)) - if unreleased_datasets: - # this filter will get all GenomeDataset entries where there's no associated EnsemblRelease - # the tilde (~) symbol is used for negation. 
- genome_select = genome_select.filter(~GenomeDataset.ensembl_release.has()) - if "all" in dataset_name: # TODO: fetch the list dynamically from the DB dataset_type_names = [ @@ -487,28 +483,36 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= if dataset_type is not None: genome_select = genome_select.filter(DatasetType.name.in_(dataset_type)) - with self.metadata_db.session_scope() as session: - # This is needed in order to ovoid tests throwing: - # sqlalchemy.orm.exc.DetachedInstanceError: Instance - # is not bound to a Session; attribute refresh operation cannot proceed - # (Background on this error at: https://sqlalche.me/e/14/bhk3) - session.expire_on_commit = False - # Check if GenomeDataset HAS an ensembl_release - prep_query = genome_select.filter(GenomeDataset.ensembl_release.has()) - is_genome_released = session.execute(prep_query).first() + if not allow_unreleased: # Get released datasets only + # Check if dataset is released + with self.metadata_db.session_scope() as session: + # This is needed in order to ovoid tests throwing: + # sqlalchemy.orm.exc.DetachedInstanceError: Instance + # is not bound to a Session; attribute refresh operation cannot proceed + # (Background on this error at: https://sqlalche.me/e/14/bhk3) + session.expire_on_commit = False + # Check if GenomeDataset HAS an ensembl_release + prep_query = genome_select.filter(GenomeDataset.ensembl_release.has()) + is_dataset_released = session.execute(prep_query).first() - if is_genome_released: + if is_dataset_released: # Include release related info genome_select = genome_select.add_columns(EnsemblRelease) \ .join(EnsemblRelease, GenomeDataset.release_id == EnsemblRelease.release_id) - if release_version: - genome_select = genome_select.filter(EnsemblRelease.version <= release_version) + if release_version: + genome_select = genome_select.filter(EnsemblRelease.version <= release_version) - # print(f"genome_select str ====> {str(genome_select)}") - 
logger.debug(genome_select) + else: + # since both allow_unreleased and is_dataset_released are False + # don't include unreleased Datasets + genome_select = genome_select.filter(GenomeDataset.ensembl_release.has()) - return session.execute(genome_select).all() + # print(f"genome_select str ====> {str(genome_select)}") + logger.debug(genome_select) + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + return session.execute(genome_select).all() except Exception as e: raise ValueError(str(e)) @@ -517,11 +521,11 @@ def fetch_genomes_info( self, genome_id=None, genome_uuid=None, - unreleased_genomes=False, + allow_unreleased_genomes=False, ensembl_name=None, group=None, group_type=None, - unreleased_datasets=False, + allow_unreleased_datasets=False, dataset_name=None, dataset_source=None ): @@ -545,7 +549,7 @@ def fetch_genomes_info( genomes = self.fetch_genomes( genome_id=genome_id, genome_uuid=genome_uuid, - allow_unreleased=unreleased_genomes, + allow_unreleased=allow_unreleased_genomes, ensembl_name=ensembl_name, group=group, group_type=group_type, @@ -554,7 +558,7 @@ def fetch_genomes_info( for genome in genomes: dataset = self.fetch_genome_datasets( genome_uuid=genome[0].genome_uuid, - unreleased_datasets=unreleased_datasets, + allow_unreleased=allow_unreleased_datasets, dataset_name=dataset_name, dataset_source=dataset_source ) diff --git a/src/tests/test_api.py b/src/tests/test_api.py index c51de97d..ca74a8d3 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -313,18 +313,25 @@ def test_popular_species(self, multi_dbs): # All others have only one genome in test DB assert data[5] == 1 - # def test_fetch_genomes_info(self, multi_dbs): - # conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - # taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - # test = conn.fetch_genomes_info( - # unreleased_genomes=True, - # # unreleased_datasets=True, - # # dataset_name="all" - # ) - # print(f"len(test) 
===> {len(list(test))}") - # # print(f"type(test) ===> {type(test)}") - # for test_res in test: - # print(f"test_res ===> {test_res}") - # # print(f"Genome.genome_uuid ===> {test_res[0]['genome'].Genome.genome_uuid}") + @pytest.mark.parametrize( + "allow_unreleased, output_count, expected_genome_uuid", + [ + # fetches everything + (True, 9, "90720316-006c-470b-a7dd-82d28f952264"), + # fetches released datasets and genomes with but current_only=1 (default) + (False, 6, "a733550b-93e7-11ec-a39d-005056b38ce3"), + ] + ) + def test_fetch_genomes_info(self, multi_dbs, allow_unreleased, output_count, expected_genome_uuid): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes_info( + allow_unreleased_genomes=allow_unreleased, + allow_unreleased_datasets=allow_unreleased, + group_type=['division', 'internal'] + ) + output_to_list = list(test) + assert len(output_to_list) == output_count + assert output_to_list[0][0]['genome'].Genome.genome_uuid == expected_genome_uuid From 967ead165e06871a1d63a1f0906e92880e9be349 Mon Sep 17 00:00:00 2001 From: Bilal Date: Wed, 20 Sep 2023 23:01:29 +0100 Subject: [PATCH 5/6] fix typo (Tab) --- src/ensembl/production/metadata/api/genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 405a7773..1fee2704 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -514,7 +514,7 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= logger.debug(genome_select) with self.metadata_db.session_scope() as session: session.expire_on_commit = False - return session.execute(genome_select).all() + return session.execute(genome_select).all() except Exception as e: raise ValueError(str(e)) From 461a95f0a3b4cd8d37c3e6cd91d32888e529d55d Mon Sep 17 00:00:00 
2001 From: Bilal Date: Thu, 21 Sep 2023 16:08:00 +0100 Subject: [PATCH 6/6] include unreleased_only scenario, improve, fix and add tests --- src/ensembl/production/metadata/api/genome.py | 48 +++++--- src/tests/test_api.py | 107 ++++++++++-------- 2 files changed, 90 insertions(+), 65 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 1fee2704..25a99c98 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -78,8 +78,8 @@ def fetch_taxonomy_ids(self, taxonomy_names): def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, assembly_accession=None, assembly_name=None, use_default_assembly=False, ensembl_name=None, taxonomy_id=None, - group=None, group_type=None, allow_unreleased=False, site_name=None, release_type=None, - release_version=None, current_only=True): + group=None, group_type=None, allow_unreleased=False, unreleased_only=False, site_name=None, + release_type=None, release_version=None, current_only=True): """ Fetches genome information based on the specified parameters. @@ -95,6 +95,9 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as group (Union[str, List[str]]): The name(s) of the organism group(s) to filter by. group_type (Union[str, List[str]]): The type(s) of the organism group(s) to filter by. allow_unreleased (bool): Whether to fetch unreleased genomes too or not (default: False). + unreleased_only (bool): Fetch only unreleased genomes (default: False). allow_unreleased is used by gRPC + to fetch both released and unreleased genomes, while unreleased_only + is used in production pipelines (fetches only unreleased genomes) site_name (str): The name of the Ensembl site to filter by. release_type (str): The type of the Ensembl release to filter by. release_version (int): The maximum version of the Ensembl release to filter by. 
@@ -177,8 +180,18 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as if taxonomy_id is not None: genome_select = genome_select.filter(Organism.taxonomy_id.in_(taxonomy_id)) - if not allow_unreleased: + if allow_unreleased: + # fetch everything (released + unreleased) + pass + elif unreleased_only: + # fetch unreleased only + # this filter will get all Genome entries where there's no associated GenomeRelease + # the tilde (~) symbol is used for negation. + genome_select = genome_select.filter(~Genome.genome_releases.any()) + else: + # fetch released only # Check if genome is released + # TODO: why did I add this check?! -> removing this breaks the test_update tests with self.metadata_db.session_scope() as session: session.expire_on_commit = False # copy genome_select as we don't want to include GenomeDataset @@ -210,12 +223,6 @@ def fetch_genomes(self, genome_id=None, genome_uuid=None, organism_uuid=None, as if release_type is not None: genome_select = genome_select.filter(EnsemblRelease.release_type == release_type) - else: - # since both allow_unreleased and is_genome_released are False - # don't include unreleased Genomes - # TODO: for some reason this breaks test_updater tests - genome_select = genome_select.filter(Genome.genome_releases.any()) - # print(f"genome_select query ====> {str(genome_select)}") with self.metadata_db.session_scope() as session: session.expire_on_commit = False @@ -390,8 +397,8 @@ def fetch_sequences_by_assembly_accession( ) def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid=None, allow_unreleased=False, - dataset_uuid=None, dataset_name=None, dataset_source=None, dataset_type=None, - release_version=None, dataset_attributes=None): + unreleased_only=False, dataset_uuid=None, dataset_name=None, dataset_source=None, + dataset_type=None, release_version=None, dataset_attributes=None): """ Fetches genome datasets based on the provided parameters. 
@@ -400,6 +407,9 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= genome_uuid (str or list or None): Genome UUID(s) to filter by. organism_uuid (str or list or None): Organism UUID(s) to filter by. allow_unreleased (bool): Flag indicating whether to allowing fetching unreleased datasets too or not. + unreleased_only (bool): Fetch only unreleased datasets (default: False). allow_unreleased is used by gRPC + to fetch both released and unreleased datasets, while unreleased_only + is used in production pipelines (fetches only unreleased datasets) dataset_uuid (str or list or None): Dataset UUID(s) to filter by. dataset_name (str or None): Dataset name to filter by, default is 'assembly'. dataset_source (str or None): Dataset source to filter by. @@ -485,7 +495,16 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= .join(DatasetAttribute, DatasetAttribute.dataset_id == Dataset.dataset_id) \ .join(Attribute, Attribute.attribute_id == DatasetAttribute.attribute_id) - if not allow_unreleased: # Get released datasets only + if allow_unreleased: + # Get everything + pass + elif unreleased_only: + # Get only unreleased datasets + # this filter will get all GenomeDataset entries where there's no associated EnsemblRelease + # the tilde (~) symbol is used for negation. 
+ genome_select = genome_select.filter(~GenomeDataset.ensembl_release.has()) + else: + # Get released datasets only # Check if dataset is released with self.metadata_db.session_scope() as session: # This is needed in order to ovoid tests throwing: @@ -505,11 +524,6 @@ def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid= if release_version: genome_select = genome_select.filter(EnsemblRelease.version <= release_version) - else: - # since both allow_unreleased and is_dataset_released are False - # don't include unreleased Datasets - genome_select = genome_select.filter(GenomeDataset.ensembl_release.has()) - # print(f"genome_select str ====> {str(genome_select)}") logger.debug(genome_select) with self.metadata_db.session_scope() as session: diff --git a/src/tests/test_api.py b/src/tests/test_api.py index ca74a8d3..8c18272d 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -29,13 +29,32 @@ def test_load_database(self, multi_dbs): db_test = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) assert db_test, "DB should not be empty" - def fetch_all_genomes(self, multi_dbs): - conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) - test = conn.fetch_genomes() - assert len(test) == 7 + @pytest.mark.parametrize( + "allow_unreleased, unreleased_only, current_only, output_count", + [ + # fetches everything (7 released + 2 unreleased) + (True, False, True, 9), + # fetches all released genomes (with current_only=0) + (False, False, False, 7), + # fetches released genomes with current_only=1 (default) + (False, False, True, 6), + # fetches all unreleased genomes + (False, True, True, 2), + ] + ) + def test_fetch_all_genomes(self, multi_dbs, allow_unreleased, unreleased_only, current_only, output_count): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes( + allow_unreleased=allow_unreleased, + unreleased_only=unreleased_only, + 
current_only=current_only + ) + assert len(test) == output_count - def fetch_with_all_args_no_conflict(self, multi_dbs): - conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) + def test_fetch_with_all_args_no_conflict(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test = conn.fetch_genomes( genome_uuid="a733550b-93e7-11ec-a39d-005056b38ce3", assembly_accession="GCA_000002985.3", @@ -51,8 +70,9 @@ def fetch_with_all_args_no_conflict(self, multi_dbs): ) assert len(test) == 0 - def fetch_with_all_args_conflict(self, multi_dbs): - conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) + def test_fetch_with_all_args_conflict(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test = conn.fetch_genomes( genome_uuid="a733550b-93e7-11ec-a39d-005056b38ce3", assembly_accession="GCA_000002985.3", @@ -66,7 +86,7 @@ def fetch_with_all_args_conflict(self, multi_dbs): release_version="108.0", current_only=True ) - assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' + assert len(test) == 0 def test_fetch_releases(self, multi_dbs): conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) @@ -106,7 +126,6 @@ def test_fetch_genomes(self, multi_dbs): test = conn.fetch_genomes(genome_uuid='a7335667-93e7-11ec-a39d-005056b38ce3') assert test[0].Organism.scientific_name == 'Homo sapiens' - # def test_fetch_genomes_by_group_division(self, multi_dbs): # conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, # taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) @@ -116,7 +135,6 @@ def test_fetch_genomes(self, multi_dbs): # Other PR will likely change this drastically, so the effort is not really necessary. Their are 7 groups. 
# assert division_filter in division_results - def test_fetch_genomes_by_genome_uuid(self, multi_dbs): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) @@ -206,44 +224,37 @@ def test_fetch_sequences_chromosomal_only(self, multi_dbs): ) assert test[-1].AssemblySequence.chromosomal == 1 - def test_fetch_genome_dataset_default_topic_assembly(self, multi_dbs): - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genome_datasets(genome_uuid='a73357ab-93e7-11ec-a39d-005056b38ce3') - assert test[0].DatasetType.topic == 'Core Annotation' - - def test_fetch_genome_dataset_uuid(self, multi_dbs): - uuid = '0dc05c6e-2910-4dbd-879a-719ba97d5824' - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genome_datasets(dataset_uuid=uuid, dataset_name='genebuild') - assert test[0].Dataset.dataset_uuid == uuid - - def test_fetch_genome_dataset_genome_uuid(self, multi_dbs): - uuid = 'a73357ab-93e7-11ec-a39d-005056b38ce3' - conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genome_datasets(genome_uuid=uuid) - assert test[0].Genome.genome_uuid == uuid - - def test_fetch_genome_datasets(self, multi_dbs): + @pytest.mark.parametrize( + "genome_uuid, dataset_uuid, allow_unreleased, unreleased_only, expected_dataset_uuid, expected_count", + [ + # nothing specified + allow_unreleased -> fetches everything + (None, None, True, False, "559d7660-d92d-47e1-924e-e741151c2cef", 33), + # specifying genome_uuid + ("a73357ab-93e7-11ec-a39d-005056b38ce3", None, False, False, "b4ff55e3-d06a-4772-bb13-81c3207669e3", 5), + # specifying dataset_uuid + (None, "0dc05c6e-2910-4dbd-879a-719ba97d5824", False, False, "0dc05c6e-2910-4dbd-879a-719ba97d5824", 1), + # 
fetch unreleased datasets only + (None, None, False, True, "feaa37ea-4217-4d9d-afca-600bdae11b36", 3), + ] + ) + def test_fetch_genome_dataset_all( + self, multi_dbs, genome_uuid, + dataset_uuid, allow_unreleased, + unreleased_only, expected_dataset_uuid, + expected_count + ): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_genome_datasets() - assert test[0].Dataset.dataset_uuid == '559d7660-d92d-47e1-924e-e741151c2cef' - assert test[0].DatasetType.name == 'assembly' - - # TODO: fix it, there are no unreleased datasets (add one?) - # def test_fetch_genome_datasets_unreleased(self, multi_dbs): - # conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, - # taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - # test = conn.fetch_genome_datasets( - # dataset_name="all", - # unreleased_datasets=True - # ) - # print(f"test ===> {test}") - # assert test[0].GenomeDataset.release_id is None - # assert test[0].GenomeDataset.is_current == 0 + test = conn.fetch_genome_datasets( + genome_uuid=genome_uuid, + dataset_uuid=dataset_uuid, + unreleased_only=unreleased_only, + allow_unreleased=allow_unreleased, + # fetch all datasets (default: dataset_name="assembly") + dataset_name="all" + ) + assert test[0].Dataset.dataset_uuid == expected_dataset_uuid + assert len(test) == expected_count @pytest.mark.parametrize( "ensembl_name, assembly_name, use_default_assembly, expected_output", @@ -318,7 +329,7 @@ def test_popular_species(self, multi_dbs): [ # fetches everything (True, 9, "90720316-006c-470b-a7dd-82d28f952264"), - # fetches released datasets and genomes with but current_only=1 (default) + # fetches released datasets and genomes with current_only=1 (default) (False, 6, "a733550b-93e7-11ec-a39d-005056b38ce3"), ] )