From 96182a33bdcee130c0404a5cf5295c0b3dba44e7 Mon Sep 17 00:00:00 2001 From: Marc Chakiachvili Date: Thu, 7 Sep 2023 16:05:02 +0100 Subject: [PATCH 1/3] First version of get_popular_organism (list + counts) --- src/ensembl/production/metadata/api/genome.py | 43 +++++++++++++++++++ .../api/sample/ensembl_metadata/assembly.txt | 1 + .../api/sample/ensembl_metadata/genome.txt | 2 + .../api/sample/ensembl_metadata/organism.txt | 1 + .../ensembl_metadata/organism_group.txt | 3 +- .../organism_group_member.txt | 8 +++- src/tests/test_api.py | 12 ++++++ 7 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 8defa4eb..063794b0 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -340,3 +340,46 @@ def fetch_genomes_info( yield [{'genome': genome, 'datasets': dataset}] except Exception as e: raise ValueError(str(e)) + + def get_popular_organisms_counts(self, release_version=None): + from ensembl.production.metadata.api.models.organism import OrganismGroup, Organism, OrganismGroupMember + from sqlalchemy.orm import aliased + from sqlalchemy import func + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + o_species = aliased(Organism) + o = aliased(Organism) + if not release_version: + # Get latest released organisms + query = session.query( + o_species.species_taxonomy_id, + o_species.ensembl_name, + o_species.common_name, + o_species.scientific_name, + OrganismGroupMember.order.label('order'), + func.count().label('count') + ) + + query = query.join(o, o_species.species_taxonomy_id == o.species_taxonomy_id) + query = query.join(Genome, o.organism_id == Genome.organism_id) + query = query.join(Assembly, Genome.assembly_id == Assembly.assembly_id) + query = query.join(OrganismGroupMember, o_species.organism_id == OrganismGroupMember.organism_id) + query = query.join(OrganismGroup, + OrganismGroupMember.organism_group_id == OrganismGroup.organism_group_id) + query = query.filter(OrganismGroup.code == 'popular') + query = query.group_by( + o_species.species_taxonomy_id, + o_species.ensembl_name, + o_species.common_name, + o_species.scientific_name, + OrganismGroupMember.order + ) + query = query.order_by(OrganismGroupMember.order) + # TODO check if we should return a dictionary instead + return query.all() + else: + # change group to release_version_state and related genomes + raise NotImplementedError('Not implemented yet') + pass + + diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt index 57eb3a87..009a9a71 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/assembly.txt @@ -5,3 +5,4 @@ 5 ec1c4b53-c2ef-431c-ad0e-b2aef19b44f1 \N GCA_900519105.1 chromosome \N IWGSC \N IWGSC \N 2023-05-12 13:32:36 IWGSC \N 0 \N 6 7e8ed3a8-d724-4cba-92e1-e968719b7a18 \N GCA_000146045.2 chromosome \N R64-1-1 \N R64-1-1 \N 2023-05-12 13:32:46 R64-1-1 \N 0 \N 7 f7de35c9-e0e8-4e81-b186-2962098d6361 \N GCA_000002985.3 chromosome \N WBcel235 \N WBcel235 \N 2023-05-12 13:32:52 WBcel235 \N 0 \N +8 eeaaa233-151c-4848-8b85-a05a9993101e \N GCA_000001499.28 chromosome 1 GRCh38 t2t \N GRCh38 t2t \N 2023-09-07 14:30:58 GRCh38_t2t \N 1 \N diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt index 662a6680..df30f6e6 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt @@ -5,3 +5,5 @@ 5 a73357ab-93e7-11ec-a39d-005056b38ce3 5 4 2023-05-12 13:32:36 6 a733574a-93e7-11ec-a39d-005056b38ce3 6 5 2023-05-12 13:32:46 7 a733550b-93e7-11ec-a39d-005056b38ce3 7 6 2023-05-12 13:32:52 +8 a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58 + diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt index e9e23d73..27fcb47f 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt @@ -4,3 +4,4 @@ 4 d64c34ca-b37a-476b-83b5-f21d07a3ae67 4565 4565 Triticum aestivum reference (Chinese spring) Triticum aestivum Triticum_aestivum triticum_aestivum \N 5 0dc46f87-0b61-403a-8cd3-86b7e0cce8f0 559292 4932 Saccharomyces cerevisiae S288C Saccharomyces cerevisiae S288c Saccharomyces_cerevisiae saccharomyces_cerevisiae \N 6 0f4aad7b-db15-4a72-af1e-82bbae54226 6239 6239 Caenorhabditis elegans (PRJNA13758) N2 Caenorhabditis elegans Caenorhabditis_elegans caenorhabditis_elegans \N +7 dbbsaf09-2db8-429b-a407-c15a4ca2876d 9606 9606 Human T2T \N Homo sapiens Homo_sapiens homo_sapiens_t2t \N diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group.txt index 8c85f340..9a8d8aa9 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group.txt @@ -3,4 +3,5 @@ 3 Division EnsemblProtists protists 4 Division EnsemblPlants plants 5 Division EnsemblFungi fungi -6 Division EnsemblMetazoa metazoa \ No newline at end of file +6 Division EnsemblMetazoa metazoa +7 Internal Popular Species popular \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt index b105589e..ddf92f9e 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt @@ -3,4 +3,10 @@ 3 0 \N 3 3 4 0 \N 4 4 5 0 \N 5 5 -6 0 \N 6 6 \ No newline at end of file +6 0 \N 6 6 +7 0 1 7 1 +8 0 2 7 2 +9 0 3 7 3 +10 0 4 7 4 +11 0 5 7 5 +12 0 6 7 6 \ No newline at end of file diff --git a/src/tests/test_api.py b/src/tests/test_api.py index 121b6a9a..267659ab 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -162,3 +162,15 @@ def test_fetch_genome_info(self, multi_dbs): # test = conn.fetch_genomes_info(genome_uuid=uuid) # assert test['genome'][0].genome_uuid == uuid # assert test['datasets'][0][0].genome_uuid == uuid + + def test_popular_species(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.get_popular_organisms_counts() + # First result should be Human + assert test[0][2] == 'Human' + # We should have three assemblies associated with Human (Two for grch37.38 organism + one t2t) + assert test[0][5] == 3 + for data in test[1:]: + # All others have only one genome in test DB + assert data[5] == 1 From 1c8e9ed52537c1dfcfe04de0cee7876bd218bf54 Mon Sep 17 00:00:00 2001 From: Marc Chakiachvili Date: Mon, 11 Sep 2023 09:32:17 +0100 Subject: [PATCH 2/3] updates following comments on PR. More consistency with previous code. --- src/ensembl/production/metadata/api/genome.py | 71 ++++++++++--------- src/tests/test_api.py | 2 +- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 063794b0..f9dd124f 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -341,45 +341,46 @@ def fetch_genomes_info( except Exception as e: raise ValueError(str(e)) - def get_popular_organisms_counts(self, release_version=None): + def fetch_organisms_group_counts(self, release_version=None, group_code='popular'): from ensembl.production.metadata.api.models.organism import OrganismGroup, Organism, OrganismGroupMember from sqlalchemy.orm import aliased from sqlalchemy import func + o_species = aliased(Organism) + o = aliased(Organism) + if not release_version: + # Get latest released organisms + query = db.select( + o_species.species_taxonomy_id, + o_species.ensembl_name, + o_species.common_name, + o_species.scientific_name, + OrganismGroupMember.order.label('order'), + func.count().label('count') + ) + + query = query.join(o, o_species.species_taxonomy_id == o.species_taxonomy_id) + query = query.join(Genome, o.organism_id == Genome.organism_id) + query = query.join(Assembly, Genome.assembly_id == Assembly.assembly_id) + query = query.join(OrganismGroupMember, o_species.organism_id == OrganismGroupMember.organism_id) + query = query.join(OrganismGroup, + OrganismGroupMember.organism_group_id == OrganismGroup.organism_group_id) + query = query.filter(OrganismGroup.code == group_code) + query = query.group_by( + o_species.species_taxonomy_id, + o_species.ensembl_name, + o_species.common_name, + o_species.scientific_name, + OrganismGroupMember.order + ) + query = query.order_by(OrganismGroupMember.order) + else: + # change group to release_version_state and related genomes + raise NotImplementedError('Not implemented yet') + pass + with self.metadata_db.session_scope() as session: session.expire_on_commit = False - o_species = aliased(Organism) - o = aliased(Organism) - if not release_version: - # Get latest released organisms - query = session.query( - o_species.species_taxonomy_id, - o_species.ensembl_name, - o_species.common_name, - o_species.scientific_name, - OrganismGroupMember.order.label('order'), - func.count().label('count') - ) - - query = query.join(o, o_species.species_taxonomy_id == o.species_taxonomy_id) - query = query.join(Genome, o.organism_id == Genome.organism_id) - query = query.join(Assembly, Genome.assembly_id == Assembly.assembly_id) - query = query.join(OrganismGroupMember, o_species.organism_id == OrganismGroupMember.organism_id) - query = query.join(OrganismGroup, - OrganismGroupMember.organism_group_id == OrganismGroup.organism_group_id) - query = query.filter(OrganismGroup.code == 'popular') - query = query.group_by( - o_species.species_taxonomy_id, - o_species.ensembl_name, - o_species.common_name, - o_species.scientific_name, - OrganismGroupMember.order - ) - query = query.order_by(OrganismGroupMember.order) - # TODO check if we should return a dictionary instead - return query.all() - else: - # change group to release_version_state and related genomes - raise NotImplementedError('Not implemented yet') - pass + # TODO check if we should return a dictionary instead + return session.execute(query).all() diff --git a/src/tests/test_api.py b/src/tests/test_api.py index 267659ab..5e2f555b 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -166,7 +166,7 @@ def test_fetch_genome_info(self, multi_dbs): def test_popular_species(self, multi_dbs): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.get_popular_organisms_counts() + test = conn.fetch_organisms_group_counts() # First result should be Human assert test[0][2] == 'Human' # We should have three assemblies associated with Human (Two for grch37.38 organism + one t2t) From 7cbf7f81246d58e4c604994cc006a37b6fda8a20 Mon Sep 17 00:00:00 2001 From: Marc Chakiachvili Date: Mon, 11 Sep 2023 10:42:59 +0100 Subject: [PATCH 3/3] Refactored Import for consistency --- src/ensembl/production/metadata/api/genome.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index f9dd124f..dd72b198 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -9,15 +9,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import sqlalchemy as db +import logging +import sqlalchemy as db +from sqlalchemy.orm import aliased from ensembl.database import DBConnection from ensembl.ncbi_taxonomy.models import NCBITaxaName - from ensembl.production.metadata.api.base import BaseAdaptor, check_parameter from ensembl.production.metadata.api.models import Genome, Organism, Assembly, OrganismGroup, OrganismGroupMember, \ GenomeRelease, EnsemblRelease, EnsemblSite, AssemblySequence, GenomeDataset, Dataset, DatasetType, DatasetSource -import logging logger = logging.getLogger(__name__) @@ -342,9 +342,6 @@ def fetch_genomes_info( raise ValueError(str(e)) def fetch_organisms_group_counts(self, release_version=None, group_code='popular'): - from ensembl.production.metadata.api.models.organism import OrganismGroup, Organism, OrganismGroupMember - from sqlalchemy.orm import aliased - from sqlalchemy import func o_species = aliased(Organism) o = aliased(Organism) if not release_version: @@ -355,7 +352,7 @@ def fetch_organisms_group_counts(self, release_version=None, group_code='popular o_species.common_name, o_species.scientific_name, OrganismGroupMember.order.label('order'), - func.count().label('count') + db.func.count().label('count') ) query = query.join(o, o_species.species_taxonomy_id == o.species_taxonomy_id) @@ -379,8 +376,5 @@ def fetch_organisms_group_counts(self, release_version=None, group_code='popular pass with self.metadata_db.session_scope() as session: - session.expire_on_commit = False # TODO check if we should return a dictionary instead return session.execute(query).all() - -