def fetch_organisms_group_counts(self, release_version=None, group_code='popular'):
    """
    Count the genomes attached to each species of an organism group.

    The group members (organisms linked to the ``OrganismGroup`` whose
    ``code`` equals ``group_code``) are self-joined to ``Organism`` on
    ``species_taxonomy_id``, so genomes of every organism sharing the same
    species taxonomy id are counted against the group member.

    Args:
        release_version: Only a falsy value (default ``None``) is supported
            for now; any truthy value raises ``NotImplementedError``.
        group_code: ``OrganismGroup.code`` to report on. Defaults to
            ``'popular'``.

    Returns:
        A list of rows, sorted by ``OrganismGroupMember.order``, each laid
        out as ``(species_taxonomy_id, ensembl_name, common_name,
        scientific_name, order, count)``.

    Raises:
        NotImplementedError: if ``release_version`` is truthy.
    """
    if release_version:
        # Fail fast, before any query object is built: filtering on a
        # release (release state and its related genomes) is still to do.
        raise NotImplementedError('Not implemented yet')

    # Two aliases of the same table: ``o_species`` is the group member that
    # is reported, ``o`` is any organism with the same species taxonomy id.
    o_species = aliased(Organism)
    o = aliased(Organism)

    reported_columns = (
        o_species.species_taxonomy_id,
        o_species.ensembl_name,
        o_species.common_name,
        o_species.scientific_name,
    )

    query = (
        db.select(
            *reported_columns,
            OrganismGroupMember.order.label('order'),
            db.func.count().label('count')
        )
        .join(o, o_species.species_taxonomy_id == o.species_taxonomy_id)
        # Inner joins: only organisms that own a genome with an assembly
        # contribute rows to the count.
        .join(Genome, o.organism_id == Genome.organism_id)
        .join(Assembly, Genome.assembly_id == Assembly.assembly_id)
        .join(OrganismGroupMember, o_species.organism_id == OrganismGroupMember.organism_id)
        .join(OrganismGroup,
              OrganismGroupMember.organism_group_id == OrganismGroup.organism_group_id)
        .filter(OrganismGroup.code == group_code)
        .group_by(*reported_columns, OrganismGroupMember.order)
        .order_by(OrganismGroupMember.order)
    )

    with self.metadata_db.session_scope() as session:
        # TODO check if we should return a dictionary instead
        return session.execute(query).all()
a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58 + diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt index e9e23d73..27fcb47f 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism.txt @@ -4,3 +4,4 @@ 4 d64c34ca-b37a-476b-83b5-f21d07a3ae67 4565 4565 Triticum aestivum reference (Chinese spring) Triticum aestivum Triticum_aestivum triticum_aestivum \N 5 0dc46f87-0b61-403a-8cd3-86b7e0cce8f0 559292 4932 Saccharomyces cerevisiae S288C Saccharomyces cerevisiae S288c Saccharomyces_cerevisiae saccharomyces_cerevisiae \N 6 0f4aad7b-db15-4a72-af1e-82bbae54226 6239 6239 Caenorhabditis elegans (PRJNA13758) N2 Caenorhabditis elegans Caenorhabditis_elegans caenorhabditis_elegans \N +7 dbbsaf09-2db8-429b-a407-c15a4ca2876d 9606 9606 Human T2T \N Homo sapiens Homo_sapiens homo_sapiens_t2t \N diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group.txt index 8c85f340..9a8d8aa9 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group.txt @@ -3,4 +3,5 @@ 3 Division EnsemblProtists protists 4 Division EnsemblPlants plants 5 Division EnsemblFungi fungi -6 Division EnsemblMetazoa metazoa \ No newline at end of file +6 Division EnsemblMetazoa metazoa +7 Internal Popular Species popular \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt index b105589e..ddf92f9e 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/organism_group_member.txt +++ 
def test_popular_species(self, multi_dbs):
    """Check the per-species genome counts returned for the default group.

    Rows are read positionally: index 2 is the common name and index 5 is
    the count, following the column order selected by
    ``fetch_organisms_group_counts``.
    """
    metadata_url = multi_dbs['ensembl_metadata'].dbc.url
    taxonomy_url = multi_dbs['ncbi_taxonomy'].dbc.url
    adaptor = GenomeAdaptor(metadata_uri=metadata_url, taxonomy_uri=taxonomy_url)

    rows = adaptor.fetch_organisms_group_counts()
    first_row = rows[0]

    # Human is expected to lead the list.
    assert first_row[2] == 'Human'
    # Three assemblies are tied to Human: two on the GRCh37/GRCh38 organism
    # plus the T2T one.
    assert first_row[5] == 3

    # Every remaining species has a single genome in the test DB.
    for row in rows[1:]:
        assert row[5] == 1