diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..287a2f0f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,162 @@ +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + diff --git a/.gitignore b/.gitignore index 0f548944..c7c60b9f 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,4 @@ dmypy.json # Cython debug symbols cython_debug/ +.python-version diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..8eaeccc7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.8.15-alpine +# Package +RUN apk update && apk add --no-cache git mariadb-dev build-base + +RUN addgroup -S appgroup && adduser -S appuser -G appgroup +RUN mkdir -p /usr/src/app +RUN chown -R appuser:appgroup /usr/src/app/ + +USER appuser +WORKDIR /usr/src/app +ENV PYTHONDONTWRITEBYTECODE 1 +ENV PYTHONUNBUFFERED 1 +ENV PIP_ROOT_USER_ACTION=ignore + +COPY --chown=appuser:appgroup . /usr/src/app/ +ENV PATH="/usr/src/app/venv/bin:$PATH" + +RUN python -m venv /usr/src/app/venv/ +RUN pip install --upgrade pip +RUN pip install . +RUN pip uninstall -y ensembl-hive + +CMD ["/usr/src/app/venv/bin/python", "/usr/src/app/src/ensembl/production/metadata/grpc/service.py"] diff --git a/MANIFEST.in b/MANIFEST.in index c9458ac5..901895e2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,6 @@ include LICENSE include NOTICE include VERSION -recursive-include src/ensembl/production/metadata/api/sample/ * \ No newline at end of file +recursive-include src/ensembl/production/metadata/api/sample/ * +include README.md +include requirements.txt diff --git a/NOTICE b/NOTICE index 51e5a9d4..b13839b6 100644 --- a/NOTICE +++ b/NOTICE @@ -1,4 +1,4 @@ -Ensembl Metadata API +Ensembl Metadata API/Services Copyright [2018-2023] EMBL-European Bioinformatics Institute This product includes software developed at: diff --git a/README.md b/README.md index 08f2e696..237c47a5 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,10 @@ -# Ensembl Metadata API +<<<<<<< HEAD +# Ensembl Metadata API / GRPC API +# SQLAlchemy ORM for the Ensembl Metadata database. 
+# GRPC Service protofile to interact with metadata database through GRPC [![Build Status](https://travis-ci.com/Ensembl/ensembl-metadata-api.svg?branch=main)](https://travis-ci.com/Ensembl/ensembl-metadata-api) -SQLAlchemy ORM for the Ensembl Metadata database. - ## System Requirements - Python 3.8+ @@ -32,10 +33,6 @@ pip install -r requirements.txt [ensembl-metadata-admin](https://github.com/Ensembl/ensembl-metadata-admin): Django ORM for the Ensembl Metadata database -[ensembl-metadata-registry](https://github.com/Ensembl/ensembl-metadata-registry): GUI for the Ensembl Metadata database - -[ensembl-metadata-service](https://github.com/Ensembl/ensembl-metadata-service): gRPC layer for the Ensembl Metadata database - ## Development @@ -59,6 +56,24 @@ pip install --upgrade pip pip install -r requirements-dev.txt ``` +======= +To generate client and server files +(Remember to run these after adding a new method in ensembl_metadata.proto) +``` +python3 -m grpc_tools.protoc -Iprotos --python_out=src --grpc_python_out=src protos/ensembl/production/metadata/grpc/ensembl_metadata.proto +``` + +Start the server script + +``` +PYTHONPATH='src' python3 src/ensembl/production/metadata/grpc/service.py +``` + +Start the client script +``` +PYTHONPATH='src' python3 src/ensembl/production/metadata/grpc/client_examples.py +``` + ### Testing Run test suite: @@ -78,6 +93,41 @@ To actually reformat all files contained in `src`: ``` cd ensembl-metadata-api black src +PYTHONPATH='src' pytest +``` + +To run tests, calculate and display testing coverage stats: +``` +cd ensembl-metadata-api +coverage run -m pytest +coverage report -m +``` + +#### Explore test DB content + +As for now, some of the test DB sqlite content is different from what's in MySQL metadata DB (e.g. release `version` in `ensembl_release`) + +> `test.db` created when running tests is deleted once tests are executed. 
+ +To take a look at the test data you can create a temporary `sampledb.db` importing `tables.sql` content using the command: + +``` +cat tables.sql | sqlite3 sampledb.db +``` + +You can then open `sampledb.db` using [DB Browser for SQLite](https://sqlitebrowser.org/dl/). + +### Automatic Formatting +``` +cd ensembl-metadata-api +black --check src tests +``` +Use `--diff` to print a diff of what Black would change, without actually changing the files. + +To actually reformat all files contained in `src` and `test`: +``` +cd ensembl-metadata-api +black src tests ``` ### Linting and type checking @@ -88,3 +138,20 @@ mypy src ``` Pylint will check the code for syntax, name errors and formatting style. Mypy will use type hints to statically type check the code. +======= +cd ensembl-metadata-service +pylint src tests +mypy src tests +``` +Pylint will check the code for syntax, name errors and formatting style. +Mypy will use type hints to statically type check the code. + +### To build docker image +``` +docker build -t ensembl-metadata-service . +``` + +### To run docker container +``` + docker run -t -i -e METADATA_URI= -e TAXONOMY_URI= -p 80:80 ensembl-metadata-service +``` diff --git a/VERSION b/VERSION index 8c9698aa..359a5b95 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.3.4 \ No newline at end of file +2.0.0 \ No newline at end of file diff --git a/conftest.py b/conftest.py index d05df27f..917f0e9a 100644 --- a/conftest.py +++ b/conftest.py @@ -10,11 +10,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
@pytest.fixture(scope="class")
def engine(multi_dbs):
    """Export the test database URLs and yield an engine on the metadata DB.

    ``METADATA_URI`` and ``TAXONOMY_URI`` are written to ``os.environ`` from
    the ``ensembl_metadata`` and ``ncbi_taxonomy`` entries of ``multi_dbs``.
    The variables are not restored after the fixture finishes.
    """
    metadata_url = multi_dbs["ensembl_metadata"].dbc.url
    os.environ["METADATA_URI"] = metadata_url
    os.environ["TAXONOMY_URI"] = multi_dbs["ncbi_taxonomy"].dbc.url
    yield db.create_engine(metadata_url)


@pytest.fixture(scope="class")
def genome_db_conn(multi_dbs):
    """Yield a ``GenomeAdaptor`` bound to the test metadata and taxonomy DBs."""
    uris = {
        "metadata_uri": multi_dbs["ensembl_metadata"].dbc.url,
        "taxonomy_uri": multi_dbs["ncbi_taxonomy"].dbc.url,
    }
    yield GenomeAdaptor(**uris)


@pytest.fixture(scope="class")
def release_db_conn(multi_dbs):
    """Yield a ``ReleaseAdaptor`` bound to the test metadata DB."""
    yield ReleaseAdaptor(metadata_uri=multi_dbs["ensembl_metadata"].dbc.url)
+ * It's used to generate two files: + * ensembl_metadata_pb2: which has all the different requests and responses + * ensembl_metadata_pb2_grpc: which has the services that we want to implement + * and the "stubs" that we will be using in the client side + */ + + +syntax = "proto3"; + +package ensembl_metadata; + +// IMPORTANT: the directory structure of the protos directory should mirror the structure of the src directory to avoid +// Python import errors. + +// Metadata for the genomes in Ensembl. +service EnsemblMetadata { + // Retrieve genome by its UUID. + rpc GetGenomeByUUID(GenomeUUIDRequest) returns (Genome) {} + + // Retrieve genome UUID by providing production name and assembly id. + rpc GetGenomeUUID(GenomeInfoRequest) returns (GenomeUUID) {} + + // Retrieve genomes by keyword search + rpc GetGenomesByKeyword(GenomeByKeywordRequest) returns (stream Genome) {} + + // Retrieve all genomes for a give assembly accession ID + rpc GetGenomesByAssemblyAccessionID(AssemblyAccessionIDRequest) returns (stream Genome) {} + + // Get species information for a genome UUID + rpc GetSpeciesInformation(GenomeUUIDRequest) returns (Species) {} + + // Get assembly information + rpc GetAssemblyInformation(AssemblyIDRequest) returns (AssemblyInfo) {} + + // Get subspecies information + rpc GetSubSpeciesInformation(OrganismIDRequest) returns (SubSpecies) {} + + // Get top level statistics + rpc GetTopLevelStatistics(OrganismIDRequest) returns (TopLevelStatistics) {} + + // Get top level statistics by UUID + rpc GetTopLevelStatisticsByUUID(GenomeUUIDRequest) returns (TopLevelStatisticsByUUID) {} + + // Retrieve genome by Ensembl name and site, and optionally release. + rpc GetGenomeByName(GenomeNameRequest) returns (Genome) {} + + // Retrieve release details. + rpc GetRelease(ReleaseRequest) returns (stream Release) {} + + // Retrieve release details for a genome. 
+ rpc GetReleaseByUUID(GenomeUUIDRequest) returns (stream Release) {} + + // Retrieve sequence metadata for a genome's assembly. + rpc GetGenomeSequence(GenomeSequenceRequest) returns (stream GenomeSequence) {} + + // Retrieve region information for a genome's assembly. + rpc GetAssemblyRegion(AssemblyRegionRequest) returns (stream AssemblyRegion) {} + + // Retrieve region information for a genome's assembly with a given sequence region name. + rpc GetGenomeAssemblySequenceRegion(GenomeAssemblySequenceRegionRequest) returns (GenomeAssemblySequenceRegion) {} + + // Retrieve a list of dataset_ids associated with a genome UUID. + rpc GetDatasetsListByUUID(DatasetsRequest) returns (Datasets) {} + + // Retrieve dataset info by genome uuid and dataset_type + rpc GetDatasetInformation(GenomeDatatypeRequest) returns (DatasetInfos) {} + + // Retrieve organisms group count + rpc GetOrganismsGroupCount(OrganismsGroupRequest) returns (OrganismsGroupCount) {} + + // Retrieve genome UUID by genome_tag. + // genome_tag value will be either in assembly.url_name or tol_id column + rpc GetGenomeUUIDByTag(GenomeTagRequest) returns (GenomeUUID) {} +} + +/* +A genome is a collection of datasets for an assembly, +which may or may not be in the current Ensembl release. 
+ */ +message Genome { + string genome_uuid = 1; + Assembly assembly = 2; + Taxon taxon = 3; + string created = 4; + Organism organism = 5; + AttributesInfo attributes_info = 6; + int32 related_assemblies_count = 7; + Release release = 8; +} + +message Species { + string genome_uuid = 1; + uint32 taxon_id = 2; + string scientific_name = 3; + string scientific_parlance_name = 4; + string genbank_common_name = 5; + repeated string synonym = 6; +} + +message AssemblyInfo { + string assembly_uuid = 1; + string accession = 2; + string level = 3; + string name = 4; + uint32 chromosomal = 5; + // length is declared as uint64, but the proto3 JSON mapping serialises 64-bit integers as strings, + // because some browsers can't handle a full 64-bit value as a number; JSON clients should expect a string + // source: https://github.com/protocolbuffers/protobuf/issues/2679 + uint64 length = 6; + string sequence_location = 7; + string md5 = 8; + string sha512t24u = 9; +} + +message SubSpecies { + string organism_uuid = 1; + repeated string species_type = 2; + repeated string species_name = 3; +} + +/* +Statistics related objects. + */ +message AttributeStatistics { + string name = 1; + string label = 2; + string statistic_type = 3; + string statistic_value = 4; +} + +message TopLevelStatisticsByUUID { + string genome_uuid = 1; + repeated AttributeStatistics statistics = 2; +} + +message TopLevelStatistics { + string organism_uuid = 1; + repeated TopLevelStatisticsByUUID stats_by_genome_uuid = 2; +} + +/* +An INSDC record of a genome assembly. + */ +message Assembly { + string accession = 1; + string name = 2; + string ucsc_name = 3; + string level = 4; + string ensembl_name = 5; + string assembly_uuid = 6; + bool is_reference = 7; + string url_name = 8; + string tol_id = 9; +} + +/* +Taxonomic information derived from the NCBI Taxonomy. 
+ */ +message Taxon { + uint32 taxonomy_id = 1; + string scientific_name = 2; + string strain = 3; + repeated string alternative_names = 4; +} + +/* +Release details for all Ensembl sites. + */ +message Release { + double release_version = 1; + string release_date = 2; + string release_label = 3; + bool is_current = 4; + string site_name = 5; + string site_label = 6; + string site_uri = 7; +} + +message Organism { + string common_name = 1; + string strain = 2; + string scientific_name = 3; + string ensembl_name = 4; + string scientific_parlance_name = 5; + string organism_uuid = 6; + string strain_type = 7; + int32 taxonomy_id = 8; + int32 species_taxonomy_id = 9; +} + +message Attribute { + string name = 1; + string label = 2; + string description = 3; + string type = 4; +} + +message AttributesInfo { + string genebuild_method = 1; + string genebuild_method_display = 2; + string genebuild_last_geneset_update = 3; + string genebuild_version = 4; + string genebuild_provider_name = 5; + string genebuild_provider_url = 6; + string genebuild_sample_gene = 7; + string genebuild_sample_location = 8; + string assembly_level = 9; + string assembly_date = 10; + string assembly_provider_name = 11; + string assembly_provider_url = 12; + string variation_sample_variant = 13; +} + +/* +Wrapper for a list of DatasetInfo objects +*/ +message DatasetInfos { + string genome_uuid = 1; + string dataset_type = 2; + /* + Details for datasets + */ + message DatasetInfo { + string dataset_uuid = 1; + string dataset_name = 2; + string name = 3; + string type = 4; + string dataset_version = 5; + string dataset_label = 6; + double version = 7; + string value = 8; + } + repeated DatasetInfo dataset_infos = 3; +} + +/* +Metadata about the sequences that comprise a genome's assembly. 
+ */ +message GenomeSequence { + string accession = 1; + string name = 2; + string sequence_location = 3; + uint64 length = 4; + bool chromosomal = 5; +} + +/* +Metadata about the sequences that comprise a genome's assembly. + */ +message AssemblyRegion { + string name = 1; + int32 rank = 2; + string md5 = 3; + uint64 length = 4; + string sha512t24u = 5; + bool chromosomal = 6; +} + +/* +Metadata about the sequences that comprise a genome's assembly. + */ +message GenomeAssemblySequenceRegion { + string name = 1; + string md5 = 2; + uint64 length = 3; + string sha512t24u = 4; + bool chromosomal = 5; +} + +/* +Datasets list associated to the provided genome_uuid + */ +message Datasets { + string genome_uuid = 1; + // https://stackoverflow.com/q/56401376/4488332 + // https://developers.google.com/protocol-buffers/docs/reference/python-generated#map-fields + map<string, DatasetInfos> datasets = 2; +} + +/* +Genome UUID + */ +message GenomeUUID { + string genome_uuid = 1; +} + +message OrganismsGroup { + uint32 species_taxonomy_id = 1; + string ensembl_name = 2; + string common_name = 3; + string scientific_name = 4; + uint32 order = 5; + uint32 count = 6; +} + +message OrganismsGroupCount { + repeated OrganismsGroup organisms_group_count = 1; + double release_version = 2; +} + + +/* +The messages below are used to request data - required-ness is not enforced +by protocol buffers, but in practice some fields are mandatory in order to +receive a non-empty response, so this is indicated with a comment. + */ + +/* +Genome UUID filter. +If release_version is not given, the current version is used. + */ +message GenomeUUIDRequest { + string genome_uuid = 1; // Mandatory + double release_version = 2; // Optional +} + +/* +Genome keyword filter. +If release_version is not given, the current version is used. + */ +message GenomeByKeywordRequest { + string keyword = 1; // Mandatory + double release_version = 2; // Optional +} + +/* +Genome name filter. 
+If release_version is not given, the current version is used. + */ +message GenomeNameRequest { + string ensembl_name = 1; // Mandatory + string site_name = 2; // Mandatory + double release_version = 3; // Optional +} + +/* +Assembly ID filter + */ +message AssemblyIDRequest { + string assembly_uuid = 1; // Mandatory + double release_version = 2; // Optional +} + +/* +Assembly ID filter + */ +message AssemblyAccessionIDRequest { + string assembly_accession = 1; // Mandatory + double release_version = 2; // Optional +} + +message OrganismIDRequest { + string organism_uuid = 1; + string group = 2; +} + +/* +Release filter. +An empty message will return all releases, for all sites. + */ +message ReleaseRequest { + repeated string site_name = 1; // Optional + repeated double release_version = 2; // Optional + bool current_only = 3; // Optional +} + +/* +Genome sequence filter. + */ +message GenomeSequenceRequest { + string genome_uuid = 1; // Mandatory + bool chromosomal_only = 2; // Optional +} + +/* +Genome sequence filter. + */ +message AssemblyRegionRequest { + string genome_uuid = 1; // Mandatory + bool chromosomal_only = 2; // Optional +} + +/* +Genome sequence filter. +TODO: Is there a way to DRY this since it's equivalent to GetAssemblyRegionRequest above plus sequence_region_name + */ +message GenomeAssemblySequenceRegionRequest { + string genome_uuid = 1; // Mandatory + string sequence_region_name = 2; // Mandatory +} + +/* +Datasets filter. 
+ */ +message DatasetsRequest { + string genome_uuid = 1; // Mandatory + double release_version = 2; // Optional +} + +/* +Genome datatype filter + */ +message GenomeDatatypeRequest { + string genome_uuid = 1; // Mandatory + string dataset_type = 2; // Mandatory +} + +/* +Genome info filter used by Compara (EA-1090) + */ +message GenomeInfoRequest { + string ensembl_name = 1; // Mandatory + string assembly_name = 2; // Mandatory + bool use_default = 3; // Optional +} + +/* +Organisms group count request + */ +message OrganismsGroupRequest { + double release_version = 1; // Optional +} + +/* +Genome Tag request + */ +message GenomeTagRequest { + string genome_tag = 1; // Mandatory +} diff --git a/requirements-test.txt b/requirements-test.txt index 7fa2ef45..019b6b38 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,5 @@ +-r requirements.txt pytest pylint mypy -coverage +coverage[toml] diff --git a/requirements.in b/requirements.in index b09cc994..a399c8ff 100644 --- a/requirements.in +++ b/requirements.in @@ -1 +1,5 @@ -ensembl-py@git+https://github.com/Ensembl/ensembl-py.git@1.2.2 \ No newline at end of file +ensembl-py@git+https://github.com/Ensembl/ensembl-py.git@1.2.2 +grpcio +grpcio-tools +sqlalchemy +types-pymysql diff --git a/requirements.txt b/requirements.txt index a2455b19..27afc60a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,8 +14,14 @@ ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git@1.2.2 # via -r requirements.in exceptiongroup==1.2.0 # via pytest -greenlet==3.0.1 +greenlet==3.0.3 # via sqlalchemy +grpcio==1.60.0 + # via + # -r requirements.in + # grpcio-tools +grpcio-tools==1.60.0 + # via -r requirements.in idna==3.6 # via requests iniconfig==2.0.0 @@ -26,7 +32,9 @@ packaging==23.2 # via pytest pluggy==1.3.0 # via pytest -pytest==7.4.3 +protobuf==4.25.2 + # via grpcio-tools +pytest==7.4.4 # via # ensembl-py # pytest-dependency @@ -38,13 +46,19 @@ pyyaml==6.0.1 # via ensembl-py requests==2.31.0 # via 
ensembl-py -sqlalchemy==1.4.50 +sqlalchemy==1.4.51 # via + # -r requirements.in # ensembl-py # sqlalchemy-utils sqlalchemy-utils==0.38.3 # via ensembl-py tomli==2.0.1 # via pytest +types-pymysql==1.1.0.1 + # via -r requirements.in urllib3==2.1.0 # via requests + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/setup.py b/setup.py index 28ab9370..7bd0dab1 100644 --- a/setup.py +++ b/setup.py @@ -27,20 +27,20 @@ def import_requirements(): setup( - name='ensembl_metadata_api', + name='ensembl-metadata-api', version=os.getenv('CI_COMMIT_TAG', version), description='Ensembl Metadata API', long_description=readme, - author='Daniel Poppleton,Marc Chakiachvili,Vinay Kaikala', - author_email='danielp@ebi.ac.uk,mchakiachvili@ebi.ac.uk,vinay@ebi.ac.uk', + author='Alisha Aneja,Bilal El Houdaigui,Daniel Poppleton,Marc Chakiachvili,Sanjay boddu,Vinay Kaikala', + author_email='aaneja@ebi.ac.uk,bilal@ebi.ac.uk, danielp@ebi.ac.uk,mchakiachvili@ebi.ac.uk,sboddu@ebi.ac.uk,vinay@ebi.ac.uk', url='https://www.ensembl.org', download_url='https://github.com/Ensembl/ensembl-metadata-api', license='Apache License 2.0', packages=find_namespace_packages(where='src', include=['ensembl.*']), package_dir={'': 'src'}, include_package_data=True, - python_requires='>=3.8', - install_requires=import_requirements(), + python_requires='>=3.8.9', + install_requires=[import_requirements()], classifiers=[ "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", diff --git a/src/ensembl/production/metadata/grpc/__init__.py b/src/ensembl/production/metadata/grpc/__init__.py new file mode 100644 index 00000000..37af0dc4 --- /dev/null +++ b/src/ensembl/production/metadata/grpc/__init__.py @@ -0,0 +1,11 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/ensembl/production/metadata/grpc/adaptors/__init__.py b/src/ensembl/production/metadata/grpc/adaptors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/ensembl/production/metadata/grpc/adaptors/base.py b/src/ensembl/production/metadata/grpc/adaptors/base.py new file mode 100644 index 00000000..8c9b0612 --- /dev/null +++ b/src/ensembl/production/metadata/grpc/adaptors/base.py @@ -0,0 +1,31 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ensembl.database import DBConnection +from ensembl.production.metadata.grpc.config import MetadataConfig as config + + +##Todo: Add in OrganismAdapator. Subfunction fetches all organism in popular group. and # of genomes from distinct assemblies. 
class BaseAdaptor:
    """Base class for adaptors that query the metadata database.

    Opens a ``DBConnection`` to ``metadata_uri`` and stores it on
    ``self.metadata_db``. Pool size and recycle time are read from
    ``MetadataConfig`` (imported in this module as ``config``).
    """

    def __init__(self, metadata_uri):
        pool_options = {
            "pool_size": config.pool_size,
            "pool_recycle": config.pool_recycle,
        }
        self.metadata_db = DBConnection(metadata_uri, **pool_options)


def check_parameter(param):
    """Normalise a filter argument to either ``None`` or a list.

    A tuple is reduced to its first element before anything else happens
    (any further elements are ignored). After that, ``None`` and lists are
    returned unchanged and every other value is wrapped in a one-item list.

    Raises:
        IndexError: if ``param`` is an empty tuple.
    """
    value = param[0] if isinstance(param, tuple) else param
    if value is None or isinstance(value, list):
        return value
    return [value]
+ +import sqlalchemy as db +from sqlalchemy.orm import aliased +from ensembl.database import DBConnection +from ensembl.ncbi_taxonomy.models import NCBITaxaName +from ensembl.production.metadata.grpc.adaptors.base import BaseAdaptor, check_parameter +from ensembl.production.metadata.api.models import Genome, Organism, Assembly, OrganismGroup, OrganismGroupMember, \ + GenomeRelease, EnsemblRelease, EnsemblSite, AssemblySequence, GenomeDataset, Dataset, DatasetType, DatasetSource, \ + Attribute, DatasetAttribute +import logging + +from ensembl.production.metadata.grpc.config import MetadataConfig + +logger = logging.getLogger(__name__) + + +class GenomeAdaptor(BaseAdaptor): + def __init__(self, metadata_uri: str, taxonomy_uri: str): + super().__init__(metadata_uri) + self.taxonomy_db = DBConnection(taxonomy_uri, pool_size=MetadataConfig.pool_size, pool_recycle=MetadataConfig.pool_recycle) + + def fetch_taxonomy_names(self, taxonomy_ids, synonyms=None): + + if synonyms is None: + synonyms = [] + taxonomy_ids = check_parameter(taxonomy_ids) + synonyms = [ + "common name", + "equivalent name", + "genbank synonym", + "synonym", + ] if len(check_parameter(synonyms)) == 0 else synonyms + required_class_name = ["genbank common name", "scientific name"] + taxons = {} + with self.taxonomy_db.session_scope() as session: + for tid in taxonomy_ids: + taxons[tid] = {"scientific_name": None, "genbank_common_name": None, "synonym": []} + + taxonomyname_query = db.select( + NCBITaxaName.name, + NCBITaxaName.name_class, + ).filter( + NCBITaxaName.taxon_id == tid, + NCBITaxaName.name_class.in_(required_class_name + synonyms), + ) + + for taxon_name in session.execute(taxonomyname_query).all(): + if taxon_name[1] in synonyms: + taxons[tid]['synonym'].append(taxon_name[0]) + if taxon_name[1] in required_class_name: + taxon_format_name = "_".join(taxon_name[1].split(' ')) + taxons[tid][taxon_format_name] = taxon_name[0] + return taxons + + def fetch_taxonomy_ids(self, taxonomy_names): + 
taxids = [] + taxonomy_names = check_parameter(taxonomy_names) + for taxon in taxonomy_names: + taxa_name_select = db.select( + NCBITaxaName.taxon_id + ).filter( + NCBITaxaName.name == taxon + ) + with self.taxonomy_db.session_scope() as session: + logger.debug(taxa_name_select) + taxid = session.execute(taxa_name_select).one() + taxids.append(taxid[0]) + return taxids + + def fetch_genomes(self, genome_id=None, genome_uuid=None, genome_tag=None, organism_uuid=None, assembly_uuid=None, + assembly_accession=None, assembly_name=None, use_default_assembly=False, ensembl_name=None, + taxonomy_id=None, group=None, group_type=None, allow_unreleased=False, unreleased_only=False, + site_name=None, release_type=None, release_version=None, current_only=True): + """ + Fetches genome information based on the specified parameters. + + Args: + genome_id (Union[int, List[int]]): The ID(s) of the genome(s) to fetch. + genome_uuid (Union[str, List[str]]): The UUID(s) of the genome(s) to fetch. + genome_tag (Union[str, List[str]]): genome_tag value is either in Assembly.url_name or told_id. + organism_uuid (Union[str, List[str]]): The UUID(s) of the organism(s) to fetch. + assembly_uuid (Union[str, List[str]]): The UUID(s) of the assembly(s) to fetch. + assembly_accession (Union[str, List[str]]): The assenbly accession of the assembly(s) to fetch. + assembly_name (Union[str, List[str]]): The name(s) of the assembly(s) to fetch. + use_default_assembly (bool): Whether to use default assembly name or not. + ensembl_name (Union[str, List[str]]): The Ensembl name(s) of the organism(s) to fetch. + taxonomy_id (Union[int, List[int]]): The taxonomy ID(s) of the organism(s) to fetch. + group (Union[str, List[str]]): The name(s) of the organism group(s) to filter by. + group_type (Union[str, List[str]]): The type(s) of the organism group(s) to filter by. + allow_unreleased (bool): Whether to fetch unreleased genomes too or not (default: False). 
+ unreleased_only (bool): Fetch only unreleased genomes (default: False). allow_unreleased is used by gRPC + to fetch both released and unreleased genomes, while unreleased_only + is used in production pipelines (fetches only unreleased genomes) + site_name (str): The name of the Ensembl site to filter by. + release_type (str): The type of the Ensembl release to filter by. + release_version (int): The maximum version of the Ensembl release to filter by. + current_only (bool): Whether to fetch only current genomes. + + Returns: + List[Tuple[Genome, Organism, Assembly, EnsemblRelease]]: A list of tuples containing the fetched genome information. + Each tuple contains the following elements: + - Genome: An instance of the Genome class. + - Organism: An instance of the Organism class. + - Assembly: An instance of the Assembly class. + - EnsemblRelease: An instance of the EnsemblRelease class. + + Notes: + - The parameters are not mutually exclusive, meaning more than one of them can be provided at a time. + - The function uses a database session to execute the query and returns the results as a list of tuples. + - The results are ordered by the Ensembl name. 
+ + Example usage: + genome_info = fetch_genomes(genome_id=12345) + """ + # Parameter validation + genome_id = check_parameter(genome_id) + genome_uuid = check_parameter(genome_uuid) + genome_tag = check_parameter(genome_tag) + organism_uuid = check_parameter(organism_uuid) + assembly_uuid = check_parameter(assembly_uuid) + assembly_accession = check_parameter(assembly_accession) + assembly_name = check_parameter(assembly_name) + ensembl_name = check_parameter(ensembl_name) + taxonomy_id = check_parameter(taxonomy_id) + group = check_parameter(group) + group_type = check_parameter(group_type) + + # Construct the initial database query + genome_select = db.select( + Genome, Organism, Assembly + ).select_from(Genome) \ + .join(Organism, Organism.organism_id == Genome.organism_id) \ + .join(Assembly, Assembly.assembly_id == Genome.assembly_id) + + # Apply group filtering if group parameter is provided + if group: + group_type = group_type if group_type else ['Division'] + genome_select = db.select( + Genome, Organism, Assembly, OrganismGroup, OrganismGroupMember + ).join(Genome.assembly).join(Genome.organism) \ + .join(Organism.organism_group_members) \ + .join(OrganismGroupMember.organism_group) \ + .filter(OrganismGroup.type.in_(group_type)).filter(OrganismGroup.name.in_(group)) + + # Apply additional filters based on the provided parameters + if genome_id is not None: + genome_select = genome_select.filter(Genome.genome_id.in_(genome_id)) + + if genome_uuid is not None: + genome_select = genome_select.filter(Genome.genome_uuid.in_(genome_uuid)) + + if genome_tag is not None: + genome_select = genome_select.filter( + db.or_( + Assembly.url_name.in_(genome_tag), + Assembly.tol_id.in_(genome_tag) + ) + ) + + if organism_uuid is not None: + genome_select = genome_select.filter(Organism.organism_uuid.in_(organism_uuid)) + + if assembly_uuid is not None: + genome_select = genome_select.filter(Assembly.assembly_uuid.in_(assembly_uuid)) + + if assembly_accession is not 
None: + genome_select = genome_select.filter(Assembly.accession.in_(assembly_accession)) + + if assembly_name is not None: + # case() function is used to conditionally select between columns, sql equivalent is: + # CASE + # WHEN :use_default_assembly = 1 THEN assembly.assembly_default + # ELSE assembly.name + # END + conditional_column = db.case( + # literal is used to prevent evaluating use_default_assembly to a boolean (True or False) + [(db.literal(use_default_assembly) == 1, Assembly.assembly_default)], + else_=Assembly.name + ) + lowered_assemblies = [name.lower() for name in assembly_name] + genome_select = genome_select.filter(db.func.lower(conditional_column).in_(lowered_assemblies)) + + if ensembl_name is not None: + genome_select = genome_select.filter(Organism.ensembl_name.in_(ensembl_name)) + + if taxonomy_id is not None: + genome_select = genome_select.filter(Organism.taxonomy_id.in_(taxonomy_id)) + + if allow_unreleased: + # fetch everything (released + unreleased) + pass + elif unreleased_only: + # fetch unreleased only + # this filter will get all Genome entries where there's no associated GenomeRelease + # the tilde (~) symbol is used for negation. + genome_select = genome_select.filter(~Genome.genome_releases.any()) + else: + # fetch released only + # Check if genome is released + # TODO: why did I add this check?! 
-> removing this breaks the test_update tests + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + # copy genome_select as we don't want to include GenomeDataset + # because it results in multiple row for a given genome (genome can have many datasets) + check_query = genome_select + prep_query = check_query.add_columns(GenomeDataset) \ + .join(GenomeDataset, Genome.genome_id == GenomeDataset.genome_id) \ + .filter(GenomeDataset.release_id.isnot(None)) + is_genome_released = session.execute(prep_query).first() + + if is_genome_released: + # Include release related info if released_only is True + genome_select = genome_select.add_columns(GenomeRelease, EnsemblRelease, EnsemblSite) \ + .join(GenomeRelease, Genome.genome_id == GenomeRelease.genome_id) \ + .join(EnsemblRelease, GenomeRelease.release_id == EnsemblRelease.release_id) \ + .join(EnsemblSite, EnsemblSite.site_id == EnsemblRelease.site_id) + + if release_version is not None and release_version > 0: + # if release is specified + genome_select = genome_select.filter(EnsemblRelease.version <= release_version) + current_only = False + + if current_only: + genome_select = genome_select.filter(GenomeRelease.is_current == 1) + + if site_name is not None: + genome_select = genome_select.add_columns(EnsemblSite).filter(EnsemblSite.name == site_name) + + if release_type is not None: + genome_select = genome_select.filter(EnsemblRelease.release_type == release_type) + + # print(f"genome_select query ====> {str(genome_select)}") + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + return session.execute(genome_select.order_by("ensembl_name")).all() + + def fetch_genomes_by_genome_uuid(self, genome_uuid, allow_unreleased=False, site_name=None, release_type=None, + release_version=None, current_only=True): + return self.fetch_genomes( + genome_uuid=genome_uuid, + allow_unreleased=allow_unreleased, + site_name=site_name, + release_type=release_type, + 
release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_assembly_accession(self, assembly_accession, allow_unreleased=False, site_name=None, + release_type=None, release_version=None, current_only=True): + return self.fetch_genomes( + assembly_accession=assembly_accession, + allow_unreleased=allow_unreleased, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_ensembl_name(self, ensembl_name, allow_unreleased=False, site_name=None, release_type=None, + release_version=None, current_only=True): + return self.fetch_genomes( + ensembl_name=ensembl_name, + allow_unreleased=allow_unreleased, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_taxonomy_id(self, taxonomy_id, allow_unreleased=False, site_name=None, release_type=None, + release_version=None, current_only=True): + return self.fetch_genomes( + taxonomy_id=taxonomy_id, + allow_unreleased=allow_unreleased, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_scientific_name( + self, + scientific_name, + allow_unreleased=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + taxonomy_ids = self.fetch_taxonomy_ids(scientific_name) + + return self.fetch_genomes_by_taxonomy_id( + taxonomy_ids, + allow_unreleased=allow_unreleased, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genome_by_keyword(self, keyword=None, release_version=None): + """ + Fetches genomes based on a keyword and release version. + + Args: + keyword (str or None): Keyword to search for in various attributes of genomes, assemblies, and organisms. + release_version (int or None): Release version to filter by. 
If set to 0 or None, fetches only current genomes. + + Returns: + list: A list of fetched genomes matching the keyword and release version. + """ + genome_query = db.select( + Genome, GenomeRelease, EnsemblRelease, Assembly, Organism, EnsemblSite + ).select_from(Genome) \ + .outerjoin(Organism, Organism.organism_id == Genome.organism_id) \ + .outerjoin(Assembly, Assembly.assembly_id == Genome.assembly_id) \ + .outerjoin(GenomeRelease, Genome.genome_id == GenomeRelease.genome_id) \ + .outerjoin(EnsemblRelease, GenomeRelease.release_id == EnsemblRelease.release_id) \ + .outerjoin(EnsemblSite, EnsemblSite.site_id == EnsemblRelease.site_id) + + if keyword is not None: + genome_query = genome_query.where(db.or_(db.func.lower(Assembly.tol_id) == keyword.lower(), + db.func.lower(Assembly.accession) == keyword.lower(), + db.func.lower(Assembly.name) == keyword.lower(), + db.func.lower(Assembly.ensembl_name) == keyword.lower(), + db.func.lower(Organism.common_name) == keyword.lower(), + db.func.lower(Organism.scientific_name) == keyword.lower(), + db.func.lower( + Organism.scientific_parlance_name) == keyword.lower(), + db.func.lower(Organism.species_taxonomy_id) == keyword.lower())) + + if release_version == 0 or release_version is None: + genome_query = genome_query.where(EnsemblRelease.is_current == 1) + else: + genome_query = genome_query.where(EnsemblRelease.version <= release_version) + + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + return session.execute(genome_query).all() + + def fetch_sequences(self, genome_id=None, genome_uuid=None, assembly_uuid=None, assembly_accession=None, + assembly_sequence_accession=None, assembly_sequence_name=None, chromosomal_only=False): + """ + Fetches sequences based on the provided parameters. + + Args: + genome_id (int or None): Genome ID to filter by. + genome_uuid (str or None): Genome UUID to filter by. 
+ assembly_uuid (Union[str, List[str]]): The assembly_uuid of the assembly(s) to fetch. + assembly_accession (str or None): Assembly accession to filter by. + assembly_sequence_accession (str or None): Assembly Sequence accession to filter by. + assembly_sequence_name (str or None): Assembly Sequence name to filter by. + chromosomal_only (bool): Flag indicating whether to fetch only chromosomal sequences. + + Returns: + list: A list of fetched sequences. + """ + genome_id = check_parameter(genome_id) + genome_uuid = check_parameter(genome_uuid) + assembly_uuid = check_parameter(assembly_uuid) + assembly_accession = check_parameter(assembly_accession) + assembly_sequence_accession = check_parameter(assembly_sequence_accession) + assembly_sequence_name = check_parameter(assembly_sequence_name) + + seq_select = db.select( + Genome, Assembly, AssemblySequence + ).select_from(Genome) \ + .join(Assembly, Assembly.assembly_id == Genome.assembly_id) \ + .join(AssemblySequence, AssemblySequence.assembly_id == Assembly.assembly_id) + + if chromosomal_only: + seq_select = seq_select.filter(AssemblySequence.chromosomal == 1) + + # These options are in order of decreasing specificity, + # and thus the ones later in the list can be redundant. 
+ if genome_id is not None: + seq_select = seq_select.filter(Genome.genome_id == genome_id) + + if genome_uuid is not None: + seq_select = seq_select.filter(Genome.genome_uuid == genome_uuid) + + if assembly_accession is not None: + seq_select = seq_select.filter(Assembly.accession == assembly_accession) + + if assembly_uuid is not None: + seq_select = seq_select.filter(Assembly.assembly_uuid.in_(assembly_uuid)) + + if assembly_sequence_accession is not None: + seq_select = seq_select.filter(AssemblySequence.accession == assembly_sequence_accession) + + if assembly_sequence_name is not None: + seq_select = seq_select.filter(AssemblySequence.name == assembly_sequence_name) + + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + return session.execute(seq_select).all() + + def fetch_sequences_by_genome_uuid(self, genome_uuid, chromosomal_only=False): + return self.fetch_sequences( + genome_uuid=genome_uuid, chromosomal_only=chromosomal_only + ) + + def fetch_sequences_by_assembly_accession( + self, assembly_accession, chromosomal_only=False + ): + return self.fetch_sequences( + assembly_accession=assembly_accession, chromosomal_only=chromosomal_only + ) + + def fetch_genome_datasets(self, genome_id=None, genome_uuid=None, organism_uuid=None, allow_unreleased=False, + unreleased_only=False, dataset_uuid=None, dataset_name=None, dataset_source=None, + dataset_type=None, release_version=None, dataset_attributes=None): + """ + Fetches genome datasets based on the provided parameters. + + Args: + genome_id (int or list or None): Genome ID(s) to filter by. + genome_uuid (str or list or None): Genome UUID(s) to filter by. + organism_uuid (str or list or None): Organism UUID(s) to filter by. + allow_unreleased (bool): Flag indicating whether to allow fetching unreleased datasets too or not. + unreleased_only (bool): Fetch only unreleased datasets (default: False).
allow_unreleased is used by gRPC + to fetch both released and unreleased datasets, while unreleased_only + is used in production pipelines (fetches only unreleased datasets) + dataset_uuid (str or list or None): Dataset UUID(s) to filter by. + dataset_name (str or None): Dataset name to filter by, default is 'assembly'. + dataset_source (str or None): Dataset source to filter by. + dataset_type (str or None): Dataset type to filter by. + release_version (float or None): EnsemblRelease version to filter by. + dataset_attributes (bool): Flag to include dataset attributes + + Returns: + List[Tuple[ + Genome, GenomeDataset, Dataset, DatasetType, + DatasetSource, EnsemblRelease, DatasetAttribute, Attribute + ]]: A list of tuples containing the fetched genome information. + Each tuple contains the following elements: + - Genome: An instance of the Genome class. + - Organism: An instance of the Organism class. + - GenomeDataset: An instance of the GenomeDataset class. + - Dataset: An instance of the Dataset class. + - DatasetType: An instance of the DatasetType class. + - DatasetSource: An instance of the DatasetSource class. + - EnsemblRelease: An instance of the EnsemblRelease class. + - DatasetAttribute: An instance of the DatasetAttribute class. + - Attribute: An instance of the Attribute class. + + Raises: + ValueError: If an exception occurs during the fetch process. 
+ + """ + try: + genome_select = db.select( + Genome, + GenomeDataset, + Dataset, + DatasetType, + DatasetSource, + ).select_from(Genome) \ + .join(GenomeDataset, Genome.genome_id == GenomeDataset.genome_id) \ + .join(Dataset, GenomeDataset.dataset_id == Dataset.dataset_id) \ + .join(DatasetType, Dataset.dataset_type_id == DatasetType.dataset_type_id) \ + .join(DatasetSource, Dataset.dataset_source_id == DatasetSource.dataset_source_id).order_by( + Genome.genome_uuid, Dataset.dataset_uuid) + + # set default group topic as 'assembly' to fetch unique datasource + if not dataset_name: + dataset_name = "assembly" + + genome_id = check_parameter(genome_id) + genome_uuid = check_parameter(genome_uuid) + organism_uuid = check_parameter(organism_uuid) + dataset_uuid = check_parameter(dataset_uuid) + dataset_name = check_parameter(dataset_name) + dataset_source = check_parameter(dataset_source) + dataset_type = check_parameter(dataset_type) + + if genome_id is not None: + genome_select = genome_select.filter(Genome.genome_id.in_(genome_id)) + + if genome_uuid is not None: + genome_select = genome_select.filter(Genome.genome_uuid.in_(genome_uuid)) + + if organism_uuid is not None: + genome_select = genome_select.join(Organism, Organism.organism_id == Genome.organism_id) \ + .filter(Organism.organism_uuid.in_(organism_uuid)) + + if dataset_uuid is not None: + genome_select = genome_select.filter(Dataset.dataset_uuid.in_(dataset_uuid)) + + if "all" in dataset_name: + # TODO: fetch the list dynamically from the DB + # TODO: you can as well simply remove the filter, if you want them all. 
+ dataset_type_names = [ + 'assembly', 'genebuild', 'variation', 'evidence', + 'regulation_build', 'homologies', 'regulatory_features' + ] + genome_select = genome_select.filter(DatasetType.name.in_(dataset_type_names)) + else: + genome_select = genome_select.filter(DatasetType.name.in_(dataset_name)) + + if dataset_source is not None: + genome_select = genome_select.filter(DatasetSource.name.in_(dataset_source)) + + if dataset_type is not None: + genome_select = genome_select.filter(DatasetType.name.in_(dataset_type)) + + if dataset_attributes: + genome_select = genome_select.add_columns(DatasetAttribute, Attribute) \ + .join(DatasetAttribute, DatasetAttribute.dataset_id == Dataset.dataset_id) \ + .join(Attribute, Attribute.attribute_id == DatasetAttribute.attribute_id).order_by(Attribute.name) + + if allow_unreleased: + # Get everything + pass + elif unreleased_only: + # Get only unreleased datasets + # this filter will get all Datasets entries where there's no associated GenomeDataset + # the tilde (~) symbol is used for negation. 
+ genome_select = genome_select.filter(~GenomeDataset.ensembl_release.has()) + else: + # Get released datasets only + # Check if dataset is released + with self.metadata_db.session_scope() as session: + # This is needed in order to avoid tests throwing: + # sqlalchemy.orm.exc.DetachedInstanceError: Instance + # is not bound to a Session; attribute refresh operation cannot proceed + # (Background on this error at: https://sqlalche.me/e/14/bhk3) + session.expire_on_commit = False + # Check if GenomeDataset HAS an ensembl_release + prep_query = genome_select.filter(GenomeDataset.ensembl_release.has()) + is_dataset_released = session.execute(prep_query).first() + + if is_dataset_released: + # Include release related info + genome_select = genome_select.add_columns(EnsemblRelease) \ + .join(EnsemblRelease, GenomeDataset.release_id == EnsemblRelease.release_id) + + if release_version: + genome_select = genome_select.filter(EnsemblRelease.version <= release_version) + + # print(f"genome_select str ====> {str(genome_select)}") + logger.debug(genome_select) + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + return session.execute(genome_select).all() + + except Exception as e: + raise ValueError(str(e)) + + def fetch_genomes_info( + self, + genome_id=None, + genome_uuid=None, + allow_unreleased_genomes=False, + ensembl_name=None, + group=None, + group_type=None, + allow_unreleased_datasets=False, + dataset_name=None, + dataset_source=None, + dataset_attributes=True, + + ): + try: + genome_id = check_parameter(genome_id) + genome_uuid = check_parameter(genome_uuid) + ensembl_name = check_parameter(ensembl_name) + group = check_parameter(group) + group_type = check_parameter(group_type) + dataset_name = check_parameter(dataset_name) + dataset_source = check_parameter(dataset_source) + + if group is None: + group_type = group_type if group_type else ['Division'] + with self.metadata_db.session_scope() as session: + session.expire_on_commit
= False + group = [org_type[0] for org_type in session.execute( + db.select(OrganismGroup.name).filter(OrganismGroup.type.in_(group_type))).all()] + + # get genome, assembly and organism information + genomes = self.fetch_genomes( + genome_id=genome_id, + genome_uuid=genome_uuid, + allow_unreleased=allow_unreleased_genomes, + ensembl_name=ensembl_name, + group=group, + group_type=group_type, + ) + + for genome in genomes: + dataset = self.fetch_genome_datasets( + genome_uuid=genome[0].genome_uuid, + allow_unreleased=allow_unreleased_datasets, + dataset_name=dataset_name, + dataset_source=dataset_source, + dataset_attributes=dataset_attributes + ) + res = [{'genome': genome, 'datasets': dataset}] + yield res + except Exception as e: + raise ValueError(str(e)) + + def fetch_organisms_group_counts(self, release_version=None, group_code='popular'): + o_species = aliased(Organism) + o = aliased(Organism) + if not release_version: + # Get latest released organisms + query = db.select( + o_species.species_taxonomy_id, + o_species.ensembl_name, + o_species.common_name, + o_species.scientific_name, + OrganismGroupMember.order.label('order'), + db.func.count().label('count') + ) + + query = query.join(o, o_species.species_taxonomy_id == o.species_taxonomy_id) + query = query.join(Genome, o.organism_id == Genome.organism_id) + query = query.join(Assembly, Genome.assembly_id == Assembly.assembly_id) + query = query.join(OrganismGroupMember, o_species.organism_id == OrganismGroupMember.organism_id) + query = query.join(OrganismGroup, + OrganismGroupMember.organism_group_id == OrganismGroup.organism_group_id) + query = query.filter(OrganismGroup.code == group_code) + + query = query.group_by( + o_species.species_taxonomy_id, + o_species.ensembl_name, + o_species.common_name, + o_species.scientific_name, + OrganismGroupMember.order + ) + query = query.order_by(OrganismGroupMember.order) + else: + # change group to release_version_state and related genomes + raise 
NotImplementedError('Not implemented yet') + pass + + with self.metadata_db.session_scope() as session: + # TODO check if we should return a dictionary instead + return session.execute(query).all() + + def fetch_related_assemblies_count(self, organism_uuid, release_version=None): + """ + Fetch all related assemblies for the same organism and all the ones sharing the same species_taxonomy_id + release_version is to return only the ones which were available until this release_version (not implemented yet) + """ + o_species = aliased(Organism) + o = aliased(Organism) + if not release_version: + # Get latest released organisms + query = db.select(db.func.count(o_species.ensembl_name)) + query = query.join(o, o_species.species_taxonomy_id == o.species_taxonomy_id) + query = query.join(Genome, o.organism_id == Genome.organism_id) + query = query.join(Assembly, Genome.assembly_id == Assembly.assembly_id) + query = query.filter(o_species.organism_uuid == organism_uuid) + else: + # change group to release_version_state and related genomes + raise NotImplementedError('Not implemented yet') + pass + + # print(f"query ---> {query}") + with self.metadata_db.session_scope() as session: + return session.execute(query).scalar() diff --git a/src/ensembl/production/metadata/grpc/adaptors/release.py b/src/ensembl/production/metadata/grpc/adaptors/release.py new file mode 100644 index 00000000..3237eb3e --- /dev/null +++ b/src/ensembl/production/metadata/grpc/adaptors/release.py @@ -0,0 +1,174 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging + +import sqlalchemy as db + +from ensembl.production.metadata.grpc.adaptors.base import check_parameter, BaseAdaptor +from ensembl.production.metadata.api.models import EnsemblRelease, EnsemblSite, GenomeRelease, Genome, GenomeDataset, \ + Dataset + +logger = logging.getLogger(__name__) + + +class ReleaseAdaptor(BaseAdaptor): + + def fetch_releases( + self, + release_id=None, + release_version=None, + current_only=True, + release_type=None, + site_name=None, + ): + """ + Fetches releases based on the provided parameters. + + Args: + release_id (int or list or None): Release ID(s) to filter by. + release_version (str or list or None): Release version(s) to filter by. + current_only (bool): Flag indicating whether to fetch only current releases. + release_type (str or list or None): Release type(s) to filter by. + site_name (str or list or None): Name(s) of the Ensembl site to filter by. + + Returns: + list: A list of fetched releases. 
+ """ + release_id = check_parameter(release_id) + release_version = check_parameter(release_version) + release_type = check_parameter(release_type) + site_name = check_parameter(site_name) + + release_select = db.select( + EnsemblRelease, EnsemblSite + ).join(EnsemblRelease.ensembl_site) + + # WHERE ensembl_release.release_id = :release_id_1 + if release_id is not None: + release_select = release_select.filter( + EnsemblRelease.release_id.in_(release_id) + ) + # WHERE ensembl_release.version = :version_1 + elif release_version is not None: + release_select = release_select.filter( + EnsemblRelease.version.in_(release_version) + ) + # WHERE ensembl_release.is_current =:is_current_1 + elif current_only: + release_select = release_select.filter( + EnsemblRelease.is_current == 1 + ) + + # WHERE ensembl_release.release_type = :release_type_1 + if release_type is not None: + release_select = release_select.filter( + EnsemblRelease.release_type.in_(release_type) + ) + + # WHERE ensembl_site.name = :name_1 + if site_name is not None: + release_select = release_select.filter( + EnsemblSite.name.in_(site_name) + ) + logger.debug(f"Query: {release_select}") + with self.metadata_db.session_scope() as session: + session.expire_on_commit = False + return session.execute(release_select).all() + + def fetch_releases_for_genome(self, genome_uuid, site_name=None): + + # SELECT genome_release.release_id + # FROM genome_release + # JOIN genome ON genome.genome_id = genome_release.genome_id + # WHERE genome.genome_uuid =:genome_uuid_1 + release_id_select = db.select( + GenomeRelease.release_id + ).filter( + Genome.genome_uuid == genome_uuid + ).join( + GenomeRelease.genome + ) + + release_ids = [] + with self.metadata_db.session_scope() as session: + release_objects = session.execute(release_id_select).all() + for rid in release_objects: + release_ids.append(rid[0]) + release_ids = list(dict.fromkeys(release_ids)) + return self.fetch_releases(release_id=release_ids, 
site_name=site_name) + + def fetch_releases_for_dataset(self, dataset_uuid, site_name=None): + + # SELECT genome_release.release_id + # FROM genome_dataset + # JOIN dataset ON dataset.dataset_id = genome_dataset.dataset_id + # WHERE dataset.dataset_uuid = :dataset_uuid_1 + release_id_select = db.select( + GenomeDataset.release_id + ).filter( + Dataset.dataset_uuid == dataset_uuid + ).join( + GenomeDataset.dataset + ) + + release_ids = [] + with self.metadata_db.session_scope() as session: + release_objects = session.execute(release_id_select).all() + for rid in release_objects: + release_ids.append(rid[0]) + release_ids = list(dict.fromkeys(release_ids)) + return self.fetch_releases(release_id=release_ids, site_name=site_name) + + +class NewReleaseAdaptor(BaseAdaptor): + + def __init__(self, metadata_uri=None): + super().__init__(metadata_uri) + # Get current release ID from ensembl_release + with self.metadata_db.session_scope() as session: + self.current_release_id = ( + session.execute(db.select(EnsemblRelease.release_id).filter(EnsemblRelease.is_current == 1)).one()[0]) + if self.current_release_id == "": + raise Exception("Current release not found") + logger.debug(f'Release ID: {self.current_release_id}') + + # Get last release ID from ensembl_release + with self.metadata_db.session_scope() as session: + ############### Refactor this once done. It is messy. 
+ current_version = int(session.execute( + db.select(EnsemblRelease.version).filter(EnsemblRelease.release_id == self.current_release_id)).one()[ + 0]) + past_versions = session.execute( + db.select(EnsemblRelease.version).filter(EnsemblRelease.version < current_version)).all() + sorted_versions = [] + # Do I have to account for 1.12 and 1.2 + for version in past_versions: + sorted_versions.append(float(version[0])) + sorted_versions.sort() + self.previous_release_id = (session.execute( + db.select(EnsemblRelease.release_id).filter(EnsemblRelease.version == sorted_versions[-1])).one()[0]) + if self.previous_release_id == "": + raise Exception("Previous release not found") + + # new_genomes (list of new genomes in the new release) + def fetch_new_genomes(self): + # TODO: this code must be never called yet, because it would never work!!!! + with self.metadata_db.session_scope() as session: + genome_selector = db.select( + EnsemblRelease, EnsemblSite + ).join(EnsemblRelease.ensembl_site) + old_genomes = session.execute( + db.select(EnsemblRelease.version).filter(EnsemblRelease.version < current_version)).all() + new_genomes = [] + novel_old_genomes = [] + novel_new_genomes = [] + return session.execute(release_select).all() diff --git a/src/ensembl/production/metadata/grpc/client_examples.py b/src/ensembl/production/metadata/grpc/client_examples.py new file mode 100755 index 00000000..9ae64b04 --- /dev/null +++ b/src/ensembl/production/metadata/grpc/client_examples.py @@ -0,0 +1,364 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

import grpc

# Import the messages through the package path, the same way the stub module
# below is imported, so the script does not depend on the ``grpc`` directory
# itself being on ``sys.path``.
from ensembl.production.metadata.grpc.ensembl_metadata_pb2 import (
    GenomeUUIDRequest,
    GenomeNameRequest,
    ReleaseRequest,
    GenomeSequenceRequest,
    AssemblyIDRequest,
    GenomeByKeywordRequest,
    AssemblyAccessionIDRequest,
    OrganismIDRequest,
    DatasetsRequest,
    GenomeDatatypeRequest,
    GenomeInfoRequest,
    OrganismsGroupRequest,
    AssemblyRegionRequest,
    GenomeAssemblySequenceRegionRequest,
    GenomeTagRequest,
)

import ensembl.production.metadata.grpc.ensembl_metadata_pb2_grpc as ensembl_metadata_pb2_grpc


def _print_stream(banner, responses):
    """Print ``**** banner ****`` followed by every message in *responses*."""
    print(f"**** {banner} ****")
    for message in responses:
        print(message)


def get_genome(stub, genome_request):
    """Fetch a single genome and print it.

    Args:
        stub: An ``EnsemblMetadataStub`` bound to an open channel.
        genome_request: A ``GenomeUUIDRequest`` (sent to ``GetGenomeByUUID``)
            or a ``GenomeNameRequest`` (sent to ``GetGenomeByName``).

    Prints "Unrecognised request message" for any other request type and
    "No genome" when the reply carries an empty ``genome_uuid``.
    """
    if isinstance(genome_request, GenomeUUIDRequest):
        genome = stub.GetGenomeByUUID(genome_request)
    elif isinstance(genome_request, GenomeNameRequest):
        genome = stub.GetGenomeByName(genome_request)
    else:
        print("Unrecognised request message")
        return

    # Check for the empty reply first so a miss is reported as "No genome"
    # instead of printing an empty message and then the notice.
    if genome.genome_uuid == "":
        print("No genome")
        return
    print(genome)


def get_genomes_by_keyword(stub, genome_request):
    """Print every genome streamed back by ``GetGenomesByKeyword``.

    Args:
        stub: An ``EnsemblMetadataStub`` bound to an open channel.
        genome_request: A ``GenomeByKeywordRequest``; any other type is
            reported with the same message ``get_genome`` uses.
    """
    if not isinstance(genome_request, GenomeByKeywordRequest):
        print("Unrecognised request message")
        return
    for genome in stub.GetGenomesByKeyword(genome_request):
        print(genome)


def get_genomes(stub):
    """Exercise the genome look-ups with valid and invalid sample requests."""
    single_lookups = (
        (
            "Valid UUID",
            GenomeUUIDRequest(genome_uuid="9caa2cae-d1c8-4cfc-9ffd-2e13bc3e95b1"),
        ),
        ("Invalid UUID", GenomeUUIDRequest(genome_uuid="rhubarb")),
        (
            "Name, no release",
            GenomeNameRequest(ensembl_name="129S1_SvImJ_v1", site_name="Ensembl"),
        ),
        (
            "Name, past release",
            GenomeNameRequest(
                ensembl_name="accipiter_gentilis",
                site_name="rapid",
                release_version=13.0,
            ),
        ),
        (
            "Invalid name",
            GenomeNameRequest(
                ensembl_name="banana", site_name="plants", release_version=104.0
            ),
        ),
    )
    keyword_lookups = (
        ("Valid keyword, no release", GenomeByKeywordRequest(keyword="Human")),
        ("Invalid keyword", GenomeByKeywordRequest(keyword="Bigfoot")),
    )

    for banner, request in single_lookups:
        print(f"**** {banner} ****")
        get_genome(stub, request)
    for banner, request in keyword_lookups:
        print(f"**** {banner} ****")
        get_genomes_by_keyword(stub, request)


def list_genome_sequences(stub):
    """Print the ``GetGenomeSequence`` stream for three sample requests."""
    genome_uuid = "2afef36f-3660-4b8c-819b-d1e5a77c9918"
    requests = (
        (
            "Only chromosomes",
            GenomeSequenceRequest(genome_uuid=genome_uuid, chromosomal_only=True),
        ),
        ("All sequences", GenomeSequenceRequest(genome_uuid=genome_uuid)),
        ("Invalid UUID", GenomeSequenceRequest(genome_uuid="garbage")),
    )
    for banner, request in requests:
        _print_stream(banner, stub.GetGenomeSequence(request))


def list_genome_assembly_sequences(stub):
    """Print ``GetAssemblyRegion`` results with and without the chromosomal filter."""
    genome_uuid = "2afef36f-3660-4b8c-819b-d1e5a77c9918"
    # Both calls are issued before either stream is consumed, as before.
    all_regions = stub.GetAssemblyRegion(
        AssemblyRegionRequest(genome_uuid=genome_uuid, chromosomal_only=False)
    )
    chromosomal_regions = stub.GetAssemblyRegion(
        AssemblyRegionRequest(genome_uuid=genome_uuid, chromosomal_only=True)
    )
    _print_stream("Chromosomal and non-chromosomal", all_regions)
    _print_stream("Chromosomal_only", chromosomal_regions)


def list_genome_assembly_sequences_region(stub):
    """Print ``GetGenomeAssemblySequenceRegion`` replies for two named regions."""
    requests = (
        (
            "Non-chromosomal",
            GenomeAssemblySequenceRegionRequest(
                genome_uuid="9caa2cae-d1c8-4cfc-9ffd-2e13bc3e95b1",
                sequence_region_name="HG03540#1#h1tg000001l",
            ),
        ),
        (
            "Chromosomal",
            GenomeAssemblySequenceRegionRequest(
                genome_uuid="2afef36f-3660-4b8c-819b-d1e5a77c9918",
                sequence_region_name="3",
            ),
        ),
    )
    for banner, request in requests:
        region = stub.GetGenomeAssemblySequenceRegion(request)
        print(f"**** {banner} ****")
        print(region)


def list_releases(stub):
    """Print the ``GetRelease`` stream for several sample filters.

    Each banner describes the request actually sent. ``site_name`` and
    ``release_version`` are passed as lists, so the last request shows
    several versions being queried in one call.
    """
    requests = (
        ("All releases", ReleaseRequest()),
        ("All Rapid releases", ReleaseRequest(site_name=["rapid"])),
        (
            "Current Rapid release",
            ReleaseRequest(site_name=["rapid"], current_only=True),
        ),
        ("Version 1", ReleaseRequest(release_version=[1])),
        ("Version 79", ReleaseRequest(release_version=[79])),
        # Previously this repeated the single-version request above under a
        # "two versions" banner; it now really asks for two versions.
        ("Versions 1 and 79", ReleaseRequest(release_version=[1, 79])),
    )
    for banner, request in requests:
        _print_stream(banner, stub.GetRelease(request))


def list_releases_by_uuid(stub):
    """Print the releases streamed by ``GetReleaseByUUID`` for one sample genome."""
    request = GenomeUUIDRequest(genome_uuid="a73351f7-93e7-11ec-a39d-005056b38ce3")
    _print_stream("Release for Narwhal", stub.GetReleaseByUUID(request))


def get_species_information_by_uuid(stub):
    """Print the ``GetSpeciesInformation`` reply for one sample genome UUID."""
    request = GenomeUUIDRequest(genome_uuid="9caa2cae-d1c8-4cfc-9ffd-2e13bc3e95b1")
    species = stub.GetSpeciesInformation(request)
    print("**** Species information ****")
    print(species)


def get_assembly_information(stub):
    """Print the ``GetAssemblyInformation`` reply for one sample assembly UUID."""
    request = AssemblyIDRequest(assembly_uuid="9d2dc346-358a-4c70-8fd8-3ff194246a76")
    assembly_info = stub.GetAssemblyInformation(request)
    print("**** Assembly information ****")
    print(assembly_info)


def get_genomes_by_assembly_accession(stub):
    """Print genomes for a sample assembly accession and for an unset one."""
    request = AssemblyAccessionIDRequest(assembly_accession="GCA_001624185.1")
    _print_stream(
        "Genomes from assembly accession information",
        stub.GetGenomesByAssemblyAccessionID(request),
    )

    # The second request passes ``None`` as the accession; the stream is
    # collected into a list so an empty result is still visible as ``[]``.
    null_request = AssemblyAccessionIDRequest(assembly_accession=None)
    genomes = stub.GetGenomesByAssemblyAccessionID(null_request)
    print("**** Genomes from null assembly accession ****")
    print(list(genomes))


def get_sub_species_info(stub):
    """Print the ``GetSubSpeciesInformation`` reply for one sample organism."""
    request = OrganismIDRequest(
        organism_uuid="86dd50f1-421e-4829-aca5-13ccc9a459f6",
        group="EnsemblPlants",
    )
    sub_species = stub.GetSubSpeciesInformation(request)
    print("**** Sub species information ****")
    print(sub_species)


def get_top_level_statistics(stub):
    """Print the ``GetTopLevelStatistics`` reply for one sample organism."""
    request = OrganismIDRequest(
        organism_uuid="86dd50f1-421e-4829-aca5-13ccc9a459f6",
        group="EnsemblPlants",
    )
    statistics = stub.GetTopLevelStatistics(request)
    print("**** Top level statistics ****")
    print(statistics)


def get_top_level_statistics_by_uuid(stub):
    """Print the ``GetTopLevelStatisticsByUUID`` reply for one sample genome."""
    request = GenomeUUIDRequest(genome_uuid="a7335667-93e7-11ec-a39d-005056b38ce3")
    statistics = stub.GetTopLevelStatisticsByUUID(request)
    print("**** Top level statistics by UUID ****")
    print(statistics)


def get_datasets_list_by_uuid(stub):
    """Print ``GetDatasetsListByUUID`` replies with and without a release version."""
    genome_uuid = "9caa2cae-d1c8-4cfc-9ffd-2e13bc3e95b1"
    requests = (
        ("Release not specified", DatasetsRequest(genome_uuid=genome_uuid)),
        (
            "Release specified",
            DatasetsRequest(genome_uuid=genome_uuid, release_version=108.0),
        ),
    )
    for banner, request in requests:
        print(f"**** {banner} ****")
        print(stub.GetDatasetsListByUUID(request))


def get_dataset_infos_by_dataset_type(stub):
    """Print the ``dataset_infos`` field of a ``GetDatasetInformation`` reply."""
    request = GenomeDatatypeRequest(
        genome_uuid="9caa2cae-d1c8-4cfc-9ffd-2e13bc3e95b1", dataset_type="assembly"
    )
    datasets = stub.GetDatasetInformation(request)
    print(datasets.dataset_infos)


def get_genome_uuid(stub):
    """Print ``GetGenomeUUID`` replies for three name/assembly combinations."""
    requests = (
        (
            "Using assembly_name",
            GenomeInfoRequest(
                ensembl_name="homo_sapiens_37", assembly_name="GRCh37.p13"
            ),
        ),
        (
            "Using assembly_default",
            GenomeInfoRequest(
                ensembl_name="homo_sapiens_37",
                assembly_name="GRCh37",
                use_default=True,
            ),
        ),
        (
            "Using assembly_default (No results)",
            GenomeInfoRequest(
                ensembl_name="homo_sapiens_37",
                assembly_name="GRCh37.p13",
                use_default=True,
            ),
        ),
    )
    # All calls are made before anything is printed, as before.
    replies = [(banner, stub.GetGenomeUUID(request)) for banner, request in requests]
    for banner, reply in replies:
        print(f"**** {banner} ****")
        print(reply)


def get_organisms_group_count(stub):
    """Print the ``GetOrganismsGroupCount`` reply for a default request."""
    print(stub.GetOrganismsGroupCount(OrganismsGroupRequest()))


def get_genome_uuid_by_tag(stub):
    """Print ``GetGenomeUUIDByTag`` replies for a handful of sample tags."""
    tags = ("grch37", "grch38", "r64-1-1", "foo")
    # All calls are made before anything is printed, as before.
    replies = [
        (tag, stub.GetGenomeUUIDByTag(GenomeTagRequest(genome_tag=tag)))
        for tag in tags
    ]
    for tag, reply in replies:
        print(f"**** Genome Tag: {tag} ****")
        print(reply)


def run():
    """Open an insecure channel to ``localhost:50051`` and run every example."""
    sections = (
        ("---------------Get Species Information-----------",
         get_species_information_by_uuid),
        ("---------------Get Assembly Information-----------",
         get_assembly_information),
        ("---------------Get Genome Information from assembly accession-----------",
         get_genomes_by_assembly_accession),
        ("---------------Get Subspecies Information-----------",
         get_sub_species_info),
        ("---------------Get Top Level Statistics-----------",
         get_top_level_statistics),
        ("---------------Get Top Level Statistics By UUID-----------",
         get_top_level_statistics_by_uuid),
        ("-------------- Get Genomes --------------", get_genomes),
        ("-------------- List Genome Sequences --------------",
         list_genome_sequences),
        ("-------------- List Genome Assembly Sequences --------------",
         list_genome_assembly_sequences),
        ("-------------- List Region Info for Given Sequence Name --------------",
         list_genome_assembly_sequences_region),
        ("-------------- List Releases --------------", list_releases),
        ("-------------- List Releases for Genome --------------",
         list_releases_by_uuid),
        ("---------------Get Datasets List-----------", get_datasets_list_by_uuid),
        ("-------------- List Dataset information for Genome --------------",
         get_dataset_infos_by_dataset_type),
        ("-------------- Get Genome UUID --------------", get_genome_uuid),
        ("-------------- Get Organisms Group Count --------------",
         get_organisms_group_count),
        ("-------------- Get Genome UUID By Tag --------------",
         get_genome_uuid_by_tag),
    )
    with grpc.insecure_channel("localhost:50051") as channel:
        stub = ensembl_metadata_pb2_grpc.EnsemblMetadataStub(channel)
        for banner, example in sections:
            print(banner)
            example(stub)


if __name__ == "__main__":
    logging.basicConfig()
    run()
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

# Spellings that switch a boolean flag off. Any other non-empty value keeps
# meaning "on", which is how a plain truthiness test treated it before.
_FALSE_STRINGS = frozenset({"", "0", "false", "f", "no", "n", "off"})


def _env_int(name, default):
    """Return environment variable *name* as an ``int``, or *default* if unset/blank.

    Raises:
        ValueError: If the variable is set to something that is not an integer.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err


def _env_bool(name, default):
    """Return environment variable *name* as a ``bool``, or *default* if unset.

    ``"false"``, ``"0"``, ``"no"``, ``"off"`` (any case) and the empty string
    are ``False``; every other value is ``True``.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in _FALSE_STRINGS


class MetadataConfig:
    """Settings read from the environment once, when this module is imported.

    Numeric and boolean settings are converted here, so each attribute has the
    same type whether it comes from the environment or from its default.
    """

    # Database connection URIs.
    metadata_uri: str = os.environ.get(
        "METADATA_URI", "mysql://ensembl@localhost:3306/ensembl_genome_metadata"
    )
    taxon_uri: str = os.environ.get(
        "TAXONOMY_URI", "mysql://ensembl@localhost:3306/ncbi_taxonomy"
    )
    # Pool settings; always ints (previously ``str`` when set via the environment).
    pool_size: int = _env_int("POOL_SIZE", 20)
    max_overflow: int = _env_int("MAX_OVERFLOW", 0)
    pool_recycle: int = _env_int("POOL_RECYCLE", 50)
    # Always a bool; ALLOW_UNRELEASED=false/0 now really disables the flag.
    allow_unreleased: bool = _env_bool("ALLOW_UNRELEASED", False)
+# source: ensembl/production/metadata/grpc/ensembl_metadata.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n7ensembl/production/metadata/grpc/ensembl_metadata.proto\x12\x10\x65nsembl_metadata\"\xbb\x02\n\x06Genome\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12,\n\x08\x61ssembly\x18\x02 \x01(\x0b\x32\x1a.ensembl_metadata.Assembly\x12&\n\x05taxon\x18\x03 \x01(\x0b\x32\x17.ensembl_metadata.Taxon\x12\x0f\n\x07\x63reated\x18\x04 \x01(\t\x12,\n\x08organism\x18\x05 \x01(\x0b\x32\x1a.ensembl_metadata.Organism\x12\x39\n\x0f\x61ttributes_info\x18\x06 \x01(\x0b\x32 .ensembl_metadata.AttributesInfo\x12 \n\x18related_assemblies_count\x18\x07 \x01(\x05\x12*\n\x07release\x18\x08 \x01(\x0b\x32\x19.ensembl_metadata.Release\"\x99\x01\n\x07Species\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x10\n\x08taxon_id\x18\x02 \x01(\r\x12\x17\n\x0fscientific_name\x18\x03 \x01(\t\x12 \n\x18scientific_parlance_name\x18\x04 \x01(\t\x12\x1b\n\x13genbank_common_name\x18\x05 \x01(\t\x12\x0f\n\x07synonym\x18\x06 \x03(\t\"\xb6\x01\n\x0c\x41ssemblyInfo\x12\x15\n\rassembly_uuid\x18\x01 \x01(\t\x12\x11\n\taccession\x18\x02 \x01(\t\x12\r\n\x05level\x18\x03 \x01(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\x13\n\x0b\x63hromosomal\x18\x05 \x01(\r\x12\x0e\n\x06length\x18\x06 \x01(\x04\x12\x19\n\x11sequence_location\x18\x07 \x01(\t\x12\x0b\n\x03md5\x18\x08 \x01(\t\x12\x12\n\nsha512t24u\x18\t \x01(\t\"O\n\nSubSpecies\x12\x15\n\rorganism_uuid\x18\x01 \x01(\t\x12\x14\n\x0cspecies_type\x18\x02 \x03(\t\x12\x14\n\x0cspecies_name\x18\x03 \x03(\t\"c\n\x13\x41ttributeStatistics\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05label\x18\x02 
\x01(\t\x12\x16\n\x0estatistic_type\x18\x03 \x01(\t\x12\x17\n\x0fstatistic_value\x18\x04 \x01(\t\"j\n\x18TopLevelStatisticsByUUID\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x39\n\nstatistics\x18\x02 \x03(\x0b\x32%.ensembl_metadata.AttributeStatistics\"u\n\x12TopLevelStatistics\x12\x15\n\rorganism_uuid\x18\x01 \x01(\t\x12H\n\x14stats_by_genome_uuid\x18\x02 \x03(\x0b\x32*.ensembl_metadata.TopLevelStatisticsByUUID\"\xb2\x01\n\x08\x41ssembly\x12\x11\n\taccession\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x11\n\tucsc_name\x18\x03 \x01(\t\x12\r\n\x05level\x18\x04 \x01(\t\x12\x14\n\x0c\x65nsembl_name\x18\x05 \x01(\t\x12\x15\n\rassembly_uuid\x18\x06 \x01(\t\x12\x14\n\x0cis_reference\x18\x07 \x01(\x08\x12\x10\n\x08url_name\x18\x08 \x01(\t\x12\x0e\n\x06tol_id\x18\t \x01(\t\"`\n\x05Taxon\x12\x13\n\x0btaxonomy_id\x18\x01 \x01(\r\x12\x17\n\x0fscientific_name\x18\x02 \x01(\t\x12\x0e\n\x06strain\x18\x03 \x01(\t\x12\x19\n\x11\x61lternative_names\x18\x04 \x03(\t\"\x9c\x01\n\x07Release\x12\x17\n\x0frelease_version\x18\x01 \x01(\x01\x12\x14\n\x0crelease_date\x18\x02 \x01(\t\x12\x15\n\rrelease_label\x18\x03 \x01(\t\x12\x12\n\nis_current\x18\x04 \x01(\x08\x12\x11\n\tsite_name\x18\x05 \x01(\t\x12\x12\n\nsite_label\x18\x06 \x01(\t\x12\x10\n\x08site_uri\x18\x07 \x01(\t\"\xde\x01\n\x08Organism\x12\x13\n\x0b\x63ommon_name\x18\x01 \x01(\t\x12\x0e\n\x06strain\x18\x02 \x01(\t\x12\x17\n\x0fscientific_name\x18\x03 \x01(\t\x12\x14\n\x0c\x65nsembl_name\x18\x04 \x01(\t\x12 \n\x18scientific_parlance_name\x18\x05 \x01(\t\x12\x15\n\rorganism_uuid\x18\x06 \x01(\t\x12\x13\n\x0bstrain_type\x18\x07 \x01(\t\x12\x13\n\x0btaxonomy_id\x18\x08 \x01(\x05\x12\x1b\n\x13species_taxonomy_id\x18\t \x01(\x05\"K\n\tAttribute\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05label\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\"\xa1\x03\n\x0e\x41ttributesInfo\x12\x18\n\x10genebuild_method\x18\x01 \x01(\t\x12 \n\x18genebuild_method_display\x18\x02 
\x01(\t\x12%\n\x1dgenebuild_last_geneset_update\x18\x03 \x01(\t\x12\x19\n\x11genebuild_version\x18\x04 \x01(\t\x12\x1f\n\x17genebuild_provider_name\x18\x05 \x01(\t\x12\x1e\n\x16genebuild_provider_url\x18\x06 \x01(\t\x12\x1d\n\x15genebuild_sample_gene\x18\x07 \x01(\t\x12!\n\x19genebuild_sample_location\x18\x08 \x01(\t\x12\x16\n\x0e\x61ssembly_level\x18\t \x01(\t\x12\x15\n\rassembly_date\x18\n \x01(\t\x12\x1e\n\x16\x61ssembly_provider_name\x18\x0b \x01(\t\x12\x1d\n\x15\x61ssembly_provider_url\x18\x0c \x01(\t\x12 \n\x18variation_sample_variant\x18\r \x01(\t\"\xa4\x02\n\x0c\x44\x61tasetInfos\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x14\n\x0c\x64\x61taset_type\x18\x02 \x01(\t\x12\x41\n\rdataset_infos\x18\x03 \x03(\x0b\x32*.ensembl_metadata.DatasetInfos.DatasetInfo\x1a\xa5\x01\n\x0b\x44\x61tasetInfo\x12\x14\n\x0c\x64\x61taset_uuid\x18\x01 \x01(\t\x12\x14\n\x0c\x64\x61taset_name\x18\x02 \x01(\t\x12\x0c\n\x04name\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x17\n\x0f\x64\x61taset_version\x18\x05 \x01(\t\x12\x15\n\rdataset_label\x18\x06 \x01(\t\x12\x0f\n\x07version\x18\x07 \x01(\x01\x12\r\n\x05value\x18\x08 \x01(\t\"q\n\x0eGenomeSequence\x12\x11\n\taccession\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x19\n\x11sequence_location\x18\x03 \x01(\t\x12\x0e\n\x06length\x18\x04 \x01(\x04\x12\x13\n\x0b\x63hromosomal\x18\x05 \x01(\x08\"r\n\x0e\x41ssemblyRegion\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04rank\x18\x02 \x01(\x05\x12\x0b\n\x03md5\x18\x03 \x01(\t\x12\x0e\n\x06length\x18\x04 \x01(\x04\x12\x12\n\nsha512t24u\x18\x05 \x01(\t\x12\x13\n\x0b\x63hromosomal\x18\x06 \x01(\x08\"r\n\x1cGenomeAssemblySequenceRegion\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0b\n\x03md5\x18\x02 \x01(\t\x12\x0e\n\x06length\x18\x03 \x01(\x04\x12\x12\n\nsha512t24u\x18\x04 \x01(\t\x12\x13\n\x0b\x63hromosomal\x18\x05 \x01(\x08\"\xac\x01\n\x08\x44\x61tasets\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12:\n\x08\x64\x61tasets\x18\x02 
\x03(\x0b\x32(.ensembl_metadata.Datasets.DatasetsEntry\x1aO\n\rDatasetsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12-\n\x05value\x18\x02 \x01(\x0b\x32\x1e.ensembl_metadata.DatasetInfos:\x02\x38\x01\"!\n\nGenomeUUID\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\"\x8f\x01\n\x0eOrganismsGroup\x12\x1b\n\x13species_taxonomy_id\x18\x01 \x01(\r\x12\x14\n\x0c\x65nsembl_name\x18\x02 \x01(\t\x12\x13\n\x0b\x63ommon_name\x18\x03 \x01(\t\x12\x17\n\x0fscientific_name\x18\x04 \x01(\t\x12\r\n\x05order\x18\x05 \x01(\r\x12\r\n\x05\x63ount\x18\x06 \x01(\r\"o\n\x13OrganismsGroupCount\x12?\n\x15organisms_group_count\x18\x01 \x03(\x0b\x32 .ensembl_metadata.OrganismsGroup\x12\x17\n\x0frelease_version\x18\x02 \x01(\x01\"A\n\x11GenomeUUIDRequest\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x17\n\x0frelease_version\x18\x02 \x01(\x01\"B\n\x16GenomeByKeywordRequest\x12\x0f\n\x07keyword\x18\x01 \x01(\t\x12\x17\n\x0frelease_version\x18\x02 \x01(\x01\"U\n\x11GenomeNameRequest\x12\x14\n\x0c\x65nsembl_name\x18\x01 \x01(\t\x12\x11\n\tsite_name\x18\x02 \x01(\t\x12\x17\n\x0frelease_version\x18\x03 \x01(\x01\"C\n\x11\x41ssemblyIDRequest\x12\x15\n\rassembly_uuid\x18\x01 \x01(\t\x12\x17\n\x0frelease_version\x18\x02 \x01(\x01\"Q\n\x1a\x41ssemblyAccessionIDRequest\x12\x1a\n\x12\x61ssembly_accession\x18\x01 \x01(\t\x12\x17\n\x0frelease_version\x18\x02 \x01(\x01\"9\n\x11OrganismIDRequest\x12\x15\n\rorganism_uuid\x18\x01 \x01(\t\x12\r\n\x05group\x18\x02 \x01(\t\"R\n\x0eReleaseRequest\x12\x11\n\tsite_name\x18\x01 \x03(\t\x12\x17\n\x0frelease_version\x18\x02 \x03(\x01\x12\x14\n\x0c\x63urrent_only\x18\x03 \x01(\x08\"F\n\x15GenomeSequenceRequest\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x18\n\x10\x63hromosomal_only\x18\x02 \x01(\x08\"F\n\x15\x41ssemblyRegionRequest\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x18\n\x10\x63hromosomal_only\x18\x02 \x01(\x08\"X\n#GenomeAssemblySequenceRegionRequest\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x1c\n\x14sequence_region_name\x18\x02 
\x01(\t\"?\n\x0f\x44\x61tasetsRequest\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x17\n\x0frelease_version\x18\x02 \x01(\x01\"B\n\x15GenomeDatatypeRequest\x12\x13\n\x0bgenome_uuid\x18\x01 \x01(\t\x12\x14\n\x0c\x64\x61taset_type\x18\x02 \x01(\t\"U\n\x11GenomeInfoRequest\x12\x14\n\x0c\x65nsembl_name\x18\x01 \x01(\t\x12\x15\n\rassembly_name\x18\x02 \x01(\t\x12\x13\n\x0buse_default\x18\x03 \x01(\x08\"0\n\x15OrganismsGroupRequest\x12\x17\n\x0frelease_version\x18\x01 \x01(\x01\"&\n\x10GenomeTagRequest\x12\x12\n\ngenome_tag\x18\x01 \x01(\t2\xd2\x0e\n\x0f\x45nsemblMetadata\x12R\n\x0fGetGenomeByUUID\x12#.ensembl_metadata.GenomeUUIDRequest\x1a\x18.ensembl_metadata.Genome\"\x00\x12T\n\rGetGenomeUUID\x12#.ensembl_metadata.GenomeInfoRequest\x1a\x1c.ensembl_metadata.GenomeUUID\"\x00\x12]\n\x13GetGenomesByKeyword\x12(.ensembl_metadata.GenomeByKeywordRequest\x1a\x18.ensembl_metadata.Genome\"\x00\x30\x01\x12m\n\x1fGetGenomesByAssemblyAccessionID\x12,.ensembl_metadata.AssemblyAccessionIDRequest\x1a\x18.ensembl_metadata.Genome\"\x00\x30\x01\x12Y\n\x15GetSpeciesInformation\x12#.ensembl_metadata.GenomeUUIDRequest\x1a\x19.ensembl_metadata.Species\"\x00\x12_\n\x16GetAssemblyInformation\x12#.ensembl_metadata.AssemblyIDRequest\x1a\x1e.ensembl_metadata.AssemblyInfo\"\x00\x12_\n\x18GetSubSpeciesInformation\x12#.ensembl_metadata.OrganismIDRequest\x1a\x1c.ensembl_metadata.SubSpecies\"\x00\x12\x64\n\x15GetTopLevelStatistics\x12#.ensembl_metadata.OrganismIDRequest\x1a$.ensembl_metadata.TopLevelStatistics\"\x00\x12p\n\x1bGetTopLevelStatisticsByUUID\x12#.ensembl_metadata.GenomeUUIDRequest\x1a*.ensembl_metadata.TopLevelStatisticsByUUID\"\x00\x12R\n\x0fGetGenomeByName\x12#.ensembl_metadata.GenomeNameRequest\x1a\x18.ensembl_metadata.Genome\"\x00\x12M\n\nGetRelease\x12 
.ensembl_metadata.ReleaseRequest\x1a\x19.ensembl_metadata.Release\"\x00\x30\x01\x12V\n\x10GetReleaseByUUID\x12#.ensembl_metadata.GenomeUUIDRequest\x1a\x19.ensembl_metadata.Release\"\x00\x30\x01\x12\x62\n\x11GetGenomeSequence\x12\'.ensembl_metadata.GenomeSequenceRequest\x1a .ensembl_metadata.GenomeSequence\"\x00\x30\x01\x12\x62\n\x11GetAssemblyRegion\x12\'.ensembl_metadata.AssemblyRegionRequest\x1a .ensembl_metadata.AssemblyRegion\"\x00\x30\x01\x12\x8a\x01\n\x1fGetGenomeAssemblySequenceRegion\x12\x35.ensembl_metadata.GenomeAssemblySequenceRegionRequest\x1a..ensembl_metadata.GenomeAssemblySequenceRegion\"\x00\x12X\n\x15GetDatasetsListByUUID\x12!.ensembl_metadata.DatasetsRequest\x1a\x1a.ensembl_metadata.Datasets\"\x00\x12\x62\n\x15GetDatasetInformation\x12\'.ensembl_metadata.GenomeDatatypeRequest\x1a\x1e.ensembl_metadata.DatasetInfos\"\x00\x12j\n\x16GetOrganismsGroupCount\x12\'.ensembl_metadata.OrganismsGroupRequest\x1a%.ensembl_metadata.OrganismsGroupCount\"\x00\x12X\n\x12GetGenomeUUIDByTag\x12\".ensembl_metadata.GenomeTagRequest\x1a\x1c.ensembl_metadata.GenomeUUID\"\x00\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'ensembl.production.metadata.grpc.ensembl_metadata_pb2', _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _DATASETS_DATASETSENTRY._options = None + _DATASETS_DATASETSENTRY._serialized_options = b'8\001' + _globals['_GENOME']._serialized_start=78 + _globals['_GENOME']._serialized_end=393 + _globals['_SPECIES']._serialized_start=396 + _globals['_SPECIES']._serialized_end=549 + _globals['_ASSEMBLYINFO']._serialized_start=552 + _globals['_ASSEMBLYINFO']._serialized_end=734 + _globals['_SUBSPECIES']._serialized_start=736 + _globals['_SUBSPECIES']._serialized_end=815 + _globals['_ATTRIBUTESTATISTICS']._serialized_start=817 + _globals['_ATTRIBUTESTATISTICS']._serialized_end=916 + 
_globals['_TOPLEVELSTATISTICSBYUUID']._serialized_start=918 + _globals['_TOPLEVELSTATISTICSBYUUID']._serialized_end=1024 + _globals['_TOPLEVELSTATISTICS']._serialized_start=1026 + _globals['_TOPLEVELSTATISTICS']._serialized_end=1143 + _globals['_ASSEMBLY']._serialized_start=1146 + _globals['_ASSEMBLY']._serialized_end=1324 + _globals['_TAXON']._serialized_start=1326 + _globals['_TAXON']._serialized_end=1422 + _globals['_RELEASE']._serialized_start=1425 + _globals['_RELEASE']._serialized_end=1581 + _globals['_ORGANISM']._serialized_start=1584 + _globals['_ORGANISM']._serialized_end=1806 + _globals['_ATTRIBUTE']._serialized_start=1808 + _globals['_ATTRIBUTE']._serialized_end=1883 + _globals['_ATTRIBUTESINFO']._serialized_start=1886 + _globals['_ATTRIBUTESINFO']._serialized_end=2303 + _globals['_DATASETINFOS']._serialized_start=2306 + _globals['_DATASETINFOS']._serialized_end=2598 + _globals['_DATASETINFOS_DATASETINFO']._serialized_start=2433 + _globals['_DATASETINFOS_DATASETINFO']._serialized_end=2598 + _globals['_GENOMESEQUENCE']._serialized_start=2600 + _globals['_GENOMESEQUENCE']._serialized_end=2713 + _globals['_ASSEMBLYREGION']._serialized_start=2715 + _globals['_ASSEMBLYREGION']._serialized_end=2829 + _globals['_GENOMEASSEMBLYSEQUENCEREGION']._serialized_start=2831 + _globals['_GENOMEASSEMBLYSEQUENCEREGION']._serialized_end=2945 + _globals['_DATASETS']._serialized_start=2948 + _globals['_DATASETS']._serialized_end=3120 + _globals['_DATASETS_DATASETSENTRY']._serialized_start=3041 + _globals['_DATASETS_DATASETSENTRY']._serialized_end=3120 + _globals['_GENOMEUUID']._serialized_start=3122 + _globals['_GENOMEUUID']._serialized_end=3155 + _globals['_ORGANISMSGROUP']._serialized_start=3158 + _globals['_ORGANISMSGROUP']._serialized_end=3301 + _globals['_ORGANISMSGROUPCOUNT']._serialized_start=3303 + _globals['_ORGANISMSGROUPCOUNT']._serialized_end=3414 + _globals['_GENOMEUUIDREQUEST']._serialized_start=3416 + _globals['_GENOMEUUIDREQUEST']._serialized_end=3481 + 
_globals['_GENOMEBYKEYWORDREQUEST']._serialized_start=3483 + _globals['_GENOMEBYKEYWORDREQUEST']._serialized_end=3549 + _globals['_GENOMENAMEREQUEST']._serialized_start=3551 + _globals['_GENOMENAMEREQUEST']._serialized_end=3636 + _globals['_ASSEMBLYIDREQUEST']._serialized_start=3638 + _globals['_ASSEMBLYIDREQUEST']._serialized_end=3705 + _globals['_ASSEMBLYACCESSIONIDREQUEST']._serialized_start=3707 + _globals['_ASSEMBLYACCESSIONIDREQUEST']._serialized_end=3788 + _globals['_ORGANISMIDREQUEST']._serialized_start=3790 + _globals['_ORGANISMIDREQUEST']._serialized_end=3847 + _globals['_RELEASEREQUEST']._serialized_start=3849 + _globals['_RELEASEREQUEST']._serialized_end=3931 + _globals['_GENOMESEQUENCEREQUEST']._serialized_start=3933 + _globals['_GENOMESEQUENCEREQUEST']._serialized_end=4003 + _globals['_ASSEMBLYREGIONREQUEST']._serialized_start=4005 + _globals['_ASSEMBLYREGIONREQUEST']._serialized_end=4075 + _globals['_GENOMEASSEMBLYSEQUENCEREGIONREQUEST']._serialized_start=4077 + _globals['_GENOMEASSEMBLYSEQUENCEREGIONREQUEST']._serialized_end=4165 + _globals['_DATASETSREQUEST']._serialized_start=4167 + _globals['_DATASETSREQUEST']._serialized_end=4230 + _globals['_GENOMEDATATYPEREQUEST']._serialized_start=4232 + _globals['_GENOMEDATATYPEREQUEST']._serialized_end=4298 + _globals['_GENOMEINFOREQUEST']._serialized_start=4300 + _globals['_GENOMEINFOREQUEST']._serialized_end=4385 + _globals['_ORGANISMSGROUPREQUEST']._serialized_start=4387 + _globals['_ORGANISMSGROUPREQUEST']._serialized_end=4435 + _globals['_GENOMETAGREQUEST']._serialized_start=4437 + _globals['_GENOMETAGREQUEST']._serialized_end=4475 + _globals['_ENSEMBLMETADATA']._serialized_start=4478 + _globals['_ENSEMBLMETADATA']._serialized_end=6352 +# @@protoc_insertion_point(module_scope) diff --git a/src/ensembl/production/metadata/grpc/ensembl_metadata_pb2_grpc.py b/src/ensembl/production/metadata/grpc/ensembl_metadata_pb2_grpc.py new file mode 100755 index 00000000..d2c66bc2 --- /dev/null +++ 
b/src/ensembl/production/metadata/grpc/ensembl_metadata_pb2_grpc.py @@ -0,0 +1,692 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from ensembl.production.metadata.grpc import ensembl_metadata_pb2 as ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2 + + +class EnsemblMetadataStub(object): + """IMPORTANT: the directory structure of the protos directory should mirror the structure of the src directory to avoid + Python import errors. + + Metadata for the genomes in Ensembl. + """ + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.GetGenomeByUUID = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetGenomeByUUID', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.FromString, + ) + self.GetGenomeUUID = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetGenomeUUID', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeInfoRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUID.FromString, + ) + self.GetGenomesByKeyword = channel.unary_stream( + '/ensembl_metadata.EnsemblMetadata/GetGenomesByKeyword', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeByKeywordRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.FromString, + ) + self.GetGenomesByAssemblyAccessionID = channel.unary_stream( + '/ensembl_metadata.EnsemblMetadata/GetGenomesByAssemblyAccessionID', + 
request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyAccessionIDRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.FromString, + ) + self.GetSpeciesInformation = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetSpeciesInformation', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Species.FromString, + ) + self.GetAssemblyInformation = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetAssemblyInformation', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyIDRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyInfo.FromString, + ) + self.GetSubSpeciesInformation = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetSubSpeciesInformation', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismIDRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.SubSpecies.FromString, + ) + self.GetTopLevelStatistics = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetTopLevelStatistics', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismIDRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.TopLevelStatistics.FromString, + ) + self.GetTopLevelStatisticsByUUID = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetTopLevelStatisticsByUUID', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.SerializeToString, + 
response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.TopLevelStatisticsByUUID.FromString, + ) + self.GetGenomeByName = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetGenomeByName', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeNameRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.FromString, + ) + self.GetRelease = channel.unary_stream( + '/ensembl_metadata.EnsemblMetadata/GetRelease', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.ReleaseRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Release.FromString, + ) + self.GetReleaseByUUID = channel.unary_stream( + '/ensembl_metadata.EnsemblMetadata/GetReleaseByUUID', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Release.FromString, + ) + self.GetGenomeSequence = channel.unary_stream( + '/ensembl_metadata.EnsemblMetadata/GetGenomeSequence', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeSequenceRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeSequence.FromString, + ) + self.GetAssemblyRegion = channel.unary_stream( + '/ensembl_metadata.EnsemblMetadata/GetAssemblyRegion', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyRegionRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyRegion.FromString, + ) + self.GetGenomeAssemblySequenceRegion = channel.unary_unary( + 
'/ensembl_metadata.EnsemblMetadata/GetGenomeAssemblySequenceRegion', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeAssemblySequenceRegionRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeAssemblySequenceRegion.FromString, + ) + self.GetDatasetsListByUUID = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetDatasetsListByUUID', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.DatasetsRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Datasets.FromString, + ) + self.GetDatasetInformation = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetDatasetInformation', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeDatatypeRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.DatasetInfos.FromString, + ) + self.GetOrganismsGroupCount = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetOrganismsGroupCount', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismsGroupRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismsGroupCount.FromString, + ) + self.GetGenomeUUIDByTag = channel.unary_unary( + '/ensembl_metadata.EnsemblMetadata/GetGenomeUUIDByTag', + request_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeTagRequest.SerializeToString, + response_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUID.FromString, + ) + + +
# Server-side base class for the ensembl_metadata.EnsemblMetadata service.
# Every handler below is a placeholder: it marks the call as
# grpc.StatusCode.UNIMPLEMENTED on the context, sets the detail text, and then
# raises NotImplementedError.
class EnsemblMetadataServicer(object):
    """IMPORTANT: the directory structure of the protos directory should mirror the structure of the src directory to avoid
    Python import errors.

    Metadata for the genomes in Ensembl.
    """

    def GetGenomeByUUID(self, request, context):
        """Retrieve genome by its UUID.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetGenomeUUID(self, request, context):
        """Retrieve genome UUID by providing production name and assembly id.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetGenomesByKeyword(self, request, context):
        """Retrieve genomes by keyword search
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetGenomesByAssemblyAccessionID(self, request, context):
        """Retrieve all genomes for a given assembly accession ID
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetSpeciesInformation(self, request, context):
        """Get species information for a genome UUID
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetAssemblyInformation(self, request, context):
        """Get assembly information
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetSubSpeciesInformation(self, request, context):
        """Get subspecies information
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetTopLevelStatistics(self, request, context):
        """Get top level statistics
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetTopLevelStatisticsByUUID(self, request, context):
        """Get top level statistics by UUID
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetGenomeByName(self, request, context):
        """Retrieve genome by Ensembl name and site, and optionally release.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetRelease(self, request, context):
        """Retrieve release details.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetReleaseByUUID(self, request, context):
        """Retrieve release details for a genome.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetGenomeSequence(self, request, context):
        """Retrieve sequence metadata for a genome's assembly.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetAssemblyRegion(self, request, context):
        """Retrieve region information for a genome's assembly.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetGenomeAssemblySequenceRegion(self, request, context):
        """Retrieve region information for a genome's assembly with a given sequence region name.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetDatasetsListByUUID(self, request, context):
        """Retrieve a list of dataset_ids associated with a genome UUID.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetDatasetInformation(self, request, context):
        """Retrieve dataset info by genome uuid and dataset_type
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetOrganismsGroupCount(self, request, context):
        """Retrieve organisms group count
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GetGenomeUUIDByTag(self, request, context):
        """Retrieve genome UUID by genome_tag.
        genome_tag value will be either in assembly.url_name or tol_id column
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_EnsemblMetadataServicer_to_server(servicer, server):
    """Register every RPC handler of ``servicer`` on ``server``.

    Each servicer method is wrapped in a gRPC method handler together with its
    request deserializer and response serializer. GetGenomesByKeyword,
    GetGenomesByAssemblyAccessionID, GetRelease, GetReleaseByUUID,
    GetGenomeSequence and GetAssemblyRegion use unary-stream handlers; all
    other methods use unary-unary handlers. The table is installed as a single
    generic handler under the service name 'ensembl_metadata.EnsemblMetadata'.
    """
    rpc_method_handlers = {
            'GetGenomeByUUID': grpc.unary_unary_rpc_method_handler(
                    servicer.GetGenomeByUUID,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.SerializeToString,
            ),
            'GetGenomeUUID': grpc.unary_unary_rpc_method_handler(
                    servicer.GetGenomeUUID,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeInfoRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUID.SerializeToString,
            ),
            'GetGenomesByKeyword': grpc.unary_stream_rpc_method_handler(
                    servicer.GetGenomesByKeyword,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeByKeywordRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.SerializeToString,
            ),
            'GetGenomesByAssemblyAccessionID': grpc.unary_stream_rpc_method_handler(
                    servicer.GetGenomesByAssemblyAccessionID,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyAccessionIDRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.SerializeToString,
            ),
            'GetSpeciesInformation': grpc.unary_unary_rpc_method_handler(
                    servicer.GetSpeciesInformation,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Species.SerializeToString,
            ),
            'GetAssemblyInformation': grpc.unary_unary_rpc_method_handler(
                    servicer.GetAssemblyInformation,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyIDRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyInfo.SerializeToString,
            ),
            'GetSubSpeciesInformation': grpc.unary_unary_rpc_method_handler(
                    servicer.GetSubSpeciesInformation,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismIDRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.SubSpecies.SerializeToString,
            ),
            'GetTopLevelStatistics': grpc.unary_unary_rpc_method_handler(
                    servicer.GetTopLevelStatistics,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismIDRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.TopLevelStatistics.SerializeToString,
            ),
            'GetTopLevelStatisticsByUUID': grpc.unary_unary_rpc_method_handler(
                    servicer.GetTopLevelStatisticsByUUID,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.TopLevelStatisticsByUUID.SerializeToString,
            ),
            'GetGenomeByName': grpc.unary_unary_rpc_method_handler(
                    servicer.GetGenomeByName,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeNameRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.SerializeToString,
            ),
            'GetRelease': grpc.unary_stream_rpc_method_handler(
                    servicer.GetRelease,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.ReleaseRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Release.SerializeToString,
            ),
            'GetReleaseByUUID': grpc.unary_stream_rpc_method_handler(
                    servicer.GetReleaseByUUID,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Release.SerializeToString,
            ),
            'GetGenomeSequence': grpc.unary_stream_rpc_method_handler(
                    servicer.GetGenomeSequence,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeSequenceRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeSequence.SerializeToString,
            ),
            'GetAssemblyRegion': grpc.unary_stream_rpc_method_handler(
                    servicer.GetAssemblyRegion,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyRegionRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyRegion.SerializeToString,
            ),
            'GetGenomeAssemblySequenceRegion': grpc.unary_unary_rpc_method_handler(
                    servicer.GetGenomeAssemblySequenceRegion,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeAssemblySequenceRegionRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeAssemblySequenceRegion.SerializeToString,
            ),
            'GetDatasetsListByUUID': grpc.unary_unary_rpc_method_handler(
                    servicer.GetDatasetsListByUUID,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.DatasetsRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Datasets.SerializeToString,
            ),
            'GetDatasetInformation': grpc.unary_unary_rpc_method_handler(
                    servicer.GetDatasetInformation,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeDatatypeRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.DatasetInfos.SerializeToString,
            ),
            'GetOrganismsGroupCount': grpc.unary_unary_rpc_method_handler(
                    servicer.GetOrganismsGroupCount,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismsGroupRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismsGroupCount.SerializeToString,
            ),
            'GetGenomeUUIDByTag': grpc.unary_unary_rpc_method_handler(
                    servicer.GetGenomeUUIDByTag,
                    request_deserializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeTagRequest.FromString,
                    response_serializer=ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUID.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'ensembl_metadata.EnsemblMetadata', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


 # This class is part of an EXPERIMENTAL API.
# Each static method below forwards to grpc.experimental.unary_unary or
# grpc.experimental.unary_stream with a fixed method path and the matching
# request serializer / response deserializer.
# NOTE(review): the forwarded positional order puts `insecure` ahead of
# `call_credentials`, unlike the wrapper signatures; this appears to follow the
# positional order of the grpc.experimental helpers -- confirm before reordering.
class EnsemblMetadata(object):
    """IMPORTANT: the directory structure of the protos directory should mirror the structure of the src directory to avoid
    Python import errors.

    Metadata for the genomes in Ensembl.
    """

    @staticmethod
    def GetGenomeByUUID(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetGenomeByUUID',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetGenomeUUID(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetGenomeUUID',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeInfoRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUID.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetGenomesByKeyword(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/ensembl_metadata.EnsemblMetadata/GetGenomesByKeyword',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeByKeywordRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetGenomesByAssemblyAccessionID(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/ensembl_metadata.EnsemblMetadata/GetGenomesByAssemblyAccessionID',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyAccessionIDRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetSpeciesInformation(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetSpeciesInformation',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Species.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetAssemblyInformation(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetAssemblyInformation',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyIDRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyInfo.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetSubSpeciesInformation(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetSubSpeciesInformation',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismIDRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.SubSpecies.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetTopLevelStatistics(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetTopLevelStatistics',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismIDRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.TopLevelStatistics.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetTopLevelStatisticsByUUID(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetTopLevelStatisticsByUUID',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.TopLevelStatisticsByUUID.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetGenomeByName(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetGenomeByName',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeNameRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Genome.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetRelease(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/ensembl_metadata.EnsemblMetadata/GetRelease',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.ReleaseRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Release.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetReleaseByUUID(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/ensembl_metadata.EnsemblMetadata/GetReleaseByUUID',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUIDRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Release.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetGenomeSequence(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/ensembl_metadata.EnsemblMetadata/GetGenomeSequence',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeSequenceRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeSequence.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetAssemblyRegion(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/ensembl_metadata.EnsemblMetadata/GetAssemblyRegion',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyRegionRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.AssemblyRegion.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetGenomeAssemblySequenceRegion(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetGenomeAssemblySequenceRegion',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeAssemblySequenceRegionRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeAssemblySequenceRegion.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetDatasetsListByUUID(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetDatasetsListByUUID',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.DatasetsRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.Datasets.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetDatasetInformation(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetDatasetInformation',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeDatatypeRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.DatasetInfos.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetOrganismsGroupCount(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetOrganismsGroupCount',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismsGroupRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.OrganismsGroupCount.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GetGenomeUUIDByTag(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/ensembl_metadata.EnsemblMetadata/GetGenomeUUIDByTag',
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeTagRequest.SerializeToString,
            ensembl_dot_production_dot_metadata_dot_grpc_dot_ensembl__metadata__pb2.GenomeUUID.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
diff --git a/src/ensembl/production/metadata/grpc/protobuf_msg_factory.py b/src/ensembl/production/metadata/grpc/protobuf_msg_factory.py new file mode 100644 index 00000000..05776016 --- /dev/null +++ b/src/ensembl/production/metadata/grpc/protobuf_msg_factory.py @@ -0,0 +1,383 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from ensembl.production.metadata.grpc import ensembl_metadata_pb2 + + +def create_species(species_data=None, taxo_info=None): + if species_data is None: + return ensembl_metadata_pb2.Species() + + species = ensembl_metadata_pb2.Species( + genome_uuid=species_data.Genome.genome_uuid, + taxon_id=species_data.Organism.taxonomy_id, + scientific_name=species_data.Organism.scientific_name, + scientific_parlance_name=species_data.Organism.scientific_parlance_name, + genbank_common_name=taxo_info["genbank_common_name"], + synonym=taxo_info["synonym"], + ) + return species + + +def create_stats_by_genome_uuid(data=None): + if data is None: + return ensembl_metadata_pb2.TopLevelStatisticsByUUID() + + # list of TopLevelStatisticsByUUID (see the proto file) + genome_uuid_stats = [] + # this dict will help us group stats by genome_uuid, protobuf is pain in the back... + # it won't let us do that while constructing the object + statistics = {} + for result in data: + # start creating a dictionary with genome_uuid as key and stats as value list + if result.Genome.genome_uuid not in list(statistics.keys()): + statistics[result.Genome.genome_uuid] = [] + + one_stat = ensembl_metadata_pb2.AttributeStatistics( + name=result.Attribute.name, + label=result.Attribute.label, + statistic_type=result.Attribute.type, + statistic_value=result.DatasetAttribute.value + ) + statistics[result.Genome.genome_uuid].append(one_stat) + + # now we can construct the object after having everything in statistics grouped by genome_uuid + for genome_uuid in list(statistics.keys()): + genome_uuid_stat = ensembl_metadata_pb2.TopLevelStatisticsByUUID() + genome_uuid_stat.genome_uuid = genome_uuid + for stat in statistics[genome_uuid]: + genome_uuid_stat.statistics.append(stat) + + genome_uuid_stats.append(genome_uuid_stat) + + return genome_uuid_stats + + +def create_top_level_statistics(data=None): + if data is None: + return ensembl_metadata_pb2.TopLevelStatistics() + + species = 
ensembl_metadata_pb2.TopLevelStatistics( + organism_uuid=data["organism_uuid"], + stats_by_genome_uuid=data["stats_by_genome_uuid"], + ) + return species + + +def create_top_level_statistics_by_uuid(data=None): + if data is None: + return ensembl_metadata_pb2.TopLevelStatisticsByUUID() + + species = ensembl_metadata_pb2.TopLevelStatisticsByUUID( + genome_uuid=data["genome_uuid"], + statistics=data["statistics"], + ) + return species + + +def create_sub_species(data=None): + if data is None: + return ensembl_metadata_pb2.SubSpecies() + + sub_species = ensembl_metadata_pb2.SubSpecies( + organism_uuid=data["organism_uuid"], + species_name=data["species_name"], + species_type=data["species_type"], + ) + return sub_species + + +def create_assembly(data=None): + if data is None: + return ensembl_metadata_pb2.AssemblyInfo() + + assembly = ensembl_metadata_pb2.Assembly( + assembly_uuid=data.Assembly.assembly_uuid, + accession=data.Assembly.accession, + level=data.Assembly.level, + name=data.Assembly.name, + ucsc_name=data.Assembly.ucsc_name, + ensembl_name=data.Assembly.ensembl_name, + is_reference=data.Assembly.is_reference, + url_name=data.Assembly.url_name, + tol_id=data.Assembly.tol_id, + ) + return assembly + + +def create_taxon(data=None, alternative_names=[]): + if data is None: + return ensembl_metadata_pb2.Taxon() + + taxon = ensembl_metadata_pb2.Taxon( + alternative_names=alternative_names, + taxonomy_id=data.Organism.taxonomy_id, + scientific_name=data.Organism.scientific_name, + strain=data.Organism.strain, + ) + return taxon + + +def create_organism(data=None): + if data is None: + return ensembl_metadata_pb2.Organism() + + organism = ensembl_metadata_pb2.Organism( + common_name=data.Organism.common_name, + strain=data.Organism.strain, + strain_type=data.Organism.strain_type, + scientific_name=data.Organism.scientific_name, + ensembl_name=data.Organism.ensembl_name, + scientific_parlance_name=data.Organism.scientific_parlance_name, + 
organism_uuid=data.Organism.organism_uuid, + taxonomy_id=data.Organism.taxonomy_id, + species_taxonomy_id=data.Organism.species_taxonomy_id, + ) + return organism + + +def create_attribute(data=None): + if data is None: + return ensembl_metadata_pb2.Attribute() + + attribute = ensembl_metadata_pb2.Attribute( + name=data.Attribute.name, + label=data.Attribute.label, + description=data.Attribute.description, + type=data.Attribute.type, + ) + return attribute + + +def create_attributes_info(data=None): + if data is None: + return ensembl_metadata_pb2.AttributesInfo() + + # from EA-1105 + required_attributes = { + "genebuild.method": "", + "genebuild.method_display": "", + "genebuild.last_geneset_update": "", + "genebuild.version": "", + "genebuild.provider_name": "", + "genebuild.provider_url": "", + "genebuild.sample_gene": "", + "genebuild.sample_location": "", + "assembly.level": "", + "assembly.date": "", + "assembly.provider_name": "", + "assembly.provider_url": "", + "variation.sample_variant": "" + } + + # set required_attributes values + for attrib_data in data: + attrib_name = attrib_data.Attribute.name + if attrib_name in list(required_attributes.keys()): + # print(f"%%%%%% {attrib_name} => {attrib_data.DatasetAttribute.value}") + required_attributes[attrib_name] = attrib_data.DatasetAttribute.value + + return ensembl_metadata_pb2.AttributesInfo( + genebuild_method=required_attributes["genebuild.method"], + genebuild_method_display=required_attributes["genebuild.method_display"], + genebuild_last_geneset_update=required_attributes["genebuild.last_geneset_update"], + genebuild_version=required_attributes["genebuild.version"], + genebuild_provider_name=required_attributes["genebuild.provider_name"], + genebuild_provider_url=required_attributes["genebuild.provider_url"], + genebuild_sample_gene=required_attributes["genebuild.sample_gene"], + genebuild_sample_location=required_attributes["genebuild.sample_location"], + 
assembly_level=required_attributes["assembly.level"], + assembly_date=required_attributes["assembly.date"], + assembly_provider_name=required_attributes["assembly.provider_name"], + assembly_provider_url=required_attributes["assembly.provider_url"], + variation_sample_variant=required_attributes["variation.sample_variant"], + ) + + +def create_assembly_info(data=None): + if data is None: + return ensembl_metadata_pb2.AssemblyInfo() + + assembly_info = ensembl_metadata_pb2.AssemblyInfo( + assembly_uuid=data.Assembly.assembly_uuid, + accession=data.Assembly.accession, + level=data.Assembly.level, + name=data.Assembly.name, + chromosomal=data.AssemblySequence.chromosomal, + length=data.AssemblySequence.length, + sequence_location=data.AssemblySequence.sequence_location, + md5=data.AssemblySequence.md5, + sha512t24u=data.AssemblySequence.sha512t24u, + ) + return assembly_info + + +def create_genome_uuid(data=None): + if data is None: + return ensembl_metadata_pb2.GenomeUUID() + + genome_uuid = ensembl_metadata_pb2.GenomeUUID( + genome_uuid=data["genome_uuid"] + ) + return genome_uuid + + +def create_genome(data=None, attributes=None, count=0, alternative_names=[]): + if data is None: + return ensembl_metadata_pb2.Genome() + + assembly = create_assembly(data) + taxon = create_taxon(data, alternative_names) + organism = create_organism(data) + attributes_info = create_attributes_info(attributes) + release = create_release(data) + + genome = ensembl_metadata_pb2.Genome( + genome_uuid=data.Genome.genome_uuid, + created=str(data.Genome.created), + assembly=assembly, + taxon=taxon, + organism=organism, + attributes_info=attributes_info, + release=release, + related_assemblies_count=count + ) + return genome + + +def create_genome_sequence(data=None): + if data is None: + return ensembl_metadata_pb2.GenomeSequence() + + genome_sequence = ensembl_metadata_pb2.GenomeSequence( + accession=data.AssemblySequence.accession, + name=data.AssemblySequence.name, + 
def create_release(data=None):
    """Build a ``Release`` protobuf message from a release query result row.

    Fields start from placeholder values and are overridden with real data
    only for the sub-records (``EnsemblRelease``, ``EnsemblSite``) that the
    row actually exposes.

    Args:
        data: Result row that may expose ``EnsemblRelease`` and/or
            ``EnsemblSite`` members. When ``None`` an empty message is
            returned.

    Returns:
        ensembl_metadata_pb2.Release: The populated (or empty) message.
    """
    if data is None:
        return ensembl_metadata_pb2.Release()

    # Placeholders used when the row carries no release / site information.
    fields = {
        "release_version": None,
        "release_date": "Unreleased",
        "release_label": "Unreleased",
        "is_current": False,
        "site_name": "Unknown (not released yet)",
        "site_label": "Unknown (not released yet)",
        "site_uri": "Unknown (not released yet)",
    }

    if hasattr(data, 'EnsemblRelease'):
        ensembl_release = data.EnsemblRelease
        fields["release_version"] = ensembl_release.version
        fields["release_date"] = str(ensembl_release.release_date)
        fields["release_label"] = ensembl_release.label
        fields["is_current"] = ensembl_release.is_current

    if hasattr(data, 'EnsemblSite'):
        ensembl_site = data.EnsemblSite
        fields["site_name"] = ensembl_site.name
        fields["site_label"] = ensembl_site.label
        fields["site_uri"] = ensembl_site.uri

    return ensembl_metadata_pb2.Release(**fields)
def create_organisms_group_count(data, release_version):
    """Build an ``OrganismsGroupCount`` message from positional result rows.

    Args:
        data: Iterable of indexable rows read positionally as
            ``(species_taxonomy_id, ensembl_name, common_name,
            scientific_name, order, count)``. When ``None`` an empty message
            is returned.
        release_version: Value stored in the message's ``release_version``.

    Returns:
        ensembl_metadata_pb2.OrganismsGroupCount: The populated (or empty)
        message.
    """
    if data is None:
        return ensembl_metadata_pb2.OrganismsGroupCount()

    groups = [
        ensembl_metadata_pb2.OrganismsGroup(
            species_taxonomy_id=row[0],
            ensembl_name=row[1],
            common_name=row[2],
            scientific_name=row[3],
            order=row[4],
            count=row[5],
        )
        for row in data
    ]

    return ensembl_metadata_pb2.OrganismsGroupCount(
        organisms_group_count=groups,
        release_version=release_version
    )
def serve():
    """Start the metadata gRPC server and block until it terminates.

    The server runs on a 10-worker thread pool, registers a single
    ``EnsemblMetadataServicer`` and listens without transport security on
    ``[::]:50051``.
    """
    worker_pool = futures.ThreadPoolExecutor(max_workers=10)
    grpc_server = grpc.server(worker_pool)

    servicer = EnsemblMetadataServicer()
    ensembl_metadata_pb2_grpc.add_EnsemblMetadataServicer_to_server(servicer, grpc_server)

    grpc_server.add_insecure_port("[::]:50051")
    grpc_server.start()
    grpc_server.wait_for_termination()
class EnsemblMetadataServicer(ensembl_metadata_pb2_grpc.EnsemblMetadataServicer):
    """gRPC servicer whose handlers delegate to functions in ``utils``.

    Every handler forwards ``self.db`` together with the relevant fields of
    ``request`` and returns whatever the ``utils`` function returns. The
    ``context`` argument is not used by any handler.
    """

    def __init__(self):
        # Connection object created once at construction and passed to every
        # ``utils`` call made by the handlers below.
        self.db = utils.connect_to_db()

    def GetSpeciesInformation(self, request, context):
        """Delegate to ``utils.get_species_information`` using ``request.genome_uuid``."""
        return utils.get_species_information(self.db, request.genome_uuid)

    def GetAssemblyInformation(self, request, context):
        """Delegate to ``utils.get_assembly_information`` using ``request.assembly_uuid``."""
        return utils.get_assembly_information(self.db, request.assembly_uuid)

    def GetGenomesByAssemblyAccessionID(self, request, context):
        """Delegate to ``utils.get_genomes_from_assembly_accession_iterator``.

        Forwards ``request.assembly_accession`` and ``request.release_version``.
        """
        return utils.get_genomes_from_assembly_accession_iterator(
            self.db, request.assembly_accession, request.release_version
        )

    def GetSubSpeciesInformation(self, request, context):
        """Delegate to ``utils.get_sub_species_info`` using ``organism_uuid`` and ``group``."""
        return utils.get_sub_species_info(self.db, request.organism_uuid, request.group)

    def GetTopLevelStatistics(self, request, context):
        """Delegate to ``utils.get_top_level_statistics`` using ``organism_uuid`` and ``group``."""
        return utils.get_top_level_statistics(self.db, request.organism_uuid, request.group)

    def GetTopLevelStatisticsByUUID(self, request, context):
        """Delegate to ``utils.get_top_level_statistics_by_uuid`` using ``request.genome_uuid``."""
        return utils.get_top_level_statistics_by_uuid(self.db, request.genome_uuid)

    def GetGenomeUUID(self, request, context):
        """Delegate to ``utils.get_genome_uuid``.

        Forwards ``request.ensembl_name``, ``request.assembly_name`` and
        ``request.use_default``.
        """
        return utils.get_genome_uuid(self.db, request.ensembl_name, request.assembly_name, request.use_default)

    def GetGenomeByUUID(self, request, context):
        """Delegate to ``utils.get_genome_by_uuid`` using ``genome_uuid`` and ``release_version``."""
        return utils.get_genome_by_uuid(self.db, request.genome_uuid, request.release_version)

    def GetGenomesByKeyword(self, request, context):
        """Delegate to ``utils.get_genomes_by_keyword_iterator`` using ``keyword`` and ``release_version``."""
        return utils.get_genomes_by_keyword_iterator(
            self.db, request.keyword, request.release_version
        )

    def GetGenomeByName(self, request, context):
        """Delegate to ``utils.get_genome_by_name``.

        Forwards ``request.ensembl_name``, ``request.site_name`` and
        ``request.release_version``.
        """
        return utils.get_genome_by_name(
            self.db, request.ensembl_name, request.site_name, request.release_version
        )

    def GetRelease(self, request, context):
        """Delegate to ``utils.release_iterator``.

        Forwards ``request.site_name``, ``request.release_version`` and
        ``request.current_only``.
        """
        return utils.release_iterator(
            self.db, request.site_name, request.release_version, request.current_only
        )

    def GetReleaseByUUID(self, request, context):
        """Delegate to ``utils.release_by_uuid_iterator`` using ``request.genome_uuid``."""
        return utils.release_by_uuid_iterator(self.db, request.genome_uuid)

    def GetGenomeSequence(self, request, context):
        """Delegate to ``utils.genome_sequence_iterator`` using ``genome_uuid`` and ``chromosomal_only``."""
        return utils.genome_sequence_iterator(
            self.db, request.genome_uuid, request.chromosomal_only
        )

    def GetAssemblyRegion(self, request, context):
        """Delegate to ``utils.assembly_region_iterator`` using ``genome_uuid`` and ``chromosomal_only``."""
        return utils.assembly_region_iterator(
            self.db, request.genome_uuid, request.chromosomal_only
        )

    def GetGenomeAssemblySequenceRegion(self, request, context):
        """Delegate to ``utils.genome_assembly_sequence_region``.

        Forwards ``request.genome_uuid`` and ``request.sequence_region_name``.
        """
        return utils.genome_assembly_sequence_region(
            self.db, request.genome_uuid, request.sequence_region_name
        )

    def GetDatasetsListByUUID(self, request, context):
        """Delegate to ``utils.get_datasets_list_by_uuid`` using ``genome_uuid`` and ``release_version``."""
        return utils.get_datasets_list_by_uuid(
            self.db, request.genome_uuid, request.release_version
        )

    def GetDatasetInformation(self, request, context):
        """Delegate to ``utils.get_dataset_by_genome_and_dataset_type``.

        Forwards ``request.genome_uuid`` and ``request.dataset_type``.
        """
        return utils.get_dataset_by_genome_and_dataset_type(
            self.db, request.genome_uuid, request.dataset_type
        )

    def GetOrganismsGroupCount(self, request, context):
        """Delegate to ``utils.get_organisms_group_count`` using ``request.release_version``."""
        return utils.get_organisms_group_count(
            self.db, request.release_version
        )

    def GetGenomeUUIDByTag(self, request, context):
        """Delegate to ``utils.get_genome_uuid_by_tag`` using ``request.genome_tag``."""
        return utils.get_genome_uuid_by_tag(self.db, request.genome_tag)
def get_alternative_names(db_conn, taxon_id):
    """Return the sorted, de-duplicated alternative names of a taxon.

    The names are the taxon's ``synonym`` entries plus its
    ``genbank_common_name`` when that is present.

    Args:
        db_conn: Object providing ``fetch_taxonomy_names(taxon_id)``, which
            returns a mapping keyed by taxonomy id.
        taxon_id: Taxonomy id to look up.

    Returns:
        list: Unique alternative names in ascending order; empty when the
        taxon has neither synonyms nor a GenBank common name.

    Raises:
        KeyError: If ``taxon_id`` is absent from the fetched mapping.
    """
    taxon_info = db_conn.fetch_taxonomy_names(taxon_id)[taxon_id]

    # Copy into a new set: this tolerates a missing/None 'synonym' entry and
    # avoids appending to the list owned by the fetched taxonomy record.
    names = set(taxon_info.get('synonym') or ())

    genbank_common_name = taxon_info.get('genbank_common_name')
    if genbank_common_name is not None:
        names.add(genbank_common_name)

    # Sorted so the output order is deterministic (tests rely on it).
    return sorted(names)
def get_top_level_statistics_by_uuid(db_conn, genome_uuid):
    """Collect the dataset attribute statistics of one genome.

    Args:
        db_conn: Adaptor providing ``fetch_genome_datasets``.
        genome_uuid: Genome to report on; ``None`` yields an empty message.

    Returns:
        The message built by
        ``msg_factory.create_top_level_statistics_by_uuid``: populated with
        one statistic per fetched row, or empty when nothing was fetched.
    """
    if genome_uuid is None:
        return msg_factory.create_top_level_statistics_by_uuid()

    rows = db_conn.fetch_genome_datasets(
        genome_uuid=genome_uuid,
        dataset_name="all",
        dataset_attributes=True
    )
    if len(rows) == 0:
        return msg_factory.create_top_level_statistics_by_uuid()

    statistics = [
        {
            'name': row.Attribute.name,
            'label': row.Attribute.label,
            'statistic_type': row.Attribute.type,
            'statistic_value': row.DatasetAttribute.value
        }
        for row in rows
    ]
    payload = {"genome_uuid": genome_uuid, "statistics": statistics}
    return msg_factory.create_top_level_statistics_by_uuid(payload)
def get_sub_species_info(db_conn, organism_uuid, group):
    """Collect the distinct organism-group types and names of an organism.

    Args:
        db_conn: Adaptor providing ``fetch_genomes``.
        organism_uuid: Organism to look up; ``None`` yields an empty message.
        group: Group filter forwarded to ``fetch_genomes``.

    Returns:
        The message built by ``msg_factory.create_sub_species``: populated
        with the unique types/names in first-seen order, or empty when no
        genome matched.
    """
    if organism_uuid is None:
        return msg_factory.create_sub_species()

    rows = db_conn.fetch_genomes(
        organism_uuid=organism_uuid,
        group=group,
        allow_unreleased=cfg.allow_unreleased
    )
    if len(rows) == 0:
        return msg_factory.create_sub_species()

    species_type = []
    species_name = []
    for row in rows:
        organism_group = row.OrganismGroup
        # Append each value only the first time it is seen.
        for value, seen in ((organism_group.type, species_type),
                            (organism_group.name, species_name)):
            if value not in seen:
                seen.append(value)

    return msg_factory.create_sub_species({
        'organism_uuid': organism_uuid,
        'species_type': species_type,
        'species_name': species_name
    })
def get_genomes_by_keyword_iterator(db_conn, keyword, release_version):
    """Yield one ``Genome`` message per assembly matching a keyword.

    When several matching genomes share an assembly accession, only the one
    with the highest ``EnsemblRelease.version`` is yielded (the first one
    fetched wins on ties). Assemblies are yielded in the order in which they
    first appear in the fetched results.

    Args:
        db_conn: Adaptor providing ``fetch_genome_by_keyword``.
        keyword: Search keyword; a falsy value yields nothing.
        release_version: Release filter forwarded to the adaptor.

    Yields:
        The message built by ``msg_factory.create_genome`` for each selected
        genome row. Nothing is yielded when there is no match.
    """
    # This is a generator: a value attached to ``return`` would be discarded
    # by the consumer, so "no result" is expressed as an empty stream.
    if not keyword:
        return

    genome_results = db_conn.fetch_genome_by_keyword(
        keyword=keyword,
        release_version=release_version
    )

    # Group through a dict rather than ``itertools.groupby``: ``groupby``
    # only merges *consecutive* rows, so results not ordered by accession
    # would produce several "most recent" genomes for the same assembly.
    most_recent_by_accession = {}
    for genome in genome_results:
        accession = genome.Assembly.accession
        current = most_recent_by_accession.get(accession)
        # Strict comparison keeps the first-seen row on equal versions.
        if current is None or genome.EnsemblRelease.version > current.EnsemblRelease.version:
            most_recent_by_accession[accession] = genome

    for genome_row in most_recent_by_accession.values():
        yield msg_factory.create_genome(data=genome_row)
def genome_sequence_iterator(db_conn, genome_uuid, chromosomal_only):
    """Yield a ``GenomeSequence`` message for each sequence of a genome.

    Args:
        db_conn: Adaptor providing ``fetch_sequences``.
        genome_uuid: Genome whose sequences are fetched; ``None`` yields
            nothing and the adaptor is not queried.
        chromosomal_only: Filter flag forwarded to ``fetch_sequences``.

    Yields:
        The message built by ``msg_factory.create_genome_sequence`` for each
        fetched row, in fetch order.
    """
    if genome_uuid is not None:
        sequence_rows = db_conn.fetch_sequences(
            genome_uuid=genome_uuid,
            chromosomal_only=chromosomal_only,
        )
        yield from (msg_factory.create_genome_sequence(row) for row in sequence_rows)
def get_genome_uuid_by_tag(db_conn, genome_tag):
    """Resolve a genome tag to a ``GenomeUUID`` message.

    Args:
        db_conn: Adaptor providing ``fetch_genomes``.
        genome_tag: Tag to look up; ``None`` yields an empty message without
            querying the adaptor.

    Returns:
        The message built by ``msg_factory.create_genome_uuid``: populated
        only when exactly one genome matches the tag, otherwise empty.
    """
    if genome_tag is not None:
        matches = db_conn.fetch_genomes(
            genome_tag=genome_tag,
            allow_unreleased=cfg.allow_unreleased
        )
        # Zero or several matches are both answered with an empty message.
        if len(matches) == 1:
            uuid_payload = {"genome_uuid": matches[0].Genome.genome_uuid}
            return msg_factory.create_genome_uuid(uuid_payload)

    return msg_factory.create_genome_uuid()
+""" +Unit tests for api module +""" + from pathlib import Path -from unittest import mock -from unittest.mock import Mock, patch import pytest -import re - -import sqlalchemy -from ensembl.database import UnitTestDB, DBConnection -from ensembl.production.metadata.api.exceptions import UpdateBackCoreException -from ensembl.production.metadata.api.factory import meta_factory -from ensembl.production.metadata.api.models import Organism, Assembly, Dataset, AssemblySequence, DatasetAttribute, \ - DatasetSource, DatasetType, Attribute, Genome -from ensembl.core.models import Meta - -from ensembl.production.metadata.updater.core import CoreMetaUpdater +from ensembl.database import DBConnection +from ensembl.database import UnitTestDB +from ensembl.production.metadata.api.models import Organism, Genome db_directory = Path(__file__).parent / 'databases' db_directory = db_directory.resolve() @@ -62,3 +56,4 @@ def test_organism_ensembl_name_compat(self, multi_dbs): ensembl_name = session.query(Organism).filter(Organism.ensembl_name == 'homo_sapiens').first() biosample_id = session.query(Organism).filter(Organism.biosample_id == 'homo_sapiens').first() assert ensembl_name.organism_uuid == biosample_id.organism_uuid + diff --git a/src/tests/test_grpc.py b/src/tests/test_grpc.py new file mode 100644 index 00000000..967c3af7 --- /dev/null +++ b/src/tests/test_grpc.py @@ -0,0 +1,425 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for api module +""" + +import pytest +import pkg_resources +from pathlib import Path + +from ensembl.database import UnitTestDB +from ensembl.production.metadata.grpc.adaptors.genome import GenomeAdaptor +from ensembl.production.metadata.grpc.adaptors.release import ReleaseAdaptor + +distribution = pkg_resources.get_distribution("ensembl-metadata-api") +sample_path = Path(distribution.location) / "ensembl" / "production" / "metadata" / "api" / "sample" + + +@pytest.mark.parametrize("multi_dbs", [[{"src": sample_path / "ensembl_metadata"}, + {"src": sample_path / "ncbi_taxonomy"}]], + indirect=True) +class TestMetadataDB: + dbc = None # type: UnitTestDB + + def test_load_database(self, multi_dbs): + db_test = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) + assert db_test, "DB should not be empty" + + @pytest.mark.parametrize( + "allow_unreleased, unreleased_only, current_only, output_count", + [ + # fetches everything (7 released + 2 unreleased) + (True, False, True, 9), + # fetches all released genomes (with current_only=0) + (False, False, False, 7), + # fetches released genomes with current_only=1 (default) + (False, False, True, 6), + # fetches all unreleased genomes + (False, True, True, 2), + ] + ) + def test_fetch_all_genomes(self, multi_dbs, allow_unreleased, unreleased_only, current_only, output_count): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes( + allow_unreleased=allow_unreleased, + unreleased_only=unreleased_only, + current_only=current_only + ) + assert len(test) == output_count + + def test_fetch_with_all_args_no_conflict(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes( + 
genome_uuid="a733550b-93e7-11ec-a39d-005056b38ce3", + assembly_accession="GCA_000002985.3", + assembly_name="WBcel235", + ensembl_name="caenorhabditis_elegans", + taxonomy_id="6239", + group="EnsemblMetazoa", + allow_unreleased=False, + site_name="Ensembl", + release_type="integrated", + release_version="108.0", + current_only=True + ) + assert len(test) == 0 + + def test_fetch_with_all_args_conflict(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes( + genome_uuid="a733550b-93e7-11ec-a39d-005056b38ce3", + assembly_accession="GCA_000002985.3", + assembly_name="WBcel235", + ensembl_name="caenorhabditis_elegans", + taxonomy_id="9606", # Conflicting taxonomy_id + group="EnsemblBacteria", # Conflicting group + allow_unreleased=False, + site_name="Ensembl", + release_type="integrated", + release_version="108.0", + current_only=True + ) + assert len(test) == 0 + + def test_fetch_releases(self, multi_dbs): + conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) + test = conn.fetch_releases(release_id=2) + # test the one to many connection + assert test[0].EnsemblSite.name == 'Ensembl' + assert test[0].EnsemblSite.label == 'Ensembl Genome Browser' + # test the direct access. 
+ assert test[0].EnsemblRelease.label == 'Scaling Phase 1' + + # currently only have one release, so the testing is not comprehensive + def test_fetch_releases_for_genome(self, multi_dbs): + conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) + test = conn.fetch_releases_for_genome('a73351f7-93e7-11ec-a39d-005056b38ce3') + assert test[0].EnsemblSite.name == 'Ensembl' + + def test_fetch_releases_for_dataset(self, multi_dbs): + conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url) + test = conn.fetch_releases_for_dataset('3316fe1a-83e7-46da-8a56-cf2b693d8060') + assert test[0].EnsemblSite.name == 'Ensembl' + + def test_fetch_taxonomy_names(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_taxonomy_names(taxonomy_ids=[6239, 511145]) + assert test[511145]['scientific_name'] == 'Escherichia coli str. K-12 substr. MG1655' + + def test_fetch_taxonomy_ids(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_taxonomy_ids(taxonomy_names='Caenorhabditis elegans') + assert test[0] == 6239 + + def test_fetch_genomes(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes(genome_uuid='a7335667-93e7-11ec-a39d-005056b38ce3') + assert test[0].Organism.scientific_name == 'Homo sapiens' + + # def test_fetch_genomes_by_group_division(self, multi_dbs): + # conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + # taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + # division_filter = 'EnsemblVertebrates' + # test = conn.fetch_genomes(group=division_filter) + # assert len(test) == 1 + # Other PR will likely change this drastically, so the effort is not really necessary. Their are 7 groups. 
+ # assert division_filter in division_results + + def test_fetch_genomes_by_genome_uuid(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes_by_genome_uuid('a733550b-93e7-11ec-a39d-005056b38ce3') + assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' + + def test_fetch_genome_by_ensembl_and_assembly_name(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes(assembly_name='WBcel235', ensembl_name='caenorhabditis_elegans') + assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' + + def test_fetch_genomes_by_assembly_accession(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes_by_assembly_accession('GCA_000005845.2') + assert test[0].Organism.scientific_name == 'Escherichia coli str. K-12 substr. MG1655 str. 
K12 (GCA_000005845)' + + def test_fetch_genomes_by_assembly_sequence_accession(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_sequences( + genome_uuid='a7335667-93e7-11ec-a39d-005056b38ce3', + assembly_accession='GCA_000001405.28', + assembly_sequence_accession='CM000686.2' + ) + assert test[0].AssemblySequence.name == 'Y' + + def test_fetch_genomes_by_assembly_sequence_accession_empty(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_sequences( + genome_uuid='s0m3-r4nd0m-g3n3-uu1d-v4lu3', + assembly_accession='GCA_000001405.28', + assembly_sequence_accession='CM000686.2' + ) + assert len(test) == 0 + + def test_fetch_genomes_by_ensembl_name(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes_by_ensembl_name('caenorhabditis_elegans') + assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' + + def test_fetch_genomes_by_taxonomy_id(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes_by_taxonomy_id(6239) + assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' + + def test_fetch_genomes_by_scientific_name(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes_by_scientific_name( + scientific_name='Caenorhabditis elegans', + site_name='Ensembl' + ) + assert test[0].Organism.scientific_name == 'Caenorhabditis elegans' + + def test_fetch_sequences(self, multi_dbs): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + 
taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_sequences(assembly_uuid='eeaaa2bf-151c-4848-8b85-a05a9993101e') + # this test is going to drive me nuts + # Locally and on GitLab CI/CD: AssemblySequence.accession == 'CHR_HG107_PATCH' + # in Travis, its: AssemblySequence.accession == 'KI270757.1' + # to please bothI'm using 'sequence_location' for now + assert test[0].AssemblySequence.sequence_location == 'SO:0000738' + + @pytest.mark.parametrize( + "genome_uuid, assembly_accession, chromosomal_only, expected_output", + [ + # Chromosomal and non-chromosomal + ("a7335667-93e7-11ec-a39d-005056b38ce3", "GCA_000001405.28", False, 0), + # Chromosomal only + ("a7335667-93e7-11ec-a39d-005056b38ce3", "GCA_000001405.28", True, 1), + ] + ) + def test_fetch_sequences_chromosomal(self, multi_dbs, genome_uuid, assembly_accession, chromosomal_only, + expected_output): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_sequences( + genome_uuid=genome_uuid, + assembly_accession=assembly_accession, + chromosomal_only=chromosomal_only + ) + assert test[-1].AssemblySequence.chromosomal == expected_output + + @pytest.mark.parametrize( + "genome_uuid, assembly_sequence_name, chromosomal_only, expected_output", + [ + ("a7335667-93e7-11ec-a39d-005056b38ce3", "MT", False, "J01415.2"), + ("a7335667-93e7-11ec-a39d-005056b38ce3", "LRG_778", False, "LRG_778"), + ("a7335667-93e7-11ec-a39d-005056b38ce3", "LRG_778", True, None), + ("some-random-genome-uuid", "LRG_778", False, None), + ("a7335667-93e7-11ec-a39d-005056b38ce3", "fake_assembly_name", False, None), + ("some-random-genome-uuid", "fake_assembly_name", False, None), + ] + ) + def test_fetch_sequences_by_assembly_seq_name(self, multi_dbs, genome_uuid, assembly_sequence_name, + chromosomal_only, expected_output): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + 
taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_sequences( + genome_uuid=genome_uuid, + assembly_sequence_name=assembly_sequence_name, + chromosomal_only=chromosomal_only + ) + for result in test: + assert result.AssemblySequence.accession == expected_output + + @pytest.mark.parametrize( + "genome_uuid, dataset_uuid, allow_unreleased, unreleased_only, expected_dataset_uuid, expected_count", + [ + # nothing specified + allow_unreleased -> fetches everything + (None, None, True, False, "6e82999b-7a8c-429c-a2af-8d77a59a2e81", 32), + # specifying genome_uuid + ("a73357ab-93e7-11ec-a39d-005056b38ce3", None, False, False, "0dc05c6e-2910-4dbd-879a-719ba97d5824", 5), + # specifying dataset_uuid + (None, "0dc05c6e-2910-4dbd-879a-719ba97d5824", False, False, "0dc05c6e-2910-4dbd-879a-719ba97d5824", 1), + # fetch unreleased datasets only + (None, None, False, True, "385f1ec2-bd06-40ce-873a-98e199f10505", 1), + ] + ) + def test_fetch_genome_dataset_all( + self, multi_dbs, genome_uuid, + dataset_uuid, allow_unreleased, + unreleased_only, expected_dataset_uuid, + expected_count): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genome_datasets( + genome_uuid=genome_uuid, + dataset_uuid=dataset_uuid, + unreleased_only=unreleased_only, + allow_unreleased=allow_unreleased, + # fetch all datasets (default: dataset_name="assembly") + dataset_name="all" + ) + assert test[0].Dataset.dataset_uuid == expected_dataset_uuid + assert len(test) == expected_count + + @pytest.mark.parametrize( + "organism_uuid, expected_count", + [ + # homo_sapien + ("db2a5f09-2db8-429b-a407-c15a4ca2876d", 11), + # e-coli + ("21279e3e-e651-43e1-a6fc-79e390b9e8a8", 3), + # non-existing organism + ("organism-yet-to-be-discovered", 0), + ] + ) + def test_fetch_genome_dataset_by_organism_uuid(self, multi_dbs, organism_uuid, expected_count): + conn = 
GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genome_datasets( + organism_uuid=organism_uuid, + # fetch all datasets (default: dataset_name="assembly") + dataset_name="all" + ) + assert len(test) == expected_count + + @pytest.mark.parametrize( + "ensembl_name, assembly_name, use_default_assembly, expected_output", + [ + ("homo_sapiens", "GRCh37.p13", False, "3704ceb1-948d-11ec-a39d-005056b38ce3"), + ("homo_sapiens", "GRCh37", True, "3704ceb1-948d-11ec-a39d-005056b38ce3"), + ] + ) + def test_fetch_genome_uuid(self, multi_dbs, ensembl_name, assembly_name, use_default_assembly, expected_output): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes( + ensembl_name=ensembl_name, + assembly_name=assembly_name, + use_default_assembly=use_default_assembly, + allow_unreleased=False, + current_only=False + ) + assert len(test) == 1 + assert test[0].Genome.genome_uuid == expected_output + + @pytest.mark.parametrize( + "ensembl_name, assembly_name, use_default_assembly, expected_output", + [ + ("homo_sapiens", "GRCh38.p13", False, "a7335667-93e7-11ec-a39d-005056b38ce3"), + ("homo_sapiens", "GRCh38", True, "a7335667-93e7-11ec-a39d-005056b38ce3"), + ] + ) + def test_fetch_genome_uuid_is_current(self, multi_dbs, ensembl_name, assembly_name, use_default_assembly, + expected_output): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes( + ensembl_name=ensembl_name, + assembly_name=assembly_name, + use_default_assembly=use_default_assembly, + allow_unreleased=False + ) + assert len(test) == 1 + assert test[0].Genome.genome_uuid == expected_output + + @pytest.mark.parametrize( + "ensembl_name, assembly_name, use_default_assembly", + [ + ("homo_sapiens", "GRCh37", False), + ("homo_sapiens", 
"GRCh37.p13", True), + ] + ) + def test_fetch_genome_uuid_empty(self, multi_dbs, ensembl_name, assembly_name, use_default_assembly): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes( + ensembl_name=ensembl_name, + assembly_name=assembly_name, + use_default_assembly=use_default_assembly + ) + assert len(test) == 0 + + @pytest.mark.parametrize( + "species_taxonomy_id, expected_organism, expected_assemblies_count", + [ + # fetch everything + (None, "Human", 3) + ] + ) + def test_fetch_organisms_group_counts(self, multi_dbs, species_taxonomy_id, expected_organism, + expected_assemblies_count): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_organisms_group_counts() + # When fetching everything: + # First result should be Human + assert test[0][2] == expected_organism + # We should have three assemblies associated with Human (Two for grch37.38 organism + one t2t) + assert test[0][5] == expected_assemblies_count + + for data in test[1:]: + # All others have only one genome in test DB + assert data[5] == 1 + + @pytest.mark.parametrize( + "organism_uuid, expected_assemblies_count", + [ + # Human + ('db2a5f09-2db8-429b-a407-c15a4ca2876d', 3), + # Triticum aestivum + ('d64c34ca-b37a-476b-83b5-f21d07a3ae67', 1), + ] + ) + def test_fetch_related_assemblies_count(self, multi_dbs, organism_uuid, expected_assemblies_count): + conn = GenomeAdaptor( + metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url + ) + + test = conn.fetch_related_assemblies_count(organism_uuid=organism_uuid) + # We should have three assemblies associated with Human (Two for grch37.38 organism + one t2t) + assert test == expected_assemblies_count + + @pytest.mark.parametrize( + "allow_unreleased, output_count, expected_genome_uuid", + [ + # fetches everything + 
(True, 9, "a73356e1-93e7-11ec-a39d-005056b38ce3"), + # fetches released datasets and genomes with current_only=1 (default) + (False, 6, "a73356e1-93e7-11ec-a39d-005056b38ce3"), + ] + ) + def test_fetch_genomes_info(self, multi_dbs, allow_unreleased, output_count, expected_genome_uuid): + conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, + taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) + test = conn.fetch_genomes_info( + allow_unreleased_genomes=allow_unreleased, + allow_unreleased_datasets=allow_unreleased, + group_type=['division', 'internal'] + ) + output_to_list = list(test) + assert len(output_to_list) == output_count + assert output_to_list[0][0]['genome'].Genome.genome_uuid == expected_genome_uuid diff --git a/src/tests/test_protobuf_msg_factory.py b/src/tests/test_protobuf_msg_factory.py new file mode 100644 index 00000000..1e2b379c --- /dev/null +++ b/src/tests/test_protobuf_msg_factory.py @@ -0,0 +1,287 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Unit tests for protobuf_msg_factory.py +""" +import json +from pathlib import Path + +import pkg_resources +import pytest +from ensembl.database import UnitTestDB +from google.protobuf import json_format + +import ensembl.production.metadata.grpc.protobuf_msg_factory as msg_factory + +distribution = pkg_resources.get_distribution("ensembl-metadata-api") +sample_path = Path(distribution.location) / "ensembl" / "production" / "metadata" / "api" / "sample" + + +@pytest.mark.parametrize("multi_dbs", [[{"src": sample_path / "ensembl_metadata"}, + {"src": sample_path / "ncbi_taxonomy"}]], + indirect=True) +class TestClass: + dbc = None # type: UnitTestDB + + def test_create_genome(self, multi_dbs, genome_db_conn): + """Test service.create_genome function""" + genome_input_data = genome_db_conn.fetch_genomes( + genome_uuid="a7335667-93e7-11ec-a39d-005056b38ce3" + ) + # Make sure we are only getting one + assert len(genome_input_data) == 1 + + attrib_input_data = genome_db_conn.fetch_genome_datasets( + genome_uuid=genome_input_data[0].Genome.genome_uuid, + dataset_attributes=True + ) + # 11 attributes + assert len(attrib_input_data) == 11 + + related_assemblies_input_count = genome_db_conn.fetch_related_assemblies_count( + organism_uuid=genome_input_data[0].Organism.organism_uuid + ) + # There are three related assemblies + assert related_assemblies_input_count == 3 + + expected_output = { + "genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3", + "assembly": { + "accession": "GCA_000001405.28", + "assemblyUuid": "eeaaa2bf-151c-4848-8b85-a05a9993101e", + "name": "GRCh38.p13", + "ucscName": "hg38", + "level": "chromosome", + "ensemblName": "GRCh38.p13", + "isReference": True, + "urlName": "GRCh38" + }, + "taxon": { + "taxonomyId": 9606, + "scientificName": "Homo sapiens" + }, + "created": "2023-05-12 13:30:58", + "attributesInfo": { + "assemblyLevel": "chromosome", + "assemblyDate": "2013-12" + }, + "organism": { + "commonName": "Human", + "ensemblName": 
"Homo_sapiens", + "organismUuid": "db2a5f09-2db8-429b-a407-c15a4ca2876d", + "scientificName": "Homo sapiens", + "scientificParlanceName": "homo_sapiens", + "speciesTaxonomyId": 9606, + "taxonomyId": 9606 + }, + "relatedAssembliesCount": 3, + "release": { + "releaseVersion": 108.0, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "isCurrent": True, + "siteName": "Ensembl", + "siteLabel": "Ensembl Genome Browser", + "siteUri": "https://beta.ensembl.org" + } + } + + output = json_format.MessageToJson( + msg_factory.create_genome( + data=genome_input_data[0], + attributes=attrib_input_data, + count=related_assemblies_input_count + ) + ) + assert json.loads(output) == expected_output + + def test_create_assembly_info(self, multi_dbs, genome_db_conn): + input_data = genome_db_conn.fetch_sequences(assembly_uuid="eeaaa2bf-151c-4848-8b85-a05a9993101e") + expected_output = { + "accession": "GCA_000001405.28", + "assemblyUuid": "eeaaa2bf-151c-4848-8b85-a05a9993101e", + # "chromosomal": 1, + "length": "71251", + "level": "chromosome", + "name": "GRCh38.p13", + "sequenceLocation": "SO:0000738" + } + + output = json_format.MessageToJson(msg_factory.create_assembly_info(input_data[0])) + assert json.loads(output) == expected_output + + def test_create_species(self, multi_dbs, genome_db_conn): + species_input_data = genome_db_conn.fetch_genomes(genome_uuid="a7335667-93e7-11ec-a39d-005056b38ce3") + tax_id = species_input_data[0].Organism.taxonomy_id + taxo_results = genome_db_conn.fetch_taxonomy_names(tax_id) + expected_output = { + "genbankCommonName": "human", + "genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3", + "scientificName": "Homo sapiens", + "scientificParlanceName": "homo_sapiens", + "taxonId": 9606 + } + + output = json_format.MessageToJson(msg_factory.create_species(species_input_data[0], taxo_results[tax_id])) + assert json.loads(output) == expected_output + + def test_create_stats_by_organism_uuid(self, genome_db_conn): + organism_uuid = 
"21279e3e-e651-43e1-a6fc-79e390b9e8a8" + input_data = genome_db_conn.fetch_genome_datasets( + organism_uuid=organism_uuid, + dataset_attributes=True, + dataset_name="all" + ) + + first_expected_stat = { + 'label': 'Average CDS length', + 'name': 'average_cds_length', + 'statisticType': 'bp', + 'statisticValue': '938.55' + } + output = json_format.MessageToJson(msg_factory.create_stats_by_genome_uuid(input_data)[0]) + assert json.loads(output)['genomeUuid'] == "a73351f7-93e7-11ec-a39d-005056b38ce3" + # check the first stat info of the first genome_uuid + # print(json.loads(output)['statistics']) + assert json.loads(output)['statistics'][0] == first_expected_stat + + def test_create_top_level_statistics(self, multi_dbs, genome_db_conn): + organism_uuid = "21279e3e-e651-43e1-a6fc-79e390b9e8a8" + input_data = genome_db_conn.fetch_genome_datasets( + organism_uuid=organism_uuid, + dataset_attributes=True, + dataset_name="all" + ) + + first_expected_stat = { + 'label': 'Average CDS length', + 'name': 'average_cds_length', + 'statisticType': 'bp', + 'statisticValue': '938.55' + } + stats_by_genome_uuid = msg_factory.create_stats_by_genome_uuid(input_data) + + output = json_format.MessageToJson( + msg_factory.create_top_level_statistics({ + 'organism_uuid': organism_uuid, + 'stats_by_genome_uuid': stats_by_genome_uuid + }) + ) + output_dict = json.loads(output) + assert 'organismUuid' in output_dict.keys() and 'statsByGenomeUuid' in output_dict.keys() + # These tests are pain in the back + # TODO: find a way to improve this spaghetti + assert output_dict["organismUuid"] == "21279e3e-e651-43e1-a6fc-79e390b9e8a8" + assert output_dict['statsByGenomeUuid'][0]['genomeUuid'] == "a73351f7-93e7-11ec-a39d-005056b38ce3" + assert output_dict['statsByGenomeUuid'][0]['statistics'][0] == first_expected_stat + + def test_create_genome_sequence(self, multi_dbs, genome_db_conn): + input_data = genome_db_conn.fetch_sequences(genome_uuid="a7335667-93e7-11ec-a39d-005056b38ce3") + 
expected_output = { + "accession": "KI270757.1", + # "chromosomal": True, + "length": "71251", + # "name": "CHR_HG1_PATCH", + "sequenceLocation": "SO:0000738" + } + output = json_format.MessageToJson(msg_factory.create_genome_sequence(input_data[0])) + assert json.loads(output) == expected_output + + def test_create_assembly_region(self, multi_dbs, genome_db_conn): + input_data = genome_db_conn.fetch_sequences( + genome_uuid="a7335667-93e7-11ec-a39d-005056b38ce3", + ) + # TODO: Check why this is failing when name and chromosomal is provided + expected_output = { + # "name": "CHR_HG1_PATCH", + "length": "71251", + # "chromosomal": True + } + output = json_format.MessageToJson(msg_factory.create_assembly_region(input_data[0])) + assert json.loads(output) == expected_output + + def test_create_genome_assembly_sequence_region(self, multi_dbs, genome_db_conn): + input_data = genome_db_conn.fetch_sequences( + genome_uuid="a7335667-93e7-11ec-a39d-005056b38ce3", + assembly_accession="GCA_000001405.28", + assembly_sequence_accession="CM000686.2" + ) + expected_output = { + "name": "Y", + "length": "57227415", + "chromosomal": True + } + output = json_format.MessageToJson(msg_factory.create_assembly_region(input_data[0])) + assert json.loads(output) == expected_output + + def test_create_release(self, multi_dbs, release_db_conn): + input_data = release_db_conn.fetch_releases(release_version=108) + expected_output = { + "releaseVersion": 108.0, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "isCurrent": True, + "siteName": "Ensembl", + "siteLabel": "Ensembl Genome Browser", + "siteUri": "https://beta.ensembl.org" + } + output = json_format.MessageToJson(msg_factory.create_release(input_data[0])) + assert json.loads(output) == expected_output + + def test_create_organisms_group_count(self, multi_dbs, genome_db_conn): + input_data = genome_db_conn.fetch_organisms_group_counts() + expected_result = { + "organismsGroupCount": [ + { + "speciesTaxonomyId": 
9606, + "ensemblName": "Homo_sapiens", + "commonName": "Human", + "scientificName": "Homo sapiens", + "order": 1, + "count": 3 + } + ] + } + # we have 6 organism in the test data + assert len(input_data) == 6 + # send just the first element + output = json_format.MessageToJson( + msg_factory.create_organisms_group_count( + data=[input_data[0]], + release_version=None + ) + ) + assert json.loads(output) == expected_result + + @pytest.mark.parametrize( + "genome_tag, current_only, expected_output", + [ + # url_name = GRCh38 => homo_sapien 38 + ("GRCh38", True, {"genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3"}), + # tol_id = mHomSap1 => homo_sapien 37 + # I randomly picked up this tol_id, probably wrong (biologically speaking) + ("GRCh37", False, {"genomeUuid": "3704ceb1-948d-11ec-a39d-005056b38ce3"}), + # Null + ("iDontExist", False, {}), + ] + ) + def test_create_genome_uuid(self, genome_db_conn, genome_tag, current_only, expected_output): + input_data = genome_db_conn.fetch_genomes( + genome_tag=genome_tag, + current_only=current_only + ) + + genome_uuid = input_data[0].Genome.genome_uuid if len(input_data) == 1 else "" + output = json_format.MessageToJson( + msg_factory.create_genome_uuid({"genome_uuid": genome_uuid}) + ) + assert json.loads(output) == expected_output diff --git a/src/tests/test_utils.py b/src/tests/test_utils.py new file mode 100644 index 00000000..787893c2 --- /dev/null +++ b/src/tests/test_utils.py @@ -0,0 +1,1142 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for utils.py +""" +import json +from pathlib import Path + +import pkg_resources +import pytest +from ensembl.database import UnitTestDB +from google.protobuf import json_format + +from ensembl.production.metadata.grpc import ensembl_metadata_pb2, utils + +distribution = pkg_resources.get_distribution("ensembl-metadata-api") +sample_path = Path(distribution.location) / "ensembl" / "production" / "metadata" / "api" / "sample" + + +@pytest.mark.parametrize("multi_dbs", [[{"src": sample_path / "ensembl_metadata"}, + {"src": sample_path / "ncbi_taxonomy"}]], + indirect=True) +class TestUtils: + dbc = None # type: UnitTestDB + + @pytest.mark.parametrize( + "taxon_id, expected_output", + [ + # e-coli + ( + 562, + [ + "Bacillus coli", "Bacterium coli", "Bacterium coli commune", + "E. coli", "Enterococcus coli", "Escherichia/Shigella coli" + ] + ), + # wheat + ( + 4565, + [ + 'Canadian hard winter wheat', 'Triticum aestivum subsp. 
aestivum', + 'Triticum vulgare', 'bread wheat', 'common wheat', 'wheat' + ] + ), + # human + (9606, ["human"]), + # non-existent + (100, []), + ] + ) + def test_get_alternative_names(self, genome_db_conn, taxon_id, expected_output): + output = utils.get_alternative_names(genome_db_conn, taxon_id) + assert output == expected_output + + def test_get_assembly_information(self, genome_db_conn): + output = json_format.MessageToJson( + utils.get_assembly_information(genome_db_conn, "eeaaa2bf-151c-4848-8b85-a05a9993101e")) + expected_output = { + "accession": "GCA_000001405.28", + "assemblyUuid": "eeaaa2bf-151c-4848-8b85-a05a9993101e", + # "chromosomal": 1, + "length": "71251", + "level": "chromosome", + "name": "GRCh38.p13", + "sequenceLocation": "SO:0000738" + } + assert json.loads(output) == expected_output + + def test_get_genomes_from_assembly_accession_iterator(self, genome_db_conn): + output = [ + json.loads(json_format.MessageToJson(response)) for response in + utils.get_genomes_from_assembly_accession_iterator( + db_conn=genome_db_conn, assembly_accession="GCA_000005845.2", release_version=None + ) + ] + + expected_output = [ + { + "assembly": { + "accession": "GCA_000005845.2", + "assemblyUuid": "f78618ef-1075-47ee-a496-be26cad47912", + "ensemblName": "ASM584v2", + "level": "chromosome", + "name": "ASM584v2" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:32:14", + "genomeUuid": "a73351f7-93e7-11ec-a39d-005056b38ce3", + "organism": { + "commonName": "Escherichia coli str. K-12 substr. MG1655 str. K12 (GCA_000005845)", + "ensemblName": "Escherichia_coli_str_k_12_substr_mg1655_gca_000005845", + "organismUuid": "21279e3e-e651-43e1-a6fc-79e390b9e8a8", + "scientificName": "Escherichia coli str. K-12 substr. MG1655 str. 
K12 (GCA_000005845)", + "scientificParlanceName": "escherichia_coli_str_k_12_substr_mg1655_gca_000005845", + "speciesTaxonomyId": 562, + "taxonomyId": 511145 + }, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "scientificName": "Escherichia coli str. K-12 substr. MG1655 str. K12 (GCA_000005845)", + "taxonomyId": 511145 + } + } + ] + assert output == expected_output + + @pytest.mark.parametrize( + "assembly_accession, release_version", + [ + # null + (None, None), + # no matches + ("asdfasdfadf", None), + ] + ) + def test_get_genomes_from_assembly_accession_iterator_null(self, genome_db_conn, assembly_accession, + release_version): + output = [ + json.loads(json_format.MessageToJson(response)) for response in + utils.get_genomes_from_assembly_accession_iterator( + db_conn=genome_db_conn, assembly_accession=assembly_accession, release_version=release_version + ) + ] + assert output == [] + + # TODO: Ask Daniel / Investigate why organism_group_member test table is not populated + # def test_get_sub_species_info(self, genome_db_conn): + # output = json_format.MessageToJson( + # utils.get_sub_species_info( + # db_conn=genome_db_conn, + # organism_uuid="21279e3e-e651-43e1-a6fc-79e390b9e8a8", + # group="EnsemblBacteria" + # ) + # ) + # print(f"output ===> {output}") + # expected_output = { + # "organismUuid": "21279e3e-e651-43e1-a6fc-79e390b9e8a8", + # "speciesName": ["EnsemblBacteria"], + # "speciesType": ["Division"]} + # assert json.loads(output) == expected_output + # + # output2 = json_format.MessageToJson(utils.get_sub_species_info(genome_db_conn, "s0m3-r4nd0m-0rg4n1sm-uu1d")) + # expected_output2 = {} + # assert json.loads(output2) == expected_output2 + + def test_get_top_level_statistics(self, genome_db_conn): + # Triticum aestivum + output = 
json_format.MessageToJson( + utils.get_top_level_statistics( + db_conn=genome_db_conn, + group="EnsemblPlants", + organism_uuid="d64c34ca-b37a-476b-83b5-f21d07a3ae67", + ) + ) + output = json.loads(output) + first_genome_stats = output["statsByGenomeUuid"][0]["statistics"] + assert len(first_genome_stats) == 51 + assert first_genome_stats[0] == { + 'label': 'Average CDS length', + 'name': 'average_cds_length', + 'statisticType': 'bp', + 'statisticValue': '1332.42' + } + assert first_genome_stats[1] == { + 'label': 'Average coding exons per transcript', + 'name': 'average_coding_exons_per_coding_transcript', + 'statisticType': 'float', + 'statisticValue': '5.34' + } + #assert first_genome_stats[1] == { + # 'label': 'Average exon length per coding gene', + # 'name': 'average_coding_exon_length', + # 'statisticType': 'bp', + # 'statisticValue': '249.47' + #} + + def test_get_top_level_statistics_by_uuid(self, genome_db_conn): + output = json_format.MessageToJson( + utils.get_top_level_statistics_by_uuid( + genome_db_conn, "a73357ab-93e7-11ec-a39d-005056b38ce3" + ) + ) + output = json.loads(output) + assert len(output["statistics"]) == 51 + assert output["statistics"][0] == { + 'label': 'Average CDS length', + 'name': 'average_cds_length', + 'statisticType': 'bp', + 'statisticValue': '1332.42' + } + assert output["statistics"][2] == { + 'label': 'Average exon length per coding gene', + 'name': 'average_coding_exon_length', + 'statisticType': 'bp', + 'statisticValue': '249.47' + } + #assert output["statistics"][2] == { + # 'label': 'Average coding exons per transcript', + # 'name': 'average_coding_exons_per_coding_transcript', + # 'statisticType': 'float', + # 'statisticValue': '5.34' + #} + + def test_get_datasets_list_by_uuid(self, genome_db_conn): + # the expected_output is too long and duplicated + # because of the returned attributes + # TODO: Fix this later + output = json_format.MessageToJson( + utils.get_datasets_list_by_uuid(genome_db_conn, 
"a73357ab-93e7-11ec-a39d-005056b38ce3", 108.0)) + + expected_output = { + "genomeUuid": "a73357ab-93e7-11ec-a39d-005056b38ce3", + "datasets": { + "evidence": { + "datasetInfos": [ + { + "datasetUuid": "64a66f22-07a9-476e-9816-785e2ccb9c30", + "datasetName": "evidence", + "datasetLabel": "Manual Add", + "version": 108.0 + }, + { + "datasetUuid": "64a66f22-07a9-476e-9816-785e2ccb9c30", + "datasetName": "evidence", + "datasetLabel": "Manual Add", + "version": 108.0 + }, + { + "datasetUuid": "64a66f22-07a9-476e-9816-785e2ccb9c30", + "datasetName": "evidence", + "datasetLabel": "Manual Add", + "version": 108.0 + }, + { + "datasetUuid": "64a66f22-07a9-476e-9816-785e2ccb9c30", + "datasetName": "evidence", + "datasetLabel": "Manual Add", + "version": 108.0 + } + ] + }, + "assembly": { + "datasetInfos": [ + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + "version": 108.0 + }, + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + "version": 108.0 + }, + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + "version": 108.0 + }, + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + "version": 108.0 + }, + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + "version": 108.0 + }, + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + "version": 108.0 + }, + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + "version": 108.0 + }, + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + 
"version": 108.0 + }, + { + "datasetUuid": "b4ff55e3-d06a-4772-bb13-81c3207669e3", + "datasetName": "assembly", + "datasetLabel": "GCA_900519105.1", + "version": 108.0 + } + ] + }, + "homologies": { + "datasetInfos": [ + { + "datasetUuid": "e67ca09d-2e7b-4135-a990-6a2d1bca7285", + "datasetName": "homologies", + "datasetLabel": "Manual Add", + "version": 108.0 + }, + { + "datasetUuid": "e67ca09d-2e7b-4135-a990-6a2d1bca7285", + "datasetName": "homologies", + "datasetLabel": "Manual Add", + "version": 108.0 + } + ] + }, + "genebuild": { + "datasetInfos": [ + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": 
"genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + 
"version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + }, + { + "datasetUuid": "0dc05c6e-2910-4dbd-879a-719ba97d5824", + "datasetName": "genebuild", + "datasetLabel": "2018-04-IWGSC", + "version": 108.0 + } + ] + }, + "variation": { + "datasetInfos": [ + { + "datasetUuid": "4d411e2d-676e-4fe0-b0d7-65a9e33fd47f", + "datasetName": "variation", + "datasetLabel": "Manual Add", + "version": 108.0 + }, + { + "datasetUuid": "4d411e2d-676e-4fe0-b0d7-65a9e33fd47f", + "datasetName": "variation", + "datasetLabel": "Manual Add", + "version": 
108.0 + } + ] + } + } + } + assert json.loads(output) == expected_output + + def test_get_datasets_list_by_uuid_no_results(self, genome_db_conn): + output = json_format.MessageToJson( + utils.get_datasets_list_by_uuid(genome_db_conn, "some-random-uuid-f00-b4r", 103.0)) + output = json.loads(output) + expected_output = {} + assert output == expected_output + + def test_get_dataset_by_genome_and_dataset_type(self, genome_db_conn): + output = json_format.MessageToJson( + utils.get_dataset_by_genome_and_dataset_type(genome_db_conn, "a7335667-93e7-11ec-a39d-005056b38ce3", + "assembly") + ) + output = json.loads(output) + assert output == { + 'datasetInfos': [{ + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'assembly.date', + 'type': 'string', + 'value': '2013-12', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'assembly.level', + 'type': 'string', + 'value': 'chromosome', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'chromosomes', + 'type': 'integer', + 'value': '25', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'component_sequences', + 'type': 'integer', + 'value': '36734', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'contig_n50', + 'type': 'bp', + 'value': '56413054', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'gc_percentage', + 'type': 'percent', + 'value': '38.87', + 'version': 108.0 + }, + { + 
'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'spanned_gaps', + 'type': 'integer', + 'value': '661', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'toplevel_sequences', + 'type': 'integer', + 'value': '640', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'total_coding_sequence_length', + 'type': 'bp', + 'value': '34459298', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'total_gap_length', + 'type': 'bp', + 'value': '161368351', + 'version': 108.0 + }, + { + 'datasetLabel': 'GCA_000001405.28', + 'datasetName': 'assembly', + 'datasetUuid': '559d7660-d92d-47e1-924e-e741151c2cef', + 'name': 'total_genome_length', + 'type': 'bp', + 'value': '3272116950', + 'version': 108.0 + }], + 'datasetType': 'assembly', + 'genomeUuid': 'a7335667-93e7-11ec-a39d-005056b38ce3' + } + + def test_get_dataset_by_genome_id_no_results(self, genome_db_conn): + output = json_format.MessageToJson( + utils.get_dataset_by_genome_and_dataset_type(genome_db_conn, "a7335667-93e7-11ec-a39d-005056b38ce3", + "blah blah blah")) + output = json.loads(output) + assert output == {} + + @pytest.mark.parametrize( + "ensembl_name, assembly_name, use_default, expected_output", + [ + ("homo_sapiens", "GRCh38.p13", False, {"genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3"}), + ("homo_sapiens", "GRCh38.p13", True, {}), + ("homo_sapiens", "GRCh38", True, {"genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3"}), + ("random_ensembl_name", "GRCh38", False, {"genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3"}), + ("random_ensembl_name", "random_assembly_name", True, {}), + 
("random_ensembl_name", "random_assembly_name", False, {}), + ] + ) + def test_get_genome_uuid(self, genome_db_conn, ensembl_name, assembly_name, use_default, expected_output): + output = json_format.MessageToJson( + utils.get_genome_uuid( + db_conn=genome_db_conn, + ensembl_name=ensembl_name, + assembly_name=assembly_name, + use_default=use_default + )) + assert json.loads(output) == expected_output + + def test_get_genome_by_uuid(self, genome_db_conn): + output = json_format.MessageToJson( + utils.get_genome_by_uuid( + db_conn=genome_db_conn, + genome_uuid="a73357ab-93e7-11ec-a39d-005056b38ce3", + release_version=108.0 + )) + expected_output = { + "assembly": { + "accession": "GCA_900519105.1", + "ensemblName": "IWGSC", + "assemblyUuid": "ec1c4b53-c2ef-431c-ad0e-b2aef19b44f1", + "level": "chromosome", + "name": "IWGSC" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:32:36", + "genomeUuid": "a73357ab-93e7-11ec-a39d-005056b38ce3", + "organism": { + "commonName": "Triticum aestivum", + "ensemblName": "Triticum_aestivum", + "organismUuid": "d64c34ca-b37a-476b-83b5-f21d07a3ae67", + "scientificName": "Triticum aestivum", + "scientificParlanceName": "triticum_aestivum", + "speciesTaxonomyId": 4565, + "taxonomyId": 4565, + "strain": "reference (Chinese spring)" + }, + "relatedAssembliesCount": 1, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "alternativeNames": [ + "Canadian hard winter wheat", + "Triticum aestivum subsp. 
aestivum", + "Triticum vulgare", + "bread wheat", + "common wheat", + "wheat" + ], + "scientificName": "Triticum aestivum", + "strain": "reference (Chinese spring)", + "taxonomyId": 4565 + } + } + assert json.loads(output) == expected_output + + def test_genome_by_uuid_release_version_unspecified(self, genome_db_conn): + output = json_format.MessageToJson( + utils.get_genome_by_uuid( + db_conn=genome_db_conn, + genome_uuid="a73357ab-93e7-11ec-a39d-005056b38ce3", + release_version=None + )) + expected_output = { + "assembly": { + "accession": "GCA_900519105.1", + "ensemblName": "IWGSC", + "assemblyUuid": "ec1c4b53-c2ef-431c-ad0e-b2aef19b44f1", + "level": "chromosome", + "name": "IWGSC" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:32:36", + "genomeUuid": "a73357ab-93e7-11ec-a39d-005056b38ce3", + "organism": { + "commonName": "Triticum aestivum", + "ensemblName": "Triticum_aestivum", + "organismUuid": "d64c34ca-b37a-476b-83b5-f21d07a3ae67", + "scientificName": "Triticum aestivum", + "scientificParlanceName": "triticum_aestivum", + "speciesTaxonomyId": 4565, + "taxonomyId": 4565, + "strain": "reference (Chinese spring)" + }, + "relatedAssembliesCount": 1, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "alternativeNames": [ + "Canadian hard winter wheat", + "Triticum aestivum subsp. 
aestivum", + "Triticum vulgare", + "bread wheat", + "common wheat", + "wheat" + ], + "scientificName": "Triticum aestivum", + "strain": "reference (Chinese spring)", + "taxonomyId": 4565 + } + } + assert json.loads(output) == expected_output + + def test_get_genomes_by_uuid_null(self, genome_db_conn): + output = utils.get_genome_by_uuid(genome_db_conn, None, 0) + assert output == ensembl_metadata_pb2.Genome() + + def test_get_genomes_by_keyword(self, genome_db_conn): + output = [json.loads(json_format.MessageToJson(response)) for response in + utils.get_genomes_by_keyword_iterator(genome_db_conn, "Human", 108.0)] + expected_output = [ + { + "assembly": { + "accession": "GCA_000001405.28", + "ensemblName": "GRCh38.p13", + "assemblyUuid": "eeaaa2bf-151c-4848-8b85-a05a9993101e", + "level": "chromosome", + "isReference": True, + "name": "GRCh38.p13", + "ucscName": "hg38", + "urlName": "GRCh38" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:30:58", + "genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3", + "organism": { + "commonName": "Human", + "ensemblName": "Homo_sapiens", + "organismUuid": "db2a5f09-2db8-429b-a407-c15a4ca2876d", + "scientificName": "Homo sapiens", + "speciesTaxonomyId": 9606, + "taxonomyId": 9606, + "scientificParlanceName": "homo_sapiens" + }, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "scientificName": "Homo sapiens", + "taxonomyId": 9606 + }, + }, + { + "assembly": { + "accession": "GCA_000001405.14", + "ensemblName": "GRCh37.p13", + "assemblyUuid": "633034c3-2268-40a2-866a-9f492cac84bf", + "level": "chromosome", + "name": "GRCh37.p13", + "ucscName": "hg19", + "urlName": "GRCh37" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:32:06", + "genomeUuid": "3704ceb1-948d-11ec-a39d-005056b38ce3", + "organism": { + 
"commonName": "Human", + "ensemblName": "Homo_sapiens", + "organismUuid": "db2a5f09-2db8-429b-a407-c15a4ca2876d", + "scientificName": "Homo sapiens", + "speciesTaxonomyId": 9606, + "taxonomyId": 9606, + "scientificParlanceName": "homo_sapiens" + }, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "scientificName": "Homo sapiens", + "taxonomyId": 9606 + } + } + ] + assert output == expected_output + + def test_get_genomes_by_keyword_release_unspecified(self, genome_db_conn): + output = [json.loads(json_format.MessageToJson(response)) for response in + utils.get_genomes_by_keyword_iterator(genome_db_conn, "Homo Sapiens", 0.0)] + # TODO: DRY the expected_output + expected_output = [ + { + "assembly": { + "accession": "GCA_000001405.28", + "ensemblName": "GRCh38.p13", + "assemblyUuid": "eeaaa2bf-151c-4848-8b85-a05a9993101e", + "level": "chromosome", + "isReference": True, + "name": "GRCh38.p13", + "ucscName": "hg38", + "urlName": "GRCh38" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:30:58", + "genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3", + "organism": { + "commonName": "Human", + "ensemblName": "Homo_sapiens", + "organismUuid": "db2a5f09-2db8-429b-a407-c15a4ca2876d", + "scientificName": "Homo sapiens", + "speciesTaxonomyId": 9606, + "taxonomyId": 9606, + "scientificParlanceName": "homo_sapiens" + }, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "scientificName": "Homo sapiens", + "taxonomyId": 9606 + }, + }, + { + "assembly": { + "accession": "GCA_000001405.14", + "ensemblName": "GRCh37.p13", + "assemblyUuid": 
"633034c3-2268-40a2-866a-9f492cac84bf", + "level": "chromosome", + "name": "GRCh37.p13", + "ucscName": "hg19", + "urlName": "GRCh37" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:32:06", + "genomeUuid": "3704ceb1-948d-11ec-a39d-005056b38ce3", + "organism": { + "commonName": "Human", + "ensemblName": "Homo_sapiens", + "organismUuid": "db2a5f09-2db8-429b-a407-c15a4ca2876d", + "scientificName": "Homo sapiens", + "speciesTaxonomyId": 9606, + "taxonomyId": 9606, + "scientificParlanceName": "homo_sapiens" + }, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "scientificName": "Homo sapiens", + "taxonomyId": 9606 + } + } + ] + assert output == expected_output + + def test_get_genomes_by_keyword_null(self, genome_db_conn): + output = list( + utils.get_genomes_by_keyword_iterator(genome_db_conn, None, 0)) + assert output == [] + + def test_get_genomes_by_keyword_no_matches(self, genome_db_conn): + output = list( + utils.get_genomes_by_keyword_iterator(genome_db_conn, "bigfoot", + 1)) + assert output == [] + + def test_get_genomes_by_name(self, genome_db_conn): + output = json_format.MessageToJson(utils.get_genome_by_name( + db_conn=genome_db_conn, + site_name="Ensembl", + ensembl_name="Triticum_aestivum", + release_version=108.0 + )) + expected_output = { + "assembly": { + "accession": "GCA_900519105.1", + "ensemblName": "IWGSC", + "assemblyUuid": "ec1c4b53-c2ef-431c-ad0e-b2aef19b44f1", + "level": "chromosome", + "name": "IWGSC" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:32:36", + "genomeUuid": "a73357ab-93e7-11ec-a39d-005056b38ce3", + "organism": { + "commonName": "Triticum aestivum", + "ensemblName": "Triticum_aestivum", + "organismUuid": "d64c34ca-b37a-476b-83b5-f21d07a3ae67", + "scientificName": "Triticum aestivum", + 
"scientificParlanceName": "triticum_aestivum", + "speciesTaxonomyId": 4565, + "taxonomyId": 4565, + "strain": "reference (Chinese spring)" + }, + "relatedAssembliesCount": 1, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "alternativeNames": [ + "Canadian hard winter wheat", + "Triticum aestivum subsp. aestivum", + "Triticum vulgare", + "bread wheat", + "common wheat", + "wheat" + ], + "scientificName": "Triticum aestivum", + "strain": "reference (Chinese spring)", + "taxonomyId": 4565 + } + } + assert json.loads(output) == expected_output + + def test_get_genomes_by_name_release_unspecified(self, genome_db_conn): + # We are expecting the same result as test_get_genomes_by_name() above + # because no release is specified get_genome_by_name() -> fetch_genomes + # checks if the fetched genome is released and picks it up + output = json_format.MessageToJson(utils.get_genome_by_name( + db_conn=genome_db_conn, + site_name="Ensembl", + ensembl_name="Triticum_aestivum", + release_version=None + )) + expected_output = { + "assembly": { + "accession": "GCA_900519105.1", + "ensemblName": "IWGSC", + "assemblyUuid": "ec1c4b53-c2ef-431c-ad0e-b2aef19b44f1", + "level": "chromosome", + "name": "IWGSC" + }, + "attributesInfo": {}, + "created": "2023-05-12 13:32:36", + "genomeUuid": "a73357ab-93e7-11ec-a39d-005056b38ce3", + "organism": { + "commonName": "Triticum aestivum", + "ensemblName": "Triticum_aestivum", + "organismUuid": "d64c34ca-b37a-476b-83b5-f21d07a3ae67", + "scientificName": "Triticum aestivum", + "scientificParlanceName": "triticum_aestivum", + "speciesTaxonomyId": 4565, + "taxonomyId": 4565, + "strain": "reference (Chinese spring)" + }, + "relatedAssembliesCount": 1, + "release": { + "isCurrent": True, + "releaseDate": "2023-05-15", + "releaseLabel": "Beta 
Release 1", + "releaseVersion": 108.0, + "siteLabel": "Ensembl Genome Browser", + "siteName": "Ensembl", + "siteUri": "https://beta.ensembl.org" + }, + "taxon": { + "alternativeNames": [ + "Canadian hard winter wheat", + "Triticum aestivum subsp. aestivum", + "Triticum vulgare", + "bread wheat", + "common wheat", + "wheat" + ], + "scientificName": "Triticum aestivum", + "strain": "reference (Chinese spring)", + "taxonomyId": 4565 + } + } + assert json.loads(output) == expected_output + + def test_get_organisms_group_count(self, genome_db_conn): + output = json_format.MessageToJson( + utils.get_organisms_group_count( + db_conn=genome_db_conn, + release_version=None + ) + ) + expected_output = { + "organismsGroupCount": [ + { + "speciesTaxonomyId": 9606, + "ensemblName": "Homo_sapiens", + "commonName": "Human", + "scientificName": "Homo sapiens", + "order": 1, + "count": 3 + } + ] + } + # make sure it returns 6 organisms + json_output = json.loads(output) + assert len(json_output['organismsGroupCount']) == 6 + # and pick up the first element to check if it matches the expected output + # I picked up only the first element for the sake of shortening the code + assert json_output['organismsGroupCount'][0] == expected_output['organismsGroupCount'][0] + + @pytest.mark.parametrize( + "genome_tag, expected_output", + [ + # url_name = GRCh38 => homo_sapien 38 + ("GRCh38", {"genomeUuid": "a7335667-93e7-11ec-a39d-005056b38ce3"}), + # Null + ("iDontExist", {}), + ] + ) + def test_get_genome_uuid_by_tag(self, genome_db_conn, genome_tag, expected_output): + output = json_format.MessageToJson( + utils.get_genome_uuid_by_tag( + db_conn=genome_db_conn, + genome_tag=genome_tag, + )) + assert json.loads(output) == expected_output