diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..db0033b7 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +ensembl_metadata_api diff --git a/.travis.yml b/.travis.yml index 66cb7350..b1177099 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,13 +3,15 @@ os: linux python: - "3.8" + - "3.9" env: - TESTENV=test -install: +before_script: - pip install -r requirements-test.txt - pip install . + - export PYTHONPATH=$PYTHONPATH:$PWD/src script: - if [[ "$TESTENV" == "test" ]]; then coverage run -m pytest; fi diff --git a/requirements.in b/requirements.in index af1aca55..ebd9816c 100644 --- a/requirements.in +++ b/requirements.in @@ -2,3 +2,5 @@ mysqlclient pymysql sqlalchemy types-pymysql +git+https://github.com/Ensembl/ensembl-py.git#egg=ensembl-py +mysql \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f233661a..5d140a62 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,66 @@ # -# This file is autogenerated by pip-compile with python 3.8 +# This file is autogenerated by pip-compile with python 3.10 # To update, run: # # pip-compile --output-file=requirements.txt requirements.in # +attrs==22.1.0 + # via pytest +certifi==2022.9.24 + # via requests +charset-normalizer==2.1.1 + # via requests +ensembl-hive @ git+https://github.com/Ensembl/ensembl-hive.git@main + # via ensembl-py +ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git + # via -r requirements.in greenlet==1.1.0 # via sqlalchemy -mysqlclient==2.0.3 +idna==3.4 + # via requests +iniconfig==1.1.1 + # via pytest +mysql==0.0.3 # via -r requirements.in +mysqlclient==2.0.3 + # via + # -r requirements.in + # ensembl-py + # mysql +packaging==21.3 + # via pytest +pluggy==1.0.0 + # via pytest +py==1.11.0 + # via pytest pymysql==1.0.2 # via -r requirements.in +pyparsing==3.0.9 + # via packaging +pytest==7.1.3 + # via + # ensembl-py + # pytest-dependency +pytest-dependency==0.5.1 + # via ensembl-py +python-dotenv==0.19.2 + # via 
ensembl-py +pyyaml==6.0 + # via ensembl-py +requests==2.28.1 + # via ensembl-py +six==1.16.0 + # via sqlalchemy-utils sqlalchemy==1.4.21 - # via -r requirements.in + # via + # -r requirements.in + # ensembl-py + # sqlalchemy-utils +sqlalchemy-utils==0.37.9 + # via ensembl-py +tomli==2.0.1 + # via pytest types-pymysql==1.0.0 # via -r requirements.in +urllib3==1.26.12 + # via requests diff --git a/setup.py b/setup.py index e39fc5d5..957b67cc 100644 --- a/setup.py +++ b/setup.py @@ -41,5 +41,8 @@ 'Programming Language :: Python', 'Programming Language :: Python :: 3', "Programming Language :: Python :: 3.8", + ], + install_requires=[ + 'ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git', ] ) diff --git a/src/ensembl/production/metadata/api.py b/src/ensembl/production/metadata/api.py index 55c4e5fe..1598d8a3 100644 --- a/src/ensembl/production/metadata/api.py +++ b/src/ensembl/production/metadata/api.py @@ -10,28 +10,35 @@ # See the License for the specific language governing permissions and # limitations under the License. import sqlalchemy as db -from sqlalchemy.orm import Session +from sqlalchemy import select +from sqlalchemy.orm import Session, sessionmaker import pymysql +from ensembl.production.metadata.config import get_metadata_uri, get_taxonomy_uri +from ensembl.database.dbconnection import DBConnection +from ensembl.production.metadata.models import * -from ensembl.production.metadata.config import MetadataConfig -pymysql.install_as_MySQLdb() -config = MetadataConfig() +#Database ORM connection. 
+class load_database(DBConnection): + """ + Load a database and directly create a session for ORM interaction with the database + """ + def create_session(self, engine): + self._session = Session(engine, future=True) + def __init__(self, url): + super().__init__(url) + self.create_session(self._engine) -def load_database(uri): - try: - engine = db.create_engine(uri) - except AttributeError as err: - raise ValueError(f'Could not connect to database {uri}: {err}.') from err + #Commit any changes to the database and create a new session instance. + def commit(self): + self._session.commit() + self._session.close() + self.create_session(self._engine) - try: - connection = engine.connect() - except db.exc.OperationalError as err: - raise ValueError(f'Could not connect to database {uri}: {err}.') from err - - connection.close() - return engine + #rollback any changes made before commiting the session instance. + def rollback(self): + self._session.rollback() def check_parameter(param): @@ -42,87 +49,100 @@ def check_parameter(param): class BaseAdaptor: def __init__(self, metadata_uri=None): - # This is sqlalchemy's metadata, not Ensembl's! - self.md = db.MetaData() - if metadata_uri is None: - metadata_uri = config.METADATA_URI + metadata_uri = get_metadata_uri() self.metadata_db = load_database(metadata_uri) - self.metadata_db_session = Session(self.metadata_db, future=True) class ReleaseAdaptor(BaseAdaptor): - def fetch_releases(self, - release_id=None, - release_version=None, - current_only=True, - release_type=None, - site_name=None - ): + def fetch_releases( + self, + release_id=None, + release_version=None, + current_only=True, + release_type=None, + site_name=None, + ): release_id = check_parameter(release_id) release_version = check_parameter(release_version) release_type = check_parameter(release_type) site_name = check_parameter(site_name) - # Reflect existing tables, letting sqlalchemy load linked tables where possible. 
- release = db.Table('ensembl_release', self.md, autoload_with=self.metadata_db) - site = self.md.tables['ensembl_site'] release_select = db.select( - release.c.release_id, - release.c.version.label('release_version'), - db.cast(release.c.release_date, db.String), - release.c.label.label('release_label'), - release.c.is_current, - release.c.release_type, - site.c.name.label('site_name'), - site.c.label.label('site_label'), - site.c.uri.label('site_uri') - ).select_from(release) + EnsemblRelease,EnsemblSite + ).join(EnsemblRelease.ensembl_site) - # These options are in order of decreasing specificity, - # and thus the ones later in the list can be redundant. + #WHERE ensembl_release.release_id = :release_id_1 if release_id is not None: - release_select = release_select.filter(release.c.release_id.in_(release_id)) + release_select = release_select.filter( + EnsemblRelease.release_id.in_(release_id) + ) + #WHERE ensembl_release.version = :version_1 elif release_version is not None: - release_select = release_select.filter(release.c.version.in_(release_version)) + release_select = release_select.filter( + EnsemblRelease.version.in_(release_version) + ) + #WHERE ensembl_release.is_current =:is_current_1 elif current_only: - release_select = release_select.filter_by(is_current=1) + release_select = release_select.filter( + EnsemblRelease.is_current == 1 + ) + #WHERE ensembl_release.release_type = :release_type_1 if release_type is not None: - release_select = release_select.filter(release.c.release_type.in_(release_type)) + release_select = release_select.filter( + EnsemblRelease.release_type.in_(release_type) + ) - release_select = release_select.join(site) + #WHERE ensembl_site.name = :name_1 if site_name is not None: - release_select = release_select.filter(site.c.name.in_(site_name)) + release_select = release_select.filter( + EnsemblSite.name.in_(site_name) + ) + return self.metadata_db._session.execute(release_select) - return 
self.metadata_db_session.execute(release_select).all() def fetch_releases_for_genome(self, genome_uuid, site_name=None): - genome = db.Table('genome', self.md, autoload_with=self.metadata_db) - genome_release = db.Table('genome_release', self.md, autoload_with=self.metadata_db) + # SELECT genome_release.release_id + # FROM genome_release + # JOIN genome ON genome.genome_id = genome_release.genome_id + # WHERE genome.genome_uuid =:genome_uuid_1 release_id_select = db.select( - genome_release.c.release_id - ).select_from(genome).filter_by( - genome_uuid=genome_uuid - ).join(genome_release) - - release_ids = [rid for (rid,) in self.metadata_db_session.execute(release_id_select)] + GenomeRelease.release_id + ).filter( + Genome.genome_uuid == genome_uuid + ).join( + GenomeRelease.genome + ) + release_ids = [] + release_objects = self.metadata_db._session.execute(release_id_select) + for rid in release_objects: + release_ids.append(rid[0]) + release_ids = list(dict.fromkeys(release_ids)) return self.fetch_releases(release_id=release_ids, site_name=site_name) def fetch_releases_for_dataset(self, dataset_uuid, site_name=None): - dataset = db.Table('dataset', self.md, autoload_with=self.metadata_db) - genome_dataset = db.Table('genome_dataset', self.md, autoload_with=self.metadata_db) + # SELECT genome_release.release_id + # FROM genome_dataset + # JOIN dataset ON dataset.dataset_id = genome_dataset.dataset_id + # WHERE dataset.dataset_uuid = :dataset_uuid_1 release_id_select = db.select( - genome_dataset.c.release_id - ).select_from(dataset).filter_by( - dataset_uuid=dataset_uuid - ).join(genome_dataset) + GenomeDataset.release_id + ).filter( + Dataset.dataset_uuid == dataset_uuid + ).join( + GenomeDataset.dataset + ) - release_ids = [rid for (rid,) in self.metadata_db_session.execute(release_id_select)] + release_ids = [] + release_objects = self.metadata_db._session.execute(release_id_select) + for rid in release_objects: + release_ids.append(rid[0]) + release_ids = 
list(dict.fromkeys(release_ids)) return self.fetch_releases(release_id=release_ids, site_name=site_name) @@ -144,69 +164,73 @@ def __init__(self, metadata_uri=None, taxonomy_uri=None): self.taxon_names = self.fetch_taxonomy_names(taxonomy_ids) def fetch_taxonomy_ids(self): - organism = db.Table('organism', self.md, autoload_with=self.metadata_db) + organism = db.Table("organism", self.md, autoload_with=self.metadata_db) taxonomy_id_select = db.select(organism.c.taxonomy_id.distinct()) taxonomy_ids = [tid for (tid,) in self.metadata_db.execute(taxonomy_id_select)] return taxonomy_ids def fetch_taxonomy_names(self, taxonomy_id): - ncbi_taxa_name = db.Table('ncbi_taxa_name', self.md, autoload_with=self.taxonomy_db) + ncbi_taxa_name = db.Table( + "ncbi_taxa_name", self.md, autoload_with=self.taxonomy_db + ) taxons = {} for tid in taxonomy_id: - names = { - 'scientific_name': None, - 'synonym': [] - } + names = {"scientific_name": None, "synonym": []} taxons[tid] = names sci_name_select = db.select( - ncbi_taxa_name.c.taxon_id, - ncbi_taxa_name.c.name + ncbi_taxa_name.c.taxon_id, ncbi_taxa_name.c.name ).filter( ncbi_taxa_name.c.taxon_id.in_(taxonomy_id), - ncbi_taxa_name.c.name_class == 'scientific name' + ncbi_taxa_name.c.name_class == "scientific name", ) for x in self.taxonomy_db.execute(sci_name_select): - taxons[x.taxon_id]['scientific_name'] = x.name + taxons[x.taxon_id]["scientific_name"] = x.name synonym_class = [ - 'common name', - 'equivalent name', - 'genbank common name', - 'genbank synonym', - 'synonym' + "common name", + "equivalent name", + "genbank common name", + "genbank synonym", + "synonym", ] synonyms_select = db.select( - ncbi_taxa_name.c.taxon_id, - ncbi_taxa_name.c.name + ncbi_taxa_name.c.taxon_id, ncbi_taxa_name.c.name ).filter( ncbi_taxa_name.c.taxon_id.in_(taxonomy_id), - ncbi_taxa_name.c.name_class.in_(synonym_class) + ncbi_taxa_name.c.name_class.in_(synonym_class), ) for x in self.taxonomy_db.execute(synonyms_select): - 
taxons[x.taxon_id]['synonym'].append(x.name) + taxons[x.taxon_id]["synonym"].append(x.name) return taxons - def fetch_genomes(self, - genome_id=None, genome_uuid=None, - assembly_accession=None, - ensembl_name=None, taxonomy_id=None, - unreleased_only=False, - site_name=None, release_type=None, release_version=None, current_only=True - ): + def fetch_genomes( + self, + genome_id=None, + genome_uuid=None, + assembly_accession=None, + ensembl_name=None, + taxonomy_id=None, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): genome_id = check_parameter(genome_id) genome_uuid = check_parameter(genome_uuid) assembly_accession = check_parameter(assembly_accession) ensembl_name = check_parameter(ensembl_name) taxonomy_id = check_parameter(taxonomy_id) - genome = db.Table('genome', self.md, autoload_with=self.metadata_db) - assembly = self.md.tables['assembly'] - organism = self.md.tables['organism'] + genome = db.Table("genome", self.md, autoload_with=self.metadata_db) + assembly = self.md.tables["assembly"] + organism = self.md.tables["organism"] - genome_select = db.select( + genome_select = ( + db.select( genome.c.genome_id, genome.c.genome_uuid, organism.c.ensembl_name, @@ -214,35 +238,51 @@ def fetch_genomes(self, organism.c.display_name, organism.c.strain, organism.c.taxonomy_id, - assembly.c.accession.label('assembly_accession'), - assembly.c.name.label('assembly_name'), - assembly.c.ucsc_name.label('assembly_ucsc_name'), - assembly.c.level.label('assembly_level') - ).select_from(genome).join(assembly).join(organism) + assembly.c.accession.label("assembly_accession"), + assembly.c.name.label("assembly_name"), + assembly.c.ucsc_name.label("assembly_ucsc_name"), + assembly.c.level.label("assembly_level"), + ) + .select_from(genome) + .join(assembly) + .join(organism) + ) if unreleased_only: - genome_release = db.Table('genome_release', self.md, autoload_with=self.metadata_db) + genome_release = db.Table( + 
"genome_release", self.md, autoload_with=self.metadata_db + ) - genome_select = genome_select.outerjoin(genome_release).filter_by(genome_id=None) + genome_select = genome_select.outerjoin(genome_release).filter_by( + genome_id=None + ) elif site_name is not None: - genome_release = db.Table('genome_release', self.md, autoload_with=self.metadata_db) - release = self.md.tables['ensembl_release'] - site = self.md.tables['ensembl_site'] - - genome_select = genome_select.join( - genome_release).join( - release).join( - site).filter_by(name=site_name) + genome_release = db.Table( + "genome_release", self.md, autoload_with=self.metadata_db + ) + release = self.md.tables["ensembl_release"] + site = self.md.tables["ensembl_site"] + + genome_select = ( + genome_select.join(genome_release) + .join(release) + .join(site) + .filter_by(name=site_name) + ) if release_type is not None: - genome_select = genome_select.filter(release.c.release_type == release_type) + genome_select = genome_select.filter( + release.c.release_type == release_type + ) if current_only: genome_select = genome_select.filter(genome_release.c.is_current == 1) if release_version is not None: - genome_select = genome_select.filter(release.c.version <= release_version) + genome_select = genome_select.filter( + release.c.version <= release_version + ) # These options are in order of decreasing specificity, # and thus the ones later in the list can be redundant. 
@@ -251,11 +291,17 @@ def fetch_genomes(self, elif genome_uuid is not None: genome_select = genome_select.filter(genome.c.genome_uuid.in_(genome_uuid)) elif assembly_accession is not None: - genome_select = genome_select.filter(assembly.c.accession.in_(assembly_accession)) + genome_select = genome_select.filter( + assembly.c.accession.in_(assembly_accession) + ) elif ensembl_name is not None: - genome_select = genome_select.filter(organism.c.ensembl_name.in_(ensembl_name)) + genome_select = genome_select.filter( + organism.c.ensembl_name.in_(ensembl_name) + ) elif taxonomy_id is not None: - genome_select = genome_select.filter(organism.c.taxonomy_id.in_(taxonomy_id)) + genome_select = genome_select.filter( + organism.c.taxonomy_id.in_(taxonomy_id) + ) for result in self.metadata_db_session.execute(genome_select): taxon_names = self.taxon_names[result.taxonomy_id] @@ -263,128 +309,174 @@ def fetch_genomes(self, result_dict.update(taxon_names) yield result_dict - def fetch_genomes_by_genome_uuid(self, - genome_uuid, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - return self.fetch_genomes(genome_uuid=genome_uuid, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_assembly_accession(self, - assembly_accession, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - return self.fetch_genomes(assembly_accession=assembly_accession, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_ensembl_name(self, - ensembl_name, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - return self.fetch_genomes(ensembl_name=ensembl_name, - 
unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_taxonomy_id(self, - taxonomy_id, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - return self.fetch_genomes(taxonomy_id=taxonomy_id, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_scientific_name(self, - scientific_name, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - taxonomy_ids = [t_id for t_id in self.taxon_names - if self.taxon_names[t_id]['scientific_name'] == scientific_name] - - return self.fetch_genomes_by_taxonomy_id(taxonomy_ids, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_synonym(self, - synonym, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): + def fetch_genomes_by_genome_uuid( + self, + genome_uuid, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + return self.fetch_genomes( + genome_uuid=genome_uuid, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_assembly_accession( + self, + assembly_accession, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + return self.fetch_genomes( + assembly_accession=assembly_accession, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def 
fetch_genomes_by_ensembl_name( + self, + ensembl_name, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + return self.fetch_genomes( + ensembl_name=ensembl_name, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_taxonomy_id( + self, + taxonomy_id, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + return self.fetch_genomes( + taxonomy_id=taxonomy_id, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_scientific_name( + self, + scientific_name, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + taxonomy_ids = [ + t_id + for t_id in self.taxon_names + if self.taxon_names[t_id]["scientific_name"] == scientific_name + ] + + return self.fetch_genomes_by_taxonomy_id( + taxonomy_ids, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_synonym( + self, + synonym, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): taxonomy_ids = [] for taxon_id in self.taxon_names: - if synonym.casefold() in [x.casefold() for x in self.taxon_names[taxon_id]['synonym']]: + if synonym.casefold() in [ + x.casefold() for x in self.taxon_names[taxon_id]["synonym"] + ]: taxonomy_ids.append(taxon_id) - return self.fetch_genomes_by_taxonomy_id(taxonomy_ids, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_sequences(self, - genome_id=None, 
genome_uuid=None, - assembly_accession=None, - chromosomal_only=False - ): + return self.fetch_genomes_by_taxonomy_id( + taxonomy_ids, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_sequences( + self, + genome_id=None, + genome_uuid=None, + assembly_accession=None, + chromosomal_only=False, + ): genome_id = check_parameter(genome_id) genome_uuid = check_parameter(genome_uuid) assembly_accession = check_parameter(assembly_accession) - assembly = db.Table('assembly', self.md, autoload_with=self.metadata_db) - assembly_sequence = db.Table('assembly_sequence', self.md, autoload_with=self.metadata_db) + assembly = db.Table("assembly", self.md, autoload_with=self.metadata_db) + assembly_sequence = db.Table( + "assembly_sequence", self.md, autoload_with=self.metadata_db + ) - seq_select = db.select( + seq_select = ( + db.select( assembly_sequence.c.accession, assembly_sequence.c.name, assembly_sequence.c.sequence_location, assembly_sequence.c.length, assembly_sequence.c.chromosomal, assembly_sequence.c.sequence_checksum, - assembly_sequence.c.ga4gh_identifier - ).select_from( - assembly).join( - assembly_sequence, assembly.c.assembly_id == assembly_sequence.c.assembly_id) + assembly_sequence.c.ga4gh_identifier, + ) + .select_from(assembly) + .join( + assembly_sequence, + assembly.c.assembly_id == assembly_sequence.c.assembly_id, + ) + ) if chromosomal_only: seq_select = seq_select.filter_by(chromosomal=1) # These options are in order of decreasing specificity, # and thus the ones later in the list can be redundant. 
if genome_id is not None: - genome = db.Table('genome', self.md, autoload_with=self.metadata_db) - seq_select = seq_select.join( - genome).filter(genome.c.genome_id.in_(genome_id)) + genome = db.Table("genome", self.md, autoload_with=self.metadata_db) + seq_select = seq_select.join(genome).filter( + genome.c.genome_id.in_(genome_id) + ) elif genome_uuid is not None: - genome = db.Table('genome', self.md, autoload_with=self.metadata_db) - seq_select = seq_select.join( - genome).filter(genome.c.genome_uuid.in_(genome_uuid)) + genome = db.Table("genome", self.md, autoload_with=self.metadata_db) + seq_select = seq_select.join(genome).filter( + genome.c.genome_uuid.in_(genome_uuid) + ) elif assembly_accession is not None: seq_select = seq_select.filter(assembly.c.accession.in_(assembly_accession)) @@ -392,9 +484,13 @@ def fetch_sequences(self, yield dict(result) def fetch_sequences_by_genome_uuid(self, genome_uuid, chromosomal_only=False): - return self.fetch_sequences(genome_uuid=genome_uuid, - chromosomal_only=chromosomal_only) + return self.fetch_sequences( + genome_uuid=genome_uuid, chromosomal_only=chromosomal_only + ) - def fetch_sequences_by_assembly_accession(self, assembly_accession, chromosomal_only=False): - return self.fetch_sequences(assembly_accession=assembly_accession, - chromosomal_only=chromosomal_only) + def fetch_sequences_by_assembly_accession( + self, assembly_accession, chromosomal_only=False + ): + return self.fetch_sequences( + assembly_accession=assembly_accession, chromosomal_only=chromosomal_only + ) diff --git a/src/ensembl/production/metadata/config.py b/src/ensembl/production/metadata/config.py index 3d980ddf..ad6cb833 100755 --- a/src/ensembl/production/metadata/config.py +++ b/src/ensembl/production/metadata/config.py @@ -12,6 +12,8 @@ import os -class MetadataConfig: - METADATA_URI = os.environ.get("METADATA_URI", None) - TAXONOMY_URI = os.environ.get("TAXONOMY_URI", None) +def get_metadata_uri(): + return os.environ.get("METADATA_URI", 
None) + +def get_taxonomy_uri(): + return os.environ.get("TAXONOMY_URI", None) diff --git a/src/ensembl/production/metadata/models.py b/src/ensembl/production/metadata/models.py new file mode 100644 index 00000000..336d5e24 --- /dev/null +++ b/src/ensembl/production/metadata/models.py @@ -0,0 +1,293 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from sqlalchemy import Column, DECIMAL, Date, DateTime, ForeignKey, Index, Integer, String +from sqlalchemy.dialects.mysql import DATETIME, TINYINT +from sqlalchemy.orm import relationship, sessionmaker, backref +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy import create_engine, MetaData, inspect + + +Base = declarative_base() +metadata = Base.metadata + + +class Assembly(Base): + __tablename__ = 'assembly' + + assembly_id = Column(Integer, primary_key=True) + ucsc_name = Column(String(16)) + accession = Column(String(16), nullable=False, unique=True) + level = Column(String(32), nullable=False) + name = Column(String(128), nullable=False) + accession_body = Column(String(32)) + assembly_default = Column(String(32)) + tolid = Column(String(32), unique=True) + created = Column(DateTime) + ensembl_name = Column(String(255), unique=True) +#One to many relationships +#assembly_id within assembly_sequence + assembly_sequences = relationship("AssemblySequence", back_populates="assembly") 
+#assembly_id within genome + genomes = relationship("Genome", back_populates="assembly") +#many to one relationships +#none + +class AssemblySequence(Base): + __tablename__ = 'assembly_sequence' + __table_args__ = ( + Index('assembly_sequence_assembly_id_accession_5f3e5119_uniq', 'assembly_id', 'accession', unique=True), + ) + + assembly_sequence_id = Column(Integer, primary_key=True) + name = Column(String(128)) + assembly_id = Column(ForeignKey('assembly.assembly_id'), nullable=False, index=True) + accession = Column(String(32), nullable=False) + chromosomal = Column(TINYINT(1), nullable=False) + length = Column(Integer, nullable=False) + sequence_location = Column(String(10)) + sequence_checksum = Column(String(32)) + ga4gh_identifier = Column(String(32)) + #One to many relationships + #none + #many to one relationships + #assembly_id within assembly + assembly = relationship('Assembly', back_populates="assembly_sequences") + + +class Attribute(Base): + __tablename__ = 'attribute' + + attribute_id = Column(Integer, primary_key=True) + name = Column(String(128), nullable=False) + label = Column(String(128), nullable=False) + description = Column(String(255)) + #One to many relationships + #attribute_id within dataset attribute + dataset_attributes = relationship("DatasetAttribute", back_populates='attribute') + #many to one relationships + #none + +class Dataset(Base): + __tablename__ = 'dataset' + + dataset_id = Column(Integer, primary_key=True) + dataset_uuid = Column(String(128), nullable=False, unique=True) + dataset_type_id = Column(ForeignKey('dataset_type.dataset_type_id'), nullable=False, index=True) + name = Column(String(128), nullable=False) + version = Column(String(128)) + created = Column(DATETIME(fsp=6), nullable=False) + dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) + label = Column(String(128), nullable=False) + + #One to many relationships + #dataset_id to dataset attribute and genome 
dataset + dataset_attributes = relationship("DatasetAttribute", back_populates='dataset') + genome_datasets = relationship("GenomeDataset", back_populates='dataset') + #many to one relationships + #dataset_type_id to dataset_type + dataset_type = relationship('DatasetType', back_populates="datasets") + #dataset_source_id to dataset source + dataset_source = relationship('DatasetSource', back_populates="datasets") + + +class DatasetAttribute(Base): + __tablename__ = 'dataset_attribute' + __table_args__ = ( + Index('dataset_attribute_dataset_id_attribute_id__d3b34d8c_uniq', 'dataset_id', 'attribute_id', 'type', 'value', unique=True), + ) + + dataset_attribute_id = Column(Integer, primary_key=True) + type = Column(String(32), nullable=False) + value = Column(String(128), nullable=False) + attribute_id = Column(ForeignKey('attribute.attribute_id'), nullable=False, index=True) + dataset_id = Column(ForeignKey('dataset.dataset_id'), nullable=False, index=True) + #One to many relationships + #none + #many to one relationships + #attribute_id to attribute + attribute = relationship('Attribute', back_populates="dataset_attributes") + #dataset_id to dataset + dataset = relationship('Dataset', back_populates="dataset_attributes") + + +class DatasetSource(Base): + __tablename__ = 'dataset_source' + + dataset_source_id = Column(Integer, primary_key=True) + type = Column(String(32), nullable=False) + name = Column(String(255), nullable=False, unique=True) + #One to many relationships + #dataset_source_id to dataset + datasets = relationship('Dataset', back_populates='dataset_source') + #many to one relationships + #none + +class DatasetType(Base): + __tablename__ = 'dataset_type' + + dataset_type_id = Column(Integer, primary_key=True) + name = Column(String(32), nullable=False) + label = Column(String(128), nullable=False) + topic = Column(String(32), nullable=False) + description = Column(String(255)) + details_uri = Column(String(255)) + #One to many relationships + 
#dataset_type_id to dataset + datasets = relationship('Dataset', back_populates='dataset_type') + #many to one relationships + #none + +class EnsemblSite(Base): + __tablename__ = 'ensembl_site' + + site_id = Column(Integer, primary_key=True) + name = Column(String(64), nullable=False) + label = Column(String(64), nullable=False) + uri = Column(String(64), nullable=False) + #One to many relationships + #site_id to ensembl_release + ensembl_releases = relationship('EnsemblRelease', back_populates='ensembl_site') + #many to one relationships + #none + +class EnsemblRelease(Base): + __tablename__ = 'ensembl_release' + __table_args__ = ( + Index('ensembl_release_version_site_id_b743399a_uniq', 'version', 'site_id', unique=True), + ) + + release_id = Column(Integer, primary_key=True) + version = Column(DECIMAL(10, 1), nullable=False) + release_date = Column(Date, nullable=False) + label = Column(String(64)) + is_current = Column(TINYINT(1), nullable=False) + site_id = Column(ForeignKey('ensembl_site.site_id'), index=True) + release_type = Column(String(16), nullable=False) + #One to many relationships + #release_id to genome_dataset and genome_release + genome_datasets = relationship('GenomeDataset', back_populates='ensembl_release') + genome_releases = relationship('GenomeRelease', back_populates='ensembl_release') + #many to one relationships + #site_id to ensembl_site + ensembl_site = relationship('EnsemblSite', back_populates='ensembl_releases') + + +class Genome(Base): + __tablename__ = 'genome' + + genome_id = Column(Integer, primary_key=True) + genome_uuid = Column(String(128), nullable=False, unique=True) + assembly_id = Column(ForeignKey('assembly.assembly_id'), nullable=False, index=True) + organism_id = Column(ForeignKey('organism.organism_id'), nullable=False, index=True) + created = Column(DATETIME(fsp=6), nullable=False) + # One to many relationships + # genome_id to genome_dataset and genome_release + genome_datasets = relationship('GenomeDataset', 
back_populates='genome') + genome_releases = relationship('GenomeRelease', back_populates='genome') + # many to one relationships + # assembly_id to assembly + assembly = relationship('Assembly', back_populates="genomes") + # organism_id to organism + organism = relationship('Organism', back_populates="genomes") + + +class GenomeDataset(Base): + __tablename__ = 'genome_dataset' + + genome_dataset_id = Column(Integer, primary_key=True) + dataset_id = Column(ForeignKey('dataset.dataset_id'), nullable=False, index=True) + genome_id = Column(ForeignKey('genome.genome_id'), nullable=False, index=True) + release_id = Column(ForeignKey('ensembl_release.release_id'), nullable=False, index=True) + is_current = Column(TINYINT(1), nullable=False) + #One to many relationships + #none + #many to one relationships + #dataset_id to dataset + dataset = relationship('Dataset', back_populates="genome_datasets") + #genome_id to genome + genome = relationship('Genome', back_populates="genome_datasets") + #release_id to ensembl_release + ensembl_release = relationship('EnsemblRelease', back_populates="genome_datasets") + + +class GenomeRelease(Base): + __tablename__ = 'genome_release' + + genome_release_id = Column(Integer, primary_key=True) + genome_id = Column(ForeignKey('genome.genome_id'), nullable=False, index=True) + release_id = Column(ForeignKey('ensembl_release.release_id'), nullable=False, index=True) + is_current = Column(TINYINT(1), nullable=False) + #One to many relationships + #none + #many to one relationships + #genome_id to genome + genome = relationship('Genome', back_populates='genome_releases') + #release_id to ensembl_release + ensembl_release = relationship('EnsemblRelease', back_populates='genome_releases') + + +class Organism(Base): + __tablename__ = 'organism' + + organism_id = Column(Integer, primary_key=True) + taxonomy_id = Column(Integer, nullable=False) + species_taxonomy_id = Column(Integer) + display_name = Column(String(128), 
nullable=False) + strain = Column(String(128)) + scientific_name = Column(String(128)) + url_name = Column(String(128), nullable=False) + ensembl_name = Column(String(128), nullable=False, unique=True) + scientific_parlance_name = Column(String(255)) + #One to many relationships + #Organism_id to organism_group_member and genome + genomes = relationship('Genome', back_populates='organism') + organism_group_members = relationship('OrganismGroupMember', back_populates='organism') + #many to one relationships + #organism_id and taxonomy_id to taxonomy_node #DIFFERENT DATABASE, so no relationship() is declared here -- NOTE(review): 'organism_id' may be meant as species_taxonomy_id; confirm + +class OrganismGroup(Base): + __tablename__ = 'organism_group' + __table_args__ = ( + Index('group_type_name_63c2f6ac_uniq', 'type', 'name', unique=True), + ) + + organism_group_id = Column(Integer, primary_key=True) + type = Column(String(32), nullable=False) + name = Column(String(255), nullable=False) + code = Column(String(48), unique=True) + #One to many relationships + #Organism_group_id to organism_group_member + organism_group_members = relationship('OrganismGroupMember', back_populates='organism_group') + #many to one relationships + #none + +class OrganismGroupMember(Base): + __tablename__ = 'organism_group_member' + __table_args__ = ( + Index('organism_group_member_organism_id_organism_gro_fe8f49ac_uniq', 'organism_id', 'organism_group_id', unique=True), + ) + + organism_group_member_id = Column(Integer, primary_key=True) + is_reference = Column(TINYINT(1), nullable=False) + organism_id = Column(ForeignKey('organism.organism_id'), nullable=False) + organism_group_id = Column(ForeignKey('organism_group.organism_group_id'), nullable=False, index=True) + #One to many relationships + #none + #many to one relationships + #organism_group_id to organism_group + #organism_id to organism + organism_group = relationship('OrganismGroup', back_populates='organism_group_members') + organism = relationship('Organism', back_populates='organism_group_members') diff --git a/tests/TEST.db 
b/tests/TEST.db new file mode 100644 index 00000000..dfd297ad Binary files /dev/null and b/tests/TEST.db differ diff --git a/tests/test_api.py b/tests/test_api.py index ad2db065..c708bba7 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -12,10 +12,30 @@ """ Unit tests for api module """ +from os.path import dirname +from ensembl.production.metadata.api import * -from ensembl.production.metadata.api import load_database - +DB_NAME = 'sqlite:///' + dirname(__file__) + '/TEST.db' def test_load_database(): - """Test api.load_database function""" - pass + DB_TEST = ReleaseAdaptor(DB_NAME) + assert DB_TEST, "DB should not be empty" + +def test_fetch_releases(): + conn = ReleaseAdaptor(DB_NAME) + TEST = conn.fetch_releases(release_id=1).one() + #Test the one to many connection: the result row also exposes the related EnsemblSite + assert TEST.EnsemblSite.name == '2020-map' + #Test the direct access to the EnsemblRelease columns. + assert TEST.EnsemblRelease.label == '2020 MAP 7 species' + +#Currently there is only one release to test against, so the testing is not comprehensive +def test_fetch_releases_for_genome(): + conn = ReleaseAdaptor(DB_NAME) + TEST = conn.fetch_releases_for_genome('a733574a-93e7-11ec-a39d-005056b38ce3').one() + assert TEST.EnsemblSite.name == '2020-map' + +def test_fetch_releases_for_dataset(): + conn = ReleaseAdaptor(DB_NAME) + TEST = conn.fetch_releases_for_dataset('76ffa505-948d-11ec-a39d-005056b38ce3').one() + assert TEST.EnsemblSite.name == '2020-map' \ No newline at end of file