diff --git a/requirements.in b/requirements.in index af1aca55..25419910 100644 --- a/requirements.in +++ b/requirements.in @@ -2,3 +2,4 @@ mysqlclient pymysql sqlalchemy types-pymysql +ensembl-py \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f233661a..2c2dbcfb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ sqlalchemy==1.4.21 # via -r requirements.in types-pymysql==1.0.0 # via -r requirements.in +git+https://github.com/Ensembl/ensembl-py.git@main \ No newline at end of file diff --git a/setup.py b/setup.py index e39fc5d5..fa46d1fe 100644 --- a/setup.py +++ b/setup.py @@ -41,5 +41,8 @@ 'Programming Language :: Python', 'Programming Language :: Python :: 3', "Programming Language :: Python :: 3.8", + ], + install_requires=[ + 'ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git@main', ] ) diff --git a/src/ensembl/production/metadata/api.py b/src/ensembl/production/metadata/api.py index 55c4e5fe..68bfbee4 100644 --- a/src/ensembl/production/metadata/api.py +++ b/src/ensembl/production/metadata/api.py @@ -12,26 +12,34 @@ import sqlalchemy as db from sqlalchemy.orm import Session import pymysql - from ensembl.production.metadata.config import MetadataConfig - +import src.ensembl.production.metadata.models pymysql.install_as_MySQLdb() config = MetadataConfig() -def load_database(uri): - try: - engine = db.create_engine(uri) - except AttributeError as err: - raise ValueError(f'Could not connect to database {uri}: {err}.') from err - try: - connection = engine.connect() - except db.exc.OperationalError as err: - raise ValueError(f'Could not connect to database {uri}: {err}.') from err - connection.close() - return engine +# TODO: replace load_database with the DBConnection interface from ensembl-py (database/dbconnection.py). +# Remove the commented-out function below once the tests are acceptable. 
+#def load_database(uri): +# try: +# engine = db.create_engine(uri) +# except AttributeError as err: +# raise ValueError(f"Could not connect to database {uri}: {err}.") from err +# +# try: +# connection = engine.connect() +# except db.exc.OperationalError as err: +# raise ValueError(f"Could not connect to database {uri}: {err}.") from err +# +# connection.close() +# return engine + + + + +# NOTE(review): the purpose of the code below is unclear; it is kept until we are certain it can be removed safely. def check_parameter(param): @@ -52,45 +60,50 @@ def __init__(self, metadata_uri=None): class ReleaseAdaptor(BaseAdaptor): - def fetch_releases(self, - release_id=None, - release_version=None, - current_only=True, - release_type=None, - site_name=None - ): + def fetch_releases( + self, + release_id=None, + release_version=None, + current_only=True, + release_type=None, + site_name=None, + ): release_id = check_parameter(release_id) release_version = check_parameter(release_version) release_type = check_parameter(release_type) site_name = check_parameter(site_name) # Reflect existing tables, letting sqlalchemy load linked tables where possible. 
- release = db.Table('ensembl_release', self.md, autoload_with=self.metadata_db) - site = self.md.tables['ensembl_site'] + release = db.Table("ensembl_release", self.md, autoload_with=self.metadata_db) + site = self.md.tables["ensembl_site"] release_select = db.select( - release.c.release_id, - release.c.version.label('release_version'), - db.cast(release.c.release_date, db.String), - release.c.label.label('release_label'), - release.c.is_current, - release.c.release_type, - site.c.name.label('site_name'), - site.c.label.label('site_label'), - site.c.uri.label('site_uri') - ).select_from(release) + release.c.release_id, + release.c.version.label("release_version"), + db.cast(release.c.release_date, db.String), + release.c.label.label("release_label"), + release.c.is_current, + release.c.release_type, + site.c.name.label("site_name"), + site.c.label.label("site_label"), + site.c.uri.label("site_uri"), + ).select_from(release) # These options are in order of decreasing specificity, # and thus the ones later in the list can be redundant. 
if release_id is not None: release_select = release_select.filter(release.c.release_id.in_(release_id)) elif release_version is not None: - release_select = release_select.filter(release.c.version.in_(release_version)) + release_select = release_select.filter( + release.c.version.in_(release_version) + ) elif current_only: release_select = release_select.filter_by(is_current=1) if release_type is not None: - release_select = release_select.filter(release.c.release_type.in_(release_type)) + release_select = release_select.filter( + release.c.release_type.in_(release_type) + ) release_select = release_select.join(site) if site_name is not None: @@ -99,30 +112,40 @@ def fetch_releases(self, return self.metadata_db_session.execute(release_select).all() def fetch_releases_for_genome(self, genome_uuid, site_name=None): - genome = db.Table('genome', self.md, autoload_with=self.metadata_db) - genome_release = db.Table('genome_release', self.md, autoload_with=self.metadata_db) + genome = db.Table("genome", self.md, autoload_with=self.metadata_db) + genome_release = db.Table( + "genome_release", self.md, autoload_with=self.metadata_db + ) - release_id_select = db.select( - genome_release.c.release_id - ).select_from(genome).filter_by( - genome_uuid=genome_uuid - ).join(genome_release) + release_id_select = ( + db.select(genome_release.c.release_id) + .select_from(genome) + .filter_by(genome_uuid=genome_uuid) + .join(genome_release) + ) - release_ids = [rid for (rid,) in self.metadata_db_session.execute(release_id_select)] + release_ids = [ + rid for (rid,) in self.metadata_db_session.execute(release_id_select) + ] return self.fetch_releases(release_id=release_ids, site_name=site_name) def fetch_releases_for_dataset(self, dataset_uuid, site_name=None): - dataset = db.Table('dataset', self.md, autoload_with=self.metadata_db) - genome_dataset = db.Table('genome_dataset', self.md, autoload_with=self.metadata_db) + dataset = db.Table("dataset", self.md, 
autoload_with=self.metadata_db) + genome_dataset = db.Table( + "genome_dataset", self.md, autoload_with=self.metadata_db + ) - release_id_select = db.select( - genome_dataset.c.release_id - ).select_from(dataset).filter_by( - dataset_uuid=dataset_uuid - ).join(genome_dataset) + release_id_select = ( + db.select(genome_dataset.c.release_id) + .select_from(dataset) + .filter_by(dataset_uuid=dataset_uuid) + .join(genome_dataset) + ) - release_ids = [rid for (rid,) in self.metadata_db_session.execute(release_id_select)] + release_ids = [ + rid for (rid,) in self.metadata_db_session.execute(release_id_select) + ] return self.fetch_releases(release_id=release_ids, site_name=site_name) @@ -144,69 +167,73 @@ def __init__(self, metadata_uri=None, taxonomy_uri=None): self.taxon_names = self.fetch_taxonomy_names(taxonomy_ids) def fetch_taxonomy_ids(self): - organism = db.Table('organism', self.md, autoload_with=self.metadata_db) + organism = db.Table("organism", self.md, autoload_with=self.metadata_db) taxonomy_id_select = db.select(organism.c.taxonomy_id.distinct()) taxonomy_ids = [tid for (tid,) in self.metadata_db.execute(taxonomy_id_select)] return taxonomy_ids def fetch_taxonomy_names(self, taxonomy_id): - ncbi_taxa_name = db.Table('ncbi_taxa_name', self.md, autoload_with=self.taxonomy_db) + ncbi_taxa_name = db.Table( + "ncbi_taxa_name", self.md, autoload_with=self.taxonomy_db + ) taxons = {} for tid in taxonomy_id: - names = { - 'scientific_name': None, - 'synonym': [] - } + names = {"scientific_name": None, "synonym": []} taxons[tid] = names sci_name_select = db.select( - ncbi_taxa_name.c.taxon_id, - ncbi_taxa_name.c.name + ncbi_taxa_name.c.taxon_id, ncbi_taxa_name.c.name ).filter( ncbi_taxa_name.c.taxon_id.in_(taxonomy_id), - ncbi_taxa_name.c.name_class == 'scientific name' + ncbi_taxa_name.c.name_class == "scientific name", ) for x in self.taxonomy_db.execute(sci_name_select): - taxons[x.taxon_id]['scientific_name'] = x.name + taxons[x.taxon_id]["scientific_name"] = 
x.name synonym_class = [ - 'common name', - 'equivalent name', - 'genbank common name', - 'genbank synonym', - 'synonym' + "common name", + "equivalent name", + "genbank common name", + "genbank synonym", + "synonym", ] synonyms_select = db.select( - ncbi_taxa_name.c.taxon_id, - ncbi_taxa_name.c.name + ncbi_taxa_name.c.taxon_id, ncbi_taxa_name.c.name ).filter( ncbi_taxa_name.c.taxon_id.in_(taxonomy_id), - ncbi_taxa_name.c.name_class.in_(synonym_class) + ncbi_taxa_name.c.name_class.in_(synonym_class), ) for x in self.taxonomy_db.execute(synonyms_select): - taxons[x.taxon_id]['synonym'].append(x.name) + taxons[x.taxon_id]["synonym"].append(x.name) return taxons - def fetch_genomes(self, - genome_id=None, genome_uuid=None, - assembly_accession=None, - ensembl_name=None, taxonomy_id=None, - unreleased_only=False, - site_name=None, release_type=None, release_version=None, current_only=True - ): + def fetch_genomes( + self, + genome_id=None, + genome_uuid=None, + assembly_accession=None, + ensembl_name=None, + taxonomy_id=None, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): genome_id = check_parameter(genome_id) genome_uuid = check_parameter(genome_uuid) assembly_accession = check_parameter(assembly_accession) ensembl_name = check_parameter(ensembl_name) taxonomy_id = check_parameter(taxonomy_id) - genome = db.Table('genome', self.md, autoload_with=self.metadata_db) - assembly = self.md.tables['assembly'] - organism = self.md.tables['organism'] + genome = db.Table("genome", self.md, autoload_with=self.metadata_db) + assembly = self.md.tables["assembly"] + organism = self.md.tables["organism"] - genome_select = db.select( + genome_select = ( + db.select( genome.c.genome_id, genome.c.genome_uuid, organism.c.ensembl_name, @@ -214,35 +241,51 @@ def fetch_genomes(self, organism.c.display_name, organism.c.strain, organism.c.taxonomy_id, - assembly.c.accession.label('assembly_accession'), - 
assembly.c.name.label('assembly_name'), - assembly.c.ucsc_name.label('assembly_ucsc_name'), - assembly.c.level.label('assembly_level') - ).select_from(genome).join(assembly).join(organism) + assembly.c.accession.label("assembly_accession"), + assembly.c.name.label("assembly_name"), + assembly.c.ucsc_name.label("assembly_ucsc_name"), + assembly.c.level.label("assembly_level"), + ) + .select_from(genome) + .join(assembly) + .join(organism) + ) if unreleased_only: - genome_release = db.Table('genome_release', self.md, autoload_with=self.metadata_db) + genome_release = db.Table( + "genome_release", self.md, autoload_with=self.metadata_db + ) - genome_select = genome_select.outerjoin(genome_release).filter_by(genome_id=None) + genome_select = genome_select.outerjoin(genome_release).filter_by( + genome_id=None + ) elif site_name is not None: - genome_release = db.Table('genome_release', self.md, autoload_with=self.metadata_db) - release = self.md.tables['ensembl_release'] - site = self.md.tables['ensembl_site'] - - genome_select = genome_select.join( - genome_release).join( - release).join( - site).filter_by(name=site_name) + genome_release = db.Table( + "genome_release", self.md, autoload_with=self.metadata_db + ) + release = self.md.tables["ensembl_release"] + site = self.md.tables["ensembl_site"] + + genome_select = ( + genome_select.join(genome_release) + .join(release) + .join(site) + .filter_by(name=site_name) + ) if release_type is not None: - genome_select = genome_select.filter(release.c.release_type == release_type) + genome_select = genome_select.filter( + release.c.release_type == release_type + ) if current_only: genome_select = genome_select.filter(genome_release.c.is_current == 1) if release_version is not None: - genome_select = genome_select.filter(release.c.version <= release_version) + genome_select = genome_select.filter( + release.c.version <= release_version + ) # These options are in order of decreasing specificity, # and thus the ones later in the 
list can be redundant. @@ -251,11 +294,17 @@ def fetch_genomes(self, elif genome_uuid is not None: genome_select = genome_select.filter(genome.c.genome_uuid.in_(genome_uuid)) elif assembly_accession is not None: - genome_select = genome_select.filter(assembly.c.accession.in_(assembly_accession)) + genome_select = genome_select.filter( + assembly.c.accession.in_(assembly_accession) + ) elif ensembl_name is not None: - genome_select = genome_select.filter(organism.c.ensembl_name.in_(ensembl_name)) + genome_select = genome_select.filter( + organism.c.ensembl_name.in_(ensembl_name) + ) elif taxonomy_id is not None: - genome_select = genome_select.filter(organism.c.taxonomy_id.in_(taxonomy_id)) + genome_select = genome_select.filter( + organism.c.taxonomy_id.in_(taxonomy_id) + ) for result in self.metadata_db_session.execute(genome_select): taxon_names = self.taxon_names[result.taxonomy_id] @@ -263,128 +312,174 @@ def fetch_genomes(self, result_dict.update(taxon_names) yield result_dict - def fetch_genomes_by_genome_uuid(self, - genome_uuid, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - return self.fetch_genomes(genome_uuid=genome_uuid, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_assembly_accession(self, - assembly_accession, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - return self.fetch_genomes(assembly_accession=assembly_accession, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_ensembl_name(self, - ensembl_name, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - return self.fetch_genomes(ensembl_name=ensembl_name, - 
unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_taxonomy_id(self, - taxonomy_id, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - return self.fetch_genomes(taxonomy_id=taxonomy_id, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_scientific_name(self, - scientific_name, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): - taxonomy_ids = [t_id for t_id in self.taxon_names - if self.taxon_names[t_id]['scientific_name'] == scientific_name] - - return self.fetch_genomes_by_taxonomy_id(taxonomy_ids, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_genomes_by_synonym(self, - synonym, - unreleased_only=False, - site_name=None, release_type=None, - release_version=None, current_only=True - ): + def fetch_genomes_by_genome_uuid( + self, + genome_uuid, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + return self.fetch_genomes( + genome_uuid=genome_uuid, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_assembly_accession( + self, + assembly_accession, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + return self.fetch_genomes( + assembly_accession=assembly_accession, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def 
fetch_genomes_by_ensembl_name( + self, + ensembl_name, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + return self.fetch_genomes( + ensembl_name=ensembl_name, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_taxonomy_id( + self, + taxonomy_id, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + return self.fetch_genomes( + taxonomy_id=taxonomy_id, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_scientific_name( + self, + scientific_name, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): + taxonomy_ids = [ + t_id + for t_id in self.taxon_names + if self.taxon_names[t_id]["scientific_name"] == scientific_name + ] + + return self.fetch_genomes_by_taxonomy_id( + taxonomy_ids, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_genomes_by_synonym( + self, + synonym, + unreleased_only=False, + site_name=None, + release_type=None, + release_version=None, + current_only=True, + ): taxonomy_ids = [] for taxon_id in self.taxon_names: - if synonym.casefold() in [x.casefold() for x in self.taxon_names[taxon_id]['synonym']]: + if synonym.casefold() in [ + x.casefold() for x in self.taxon_names[taxon_id]["synonym"] + ]: taxonomy_ids.append(taxon_id) - return self.fetch_genomes_by_taxonomy_id(taxonomy_ids, - unreleased_only=unreleased_only, - site_name=site_name, - release_type=release_type, - release_version=release_version, - current_only=current_only) - - def fetch_sequences(self, - genome_id=None, 
genome_uuid=None, - assembly_accession=None, - chromosomal_only=False - ): + return self.fetch_genomes_by_taxonomy_id( + taxonomy_ids, + unreleased_only=unreleased_only, + site_name=site_name, + release_type=release_type, + release_version=release_version, + current_only=current_only, + ) + + def fetch_sequences( + self, + genome_id=None, + genome_uuid=None, + assembly_accession=None, + chromosomal_only=False, + ): genome_id = check_parameter(genome_id) genome_uuid = check_parameter(genome_uuid) assembly_accession = check_parameter(assembly_accession) - assembly = db.Table('assembly', self.md, autoload_with=self.metadata_db) - assembly_sequence = db.Table('assembly_sequence', self.md, autoload_with=self.metadata_db) + assembly = db.Table("assembly", self.md, autoload_with=self.metadata_db) + assembly_sequence = db.Table( + "assembly_sequence", self.md, autoload_with=self.metadata_db + ) - seq_select = db.select( + seq_select = ( + db.select( assembly_sequence.c.accession, assembly_sequence.c.name, assembly_sequence.c.sequence_location, assembly_sequence.c.length, assembly_sequence.c.chromosomal, assembly_sequence.c.sequence_checksum, - assembly_sequence.c.ga4gh_identifier - ).select_from( - assembly).join( - assembly_sequence, assembly.c.assembly_id == assembly_sequence.c.assembly_id) + assembly_sequence.c.ga4gh_identifier, + ) + .select_from(assembly) + .join( + assembly_sequence, + assembly.c.assembly_id == assembly_sequence.c.assembly_id, + ) + ) if chromosomal_only: seq_select = seq_select.filter_by(chromosomal=1) # These options are in order of decreasing specificity, # and thus the ones later in the list can be redundant. 
if genome_id is not None: - genome = db.Table('genome', self.md, autoload_with=self.metadata_db) - seq_select = seq_select.join( - genome).filter(genome.c.genome_id.in_(genome_id)) + genome = db.Table("genome", self.md, autoload_with=self.metadata_db) + seq_select = seq_select.join(genome).filter( + genome.c.genome_id.in_(genome_id) + ) elif genome_uuid is not None: - genome = db.Table('genome', self.md, autoload_with=self.metadata_db) - seq_select = seq_select.join( - genome).filter(genome.c.genome_uuid.in_(genome_uuid)) + genome = db.Table("genome", self.md, autoload_with=self.metadata_db) + seq_select = seq_select.join(genome).filter( + genome.c.genome_uuid.in_(genome_uuid) + ) elif assembly_accession is not None: seq_select = seq_select.filter(assembly.c.accession.in_(assembly_accession)) @@ -392,9 +487,13 @@ def fetch_sequences(self, yield dict(result) def fetch_sequences_by_genome_uuid(self, genome_uuid, chromosomal_only=False): - return self.fetch_sequences(genome_uuid=genome_uuid, - chromosomal_only=chromosomal_only) + return self.fetch_sequences( + genome_uuid=genome_uuid, chromosomal_only=chromosomal_only + ) - def fetch_sequences_by_assembly_accession(self, assembly_accession, chromosomal_only=False): - return self.fetch_sequences(assembly_accession=assembly_accession, - chromosomal_only=chromosomal_only) + def fetch_sequences_by_assembly_accession( + self, assembly_accession, chromosomal_only=False + ): + return self.fetch_sequences( + assembly_accession=assembly_accession, chromosomal_only=chromosomal_only + ) diff --git a/src/ensembl/production/metadata/models.py b/src/ensembl/production/metadata/models.py new file mode 100644 index 00000000..7b7f83f2 --- /dev/null +++ b/src/ensembl/production/metadata/models.py @@ -0,0 +1,223 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from sqlalchemy import Column, DECIMAL, Date, DateTime, ForeignKey, Index, Integer, String +from sqlalchemy.dialects.mysql import DATETIME, TINYINT +from sqlalchemy.orm import relationship, sessionmaker, backref +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy import create_engine, MetaData, inspect + + +Base = declarative_base() +metadata = Base.metadata + + +class Assembly(Base): + __tablename__ = 'assembly' + + assembly_id = Column(Integer, primary_key=True) + ucsc_name = Column(String(16)) + accession = Column(String(16), nullable=False, unique=True) + level = Column(String(32), nullable=False) + name = Column(String(128), nullable=False) + accession_body = Column(String(32)) + assembly_default = Column(String(32)) + tolid = Column(String(32), unique=True) + created = Column(DateTime) + ensembl_name = Column(String(255), unique=True) + + +class AssemblySequence(Base): + __tablename__ = 'assembly_sequence' + __table_args__ = ( + Index('assembly_sequence_assembly_id_accession_5f3e5119_uniq', 'assembly_id', 'accession', unique=True), + ) + + assembly_sequence_id = Column(Integer, primary_key=True) + name = Column(String(128)) + assembly_id = Column(ForeignKey('assembly.assembly_id'), nullable=False, index=True) + accession = Column(String(32), nullable=False) + chromosomal = Column(TINYINT(1), nullable=False) + length = Column(Integer, nullable=False) + sequence_location = Column(String(10)) + 
sequence_checksum = Column(String(32)) + ga4gh_identifier = Column(String(32)) + assembly = relationship('Assembly', backref="assembly") + + +class Attribute(Base): + __tablename__ = 'attribute' + + attribute_id = Column(Integer, primary_key=True) + name = Column(String(128), nullable=False) + label = Column(String(128), nullable=False) + description = Column(String(255)) + + +class Dataset(Base): + __tablename__ = 'dataset' + + dataset_id = Column(Integer, primary_key=True) + dataset_uuid = Column(String(128), nullable=False, unique=True) + dataset_type_id = Column(ForeignKey('dataset_type.dataset_type_id'), nullable=False, index=True) + name = Column(String(128), nullable=False) + version = Column(String(128)) + created = Column(DATETIME(fsp=6), nullable=False) + dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True) + label = Column(String(128), nullable=False) + + dataset_source = relationship('DatasetSource', backref="dataset") + dataset_type = relationship('DatasetType', backref="dataset") + + +class DatasetAttribute(Base): + __tablename__ = 'dataset_attribute' + __table_args__ = ( + Index('dataset_attribute_dataset_id_attribute_id__d3b34d8c_uniq', 'dataset_id', 'attribute_id', 'type', 'value', unique=True), + ) + + dataset_attribute_id = Column(Integer, primary_key=True) + type = Column(String(32), nullable=False) + value = Column(String(128), nullable=False) + attribute_id = Column(ForeignKey('attribute.attribute_id'), nullable=False, index=True) + dataset_id = Column(ForeignKey('dataset.dataset_id'), nullable=False, index=True) + + attribute = relationship('Attribute', backref="dataset_attribute") + dataset = relationship('Dataset', backref="dataset_attribute") + + +class DatasetSource(Base): + __tablename__ = 'dataset_source' + + dataset_source_id = Column(Integer, primary_key=True) + type = Column(String(32), nullable=False) + name = Column(String(255), nullable=False, unique=True) + + +class 
DatasetType(Base): + __tablename__ = 'dataset_type' + + dataset_type_id = Column(Integer, primary_key=True) + name = Column(String(32), nullable=False) + label = Column(String(128), nullable=False) + topic = Column(String(32), nullable=False) + description = Column(String(255)) + details_uri = Column(String(255)) + + +class EnsemblSite(Base): + __tablename__ = 'ensembl_site' + + site_id = Column(Integer, primary_key=True) + name = Column(String(64), nullable=False) + label = Column(String(64), nullable=False) + uri = Column(String(64), nullable=False) + + +class EnsemblRelease(Base): + __tablename__ = 'ensembl_release' + __table_args__ = ( + Index('ensembl_release_version_site_id_b743399a_uniq', 'version', 'site_id', unique=True), + ) + + release_id = Column(Integer, primary_key=True) + version = Column(DECIMAL(10, 1), nullable=False) + release_date = Column(Date, nullable=False) + label = Column(String(64)) + is_current = Column(TINYINT(1), nullable=False) + site_id = Column(ForeignKey('ensembl_site.site_id'), index=True) + release_type = Column(String(16), nullable=False) + + site = relationship('EnsemblSite', backref='ensembl_release') + + +class Genome(Base): + __tablename__ = 'genome' + + genome_id = Column(Integer, primary_key=True) + genome_uuid = Column(String(128), nullable=False, unique=True) + assembly_id = Column(ForeignKey('assembly.assembly_id'), nullable=False, index=True) + organism_id = Column(ForeignKey('organism.organism_id'), nullable=False, index=True) + created = Column(DATETIME(fsp=6), nullable=False) + + assembly = relationship('Assembly', backref="genome") + organism = relationship('Organism', backref="genome") + + +class GenomeDataset(Base): + __tablename__ = 'genome_dataset' + + genome_dataset_id = Column(Integer, primary_key=True) + dataset_id = Column(ForeignKey('dataset.dataset_id'), nullable=False, index=True) + genome_id = Column(ForeignKey('genome.genome_id'), nullable=False, index=True) + release_id = 
Column(ForeignKey('ensembl_release.release_id'), nullable=False, index=True) + is_current = Column(TINYINT(1), nullable=False) + + dataset = relationship('Dataset', backref="genome_dataset") + genome = relationship('Genome', backref="genome_dataset") + release = relationship('EnsemblRelease', backref="genome_dataset") + + +class GenomeRelease(Base): + __tablename__ = 'genome_release' + + genome_release_id = Column(Integer, primary_key=True) + genome_id = Column(ForeignKey('genome.genome_id'), nullable=False, index=True) + release_id = Column(ForeignKey('ensembl_release.release_id'), nullable=False, index=True) + is_current = Column(TINYINT(1), nullable=False) + + genome = relationship('Genome', backref='genome_release') + release = relationship('EnsemblRelease', backref='genome_release') + + +class Organism(Base): + __tablename__ = 'organism' + + organism_id = Column(Integer, primary_key=True) + taxonomy_id = Column(Integer, nullable=False) + species_taxonomy_id = Column(Integer) + display_name = Column(String(128), nullable=False) + strain = Column(String(128)) + scientific_name = Column(String(128)) + url_name = Column(String(128), nullable=False) + ensembl_name = Column(String(128), nullable=False, unique=True) + scientific_parlance_name = Column(String(255)) + + +class OrganismGroup(Base): + __tablename__ = 'organism_group' + __table_args__ = ( + Index('group_type_name_63c2f6ac_uniq', 'type', 'name', unique=True), + ) + + organism_group_id = Column(Integer, primary_key=True) + type = Column(String(32), nullable=False) + name = Column(String(255), nullable=False) + code = Column(String(48), unique=True) + + +class OrganismGroupMember(Base): + __tablename__ = 'organism_group_member' + __table_args__ = ( + Index('organism_group_member_organism_id_organism_gro_fe8f49ac_uniq', 'organism_id', 'organism_group_id', unique=True), + ) + + organism_group_member_id = Column(Integer, primary_key=True) + is_reference = Column(TINYINT(1), nullable=False) + organism_id = 
Column(ForeignKey('organism.organism_id'), nullable=False) + organism_group_id = Column(ForeignKey('organism_group.organism_group_id'), nullable=False, index=True) + + organism_group = relationship('OrganismGroup', backref='organism_group_member') + organism = relationship('Organism', backref='organism_group_member') diff --git a/tests/test_api.py b/tests/test_api.py index ad2db065..bc37303e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -13,9 +13,11 @@ Unit tests for api module """ -from ensembl.production.metadata.api import load_database - def test_load_database(): """Test api.load_database function""" + #load_database('XXX') pass + + +