diff --git a/sql/assembly_sequence.sql b/sql/assembly_sequence.sql index ca25d31f..df2189ee 100644 --- a/sql/assembly_sequence.sql +++ b/sql/assembly_sequence.sql @@ -1,11 +1,11 @@ --- Because the attrib_type and external_db tables are identical +-- Because the attrib_type.txt and external_db tables are identical -- across all dbs, and in sync with the production master copy, -- we can use IDs directly, and avoid some complicated outer -- join statements... -- external_db.external_db_id 50710 = INSDC --- attrib_type.attrib_type_id 6 = toplevel --- attrib_type.attrib_type_id 367 = karyotype_rank --- attrib_type.attrib_type_id 547 = sequence_location +-- attrib_type.txt.attrib_type_id 6 = toplevel +-- attrib_type.txt.attrib_type_id 367 = karyotype_rank +-- attrib_type.txt.attrib_type_id 547 = sequence_location -- Unfortunately, the sequence_location attribute in the core dbs -- isn't set with the values you might expect; it has '*_chromosome' diff --git a/src/ensembl/production/metadata/api/base.py b/src/ensembl/production/metadata/api/base.py index 2d32323a..d5289b5e 100644 --- a/src/ensembl/production/metadata/api/base.py +++ b/src/ensembl/production/metadata/api/base.py @@ -12,12 +12,19 @@ from ensembl.database import DBConnection +##Todo: Add in OrganismAdapator. Subfunction fetches all organism in popular group. and # of genomes from distinct assemblies. 
+#Add in best genome (see doc) +#More functions for related genomes + + class BaseAdaptor: def __init__(self, metadata_uri): self.metadata_db = DBConnection(metadata_uri) def check_parameter(param): + if isinstance(param, tuple): + param = param[0] if param is not None and not isinstance(param, list): param = [param] return param diff --git a/src/ensembl/production/metadata/api/dataset.py b/src/ensembl/production/metadata/api/dataset.py new file mode 100644 index 00000000..bac2f553 --- /dev/null +++ b/src/ensembl/production/metadata/api/dataset.py @@ -0,0 +1,38 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sqlalchemy as db +from sqlalchemy.engine import make_url + +from ensembl.production.metadata.api.base import BaseAdaptor +from ensembl.production.metadata.api.models import GenomeDataset, Dataset +import logging + +logger = logging.getLogger(__name__) + + +class DatasetAdaptor(BaseAdaptor): + def __init__(self, metadata_uri): + super().__init__(metadata_uri) + + def check_release_status(self, dataset_uuid): + with self.metadata_db.session_scope() as session: + # Query to check if a release_id exists for the given genome_uuid + dataset_id = session.query(Dataset.dataset_id).filter(Dataset.dataset_uuid == dataset_uuid).scalar() + if dataset_id is None: + return "UUID not found" + + # Now we check if there exists a genome dataset with the corresponding dataset_id and a non-null release_id + result = session.query( + session.query(GenomeDataset).filter(GenomeDataset.dataset_id == dataset_id, + GenomeDataset.release_id.isnot(None)).exists() + ).scalar() + return result diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index f249b08f..39de969b 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -30,11 +30,11 @@ def __init__(self, metadata_uri, taxonomy_uri=None): def fetch_taxonomy_names(self, taxonomy_ids): + taxonomy_ids = check_parameter(taxonomy_ids) taxons = {} for tid in taxonomy_ids: names = {"scientific_name": None, "synonym": []} taxons[tid] = names - for taxon in taxons: sci_name_select = db.select( NCBITaxaName.name diff --git a/src/ensembl/production/metadata/api/models/assembly.py b/src/ensembl/production/metadata/api/models/assembly.py index c840d47a..503912b1 100644 --- a/src/ensembl/production/metadata/api/models/assembly.py +++ b/src/ensembl/production/metadata/api/models/assembly.py @@ -35,9 +35,9 @@ class Assembly(Base): alt_accession = Column(String(16), nullable=True) # One to many relationships # assembly_id within 
assembly_sequence - assembly_sequences = relationship("AssemblySequence", back_populates="assembly") + assembly_sequences = relationship("AssemblySequence", back_populates="assembly", cascade="all, delete, delete-orphan") # assembly_id within genome - genomes = relationship("Genome", back_populates="assembly") + genomes = relationship("Genome", back_populates="assembly", cascade="all, delete, delete-orphan") class AssemblySequence(Base): diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 4a458b12..cd90dea2 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -47,8 +47,8 @@ class Dataset(Base): # One to many relationships # dataset_id to dataset attribute and genome dataset - dataset_attributes = relationship("DatasetAttribute", back_populates='dataset') - genome_datasets = relationship("GenomeDataset", back_populates='dataset') + dataset_attributes = relationship("DatasetAttribute", back_populates='dataset', cascade="all, delete, delete-orphan") + genome_datasets = relationship("GenomeDataset", back_populates='dataset', cascade="all, delete, delete-orphan") # many to one relationships # dataset_type_id to dataset_type dataset_type = relationship('DatasetType', back_populates="datasets") diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py index cff25617..21060e08 100644 --- a/src/ensembl/production/metadata/api/models/genome.py +++ b/src/ensembl/production/metadata/api/models/genome.py @@ -28,8 +28,8 @@ class Genome(Base): created = Column(DATETIME(fsp=6), nullable=False) # One to many relationships # genome_id to genome_dataset and genome release - genome_datasets = relationship("GenomeDataset", back_populates="genome") - genome_releases = relationship("GenomeRelease", back_populates="genome") + genome_datasets = relationship("GenomeDataset", 
back_populates="genome", cascade="all, delete, delete-orphan") + genome_releases = relationship("GenomeRelease", back_populates="genome", cascade="all, delete, delete-orphan") # many to one relationships # assembly_id to assembly assembly = relationship("Assembly", back_populates="genomes") diff --git a/src/ensembl/production/metadata/api/models/organism.py b/src/ensembl/production/metadata/api/models/organism.py index 9dc4f747..ebcb5bb4 100644 --- a/src/ensembl/production/metadata/api/models/organism.py +++ b/src/ensembl/production/metadata/api/models/organism.py @@ -33,7 +33,7 @@ class Organism(Base): scientific_parlance_name = Column(String(255)) # One to many relationships # Organism_id to organism_group_member and genome - genomes = relationship("Genome", back_populates="organism") + genomes = relationship("Genome", back_populates="organism", cascade="all, delete, delete-orphan") organism_group_members = relationship("OrganismGroupMember", back_populates="organism") # many to one relationships diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt index 35754fda..b50b8bed 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt @@ -56,13 +56,15 @@ 56 homology_coverage Coverage Percent of genome which is homologous to another species percent 57 short_variants Short variants Small-scale genetic variations integer 58 structural_variants Structural variants Large-scale genetic variations integer -59 short_variants_with_phenotype_assertions "Short variants -With phenotype assertions" Short variants with phenotypic evidence integer -60 short_variants_with_publications "Short variants -With publications" Short variants published in literature integer -61 short_variants_frequency_studies "Short variants -Frequency studies" Short variants studied for 
frequency integer -62 structural_variants_with_phenotype_assertions "Structural variants -With phenotype assertions" Structural variants with phenotypic evidence integer +59 short_variants_with_phenotype_assertions "Short variants With phenotype assertions" Short variants with phenotypic evidence integer +60 short_variants_with_publications "Short variants With publications" Short variants published in literature integer +61 short_variants_frequency_studies "Short variants Frequency studies" Short variants studied for frequency integer +62 structural_variants_with_phenotype_assertions "Structural variants With phenotype assertions" Structural variants with phenotypic evidence integer 63 enhancers Enhancers DNA sequences that increase gene expression integer 64 promoters Promoters DNA sequences initiating transcription integer +65 assembly.accession accession accession string +66 assembly.default default default string +67 assembly.name name name string +68 assembly.ucsc_alias ucsc_alias ucsc_alias string +69 genebuild.last_geneset_update last_geneset_update last_geneset_update string +70 genebuild.version version version string \ No newline at end of file diff --git a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt index e33a4660..175bbc0e 100644 --- a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt +++ b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt @@ -321,6 +321,10 @@ 2301119 Rhabditomorpha scientific name 2698737 Sar Burki et al. 
2008 authority 2698737 Sar scientific name +666668 jabberwocky synonym +666668 carol_jabberwocky3 equivalent name +666668 carol_jabberwocky2 equivalent name +666668 carol_jabberwocky scientific name 2698737 SAR supergroup synonym 38820 4478 merged_taxon_id 38820 4727 merged_taxon_id diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index 8e63c7c9..642e1f64 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -13,6 +13,7 @@ from sqlalchemy.engine import make_url from ensembl.core.models import Meta +from ensembl.production.metadata.api.models import DatasetSource from ensembl.database import DBConnection from ensembl.production.metadata.api.models import EnsemblRelease @@ -21,8 +22,7 @@ class BaseMetaUpdater: def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): self.db_uri = db_uri self.db = DBConnection(self.db_uri) - self.species = None - self.db_type = None + self.metadata_db = DBConnection(metadata_uri) # We will add a release later. For now, the release must be specified for it to be used. if release is None: self.listed_release = None @@ -30,8 +30,7 @@ def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): else: self.listed_release = release self.listed_release_is_current = EnsemblRelease.is_current - self.metadata_db = DBConnection(metadata_uri) - self.taxonomy_uri = taxonomy_uri + # Basic API for the meta table in the submission database. 
def get_meta_single_meta_key(self, species_id, parameter): @@ -43,4 +42,29 @@ def get_meta_single_meta_key(self, species_id, parameter): else: return result[0] + def get_meta_list_from_prefix_meta_key(self, species_id, prefix): + with self.db.session_scope() as session: + query = session.query(Meta.meta_key, Meta.meta_value).filter( + Meta.meta_key.like(f'{prefix}%'), + Meta.species_id == species_id + ) + result = query.all() + if not result: + return {} + else: + # Build a dictionary out of the results. + result_dict = {key: value for key, value in result} + return result_dict + def get_or_new_source(self, meta_session, db_uri, db_type): + name = make_url(db_uri).database + dataset_source = meta_session.query(DatasetSource).filter(DatasetSource.name == name).one_or_none() + if dataset_source is None: + dataset_source = DatasetSource( + type=db_type, # core/fungen etc + name=name # dbname + ) + meta_session.add(dataset_source) # Only add a new DatasetSource to the session if it doesn't exist + return dataset_source, "new" + else: + return dataset_source, "existing" diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 18d23df9..919a1f79 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -9,47 +9,51 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import re -import uuid - +from collections import defaultdict import sqlalchemy as db -from ensembl.core.models import Meta, Assembly, CoordSystem, SeqRegionAttrib, SeqRegion, SeqRegionSynonym -from sqlalchemy import select, update, func, and_ -from sqlalchemy.engine import make_url +from ensembl.core.models import Meta, CoordSystem, SeqRegionAttrib, SeqRegion, \ + SeqRegionSynonym, AttribType, ExternalDb +from sqlalchemy import select, func from sqlalchemy.orm import aliased - +from ensembl.database import DBConnection +from sqlalchemy.exc import NoResultFound from ensembl.production.metadata.api.genome import GenomeAdaptor +from ensembl.production.metadata.api.dataset import DatasetAdaptor from ensembl.production.metadata.api.models import * from ensembl.production.metadata.updater.base import BaseMetaUpdater class CoreMetaUpdater(BaseMetaUpdater): - def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): - # Each of these objects represents a table in the database to store data in as either an array or a single object. + def __init__(self, db_uri, metadata_uri, taxonomy_uri): + # Each of these objects represents a table in the database to store data in as an array or a single object. 
self.organism = None + self.division = None self.organism_group_member = None - self.organism_group = None - + self.metadata_uri = metadata_uri + self.taxonomy_uri = taxonomy_uri self.assembly = None self.assembly_sequences = None # array self.assembly_dataset = None + self.assembly_dataset_attributes = None # array self.genome = None self.genome_release = None - self.genome_dataset = None - self.datasets = None # array + self.genebuild_dataset_attributes = None # array + self.genebuild_dataset = None self.dataset_type = None self.dataset_source = None - self.dataset_attribute = None self.attribute = None - super().__init__(db_uri, metadata_uri, taxonomy_uri, release) + super().__init__(db_uri, metadata_uri=self.metadata_uri, taxonomy_uri=self.taxonomy_uri, release=None) self.db_type = 'core' def process_core(self, **kwargs): # Special case for loading a single species from a collection database. Can be removed in a future release sel_species = kwargs.get('species', None) + metadata_uri = kwargs.get('metadata_uri', self.metadata_uri) + taxonomy_uri = kwargs.get('metadata_uri', self.taxonomy_uri) + db_uri = kwargs.get('db_uri', self.db_uri) if sel_species: with self.db.session_scope() as session: multi_species = session.execute( @@ -58,7 +62,7 @@ def process_core(self, **kwargs): ) else: # Normal handling of collections from here - # Handle multispecies databases and run an update for each species + # Handle multi-species databases and run an update for each species with self.db.session_scope() as session: multi_species = session.execute( select(Meta.species_id).filter(Meta.meta_key == "species.production_name").distinct() @@ -66,488 +70,403 @@ def process_core(self, **kwargs): multi_species = [multi_species for multi_species, in multi_species] for species in multi_species: - self.species = species - self.process_species() - - def process_species(self): - - # Each class that is called here extracts unlinked data from the submission database to use in comparisons and 
to - # populate the new values if possible. - self.new_organism() - self.new_genome() - self.new_genome_release() - self.new_assembly() - self.new_assembly_sequence() - self.new_assembly_dataset() - self.new_dataset_source() - self.new_genome_dataset() - self.new_datasets() - - ################# - # Transactions are committed once per program run. - # Failures prevent any commit - ################# - - # Species Check - # Check for new species by checking if ensembl name is already present in the database - if not GenomeAdaptor(metadata_uri=self.metadata_db.url, - taxonomy_uri=self.taxonomy_uri).fetch_genomes_by_ensembl_name( - self.organism.ensembl_name): - # Check if the assembly accesion is already present in the database - new_assembly_acc = self.get_meta_single_meta_key(self.species, "assembly.accession") - with self.metadata_db.session_scope() as session: - if session.query(session.query(Assembly).filter_by(accession=new_assembly_acc).exists()).scalar(): - Exception("Assembly Accession already exists for a different organism. Please do a manual update.") - self.create_organism() - logging.info("Fresh Organism. Adding data to organism, genome, genome_release," - " assembly, assembly_sequence, dataset, dataset source, and genome_dataset tables.") - - # Check to see if it is an updated organism. 
- else: - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - test_organism = session.execute(db.select(Organism).filter( - Organism.ensembl_name == self.organism.ensembl_name)).one_or_none() - self.organism.organism_id = Organism.organism_id - self.organism.scientific_parlance_name = Organism.scientific_parlance_name - - if int(test_organism.Organism.species_taxonomy_id) == int( - self.organism.species_taxonomy_id) and \ - int(test_organism.Organism.taxonomy_id) == int( - self.organism.taxonomy_id) and \ - str(test_organism.Organism.display_name) == str( - self.organism.display_name) and \ - str(test_organism.Organism.scientific_name) == str( - self.organism.scientific_name) and \ - str(test_organism.Organism.url_name) == str( - self.organism.url_name) and \ - str(test_organism.Organism.strain) == str(self.organism.strain): - logging.info("Old Organism with no change. No update to organism table") - ################################################################ - ##### Assembly Check and Update - ################################################################ - with self.metadata_db.session_scope() as session: - assembly_acc = session.execute(db.select(Assembly - ).join(Genome.assembly).join(Genome.organism).filter( - Organism.ensembl_name == self.organism.ensembl_name)).all() - new_assembly_acc = self.get_meta_single_meta_key(self.species, "assembly.accession") - assembly_test = False - for assembly_obj in assembly_acc: - if assembly_obj[0].accession == new_assembly_acc: - assembly_test = True - if assembly_test: - logging.info( - "Old Assembly with no change. No update to Genome, genome_release, assembly, and assembly_sequence tables.") - for dataset in self.datasets: - with self.metadata_db.session_scope() as session: - # Check to see if any already exist: - # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! 
- if dataset.name == "genebuild": - dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - Dataset.version == dataset.version, - Dataset.label == dataset.label).first() - if dataset_test is None: - gb_dataset_type = session.query(DatasetType).filter( - DatasetType.name == "genebuild").first() - dataset.dataset_type = gb_dataset_type - dataset.dataset_source = self.dataset_source - session.add(dataset) - + self.process_species(species, metadata_uri, taxonomy_uri, db_uri) + + def process_species(self, species, metadata_uri, taxonomy_uri, db_uri): + """ + Process an individual species from a core database to update the metadata db. + This method contains the logic for updating the metadata + """ + meta_conn = DBConnection(metadata_uri) + with meta_conn.session_scope() as meta_session: + self.organism, self.division, self.organism_group_member, organism_status = \ + self.get_or_new_organism(species, meta_session, metadata_uri, taxonomy_uri) + self.assembly, self.assembly_dataset, self.assembly_dataset_attributes, self.assembly_sequences, \ + self.dataset_source, assembly_status = self.get_or_new_assembly(species, meta_session, db_uri) + self.genebuild_dataset, self.genebuild_dataset_attributes, \ + genebuild_status = self.new_genebuild(species, meta_session, db_uri, self.dataset_source) + + conn = DatasetAdaptor(metadata_uri=metadata_uri) + genebuild_release_status = conn.check_release_status(self.genebuild_dataset.dataset_uuid) + + if organism_status == "New": + print ("New organism") + # ###############################Checks that dataset and assembly are new ################## + if assembly_status != "New" or genebuild_status != "New": + raise Exception("New organism, but existing assembly accession and/or genebuild version") + ############################################### + # Create genome and populate the database with organism, assembly and dataset + new_genome, assembly_genome_dataset, genebuild_genome_dataset = 
self.new_genome(meta_session, + self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) + + elif assembly_status == "New": + print ("New assembly") + + # ###############################Checks that dataset and update are new ################## + if genebuild_status != "New": + raise Exception("New assembly, but existing genebuild version") + ############################################### + + new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, + self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) + + # Create genome and populate the database with assembly and dataset + elif genebuild_status == "New": + print ("New genebuild") + + # Create genome and populate the database with genebuild dataset + new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, + self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) + else: + # Check if the data has been released: + if genebuild_release_status is True: + raise Exception("Existing Organism, Assembly, and Datasets within a release") else: - logging.info("New Assembly. Updating genome, genome_release," - " assembly, assembly_sequence, dataset, dataset source, and genome_dataset tables.") - self.update_assembly() - ################################################################ - ##### dataset Check and Update - ################################################################ - # Dataset section. More logic will be necessary for additional datasets. Currently only the genebuild is listed here. - + print("Rewrite") + #Need to do a rewrite, so that it only redoes the geneset data. + + # Delete the data from the database and repopulate assembly and genebuild. 
+ genome_dataset = meta_session.query(GenomeDataset).join(Dataset).filter( + Dataset.dataset_uuid == self.assembly_dataset.dataset_uuid).first() + current_genome = meta_session.query(Genome).get(genome_dataset.genome_id) + for d in meta_session.query(Dataset).join(GenomeDataset).filter( + GenomeDataset.genome_id == current_genome.genome_id).filter(Dataset.name == "genebuild"): + meta_session.delete(d) + meta_session.commit() + meta_session.flush() + genebuild_genome_dataset = GenomeDataset( + genome=current_genome, + dataset=self.genebuild_dataset, + is_current=True, + ) + meta_session.add(genebuild_genome_dataset) + + + def new_genome(self, meta_session, organism, assembly, assembly_dataset, genebuild_dataset): + new_genome = Genome( + genome_uuid=str(uuid.uuid4()), + assembly=assembly, + organism=organism, + created=func.now(), + ) + meta_session.add(new_genome) + assembly_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=assembly_dataset, + is_current=True, + ) + meta_session.add(assembly_genome_dataset) + genebuild_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=genebuild_dataset, + is_current=True, + ) + meta_session.add(genebuild_genome_dataset) + return new_genome, assembly_genome_dataset, genebuild_genome_dataset + + def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri): + """ + Get an existing Organism instance or create a new one, depending on the information from the metadata database. + """ + + # Fetch the Ensembl name of the organism from metadata using either 'species.ensembl_name' + # or 'species.production_name' as the key. + ensembl_name = self.get_meta_single_meta_key(species, "species.ensembl_name") + if ensembl_name is None: + ensembl_name = self.get_meta_single_meta_key(species, "species.production_name") + + # Instantiate a new Organism object using data fetched from metadata. 
+ new_organism = Organism( + species_taxonomy_id=self.get_meta_single_meta_key(species, "species.species_taxonomy_id"), + taxonomy_id=self.get_meta_single_meta_key(species, "species.taxonomy_id"), + display_name=self.get_meta_single_meta_key(species, "species.display_name"), + scientific_name=self.get_meta_single_meta_key(species, "species.scientific_name"), + url_name=self.get_meta_single_meta_key(species, "species.url"), + ensembl_name=ensembl_name, + strain=self.get_meta_single_meta_key(species, "species.strain"), + # + ) + # Query the metadata database to find if an Organism with the same Ensembl name already exists. + old_organism = meta_session.query(Organism).filter( + Organism.ensembl_name == new_organism.ensembl_name).one_or_none() + division_name = self.get_meta_single_meta_key(species, "species.division") + division = meta_session.query(OrganismGroup).filter(OrganismGroup.name == division_name).one_or_none() + + # If an existing Organism is found, return it and indicate that it already existed. + if old_organism: + organism_group_member = meta_session.query(OrganismGroupMember).filter( + OrganismGroupMember.organism_id == old_organism.organism_id, + OrganismGroupMember.organism_group_id == division.organism_group_id).one_or_none() + return old_organism, division, organism_group_member, "Existing" + else: + # If no existing Organism is found, conduct additional checks before creating a new one. + + # Check if the new organism's taxonomy ID exists in the taxonomy database. + conn = GenomeAdaptor(metadata_uri=metadata_uri, taxonomy_uri=taxonomy_uri) + try: + conn.fetch_taxonomy_names(taxonomy_ids=new_organism.taxonomy_id) + except NoResultFound: + raise Exception("taxid not found in taxonomy database for scientific name") + + # Check if an Assembly with the same accession already exists in the metadata database. 
+ accession = self.get_meta_single_meta_key(species, "assembly.accession") + assembly_test = meta_session.query(Assembly).filter(Assembly.accession == accession).one_or_none() + if assembly_test is not None: + raise Exception( + "Assembly Accession already exists for a different organism. Please do a manual update.") + # Fetch the division name of the new organism from metadata. + if division_name is None: + Exception("No species.division found in meta table") - else: - self.update_organism() - logging.info("Old Organism with changes. Updating organism table") - - def create_organism(self): - # In this, we are assuming that with a new genome, there will be a new assemblbly. - - with self.metadata_db.session_scope() as session: - # Organism section - # Updating Organism, organism_group_member, and organism_group - self.new_organism_group_and_members(session) - # Add in the new assembly here - # assembly sequence, assembly, genome, genome release. - assembly_test = session.execute(db.select(Assembly).filter( - Assembly.accession == self.assembly.accession)).one_or_none() - if assembly_test is not None: - Exception( - "Error, existing name but, assembly accession already found. 
Please update the Ensembl Name in the Meta field manually") - if self.listed_release is not None: - release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == self.listed_release).first() - self.genome_release.ensembl_release = release - self.genome_release.genome = self.genome - - for assembly_seq in self.assembly_sequences: - assembly_seq.assembly = self.assembly - self.assembly.genomes.append(self.genome) - - self.genome.organism = self.organism - - # Update assembly dataset - # Updates genome_dataset,dataset,dataset_source - dataset_source_test = session.execute( - db.select(DatasetSource).filter(DatasetSource.name == self.dataset_source.name)).one_or_none() - if dataset_source_test is not None: - Exception("Error, data already present in source") - - dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() - if self.listed_release is not None: - self.genome_dataset.ensembl_release = release - self.genome_dataset.genome = self.genome - self.genome_dataset.dataset = self.assembly_dataset - - self.assembly_dataset.dataset_type = dataset_type - self.assembly_dataset.dataset_source = self.dataset_source - - assembly_genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=0, - ) - assembly_genome_dataset.dataset = self.assembly_dataset - self.genome.genome_datasets.append(assembly_genome_dataset) - - # session.add(assembly_genome_dataset) - - # Dataset section. More logic will be necessary for additional datasets. Currently only the genebuild is listed here. - for dataset in self.datasets: - # Check to see if any already exist: - # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! 
- if dataset.name == "genebuild": - dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - Dataset.version == dataset.version, - Dataset.label == dataset.label).first() - if dataset_test is None: - dataset.dataset_type = session.query(DatasetType).filter( - DatasetType.name == "genebuild").first() - dataset.dataset_source = self.dataset_source - temp_genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=0, + # Query the metadata database to find if an OrganismGroup with the same division name already exists. + if division is None: + # If no such OrganismGroup exists, create a new one. + division = OrganismGroup( + type="Division", + name=division_name, ) - temp_genome_dataset.dataset = dataset - self.genome.genome_datasets.append(temp_genome_dataset) - # Add everything to the database. Closing the session commits it. - session.add(self.organism) - - def update_organism(self): - with self.metadata_db.session_scope() as session: - session.execute( - update(Organism).where(Organism.ensembl_name == self.organism.ensembl_name).values( - species_taxonomy_id=self.organism.species_taxonomy_id, - taxonomy_id=self.organism.taxonomy_id, - display_name=self.organism.display_name, - scientific_name=self.organism.scientific_name, - url_name=self.organism.url_name, - ensembl_name=self.organism.ensembl_name, - strain=self.organism.strain, - )) - - # TODO: Add an update to the groups here. 
- - def update_assembly(self): - # Change to new assembly/fresh - with self.metadata_db.session_scope() as session: - # Get the genome - self.organism = session.query(Organism).filter( - Organism.ensembl_name == self.organism.ensembl_name).first() - self.genome.organism = self.organism - - if self.listed_release is not None: - release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == self.listed_release).first() - self.genome_release.ensembl_release = release - self.genome_release.genome = self.genome - - self.assembly.genomes.append(self.genome) - - # Update assembly dataset - # Updates genome_dataset,dataset,dataset_source - dataset_source_test = session.execute( - db.select(DatasetSource).filter(DatasetSource.name == self.dataset_source.name)).one_or_none() - if dataset_source_test is not None: - self.dataset_source = session.query(DatasetSource).filter( - DatasetSource.name == self.dataset_source.name).first() - - dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() - if self.listed_release is not None: - self.genome_dataset.ensembl_release = release - self.genome_dataset.genome = self.genome - self.genome_dataset.dataset = self.assembly_dataset - - self.assembly_dataset.dataset_type = dataset_type - self.assembly_dataset.dataset_source = self.dataset_source - - assembly_genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=0, + meta_session.add(division) + + # Create a new OrganismGroupMember linking the new Organism to the division group. 
+ organism_group_member = OrganismGroupMember( + is_reference=0, + organism=new_organism, + organism_group=division, ) - assembly_genome_dataset.dataset = self.assembly_dataset - self.genome.genome_datasets.append(assembly_genome_dataset) - - for dataset in self.datasets: - # Check to see if any already exist: - # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! - if dataset.name == "genebuild": - dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - Dataset.version == dataset.version, - Dataset.label == dataset.label).first() - dataset.dataset_type = session.query(DatasetType).filter( - DatasetType.name == "genebuild").first() - dataset.dataset_source = self.dataset_source - temp_genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=0, + meta_session.add(new_organism) + meta_session.add(organism_group_member) + # Return the newly created Organism and indicate that it is new. + return new_organism, division, organism_group_member, "New" + + def get_assembly_sequences(self, species, assembly): + """ + Get the assembly sequences and the values that correspond to the metadata table + """ + assembly_sequences = [] + with self.db.session_scope() as session: + # Create an alias for SeqRegionAttrib and AttribType to be used for sequence_location + SeqRegionAttribAlias = aliased(SeqRegionAttrib) + AttribTypeAlias = aliased(AttribType) + + # One complicated query to get all the data. Otherwise, this takes far too long to do. 
+            results = (session.query(SeqRegion.name, SeqRegion.length, CoordSystem.name,
+                                     SeqRegionAttribAlias.value, SeqRegionSynonym.synonym)
+                       .join(SeqRegion.coord_system)
+                       .join(SeqRegion.seq_region_attrib)
+                       .join(SeqRegionAttrib.attrib_type)
+                       .outerjoin(SeqRegion.seq_region_synonym)
+                       .join(SeqRegionAttribAlias, SeqRegion.seq_region_attrib)  # join with SeqRegionAttribAlias
+                       .outerjoin(AttribTypeAlias, SeqRegionAttribAlias.attrib_type)  # join with AttribTypeAlias
+                       .filter(Meta.species_id == species)
+                       .filter(AttribType.code == "toplevel")  # ensure toplevel
+                       .filter(AttribTypeAlias.code == "sequence_location").all())  # ensure sequence_location
+
+            # Create a dictionary so that the results can have multiple synonyms per line and only one SeqRegion
+            accession_info = defaultdict(
+                lambda: {"names": set(), "length": None, "location": None, "chromosomal": None})
+
+            for seq_region_name, seq_region_length, coord_system_name, location, synonym in results:
+                # Skip all sequence lrg sequences.
+                if coord_system_name == "lrg":
+                    continue
+                # Test to see if the seq_name follows accession standards (99% of sequences)
+                if re.match(r'^[a-zA-Z]+\d+\.\d+', seq_region_name):
+                    # If so assign it to accession
+                    accession = seq_region_name
+                    if not synonym:
+                        # If it doesn't have any synonyms the accession is the name.
+                        accession_info[accession]["names"].add(accession)
+                    else:
+                        accession_info[accession]["names"].add(synonym)
+                else:
+                    # For named sequences like chr1
+                    name = seq_region_name
+                    # Guard against synonym being None (seq_region_synonym is outer-joined);
+                    # re.match raises TypeError on None.
+                    if synonym and re.match(r'^[a-zA-Z]+\d+\.\d+', synonym):
+                        accession = synonym
+                        accession_info[accession]["names"].add(name)
+                    else:
+                        accession = name  # In case synonym doesn't match the pattern, use the name as the accession
+                        accession_info[accession]["names"].add(synonym if synonym else name)
+
+                # Save the sequence location, length, and chromosomal flag.
+ location_mapping = { + 'nuclear_chromosome': 'SO:0000738', + 'mitochondrial_chromosome': 'SO:0000737', + 'chloroplast_chromosome': 'SO:0000745', + None: 'SO:0000738', + } + + try: + sequence_location = location_mapping[location] + except KeyError: + raise Exception('Error with sequence location: {} is not a valid type'.format(location)) + + # Test if chromosomal: + if coord_system_name == "chromosome": + chromosomal = 1 + else: + chromosomal = 0 + + # Assign the values to the dictionary + accession_info[accession]["location"] = sequence_location + accession_info[accession]["chromosomal"] = chromosomal + accession_info[accession]["length"] = seq_region_length + + # Now, create AssemblySequence objects for each unique accession. + for accession, info in accession_info.items(): + # Combine all unique names with ";". If a name appears in multiple sequences with the same accession, + name = ";".join(info["names"]) + + # Create an AssemblySequence object. + assembly_sequence = AssemblySequence( + name=name, + assembly=assembly, + accession=accession, + chromosomal=info["chromosomal"], + length=info["length"], + sequence_location=info["location"], + # sequence_checksum="", Not implemented + # ga4gh_identifier="", Not implemented ) - temp_genome_dataset.dataset = dataset - self.genome.genome_datasets.append(temp_genome_dataset) - # Add everything to the database. Closing the session commits it. - session.add(self.genome) - - # The following methods populate the data from the core into the objects. K - # It may be beneficial to move them to the base class with later implementations - def new_organism(self): - # All taken from the meta table except parlance name. 
- self.organism = Organism( - organism_id=None, # Should be autogenerated upon insertion - species_taxonomy_id=self.get_meta_single_meta_key(self.species, "species.species_taxonomy_id"), - taxonomy_id=self.get_meta_single_meta_key(self.species, "species.taxonomy_id"), - display_name=self.get_meta_single_meta_key(self.species, "species.display_name"), - scientific_name=self.get_meta_single_meta_key(self.species, "species.scientific_name"), - url_name=self.get_meta_single_meta_key(self.species, "species.url"), - ensembl_name=self.get_meta_single_meta_key(self.species, "species.production_name"), - strain=self.get_meta_single_meta_key(self.species, "species.strain"), - scientific_parlance_name=None, - ) - if self.organism.species_taxonomy_id is None: - self.organism.species_taxonomy_id = self.organism.taxonomy_id - - def new_organism_group_and_members(self, session): - # This method auto grabs the division name and checks for the strain groups - division_name = self.get_meta_single_meta_key(self.species, "species.division") - if division_name is None: - Exception("No species.dvision found in meta table") - division = session.execute(db.select(OrganismGroup).filter(OrganismGroup.name == division_name)).one_or_none() - if division is None: - group = OrganismGroup( - organism_group_id=None, - type="Division", - name=division_name, - code=None, - ) - else: - group = session.query(OrganismGroup).filter(OrganismGroup.name == division_name).first() - self.organism_group_member = OrganismGroupMember( - organism_group_member_id=None, - is_reference=0, - organism_id=None, - organism_group_id=None, - ) - self.organism_group_member.organism_group = group - self.organism_group_member.organism = self.organism - # Work on the strain level group members - strain = self.get_meta_single_meta_key(self.species, "species.strain") - strain_group = self.get_meta_single_meta_key(self.species, "species.strain_group") - strain_type = self.get_meta_single_meta_key(self.species, "species.type") 
+            assembly_sequences.append(assembly_sequence)
+        return assembly_sequences
+
+    def get_or_new_assembly(self, species, meta_session, db_uri, source=None):
+        # Get the new assembly accession from the core handed over
+        assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession")
+        assembly = meta_session.query(Assembly).filter(Assembly.accession == assembly_accession).one_or_none()
+
+        if assembly is not None:
+            # Get the existing assembly dataset
+            assembly_dataset = meta_session.query(Dataset).filter(Dataset.label == assembly_accession).one_or_none()
+            # I should not need this, but double check on database updating.
+            assembly_dataset_attributes = assembly_dataset.dataset_attributes
+            assembly_sequences = assembly.assembly_sequences
+            # Only create a new source when none was handed in; otherwise reuse the
+            # provided one (previous form inverted this test and returned None).
+            if source is None:
+                dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core")
+            else:
+                dataset_source = source
+
+            return assembly, assembly_dataset, assembly_dataset_attributes, \
+                assembly_sequences, dataset_source, "Existing"
-        if strain is not None:
-            if strain == 'reference':
-                reference = 1
+        else:
+            with self.db.session_scope() as session:
+                # May be problematic. Might be provided by genebuild.
+                level = (session.execute(db.select(CoordSystem.name).filter(
+                    CoordSystem.species_id == species).order_by(CoordSystem.rank)).all())[0][0]
+                assembly = Assembly(
+                    ucsc_name=self.get_meta_single_meta_key(species, "assembly.ucsc_alias"),
+                    accession=self.get_meta_single_meta_key(species, "assembly.accession"),
+                    level=level,
+                    # level=self.get_meta_single_meta_key(self.species, "assembly.level"), #Not yet implemented.
+ name=self.get_meta_single_meta_key(species, "assembly.name"), + accession_body=self.get_meta_single_meta_key(species, "assembly.provider"), + assembly_default=self.get_meta_single_meta_key(species, "assembly.default"), + tol_id=self.get_meta_single_meta_key(species, "assembly.tol_id"), # Not implemented yet + created=func.now(), + ensembl_name=self.get_meta_single_meta_key(species, "assembly.name"), + assembly_uuid=str(uuid.uuid4()), + ) + dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() + if source is None: + dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") else: - reference = 0 - group_member = OrganismGroupMember( - organism_group_member_id=None, - is_reference=reference, - organism_id=None, - organism_group_id=None, + dataset_source = source + + assembly_dataset = Dataset( + dataset_uuid=str(uuid.uuid4()), + dataset_type=dataset_type, # extract from dataset_type + name="assembly", + # version=None, Could be changed. + label=assembly.accession, # Required. Makes for a quick lookup + created=func.now(), + dataset_source=dataset_source, # extract from dataset_source + status='Submitted', ) - # Check for group, if not present make it - division = session.execute( - db.select(OrganismGroup).filter(OrganismGroup.name == strain_group)).one_or_none() - if division is None: - group = OrganismGroup( - organism_group_id=None, - type=strain_type, - name=strain_group, - code=None, + attributes = self.get_meta_list_from_prefix_meta_key(species, "assembly") + assembly_dataset_attributes = [] + for attribute, value in attributes.items(): + meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + if meta_attribute is None: + raise Exception(f"Attribute {attribute} not found. 
Please enter it into the db manually") + dataset_attribute = DatasetAttribute( + value=value, + dataset=assembly_dataset, + attribute=meta_attribute, ) + assembly_dataset_attributes.append(dataset_attribute) + assembly_sequences = self.get_assembly_sequences(species, assembly) + meta_session.add(assembly) + meta_session.add_all(assembly_sequences) + meta_session.add(assembly_dataset) + meta_session.add_all(assembly_dataset_attributes) + return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source, "New" + + def new_genebuild(self, species, meta_session, db_uri, source=None): + """ + Process an individual species from a core database to update the metadata db. + This method contains the logic for updating the metadata + This is not a get, as we don't update the metadata for genebuild, only replace it if it is not released. + """ + # The assembly accession and genebuild version are extracted from the metadata of the species + assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession") + genebuild_version = self.get_meta_single_meta_key(species, "genebuild.version") + + # The genebuild accession is formed by combining the assembly accession and the genebuild version + genebuild_accession = assembly_accession + "_" + genebuild_version + + # Depending on whether a source is provided, it uses the provided source or creates a new source + if source is None: + dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + else: + dataset_source = source - else: - group = session.query(OrganismGroup).filter(OrganismGroup.name == strain_group).first() - group_member.organism_group = group - group_member.organism = self.organism - - def new_genome(self): - # Data for the update function. 
- self.genome = Genome( - genome_id=None, # Should be autogenerated upon insertion - genome_uuid=str(uuid.uuid4()), - assembly_id=None, # Update the assembly before inserting and grab the assembly key - organism_id=None, # Update the organism before inserting and grab the organism_id - created=func.now(), # Replace all of them with sqlalchemy func.now() - ) - - def new_genome_release(self): - # Genome Release - self.genome_release = GenomeRelease( - genome_release_id=None, # Should be autogenerated upon insertion - genome_id=None, # Update the genome before inserting and grab the genome_id - release_id=None, - is_current=self.listed_release_is_current, - ) - - def new_assembly(self): - level = None - with self.db.session_scope() as session: - level = (session.execute(db.select(CoordSystem.name).filter( - CoordSystem.species_id == self.species).order_by(CoordSystem.rank)).all())[0][0] - - self.assembly = Assembly( - assembly_id=None, # Should be autogenerated upon insertion - ucsc_name=self.get_meta_single_meta_key(self.species, "assembly.ucsc_alias"), - accession=self.get_meta_single_meta_key(self.species, "assembly.accession"), - level=level, - name=self.get_meta_single_meta_key(self.species, "assembly.name"), - accession_body=None, # Not implemented yet - assembly_default=self.get_meta_single_meta_key(self.species, "assembly.default"), - created=func.now(), - ensembl_name=self.get_meta_single_meta_key(self.species, "assembly.name"), - ) + # The type of the dataset is set to be "genebuild" + dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first() - def new_assembly_dataset(self): - self.assembly_dataset = Dataset( - dataset_id=None, # Should be autogenerated upon insertion + # A new Dataset instance is created with all necessary properties + genebuild_dataset = Dataset( dataset_uuid=str(uuid.uuid4()), - dataset_type_id=None, # extract from dataset_type - name="assembly", - version=None, + dataset_type=dataset_type, + 
name="genebuild", + version=genebuild_version, + label=genebuild_accession, created=func.now(), - dataset_source_id=None, # extract from dataset_source - label=self.assembly.accession, + dataset_source=dataset_source, status='Submitted', ) - def new_assembly_sequence(self): - self.assembly_sequences = [] - with self.db.session_scope() as session: - # Alias the seq_region_attrib and seq_region_synonym tables - sra1 = aliased(SeqRegionAttrib) - sra3 = aliased(SeqRegionAttrib) - - results = ( - session.query(SeqRegion.name, SeqRegionSynonym.synonym, SeqRegion.length, - CoordSystem.name, - sra3.value, - ) - .join(CoordSystem, SeqRegion.coord_system_id == CoordSystem.coord_system_id) - .join(Meta, CoordSystem.species_id == Meta.species_id) - .join(sra1, SeqRegion.seq_region_id == sra1.seq_region_id) - .outerjoin(SeqRegionSynonym, and_(SeqRegion.seq_region_id == SeqRegionSynonym.seq_region_id, - SeqRegionSynonym.external_db_id == 50710)) - .outerjoin(sra3, and_(SeqRegion.seq_region_id == sra3.seq_region_id, - sra3.attrib_type_id == 547)) - .filter(Meta.meta_key == 'assembly.accession', sra1.attrib_type_id == 6, - Meta.species_id == self.species) - ).all() - for data in results: - - # If the name does not match normal accession formating, then use that name. - name = None - if re.match(r'^[a-zA-Z]+\d+\.\d+', data[0]): - name = None - else: - name = data[0] - # Nab accession from the seq region synonym or else the name. 
- accession = None - if data[1] is not None and re.match(r'^[a-zA-Z]+\d+\.\d+', data[1]): - accession = data[1] - elif name is not None: - accession = name - else: - accession = data[0] - - chromosomal = 0 - if data[3] == 'chromosome': - chromosomal = 1 - - sequence_location = None - if data[4] == 'nuclear_chromosome': - sequence_location = 'SO:0000738' - elif data[4] == 'mitochondrial_chromosome': - sequence_location = 'SO:0000737' - elif data[4] == 'chloroplast_chromosome': - sequence_location = 'SO:0000745' - elif data[4] is None: - sequence_location = 'SO:0000738' - else: - raise Exception('Error with sequence location: ' + data[4] + ' is not a valid type') - - self.assembly_sequences.append(AssemblySequence( - assembly_sequence_id=None, # Should be autogenerated upon insertion - name=name, - assembly_id=None, # Update the assembly before inserting and grab the assembly_id - accession=accession, - chromosomal=chromosomal, - length=data[2], - sequence_location=sequence_location, - # These two get populated in the core stats pipeline. 
- sequence_checksum=None, - ga4gh_identifier=None, - )) - - def new_genome_dataset(self): - self.genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=self.listed_release_is_current, - ) + # Fetching all attributes associated with "genebuild" from the metadata of the species + attributes = self.get_meta_list_from_prefix_meta_key(species, "genebuild.") + + # An empty list to hold DatasetAttribute instances + genebuild_dataset_attributes = [] + # For each attribute-value pair, a new DatasetAttribute instance is created and added to the list + for attribute, value in attributes.items(): + meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + if meta_attribute is None: + raise Exception(f"Attribute {attribute} not found. Please enter it into the db manually") + dataset_attribute = DatasetAttribute( + value=value, + dataset=genebuild_dataset, + attribute=meta_attribute, + ) + genebuild_dataset_attributes.append(dataset_attribute) - def new_dataset_source(self): - self.dataset_source = DatasetSource( - dataset_source_id=None, # Should be autogenerated upon insertion - type=self.db_type, # core/fungen etc - name=make_url(self.db_uri).database # dbname - ) + # Check if the genebuild dataset with the given label already exists + test_status = meta_session.query(Dataset).filter(Dataset.label == genebuild_accession).one_or_none() - def new_datasets(self): - self.datasets = [] - # Genebuild. 
- label = self.get_meta_single_meta_key(self.species, "genebuild.last_geneset_update") - if label is None: - label = self.get_meta_single_meta_key(self.species, "genebuild.start_date") - self.datasets.append(Dataset( - dataset_id=None, # Should be autogenerated upon insertion - dataset_uuid=str(uuid.uuid4()), - dataset_type_id=None, # extract from dataset_type - name="genebuild", - version=self.get_meta_single_meta_key(self.species, "gencode.version"), - created=func.now(), - dataset_source_id=None, # extract from dataset_source - label=label, - status='Submitted', - )) - # Protein Features + # If it does not exist, it is added to the session, otherwise the status is set to "Existing" + if test_status is None: + status = "New" + meta_session.add(genebuild_dataset) + meta_session.add_all(genebuild_dataset_attributes) + else: + status = "Existing" + + # The method returns the Dataset instance, the list of DatasetAttribute instances, and the status + return genebuild_dataset, genebuild_dataset_attributes, status diff --git a/src/tests/databases/core_1/attrib_type.txt b/src/tests/databases/core_1/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_1/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index 9c38fecc..b744f3c4 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -11,6 +11,7 @@ 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 66666668 +2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe +17 1 genebuild.version 1 diff --git a/src/tests/databases/core_1/table.sql b/src/tests/databases/core_1/table.sql index 03467355..953da984 100644 --- a/src/tests/databases/core_1/table.sql +++ b/src/tests/databases/core_1/table.sql @@ -76,3 +76,11 @@ CREATE TABLE seq_region_synonym CREATE INDEX seq_region_idx on seq_region_synonym (seq_region_id); +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); \ No newline at end of file diff --git a/src/tests/databases/core_2/attrib_type.txt b/src/tests/databases/core_2/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_2/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index d8efbd88..b744f3c4 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -7,10 +7,11 @@ 3 1 species.display_name jabberwocky 7 1 species.division Ensembl_TEST 6 1 species.production_name Jabberwocky -4 1 species.scientific_name lewis_carol +4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 66666668 +2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe +17 1 genebuild.version 1 diff --git a/src/tests/databases/core_2/table.sql b/src/tests/databases/core_2/table.sql index 03467355..953da984 100644 --- a/src/tests/databases/core_2/table.sql +++ b/src/tests/databases/core_2/table.sql @@ -76,3 +76,11 @@ CREATE TABLE seq_region_synonym CREATE INDEX seq_region_idx on seq_region_synonym (seq_region_id); +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); \ No newline at end of file diff --git a/src/tests/databases/core_3/attrib_type.txt b/src/tests/databases/core_3/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_3/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", 
"mitochondrial_chromosome", "nuclear_chromosome". diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index 63700f27..5bb63698 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -6,10 +6,11 @@ 3 1 species.display_name jabberwocky 7 1 species.division Ensembl_TEST 6 1 species.production_name Jabberwocky -4 1 species.scientific_name lewis_carol +4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 66666668 +2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe +17 1 genebuild.version 1 diff --git a/src/tests/databases/core_3/table.sql b/src/tests/databases/core_3/table.sql index 03467355..953da984 100644 --- a/src/tests/databases/core_3/table.sql +++ b/src/tests/databases/core_3/table.sql @@ -76,3 +76,11 @@ CREATE TABLE seq_region_synonym CREATE INDEX seq_region_idx on seq_region_synonym (seq_region_id); +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); \ No newline at end of file diff --git a/src/tests/databases/core_4/attrib_type.txt b/src/tests/databases/core_4/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_4/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", 
"macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 4ebee73c..3202ce69 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -7,10 +7,11 @@ 3 1 species.display_name jabberwocky 7 1 species.division Ensembl_TEST 6 1 species.production_name Jabberwocky -4 1 species.scientific_name lewis_carol +4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 66666668 +2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe +17 1 genebuild.version 1 diff --git a/src/tests/databases/core_4/table.sql b/src/tests/databases/core_4/table.sql index 03467355..953da984 100644 --- a/src/tests/databases/core_4/table.sql +++ b/src/tests/databases/core_4/table.sql @@ -76,3 +76,11 @@ CREATE TABLE seq_region_synonym CREATE INDEX seq_region_idx on seq_region_synonym (seq_region_id); +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); \ No newline at end of file diff --git a/src/tests/test_api.py b/src/tests/test_api.py index eefb521e..62b5af70 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -52,7 +52,7 @@ def test_fetch_releases_for_dataset(self, multi_dbs): def test_fetch_taxonomy_names(self, multi_dbs): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_taxonomy_names(taxonomy_ids=(6239, 511145)) + test = conn.fetch_taxonomy_names(taxonomy_ids=511145) assert test[511145]['scientific_name'] == 'Escherichia coli str. K-12 substr. 
MG1655' def test_fetch_taxonomy_ids(self, multi_dbs): diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 44c449f0..ad33a7f8 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -46,10 +46,12 @@ def test_new_organism(self, multi_dbs): metadata = MetaData() dataset = Table('dataset', metadata, autoload=True, autoload_with=engine) query = select([dataset]).where( - (dataset.c.version == 999) & (dataset.c.name == 'genebuild') & (dataset.c.label == '01') + (dataset.c.version == 1) & (dataset.c.name == 'genebuild') ) row = engine.execute(query).fetchone() - assert row[-2] == '01' + assert row is not None + if row is not None: + assert row[4] is not None # def test_update_organism(self, multi_dbs): @@ -59,7 +61,7 @@ def test_update_organism(self, multi_dbs): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') - assert test_collect[0].Organism.scientific_name == 'lewis_carol' + assert test_collect[0].Organism.scientific_name == 'carol_jabberwocky' def test_update_assembly(self, multi_dbs): test = meta_factory(multi_dbs['core_3'].dbc.url, multi_dbs['ensembl_metadata'].dbc.url, @@ -68,7 +70,7 @@ def test_update_assembly(self, multi_dbs): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') - assert test_collect[1].Organism.scientific_name == 'lewis_carol' + assert test_collect[1].Organism.scientific_name == 'carol_jabberwocky' assert test_collect[1].Assembly.accession == 'weird02' # @@ -80,7 +82,9 @@ def test_update_geneset(self, multi_dbs): metadata = MetaData() dataset = Table('dataset', metadata, autoload=True, autoload_with=engine) query = select([dataset]).where( - (dataset.c.version == 999) & (dataset.c.name == 'genebuild') & (dataset.c.label == '02') + 
(dataset.c.version == 1) & (dataset.c.name == 'genebuild') ) row = engine.execute(query).fetchone() - assert row[-2] == '02' + assert row is not None + if row is not None: + assert row[4] is not None