From e93b273392ae3d516fccf203e87f734734f7d910 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 11 Jul 2023 07:45:05 +0100 Subject: [PATCH 01/20] Initial refactor with changes made to the organism update --- src/ensembl/production/metadata/api/base.py | 5 + src/ensembl/production/metadata/api/genome.py | 2 +- .../production/metadata/updater/Temporary.py | 6 + .../production/metadata/updater/base.py | 19 +- .../production/metadata/updater/core.py | 1110 ++++++++++------- 5 files changed, 659 insertions(+), 483 deletions(-) create mode 100644 src/ensembl/production/metadata/updater/Temporary.py diff --git a/src/ensembl/production/metadata/api/base.py b/src/ensembl/production/metadata/api/base.py index 2d32323a..f88c9206 100644 --- a/src/ensembl/production/metadata/api/base.py +++ b/src/ensembl/production/metadata/api/base.py @@ -12,6 +12,11 @@ from ensembl.database import DBConnection +##Todo: Add in OrganismAdapator. Subfunction fetches all organism in popular group. and # of genomes from distinct assemblies. 
+#Add in best genome (see doc) +#More functions for related genomes + + class BaseAdaptor: def __init__(self, metadata_uri): self.metadata_db = DBConnection(metadata_uri) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index f249b08f..6425bae9 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -29,7 +29,7 @@ def __init__(self, metadata_uri, taxonomy_uri=None): self.taxonomy_db = DBConnection(taxonomy_uri) def fetch_taxonomy_names(self, taxonomy_ids): - + taxonomy_ids = [taxonomy_ids] if not isinstance(taxonomy_ids, list) else taxonomy_ids taxons = {} for tid in taxonomy_ids: names = {"scientific_name": None, "synonym": []} diff --git a/src/ensembl/production/metadata/updater/Temporary.py b/src/ensembl/production/metadata/updater/Temporary.py new file mode 100644 index 00000000..14572dfc --- /dev/null +++ b/src/ensembl/production/metadata/updater/Temporary.py @@ -0,0 +1,6 @@ + +from ensembl.production.metadata.api.factory import meta_factory + + +test = meta_factory( 'mysql://danielp:Killadam69@localhost:3306/acanthochromis_polyacanthus_core_109_1',"mysql://danielp:Killadam69@localhost:3306/ensembl_genome_metadata",'mysql://danielp:Killadam69@localhost:3306/ncbi_taxonomy') +test.process_core() diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index 8e63c7c9..96dafc62 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -13,6 +13,7 @@ from sqlalchemy.engine import make_url from ensembl.core.models import Meta +from ensembl.production.metadata.api.models import DatasetSource from ensembl.database import DBConnection from ensembl.production.metadata.api.models import EnsemblRelease @@ -21,8 +22,7 @@ class BaseMetaUpdater: def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): self.db_uri = db_uri self.db = 
DBConnection(self.db_uri) - self.species = None - self.db_type = None + self.metadata_db = DBConnection(metadata_uri) # We will add a release later. For now, the release must be specified for it to be used. if release is None: self.listed_release = None @@ -30,8 +30,7 @@ def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): else: self.listed_release = release self.listed_release_is_current = EnsemblRelease.is_current - self.metadata_db = DBConnection(metadata_uri) - self.taxonomy_uri = taxonomy_uri + # Basic API for the meta table in the submission database. def get_meta_single_meta_key(self, species_id, parameter): @@ -43,4 +42,14 @@ def get_meta_single_meta_key(self, species_id, parameter): else: return result[0] - + def get_or_new_source(self, meta_session, db_uri, db_type): + name = make_url(db_uri).database + dataset_source = meta_session.query(DatasetSource).filter(DatasetSource.name == name).one_or_none() + if dataset_source is None: + dataset_source = DatasetSource( + type=db_type, # core/fungen etc + name=name # dbname + ) + return dataset_source, "new" + else: + return dataset_source, "existing" diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 18d23df9..23c4e557 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -14,26 +14,36 @@ import uuid import sqlalchemy as db -from ensembl.core.models import Meta, Assembly, CoordSystem, SeqRegionAttrib, SeqRegion, SeqRegionSynonym +from ensembl.core.models import Meta, Assembly as AssemblyCore, CoordSystem, SeqRegionAttrib, SeqRegion, \ + SeqRegionSynonym from sqlalchemy import select, update, func, and_ from sqlalchemy.engine import make_url from sqlalchemy.orm import aliased - +from ensembl.database import DBConnection +from sqlalchemy.exc import NoResultFound +import sys from ensembl.production.metadata.api.genome import GenomeAdaptor from 
ensembl.production.metadata.api.models import * from ensembl.production.metadata.updater.base import BaseMetaUpdater +##TODO: +# Prevent deletion of release data. +# Logic: +##Create new organism on new production name if no ensembl name. If ensembl name is given create new if none, if already exists create new genome based on production name. +##Check that taxid is present in db. + + class CoreMetaUpdater(BaseMetaUpdater): def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): - # Each of these objects represents a table in the database to store data in as either an array or a single object. + # Each of these objects represents a table in the database to store data in as eith0er an array or a single object. self.organism = None - self.organism_group_member = None - self.organism_group = None - + self.metadata_uri = metadata_uri + self.taxonomy_uri = taxonomy_uri self.assembly = None self.assembly_sequences = None # array self.assembly_dataset = None + self.assembly_dataset_attributes = None # array self.genome = None self.genome_release = None @@ -41,15 +51,17 @@ def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): self.datasets = None # array self.dataset_type = None self.dataset_source = None - self.dataset_attribute = None self.attribute = None - super().__init__(db_uri, metadata_uri, taxonomy_uri, release) + super().__init__(db_uri, metadata_uri=self.metadata_uri, taxonomy_uri=self.taxonomy_uri, release=None) self.db_type = 'core' def process_core(self, **kwargs): # Special case for loading a single species from a collection database. 
Can be removed in a future release sel_species = kwargs.get('species', None) + metadata_uri = kwargs.get('metadata_uri', self.metadata_uri) + taxonomy_uri = kwargs.get('metadata_uri', self.taxonomy_uri) + if sel_species: with self.db.session_scope() as session: multi_species = session.execute( @@ -66,488 +78,632 @@ def process_core(self, **kwargs): multi_species = [multi_species for multi_species, in multi_species] for species in multi_species: - self.species = species - self.process_species() - - def process_species(self): - - # Each class that is called here extracts unlinked data from the submission database to use in comparisons and to - # populate the new values if possible. - self.new_organism() - self.new_genome() - self.new_genome_release() - self.new_assembly() - self.new_assembly_sequence() - self.new_assembly_dataset() - self.new_dataset_source() - self.new_genome_dataset() - self.new_datasets() - - ################# - # Transactions are committed once per program run. - # Failures prevent any commit - ################# - - # Species Check - # Check for new species by checking if ensembl name is already present in the database - if not GenomeAdaptor(metadata_uri=self.metadata_db.url, - taxonomy_uri=self.taxonomy_uri).fetch_genomes_by_ensembl_name( - self.organism.ensembl_name): - # Check if the assembly accesion is already present in the database - new_assembly_acc = self.get_meta_single_meta_key(self.species, "assembly.accession") - with self.metadata_db.session_scope() as session: - if session.query(session.query(Assembly).filter_by(accession=new_assembly_acc).exists()).scalar(): - Exception("Assembly Accession already exists for a different organism. Please do a manual update.") - self.create_organism() - logging.info("Fresh Organism. Adding data to organism, genome, genome_release," - " assembly, assembly_sequence, dataset, dataset source, and genome_dataset tables.") - - # Check to see if it is an updated organism. 
+ self.process_species(species, metadata_uri, taxonomy_uri) + + def process_species(self, species, metadata_uri, taxonomy_uri, db_uri): + # new functions + meta_conn = DBConnection(metadata_uri) + with meta_conn.session_scope() as meta_session: + self.organism, organism_status = self.get_or_new_organism(species, meta_session, metadata_uri, taxonomy_uri) + self.assembly, self.assembly_dataset, self.assembly_dataset_attributes, self.assembly_sequences, \ + self.dataset_source, assembly_status = self.get_or_new_assembly(species, meta_session, metadata_uri, + db_uri) + print(self.organism, organism_status) + + def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri): + """ + Get an existing Organism instance or create a new one, depending on the information from the metadata database. + """ + + # Fetch the Ensembl name of the organism from metadata using either 'species.ensembl_name' + # or 'species.production_name' as the key. + ensembl_name = self.get_meta_single_meta_key(species, "species.ensembl_name") + if ensembl_name is None: + ensembl_name = self.get_meta_single_meta_key(species, "species.production_name") + + # Instantiate a new Organism object using data fetched from metadata. + new_organism = Organism( + species_taxonomy_id=self.get_meta_single_meta_key(species, "species.species_taxonomy_id"), #REQURIED + taxonomy_id=self.get_meta_single_meta_key(species, "species.taxonomy_id"),#REQURIED + display_name=self.get_meta_single_meta_key(species, "species.display_name"),#REQURIED , MAY BE DELETED + scientific_name=self.get_meta_single_meta_key(species, "species.scientific_name"),#REQURIED + url_name=self.get_meta_single_meta_key(species, "species.url"), + ensembl_name=ensembl_name, + strain=self.get_meta_single_meta_key(species, "species.strain"), + # + ) + + # Query the metadata database to find if an Organism with the same Ensembl name already exists. 
+ old_organism = meta_session.query(Organism).filter( + Organism.ensembl_name == new_organism.ensembl_name).one_or_none() + + # If an existing Organism is found, return it and indicate that it already existed. + if old_organism: + return old_organism, "Existing" else: - with self.metadata_db.session_scope() as session: - session.expire_on_commit = False - test_organism = session.execute(db.select(Organism).filter( - Organism.ensembl_name == self.organism.ensembl_name)).one_or_none() - self.organism.organism_id = Organism.organism_id - self.organism.scientific_parlance_name = Organism.scientific_parlance_name - - if int(test_organism.Organism.species_taxonomy_id) == int( - self.organism.species_taxonomy_id) and \ - int(test_organism.Organism.taxonomy_id) == int( - self.organism.taxonomy_id) and \ - str(test_organism.Organism.display_name) == str( - self.organism.display_name) and \ - str(test_organism.Organism.scientific_name) == str( - self.organism.scientific_name) and \ - str(test_organism.Organism.url_name) == str( - self.organism.url_name) and \ - str(test_organism.Organism.strain) == str(self.organism.strain): - logging.info("Old Organism with no change. No update to organism table") - ################################################################ - ##### Assembly Check and Update - ################################################################ - with self.metadata_db.session_scope() as session: - assembly_acc = session.execute(db.select(Assembly - ).join(Genome.assembly).join(Genome.organism).filter( - Organism.ensembl_name == self.organism.ensembl_name)).all() - new_assembly_acc = self.get_meta_single_meta_key(self.species, "assembly.accession") - assembly_test = False - for assembly_obj in assembly_acc: - if assembly_obj[0].accession == new_assembly_acc: - assembly_test = True - if assembly_test: - logging.info( - "Old Assembly with no change. 
No update to Genome, genome_release, assembly, and assembly_sequence tables.") - for dataset in self.datasets: - with self.metadata_db.session_scope() as session: - # Check to see if any already exist: - # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! - if dataset.name == "genebuild": - dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - Dataset.version == dataset.version, - Dataset.label == dataset.label).first() - if dataset_test is None: - gb_dataset_type = session.query(DatasetType).filter( - DatasetType.name == "genebuild").first() - dataset.dataset_type = gb_dataset_type - dataset.dataset_source = self.dataset_source - session.add(dataset) - - else: - logging.info("New Assembly. Updating genome, genome_release," - " assembly, assembly_sequence, dataset, dataset source, and genome_dataset tables.") - self.update_assembly() - ################################################################ - ##### dataset Check and Update - ################################################################ - # Dataset section. More logic will be necessary for additional datasets. Currently only the genebuild is listed here. - - - - - else: - self.update_organism() - logging.info("Old Organism with changes. Updating organism table") - - def create_organism(self): - # In this, we are assuming that with a new genome, there will be a new assemblbly. - - with self.metadata_db.session_scope() as session: - # Organism section - # Updating Organism, organism_group_member, and organism_group - self.new_organism_group_and_members(session) - # Add in the new assembly here - # assembly sequence, assembly, genome, genome release. - assembly_test = session.execute(db.select(Assembly).filter( - Assembly.accession == self.assembly.accession)).one_or_none() + # If no existing Organism is found, conduct additional checks before creating a new one. 
+ + # Check if the new organism's taxonomy ID exists in the taxonomy database. + conn = GenomeAdaptor(metadata_uri=metadata_uri, taxonomy_uri=taxonomy_uri) + try: + conn.fetch_taxonomy_names(taxonomy_ids=new_organism.taxonomy_id) + except NoResultFound: + raise Exception("taxid not found in taxonomy database for scientific name") + + # Check if an Assembly with the same accession already exists in the metadata database. + accession = self.get_meta_single_meta_key(species, "assembly.accession") + assembly_test = meta_session.query(Assembly).filter(Assembly.accession == accession).one_or_none() if assembly_test is not None: - Exception( - "Error, existing name but, assembly accession already found. Please update the Ensembl Name in the Meta field manually") - if self.listed_release is not None: - release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == self.listed_release).first() - self.genome_release.ensembl_release = release - self.genome_release.genome = self.genome - - for assembly_seq in self.assembly_sequences: - assembly_seq.assembly = self.assembly - self.assembly.genomes.append(self.genome) - - self.genome.organism = self.organism - - # Update assembly dataset - # Updates genome_dataset,dataset,dataset_source - dataset_source_test = session.execute( - db.select(DatasetSource).filter(DatasetSource.name == self.dataset_source.name)).one_or_none() - if dataset_source_test is not None: - Exception("Error, data already present in source") + raise Exception( + "Assembly Accession already exists for a different organism. 
Please do a manual update.") - dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() - if self.listed_release is not None: - self.genome_dataset.ensembl_release = release - self.genome_dataset.genome = self.genome - self.genome_dataset.dataset = self.assembly_dataset - - self.assembly_dataset.dataset_type = dataset_type - self.assembly_dataset.dataset_source = self.dataset_source - - assembly_genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=0, - ) - assembly_genome_dataset.dataset = self.assembly_dataset - self.genome.genome_datasets.append(assembly_genome_dataset) - - # session.add(assembly_genome_dataset) - - # Dataset section. More logic will be necessary for additional datasets. Currently only the genebuild is listed here. - for dataset in self.datasets: - # Check to see if any already exist: - # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! 
- if dataset.name == "genebuild": - dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - Dataset.version == dataset.version, - Dataset.label == dataset.label).first() - if dataset_test is None: - dataset.dataset_type = session.query(DatasetType).filter( - DatasetType.name == "genebuild").first() - dataset.dataset_source = self.dataset_source - temp_genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=0, - ) - temp_genome_dataset.dataset = dataset - self.genome.genome_datasets.append(temp_genome_dataset) - # Add everything to the database. Closing the session commits it. - session.add(self.organism) - - def update_organism(self): - with self.metadata_db.session_scope() as session: - session.execute( - update(Organism).where(Organism.ensembl_name == self.organism.ensembl_name).values( - species_taxonomy_id=self.organism.species_taxonomy_id, - taxonomy_id=self.organism.taxonomy_id, - display_name=self.organism.display_name, - scientific_name=self.organism.scientific_name, - url_name=self.organism.url_name, - ensembl_name=self.organism.ensembl_name, - strain=self.organism.strain, - )) - - # TODO: Add an update to the groups here. 
- - def update_assembly(self): - # Change to new assembly/fresh - with self.metadata_db.session_scope() as session: - # Get the genome - self.organism = session.query(Organism).filter( - Organism.ensembl_name == self.organism.ensembl_name).first() - self.genome.organism = self.organism - - if self.listed_release is not None: - release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == self.listed_release).first() - self.genome_release.ensembl_release = release - self.genome_release.genome = self.genome - - self.assembly.genomes.append(self.genome) - - # Update assembly dataset - # Updates genome_dataset,dataset,dataset_source - dataset_source_test = session.execute( - db.select(DatasetSource).filter(DatasetSource.name == self.dataset_source.name)).one_or_none() - if dataset_source_test is not None: - self.dataset_source = session.query(DatasetSource).filter( - DatasetSource.name == self.dataset_source.name).first() + # Fetch the division name of the new organism from metadata. 
+ division_name = self.get_meta_single_meta_key(species, "species.division") + if division_name is None: + Exception("No species.division found in meta table") - dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() - if self.listed_release is not None: - self.genome_dataset.ensembl_release = release - self.genome_dataset.genome = self.genome - self.genome_dataset.dataset = self.assembly_dataset - - self.assembly_dataset.dataset_type = dataset_type - self.assembly_dataset.dataset_source = self.dataset_source - - assembly_genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=0, - ) - assembly_genome_dataset.dataset = self.assembly_dataset - self.genome.genome_datasets.append(assembly_genome_dataset) - - for dataset in self.datasets: - # Check to see if any already exist: - # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! - if dataset.name == "genebuild": - dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - Dataset.version == dataset.version, - Dataset.label == dataset.label).first() - dataset.dataset_type = session.query(DatasetType).filter( - DatasetType.name == "genebuild").first() - dataset.dataset_source = self.dataset_source - temp_genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=0, + # Query the metadata database to find if an OrganismGroup with the same division name already exists. 
+ division = meta_session.query(OrganismGroup).filter(OrganismGroup.name == division_name).one_or_none() + if division is None: + # If no such OrganismGroup exists, create a new one. + division = OrganismGroup( + type="Division", + name=division_name, ) - temp_genome_dataset.dataset = dataset - self.genome.genome_datasets.append(temp_genome_dataset) - # Add everything to the database. Closing the session commits it. - session.add(self.genome) - - # The following methods populate the data from the core into the objects. K - # It may be beneficial to move them to the base class with later implementations - def new_organism(self): - # All taken from the meta table except parlance name. - self.organism = Organism( - organism_id=None, # Should be autogenerated upon insertion - species_taxonomy_id=self.get_meta_single_meta_key(self.species, "species.species_taxonomy_id"), - taxonomy_id=self.get_meta_single_meta_key(self.species, "species.taxonomy_id"), - display_name=self.get_meta_single_meta_key(self.species, "species.display_name"), - scientific_name=self.get_meta_single_meta_key(self.species, "species.scientific_name"), - url_name=self.get_meta_single_meta_key(self.species, "species.url"), - ensembl_name=self.get_meta_single_meta_key(self.species, "species.production_name"), - strain=self.get_meta_single_meta_key(self.species, "species.strain"), - scientific_parlance_name=None, - ) - if self.organism.species_taxonomy_id is None: - self.organism.species_taxonomy_id = self.organism.taxonomy_id - - def new_organism_group_and_members(self, session): - # This method auto grabs the division name and checks for the strain groups - division_name = self.get_meta_single_meta_key(self.species, "species.division") - if division_name is None: - Exception("No species.dvision found in meta table") - division = session.execute(db.select(OrganismGroup).filter(OrganismGroup.name == division_name)).one_or_none() - if division is None: - group = OrganismGroup( - organism_group_id=None, - 
type="Division", - name=division_name, - code=None, + + # Create a new OrganismGroupMember linking the new Organism to the division group. + organism_group_member = OrganismGroupMember( + is_reference=0, + organism_id=new_organism, + organism_group_id=division, ) + + # Return the newly created Organism and indicate that it is new. + return new_organism, "New" + + def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): + # Get the new assembly assession from the core handed over + assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession") + assembly = meta_session.query(Assembly).filter(Assembly.accession == assembly_accession).one_or_none() + + if assembly is not None: + # Get the existing assembly dataset + assembly_dataset = meta_session.query(Dataset).filter(Dataset.label == assembly_accession).one_or_none() + # I should not need this, but double check on database updating. + assembly_dataset_attributes = assembly_dataset.dataset_attributes + + ################################ Tests ################################# + + # assembly sequences. Count and compare to make sure that they match. + if assembly_count != new_assembly_count: + raise Exception("Number of sequences does not match number in database. 
" + "A new assembly requires a new accession.") + + return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source, "Existing" + + else: - group = session.query(OrganismGroup).filter(OrganismGroup.name == division_name).first() - self.organism_group_member = OrganismGroupMember( - organism_group_member_id=None, - is_reference=0, - organism_id=None, - organism_group_id=None, - ) - self.organism_group_member.organism_group = group - self.organism_group_member.organism = self.organism - - # Work on the strain level group members - strain = self.get_meta_single_meta_key(self.species, "species.strain") - strain_group = self.get_meta_single_meta_key(self.species, "species.strain_group") - strain_type = self.get_meta_single_meta_key(self.species, "species.type") - - if strain is not None: - if strain == 'reference': - reference = 1 - else: - reference = 0 - group_member = OrganismGroupMember( - organism_group_member_id=None, - is_reference=reference, - organism_id=None, - organism_group_id=None, + with self.db.session_scope() as session: + #May be problematic. Might be provided by genebuild. 
+ level = (session.execute(db.select(CoordSystem.name).filter( + CoordSystem.species_id == species).order_by(CoordSystem.rank)).all())[0][0] + assembly = Assembly( + ucsc_name=self.get_meta_single_meta_key(species, "assembly.ucsc_alias"), + accession=self.get_meta_single_meta_key(self.species, "assembly.accession"), + level=level, + name=self.get_meta_single_meta_key(self.species, "assembly.name"), + accession_body=self.get_meta_single_meta_key(self.species, "assembly.provider"), + assembly_default=self.get_meta_single_meta_key(self.species, "assembly.default"), + tol_id=self.get_meta_single_meta_key(self.species, "assembly.tol_id"), # Not implemented yet + created=func.now(), + ensembl_name=self.get_meta_single_meta_key(self.species, "assembly.name"), + assembly_uuid=str(uuid.uuid4()), ) - # Check for group, if not present make it - division = session.execute( - db.select(OrganismGroup).filter(OrganismGroup.name == strain_group)).one_or_none() - if division is None: - group = OrganismGroup( - organism_group_id=None, - type=strain_type, - name=strain_group, - code=None, + dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() + dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + + assembly_dataset =Dataset( + dataset_uuid=str(uuid.uuid4()), + dataset_type=dataset_type, # extract from dataset_type + name="assembly", + ###version=None, Could be changed. + label=assembly.accession, #Required. Suurfulus in this case. 
+ created=func.now(), + dataset_source=dataset_source, # extract from dataset_source + status='Submitted', ) + assembly_ds_attributes = session.query(Meta).filter( + Meta.species_id == species, + Meta.meta_key.like('assembly.%') + ).all() + for attribute in assembly_ds_attributes: - else: - group = session.query(OrganismGroup).filter(OrganismGroup.name == strain_group).first() - group_member.organism_group = group - group_member.organism = self.organism - - def new_genome(self): - # Data for the update function. - self.genome = Genome( - genome_id=None, # Should be autogenerated upon insertion - genome_uuid=str(uuid.uuid4()), - assembly_id=None, # Update the assembly before inserting and grab the assembly key - organism_id=None, # Update the organism before inserting and grab the organism_id - created=func.now(), # Replace all of them with sqlalchemy func.now() - ) + dataset_attribute = DatasetAttribute( + value="", + dataset=assembly_dataset, + dataset_attribute= - def new_genome_release(self): - # Genome Release - self.genome_release = GenomeRelease( - genome_release_id=None, # Should be autogenerated upon insertion - genome_id=None, # Update the genome before inserting and grab the genome_id - release_id=None, - is_current=self.listed_release_is_current, - ) - def new_assembly(self): - level = None - with self.db.session_scope() as session: - level = (session.execute(db.select(CoordSystem.name).filter( - CoordSystem.species_id == self.species).order_by(CoordSystem.rank)).all())[0][0] - - self.assembly = Assembly( - assembly_id=None, # Should be autogenerated upon insertion - ucsc_name=self.get_meta_single_meta_key(self.species, "assembly.ucsc_alias"), - accession=self.get_meta_single_meta_key(self.species, "assembly.accession"), - level=level, - name=self.get_meta_single_meta_key(self.species, "assembly.name"), - accession_body=None, # Not implemented yet - assembly_default=self.get_meta_single_meta_key(self.species, "assembly.default"), - created=func.now(), - 
ensembl_name=self.get_meta_single_meta_key(self.species, "assembly.name"), - ) - def new_assembly_dataset(self): - self.assembly_dataset = Dataset( - dataset_id=None, # Should be autogenerated upon insertion - dataset_uuid=str(uuid.uuid4()), - dataset_type_id=None, # extract from dataset_type - name="assembly", - version=None, - created=func.now(), - dataset_source_id=None, # extract from dataset_source - label=self.assembly.accession, - status='Submitted', - ) + ) + + - def new_assembly_sequence(self): - self.assembly_sequences = [] - with self.db.session_scope() as session: - # Alias the seq_region_attrib and seq_region_synonym tables - sra1 = aliased(SeqRegionAttrib) - sra3 = aliased(SeqRegionAttrib) - - results = ( - session.query(SeqRegion.name, SeqRegionSynonym.synonym, SeqRegion.length, - CoordSystem.name, - sra3.value, - ) - .join(CoordSystem, SeqRegion.coord_system_id == CoordSystem.coord_system_id) - .join(Meta, CoordSystem.species_id == Meta.species_id) - .join(sra1, SeqRegion.seq_region_id == sra1.seq_region_id) - .outerjoin(SeqRegionSynonym, and_(SeqRegion.seq_region_id == SeqRegionSynonym.seq_region_id, - SeqRegionSynonym.external_db_id == 50710)) - .outerjoin(sra3, and_(SeqRegion.seq_region_id == sra3.seq_region_id, - sra3.attrib_type_id == 547)) - .filter(Meta.meta_key == 'assembly.accession', sra1.attrib_type_id == 6, - Meta.species_id == self.species) - ).all() - for data in results: - - # If the name does not match normal accession formating, then use that name. - name = None - if re.match(r'^[a-zA-Z]+\d+\.\d+', data[0]): - name = None - else: - name = data[0] - # Nab accession from the seq region synonym or else the name. 
- accession = None - if data[1] is not None and re.match(r'^[a-zA-Z]+\d+\.\d+', data[1]): - accession = data[1] - elif name is not None: - accession = name - else: - accession = data[0] - - chromosomal = 0 - if data[3] == 'chromosome': - chromosomal = 1 - - sequence_location = None - if data[4] == 'nuclear_chromosome': - sequence_location = 'SO:0000738' - elif data[4] == 'mitochondrial_chromosome': - sequence_location = 'SO:0000737' - elif data[4] == 'chloroplast_chromosome': - sequence_location = 'SO:0000745' - elif data[4] is None: - sequence_location = 'SO:0000738' - else: - raise Exception('Error with sequence location: ' + data[4] + ' is not a valid type') - - self.assembly_sequences.append(AssemblySequence( - assembly_sequence_id=None, # Should be autogenerated upon insertion - name=name, - assembly_id=None, # Update the assembly before inserting and grab the assembly_id - accession=accession, - chromosomal=chromosomal, - length=data[2], - sequence_location=sequence_location, - # These two get populated in the core stats pipeline. - sequence_checksum=None, - ga4gh_identifier=None, - )) - - def new_genome_dataset(self): - self.genome_dataset = GenomeDataset( - genome_dataset_id=None, # Should be autogenerated upon insertion - dataset_id=None, # extract from dataset once genertated - genome_id=None, # extract from genome once genertated - release_id=None, # extract from release once genertated - is_current=self.listed_release_is_current, - ) - def new_dataset_source(self): - self.dataset_source = DatasetSource( - dataset_source_id=None, # Should be autogenerated upon insertion - type=self.db_type, # core/fungen etc - name=make_url(self.db_uri).database # dbname - ) - def new_datasets(self): - self.datasets = [] - # Genebuild. 
- label = self.get_meta_single_meta_key(self.species, "genebuild.last_geneset_update") - if label is None: - label = self.get_meta_single_meta_key(self.species, "genebuild.start_date") - self.datasets.append(Dataset( - dataset_id=None, # Should be autogenerated upon insertion - dataset_uuid=str(uuid.uuid4()), - dataset_type_id=None, # extract from dataset_type - name="genebuild", - version=self.get_meta_single_meta_key(self.species, "gencode.version"), - created=func.now(), - dataset_source_id=None, # extract from dataset_source - label=label, - status='Submitted', - )) - # Protein Features + # Old functions + # self.new_genome() + # self.new_genome_release() + # self.new_assembly() + # self.new_assembly_sequence() + # self.new_assembly_dataset() + # self.new_dataset_source() + # self.new_genome_dataset() + # self.new_datasets() + + ######################################################################## + ##### Logic Section ######################## + ######################################################################## + + # # Species Check + # # Check for new species by checking if ensembl name is already present in the database + # if not GenomeAdaptor(metadata_uri=self.metadata_db.url, + # taxonomy_uri=self.taxonomy_uri).fetch_genomes_by_ensembl_name( + # self.organism.ensembl_name): + # # Check if the assembly accesion is already present in the database + # new_assembly_acc = self.get_meta_single_meta_key(self.species, "assembly.accession") + # with self.metadata_db.session_scope() as session: + # if session.query(session.query(Assembly).filter_by(accession=new_assembly_acc).exists()).scalar(): + # Exception("Assembly Accession already exists for a different organism. Please do a manual update.") + # self.create_organism() + # logging.info("Fresh Organism. Adding data to organism, genome, genome_release," + # " assembly, assembly_sequence, dataset, dataset source, and genome_dataset tables.") + # + # # Check to see if it is an updated organism. 
+ # else: + # with self.metadata_db.session_scope() as session: + # session.expire_on_commit = False + # test_organism = session.execute(db.select(Organism).filter( + # Organism.ensembl_name == self.organism.ensembl_name)).one_or_none() + # self.organism.organism_id = Organism.organism_id + # self.organism.scientific_parlance_name = Organism.scientific_parlance_name + # + # if int(test_organism.Organism.species_taxonomy_id) == int( + # self.organism.species_taxonomy_id) and \ + # int(test_organism.Organism.taxonomy_id) == int( + # self.organism.taxonomy_id) and \ + # str(test_organism.Organism.display_name) == str( + # self.organism.display_name) and \ + # str(test_organism.Organism.scientific_name) == str( + # self.organism.scientific_name) and \ + # str(test_organism.Organism.url_name) == str( + # self.organism.url_name) and \ + # str(test_organism.Organism.strain) == str(self.organism.strain): + # logging.info("Old Organism with no change. No update to organism table") + # ################################################################ + # ##### Assembly Check and Update + # ################################################################ + # with self.metadata_db.session_scope() as session: + # assembly_acc = session.execute(db.select(Assembly + # ).join(Genome.assembly).join(Genome.organism).filter( + # Organism.ensembl_name == self.organism.ensembl_name)).all() + # new_assembly_acc = self.get_meta_single_meta_key(self.species, "assembly.accession") + # assembly_test = False + # for assembly_obj in assembly_acc: + # if assembly_obj[0].accession == new_assembly_acc: + # assembly_test = True + # if assembly_test: + # logging.info( + # "Old Assembly with no change. 
No update to Genome, genome_release, assembly, and assembly_sequence tables.") + # for dataset in self.datasets: + # with self.metadata_db.session_scope() as session: + # # Check to see if any already exist: + # # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! + # if dataset.name == "genebuild": + # dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", + # Dataset.version == dataset.version, + # Dataset.label == dataset.label).first() + # if dataset_test is None: + # gb_dataset_type = session.query(DatasetType).filter( + # DatasetType.name == "genebuild").first() + # dataset.dataset_type = gb_dataset_type + # dataset.dataset_source = self.dataset_source + # session.add(dataset) + # + # else: + # logging.info("New Assembly. Updating genome, genome_release," + # " assembly, assembly_sequence, dataset, dataset source, and genome_dataset tables.") + # self.update_assembly() + # ################################################################ + # ##### dataset Check and Update + # ################################################################ + # # Dataset section. More logic will be necessary for additional datasets. Currently only the genebuild is listed here. + # + # + # + # + # else: + # self.update_organism() + # logging.info("Old Organism with changes. Updating organism table") + # + # def create_organism(self): + # # In this, we are assuming that with a new genome, there will be a new assemblbly. + # + # with self.metadata_db.session_scope() as session: + # # Organism section + # # Updating Organism, organism_group_member, and organism_group + # self.new_organism_group_and_members(session) + # # Add in the new assembly here + # # assembly sequence, assembly, genome, genome release. 
+ # assembly_test = session.execute(db.select(Assembly).filter( + # Assembly.accession == self.assembly.accession)).one_or_none() + # if assembly_test is not None: + # Exception( + # "Error, existing name but, assembly accession already found. Please update the Ensembl Name in the Meta field manually") + # if self.listed_release is not None: + # release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == self.listed_release).first() + # self.genome_release.ensembl_release = release + # self.genome_release.genome = self.genome + # + # for assembly_seq in self.assembly_sequences: + # assembly_seq.assembly = self.assembly + # self.assembly.genomes.append(self.genome) + # + # self.genome.organism = self.organism + # + # # Update assembly dataset + # # Updates genome_dataset,dataset,dataset_source + # dataset_source_test = session.execute( + # db.select(DatasetSource).filter(DatasetSource.name == self.dataset_source.name)).one_or_none() + # if dataset_source_test is not None: + # Exception("Error, data already present in source") + # + # dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() + # if self.listed_release is not None: + # self.genome_dataset.ensembl_release = release + # self.genome_dataset.genome = self.genome + # self.genome_dataset.dataset = self.assembly_dataset + # + # self.assembly_dataset.dataset_type = dataset_type + # self.assembly_dataset.dataset_source = self.dataset_source + # + # assembly_genome_dataset = GenomeDataset( + # genome_dataset_id=None, # Should be autogenerated upon insertion + # dataset_id=None, # extract from dataset once genertated + # genome_id=None, # extract from genome once genertated + # release_id=None, # extract from release once genertated + # is_current=0, + # ) + # assembly_genome_dataset.dataset = self.assembly_dataset + # self.genome.genome_datasets.append(assembly_genome_dataset) + # + # # session.add(assembly_genome_dataset) + # + # # Dataset section. 
More logic will be necessary for additional datasets. Currently only the genebuild is listed here. + # for dataset in self.datasets: + # # Check to see if any already exist: + # # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! + # if dataset.name == "genebuild": + # dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", + # Dataset.version == dataset.version, + # Dataset.label == dataset.label).first() + # if dataset_test is None: + # dataset.dataset_type = session.query(DatasetType).filter( + # DatasetType.name == "genebuild").first() + # dataset.dataset_source = self.dataset_source + # temp_genome_dataset = GenomeDataset( + # genome_dataset_id=None, # Should be autogenerated upon insertion + # dataset_id=None, # extract from dataset once genertated + # genome_id=None, # extract from genome once genertated + # release_id=None, # extract from release once genertated + # is_current=0, + # ) + # temp_genome_dataset.dataset = dataset + # self.genome.genome_datasets.append(temp_genome_dataset) + # # Add everything to the database. Closing the session commits it. + # session.add(self.organism) + # + # def update_organism(self): + # with self.metadata_db.session_scope() as session: + # session.execute( + # update(Organism).where(Organism.ensembl_name == self.organism.ensembl_name).values( + # species_taxonomy_id=self.organism.species_taxonomy_id, + # taxonomy_id=self.organism.taxonomy_id, + # display_name=self.organism.display_name, + # scientific_name=self.organism.scientific_name, + # url_name=self.organism.url_name, + # ensembl_name=self.organism.ensembl_name, + # strain=self.organism.strain, + # )) + # + # # TODO: Add an update to the groups here. 
+ # + # def update_assembly(self): + # # Change to new assembly/fresh + # with self.metadata_db.session_scope() as session: + # # Get the genome + # self.organism = session.query(Organism).filter( + # Organism.ensembl_name == self.organism.ensembl_name).first() + # self.genome.organism = self.organism + # + # if self.listed_release is not None: + # release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == self.listed_release).first() + # self.genome_release.ensembl_release = release + # self.genome_release.genome = self.genome + # + # self.assembly.genomes.append(self.genome) + # + # # Update assembly dataset + # # Updates genome_dataset,dataset,dataset_source + # dataset_source_test = session.execute( + # db.select(DatasetSource).filter(DatasetSource.name == self.dataset_source.name)).one_or_none() + # if dataset_source_test is not None: + # self.dataset_source = session.query(DatasetSource).filter( + # DatasetSource.name == self.dataset_source.name).first() + # + # dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() + # if self.listed_release is not None: + # self.genome_dataset.ensembl_release = release + # self.genome_dataset.genome = self.genome + # self.genome_dataset.dataset = self.assembly_dataset + # + # self.assembly_dataset.dataset_type = dataset_type + # self.assembly_dataset.dataset_source = self.dataset_source + # + # assembly_genome_dataset = GenomeDataset( + # genome_dataset_id=None, # Should be autogenerated upon insertion + # dataset_id=None, # extract from dataset once genertated + # genome_id=None, # extract from genome once genertated + # release_id=None, # extract from release once genertated + # is_current=0, + # ) + # assembly_genome_dataset.dataset = self.assembly_dataset + # self.genome.genome_datasets.append(assembly_genome_dataset) + # + # for dataset in self.datasets: + # # Check to see if any already exist: + # # for all of genebuild in dataset, see if any have the same label 
(genebuild.id) and version. If so, don't update and error out here! + # if dataset.name == "genebuild": + # dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", + # Dataset.version == dataset.version, + # Dataset.label == dataset.label).first() + # dataset.dataset_type = session.query(DatasetType).filter( + # DatasetType.name == "genebuild").first() + # dataset.dataset_source = self.dataset_source + # temp_genome_dataset = GenomeDataset( + # genome_dataset_id=None, # Should be autogenerated upon insertion + # dataset_id=None, # extract from dataset once genertated + # genome_id=None, # extract from genome once genertated + # release_id=None, # extract from release once genertated + # is_current=0, + # ) + # temp_genome_dataset.dataset = dataset + # self.genome.genome_datasets.append(temp_genome_dataset) + # # Add everything to the database. Closing the session commits it. + # session.add(self.genome) + # + # # The following methods populate the data from the core into the objects. K + # # It may be beneficial to move them to the base class with later implementations + # def new_organism(self): + # # All taken from the meta table except parlance name. 
+ # self.organism = Organism( + # organism_id=None, # Should be autogenerated upon insertion + # species_taxonomy_id=self.get_meta_single_meta_key(self.species, "species.species_taxonomy_id"), + # taxonomy_id=self.get_meta_single_meta_key(self.species, "species.taxonomy_id"), + # display_name=self.get_meta_single_meta_key(self.species, "species.display_name"), + # scientific_name=self.get_meta_single_meta_key(self.species, "species.scientific_name"), + # url_name=self.get_meta_single_meta_key(self.species, "species.url"), + # ensembl_name=self.get_meta_single_meta_key(self.species, "species.production_name"), + # strain=self.get_meta_single_meta_key(self.species, "species.strain"), + # scientific_parlance_name=None, + # ) + # if self.organism.species_taxonomy_id is None: + # self.organism.species_taxonomy_id = self.organism.taxonomy_id + # + # def new_organism_group_and_members(self, session): + # # This method auto grabs the division name and checks for the strain groups + # division_name = self.get_meta_single_meta_key(self.species, "species.division") + # if division_name is None: + # Exception("No species.dvision found in meta table") + # division = session.execute(db.select(OrganismGroup).filter(OrganismGroup.name == division_name)).one_or_none() + # if division is None: + # group = OrganismGroup( + # organism_group_id=None, + # type="Division", + # name=division_name, + # code=None, + # ) + # else: + # group = session.query(OrganismGroup).filter(OrganismGroup.name == division_name).first() + # self.organism_group_member = OrganismGroupMember( + # organism_group_member_id=None, + # is_reference=0, + # organism_id=None, + # organism_group_id=None, + # ) + # self.organism_group_member.organism_group = group + # self.organism_group_member.organism = self.organism + # + # # Work on the strain level group members + # strain = self.get_meta_single_meta_key(self.species, "species.strain") + # strain_group = self.get_meta_single_meta_key(self.species, 
"species.strain_group") + # strain_type = self.get_meta_single_meta_key(self.species, "species.type") + # + # if strain is not None: + # if strain == 'reference': + # reference = 1 + # else: + # reference = 0 + # group_member = OrganismGroupMember( + # organism_group_member_id=None, + # is_reference=reference, + # organism_id=None, + # organism_group_id=None, + # ) + # # Check for group, if not present make it + # division = session.execute( + # db.select(OrganismGroup).filter(OrganismGroup.name == strain_group)).one_or_none() + # if division is None: + # group = OrganismGroup( + # organism_group_id=None, + # type=strain_type, + # name=strain_group, + # code=None, + # ) + # + # else: + # group = session.query(OrganismGroup).filter(OrganismGroup.name == strain_group).first() + # group_member.organism_group = group + # group_member.organism = self.organism + # + # def new_genome(self): + # # Data for the update function. + # self.genome = Genome( + # genome_id=None, # Should be autogenerated upon insertion + # genome_uuid=str(uuid.uuid4()), + # assembly_id=None, # Update the assembly before inserting and grab the assembly key + # organism_id=None, # Update the organism before inserting and grab the organism_id + # created=func.now(), # Replace all of them with sqlalchemy func.now() + # ) + # + # def new_genome_release(self): + # # Genome Release + # self.genome_release = GenomeRelease( + # genome_release_id=None, # Should be autogenerated upon insertion + # genome_id=None, # Update the genome before inserting and grab the genome_id + # release_id=None, + # is_current=self.listed_release_is_current, + # ) + # + # def new_assembly(self): + # level = None + # with self.db.session_scope() as session: + # level = (session.execute(db.select(CoordSystem.name).filter( + # CoordSystem.species_id == self.species).order_by(CoordSystem.rank)).all())[0][0] + # + # self.assembly = Assembly( + # assembly_id=None, # Should be autogenerated upon insertion + # 
ucsc_name=self.get_meta_single_meta_key(self.species, "assembly.ucsc_alias"), + # accession=self.get_meta_single_meta_key(self.species, "assembly.accession"), + # level=level, + # name=self.get_meta_single_meta_key(self.species, "assembly.name"), + # accession_body=None, # Not implemented yet + # assembly_default=self.get_meta_single_meta_key(self.species, "assembly.default"), + # created=func.now(), + # ensembl_name=self.get_meta_single_meta_key(self.species, "assembly.name"), + # ) + + # def new_assembly_dataset(self): + # self.assembly_dataset = Dataset( + # dataset_id=None, # Should be autogenerated upon insertion + # dataset_uuid=str(uuid.uuid4()), + # dataset_type_id=None, # extract from dataset_type + # name="assembly", + # version=None, + # created=func.now(), + # dataset_source_id=None, # extract from dataset_source + # label=self.assembly.accession, + # status='Submitted', + # ) + # + # def new_assembly_sequence(self): + # self.assembly_sequences = [] + # with self.db.session_scope() as session: + # # Alias the seq_region_attrib and seq_region_synonym tables + # sra1 = aliased(SeqRegionAttrib) + # sra3 = aliased(SeqRegionAttrib) + # + # results = ( + # session.query(SeqRegion.name, SeqRegionSynonym.synonym, SeqRegion.length, + # CoordSystem.name, + # sra3.value, + # ) + # .join(CoordSystem, SeqRegion.coord_system_id == CoordSystem.coord_system_id) + # .join(Meta, CoordSystem.species_id == Meta.species_id) + # .join(sra1, SeqRegion.seq_region_id == sra1.seq_region_id) + # .outerjoin(SeqRegionSynonym, and_(SeqRegion.seq_region_id == SeqRegionSynonym.seq_region_id, + # SeqRegionSynonym.external_db_id == 50710)) + # .outerjoin(sra3, and_(SeqRegion.seq_region_id == sra3.seq_region_id, + # sra3.attrib_type_id == 547)) + # .filter(Meta.meta_key == 'assembly.accession', sra1.attrib_type_id == 6, + # Meta.species_id == self.species) + # ).all() + # for data in results: + # # If the name does not match normal accession formating, then use that name. 
+ # name = None + # if re.match(r'^[a-zA-Z]+\d+\.\d+', data[0]): + # name = None + # else: + # name = data[0] + # # Nab accession from the seq region synonym or else the name. + # accession = None + # if data[1] is not None and re.match(r'^[a-zA-Z]+\d+\.\d+', data[1]): + # accession = data[1] + # elif name is not None: + # accession = name + # else: + # accession = data[0] + # + # chromosomal = 0 + # if data[3] == 'chromosome': + # chromosomal = 1 + # + # sequence_location = None + # if data[4] == 'nuclear_chromosome': + # sequence_location = 'SO:0000738' + # elif data[4] == 'mitochondrial_chromosome': + # sequence_location = 'SO:0000737' + # elif data[4] == 'chloroplast_chromosome': + # sequence_location = 'SO:0000745' + # elif data[4] is None: + # sequence_location = 'SO:0000738' + # else: + # raise Exception('Error with sequence location: ' + data[4] + ' is not a valid type') + # + # self.assembly_sequences.append(AssemblySequence( + # assembly_sequence_id=None, # Should be autogenerated upon insertion + # name=name, + # assembly_id=None, # Update the assembly before inserting and grab the assembly_id + # accession=accession, + # chromosomal=chromosomal, + # length=data[2], + # sequence_location=sequence_location, + # # These two get populated in the core stats pipeline. 
+ # sequence_checksum=None, + # ga4gh_identifier=None, + # )) + # + # def new_genome_dataset(self): + # self.genome_dataset = GenomeDataset( + # genome_dataset_id=None, # Should be autogenerated upon insertion + # dataset_id=None, # extract from dataset once genertated + # genome_id=None, # extract from genome once genertated + # release_id=None, # extract from release once genertated + # is_current=self.listed_release_is_current, + # ) + # + # def new_dataset_source(self): + # self.dataset_source = DatasetSource( + # dataset_source_id=None, # Should be autogenerated upon insertion + # type=self.db_type, # core/fungen etc + # name=make_url(self.db_uri).database # dbname + # ) + # + # def new_datasets(self): + # self.datasets = [] + # # Genebuild. + # label = self.get_meta_single_meta_key(self.species, "genebuild.last_geneset_update") + # if label is None: + # label = self.get_meta_single_meta_key(self.species, "genebuild.start_date") + # self.datasets.append(Dataset( + # dataset_id=None, # Should be autogenerated upon insertion + # dataset_uuid=str(uuid.uuid4()), + # dataset_type_id=None, # extract from dataset_type + # name="genebuild", + # version=self.get_meta_single_meta_key(self.species, "gencode.version"), + # created=func.now(), + # dataset_source_id=None, # extract from dataset_source + # label=label, + # status='Submitted', + # )) + # Protein Features From bc6601b892258c7961e63c9aa6f3b8b5fb09afe2 Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 12 Jul 2023 17:05:24 +0100 Subject: [PATCH 02/20] Fiadded new genset and assembly updates --- .../production/metadata/updater/base.py | 15 ++ .../production/metadata/updater/core.py | 215 +++++++++++++++--- 2 files changed, 200 insertions(+), 30 deletions(-) diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index 96dafc62..49d72e2c 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -42,6 
+42,21 @@ def get_meta_single_meta_key(self, species_id, parameter): else: return result[0] + def get_meta_list_from_prefix_meta_key(self, species_id, prefix): + with self.db.session_scope() as session: + query = session.query(Meta.meta_key, Meta.meta_value).filter( + Meta.meta_key.like(f'{prefix}%'), + Meta.species_id == species_id + ) + result = query.all() + if not result: + return None + else: + # Build a dictionary out of the results. + result_dict = {key: value for key, value in result} + return result_dict + + def get_or_new_source(self, meta_session, db_uri, db_type): name = make_url(db_uri).database dataset_source = meta_session.query(DatasetSource).filter(DatasetSource.name == name).one_or_none() diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 23c4e557..9eccf41a 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -12,10 +12,11 @@ import logging import re import uuid +from collections import defaultdict import sqlalchemy as db from ensembl.core.models import Meta, Assembly as AssemblyCore, CoordSystem, SeqRegionAttrib, SeqRegion, \ - SeqRegionSynonym + SeqRegionSynonym, AttribType, ExternalDb from sqlalchemy import select, update, func, and_ from sqlalchemy.engine import make_url from sqlalchemy.orm import aliased @@ -47,8 +48,8 @@ def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): self.genome = None self.genome_release = None - self.genome_dataset = None - self.datasets = None # array + self.genebuild_dataset_attributes = None # array + self.genebuild_dataset = None self.dataset_type = None self.dataset_source = None self.attribute = None @@ -81,13 +82,28 @@ def process_core(self, **kwargs): self.process_species(species, metadata_uri, taxonomy_uri) def process_species(self, species, metadata_uri, taxonomy_uri, db_uri): - # new functions + """ + Process an individual species from a core database to update 
the metadata db. + This method contains the logic for updating the metadata + """ meta_conn = DBConnection(metadata_uri) with meta_conn.session_scope() as meta_session: self.organism, organism_status = self.get_or_new_organism(species, meta_session, metadata_uri, taxonomy_uri) self.assembly, self.assembly_dataset, self.assembly_dataset_attributes, self.assembly_sequences, \ self.dataset_source, assembly_status = self.get_or_new_assembly(species, meta_session, metadata_uri, db_uri) + #Add release check here! + self.genebuild_dataset, self.genebuild_dataset_attributes = self.new_genebuild(species, + meta_session, db_uri) + + ###############Check if all sources are the same and it has been released ################ + + # If all the sources are the same and it hasn't been released: Delete corresponding genome and create new + + + # if new assembly, create new genome. + + print(self.organism, organism_status) def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri): @@ -103,10 +119,10 @@ def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri) # Instantiate a new Organism object using data fetched from metadata. 
new_organism = Organism( - species_taxonomy_id=self.get_meta_single_meta_key(species, "species.species_taxonomy_id"), #REQURIED - taxonomy_id=self.get_meta_single_meta_key(species, "species.taxonomy_id"),#REQURIED - display_name=self.get_meta_single_meta_key(species, "species.display_name"),#REQURIED , MAY BE DELETED - scientific_name=self.get_meta_single_meta_key(species, "species.scientific_name"),#REQURIED + species_taxonomy_id=self.get_meta_single_meta_key(species, "species.species_taxonomy_id"), # REQURIED + taxonomy_id=self.get_meta_single_meta_key(species, "species.taxonomy_id"), # REQURIED + display_name=self.get_meta_single_meta_key(species, "species.display_name"), # REQURIED , MAY BE DELETED + scientific_name=self.get_meta_single_meta_key(species, "species.scientific_name"), # REQURIED url_name=self.get_meta_single_meta_key(species, "species.url"), ensembl_name=ensembl_name, strain=self.get_meta_single_meta_key(species, "species.strain"), @@ -161,6 +177,80 @@ def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri) # Return the newly created Organism and indicate that it is new. return new_organism, "New" + def get_assembly_sequences(self, species, assembly): + """ + Get the assembly sequences and the values that correspond to the metadata table + """ + assembly_sequences = [] + with self.db.session_scope() as session: + # Create an alias for SeqRegionAttrib and AttribType to be used for sequence_location + SeqRegionAttribAlias = aliased(SeqRegionAttrib) + AttribTypeAlias = aliased(AttribType) + + # One complicated query to get all the data. Otherwise this takes far too long to do. 
+ results = (session.query(SeqRegion.name, SeqRegion.length, CoordSystem.name, + SeqRegionAttribAlias.value, SeqRegionSynonym.synonym, ExternalDb.db_name) + .join(SeqRegion.coord_system) + .join(SeqRegion.seq_region_attrib) + .join(SeqRegionAttrib.attrib_type) + .join(CoordSystem.meta) + .outerjoin(SeqRegion.seq_region_synonym) + .outerjoin(SeqRegionSynonym.external_db) + .join(SeqRegionAttribAlias, SeqRegion.seq_region_attrib) # join with SeqRegionAttribAlias + .outerjoin(AttribTypeAlias, SeqRegionAttribAlias.attrib_type) # join with AttribTypeAlias + .filter(Meta.species_id == species) + .filter(AttribType.code == "toplevel") # ensure toplevel + .filter(AttribTypeAlias.code == "sequence_location").all()) # ensure sequence_location + + # Create a dictionary so that the results can have multiple synonyms per line and only one SeqRegion + results_dict = defaultdict(dict) + for seq_region_name, seq_region_length, coord_system_name, sequence_location, synonym, db_name in results: + key = (seq_region_name, seq_region_length, coord_system_name, sequence_location) + results_dict[key][synonym] = db_name + + for ( + seq_region_name, seq_region_length, coord_system_name, + sequence_location), synonyms in results_dict.items(): + # Test if chromosomal: + if coord_system_name == "chromosome": + chromosomal = 1 + else: + chromosomal = 0 + # Test to see if the seq_name follows accession standards (99% of sequences) + if re.match(r'^[a-zA-Z]+\d+\.\d+', seq_region_name): + # If so assign it to accession + accession = seq_region_name + if not synonyms: + # If it doesn't have any synonyms the accession is the name. + name = accession + # otherwise join all the accessions and store them in name + ################Likely problematic in the future###################### + name = ";".join(synonyms.keys()) + else: + # For named sequences like chr1 + name = seq_region_name + for synonym, db in synonyms: + # We used to match to KnownXref, however that should not be necessary. 
Testing this way for now. + if re.match(r'^[a-zA-Z]+\d+\.\d+', synonym): + accession = synonym + else: + name = name + ";" + synonym + if accession is none: + raise Exception(f"seq_region_name {seq_region_name} accession could not be found. Please check") + assembly_sequence = AssemblySequence( + name=name, + assembly_id=assembly, + accession=accession, + chromosomal=chromosomal, + length=seq_region_length, + sequence_location=sequence_location, + # sequence_checksum="", Not implemented + # ga4gh_identifier="", Not implemented + ) + assembly_sequences.append(assembly_sequence) + return assembly_sequences + + #TODO: add in assembly override for unreleased. Call this method agiain during logic after removing old assembly. def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): # Get the new assembly assession from the core handed over assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession") @@ -173,24 +263,28 @@ def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): assembly_dataset_attributes = assembly_dataset.dataset_attributes ################################ Tests ################################# - + new_assembly_sequences = self.get_assembly_sequences(species, assembly) + assembly_sequences = assembly.assembly_sequences # assembly sequences. Count and compare to make sure that they match. - if assembly_count != new_assembly_count: + if len(assembly_sequences) != len(new_assembly_sequences): raise Exception("Number of sequences does not match number in database. " "A new assembly requires a new accession.") + ########################################################################## + dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source, "Existing" else: with self.db.session_scope() as session: - #May be problematic. Might be provided by genebuild. 
+ # May be problematic. Might be provided by genebuild. level = (session.execute(db.select(CoordSystem.name).filter( CoordSystem.species_id == species).order_by(CoordSystem.rank)).all())[0][0] assembly = Assembly( ucsc_name=self.get_meta_single_meta_key(species, "assembly.ucsc_alias"), accession=self.get_meta_single_meta_key(self.species, "assembly.accession"), level=level, +# level=self.get_meta_single_meta_key(self.species, "assembly.level"), #Not yet implemented. name=self.get_meta_single_meta_key(self.species, "assembly.name"), accession_body=self.get_meta_single_meta_key(self.species, "assembly.provider"), assembly_default=self.get_meta_single_meta_key(self.species, "assembly.default"), @@ -199,37 +293,98 @@ def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): ensembl_name=self.get_meta_single_meta_key(self.species, "assembly.name"), assembly_uuid=str(uuid.uuid4()), ) - dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() + dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") - assembly_dataset =Dataset( - dataset_uuid=str(uuid.uuid4()), - dataset_type=dataset_type, # extract from dataset_type - name="assembly", - ###version=None, Could be changed. - label=assembly.accession, #Required. Suurfulus in this case. - created=func.now(), - dataset_source=dataset_source, # extract from dataset_source - status='Submitted', - ) - assembly_ds_attributes = session.query(Meta).filter( - Meta.species_id == species, - Meta.meta_key.like('assembly.%') - ).all() - for attribute in assembly_ds_attributes: + assembly_dataset = Dataset( + dataset_uuid=str(uuid.uuid4()), + dataset_type=dataset_type, # extract from dataset_type + name="assembly", + ###version=None, Could be changed. + label=assembly.accession, # Required. 
Makes for a quick lookup + created=func.now(), + dataset_source=dataset_source, # extract from dataset_source + status='Submitted', + ) + attributes = self.get_meta_list_from_prefix_meta_key(species, "assembly") + assembly_dataset_attributes = [] + for attribute, value in attributes: + attribute.replace("assembly.", "", 1) + meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + if meta_attribute is None: + raise Exception(f"Atribute {attribute} not found. Please enter it into the db manually") dataset_attribute = DatasetAttribute( - value="", + value=value, dataset=assembly_dataset, - dataset_attribute= + attribute=meta_attribute, + ) + assembly_dataset_attributes.append(dataset_attribute) + assembly_sequences = self.get_assembly_sequences(species, assembly) + return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source, "New" - ) + def new_genebuild(self, species, meta_session, db_uri): + """ + Process an individual species from a core database to update the metadata db. + This method contains the logic for updating the metadata + This is not a get, as we don't update the metadata for genebuild, only replace it if it is not released. + """ + assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession") + genebuild_version = self.get_meta_single_meta_key(species, "genebuild.version") + genebuild_accesion = assembly_accession + "_" + genebuild_version + # genebuild_dataset = meta_session.query(Dataset).filter( + # Dataset.label == genebuild_accesion).one_or_none() + dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first() + genebuild_dataset = Dataset( + dataset_uuid=str(uuid.uuid4()), + dataset_type=dataset_type, # extract from dataset_type + name="assembly", + version=genebuild_version, + label=genebuild_accesion, # Required. 
Used for lookup in this script + created=func.now(), + dataset_source=dataset_source, # extract from dataset_source + status='Submitted', + ) + attributes = self.get_meta_list_from_prefix_meta_key(species, "genebuild.") + genebuild_dataset_attributes = [] + for attribute, value in attributes: + attribute.replace("genebuild.", "", 1) + meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + if meta_attribute is None: + raise Exception(f"Atribute {attribute} not found. Please enter it into the db manually") + dataset_attribute = DatasetAttribute( + value=value, + dataset=genebuild_dataset, + attribute=meta_attribute, + ) + genebuild_dataset_attributes.append(dataset_attribute) + return genebuild_dataset, genebuild_dataset_attributes + + + + + + attributes = self.get_meta_list_from_prefix_meta_key(self, species, "assembly") + assembly_dataset_attributes = [] + for attribute, value in attributes: + meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() + if meta_attribute is None: + raise Exception(f"Atribute {attribute} not found. Please enter it into the db manually") + dataset_attribute = DatasetAttribute( + value=value, + dataset=assembly_dataset, + attribute=meta_attribute, + ) + assembly_dataset_attributes.append(dataset_attribute) + assembly_sequences = self.get_assembly_sequences(species, assembly) + return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source, "New" # Old functions # self.new_genome() From f60e7084e4dcf9ab5067dd3af27f1dadfddc1852 Mon Sep 17 00:00:00 2001 From: danielp Date: Thu, 13 Jul 2023 15:29:20 +0100 Subject: [PATCH 03/20] Full refactor of updater before testing. 
Created release check for API datasets --- .../production/metadata/api/dataset.py | 38 + .../production/metadata/updater/core.py | 759 +++++------------- 2 files changed, 229 insertions(+), 568 deletions(-) create mode 100644 src/ensembl/production/metadata/api/dataset.py diff --git a/src/ensembl/production/metadata/api/dataset.py b/src/ensembl/production/metadata/api/dataset.py new file mode 100644 index 00000000..bac2f553 --- /dev/null +++ b/src/ensembl/production/metadata/api/dataset.py @@ -0,0 +1,38 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sqlalchemy as db +from sqlalchemy.engine import make_url + +from ensembl.production.metadata.api.base import BaseAdaptor +from ensembl.production.metadata.api.models import GenomeDataset, Dataset +import logging + +logger = logging.getLogger(__name__) + + +class DatasetAdaptor(BaseAdaptor): + def __init__(self, metadata_uri): + super().__init__(metadata_uri) + + def check_release_status(self, dataset_uuid): + with self.metadata_db.session_scope() as session: + # Query to check if a release_id exists for the given genome_uuid + dataset_id = session.query(Dataset.dataset_id).filter(Dataset.dataset_uuid == dataset_uuid).scalar() + if dataset_id is None: + return "UUID not found" + + # Now we check if there exists a genome dataset with the corresponding dataset_id and a non-null release_id + result = session.query( + session.query(GenomeDataset).filter(GenomeDataset.dataset_id == dataset_id, + GenomeDataset.release_id.isnot(None)).exists() + ).scalar() + return result diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 9eccf41a..8cbc8b3c 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -9,36 +9,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import re -import uuid from collections import defaultdict - import sqlalchemy as db -from ensembl.core.models import Meta, Assembly as AssemblyCore, CoordSystem, SeqRegionAttrib, SeqRegion, \ +from ensembl.core.models import Meta, CoordSystem, SeqRegionAttrib, SeqRegion, \ SeqRegionSynonym, AttribType, ExternalDb -from sqlalchemy import select, update, func, and_ -from sqlalchemy.engine import make_url +from sqlalchemy import select, func from sqlalchemy.orm import aliased from ensembl.database import DBConnection from sqlalchemy.exc import NoResultFound -import sys from ensembl.production.metadata.api.genome import GenomeAdaptor +from ensembl.production.metadata.api.dataset import DatasetAdaptor from ensembl.production.metadata.api.models import * from ensembl.production.metadata.updater.base import BaseMetaUpdater -##TODO: -# Prevent deletion of release data. -# Logic: -##Create new organism on new production name if no ensembl name. If ensembl name is given create new if none, if already exists create new genome based on production name. -##Check that taxid is present in db. - - class CoreMetaUpdater(BaseMetaUpdater): - def __init__(self, db_uri, metadata_uri, taxonomy_uri, release=None): - # Each of these objects represents a table in the database to store data in as eith0er an array or a single object. + def __init__(self, db_uri, metadata_uri, taxonomy_uri): + # Each of these objects represents a table in the database to store data in as an array or a single object. 
self.organism = None + self.division = None + self.organism_group_member = None self.metadata_uri = metadata_uri self.taxonomy_uri = taxonomy_uri self.assembly = None @@ -62,7 +53,7 @@ def process_core(self, **kwargs): sel_species = kwargs.get('species', None) metadata_uri = kwargs.get('metadata_uri', self.metadata_uri) taxonomy_uri = kwargs.get('metadata_uri', self.taxonomy_uri) - + db_uri = kwargs.get('db_uri') if sel_species: with self.db.session_scope() as session: multi_species = session.execute( @@ -71,7 +62,7 @@ def process_core(self, **kwargs): ) else: # Normal handling of collections from here - # Handle multispecies databases and run an update for each species + # Handle multi-species databases and run an update for each species with self.db.session_scope() as session: multi_species = session.execute( select(Meta.species_id).filter(Meta.meta_key == "species.production_name").distinct() @@ -79,7 +70,7 @@ def process_core(self, **kwargs): multi_species = [multi_species for multi_species, in multi_species] for species in multi_species: - self.process_species(species, metadata_uri, taxonomy_uri) + self.process_species(species, metadata_uri, taxonomy_uri, db_uri) def process_species(self, species, metadata_uri, taxonomy_uri, db_uri): """ @@ -88,23 +79,150 @@ def process_species(self, species, metadata_uri, taxonomy_uri, db_uri): """ meta_conn = DBConnection(metadata_uri) with meta_conn.session_scope() as meta_session: - self.organism, organism_status = self.get_or_new_organism(species, meta_session, metadata_uri, taxonomy_uri) + self.organism, self.division, self.organism_group_member, organism_status = \ + self.get_or_new_organism(species, meta_session, metadata_uri, taxonomy_uri) self.assembly, self.assembly_dataset, self.assembly_dataset_attributes, self.assembly_sequences, \ - self.dataset_source, assembly_status = self.get_or_new_assembly(species, meta_session, metadata_uri, - db_uri) - #Add release check here! 
- self.genebuild_dataset, self.genebuild_dataset_attributes = self.new_genebuild(species, - meta_session, db_uri) - - ###############Check if all sources are the same and it has been released ################ - - # If all the sources are the same and it hasn't been released: Delete corresponding genome and create new - - - # if new assembly, create new genome. - - - print(self.organism, organism_status) + self.dataset_source, assembly_status = self.get_or_new_assembly(species, meta_session, db_uri) + # Add release check here! + self.genebuild_dataset, self.genebuild_dataset_attributes, \ + genebuild_status = self.new_genebuild(species, meta_session, db_uri) + + conn = DatasetAdaptor(metadata_uri=metadata_uri) + genebuild_release_status = conn.check_release_status(self.genebuild_dataset.dataset_uuid) + + if organism_status == "New": + # ###############################Checks that dataset and assembly are new ################## + if assembly_status != "New" or genebuild_status != "New": + raise Exception("New organism, but existing assembly accession and/or genebuild version") + ############################################### + # Create genome and populate the database with organism, assembly and dataset + meta_session.add(self.assembly) + meta_session.add_all(self.assembly_sequences) + meta_session.add(self.organism) + meta_session.add(self.division) + meta_session.add(self.organism_group_member) + meta_session.add(self.dataset_source) + meta_session.add(self.assembly_dataset) + meta_session.add_all(self.assembly_dataset_attributes) + meta_session.add(self.genebuild_dataset) + meta_session.add_all(self.genebuild_dataset_attributes) + new_genome = Genome( + genome_uuid=str(uuid.uuid4()), + assembly=self.assembly, + organism=self.organism, + created=func.now(), + ) + meta_session.add(new_genome) + assembly_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=self.assembly_dataset, + is_current=True, + ) + meta_session.add(assembly_genome_dataset) + 
genebuild_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=self.genebuild_dataset, + is_current=True, + ) + meta_session.add(genebuild_genome_dataset) + + elif assembly_status == "New": + # ###############################Checks that dataset and update are new ################## + if genebuild_status != "New": + raise Exception("New assembly, but existing genebuild version") + ############################################### + + meta_session.add(self.assembly) + meta_session.add_all(self.assembly_sequences) + meta_session.add(self.dataset_source) + meta_session.add(self.assembly_dataset) + meta_session.add_all(self.assembly_dataset_attributes) + meta_session.add(self.genebuild_dataset) + meta_session.add_all(self.genebuild_dataset_attributes) + new_genome = Genome( + genome_uuid=str(uuid.uuid4()), + assembly=self.assembly, + organism=self.organism, + created=func.now(), + ) + meta_session.add(new_genome) + assembly_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=self.assembly_dataset, + is_current=True, + ) + meta_session.add(assembly_genome_dataset) + genebuild_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=self.genebuild_dataset, + is_current=True, + ) + meta_session.add(genebuild_genome_dataset) + + # Create genome and populate the database with assembly and dataset + elif genebuild_status == "New": + # Create genome and populate the database with genebuild dataset + meta_session.add(self.dataset_source) + meta_session.add(self.genebuild_dataset) + meta_session.add_all(self.genebuild_dataset_attributes) + new_genome = Genome( + genome_uuid=str(uuid.uuid4()), + assembly=self.assembly, + organism=self.organism, + created=func.now(), + ) + meta_session.add(new_genome) + genebuild_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=self.genebuild_dataset, + is_current=True, + ) + meta_session.add(genebuild_genome_dataset) + else: + # Check if the data has been released: + if genebuild_release_status is True: + 
raise Exception("Existing Organism, Assembly, and Datasets within a release") + else: + # Delete the data from the database and repopulate assembly and genebuild. + genome_dataset = meta_session.query(GenomeDataset).join(Dataset).filter( + Dataset.dataset_uuid == self.assembly_dataset.assembly_uuid).first() + bad_genome = meta_session.query(Genome).get(genome_dataset.genome_id) + meta_session.delete(bad_genome) + meta_session.commit() + + # Create genome and populate the database with assembly and dataset + self.assembly, self.assembly_dataset, self.assembly_dataset_attributes, self.assembly_sequences, \ + self.dataset_source, assembly_status = self.get_or_new_assembly(species, meta_session, db_uri) + self.genebuild_dataset, self.genebuild_dataset_attributes, genebuild_status = self.new_genebuild( + species, + meta_session, + db_uri) + meta_session.add(self.assembly) + meta_session.add_all(self.assembly_sequences) + meta_session.add(self.dataset_source) + meta_session.add(self.assembly_dataset) + meta_session.add_all(self.assembly_dataset_attributes) + meta_session.add(self.genebuild_dataset) + meta_session.add_all(self.genebuild_dataset_attributes) + new_genome = Genome( + genome_uuid=str(uuid.uuid4()), + assembly=self.assembly, + organism=self.organism, + created=func.now(), + ) + meta_session.add(new_genome) + assembly_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=self.assembly_dataset, + is_current=True, + ) + meta_session.add(assembly_genome_dataset) + genebuild_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=self.genebuild_dataset, + is_current=True, + ) + meta_session.add(genebuild_genome_dataset) def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri): """ @@ -119,10 +237,10 @@ def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri) # Instantiate a new Organism object using data fetched from metadata. 
new_organism = Organism( - species_taxonomy_id=self.get_meta_single_meta_key(species, "species.species_taxonomy_id"), # REQURIED - taxonomy_id=self.get_meta_single_meta_key(species, "species.taxonomy_id"), # REQURIED - display_name=self.get_meta_single_meta_key(species, "species.display_name"), # REQURIED , MAY BE DELETED - scientific_name=self.get_meta_single_meta_key(species, "species.scientific_name"), # REQURIED + species_taxonomy_id=self.get_meta_single_meta_key(species, "species.species_taxonomy_id"), + taxonomy_id=self.get_meta_single_meta_key(species, "species.taxonomy_id"), + display_name=self.get_meta_single_meta_key(species, "species.display_name"), + scientific_name=self.get_meta_single_meta_key(species, "species.scientific_name"), url_name=self.get_meta_single_meta_key(species, "species.url"), ensembl_name=ensembl_name, strain=self.get_meta_single_meta_key(species, "species.strain"), @@ -132,10 +250,14 @@ def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri) # Query the metadata database to find if an Organism with the same Ensembl name already exists. old_organism = meta_session.query(Organism).filter( Organism.ensembl_name == new_organism.ensembl_name).one_or_none() - + division_name = self.get_meta_single_meta_key(species, "species.division") + division = meta_session.query(OrganismGroup).filter(OrganismGroup.name == division_name).one_or_none() + organism_group_member = meta_session.query(OrganismGroupMember).filter( + OrganismGroupMember.organism_id == old_organism.organism_id, + OrganismGroupMember.organism_group_id == division.organism_group_id).one_or_none() # If an existing Organism is found, return it and indicate that it already existed. if old_organism: - return old_organism, "Existing" + return old_organism, division, organism_group_member, "Existing" else: # If no existing Organism is found, conduct additional checks before creating a new one. 
@@ -154,12 +276,10 @@ def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri) "Assembly Accession already exists for a different organism. Please do a manual update.") # Fetch the division name of the new organism from metadata. - division_name = self.get_meta_single_meta_key(species, "species.division") if division_name is None: Exception("No species.division found in meta table") # Query the metadata database to find if an OrganismGroup with the same division name already exists. - division = meta_session.query(OrganismGroup).filter(OrganismGroup.name == division_name).one_or_none() if division is None: # If no such OrganismGroup exists, create a new one. division = OrganismGroup( @@ -175,7 +295,7 @@ def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri) ) # Return the newly created Organism and indicate that it is new. - return new_organism, "New" + return new_organism, division, organism_group_member, "New" def get_assembly_sequences(self, species, assembly): """ @@ -187,7 +307,7 @@ def get_assembly_sequences(self, species, assembly): SeqRegionAttribAlias = aliased(SeqRegionAttrib) AttribTypeAlias = aliased(AttribType) - # One complicated query to get all the data. Otherwise this takes far too long to do. + # One complicated query to get all the data. Otherwise, this takes far too long to do. results = (session.query(SeqRegion.name, SeqRegion.length, CoordSystem.name, SeqRegionAttribAlias.value, SeqRegionSynonym.synonym, ExternalDb.db_name) .join(SeqRegion.coord_system) @@ -223,9 +343,10 @@ def get_assembly_sequences(self, species, assembly): if not synonyms: # If it doesn't have any synonyms the accession is the name. 
name = accession + else: + name = ";".join(synonyms.keys()) # otherwise join all the accessions and store them in name - ################Likely problematic in the future###################### - name = ";".join(synonyms.keys()) + # ###############Likely problematic in the future###################### else: # For named sequences like chr1 name = seq_region_name @@ -235,7 +356,7 @@ def get_assembly_sequences(self, species, assembly): accession = synonym else: name = name + ";" + synonym - if accession is none: + if accession is None: raise Exception(f"seq_region_name {seq_region_name} accession could not be found. Please check") assembly_sequence = AssemblySequence( name=name, @@ -250,9 +371,8 @@ def get_assembly_sequences(self, species, assembly): assembly_sequences.append(assembly_sequence) return assembly_sequences - #TODO: add in assembly override for unreleased. Call this method agiain during logic after removing old assembly. - def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): - # Get the new assembly assession from the core handed over + def get_or_new_assembly(self, species, meta_session, db_uri): + # Get the new assembly accession from the core handed over assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession") assembly = meta_session.query(Assembly).filter(Assembly.accession == assembly_accession).one_or_none() @@ -262,18 +382,18 @@ def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): # I should not need this, but double check on database updating. assembly_dataset_attributes = assembly_dataset.dataset_attributes - ################################ Tests ################################# + # ############################### Tests ################################# new_assembly_sequences = self.get_assembly_sequences(species, assembly) assembly_sequences = assembly.assembly_sequences # assembly sequences. Count and compare to make sure that they match. 
if len(assembly_sequences) != len(new_assembly_sequences): raise Exception("Number of sequences does not match number in database. " "A new assembly requires a new accession.") - ########################################################################## + # ######################################################################### dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") - return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source, "Existing" - + return assembly, assembly_dataset, assembly_dataset_attributes, \ + assembly_sequences, dataset_source, "Existing" else: with self.db.session_scope() as session: @@ -282,15 +402,15 @@ def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): CoordSystem.species_id == species).order_by(CoordSystem.rank)).all())[0][0] assembly = Assembly( ucsc_name=self.get_meta_single_meta_key(species, "assembly.ucsc_alias"), - accession=self.get_meta_single_meta_key(self.species, "assembly.accession"), + accession=self.get_meta_single_meta_key(species, "assembly.accession"), level=level, -# level=self.get_meta_single_meta_key(self.species, "assembly.level"), #Not yet implemented. - name=self.get_meta_single_meta_key(self.species, "assembly.name"), - accession_body=self.get_meta_single_meta_key(self.species, "assembly.provider"), - assembly_default=self.get_meta_single_meta_key(self.species, "assembly.default"), - tol_id=self.get_meta_single_meta_key(self.species, "assembly.tol_id"), # Not implemented yet + # level=self.get_meta_single_meta_key(self.species, "assembly.level"), #Not yet implemented. 
+ name=self.get_meta_single_meta_key(species, "assembly.name"), + accession_body=self.get_meta_single_meta_key(species, "assembly.provider"), + assembly_default=self.get_meta_single_meta_key(species, "assembly.default"), + tol_id=self.get_meta_single_meta_key(species, "assembly.tol_id"), # Not implemented yet created=func.now(), - ensembl_name=self.get_meta_single_meta_key(self.species, "assembly.name"), + ensembl_name=self.get_meta_single_meta_key(species, "assembly.name"), assembly_uuid=str(uuid.uuid4()), ) dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() @@ -300,7 +420,7 @@ def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): dataset_uuid=str(uuid.uuid4()), dataset_type=dataset_type, # extract from dataset_type name="assembly", - ###version=None, Could be changed. + # version=None, Could be changed. label=assembly.accession, # Required. Makes for a quick lookup created=func.now(), dataset_source=dataset_source, # extract from dataset_source @@ -313,7 +433,7 @@ def get_or_new_assembly(self, species, meta_session, metadata_uri, db_uri): attribute.replace("assembly.", "", 1) meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() if meta_attribute is None: - raise Exception(f"Atribute {attribute} not found. Please enter it into the db manually") + raise Exception(f"Attribute {attribute} not found. 
Please enter it into the db manually") dataset_attribute = DatasetAttribute( value=value, dataset=assembly_dataset, @@ -333,9 +453,9 @@ def new_genebuild(self, species, meta_session, db_uri): """ assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession") genebuild_version = self.get_meta_single_meta_key(species, "genebuild.version") - genebuild_accesion = assembly_accession + "_" + genebuild_version + genebuild_accession = assembly_accession + "_" + genebuild_version # genebuild_dataset = meta_session.query(Dataset).filter( - # Dataset.label == genebuild_accesion).one_or_none() + # Dataset.label == genebuild_accession).one_or_none() dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first() genebuild_dataset = Dataset( @@ -343,7 +463,7 @@ def new_genebuild(self, species, meta_session, db_uri): dataset_type=dataset_type, # extract from dataset_type name="assembly", version=genebuild_version, - label=genebuild_accesion, # Required. Used for lookup in this script + label=genebuild_accession, # Required. Used for lookup in this script created=func.now(), dataset_source=dataset_source, # extract from dataset_source status='Submitted', @@ -354,7 +474,7 @@ def new_genebuild(self, species, meta_session, db_uri): attribute.replace("genebuild.", "", 1) meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() if meta_attribute is None: - raise Exception(f"Atribute {attribute} not found. Please enter it into the db manually") + raise Exception(f"Attribute {attribute} not found. 
Please enter it into the db manually") dataset_attribute = DatasetAttribute( value=value, dataset=genebuild_dataset, @@ -362,503 +482,6 @@ def new_genebuild(self, species, meta_session, db_uri): ) genebuild_dataset_attributes.append(dataset_attribute) - - return genebuild_dataset, genebuild_dataset_attributes - - - - - - attributes = self.get_meta_list_from_prefix_meta_key(self, species, "assembly") - assembly_dataset_attributes = [] - for attribute, value in attributes: - meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() - if meta_attribute is None: - raise Exception(f"Atribute {attribute} not found. Please enter it into the db manually") - dataset_attribute = DatasetAttribute( - value=value, - dataset=assembly_dataset, - attribute=meta_attribute, - ) - assembly_dataset_attributes.append(dataset_attribute) - - assembly_sequences = self.get_assembly_sequences(species, assembly) - - return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source, "New" - - # Old functions - # self.new_genome() - # self.new_genome_release() - # self.new_assembly() - # self.new_assembly_sequence() - # self.new_assembly_dataset() - # self.new_dataset_source() - # self.new_genome_dataset() - # self.new_datasets() - - ######################################################################## - ##### Logic Section ######################## - ######################################################################## - - # # Species Check - # # Check for new species by checking if ensembl name is already present in the database - # if not GenomeAdaptor(metadata_uri=self.metadata_db.url, - # taxonomy_uri=self.taxonomy_uri).fetch_genomes_by_ensembl_name( - # self.organism.ensembl_name): - # # Check if the assembly accesion is already present in the database - # new_assembly_acc = self.get_meta_single_meta_key(self.species, "assembly.accession") - # with self.metadata_db.session_scope() as session: - # if 
session.query(session.query(Assembly).filter_by(accession=new_assembly_acc).exists()).scalar(): - # Exception("Assembly Accession already exists for a different organism. Please do a manual update.") - # self.create_organism() - # logging.info("Fresh Organism. Adding data to organism, genome, genome_release," - # " assembly, assembly_sequence, dataset, dataset source, and genome_dataset tables.") - # - # # Check to see if it is an updated organism. - # else: - # with self.metadata_db.session_scope() as session: - # session.expire_on_commit = False - # test_organism = session.execute(db.select(Organism).filter( - # Organism.ensembl_name == self.organism.ensembl_name)).one_or_none() - # self.organism.organism_id = Organism.organism_id - # self.organism.scientific_parlance_name = Organism.scientific_parlance_name - # - # if int(test_organism.Organism.species_taxonomy_id) == int( - # self.organism.species_taxonomy_id) and \ - # int(test_organism.Organism.taxonomy_id) == int( - # self.organism.taxonomy_id) and \ - # str(test_organism.Organism.display_name) == str( - # self.organism.display_name) and \ - # str(test_organism.Organism.scientific_name) == str( - # self.organism.scientific_name) and \ - # str(test_organism.Organism.url_name) == str( - # self.organism.url_name) and \ - # str(test_organism.Organism.strain) == str(self.organism.strain): - # logging.info("Old Organism with no change. 
No update to organism table") - # ################################################################ - # ##### Assembly Check and Update - # ################################################################ - # with self.metadata_db.session_scope() as session: - # assembly_acc = session.execute(db.select(Assembly - # ).join(Genome.assembly).join(Genome.organism).filter( - # Organism.ensembl_name == self.organism.ensembl_name)).all() - # new_assembly_acc = self.get_meta_single_meta_key(self.species, "assembly.accession") - # assembly_test = False - # for assembly_obj in assembly_acc: - # if assembly_obj[0].accession == new_assembly_acc: - # assembly_test = True - # if assembly_test: - # logging.info( - # "Old Assembly with no change. No update to Genome, genome_release, assembly, and assembly_sequence tables.") - # for dataset in self.datasets: - # with self.metadata_db.session_scope() as session: - # # Check to see if any already exist: - # # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! - # if dataset.name == "genebuild": - # dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - # Dataset.version == dataset.version, - # Dataset.label == dataset.label).first() - # if dataset_test is None: - # gb_dataset_type = session.query(DatasetType).filter( - # DatasetType.name == "genebuild").first() - # dataset.dataset_type = gb_dataset_type - # dataset.dataset_source = self.dataset_source - # session.add(dataset) - # - # else: - # logging.info("New Assembly. Updating genome, genome_release," - # " assembly, assembly_sequence, dataset, dataset source, and genome_dataset tables.") - # self.update_assembly() - # ################################################################ - # ##### dataset Check and Update - # ################################################################ - # # Dataset section. More logic will be necessary for additional datasets. 
Currently only the genebuild is listed here. - # - # - # - # - # else: - # self.update_organism() - # logging.info("Old Organism with changes. Updating organism table") - # - # def create_organism(self): - # # In this, we are assuming that with a new genome, there will be a new assemblbly. - # - # with self.metadata_db.session_scope() as session: - # # Organism section - # # Updating Organism, organism_group_member, and organism_group - # self.new_organism_group_and_members(session) - # # Add in the new assembly here - # # assembly sequence, assembly, genome, genome release. - # assembly_test = session.execute(db.select(Assembly).filter( - # Assembly.accession == self.assembly.accession)).one_or_none() - # if assembly_test is not None: - # Exception( - # "Error, existing name but, assembly accession already found. Please update the Ensembl Name in the Meta field manually") - # if self.listed_release is not None: - # release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == self.listed_release).first() - # self.genome_release.ensembl_release = release - # self.genome_release.genome = self.genome - # - # for assembly_seq in self.assembly_sequences: - # assembly_seq.assembly = self.assembly - # self.assembly.genomes.append(self.genome) - # - # self.genome.organism = self.organism - # - # # Update assembly dataset - # # Updates genome_dataset,dataset,dataset_source - # dataset_source_test = session.execute( - # db.select(DatasetSource).filter(DatasetSource.name == self.dataset_source.name)).one_or_none() - # if dataset_source_test is not None: - # Exception("Error, data already present in source") - # - # dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() - # if self.listed_release is not None: - # self.genome_dataset.ensembl_release = release - # self.genome_dataset.genome = self.genome - # self.genome_dataset.dataset = self.assembly_dataset - # - # self.assembly_dataset.dataset_type = dataset_type - # 
self.assembly_dataset.dataset_source = self.dataset_source - # - # assembly_genome_dataset = GenomeDataset( - # genome_dataset_id=None, # Should be autogenerated upon insertion - # dataset_id=None, # extract from dataset once genertated - # genome_id=None, # extract from genome once genertated - # release_id=None, # extract from release once genertated - # is_current=0, - # ) - # assembly_genome_dataset.dataset = self.assembly_dataset - # self.genome.genome_datasets.append(assembly_genome_dataset) - # - # # session.add(assembly_genome_dataset) - # - # # Dataset section. More logic will be necessary for additional datasets. Currently only the genebuild is listed here. - # for dataset in self.datasets: - # # Check to see if any already exist: - # # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! - # if dataset.name == "genebuild": - # dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - # Dataset.version == dataset.version, - # Dataset.label == dataset.label).first() - # if dataset_test is None: - # dataset.dataset_type = session.query(DatasetType).filter( - # DatasetType.name == "genebuild").first() - # dataset.dataset_source = self.dataset_source - # temp_genome_dataset = GenomeDataset( - # genome_dataset_id=None, # Should be autogenerated upon insertion - # dataset_id=None, # extract from dataset once genertated - # genome_id=None, # extract from genome once genertated - # release_id=None, # extract from release once genertated - # is_current=0, - # ) - # temp_genome_dataset.dataset = dataset - # self.genome.genome_datasets.append(temp_genome_dataset) - # # Add everything to the database. Closing the session commits it. 
- # session.add(self.organism) - # - # def update_organism(self): - # with self.metadata_db.session_scope() as session: - # session.execute( - # update(Organism).where(Organism.ensembl_name == self.organism.ensembl_name).values( - # species_taxonomy_id=self.organism.species_taxonomy_id, - # taxonomy_id=self.organism.taxonomy_id, - # display_name=self.organism.display_name, - # scientific_name=self.organism.scientific_name, - # url_name=self.organism.url_name, - # ensembl_name=self.organism.ensembl_name, - # strain=self.organism.strain, - # )) - # - # # TODO: Add an update to the groups here. - # - # def update_assembly(self): - # # Change to new assembly/fresh - # with self.metadata_db.session_scope() as session: - # # Get the genome - # self.organism = session.query(Organism).filter( - # Organism.ensembl_name == self.organism.ensembl_name).first() - # self.genome.organism = self.organism - # - # if self.listed_release is not None: - # release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == self.listed_release).first() - # self.genome_release.ensembl_release = release - # self.genome_release.genome = self.genome - # - # self.assembly.genomes.append(self.genome) - # - # # Update assembly dataset - # # Updates genome_dataset,dataset,dataset_source - # dataset_source_test = session.execute( - # db.select(DatasetSource).filter(DatasetSource.name == self.dataset_source.name)).one_or_none() - # if dataset_source_test is not None: - # self.dataset_source = session.query(DatasetSource).filter( - # DatasetSource.name == self.dataset_source.name).first() - # - # dataset_type = session.query(DatasetType).filter(DatasetType.name == "assembly").first() - # if self.listed_release is not None: - # self.genome_dataset.ensembl_release = release - # self.genome_dataset.genome = self.genome - # self.genome_dataset.dataset = self.assembly_dataset - # - # self.assembly_dataset.dataset_type = dataset_type - # self.assembly_dataset.dataset_source = self.dataset_source 
- # - # assembly_genome_dataset = GenomeDataset( - # genome_dataset_id=None, # Should be autogenerated upon insertion - # dataset_id=None, # extract from dataset once genertated - # genome_id=None, # extract from genome once genertated - # release_id=None, # extract from release once genertated - # is_current=0, - # ) - # assembly_genome_dataset.dataset = self.assembly_dataset - # self.genome.genome_datasets.append(assembly_genome_dataset) - # - # for dataset in self.datasets: - # # Check to see if any already exist: - # # for all of genebuild in dataset, see if any have the same label (genebuild.id) and version. If so, don't update and error out here! - # if dataset.name == "genebuild": - # dataset_test = session.query(Dataset).filter(Dataset.name == "genebuild", - # Dataset.version == dataset.version, - # Dataset.label == dataset.label).first() - # dataset.dataset_type = session.query(DatasetType).filter( - # DatasetType.name == "genebuild").first() - # dataset.dataset_source = self.dataset_source - # temp_genome_dataset = GenomeDataset( - # genome_dataset_id=None, # Should be autogenerated upon insertion - # dataset_id=None, # extract from dataset once genertated - # genome_id=None, # extract from genome once genertated - # release_id=None, # extract from release once genertated - # is_current=0, - # ) - # temp_genome_dataset.dataset = dataset - # self.genome.genome_datasets.append(temp_genome_dataset) - # # Add everything to the database. Closing the session commits it. - # session.add(self.genome) - # - # # The following methods populate the data from the core into the objects. K - # # It may be beneficial to move them to the base class with later implementations - # def new_organism(self): - # # All taken from the meta table except parlance name. 
- # self.organism = Organism( - # organism_id=None, # Should be autogenerated upon insertion - # species_taxonomy_id=self.get_meta_single_meta_key(self.species, "species.species_taxonomy_id"), - # taxonomy_id=self.get_meta_single_meta_key(self.species, "species.taxonomy_id"), - # display_name=self.get_meta_single_meta_key(self.species, "species.display_name"), - # scientific_name=self.get_meta_single_meta_key(self.species, "species.scientific_name"), - # url_name=self.get_meta_single_meta_key(self.species, "species.url"), - # ensembl_name=self.get_meta_single_meta_key(self.species, "species.production_name"), - # strain=self.get_meta_single_meta_key(self.species, "species.strain"), - # scientific_parlance_name=None, - # ) - # if self.organism.species_taxonomy_id is None: - # self.organism.species_taxonomy_id = self.organism.taxonomy_id - # - # def new_organism_group_and_members(self, session): - # # This method auto grabs the division name and checks for the strain groups - # division_name = self.get_meta_single_meta_key(self.species, "species.division") - # if division_name is None: - # Exception("No species.dvision found in meta table") - # division = session.execute(db.select(OrganismGroup).filter(OrganismGroup.name == division_name)).one_or_none() - # if division is None: - # group = OrganismGroup( - # organism_group_id=None, - # type="Division", - # name=division_name, - # code=None, - # ) - # else: - # group = session.query(OrganismGroup).filter(OrganismGroup.name == division_name).first() - # self.organism_group_member = OrganismGroupMember( - # organism_group_member_id=None, - # is_reference=0, - # organism_id=None, - # organism_group_id=None, - # ) - # self.organism_group_member.organism_group = group - # self.organism_group_member.organism = self.organism - # - # # Work on the strain level group members - # strain = self.get_meta_single_meta_key(self.species, "species.strain") - # strain_group = self.get_meta_single_meta_key(self.species, 
"species.strain_group") - # strain_type = self.get_meta_single_meta_key(self.species, "species.type") - # - # if strain is not None: - # if strain == 'reference': - # reference = 1 - # else: - # reference = 0 - # group_member = OrganismGroupMember( - # organism_group_member_id=None, - # is_reference=reference, - # organism_id=None, - # organism_group_id=None, - # ) - # # Check for group, if not present make it - # division = session.execute( - # db.select(OrganismGroup).filter(OrganismGroup.name == strain_group)).one_or_none() - # if division is None: - # group = OrganismGroup( - # organism_group_id=None, - # type=strain_type, - # name=strain_group, - # code=None, - # ) - # - # else: - # group = session.query(OrganismGroup).filter(OrganismGroup.name == strain_group).first() - # group_member.organism_group = group - # group_member.organism = self.organism - # - # def new_genome(self): - # # Data for the update function. - # self.genome = Genome( - # genome_id=None, # Should be autogenerated upon insertion - # genome_uuid=str(uuid.uuid4()), - # assembly_id=None, # Update the assembly before inserting and grab the assembly key - # organism_id=None, # Update the organism before inserting and grab the organism_id - # created=func.now(), # Replace all of them with sqlalchemy func.now() - # ) - # - # def new_genome_release(self): - # # Genome Release - # self.genome_release = GenomeRelease( - # genome_release_id=None, # Should be autogenerated upon insertion - # genome_id=None, # Update the genome before inserting and grab the genome_id - # release_id=None, - # is_current=self.listed_release_is_current, - # ) - # - # def new_assembly(self): - # level = None - # with self.db.session_scope() as session: - # level = (session.execute(db.select(CoordSystem.name).filter( - # CoordSystem.species_id == self.species).order_by(CoordSystem.rank)).all())[0][0] - # - # self.assembly = Assembly( - # assembly_id=None, # Should be autogenerated upon insertion - # 
ucsc_name=self.get_meta_single_meta_key(self.species, "assembly.ucsc_alias"), - # accession=self.get_meta_single_meta_key(self.species, "assembly.accession"), - # level=level, - # name=self.get_meta_single_meta_key(self.species, "assembly.name"), - # accession_body=None, # Not implemented yet - # assembly_default=self.get_meta_single_meta_key(self.species, "assembly.default"), - # created=func.now(), - # ensembl_name=self.get_meta_single_meta_key(self.species, "assembly.name"), - # ) - - # def new_assembly_dataset(self): - # self.assembly_dataset = Dataset( - # dataset_id=None, # Should be autogenerated upon insertion - # dataset_uuid=str(uuid.uuid4()), - # dataset_type_id=None, # extract from dataset_type - # name="assembly", - # version=None, - # created=func.now(), - # dataset_source_id=None, # extract from dataset_source - # label=self.assembly.accession, - # status='Submitted', - # ) - # - # def new_assembly_sequence(self): - # self.assembly_sequences = [] - # with self.db.session_scope() as session: - # # Alias the seq_region_attrib and seq_region_synonym tables - # sra1 = aliased(SeqRegionAttrib) - # sra3 = aliased(SeqRegionAttrib) - # - # results = ( - # session.query(SeqRegion.name, SeqRegionSynonym.synonym, SeqRegion.length, - # CoordSystem.name, - # sra3.value, - # ) - # .join(CoordSystem, SeqRegion.coord_system_id == CoordSystem.coord_system_id) - # .join(Meta, CoordSystem.species_id == Meta.species_id) - # .join(sra1, SeqRegion.seq_region_id == sra1.seq_region_id) - # .outerjoin(SeqRegionSynonym, and_(SeqRegion.seq_region_id == SeqRegionSynonym.seq_region_id, - # SeqRegionSynonym.external_db_id == 50710)) - # .outerjoin(sra3, and_(SeqRegion.seq_region_id == sra3.seq_region_id, - # sra3.attrib_type_id == 547)) - # .filter(Meta.meta_key == 'assembly.accession', sra1.attrib_type_id == 6, - # Meta.species_id == self.species) - # ).all() - # for data in results: - # # If the name does not match normal accession formating, then use that name. 
- # name = None - # if re.match(r'^[a-zA-Z]+\d+\.\d+', data[0]): - # name = None - # else: - # name = data[0] - # # Nab accession from the seq region synonym or else the name. - # accession = None - # if data[1] is not None and re.match(r'^[a-zA-Z]+\d+\.\d+', data[1]): - # accession = data[1] - # elif name is not None: - # accession = name - # else: - # accession = data[0] - # - # chromosomal = 0 - # if data[3] == 'chromosome': - # chromosomal = 1 - # - # sequence_location = None - # if data[4] == 'nuclear_chromosome': - # sequence_location = 'SO:0000738' - # elif data[4] == 'mitochondrial_chromosome': - # sequence_location = 'SO:0000737' - # elif data[4] == 'chloroplast_chromosome': - # sequence_location = 'SO:0000745' - # elif data[4] is None: - # sequence_location = 'SO:0000738' - # else: - # raise Exception('Error with sequence location: ' + data[4] + ' is not a valid type') - # - # self.assembly_sequences.append(AssemblySequence( - # assembly_sequence_id=None, # Should be autogenerated upon insertion - # name=name, - # assembly_id=None, # Update the assembly before inserting and grab the assembly_id - # accession=accession, - # chromosomal=chromosomal, - # length=data[2], - # sequence_location=sequence_location, - # # These two get populated in the core stats pipeline. 
- # sequence_checksum=None, - # ga4gh_identifier=None, - # )) - # - # def new_genome_dataset(self): - # self.genome_dataset = GenomeDataset( - # genome_dataset_id=None, # Should be autogenerated upon insertion - # dataset_id=None, # extract from dataset once genertated - # genome_id=None, # extract from genome once genertated - # release_id=None, # extract from release once genertated - # is_current=self.listed_release_is_current, - # ) - # - # def new_dataset_source(self): - # self.dataset_source = DatasetSource( - # dataset_source_id=None, # Should be autogenerated upon insertion - # type=self.db_type, # core/fungen etc - # name=make_url(self.db_uri).database # dbname - # ) - # - # def new_datasets(self): - # self.datasets = [] - # # Genebuild. - # label = self.get_meta_single_meta_key(self.species, "genebuild.last_geneset_update") - # if label is None: - # label = self.get_meta_single_meta_key(self.species, "genebuild.start_date") - # self.datasets.append(Dataset( - # dataset_id=None, # Should be autogenerated upon insertion - # dataset_uuid=str(uuid.uuid4()), - # dataset_type_id=None, # extract from dataset_type - # name="genebuild", - # version=self.get_meta_single_meta_key(self.species, "gencode.version"), - # created=func.now(), - # dataset_source_id=None, # extract from dataset_source - # label=label, - # status='Submitted', - # )) - # Protein Features + test_status = meta_session.query(Dataset).filter(Dataset.label == genebuild_accession).one_or_none() + status = "New" if test_status is None else "Existing" + return genebuild_dataset, genebuild_dataset_attributes, status From 4f8c7b643794f6eac2e7fefb0606ff93ec38ed75 Mon Sep 17 00:00:00 2001 From: danielp Date: Sun, 16 Jul 2023 20:33:38 +0100 Subject: [PATCH 04/20] Full refactor of updater before testing. 
Created release check for API datasets --- .../metadata/api/models/assembly.py | 4 +- .../production/metadata/api/models/dataset.py | 4 +- .../production/metadata/api/models/genome.py | 4 +- .../metadata/api/models/organism.py | 2 +- .../production/metadata/updater/Temporary.py | 6 - .../production/metadata/updater/base.py | 4 +- .../production/metadata/updater/core.py | 379 +++++++++--------- 7 files changed, 191 insertions(+), 212 deletions(-) delete mode 100644 src/ensembl/production/metadata/updater/Temporary.py diff --git a/src/ensembl/production/metadata/api/models/assembly.py b/src/ensembl/production/metadata/api/models/assembly.py index c840d47a..503912b1 100644 --- a/src/ensembl/production/metadata/api/models/assembly.py +++ b/src/ensembl/production/metadata/api/models/assembly.py @@ -35,9 +35,9 @@ class Assembly(Base): alt_accession = Column(String(16), nullable=True) # One to many relationships # assembly_id within assembly_sequence - assembly_sequences = relationship("AssemblySequence", back_populates="assembly") + assembly_sequences = relationship("AssemblySequence", back_populates="assembly", cascade="all, delete, delete-orphan") # assembly_id within genome - genomes = relationship("Genome", back_populates="assembly") + genomes = relationship("Genome", back_populates="assembly", cascade="all, delete, delete-orphan") class AssemblySequence(Base): diff --git a/src/ensembl/production/metadata/api/models/dataset.py b/src/ensembl/production/metadata/api/models/dataset.py index 4a458b12..cd90dea2 100644 --- a/src/ensembl/production/metadata/api/models/dataset.py +++ b/src/ensembl/production/metadata/api/models/dataset.py @@ -47,8 +47,8 @@ class Dataset(Base): # One to many relationships # dataset_id to dataset attribute and genome dataset - dataset_attributes = relationship("DatasetAttribute", back_populates='dataset') - genome_datasets = relationship("GenomeDataset", back_populates='dataset') + dataset_attributes = relationship("DatasetAttribute", 
back_populates='dataset', cascade="all, delete, delete-orphan") + genome_datasets = relationship("GenomeDataset", back_populates='dataset', cascade="all, delete, delete-orphan") # many to one relationships # dataset_type_id to dataset_type dataset_type = relationship('DatasetType', back_populates="datasets") diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py index cff25617..21060e08 100644 --- a/src/ensembl/production/metadata/api/models/genome.py +++ b/src/ensembl/production/metadata/api/models/genome.py @@ -28,8 +28,8 @@ class Genome(Base): created = Column(DATETIME(fsp=6), nullable=False) # One to many relationships # genome_id to genome_dataset and genome release - genome_datasets = relationship("GenomeDataset", back_populates="genome") - genome_releases = relationship("GenomeRelease", back_populates="genome") + genome_datasets = relationship("GenomeDataset", back_populates="genome", cascade="all, delete, delete-orphan") + genome_releases = relationship("GenomeRelease", back_populates="genome", cascade="all, delete, delete-orphan") # many to one relationships # assembly_id to assembly assembly = relationship("Assembly", back_populates="genomes") diff --git a/src/ensembl/production/metadata/api/models/organism.py b/src/ensembl/production/metadata/api/models/organism.py index 9dc4f747..ebcb5bb4 100644 --- a/src/ensembl/production/metadata/api/models/organism.py +++ b/src/ensembl/production/metadata/api/models/organism.py @@ -33,7 +33,7 @@ class Organism(Base): scientific_parlance_name = Column(String(255)) # One to many relationships # Organism_id to organism_group_member and genome - genomes = relationship("Genome", back_populates="organism") + genomes = relationship("Genome", back_populates="organism", cascade="all, delete, delete-orphan") organism_group_members = relationship("OrganismGroupMember", back_populates="organism") # many to one relationships diff --git 
a/src/ensembl/production/metadata/updater/Temporary.py b/src/ensembl/production/metadata/updater/Temporary.py deleted file mode 100644 index 14572dfc..00000000 --- a/src/ensembl/production/metadata/updater/Temporary.py +++ /dev/null @@ -1,6 +0,0 @@ - -from ensembl.production.metadata.api.factory import meta_factory - - -test = meta_factory( 'mysql://danielp:Killadam69@localhost:3306/acanthochromis_polyacanthus_core_109_1',"mysql://danielp:Killadam69@localhost:3306/ensembl_genome_metadata",'mysql://danielp:Killadam69@localhost:3306/ncbi_taxonomy') -test.process_core() diff --git a/src/ensembl/production/metadata/updater/base.py b/src/ensembl/production/metadata/updater/base.py index 49d72e2c..642e1f64 100644 --- a/src/ensembl/production/metadata/updater/base.py +++ b/src/ensembl/production/metadata/updater/base.py @@ -50,13 +50,12 @@ def get_meta_list_from_prefix_meta_key(self, species_id, prefix): ) result = query.all() if not result: - return None + return {} else: # Build a dictionary out of the results. 
result_dict = {key: value for key, value in result} return result_dict - def get_or_new_source(self, meta_session, db_uri, db_type): name = make_url(db_uri).database dataset_source = meta_session.query(DatasetSource).filter(DatasetSource.name == name).one_or_none() @@ -65,6 +64,7 @@ def get_or_new_source(self, meta_session, db_uri, db_type): type=db_type, # core/fungen etc name=name # dbname ) + meta_session.add(dataset_source) # Only add a new DatasetSource to the session if it doesn't exist return dataset_source, "new" else: return dataset_source, "existing" diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 8cbc8b3c..935d5128 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -53,7 +53,7 @@ def process_core(self, **kwargs): sel_species = kwargs.get('species', None) metadata_uri = kwargs.get('metadata_uri', self.metadata_uri) taxonomy_uri = kwargs.get('metadata_uri', self.taxonomy_uri) - db_uri = kwargs.get('db_uri') + db_uri = kwargs.get('db_uri', self.db_uri) if sel_species: with self.db.session_scope() as session: multi_species = session.execute( @@ -83,146 +83,86 @@ def process_species(self, species, metadata_uri, taxonomy_uri, db_uri): self.get_or_new_organism(species, meta_session, metadata_uri, taxonomy_uri) self.assembly, self.assembly_dataset, self.assembly_dataset_attributes, self.assembly_sequences, \ self.dataset_source, assembly_status = self.get_or_new_assembly(species, meta_session, db_uri) - # Add release check here! 
self.genebuild_dataset, self.genebuild_dataset_attributes, \ - genebuild_status = self.new_genebuild(species, meta_session, db_uri) + genebuild_status = self.new_genebuild(species, meta_session, db_uri, self.dataset_source) conn = DatasetAdaptor(metadata_uri=metadata_uri) genebuild_release_status = conn.check_release_status(self.genebuild_dataset.dataset_uuid) - if organism_status == "New": - # ###############################Checks that dataset and assembly are new ################## - if assembly_status != "New" or genebuild_status != "New": - raise Exception("New organism, but existing assembly accession and/or genebuild version") - ############################################### - # Create genome and populate the database with organism, assembly and dataset - meta_session.add(self.assembly) - meta_session.add_all(self.assembly_sequences) - meta_session.add(self.organism) - meta_session.add(self.division) - meta_session.add(self.organism_group_member) - meta_session.add(self.dataset_source) - meta_session.add(self.assembly_dataset) - meta_session.add_all(self.assembly_dataset_attributes) - meta_session.add(self.genebuild_dataset) - meta_session.add_all(self.genebuild_dataset_attributes) - new_genome = Genome( - genome_uuid=str(uuid.uuid4()), - assembly=self.assembly, - organism=self.organism, - created=func.now(), - ) - meta_session.add(new_genome) - assembly_genome_dataset = GenomeDataset( - genome=new_genome, - dataset=self.assembly_dataset, - is_current=True, - ) - meta_session.add(assembly_genome_dataset) - genebuild_genome_dataset = GenomeDataset( - genome=new_genome, - dataset=self.genebuild_dataset, - is_current=True, - ) - meta_session.add(genebuild_genome_dataset) - - elif assembly_status == "New": - # ###############################Checks that dataset and update are new ################## - if genebuild_status != "New": - raise Exception("New assembly, but existing genebuild version") - ############################################### - - 
meta_session.add(self.assembly) - meta_session.add_all(self.assembly_sequences) - meta_session.add(self.dataset_source) - meta_session.add(self.assembly_dataset) - meta_session.add_all(self.assembly_dataset_attributes) - meta_session.add(self.genebuild_dataset) - meta_session.add_all(self.genebuild_dataset_attributes) - new_genome = Genome( - genome_uuid=str(uuid.uuid4()), - assembly=self.assembly, - organism=self.organism, - created=func.now(), - ) - meta_session.add(new_genome) - assembly_genome_dataset = GenomeDataset( - genome=new_genome, - dataset=self.assembly_dataset, - is_current=True, - ) - meta_session.add(assembly_genome_dataset) - genebuild_genome_dataset = GenomeDataset( - genome=new_genome, - dataset=self.genebuild_dataset, - is_current=True, - ) - meta_session.add(genebuild_genome_dataset) - - # Create genome and populate the database with assembly and dataset - elif genebuild_status == "New": - # Create genome and populate the database with genebuild dataset - meta_session.add(self.dataset_source) - meta_session.add(self.genebuild_dataset) - meta_session.add_all(self.genebuild_dataset_attributes) - new_genome = Genome( - genome_uuid=str(uuid.uuid4()), - assembly=self.assembly, - organism=self.organism, - created=func.now(), - ) - meta_session.add(new_genome) - genebuild_genome_dataset = GenomeDataset( - genome=new_genome, - dataset=self.genebuild_dataset, - is_current=True, - ) - meta_session.add(genebuild_genome_dataset) - else: - # Check if the data has been released: - if genebuild_release_status is True: - raise Exception("Existing Organism, Assembly, and Datasets within a release") - else: - # Delete the data from the database and repopulate assembly and genebuild. 
- genome_dataset = meta_session.query(GenomeDataset).join(Dataset).filter( - Dataset.dataset_uuid == self.assembly_dataset.assembly_uuid).first() - bad_genome = meta_session.query(Genome).get(genome_dataset.genome_id) - meta_session.delete(bad_genome) - meta_session.commit() + if organism_status == "New": + print ("New organism") + # ###############################Checks that dataset and assembly are new ################## + if assembly_status != "New" or genebuild_status != "New": + raise Exception("New organism, but existing assembly accession and/or genebuild version") + ############################################### + # Create genome and populate the database with organism, assembly and dataset + new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, + self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) + + elif assembly_status == "New": + print ("New assembly") + + # ###############################Checks that dataset and update are new ################## + if genebuild_status != "New": + raise Exception("New assembly, but existing genebuild version") + ############################################### + + new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, + self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) # Create genome and populate the database with assembly and dataset - self.assembly, self.assembly_dataset, self.assembly_dataset_attributes, self.assembly_sequences, \ - self.dataset_source, assembly_status = self.get_or_new_assembly(species, meta_session, db_uri) - self.genebuild_dataset, self.genebuild_dataset_attributes, genebuild_status = self.new_genebuild( - species, - meta_session, - db_uri) - meta_session.add(self.assembly) - meta_session.add_all(self.assembly_sequences) - meta_session.add(self.dataset_source) - meta_session.add(self.assembly_dataset) - meta_session.add_all(self.assembly_dataset_attributes) - 
meta_session.add(self.genebuild_dataset) - meta_session.add_all(self.genebuild_dataset_attributes) - new_genome = Genome( - genome_uuid=str(uuid.uuid4()), - assembly=self.assembly, - organism=self.organism, - created=func.now(), - ) - meta_session.add(new_genome) - assembly_genome_dataset = GenomeDataset( - genome=new_genome, - dataset=self.assembly_dataset, - is_current=True, - ) - meta_session.add(assembly_genome_dataset) - genebuild_genome_dataset = GenomeDataset( - genome=new_genome, - dataset=self.genebuild_dataset, - is_current=True, - ) - meta_session.add(genebuild_genome_dataset) + elif genebuild_status == "New": + print ("New genebuild") + + # Create genome and populate the database with genebuild dataset + new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, + self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) + else: + # Check if the data has been released: + if genebuild_release_status is True: + raise Exception("Existing Organism, Assembly, and Datasets within a release") + else: + print("Rewrite") + #Need to do a rewrite, so that it only redoes the geneset data. + + # Delete the data from the database and repopulate assembly and genebuild. 
+ genome_dataset = meta_session.query(GenomeDataset).join(Dataset).filter( + Dataset.dataset_uuid == self.assembly_dataset.dataset_uuid).first() + current_genome = meta_session.query(Genome).get(genome_dataset.genome_id) + for d in meta_session.query(Dataset).join(GenomeDataset).filter( + GenomeDataset.genome_id == current_genome.genome_id).filter(Dataset.name == "genebuild"): + meta_session.delete(d) + meta_session.commit() + meta_session.flush() + genebuild_genome_dataset = GenomeDataset( + genome=current_genome, + dataset=self.genebuild_dataset, + is_current=True, + ) + meta_session.add(genebuild_genome_dataset) + + + def new_genome(self, meta_session, organism, assembly, assembly_dataset, genebuild_dataset): + new_genome = Genome( + genome_uuid=str(uuid.uuid4()), + assembly=assembly, + organism=organism, + created=func.now(), + ) + meta_session.add(new_genome) + assembly_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=assembly_dataset, + is_current=True, + ) + meta_session.add(assembly_genome_dataset) + genebuild_genome_dataset = GenomeDataset( + genome=new_genome, + dataset=genebuild_dataset, + is_current=True, + ) + meta_session.add(genebuild_genome_dataset) + return new_genome, assembly_genome_dataset, genebuild_genome_dataset def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri): """ @@ -252,11 +192,12 @@ def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri) Organism.ensembl_name == new_organism.ensembl_name).one_or_none() division_name = self.get_meta_single_meta_key(species, "species.division") division = meta_session.query(OrganismGroup).filter(OrganismGroup.name == division_name).one_or_none() - organism_group_member = meta_session.query(OrganismGroupMember).filter( - OrganismGroupMember.organism_id == old_organism.organism_id, - OrganismGroupMember.organism_group_id == division.organism_group_id).one_or_none() + # If an existing Organism is found, return it and indicate that it 
already existed. if old_organism: + organism_group_member = meta_session.query(OrganismGroupMember).filter( + OrganismGroupMember.organism_id == old_organism.organism_id, + OrganismGroupMember.organism_group_id == division.organism_group_id).one_or_none() return old_organism, division, organism_group_member, "Existing" else: # If no existing Organism is found, conduct additional checks before creating a new one. @@ -286,14 +227,16 @@ def get_or_new_organism(self, species, meta_session, metadata_uri, taxonomy_uri) type="Division", name=division_name, ) + meta_session.add(division) # Create a new OrganismGroupMember linking the new Organism to the division group. organism_group_member = OrganismGroupMember( is_reference=0, - organism_id=new_organism, - organism_group_id=division, + organism=new_organism, + organism_group=division, ) - + meta_session.add(new_organism) + meta_session.add(organism_group_member) # Return the newly created Organism and indicate that it is new. return new_organism, division, organism_group_member, "New" @@ -323,55 +266,75 @@ def get_assembly_sequences(self, species, assembly): .filter(AttribTypeAlias.code == "sequence_location").all()) # ensure sequence_location # Create a dictionary so that the results can have multiple synonyms per line and only one SeqRegion - results_dict = defaultdict(dict) - for seq_region_name, seq_region_length, coord_system_name, sequence_location, synonym, db_name in results: - key = (seq_region_name, seq_region_length, coord_system_name, sequence_location) - results_dict[key][synonym] = db_name - - for ( - seq_region_name, seq_region_length, coord_system_name, - sequence_location), synonyms in results_dict.items(): - # Test if chromosomal: - if coord_system_name == "chromosome": - chromosomal = 1 - else: - chromosomal = 0 + accession_info = defaultdict( + lambda: {"names": set(), "length": None, "location": None, "chromosomal": None}) + + for seq_region_name, seq_region_length, coord_system_name, location, 
synonym, db_name in results: + # Test to see if the seq_name follows accession standards (99% of sequences) if re.match(r'^[a-zA-Z]+\d+\.\d+', seq_region_name): # If so assign it to accession accession = seq_region_name - if not synonyms: + if not synonym: # If it doesn't have any synonyms the accession is the name. - name = accession + accession_info[accession]["names"].add(accession) else: - name = ";".join(synonyms.keys()) - # otherwise join all the accessions and store them in name - # ###############Likely problematic in the future###################### + accession_info[accession]["names"].add(synonym) else: # For named sequences like chr1 name = seq_region_name - for synonym, db in synonyms: - # We used to match to KnownXref, however that should not be necessary. Testing this way for now. - if re.match(r'^[a-zA-Z]+\d+\.\d+', synonym): - accession = synonym - else: - name = name + ";" + synonym - if accession is None: - raise Exception(f"seq_region_name {seq_region_name} accession could not be found. Please check") + if re.match(r'^[a-zA-Z]+\d+\.\d+', synonym): + accession = synonym + accession_info[accession]["names"].add(name) + else: + accession = name # In case synonym doesn't match the pattern, use the name as the accession + accession_info[accession]["names"].add(synonym if synonym else name) + + # Save the sequence location, length, and chromosomal flag. 
+ location_mapping = { + 'nuclear_chromosome': 'SO:0000738', + 'mitochondrial_chromosome': 'SO:0000737', + 'chloroplast_chromosome': 'SO:0000745', + None: 'SO:0000738', + } + + try: + sequence_location = location_mapping[location] + except KeyError: + raise Exception('Error with sequence location: {} is not a valid type'.format(location)) + + # Test if chromosomal: + if coord_system_name == "chromosome": + chromosomal = 1 + else: + chromosomal = 0 + + # Assign the values to the dictionary + accession_info[accession]["location"] = sequence_location + accession_info[accession]["chromosomal"] = chromosomal + accession_info[accession]["length"] = seq_region_length + + # Now, create AssemblySequence objects for each unique accession. + for accession, info in accession_info.items(): + # Combine all unique names with ";". If a name appears in multiple sequences with the same accession, + name = ";".join(info["names"]) + + # Create an AssemblySequence object. assembly_sequence = AssemblySequence( name=name, - assembly_id=assembly, + assembly=assembly, accession=accession, - chromosomal=chromosomal, - length=seq_region_length, - sequence_location=sequence_location, + chromosomal=info["chromosomal"], + length=info["length"], + sequence_location=info["location"], # sequence_checksum="", Not implemented # ga4gh_identifier="", Not implemented ) + assembly_sequences.append(assembly_sequence) return assembly_sequences - def get_or_new_assembly(self, species, meta_session, db_uri): + def get_or_new_assembly(self, species, meta_session, db_uri, source=None): # Get the new assembly accession from the core handed over assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession") assembly = meta_session.query(Assembly).filter(Assembly.accession == assembly_accession).one_or_none() @@ -381,16 +344,11 @@ def get_or_new_assembly(self, species, meta_session, db_uri): assembly_dataset = meta_session.query(Dataset).filter(Dataset.label == 
assembly_accession).one_or_none() # I should not need this, but double check on database updating. assembly_dataset_attributes = assembly_dataset.dataset_attributes - - # ############################### Tests ################################# - new_assembly_sequences = self.get_assembly_sequences(species, assembly) assembly_sequences = assembly.assembly_sequences - # assembly sequences. Count and compare to make sure that they match. - if len(assembly_sequences) != len(new_assembly_sequences): - raise Exception("Number of sequences does not match number in database. " - "A new assembly requires a new accession.") - # ######################################################################### - dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + if source is not None: + dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + else: + dataset_source = source return assembly, assembly_dataset, assembly_dataset_attributes, \ assembly_sequences, dataset_source, "Existing" @@ -414,7 +372,10 @@ def get_or_new_assembly(self, species, meta_session, db_uri): assembly_uuid=str(uuid.uuid4()), ) dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "assembly").first() - dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + if source is None: + dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + else: + dataset_source = source assembly_dataset = Dataset( dataset_uuid=str(uuid.uuid4()), @@ -426,11 +387,9 @@ def get_or_new_assembly(self, species, meta_session, db_uri): dataset_source=dataset_source, # extract from dataset_source status='Submitted', ) - attributes = self.get_meta_list_from_prefix_meta_key(species, "assembly") assembly_dataset_attributes = [] - for attribute, value in attributes: - attribute.replace("assembly.", "", 1) + for attribute, value in attributes.items(): meta_attribute = 
meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() if meta_attribute is None: raise Exception(f"Attribute {attribute} not found. Please enter it into the db manually") @@ -440,38 +399,54 @@ def get_or_new_assembly(self, species, meta_session, db_uri): attribute=meta_attribute, ) assembly_dataset_attributes.append(dataset_attribute) - assembly_sequences = self.get_assembly_sequences(species, assembly) - + meta_session.add(assembly) + meta_session.add_all(assembly_sequences) + meta_session.add(assembly_dataset) + meta_session.add_all(assembly_dataset_attributes) return assembly, assembly_dataset, assembly_dataset_attributes, assembly_sequences, dataset_source, "New" - def new_genebuild(self, species, meta_session, db_uri): + def new_genebuild(self, species, meta_session, db_uri, source=None): """ Process an individual species from a core database to update the metadata db. This method contains the logic for updating the metadata This is not a get, as we don't update the metadata for genebuild, only replace it if it is not released. 
""" + # The assembly accession and genebuild version are extracted from the metadata of the species assembly_accession = self.get_meta_single_meta_key(species, "assembly.accession") genebuild_version = self.get_meta_single_meta_key(species, "genebuild.version") + + # The genebuild accession is formed by combining the assembly accession and the genebuild version genebuild_accession = assembly_accession + "_" + genebuild_version - # genebuild_dataset = meta_session.query(Dataset).filter( - # Dataset.label == genebuild_accession).one_or_none() - dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + + # Depending on whether a source is provided, it uses the provided source or creates a new source + if source is None: + dataset_source, source_status = self.get_or_new_source(meta_session, db_uri, "core") + else: + dataset_source = source + + # The type of the dataset is set to be "genebuild" dataset_type = meta_session.query(DatasetType).filter(DatasetType.name == "genebuild").first() + + # A new Dataset instance is created with all necessary properties genebuild_dataset = Dataset( dataset_uuid=str(uuid.uuid4()), - dataset_type=dataset_type, # extract from dataset_type - name="assembly", + dataset_type=dataset_type, + name="genebuild", version=genebuild_version, - label=genebuild_accession, # Required. 
Used for lookup in this script + label=genebuild_accession, created=func.now(), - dataset_source=dataset_source, # extract from dataset_source + dataset_source=dataset_source, status='Submitted', ) + + # Fetching all attributes associated with "genebuild" from the metadata of the species attributes = self.get_meta_list_from_prefix_meta_key(species, "genebuild.") + + # An empty list to hold DatasetAttribute instances genebuild_dataset_attributes = [] - for attribute, value in attributes: - attribute.replace("genebuild.", "", 1) + # For each attribute-value pair, a new DatasetAttribute instance is created and added to the list + for attribute, value in attributes.items(): meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() if meta_attribute is None: raise Exception(f"Attribute {attribute} not found. Please enter it into the db manually") @@ -482,6 +457,16 @@ def new_genebuild(self, species, meta_session, db_uri): ) genebuild_dataset_attributes.append(dataset_attribute) + # Check if the genebuild dataset with the given label already exists test_status = meta_session.query(Dataset).filter(Dataset.label == genebuild_accession).one_or_none() - status = "New" if test_status is None else "Existing" + + # If it does not exist, it is added to the session, otherwise the status is set to "Existing" + if test_status is None: + status = "New" + meta_session.add(genebuild_dataset) + meta_session.add_all(genebuild_dataset_attributes) + else: + status = "Existing" + + # The method returns the Dataset instance, the list of DatasetAttribute instances, and the status return genebuild_dataset, genebuild_dataset_attributes, status From 93fc4a172e091e1d3b595354eca460418292ce4f Mon Sep 17 00:00:00 2001 From: Daniel Poppleton <111403332+dpopleton@users.noreply.github.com> Date: Sun, 16 Jul 2023 20:53:01 +0100 Subject: [PATCH 05/20] Update genome.py Removed forced list --- src/ensembl/production/metadata/api/genome.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 6425bae9..f3ac5721 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -29,7 +29,6 @@ def __init__(self, metadata_uri, taxonomy_uri=None): self.taxonomy_db = DBConnection(taxonomy_uri) def fetch_taxonomy_names(self, taxonomy_ids): - taxonomy_ids = [taxonomy_ids] if not isinstance(taxonomy_ids, list) else taxonomy_ids taxons = {} for tid in taxonomy_ids: names = {"scientific_name": None, "synonym": []} From 7c4f4fd4bc48c64cac6090c3fff863345cfde4a4 Mon Sep 17 00:00:00 2001 From: danielp Date: Mon, 7 Aug 2023 15:54:52 +0100 Subject: [PATCH 06/20] Fixed Tests --- .../metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt index e33a4660..9f364493 100644 --- a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt +++ b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt @@ -321,6 +321,7 @@ 2301119 Rhabditomorpha scientific name 2698737 Sar Burki et al. 2008 authority 2698737 Sar scientific name +66666668 monster scientific name 2698737 SAR supergroup synonym 38820 4478 merged_taxon_id 38820 4727 merged_taxon_id From 18ea79d6c9312b959b436b9f9a4165d38b727946 Mon Sep 17 00:00:00 2001 From: danielp Date: Mon, 7 Aug 2023 16:03:04 +0100 Subject: [PATCH 07/20] Modified test dbs to use E. 
coli taxid --- src/tests/databases/core_1/meta.txt | 2 +- src/tests/databases/core_2/meta.txt | 2 +- src/tests/databases/core_3/meta.txt | 2 +- src/tests/databases/core_4/meta.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index 9c38fecc..c95b1ce2 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -11,6 +11,6 @@ 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 66666668 +2 1 species.taxonomy_id 511145 10 1 species.type monsters 5 1 species.url Jabbe diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index d8efbd88..1934679d 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -11,6 +11,6 @@ 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 66666668 +2 1 species.taxonomy_id 511145 10 1 species.type monsters 5 1 species.url Jabbe diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index 63700f27..40d23e65 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -10,6 +10,6 @@ 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 66666668 +2 1 species.taxonomy_id 511145 10 1 species.type monsters 5 1 species.url Jabbe diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 4ebee73c..5b650aa4 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -11,6 +11,6 @@ 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 66666668 +2 1 species.taxonomy_id 511145 10 1 species.type monsters 5 1 species.url Jabbe From 
6b974d1aca97dd48a77c3b9347e930c351ddc7a0 Mon Sep 17 00:00:00 2001 From: danielp Date: Mon, 7 Aug 2023 16:28:54 +0100 Subject: [PATCH 08/20] Modified test dbs to use specific unique name. --- .../metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt | 5 ++++- src/tests/databases/core_1/meta.txt | 2 +- src/tests/databases/core_2/meta.txt | 4 ++-- src/tests/databases/core_3/meta.txt | 4 ++-- src/tests/databases/core_4/meta.txt | 4 ++-- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt index 9f364493..175bbc0e 100644 --- a/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt +++ b/src/ensembl/production/metadata/api/sample/ncbi_taxonomy/ncbi_taxa_name.txt @@ -321,7 +321,10 @@ 2301119 Rhabditomorpha scientific name 2698737 Sar Burki et al. 2008 authority 2698737 Sar scientific name -66666668 monster scientific name +666668 jabberwocky synonym +666668 carol_jabberwocky3 equivalent name +666668 carol_jabberwocky2 equivalent name +666668 carol_jabberwocky scientific name 2698737 SAR supergroup synonym 38820 4478 merged_taxon_id 38820 4727 merged_taxon_id diff --git a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index c95b1ce2..13fd1319 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -11,6 +11,6 @@ 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 511145 +2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index 1934679d..13fd1319 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -7,10 +7,10 @@ 3 1 species.display_name jabberwocky 7 1 species.division Ensembl_TEST 6 1 
species.production_name Jabberwocky -4 1 species.scientific_name lewis_carol +4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 511145 +2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index 40d23e65..2382e20d 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -6,10 +6,10 @@ 3 1 species.display_name jabberwocky 7 1 species.division Ensembl_TEST 6 1 species.production_name Jabberwocky -4 1 species.scientific_name lewis_carol +4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 511145 +2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 5b650aa4..4d82453e 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -7,10 +7,10 @@ 3 1 species.display_name jabberwocky 7 1 species.division Ensembl_TEST 6 1 species.production_name Jabberwocky -4 1 species.scientific_name lewis_carol +4 1 species.scientific_name carol_jabberwocky 1 1 species.species_taxonomy_id 6666666 8 1 species.strain reference 9 1 species.strain_group testing -2 1 species.taxonomy_id 511145 +2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe From 659aca5ca7e1d16d2cb672fdac5b73b2c74235af Mon Sep 17 00:00:00 2001 From: danielp Date: Mon, 7 Aug 2023 16:45:59 +0100 Subject: [PATCH 09/20] Improved fetch taxonomy names within api --- src/ensembl/production/metadata/api/genome.py | 76 ++++++++++++------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py 
b/src/ensembl/production/metadata/api/genome.py index f3ac5721..1e3ae07b 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -34,34 +34,54 @@ def fetch_taxonomy_names(self, taxonomy_ids): names = {"scientific_name": None, "synonym": []} taxons[tid] = names - for taxon in taxons: - sci_name_select = db.select( - NCBITaxaName.name - ).filter( - NCBITaxaName.taxon_id == taxon, - NCBITaxaName.name_class == "scientific name", - ) - synonym_class = [ - "common name", - "equivalent name", - "genbank common name", - "genbank synonym", - "synonym", - ] - - synonyms_select = db.select( - NCBITaxaName.name - ).filter( - NCBITaxaName.taxon_id == taxon, - NCBITaxaName.name_class.in_(synonym_class), - ) - - with self.taxonomy_db.session_scope() as session: - sci_name = session.execute(sci_name_select).one() - taxons[taxon]["scientific_name"] = sci_name[0] - synonyms = session.execute(synonyms_select).all() - for synonym in synonyms: - taxons[taxon]["synonym"].append(synonym[0]) + taxonomy_ids = check_parameter(taxonomy_ids) + + taxons = {} + for tid in taxonomy_ids: + names = {"scientific_name": None, "synonym": []} + taxons[tid] = names + for taxon in taxons: + sci_name_select = db.select( + NCBITaxaName.name + ).filter( + NCBITaxaName.taxon_id == taxon, + NCBITaxaName.name_class == "scientific name", + ) + synonym_class = [ + "common name", + "equivalent name", + "genbank common name", + "genbank synonym", + "synonym", + ] + + synonyms_select = db.select( + NCBITaxaName.name, NCBITaxaName.name_class + ).filter( + NCBITaxaName.taxon_id == taxon, + NCBITaxaName.name_class.in_(synonym_class), + ) + with self.taxonomy_db.session_scope() as session: + sci_name = session.execute(sci_name_select).one() + taxons[taxon]["scientific_name"] = sci_name[0] + synonyms = session.execute(synonyms_select).all() + common_names = [] + taxons[taxon]['ncbi_common_name'] = None + for synonym in synonyms: + # create a list of synonyms + 
taxons[taxon]["synonym"].append(synonym[0]) + # and fill the rest of the required key-values fields + # these are required by get_species_information() in the metadata service + if synonym[1] is not None and synonym[0] is not None: + if synonym[1] == 'genbank common name': + taxons[taxon]['ncbi_common_name'] = synonym[0] + if synonym[1] == 'common name': + common_names.append(synonym[1]) + taxons[taxon]['alternative_names'] = common_names + if len(common_names) > 0: + taxons[taxon]['common_name'] = common_names[0] + else: + taxons[taxon]['common_name'] = None return taxons def fetch_taxonomy_ids(self, taxonomy_names): From 1c397de5aa8d3dca1885f150ad5d99529d41be7a Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 09:38:49 +0100 Subject: [PATCH 10/20] Added taxid genome api check --- src/ensembl/production/metadata/api/genome.py | 76 +++++++------------ 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 1e3ae07b..1e506161 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -29,59 +29,39 @@ def __init__(self, metadata_uri, taxonomy_uri=None): self.taxonomy_db = DBConnection(taxonomy_uri) def fetch_taxonomy_names(self, taxonomy_ids): + taxonomy_ids = check_parameter(taxonomy_ids) taxons = {} for tid in taxonomy_ids: names = {"scientific_name": None, "synonym": []} taxons[tid] = names + for taxon in taxons: + sci_name_select = db.select( + NCBITaxaName.name + ).filter( + NCBITaxaName.taxon_id == taxon, + NCBITaxaName.name_class == "scientific name", + ) + synonym_class = [ + "common name", + "equivalent name", + "genbank common name", + "genbank synonym", + "synonym", + ] + + synonyms_select = db.select( + NCBITaxaName.name + ).filter( + NCBITaxaName.taxon_id == taxon, + NCBITaxaName.name_class.in_(synonym_class), + ) - taxonomy_ids = check_parameter(taxonomy_ids) - - taxons = {} - for 
tid in taxonomy_ids: - names = {"scientific_name": None, "synonym": []} - taxons[tid] = names - for taxon in taxons: - sci_name_select = db.select( - NCBITaxaName.name - ).filter( - NCBITaxaName.taxon_id == taxon, - NCBITaxaName.name_class == "scientific name", - ) - synonym_class = [ - "common name", - "equivalent name", - "genbank common name", - "genbank synonym", - "synonym", - ] - - synonyms_select = db.select( - NCBITaxaName.name, NCBITaxaName.name_class - ).filter( - NCBITaxaName.taxon_id == taxon, - NCBITaxaName.name_class.in_(synonym_class), - ) - with self.taxonomy_db.session_scope() as session: - sci_name = session.execute(sci_name_select).one() - taxons[taxon]["scientific_name"] = sci_name[0] - synonyms = session.execute(synonyms_select).all() - common_names = [] - taxons[taxon]['ncbi_common_name'] = None - for synonym in synonyms: - # create a list of synonyms - taxons[taxon]["synonym"].append(synonym[0]) - # and fill the rest of the required key-values fields - # these are required by get_species_information() in the metadata service - if synonym[1] is not None and synonym[0] is not None: - if synonym[1] == 'genbank common name': - taxons[taxon]['ncbi_common_name'] = synonym[0] - if synonym[1] == 'common name': - common_names.append(synonym[1]) - taxons[taxon]['alternative_names'] = common_names - if len(common_names) > 0: - taxons[taxon]['common_name'] = common_names[0] - else: - taxons[taxon]['common_name'] = None + with self.taxonomy_db.session_scope() as session: + sci_name = session.execute(sci_name_select).one() + taxons[taxon]["scientific_name"] = sci_name[0] + synonyms = session.execute(synonyms_select).all() + for synonym in synonyms: + taxons[taxon]["synonym"].append(synonym[0]) return taxons def fetch_taxonomy_ids(self, taxonomy_names): From c3cf0a7ecfd0150fa419a232e7ac631e0ad12779 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 09:48:39 +0100 Subject: [PATCH 11/20] added a change for taxid check. 
Major rework already in PR so it isn't quite proper --- src/ensembl/production/metadata/api/genome.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 1e506161..efecee30 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -29,7 +29,8 @@ def __init__(self, metadata_uri, taxonomy_uri=None): self.taxonomy_db = DBConnection(taxonomy_uri) def fetch_taxonomy_names(self, taxonomy_ids): - taxonomy_ids = check_parameter(taxonomy_ids) + if not isinstance(taxonomy_ids, str): + taxonomy_ids = check_parameter(taxonomy_ids) taxons = {} for tid in taxonomy_ids: names = {"scientific_name": None, "synonym": []} From fad763c013f599b1a4c1365b80b7d2306e1a7ac7 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 09:53:30 +0100 Subject: [PATCH 12/20] added a change for taxid check. Major rework already in PR so it isn't quite proper --- src/ensembl/production/metadata/api/genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index efecee30..06bde947 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -29,7 +29,7 @@ def __init__(self, metadata_uri, taxonomy_uri=None): self.taxonomy_db = DBConnection(taxonomy_uri) def fetch_taxonomy_names(self, taxonomy_ids): - if not isinstance(taxonomy_ids, str): + if taxonomy_ids.isdigit(): taxonomy_ids = check_parameter(taxonomy_ids) taxons = {} for tid in taxonomy_ids: From 03eef45b66e9931cb8e662456df16e1ea77c7202 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 10:54:09 +0100 Subject: [PATCH 13/20] added a change for taxid check. 
Major rework already in PR so it isn't quite proper --- src/ensembl/production/metadata/api/base.py | 2 ++ src/ensembl/production/metadata/api/genome.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ensembl/production/metadata/api/base.py b/src/ensembl/production/metadata/api/base.py index f88c9206..d5289b5e 100644 --- a/src/ensembl/production/metadata/api/base.py +++ b/src/ensembl/production/metadata/api/base.py @@ -23,6 +23,8 @@ def __init__(self, metadata_uri): def check_parameter(param): + if isinstance(param, tuple): + param = param[0] if param is not None and not isinstance(param, list): param = [param] return param diff --git a/src/ensembl/production/metadata/api/genome.py b/src/ensembl/production/metadata/api/genome.py index 06bde947..39de969b 100644 --- a/src/ensembl/production/metadata/api/genome.py +++ b/src/ensembl/production/metadata/api/genome.py @@ -29,8 +29,8 @@ def __init__(self, metadata_uri, taxonomy_uri=None): self.taxonomy_db = DBConnection(taxonomy_uri) def fetch_taxonomy_names(self, taxonomy_ids): - if taxonomy_ids.isdigit(): - taxonomy_ids = check_parameter(taxonomy_ids) + + taxonomy_ids = check_parameter(taxonomy_ids) taxons = {} for tid in taxonomy_ids: names = {"scientific_name": None, "synonym": []} From 43534ae5a65406f051c0c6411abbbd75f4c666d6 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 11:33:12 +0100 Subject: [PATCH 14/20] updated test tables --- .../api/sample/ensembl_metadata/attribute.txt | 18 ++++++++++-------- src/tests/databases/core_1/meta.txt | 1 + src/tests/databases/core_2/meta.txt | 1 + src/tests/databases/core_3/meta.txt | 1 + src/tests/databases/core_4/meta.txt | 1 + 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt index 35754fda..b50b8bed 100644 --- 
a/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/attribute.txt @@ -56,13 +56,15 @@ 56 homology_coverage Coverage Percent of genome which is homologous to another species percent 57 short_variants Short variants Small-scale genetic variations integer 58 structural_variants Structural variants Large-scale genetic variations integer -59 short_variants_with_phenotype_assertions "Short variants -With phenotype assertions" Short variants with phenotypic evidence integer -60 short_variants_with_publications "Short variants -With publications" Short variants published in literature integer -61 short_variants_frequency_studies "Short variants -Frequency studies" Short variants studied for frequency integer -62 structural_variants_with_phenotype_assertions "Structural variants -With phenotype assertions" Structural variants with phenotypic evidence integer +59 short_variants_with_phenotype_assertions "Short variants With phenotype assertions" Short variants with phenotypic evidence integer +60 short_variants_with_publications "Short variants With publications" Short variants published in literature integer +61 short_variants_frequency_studies "Short variants Frequency studies" Short variants studied for frequency integer +62 structural_variants_with_phenotype_assertions "Structural variants With phenotype assertions" Structural variants with phenotypic evidence integer 63 enhancers Enhancers DNA sequences that increase gene expression integer 64 promoters Promoters DNA sequences initiating transcription integer +65 assembly.accession accession accession string +66 assembly.default default default string +67 assembly.name name name string +68 assembly.ucsc_alias ucsc_alias ucsc_alias string +69 genebuild.last_geneset_update last_geneset_update last_geneset_update string +70 genebuild.version version version string \ No newline at end of file diff --git 
a/src/tests/databases/core_1/meta.txt b/src/tests/databases/core_1/meta.txt index 13fd1319..b744f3c4 100644 --- a/src/tests/databases/core_1/meta.txt +++ b/src/tests/databases/core_1/meta.txt @@ -14,3 +14,4 @@ 2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe +17 1 genebuild.version 1 diff --git a/src/tests/databases/core_2/meta.txt b/src/tests/databases/core_2/meta.txt index 13fd1319..b744f3c4 100644 --- a/src/tests/databases/core_2/meta.txt +++ b/src/tests/databases/core_2/meta.txt @@ -14,3 +14,4 @@ 2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe +17 1 genebuild.version 1 diff --git a/src/tests/databases/core_3/meta.txt b/src/tests/databases/core_3/meta.txt index 2382e20d..5bb63698 100644 --- a/src/tests/databases/core_3/meta.txt +++ b/src/tests/databases/core_3/meta.txt @@ -13,3 +13,4 @@ 2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe +17 1 genebuild.version 1 diff --git a/src/tests/databases/core_4/meta.txt b/src/tests/databases/core_4/meta.txt index 4d82453e..3202ce69 100644 --- a/src/tests/databases/core_4/meta.txt +++ b/src/tests/databases/core_4/meta.txt @@ -14,3 +14,4 @@ 2 1 species.taxonomy_id 666668 10 1 species.type monsters 5 1 species.url Jabbe +17 1 genebuild.version 1 From 06c7c3c15dec3ce8b8da13b83cbfd6db9060d1cf Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 11:57:38 +0100 Subject: [PATCH 15/20] Fixed api test and altered get sequences --- src/ensembl/production/metadata/updater/core.py | 3 +-- src/tests/test_api.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index 935d5128..db2dff32 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -256,12 +256,11 @@ def get_assembly_sequences(self, species, assembly): .join(SeqRegion.coord_system) 
.join(SeqRegion.seq_region_attrib) .join(SeqRegionAttrib.attrib_type) - .join(CoordSystem.meta) .outerjoin(SeqRegion.seq_region_synonym) .outerjoin(SeqRegionSynonym.external_db) .join(SeqRegionAttribAlias, SeqRegion.seq_region_attrib) # join with SeqRegionAttribAlias .outerjoin(AttribTypeAlias, SeqRegionAttribAlias.attrib_type) # join with AttribTypeAlias - .filter(Meta.species_id == species) + .filter(CoordSystem.species_id == species) .filter(AttribType.code == "toplevel") # ensure toplevel .filter(AttribTypeAlias.code == "sequence_location").all()) # ensure sequence_location diff --git a/src/tests/test_api.py b/src/tests/test_api.py index eefb521e..62b5af70 100644 --- a/src/tests/test_api.py +++ b/src/tests/test_api.py @@ -52,7 +52,7 @@ def test_fetch_releases_for_dataset(self, multi_dbs): def test_fetch_taxonomy_names(self, multi_dbs): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) - test = conn.fetch_taxonomy_names(taxonomy_ids=(6239, 511145)) + test = conn.fetch_taxonomy_names(taxonomy_ids=511145) assert test[511145]['scientific_name'] == 'Escherichia coli str. K-12 substr. 
MG1655' def test_fetch_taxonomy_ids(self, multi_dbs): From ffcd9f223b1674dc551c42ae218926818d36a27f Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 12:35:21 +0100 Subject: [PATCH 16/20] added attrib_type to tests --- sql/assembly_sequence.sql | 8 ++++---- src/ensembl/production/metadata/updater/core.py | 7 +++---- src/tests/databases/core_1/attrib_type.txt | 2 ++ src/tests/databases/core_1/table.sql | 8 ++++++++ src/tests/databases/core_2/attrib_type.txt | 2 ++ src/tests/databases/core_2/table.sql | 8 ++++++++ src/tests/databases/core_3/attrib_type.txt | 2 ++ src/tests/databases/core_3/table.sql | 8 ++++++++ src/tests/databases/core_4/attrib_type.txt | 2 ++ src/tests/databases/core_4/table.sql | 8 ++++++++ 10 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 src/tests/databases/core_1/attrib_type.txt create mode 100644 src/tests/databases/core_2/attrib_type.txt create mode 100644 src/tests/databases/core_3/attrib_type.txt create mode 100644 src/tests/databases/core_4/attrib_type.txt diff --git a/sql/assembly_sequence.sql b/sql/assembly_sequence.sql index ca25d31f..df2189ee 100644 --- a/sql/assembly_sequence.sql +++ b/sql/assembly_sequence.sql @@ -1,11 +1,11 @@ --- Because the attrib_type and external_db tables are identical +-- Because the attrib_type.txt and external_db tables are identical -- across all dbs, and in sync with the production master copy, -- we can use IDs directly, and avoid some complicated outer -- join statements... 
-- external_db.external_db_id 50710 = INSDC --- attrib_type.attrib_type_id 6 = toplevel --- attrib_type.attrib_type_id 367 = karyotype_rank --- attrib_type.attrib_type_id 547 = sequence_location +-- attrib_type.attrib_type_id 6 = toplevel +-- attrib_type.attrib_type_id 367 = karyotype_rank +-- attrib_type.attrib_type_id 547 = sequence_location -- Unfortunately, the sequence_location attribute in the core dbs -- isn't set with the values you might expect; it has '*_chromosome' diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index db2dff32..f3ecae2c 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -252,15 +252,14 @@ def get_assembly_sequences(self, species, assembly): # One complicated query to get all the data. Otherwise, this takes far too long to do. results = (session.query(SeqRegion.name, SeqRegion.length, CoordSystem.name, - SeqRegionAttribAlias.value, SeqRegionSynonym.synonym, ExternalDb.db_name) + SeqRegionAttribAlias.value, SeqRegionSynonym.synonym) .join(SeqRegion.coord_system) .join(SeqRegion.seq_region_attrib) .join(SeqRegionAttrib.attrib_type) .outerjoin(SeqRegion.seq_region_synonym) - .outerjoin(SeqRegionSynonym.external_db) .join(SeqRegionAttribAlias, SeqRegion.seq_region_attrib) # join with SeqRegionAttribAlias .outerjoin(AttribTypeAlias, SeqRegionAttribAlias.attrib_type) # join with AttribTypeAlias - .filter(CoordSystem.species_id == species) + .filter(CoordSystem.species_id == species) .filter(AttribType.code == "toplevel") # ensure toplevel .filter(AttribTypeAlias.code == "sequence_location").all()) # ensure sequence_location @@ -268,7 +267,7 @@ def get_assembly_sequences(self, species, assembly): accession_info = defaultdict( lambda: {"names": set(), "length": None, "location": None, "chromosomal": None}) - for seq_region_name, seq_region_length, coord_system_name, location, synonym, db_name in results: + for
seq_region_name, seq_region_length, coord_system_name, location, synonym in results: # Test to see if the seq_name follows accession standards (99% of sequences) if re.match(r'^[a-zA-Z]+\d+\.\d+', seq_region_name): diff --git a/src/tests/databases/core_1/attrib_type.txt b/src/tests/databases/core_1/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_1/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". diff --git a/src/tests/databases/core_1/table.sql b/src/tests/databases/core_1/table.sql index 03467355..953da984 100644 --- a/src/tests/databases/core_1/table.sql +++ b/src/tests/databases/core_1/table.sql @@ -76,3 +76,11 @@ CREATE TABLE seq_region_synonym CREATE INDEX seq_region_idx on seq_region_synonym (seq_region_id); +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); \ No newline at end of file diff --git a/src/tests/databases/core_2/attrib_type.txt b/src/tests/databases/core_2/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_2/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA 
sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". diff --git a/src/tests/databases/core_2/table.sql b/src/tests/databases/core_2/table.sql index 03467355..953da984 100644 --- a/src/tests/databases/core_2/table.sql +++ b/src/tests/databases/core_2/table.sql @@ -76,3 +76,11 @@ CREATE TABLE seq_region_synonym CREATE INDEX seq_region_idx on seq_region_synonym (seq_region_id); +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); \ No newline at end of file diff --git a/src/tests/databases/core_3/attrib_type.txt b/src/tests/databases/core_3/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_3/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
diff --git a/src/tests/databases/core_3/table.sql b/src/tests/databases/core_3/table.sql index 03467355..953da984 100644 --- a/src/tests/databases/core_3/table.sql +++ b/src/tests/databases/core_3/table.sql @@ -76,3 +76,11 @@ CREATE TABLE seq_region_synonym CREATE INDEX seq_region_idx on seq_region_synonym (seq_region_id); +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); \ No newline at end of file diff --git a/src/tests/databases/core_4/attrib_type.txt b/src/tests/databases/core_4/attrib_type.txt new file mode 100644 index 00000000..de5f1880 --- /dev/null +++ b/src/tests/databases/core_4/attrib_type.txt @@ -0,0 +1,2 @@ +6 toplevel Top Level Top Level Non-Redundant Sequence Region +547 sequence_location sequence_location To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): "apicoplast_chromosome", "chloroplast_chromosome", "chromoplast_chromosome", "cyanelle_chromosome", "leucoplast_chromosome", "macronuclear_chromosome", "micronuclear_chromosome", "mitochondrial_chromosome", "nuclear_chromosome". 
diff --git a/src/tests/databases/core_4/table.sql b/src/tests/databases/core_4/table.sql index 03467355..953da984 100644 --- a/src/tests/databases/core_4/table.sql +++ b/src/tests/databases/core_4/table.sql @@ -76,3 +76,11 @@ CREATE TABLE seq_region_synonym CREATE INDEX seq_region_idx on seq_region_synonym (seq_region_id); +CREATE TABLE `attrib_type` ( + `attrib_type_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, + `code` varchar(20) NOT NULL DEFAULT '', + `name` varchar(255) NOT NULL DEFAULT '', + `description` text, + PRIMARY KEY (`attrib_type_id`), + UNIQUE KEY `code_idx` (`code`) +); \ No newline at end of file From 8ce5ea4b84d4b4c8090bb09d5777d34bdf9a7ee4 Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 12:47:45 +0100 Subject: [PATCH 17/20] Reworked Tests --- src/tests/test_updater.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 44c449f0..7d1f2f00 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -46,10 +46,10 @@ def test_new_organism(self, multi_dbs): metadata = MetaData() dataset = Table('dataset', metadata, autoload=True, autoload_with=engine) query = select([dataset]).where( - (dataset.c.version == 999) & (dataset.c.name == 'genebuild') & (dataset.c.label == '01') + (dataset.c.version == 999) & (dataset.c.name == 'genebuild') ) row = engine.execute(query).fetchone() - assert row[-2] == '01' + assert row[4] is not None # def test_update_organism(self, multi_dbs): @@ -59,7 +59,7 @@ def test_update_organism(self, multi_dbs): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') - assert test_collect[0].Organism.scientific_name == 'lewis_carol' + assert test_collect[0].Organism.scientific_name == 'carol_jabberwocky' def test_update_assembly(self, multi_dbs): test = 
meta_factory(multi_dbs['core_3'].dbc.url, multi_dbs['ensembl_metadata'].dbc.url, @@ -68,7 +68,7 @@ def test_update_assembly(self, multi_dbs): conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url, taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url) test_collect = conn.fetch_genomes_by_ensembl_name('Jabberwocky') - assert test_collect[1].Organism.scientific_name == 'lewis_carol' + assert test_collect[1].Organism.scientific_name == 'carol_jabberwocky' assert test_collect[1].Assembly.accession == 'weird02' # @@ -83,4 +83,5 @@ def test_update_geneset(self, multi_dbs): (dataset.c.version == 999) & (dataset.c.name == 'genebuild') & (dataset.c.label == '02') ) row = engine.execute(query).fetchone() - assert row[-2] == '02' + assert row[4] is not None + From 3e06aefab1c34eb92b4a07070948ec2211fd6ccc Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 12:53:02 +0100 Subject: [PATCH 18/20] Reworked Tests --- src/tests/test_updater.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 7d1f2f00..4a7cc3f4 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -49,7 +49,9 @@ def test_new_organism(self, multi_dbs): (dataset.c.version == 999) & (dataset.c.name == 'genebuild') ) row = engine.execute(query).fetchone() - assert row[4] is not None + assert row is not None + if row is not None: + assert row[4] is not None # def test_update_organism(self, multi_dbs): @@ -83,5 +85,6 @@ def test_update_geneset(self, multi_dbs): (dataset.c.version == 999) & (dataset.c.name == 'genebuild') & (dataset.c.label == '02') ) row = engine.execute(query).fetchone() - assert row[4] is not None - + assert row is not None + if row is not None: + assert row[4] is not None From 7baee2afd2c8553e2f1f82ec1751e5fb3fc0bc8b Mon Sep 17 00:00:00 2001 From: danielp Date: Tue, 8 Aug 2023 12:54:19 +0100 Subject: [PATCH 19/20] Reworked Tests --- src/tests/test_updater.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/test_updater.py b/src/tests/test_updater.py index 4a7cc3f4..ad33a7f8 100644 --- a/src/tests/test_updater.py +++ b/src/tests/test_updater.py @@ -46,7 +46,7 @@ def test_new_organism(self, multi_dbs): metadata = MetaData() dataset = Table('dataset', metadata, autoload=True, autoload_with=engine) query = select([dataset]).where( - (dataset.c.version == 999) & (dataset.c.name == 'genebuild') + (dataset.c.version == 1) & (dataset.c.name == 'genebuild') ) row = engine.execute(query).fetchone() assert row is not None @@ -82,7 +82,7 @@ def test_update_geneset(self, multi_dbs): metadata = MetaData() dataset = Table('dataset', metadata, autoload=True, autoload_with=engine) query = select([dataset]).where( - (dataset.c.version == 999) & (dataset.c.name == 'genebuild') & (dataset.c.label == '02') + (dataset.c.version == 1) & (dataset.c.name == 'genebuild') ) row = engine.execute(query).fetchone() assert row is not None From 216ef2c7c1cb4fea151d5f6ce28770ec75ada7ed Mon Sep 17 00:00:00 2001 From: danielp Date: Wed, 9 Aug 2023 09:58:26 +0100 Subject: [PATCH 20/20] Added lrg skip for assembly sequences --- src/ensembl/production/metadata/updater/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index f3ecae2c..919a1f79 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -268,7 +268,9 @@ def get_assembly_sequences(self, species, assembly): lambda: {"names": set(), "length": None, "location": None, "chromosomal": None}) for seq_region_name, seq_region_length, coord_system_name, location, synonym in results: - + # Skip all sequence lrg sequences. 
+ if coord_system_name == "lrg": + continue # Test to see if the seq_name follows accession standards (99% of sequences) if re.match(r'^[a-zA-Z]+\d+\.\d+', seq_region_name): # If so assign it to accession