diff --git a/src/ensembl/production/metadata/api/models/assembly.py b/src/ensembl/production/metadata/api/models/assembly.py index 1d89d7d4..bd919bce 100644 --- a/src/ensembl/production/metadata/api/models/assembly.py +++ b/src/ensembl/production/metadata/api/models/assembly.py @@ -33,7 +33,7 @@ class Assembly(Base): created = Column(DateTime) ensembl_name = Column(String(255), unique=True) alt_accession = Column(String(16), nullable=True) - is_reference = Column(TINYINT(1), nullable=False) + is_reference = Column(TINYINT(1), nullable=False, default=0) url_name = Column(String(128), nullable=False) # One to many relationships # assembly_id within assembly_sequence @@ -52,7 +52,7 @@ class AssemblySequence(Base): name = Column(String(128), unique=True) assembly_id = Column(ForeignKey('assembly.assembly_id'), nullable=False, index=True) accession = Column(String(128), nullable=False) - chromosomal = Column(TINYINT(1), nullable=False) + chromosomal = Column(TINYINT(1), nullable=False, default=0) chromosome_rank = Column(Integer) length = Column(Integer, nullable=False) sequence_location = Column(String(10)) diff --git a/src/ensembl/production/metadata/api/models/genome.py b/src/ensembl/production/metadata/api/models/genome.py index 27c60d23..48208100 100644 --- a/src/ensembl/production/metadata/api/models/genome.py +++ b/src/ensembl/production/metadata/api/models/genome.py @@ -26,6 +26,7 @@ class Genome(Base): assembly_id = Column(ForeignKey("assembly.assembly_id"), nullable=False, index=True) organism_id = Column(ForeignKey("organism.organism_id"), nullable=False, index=True) created = Column(DATETIME(fsp=6), nullable=False) + is_best = Column(TINYINT(1), nullable=False, default=0) # One to many relationships # genome_id to genome_dataset and genome release genome_datasets = relationship("GenomeDataset", back_populates="genome", cascade="all, delete, delete-orphan") @@ -44,7 +45,8 @@ class GenomeDataset(Base): dataset_id = Column(ForeignKey("dataset.dataset_id"), nullable=False, index=True) genome_id = Column(ForeignKey("genome.genome_id"), nullable=False, index=True) release_id = Column(ForeignKey("ensembl_release.release_id"), index=True) - is_current = Column(TINYINT(1), nullable=False) + is_current = Column(TINYINT(1), nullable=False, default=0) + # One to many relationships # none # many to one relationships @@ -62,8 +64,7 @@ class GenomeRelease(Base): genome_release_id = Column(Integer, primary_key=True) genome_id = Column(ForeignKey("genome.genome_id"), nullable=False, index=True) release_id = Column(ForeignKey("ensembl_release.release_id"), nullable=False, index=True) - is_current = Column(TINYINT(1), nullable=False) - is_best = Column(TINYINT(1), nullable=False) + is_current = Column(TINYINT(1), nullable=False, default=0) # One to many relationships # none # many to one relationships diff --git a/src/ensembl/production/metadata/api/models/organism.py b/src/ensembl/production/metadata/api/models/organism.py index 187016b0..4ae85986 100644 --- a/src/ensembl/production/metadata/api/models/organism.py +++ b/src/ensembl/production/metadata/api/models/organism.py @@ -72,7 +72,7 @@ class OrganismGroupMember(Base): ) organism_group_member_id = Column(Integer, primary_key=True) - is_reference = Column(TINYINT(1), nullable=False) + is_reference = Column(TINYINT(1), nullable=False, default=0) order = Column(Integer, nullable=True) organism_id = Column(ForeignKey("organism.organism_id"), nullable=False) organism_group_id = Column(ForeignKey("organism_group.organism_group_id"), nullable=False, index=True) diff --git a/src/ensembl/production/metadata/api/models/release.py b/src/ensembl/production/metadata/api/models/release.py index 77f9a22b..1bbfbf1a 100644 --- a/src/ensembl/production/metadata/api/models/release.py +++ b/src/ensembl/production/metadata/api/models/release.py @@ -40,7 +40,7 @@ class EnsemblRelease(Base): version = Column(DECIMAL(10, 1), nullable=False) release_date = Column(Date, nullable=False) label = Column(String(64)) - is_current = Column(TINYINT(1), nullable=False) + is_current = Column(TINYINT(1), nullable=False, default=0) site_id = Column(ForeignKey('ensembl_site.site_id'), index=True) release_type = Column(String(16), nullable=False) # One to many relationships diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt index df30f6e6..75743857 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome.txt @@ -1,9 +1,9 @@ -1 a7335667-93e7-11ec-a39d-005056b38ce3 1 1 2023-05-12 13:30:58 -2 3704ceb1-948d-11ec-a39d-005056b38ce3 2 1 2023-05-12 13:32:06 -3 a73351f7-93e7-11ec-a39d-005056b38ce3 3 2 2023-05-12 13:32:14 -4 a73356e1-93e7-11ec-a39d-005056b38ce3 4 3 2023-05-12 13:32:25 -5 a73357ab-93e7-11ec-a39d-005056b38ce3 5 4 2023-05-12 13:32:36 -6 a733574a-93e7-11ec-a39d-005056b38ce3 6 5 2023-05-12 13:32:46 -7 a733550b-93e7-11ec-a39d-005056b38ce3 7 6 2023-05-12 13:32:52 -8 a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58 +1 a7335667-93e7-11ec-a39d-005056b38ce3 1 1 2023-05-12 13:30:58 0 +2 3704ceb1-948d-11ec-a39d-005056b38ce3 2 1 2023-05-12 13:32:06 0 +3 a73351f7-93e7-11ec-a39d-005056b38ce3 3 2 2023-05-12 13:32:14 0 +4 a73356e1-93e7-11ec-a39d-005056b38ce3 4 3 2023-05-12 13:32:25 0 +5 a73357ab-93e7-11ec-a39d-005056b38ce3 5 4 2023-05-12 13:32:36 0 +6 a733574a-93e7-11ec-a39d-005056b38ce3 6 5 2023-05-12 13:32:46 0 +7 a733550b-93e7-11ec-a39d-005056b38ce3 7 6 2023-05-12 13:32:52 0 +8 a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58 0 diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_release.txt b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_release.txt index fd772560..dcb5008c 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_release.txt +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/genome_release.txt @@ -1,7 +1,7 @@ -1 1 1 1 0 -2 2 1 0 0 -3 3 1 1 0 -4 4 1 1 0 -5 5 1 1 0 -6 6 1 1 0 -7 7 1 1 0 +1 1 1 1 +2 2 1 0 +3 3 1 1 +4 4 1 1 +5 5 1 1 +6 6 1 1 +7 7 1 1 diff --git a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql index 3c926d84..979f65f3 100644 --- a/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql +++ b/src/ensembl/production/metadata/api/sample/ensembl_metadata/table.sql @@ -12,7 +12,7 @@ CREATE TABLE assembly created datetime null, ensembl_name varchar(255) null, alt_accession varchar(16) null, - is_reference tinyint(1) not null, + is_reference tinyint(1) not null default 0, url_name varchar(128) null, constraint assembly_uuid @@ -32,7 +32,7 @@ CREATE TABLE assembly_sequence name varchar(128) null, assembly_id int not null, accession varchar(128) null, - chromosomal tinyint(1) not null, + chromosomal tinyint(1) not null default 0, length int not null, chromosome_rank int null, sequence_location varchar(10) null, @@ -141,7 +141,7 @@ CREATE TABLE ensembl_release version decimal(10, 1) not null, release_date date not null, label varchar(64) null, - is_current tinyint(1) not null, + is_current tinyint(1) not null default 0, site_id int null, release_type varchar(16) not null, constraint ensembl_release_version_site_id_b743399a_uniq @@ -176,7 +176,9 @@ CREATE TABLE genome genome_uuid varchar(128) not null, assembly_id int not null, organism_id int not null, - created datetime(6) not null, + created datetime(6) not null, + is_best tinyint(1) not null default 0, + constraint genome_uuid unique (genome_uuid), constraint genome_assembly_id_0a748388_fk_assembly_assembly_id @@ -192,7 +194,7 @@ CREATE TABLE genome_dataset dataset_id int not null, genome_id int not null, release_id int null, - is_current tinyint(1) not null, + is_current tinyint(1) not null default 0, constraint ensembl_metadata_gen_dataset_id_26d7bac7_fk_dataset_d foreign key (dataset_id) references dataset (dataset_id), constraint ensembl_metadata_gen_genome_id_7670a2c5_fk_genome_ge @@ -207,8 +209,7 @@ CREATE TABLE genome_release primary key, genome_id int not null, release_id int not null, - is_current tinyint(1) not null, - is_best tinyint(1) not null, + is_current tinyint(1) not null default 0, constraint genome_release_genome_id_3e45dc04_fk foreign key (genome_id) references genome (genome_id), constraint genome_release_release_id_bca7e1e5_fk_ensembl_release_release_id @@ -231,7 +232,7 @@ CREATE TABLE organism_group CREATE TABLE `organism_group_member` ( `organism_group_member_id` int NOT NULL AUTO_INCREMENT, - `is_reference` tinyint(1) DEFAULT NULL, + `is_reference` tinyint(1) NOT NULL DEFAULT 0, `organism_id` int NOT NULL, `organism_group_id` int NOT NULL, `order` int DEFAULT NULL, diff --git a/src/ensembl/production/metadata/updater/core.py b/src/ensembl/production/metadata/updater/core.py index c9624bac..f2b36577 100644 --- a/src/ensembl/production/metadata/updater/core.py +++ b/src/ensembl/production/metadata/updater/core.py @@ -23,7 +23,7 @@ from ensembl.production.metadata.api.dataset import DatasetAdaptor from ensembl.production.metadata.api.models import * from ensembl.production.metadata.updater.base import BaseMetaUpdater - +import logging class CoreMetaUpdater(BaseMetaUpdater): def __init__(self, db_uri, metadata_uri, taxonomy_uri): @@ -73,7 +73,6 @@ def process_core(self, **kwargs): self.process_species(species, metadata_uri, taxonomy_uri, db_uri) def process_species(self, species_id, metadata_uri, taxonomy_uri, db_uri): - print (db_uri) """ Process an individual species from a core database to update the metadata db. This method contains the logic for updating the metadata @@ -91,17 +90,20 @@ def process_species(self, species_id, metadata_uri, taxonomy_uri, db_uri): genebuild_release_status = conn.check_release_status(self.genebuild_dataset.dataset_uuid) if organism_status == "New": - print ("New organism") + logging.info('New organism') # ###############################Checks that dataset and assembly are new ################## if assembly_status != "New" or genebuild_status != "New": raise Exception("New organism, but existing assembly accession and/or genebuild version") ############################################### # Create genome and populate the database with organism, assembly and dataset new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, - self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) + self.organism, + self.assembly, + self.assembly_dataset, + self.genebuild_dataset) elif assembly_status == "New": - print ("New assembly") + logging.info('New assembly') # ###############################Checks that dataset and update are new ################## if genebuild_status != "New": @@ -109,22 +111,28 @@ def process_species(self, species_id, metadata_uri, taxonomy_uri, db_uri): ############################################### new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, - self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) + self.organism, + self.assembly, + self.assembly_dataset, + self.genebuild_dataset) # Create genome and populate the database with assembly and dataset elif genebuild_status == "New": - print ("New genebuild") + logging.info('New genebuild') # Create genome and populate the database with genebuild dataset new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session, - self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset) + self.organism, + self.assembly, + self.assembly_dataset, + self.genebuild_dataset) else: # Check if the data has been released: if genebuild_release_status is True: raise Exception("Existing Organism, Assembly, and Datasets within a release") else: - print("Rewrite") - #Need to do a rewrite, so that it only redoes the geneset data. + logging.info('Rewrite of existing data') + # Need to do a rewrite, so that it only redoes the geneset data. # Delete the data from the database and repopulate assembly and genebuild. genome_dataset = meta_session.query(GenomeDataset).join(Dataset).filter( @@ -142,13 +150,13 @@ def process_species(self, species_id, metadata_uri, taxonomy_uri, db_uri): ) meta_session.add(genebuild_genome_dataset) - def new_genome(self, meta_session, organism, assembly, assembly_dataset, genebuild_dataset): new_genome = Genome( genome_uuid=str(uuid.uuid4()), assembly=assembly, organism=organism, created=func.now(), + is_best=0, ) meta_session.add(new_genome) assembly_genome_dataset = GenomeDataset( @@ -191,8 +199,8 @@ def get_or_new_organism(self, species_id, meta_session, metadata_uri, taxonomy_u scientific_name=self.get_meta_single_meta_key(species_id, "species.scientific_name"), ensembl_name=ensembl_name, strain=self.get_meta_single_meta_key(species_id, "species.strain"), - strain_type = self.get_meta_single_meta_key(species_id, "strain.type"), - # + strain_type=self.get_meta_single_meta_key(species_id, "strain.type"), + scientific_parlance_name=self.get_meta_single_meta_key(species_id, "species.parlance_name") ) # Query the metadata database to find if an Organism with the same Ensembl name already exists. @@ -206,6 +214,7 @@ def get_or_new_organism(self, species_id, meta_session, metadata_uri, taxonomy_u organism_group_member = meta_session.query(OrganismGroupMember).filter( OrganismGroupMember.organism_id == old_organism.organism_id, OrganismGroupMember.organism_group_id == division.organism_group_id).one_or_none() + return old_organism, division, organism_group_member, "Existing" else: # If no existing Organism is found, conduct additional checks before creating a new one. @@ -276,10 +285,10 @@ def get_assembly_sequences(self, species_id, assembly): attribute_dict[name] = {} attribute_dict[name][code] = value - # Create a dictionary so that the results can have multiple synonyms per line and only one SeqRegion accession_info = defaultdict( - lambda: {"names": set(), "accession": None, "length": None, "location": None, "chromosomal": None, "karyotype_rank":None}) + lambda: {"names": set(), "accession": None, "length": None, "location": None, "chromosomal": None, + "karyotype_rank": None}) for seq_region_name, seq_region_length, coord_system_name, synonym in results: accession_info[seq_region_name]["names"].add(seq_region_name) @@ -341,9 +350,6 @@ def get_assembly_sequences(self, species_id, assembly): accession = matching_accessions[0] if matching_accessions else accession name = preferred_name - # Combine all unique names with ";". If a name appears in multiple sequences with the same accession, - # name = ";".join(info["names"]) - # Create an AssemblySequence object. assembly_sequence = AssemblySequence( name=name, assembly=assembly, @@ -387,7 +393,7 @@ def get_or_new_assembly(self, species_id, meta_session, db_uri, source=None): # Leaving it until told otherwise. level = (session.execute(db.select(CoordSystem.name).filter( CoordSystem.species_id == species_id).order_by(CoordSystem.rank)).all())[0][0] - tol_id=self.get_meta_single_meta_key(species_id, "assembly.tol_id") + tol_id = self.get_meta_single_meta_key(species_id, "assembly.tol_id") if tol_id is None: tol_id = self.get_meta_single_meta_key(species_id, "assembly.tolid") @@ -450,8 +456,9 @@ def new_genebuild(self, species_id, meta_session, db_uri, source=None): # The assembly accession and genebuild version are extracted from the metadata of the species assembly_accession = self.get_meta_single_meta_key(species_id, "assembly.accession") genebuild_version = self.get_meta_single_meta_key(species_id, "genebuild.version") - - #Test if sample_gene is present. + if genebuild_version is None: + raise Exception(f"genebuild.version is required in the core database") + # Test if sample_gene is present. # The genebuild accession is formed by combining the assembly accession and the genebuild version genebuild_accession = assembly_accession + "_" + genebuild_version @@ -486,7 +493,12 @@ def new_genebuild(self, species_id, meta_session, db_uri, source=None): for attribute, value in attributes.items(): meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none() if meta_attribute is None: - raise Exception(f"Attribute {attribute} not found. Please enter it into the db manually") + meta_attribute = Attribute( + name=attribute, + label=attribute, + description=attribute, + type="string", + ) dataset_attribute = DatasetAttribute( value=value, dataset=genebuild_dataset, @@ -494,18 +506,18 @@ def new_genebuild(self, species_id, meta_session, db_uri, source=None): ) genebuild_dataset_attributes.append(dataset_attribute) - #Grab the necessary sample data and add it as an datasetattribute + # Grab the necessary sample data and add it as an datasetattribute sample_gene_param = DatasetAttribute( - value=self.get_meta_single_meta_key(species_id, "sample.gene_param"), - dataset=genebuild_dataset, - attribute=meta_session.query(Attribute).filter(Attribute.name == "sample.gene_param").one_or_none(), - ) + value=self.get_meta_single_meta_key(species_id, "sample.gene_param"), + dataset=genebuild_dataset, + attribute=meta_session.query(Attribute).filter(Attribute.name == "sample.gene_param").one_or_none(), + ) genebuild_dataset_attributes.append(sample_gene_param) sample_location_param = DatasetAttribute( - value=self.get_meta_single_meta_key(species_id, "sample.location_param"), - dataset=genebuild_dataset, - attribute=meta_session.query(Attribute).filter(Attribute.name == "sample.location_param").one_or_none(), - ) + value=self.get_meta_single_meta_key(species_id, "sample.location_param"), + dataset=genebuild_dataset, + attribute=meta_session.query(Attribute).filter(Attribute.name == "sample.location_param").one_or_none(), + ) genebuild_dataset_attributes.append(sample_location_param) # Check if the genebuild dataset with the given label already exists