Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/ensembl/production/metadata/api/models/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class Assembly(Base):
created = Column(DateTime)
ensembl_name = Column(String(255), unique=True)
alt_accession = Column(String(16), nullable=True)
is_reference = Column(TINYINT(1), nullable=False)
is_reference = Column(TINYINT(1), nullable=False, default=0)
url_name = Column(String(128), nullable=False)
# One to many relationships
# assembly_id within assembly_sequence
Expand All @@ -52,7 +52,7 @@ class AssemblySequence(Base):
name = Column(String(128), unique=True)
assembly_id = Column(ForeignKey('assembly.assembly_id'), nullable=False, index=True)
accession = Column(String(128), nullable=False)
chromosomal = Column(TINYINT(1), nullable=False)
chromosomal = Column(TINYINT(1), nullable=False, default=0)
chromosome_rank = Column(Integer)
length = Column(Integer, nullable=False)
sequence_location = Column(String(10))
Expand Down
7 changes: 4 additions & 3 deletions src/ensembl/production/metadata/api/models/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class Genome(Base):
assembly_id = Column(ForeignKey("assembly.assembly_id"), nullable=False, index=True)
organism_id = Column(ForeignKey("organism.organism_id"), nullable=False, index=True)
created = Column(DATETIME(fsp=6), nullable=False)
is_best = Column(TINYINT(1), nullable=False, default=0)
# One to many relationships
# genome_id to genome_dataset and genome release
genome_datasets = relationship("GenomeDataset", back_populates="genome", cascade="all, delete, delete-orphan")
Expand All @@ -44,7 +45,8 @@ class GenomeDataset(Base):
dataset_id = Column(ForeignKey("dataset.dataset_id"), nullable=False, index=True)
genome_id = Column(ForeignKey("genome.genome_id"), nullable=False, index=True)
release_id = Column(ForeignKey("ensembl_release.release_id"), index=True)
is_current = Column(TINYINT(1), nullable=False)
is_current = Column(TINYINT(1), nullable=False, default=0)

# One to many relationships
# none
# many to one relationships
Expand All @@ -62,8 +64,7 @@ class GenomeRelease(Base):
genome_release_id = Column(Integer, primary_key=True)
genome_id = Column(ForeignKey("genome.genome_id"), nullable=False, index=True)
release_id = Column(ForeignKey("ensembl_release.release_id"), nullable=False, index=True)
is_current = Column(TINYINT(1), nullable=False)
is_best = Column(TINYINT(1), nullable=False)
is_current = Column(TINYINT(1), nullable=False, default=0)
# One to many relationships
# none
# many to one relationships
Expand Down
2 changes: 1 addition & 1 deletion src/ensembl/production/metadata/api/models/organism.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class OrganismGroupMember(Base):
)

organism_group_member_id = Column(Integer, primary_key=True)
is_reference = Column(TINYINT(1), nullable=False)
is_reference = Column(TINYINT(1), nullable=False, default=0)
order = Column(Integer, nullable=True)
organism_id = Column(ForeignKey("organism.organism_id"), nullable=False)
organism_group_id = Column(ForeignKey("organism_group.organism_group_id"), nullable=False, index=True)
Expand Down
2 changes: 1 addition & 1 deletion src/ensembl/production/metadata/api/models/release.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class EnsemblRelease(Base):
version = Column(DECIMAL(10, 1), nullable=False)
release_date = Column(Date, nullable=False)
label = Column(String(64))
is_current = Column(TINYINT(1), nullable=False)
is_current = Column(TINYINT(1), nullable=False, default=0)
site_id = Column(ForeignKey('ensembl_site.site_id'), index=True)
release_type = Column(String(16), nullable=False)
# One to many relationships
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
1 a7335667-93e7-11ec-a39d-005056b38ce3 1 1 2023-05-12 13:30:58
2 3704ceb1-948d-11ec-a39d-005056b38ce3 2 1 2023-05-12 13:32:06
3 a73351f7-93e7-11ec-a39d-005056b38ce3 3 2 2023-05-12 13:32:14
4 a73356e1-93e7-11ec-a39d-005056b38ce3 4 3 2023-05-12 13:32:25
5 a73357ab-93e7-11ec-a39d-005056b38ce3 5 4 2023-05-12 13:32:36
6 a733574a-93e7-11ec-a39d-005056b38ce3 6 5 2023-05-12 13:32:46
7 a733550b-93e7-11ec-a39d-005056b38ce3 7 6 2023-05-12 13:32:52
8 a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58
1 a7335667-93e7-11ec-a39d-005056b38ce3 1 1 2023-05-12 13:30:58 0
2 3704ceb1-948d-11ec-a39d-005056b38ce3 2 1 2023-05-12 13:32:06 0
3 a73351f7-93e7-11ec-a39d-005056b38ce3 3 2 2023-05-12 13:32:14 0
4 a73356e1-93e7-11ec-a39d-005056b38ce3 4 3 2023-05-12 13:32:25 0
5 a73357ab-93e7-11ec-a39d-005056b38ce3 5 4 2023-05-12 13:32:36 0
6 a733574a-93e7-11ec-a39d-005056b38ce3 6 5 2023-05-12 13:32:46 0
7 a733550b-93e7-11ec-a39d-005056b38ce3 7 6 2023-05-12 13:32:52 0
8 a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58 0

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
1 1 1 1 0
2 2 1 0 0
3 3 1 1 0
4 4 1 1 0
5 5 1 1 0
6 6 1 1 0
7 7 1 1 0
1 1 1 1
2 2 1 0
3 3 1 1
4 4 1 1
5 5 1 1
6 6 1 1
7 7 1 1
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ CREATE TABLE assembly
created datetime null,
ensembl_name varchar(255) null,
alt_accession varchar(16) null,
is_reference tinyint(1) not null,
is_reference tinyint(1) not null default 0,
url_name varchar(128) null,

constraint assembly_uuid
Expand All @@ -32,7 +32,7 @@ CREATE TABLE assembly_sequence
name varchar(128) null,
assembly_id int not null,
accession varchar(128) null,
chromosomal tinyint(1) not null,
chromosomal tinyint(1) not null default 0,
length int not null,
chromosome_rank int null,
sequence_location varchar(10) null,
Expand Down Expand Up @@ -141,7 +141,7 @@ CREATE TABLE ensembl_release
version decimal(10, 1) not null,
release_date date not null,
label varchar(64) null,
is_current tinyint(1) not null,
is_current tinyint(1) not null default 0,
site_id int null,
release_type varchar(16) not null,
constraint ensembl_release_version_site_id_b743399a_uniq
Expand Down Expand Up @@ -176,7 +176,9 @@ CREATE TABLE genome
genome_uuid varchar(128) not null,
assembly_id int not null,
organism_id int not null,
created datetime(6) not null,
created datetime(6) not null,
is_best tinyint(1) not null default 0,

constraint genome_uuid
unique (genome_uuid),
constraint genome_assembly_id_0a748388_fk_assembly_assembly_id
Expand All @@ -192,7 +194,7 @@ CREATE TABLE genome_dataset
dataset_id int not null,
genome_id int not null,
release_id int null,
is_current tinyint(1) not null,
is_current tinyint(1) not null default 0,
constraint ensembl_metadata_gen_dataset_id_26d7bac7_fk_dataset_d
foreign key (dataset_id) references dataset (dataset_id),
constraint ensembl_metadata_gen_genome_id_7670a2c5_fk_genome_ge
Expand All @@ -207,8 +209,7 @@ CREATE TABLE genome_release
primary key,
genome_id int not null,
release_id int not null,
is_current tinyint(1) not null,
is_best tinyint(1) not null,
is_current tinyint(1) not null default 0,
constraint genome_release_genome_id_3e45dc04_fk
foreign key (genome_id) references genome (genome_id),
constraint genome_release_release_id_bca7e1e5_fk_ensembl_release_release_id
Expand All @@ -231,7 +232,7 @@ CREATE TABLE organism_group
CREATE TABLE `organism_group_member`
(
`organism_group_member_id` int NOT NULL AUTO_INCREMENT,
`is_reference` tinyint(1) DEFAULT NULL,
`is_reference` tinyint(1) NOT NULL DEFAULT 0,
`organism_id` int NOT NULL,
`organism_group_id` int NOT NULL,
`order` int DEFAULT NULL,
Expand Down
74 changes: 43 additions & 31 deletions src/ensembl/production/metadata/updater/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from ensembl.production.metadata.api.dataset import DatasetAdaptor
from ensembl.production.metadata.api.models import *
from ensembl.production.metadata.updater.base import BaseMetaUpdater

import logging

class CoreMetaUpdater(BaseMetaUpdater):
def __init__(self, db_uri, metadata_uri, taxonomy_uri):
Expand Down Expand Up @@ -73,7 +73,6 @@ def process_core(self, **kwargs):
self.process_species(species, metadata_uri, taxonomy_uri, db_uri)

def process_species(self, species_id, metadata_uri, taxonomy_uri, db_uri):
print (db_uri)
"""
Process an individual species from a core database to update the metadata db.
This method contains the logic for updating the metadata
Expand All @@ -91,40 +90,49 @@ def process_species(self, species_id, metadata_uri, taxonomy_uri, db_uri):
genebuild_release_status = conn.check_release_status(self.genebuild_dataset.dataset_uuid)

if organism_status == "New":
print ("New organism")
logging.info('New organism')
# ###############################Checks that dataset and assembly are new ##################
if assembly_status != "New" or genebuild_status != "New":
raise Exception("New organism, but existing assembly accession and/or genebuild version")
###############################################
# Create genome and populate the database with organism, assembly and dataset
new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session,
self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset)
self.organism,
self.assembly,
self.assembly_dataset,
self.genebuild_dataset)

elif assembly_status == "New":
print ("New assembly")
logging.info('New assembly')

# ###############################Checks that dataset and update are new ##################
if genebuild_status != "New":
raise Exception("New assembly, but existing genebuild version")
###############################################

new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session,
self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset)
self.organism,
self.assembly,
self.assembly_dataset,
self.genebuild_dataset)

# Create genome and populate the database with assembly and dataset
elif genebuild_status == "New":
print ("New genebuild")
logging.info('New genebuild')

# Create genome and populate the database with genebuild dataset
new_genome, assembly_genome_dataset, genebuild_genome_dataset = self.new_genome(meta_session,
self.organism, self.assembly, self.assembly_dataset, self.genebuild_dataset)
self.organism,
self.assembly,
self.assembly_dataset,
self.genebuild_dataset)
else:
# Check if the data has been released:
if genebuild_release_status is True:
raise Exception("Existing Organism, Assembly, and Datasets within a release")
else:
print("Rewrite")
#Need to do a rewrite, so that it only redoes the geneset data.
logging.info('Rewrite of existing data')
# Need to do a rewrite, so that it only redoes the geneset data.

# Delete the data from the database and repopulate assembly and genebuild.
genome_dataset = meta_session.query(GenomeDataset).join(Dataset).filter(
Expand All @@ -142,13 +150,13 @@ def process_species(self, species_id, metadata_uri, taxonomy_uri, db_uri):
)
meta_session.add(genebuild_genome_dataset)


def new_genome(self, meta_session, organism, assembly, assembly_dataset, genebuild_dataset):
new_genome = Genome(
genome_uuid=str(uuid.uuid4()),
assembly=assembly,
organism=organism,
created=func.now(),
is_best=0,
)
meta_session.add(new_genome)
assembly_genome_dataset = GenomeDataset(
Expand Down Expand Up @@ -191,8 +199,8 @@ def get_or_new_organism(self, species_id, meta_session, metadata_uri, taxonomy_u
scientific_name=self.get_meta_single_meta_key(species_id, "species.scientific_name"),
ensembl_name=ensembl_name,
strain=self.get_meta_single_meta_key(species_id, "species.strain"),
strain_type = self.get_meta_single_meta_key(species_id, "strain.type"),
#
strain_type=self.get_meta_single_meta_key(species_id, "strain.type"),
scientific_parlance_name=self.get_meta_single_meta_key(species_id, "species.parlance_name")
)

# Query the metadata database to find if an Organism with the same Ensembl name already exists.
Expand All @@ -206,6 +214,7 @@ def get_or_new_organism(self, species_id, meta_session, metadata_uri, taxonomy_u
organism_group_member = meta_session.query(OrganismGroupMember).filter(
OrganismGroupMember.organism_id == old_organism.organism_id,
OrganismGroupMember.organism_group_id == division.organism_group_id).one_or_none()

return old_organism, division, organism_group_member, "Existing"
else:
# If no existing Organism is found, conduct additional checks before creating a new one.
Expand Down Expand Up @@ -276,10 +285,10 @@ def get_assembly_sequences(self, species_id, assembly):
attribute_dict[name] = {}
attribute_dict[name][code] = value


# Create a dictionary so that the results can have multiple synonyms per line and only one SeqRegion
accession_info = defaultdict(
lambda: {"names": set(), "accession": None, "length": None, "location": None, "chromosomal": None, "karyotype_rank":None})
lambda: {"names": set(), "accession": None, "length": None, "location": None, "chromosomal": None,
"karyotype_rank": None})

for seq_region_name, seq_region_length, coord_system_name, synonym in results:
accession_info[seq_region_name]["names"].add(seq_region_name)
Expand Down Expand Up @@ -341,9 +350,6 @@ def get_assembly_sequences(self, species_id, assembly):
accession = matching_accessions[0] if matching_accessions else accession
name = preferred_name

# Combine all unique names with ";". If a name appears in multiple sequences with the same accession,
# name = ";".join(info["names"])
# Create an AssemblySequence object.
assembly_sequence = AssemblySequence(
name=name,
assembly=assembly,
Expand Down Expand Up @@ -387,7 +393,7 @@ def get_or_new_assembly(self, species_id, meta_session, db_uri, source=None):
# Leaving it until told otherwise.
level = (session.execute(db.select(CoordSystem.name).filter(
CoordSystem.species_id == species_id).order_by(CoordSystem.rank)).all())[0][0]
tol_id=self.get_meta_single_meta_key(species_id, "assembly.tol_id")
tol_id = self.get_meta_single_meta_key(species_id, "assembly.tol_id")
if tol_id is None:
tol_id = self.get_meta_single_meta_key(species_id, "assembly.tolid")

Expand Down Expand Up @@ -450,8 +456,9 @@ def new_genebuild(self, species_id, meta_session, db_uri, source=None):
# The assembly accession and genebuild version are extracted from the metadata of the species
assembly_accession = self.get_meta_single_meta_key(species_id, "assembly.accession")
genebuild_version = self.get_meta_single_meta_key(species_id, "genebuild.version")

#Test if sample_gene is present.
if genebuild_version is None:
raise Exception(f"genebuild.version is required in the core database")
# Test if sample_gene is present.

# The genebuild accession is formed by combining the assembly accession and the genebuild version
genebuild_accession = assembly_accession + "_" + genebuild_version
Expand Down Expand Up @@ -486,26 +493,31 @@ def new_genebuild(self, species_id, meta_session, db_uri, source=None):
for attribute, value in attributes.items():
meta_attribute = meta_session.query(Attribute).filter(Attribute.name == attribute).one_or_none()
if meta_attribute is None:
raise Exception(f"Attribute {attribute} not found. Please enter it into the db manually")
meta_attribute = Attribute(
name=attribute,
label=attribute,
description=attribute,
type="string",
)
dataset_attribute = DatasetAttribute(
value=value,
dataset=genebuild_dataset,
attribute=meta_attribute,
)
genebuild_dataset_attributes.append(dataset_attribute)

#Grab the necessary sample data and add it as an datasetattribute
# Grab the necessary sample data and add it as a DatasetAttribute
sample_gene_param = DatasetAttribute(
value=self.get_meta_single_meta_key(species_id, "sample.gene_param"),
dataset=genebuild_dataset,
attribute=meta_session.query(Attribute).filter(Attribute.name == "sample.gene_param").one_or_none(),
)
value=self.get_meta_single_meta_key(species_id, "sample.gene_param"),
dataset=genebuild_dataset,
attribute=meta_session.query(Attribute).filter(Attribute.name == "sample.gene_param").one_or_none(),
)
genebuild_dataset_attributes.append(sample_gene_param)
sample_location_param = DatasetAttribute(
value=self.get_meta_single_meta_key(species_id, "sample.location_param"),
dataset=genebuild_dataset,
attribute=meta_session.query(Attribute).filter(Attribute.name == "sample.location_param").one_or_none(),
)
value=self.get_meta_single_meta_key(species_id, "sample.location_param"),
dataset=genebuild_dataset,
attribute=meta_session.query(Attribute).filter(Attribute.name == "sample.location_param").one_or_none(),
)
genebuild_dataset_attributes.append(sample_location_param)

# Check if the genebuild dataset with the given label already exists
Expand Down