diff --git a/src/ensembl/production/metadata/api/factories/datasets.py b/src/ensembl/production/metadata/api/factories/datasets.py
index bd1fc5aa..93e8c5bf 100644
--- a/src/ensembl/production/metadata/api/factories/datasets.py
+++ b/src/ensembl/production/metadata/api/factories/datasets.py
@@ -109,7 +109,7 @@ def create_dataset(self, session, genome_input, dataset_source, dataset_type, da
             new_genome_dataset = GenomeDataset(
                 genome=genome,
                 dataset=new_dataset,
-                is_current=False,
+                is_current=is_current,
             )
             if release is not None:
                 if isinstance(release, str):
@@ -117,6 +117,7 @@ def create_dataset(self, session, genome_input, dataset_source, dataset_type, da
                 logger.debug(f"Attaching {new_dataset.dataset_uuid} to release {release.release_id}")
                 new_genome_dataset.release_id = release.release_id
             session.add(new_genome_dataset)
+            session.commit()
             return dataset_uuid, new_dataset, new_dataset_attributes, new_genome_dataset
         else:
             return dataset_uuid, new_dataset, new_dataset_attributes, None
@@ -430,3 +431,4 @@ def __query_genomes_by_status_and_type(self, session, status, dataset_type):
         # Execute query and fetch results
         results = query
         return results
+
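Reviewer note (not part of the patch): with this change create_dataset honours the
caller-supplied is_current flag instead of hard-coding False, and commits the new
GenomeDataset once a release is attached. A minimal call sketch, assuming the keyword
signature visible in the hunk context above (variable names are illustrative):

    dataset_uuid, new_dataset, new_attrs, new_gd = dataset_factory.create_dataset(
        session=session,
        genome_input=genome_uuid,
        dataset_source=dataset_source,
        dataset_type=dataset_type,
        dataset_attributes=dataset_attributes,
        name=name,
        label=label,
        version=version,
        status="Submitted",
        source_type=source_type,
        release=release,
        is_current=True,  # now stored on the GenomeDataset row instead of False
    )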
diff --git a/src/ensembl/production/metadata/scripts/copy_handover_files.py b/src/ensembl/production/metadata/scripts/copy_handover_files.py
new file mode 100644
index 00000000..07c18dc1
--- /dev/null
+++ b/src/ensembl/production/metadata/scripts/copy_handover_files.py
@@ -0,0 +1,128 @@
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import logging
+import os
+import shutil
+import sys
+from pathlib import Path
+
+
+# Configure root logger
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)  # Set minimum level for all handlers
+logger.handlers.clear()
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s"
+)
+
+# ERROR log handler
+error_file_handler = logging.FileHandler("error.log")
+error_file_handler.setLevel(logging.ERROR)
+error_file_handler.setFormatter(formatter)
+logger.addHandler(error_file_handler)
+
+# Console handler (shows INFO and above)
+console_handler = logging.StreamHandler(sys.stdout)
+console_handler.setLevel(logging.INFO)
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+
+logger = logging.getLogger(__name__)
+
+
+def check_directory(paths: str) -> list:
+    """Split a comma-separated list of directories and check that each one exists."""
+    paths = [p.strip() for p in paths.split(',')]
+    for path in paths:
+        if not os.path.isdir(path):
+            raise argparse.ArgumentTypeError(f"The directory '{path}' does not exist.")
+    return paths
+
+
+def main(json_input, release_id, destinations, rename_files=None, status="Submitted"):
+    # release_id, rename_files and status are accepted for interface parity but not used yet.
+    try:
+        with open(json_input, 'r') as f:
+            data = json.load(f)
+    except Exception as e:
+        logger.error(e)
+        raise
+    try:
+        for item in data:
+            genome_uuid = item["genome_uuid"]
+            dataset_source = item["dataset_source"]["name"]
+            source_type = item["dataset_source"]["type"]
+            dataset_type = item["dataset_type"]
+            dataset_attributes = {attr["name"]: attr["value"] for attr in item["dataset_attribute"]}
+            name = item["name"]
+            label = item["label"]
+            version = item.get("version", None)
+            source = Path(item["dataset_source"]["name"])
+            for destination in destinations:
+                # Copy each handed-over file into <destination>/<genome_uuid>/
+                dest_dir = Path(destination) / genome_uuid
+                dest_dir.mkdir(parents=True, exist_ok=True)
+                if name == "regulatory_features":
+                    dest_dir = dest_dir / f"regulatory-features{source.suffix}"
+                shutil.copy2(source, dest_dir)
+
+                logger.info(f"Copied files from {source} to {dest_dir}.")
+    except Exception as e:
+        logger.error("An error occurred:")
+        logger.error(e)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="File handover script."
+    )
+    parser.add_argument(
+        "--release_id",
+        type=str,
+        required=True,
+        help="Release id.",
+    )
+    parser.add_argument(
+        "--rename_files",
+        type=str,
+        required=False,
+        help="Rename files to the provided string.",
+    )
+    parser.add_argument(
+        "--dataset_type",
+        type=str,
+        required=True,
+        help="Fetch dataset based on dataset type, e.g. genebuild",
+    )
+    parser.add_argument(
+        "--json_file_path",
+        type=str,
+        required=True,
+        help="Path to the JSON file handed over by teams.",
+    )
+    parser.add_argument(
+        "--dest_dirs",
+        type=check_directory,
+        required=True,
+        help="Datafiles destination directory(s). Separate multiple directories with ',', e.g. dir1,dir2",
+    )
+
+    ARGS = parser.parse_args()
+    logger.info(f"Provided arguments: {ARGS}")
+
+    main(json_input=ARGS.json_file_path, release_id=ARGS.release_id, destinations=ARGS.dest_dirs,
+         rename_files=ARGS.rename_files)
+
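Reviewer note (not part of the patch): both scripts read the same handover JSON shape.
A minimal input sketch inferred from the keys accessed above (all values are illustrative):

    [
      {
        "genome_uuid": "00000000-0000-0000-0000-000000000000",
        "name": "regulatory_features",
        "label": "Regulatory features",
        "version": null,
        "dataset_type": "regulation",
        "dataset_source": {"name": "/path/to/handover/file.bb", "type": "file"},
        "dataset_attribute": [{"name": "example_attribute", "value": "example_value"}]
      }
    ]

An invocation sketch for the script above, with placeholder paths:

    python copy_handover_files.py --release_id 5 --dataset_type regulation \
        --json_file_path handover.json --dest_dirs /nfs/dest1,/nfs/dest2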
diff --git a/src/ensembl/production/metadata/scripts/create_datasets_json.py b/src/ensembl/production/metadata/scripts/create_datasets_json.py
index d2cfedda..51bc1f3d 100644
--- a/src/ensembl/production/metadata/scripts/create_datasets_json.py
+++ b/src/ensembl/production/metadata/scripts/create_datasets_json.py
@@ -9,69 +9,182 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
+import argparse
 import json
+import logging
+import os
+import shutil
+import sys
+from pathlib import Path
 from ensembl.utils.database import DBConnection
 from ensembl.production.metadata.api.factories.datasets import DatasetFactory
-from ensembl.production.metadata.api.models import EnsemblRelease
-
-
-def main(json_input, release_id, conn_uri, status="Submitted"):
-    with open(json_input, 'r') as f:
-        data = json.load(f)
-    metadata_db = DBConnection(conn_uri)
-
-    with metadata_db.session_scope() as session:
-        for item in data:
-            genome_uuid = item["genome_uuid"]
-            dataset_source = item["dataset_source"]["name"]
-            source_type = item["dataset_source"]["type"]
-            dataset_type = item["dataset_type"]
-            dataset_attributes = {attr["name"]: attr["value"] for attr in item["dataset_attribute"]}
-            name = item["name"]
-            label = item["label"]
-            version = item.get("version", None)
-            dataset_factory = DatasetFactory(conn_uri)
-            release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == release_id).one_or_none()
-            # Create the main dataset
-            dataset_uuid, new_dataset, new_dataset_attributes, new_genome_dataset = dataset_factory.create_dataset(
-                session=session,
-                genome_input=genome_uuid,
-                dataset_source=dataset_source,
-                dataset_type=dataset_type,
-                dataset_attributes=dataset_attributes,
-                name=name,
-                label=label,
-                version=version,
-                status=status,
-                source_type=source_type,
-                release=release,
-                is_current=True
-            )
-            session.commit()
-            # Populate child datasets
-            children = dataset_factory.create_all_child_datasets(
-                dataset_uuid=dataset_uuid,
-                session=session,
-                topic="production_process",
-                status=None,
-                release=release
-            )
-            session.commit()
-
-            print(f"Created dataset UUID: {dataset_uuid} with children")
+
+from ensembl.production.metadata.api.models import Dataset, Genome, GenomeDataset, EnsemblRelease, Attribute, \
+    DatasetAttribute
+
+# Configure root logger
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)  # Set minimum level for all handlers
+logger.handlers.clear()
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s"
+)
+
+# ERROR log handler
+error_file_handler = logging.FileHandler("error.log")
+error_file_handler.setLevel(logging.ERROR)
+error_file_handler.setFormatter(formatter)
+logger.addHandler(error_file_handler)
+
+# Console handler (shows INFO and above)
+console_handler = logging.StreamHandler(sys.stdout)
+console_handler.setLevel(logging.INFO)
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+
+logger = logging.getLogger(__name__)
+
+
+def check_directory(path: str) -> str:
+    if not os.path.isdir(path):
+        raise argparse.ArgumentTypeError(f"The directory '{path}' does not exist.")
+    return path
+
+
+def main(json_input, release_id, conn_uri, destination, status="Submitted"):
+    try:
+        with open(json_input, 'r') as f:
+            data = json.load(f)
+        metadata_db = DBConnection(conn_uri)
+    except Exception as e:
+        logger.error(e)
+        raise
+    try:
+        with metadata_db.session_scope() as session:
+            for item in data:
+                genome_uuid = item["genome_uuid"]
+                dataset_source = item["dataset_source"]["name"]
+                source_type = item["dataset_source"]["type"]
+                dataset_type = item["dataset_type"]
+                dataset_attributes = {attr["name"]: attr["value"] for attr in item["dataset_attribute"]}
+                name = item["name"]
+                label = item["label"]
+                version = item.get("version", None)
+                dataset_factory = DatasetFactory(conn_uri)
+                release = session.query(EnsemblRelease).filter(EnsemblRelease.release_id == release_id).one_or_none()
+
+                # Retire the currently flagged datasets (and their children) for this
+                # genome before creating the replacement dataset.
+                old_genome_datasets = session.query(GenomeDataset) \
+                    .join(Genome, GenomeDataset.genome_id == Genome.genome_id) \
+                    .join(Dataset, GenomeDataset.dataset_id == Dataset.dataset_id) \
+                    .join(DatasetAttribute, DatasetAttribute.dataset_id == Dataset.dataset_id) \
+                    .join(Attribute, DatasetAttribute.attribute_id == Attribute.attribute_id) \
+                    .filter(Genome.genome_uuid == genome_uuid) \
+                    .filter(Attribute.name.in_(list(dataset_attributes.keys()))) \
+                    .filter(GenomeDataset.is_current == 1) \
+                    .all()
+
+                for old_genome_dataset in old_genome_datasets:
+                    children = session.query(GenomeDataset) \
+                        .join(Dataset, Dataset.dataset_id == GenomeDataset.dataset_id) \
+                        .filter(Dataset.parent_id == old_genome_dataset.dataset_id) \
+                        .filter(GenomeDataset.is_current == 1) \
+                        .all()
+                    for child in children:
+                        # Grandchildren datasets are retired as well.
+                        child_children = session.query(GenomeDataset) \
+                            .join(Dataset, Dataset.dataset_id == GenomeDataset.dataset_id) \
+                            .filter(Dataset.parent_id == child.dataset_id) \
+                            .filter(GenomeDataset.is_current == 1) \
+                            .all()
+                        child.is_current = 0
+                        for grandchild in child_children:
+                            grandchild.is_current = 0
+                    old_genome_dataset.is_current = 0
+
+                # Create the main dataset
+                dataset_uuid, new_dataset, new_dataset_attributes, new_genome_dataset = dataset_factory.create_dataset(
+                    session=session,
+                    genome_input=genome_uuid,
+                    dataset_source=dataset_source,
+                    dataset_type=dataset_type,
+                    dataset_attributes=dataset_attributes,
+                    name=name,
+                    label=label,
+                    version=version,
+                    status=status,
+                    source_type=source_type,
+                    release=release,
+                    is_current=True
+                )
+
+                # Populate child datasets
+                children = dataset_factory.create_all_child_datasets(
+                    dataset_uuid=dataset_uuid,
+                    session=session,
+                    topic="production_process",
+                    status=status,
+                    release=release
+                )
+                session.commit()
+
+                # Copy the handed-over file into <destination>/<genome_uuid>/
+                source = Path(item["dataset_source"]["name"])
+                dest_dir = Path(destination) / genome_uuid
+                dest_dir.mkdir(parents=True, exist_ok=True)
+                if name == "regulatory_features":
+                    dest_dir = dest_dir / f"regulatory-features{source.suffix}"
+                shutil.copy2(source, dest_dir)
+
+                logger.info(f"Created dataset UUID: {dataset_uuid} with children")
+    except Exception as e:
+        session.rollback()
+        logger.error("An error occurred:")
+        logger.error(e)
 
 
 if __name__ == "__main__":
-    #Expecting JSON input, release id, and connection URI as command-line arguments
-    if len(sys.argv) < 4:
-        print("Usage: python create_datasets.py <json_input> <release_id> <conn_uri>")
-        sys.exit(1)
-
-    json_input = sys.argv[1]
-    release_id = sys.argv[2]
-    conn_uri = sys.argv[3]
-    main(json_input, release_id, conn_uri)
+    parser = argparse.ArgumentParser(
+        description="Create datasets in the metadata database from a handover JSON file."
+    )
+    parser.add_argument(
+        "--release_id",
+        type=str,
+        required=True,
+        help="Release id.",
+    )
+    parser.add_argument(
+        "--metadata_db_uri",
+        type=str,
+        required=True,
+        help="Metadata DB MySQL URI, e.g. mysql://ensro@localhost:3366/ensembl_genome_metadata",
+    )
+    parser.add_argument(
+        "--dataset_type",
+        type=str,
+        required=True,
+        help="Fetch dataset based on dataset type, e.g. genebuild",
+    )
+    parser.add_argument(
+        "--json_file_path",
+        type=str,
+        required=True,
+        help="Path to the JSON file handed over by teams.",
+    )
+    parser.add_argument(
+        "--dest_dir",
+        type=check_directory,
+        required=True,
+        help="Datafiles destination directory.",
+    )
+
+    ARGS = parser.parse_args()
+    logger.info(f"Provided arguments: {ARGS}")
+
+    main(ARGS.json_file_path, ARGS.release_id, ARGS.metadata_db_uri, ARGS.dest_dir)
+
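Reviewer note (not part of the patch): an invocation sketch for the updated script,
with placeholder URI and paths:

    python create_datasets_json.py --release_id 5 \
        --metadata_db_uri mysql://ensro@localhost:3366/ensembl_genome_metadata \
        --dataset_type genebuild --json_file_path handover.json --dest_dir /nfs/dest

In both scripts --dataset_type is parsed but not yet passed through to main().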