Skip to content

Commit

Permalink
Merge pull request #452 from tcezard/EVA3586_add_handlers
Browse files Browse the repository at this point in the history
EVA-3586 - Multiple Fixes from running in production
  • Loading branch information
tcezard committed Jun 7, 2024
2 parents 65809a6 + 849193e commit 6c5f742
Show file tree
Hide file tree
Showing 15 changed files with 144 additions and 115 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import sys

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import logging_config as log_cfg
from ebi_eva_common_pyutils.logger import logging_config as log_cfg, logging_config
from run_release_in_embassy.release_metadata import vcf_validation_output_file_pattern, asm_report_output_file_pattern

logger = log_cfg.get_logger(__name__)
Expand Down Expand Up @@ -70,21 +70,21 @@ def analyze_asm_report_files(asm_report_files):
return exit_code


def analyze_vcf_validation_results(assembly_release_folder, assembly_accession):
    """Analyze the validation outputs of one assembly release and exit the process.

    Collects the VCF validation reports and the assembly-check reports that sit
    directly inside ``assembly_release_folder`` and hands each set to its
    analyzer. The process then terminates via ``sys.exit`` with the resulting
    exit code, so this function never returns to its caller.

    :param assembly_release_folder: folder that directly contains the validation
        output files of the assembly (no per-accession sub-folder is appended).
    :param assembly_accession: not used by this function any more; kept so that
        existing callers passing it keep working.
    """
    vcf_validation_report_files = glob.glob(
        "{0}/{1}".format(assembly_release_folder, vcf_validation_output_file_pattern)
    )
    exit_code = analyze_vcf_validation_files(vcf_validation_report_files)

    asm_report_files = glob.glob(
        "{0}/{1}".format(assembly_release_folder, asm_report_output_file_pattern)
    )
    # ``or`` short-circuits: the assembly reports are only analyzed when the VCF
    # validation analysis returned a falsy (success) exit code.
    exit_code = exit_code or analyze_asm_report_files(asm_report_files)
    sys.exit(exit_code)


@click.option("--assembly-release-folder", required=True)
@click.option("--assembly-accession", required=True)
@click.command()
def main(assembly_release_folder, assembly_accession):
    """Command-line entry point: analyze the validation results of one assembly."""
    # ``log_cfg`` is the alias this module already uses for ``logging_config``
    # (see the module-level ``logger``), so the handler is attached through it.
    log_cfg.add_stdout_handler()
    analyze_vcf_validation_results(assembly_release_folder, assembly_accession)


if __name__ == '__main__':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import traceback

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_internal_pyutils.config_utils import get_mongo_uri_for_eva_profile
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.mongo_utils import copy_db
Expand Down Expand Up @@ -153,6 +154,7 @@ def copy_accessioning_collections_to_embassy(private_config_xml_file, profile, t
@click.command()
def main(private_config_xml_file, profile, taxonomy_id, assembly_accession, collections_to_copy, release_species_inventory_table,
release_version, dump_dir):
logging_config.add_stdout_handler()
copy_accessioning_collections_to_embassy(private_config_xml_file, profile, taxonomy_id, assembly_accession,
collections_to_copy, release_species_inventory_table, release_version,
dump_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,25 @@
import os

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import logging_config

from run_release_in_embassy.release_metadata import release_vcf_file_categories, release_text_file_categories
from run_release_in_embassy.release_common_utils import get_release_vcf_file_name_genbank, get_release_text_file_name


def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, species_release_folder):
release_count_filename = os.path.join(species_release_folder, assembly_accession, "README_rs_ids_counts.txt")
def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder):
release_count_filename = os.path.join(assembly_release_folder, "README_rs_ids_counts.txt")
with open(release_count_filename, "w") as release_count_file_handle:
release_count_file_handle.write("# Unique RS ID counts\n")
for vcf_file_category in release_vcf_file_categories:
release_vcf_file_name = get_release_vcf_file_name_genbank(species_release_folder, taxonomy_id, assembly_accession,
release_vcf_file_name = get_release_vcf_file_name_genbank(assembly_release_folder, taxonomy_id, assembly_accession,
vcf_file_category)
num_ids_in_file = run_command_with_output("Counting RS IDs in file: " + release_vcf_file_name,
"{0} {1}.gz".format(count_ids_script_path, release_vcf_file_name),
return_process_output=True)
release_count_file_handle.write(num_ids_in_file)
for text_release_file_category in release_text_file_categories:
text_release_file_name = get_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession,
text_release_file_name = get_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
text_release_file_category)
num_ids_in_file = run_command_with_output("Counting RS IDs in file: " + text_release_file_name,
"zcat {0}.gz | cut -f1 | uniq | wc -l"
Expand All @@ -44,10 +46,11 @@ def count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_a
@click.option("--count-ids-script-path", help="ex: /path/to/count/ids/script", required=True)
@click.option("--taxonomy-id", help="ex: 9913", required=True)
@click.option("--assembly-accession", help="ex: GCA_000003055.6", required=True)
@click.option("--assembly-release-folder", required=True)
@click.command()
def main(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder):
    """Command-line entry point: count the RS IDs in the release files of one assembly."""
    # Attach a stdout handler before doing any work so log output is visible
    # when the script is run as a standalone command.
    logging_config.add_stdout_handler()
    count_rs_ids_in_release_files(count_ids_script_path, taxonomy_id, assembly_accession, assembly_release_folder)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import os

import click
from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.spring_properties import SpringPropertiesGenerator

Expand All @@ -22,7 +23,7 @@


def get_release_properties_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
release_species_inventory_table, release_version, species_release_folder):
release_species_inventory_table, release_version):
with get_metadata_connection_handle(profile, private_config_xml_file) as metadata_connection_handle:
release_inventory_info_for_assembly = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
release_species_inventory_table,
Expand All @@ -35,13 +36,13 @@ def get_release_properties_for_assembly(private_config_xml_file, profile, taxono

def create_release_properties_file_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
release_species_inventory_table, release_version,
species_release_folder):
assembly_species_release_folder = os.path.join(species_release_folder, assembly_accession)
os.makedirs(assembly_species_release_folder, exist_ok=True)
output_file = "{0}/{1}_release.properties".format(assembly_species_release_folder, assembly_accession)
release_properties = get_release_properties_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
release_species_inventory_table, release_version,
species_release_folder)
assembly_release_folder):
os.makedirs(assembly_release_folder, exist_ok=True)
output_file = "{0}/{1}_release.properties".format(assembly_release_folder, assembly_accession)
release_properties = get_release_properties_for_assembly(
private_config_xml_file, profile, taxonomy_id, assembly_accession, release_species_inventory_table,
release_version
)
properties_string = SpringPropertiesGenerator(profile, private_config_xml_file).get_release_properties(
temp_mongo_db=release_properties['mongo_accessioning_db'],
job_name='ACCESSION_RELEASE_JOB',
Expand All @@ -50,7 +51,7 @@ def create_release_properties_file_for_assembly(private_config_xml_file, profile
fasta=release_properties['fasta_path'],
assembly_report=release_properties['report_path'],
contig_naming='SEQUENCE_NAME',
output_folder=assembly_species_release_folder
output_folder=assembly_release_folder
)
open(output_file, "w").write(properties_string)
return output_file
Expand All @@ -63,13 +64,14 @@ def create_release_properties_file_for_assembly(private_config_xml_file, profile
@click.option("--release-species-inventory-table", default="eva_progress_tracker.clustering_release_tracker",
required=False)
@click.option("--release-version", help="ex: 2", type=int, required=True)
@click.option("--species-release-folder", required=True)
@click.option("--assembly-release-folder", required=True)
@click.command()
def main(private_config_xml_file, profile, taxonomy_id, assembly_accession, release_species_inventory_table,
release_version, species_release_folder):
release_version, assembly_release_folder):
logging_config.add_stdout_handler()
create_release_properties_file_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
release_species_inventory_table, release_version,
species_release_folder)
assembly_release_folder)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import click
import logging

from ebi_eva_common_pyutils.logger import logging_config

from run_release_in_embassy.release_metadata import update_release_progress_status
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
Expand Down Expand Up @@ -42,6 +43,7 @@ def initiate_release_status_for_assembly(private_config_xml_file, profile, relea
@click.command()
def main(private_config_xml_file, profile, release_species_inventory_table, taxonomy_id, assembly_accession,
release_version):
logging_config.add_stdout_handler()
initiate_release_status_for_assembly(private_config_xml_file, profile, release_species_inventory_table,
taxonomy_id, assembly_accession, release_version)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,18 @@
import os

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from run_release_in_embassy.release_metadata import release_vcf_file_categories, release_text_file_categories, \
get_release_inventory_info_for_assembly
from run_release_in_embassy.release_common_utils import get_bgzip_bcftools_index_commands_for_file, \
get_release_vcf_file_name, get_unsorted_release_vcf_file_name, get_unsorted_release_text_file_name


def move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, species_release_folder, vcf_file_category,
def move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, assembly_release_folder, vcf_file_category,
unsorted_release_file_path):
unsorted_release_file_name = os.path.basename(unsorted_release_file_path)
release_file_path = get_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession,
release_file_path = get_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
vcf_file_category)
release_file_name = os.path.basename(release_file_path)
for variant_source in ["eva", "dbsnp"]:
Expand Down Expand Up @@ -86,17 +87,17 @@ def merge_dbsnp_eva_vcf_headers(file1, file2, output_file):


def merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id, assembly_accession,
species_release_folder, vcf_file_category, data_sources):
assembly_release_folder, vcf_file_category, data_sources):
vcf_merge_commands = []
# This is the desired post-merge output file name in the format <assembly>_<category>.vcf
# ex: 60711_GCA_000409795.2_merged_ids.vcf
unsorted_release_file_path = get_unsorted_release_vcf_file_name(species_release_folder, taxonomy_id, assembly_accession,
unsorted_release_file_path = get_unsorted_release_vcf_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
vcf_file_category)
unsorted_release_file_name = os.path.basename(unsorted_release_file_path)
# After release pipeline is run on a species, the default VCF output files are in the formats like below
# ex: eva_GCA_000409795.2_merged_ids.vcf and dbsnp_GCA_000409795.2_merged_ids.vcf
# Move them to files with _unsorted suffix to avoid confusion
move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, species_release_folder, vcf_file_category,
move_release_files_to_unsorted_category(taxonomy_id, assembly_accession, assembly_release_folder, vcf_file_category,
unsorted_release_file_path)
dbsnp_vcf_file_pattern = unsorted_release_file_path.replace(unsorted_release_file_name,
"dbsnp*_" + unsorted_release_file_name.replace(f'{str(taxonomy_id)}_', ''))
Expand Down Expand Up @@ -133,10 +134,10 @@ def merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path, t
return vcf_merge_commands


def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_folder, text_release_file_category,
def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, assembly_release_folder, text_release_file_category,
data_sources):
text_release_file_merge_commands = []
unsorted_release_file_path = get_unsorted_release_text_file_name(species_release_folder, taxonomy_id, assembly_accession,
unsorted_release_file_path = get_unsorted_release_text_file_name(assembly_release_folder, taxonomy_id, assembly_accession,
text_release_file_category)
unsorted_release_file_name = os.path.basename(unsorted_release_file_path)
# After release is run on a species, the default text (i.e., non-vcf) output files have ".unsorted.txt" file suffix
Expand Down Expand Up @@ -173,18 +174,18 @@ def merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_

def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path,
taxonomy_id, assembly_accession, release_species_inventory_table, release_version,
species_release_folder):
assembly_release_folder):
with get_metadata_connection_handle(profile, private_config_xml_file) as metadata_connection_handle:
release_info = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
release_species_inventory_table,
release_version, metadata_connection_handle)
merge_commands = []
for vcf_file_category in release_vcf_file_categories:
merge_commands.extend(merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path,
taxonomy_id, assembly_accession, species_release_folder,
taxonomy_id, assembly_accession, assembly_release_folder,
vcf_file_category, release_info["sources"]))
for text_release_file_category in release_text_file_categories:
merge_commands.extend(merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, species_release_folder,
merge_commands.extend(merge_dbsnp_eva_text_files(taxonomy_id, assembly_accession, assembly_release_folder,
text_release_file_category, release_info["sources"]))
final_merge_command = " && ".join(merge_commands)
run_command_with_output(f"Merging dbSNP and EVA release files for taxonomy {taxonomy_id} and assembly {assembly_accession}",
Expand All @@ -201,13 +202,14 @@ def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path,
@click.option("--release-species-inventory-table", default="eva_progress_tracker.clustering_release_tracker",
required=False)
@click.option("--release-version", help="ex: 2", type=int, required=True)
@click.option("--species-release-folder", required=True)
@click.option("--assembly-release-folder", required=True)
@click.command()
def main(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path, taxonomy_id,
assembly_accession, release_species_inventory_table, release_version, species_release_folder):
assembly_accession, release_species_inventory_table, release_version, assembly_release_folder):
logging_config.add_stdout_handler()
merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path,
taxonomy_id, assembly_accession, release_species_inventory_table, release_version,
species_release_folder)
assembly_release_folder)


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 6c5f742

Please sign in to comment.