From 4360f101b6dc30e285b5aefff39687b346dbbbc4 Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 4 Apr 2023 11:26:13 +0200 Subject: [PATCH 01/34] adding retroseq --- CHANGELOG.md | 8 + containers/retroseq/Dockerfile | 31 +++ definitions/rd_dna_initiation_map.yaml | 2 + definitions/rd_dna_parameters.yaml | 20 ++ documentation/Setup.md | 2 + lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 18 ++ lib/MIP/Constants.pm | 2 +- lib/MIP/Program/Retroseq.pm | 34 ++- lib/MIP/Recipes/Analysis/Retroseq.pm | 288 +++++++++++++++++++++ lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm | 8 +- t/analysis_deepvariant.t | 2 +- t/analysis_retroseq.t | 112 ++++++++ t/data/references/grch37_g1k_alu.bed | 0 t/data/references/grch37_g1k_herv.bed | 0 t/data/references/grch37_g1k_l1.bed | 0 t/data/references/grch37_g1k_sva.bed | 0 t/data/references/grch38_g1k_alu.bed | 0 t/data/references/grch38_g1k_herv.bed | 0 t/data/references/grch38_g1k_l1.bed | 0 t/data/references/grch38_g1k_sva.bed | 0 t/retroseq_call.t | 13 +- t/retroseq_discover.t | 9 +- templates/grch38_mip_rd_dna_config.yaml | 5 + templates/mip_install_config.yaml | 6 +- templates/mip_rd_dna_config.yaml | 5 + 25 files changed, 536 insertions(+), 29 deletions(-) create mode 100644 containers/retroseq/Dockerfile create mode 100644 lib/MIP/Recipes/Analysis/Retroseq.pm create mode 100644 t/analysis_retroseq.t create mode 100644 t/data/references/grch37_g1k_alu.bed create mode 100644 t/data/references/grch37_g1k_herv.bed create mode 100644 t/data/references/grch37_g1k_l1.bed create mode 100644 t/data/references/grch37_g1k_sva.bed create mode 100644 t/data/references/grch38_g1k_alu.bed create mode 100644 t/data/references/grch38_g1k_herv.bed create mode 100644 t/data/references/grch38_g1k_l1.bed create mode 100644 t/data/references/grch38_g1k_sva.bed diff --git a/CHANGELOG.md b/CHANGELOG.md index 51ad79d65..68397452c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,14 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). +## [11.2.0] + +- Adds retroseq for mobile element detection + +### Tools + +- RetroSeq: 9d4f3b5 + ## [11.1.3] - Adds Gens' bed index file to deliverables diff --git a/containers/retroseq/Dockerfile b/containers/retroseq/Dockerfile new file mode 100644 index 000000000..2d9aaac26 --- /dev/null +++ b/containers/retroseq/Dockerfile @@ -0,0 +1,31 @@ +################## BASE IMAGE ###################### + +FROM clinicalgenomics/mip_base:2.1 + +################## METADATA ###################### + +LABEL base_image="clinicalgenomics/mip_base:2.1" +LABEL version="1" +LABEL software="retroseq" +LABEL software.version="1.5_9d4f3b5" +LABEL extra.binaries="retroseq" +LABEL maintainer="Clinical-Genomics/MIP" + +RUN apt-get update --fix-missing && \ + apt-get install -y --no-install-recommends \ + ca-certificates && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +RUN conda install samtools exonerate bedtools bcftools && \ + /opt/conda/bin/conda clean -ya + +RUN git clone https://github.com/tk2/RetroSeq.git /opt/conda/share/RetroSeq + +WORKDIR /opt/conda/share/RetroSeq + +## Remove samtool check +## Make sure we're on the right commit and remove samtools check +RUN git reset --hard 9d4f3b5 && \ + sed -i '/RetroSeq::Utilities::checkBinary( q\[samtools\].*/d' ./bin/retroseq.pl + diff --git a/definitions/rd_dna_initiation_map.yaml b/definitions/rd_dna_initiation_map.yaml index 2dc40b532..29522e336 100644 --- a/definitions/rd_dna_initiation_map.yaml +++ b/definitions/rd_dna_initiation_map.yaml @@ -49,6 +49,8 @@ CHAIN_ALL: - sv_rankvariant - sv_reformat - vcf2cytosure_ar + - CHAIN_MOBILE_ELEMENTS: + - retroseq - CHAIN_MAIN: # PARALLEL chains, which inherit from MAIN in initiation, but are merged back to CHAIN_MAIN after execution - PARALLEL: diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index f6d96dccb..e27f1115b 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -234,6 +234,7 @@ recipe_core_number: picardtools_collectmultiplemetrics: 1 plink: 1 qccollect_ar: 1 + retroseq: 2 rhocall_ar: 13 rhocall_viz: 1 rtg_vcfeval: 36 @@ -301,6 +302,7 @@ recipe_memory: picardtools_collecthsmetrics: 8 picardtools_collectmultiplemetrics: 8 plink: 10 + retroseq: 5 rhocall_ar: 2 rhocall_viz: 5 sambamba_depth: 10 @@ -365,6 +367,7 @@ recipe_time: prepareforvariantannotationblock: 5 qccollect_ar: 1 rankvariant: 10 + retroseq: 5 rhocall_ar: 5 rhocall_viz: 1 rtg_vcfeval: 1 @@ -1250,6 +1253,23 @@ sv_reformat_remove_genes_file: mandatory: no reference: reference_dir type: path +## Mobile elenment chain +retroseq: + analysis_mode: sample + associated_recipe: + - mip + data_type: SCALAR + default: 1 + file_tag: _me + outfile_suffix: ".vcf" + type: recipe +mobile_element_reference: + associated_recipe: + - retroseq + data_type: HASH + is_reference: 1 + reference: reference_dir + type: path ## GATK CollectReadCounts gatk_collectreadcounts: analysis_mode: sample diff --git a/documentation/Setup.md b/documentation/Setup.md index cc7fbc51a..41251a7ac 100644 --- a/documentation/Setup.md +++ b/documentation/Setup.md @@ -64,6 +64,7 @@ You can speed up, for instance, the Readonly module by also installing the compa - [PicardTools] (version: 2.27.2) - [PLINK] (version: 1.90b3x35) - [Preseq] (version: 3.1.2) +- [RetroSeq] (version: 9d4f3b5) - [Rhocall] (version: 0.5.1) - [RSeQC] (version: 4.0.0) - [rtg-tools] (version: 3.12) @@ -195,6 +196,7 @@ Corresponding MIP references: [PicardTools]: http://broadinstitute.github.io/picard/ [PLINK]: https://www.cog-genomics.org/plink2 [Preseq]: http://smithlabresearch.org/software/preseq/ +[RetroSeq]: https://github.com/tk2/RetroSeq [Rhocall]: https://github.com/dnil/rhocall [RSeQC]: http://rseqc.sourceforge.net/ [rtg-tools]: https://github.com/RealTimeGenomics/rtg-tools diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index a692abace..bad85cdc8 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1156,6 +1156,24 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ) ); + option( + q{retroseq} => ( + cmd_tags => [q{Analysis recipe switch}], + documentation => q{Discover mobile elements using RetroSeq}, + is => q{rw}, + isa => enum( [ 0, 1, 2 ] ), + ) + ); + + option( + q{mobile_element_reference} => ( + cmd_tags => [q{file.vcf=TE_type}], + documentation => q{Database file(s) for mobile element iscovery}, + is => q{rw}, + isa => HashRef, + ) + ); + option( q{gatk_haplotypecaller} => ( cmd_tags => [q{Analysis recipe switch}], diff --git a/lib/MIP/Constants.pm b/lib/MIP/Constants.pm index 66ad565f4..a03ea8d72 100644 --- a/lib/MIP/Constants.pm +++ b/lib/MIP/Constants.pm @@ -82,7 +82,7 @@ Readonly our %ANALYSIS => ( ); ## Set MIP version -Readonly our $MIP_VERSION => q{11.1.3}; +Readonly our $MIP_VERSION => q{11.2.0}; ## Cli Readonly our $MOOSEX_APP_SCEEN_WIDTH => 160; diff --git a/lib/MIP/Program/Retroseq.pm b/lib/MIP/Program/Retroseq.pm index e3b90b8ef..9b77e1d6f 100644 --- a/lib/MIP/Program/Retroseq.pm +++ b/lib/MIP/Program/Retroseq.pm @@ -34,9 +34,10 @@ sub retroseq_call { ## Function : Perl wrapper for Retroseq version 1.5 mobile element detector ## Returns : @commands -## Arguments: $filehandle => Filehandle to write to +## Arguments: $call_heterozygous => Call heterogygous insertions +## : $filehandle => Filehandle to write to ## : $infile_path => Path to input bam file -## : $outputfile_path => Path to the output file +## : $outfile_path => Path to the output file ## : $reference_fasta_path => Reference genome path ## : $retroseq_bed_path => Path to the retroseq discover bedfile ## : $stderrfile_path => Stderrfile path @@ -46,9 +47,10 @@ sub retroseq_call { my ($arg_href) = @_; ## Flatten argument(s) + my $call_heterozygous; my $filehandle; my $infile_path; - my $outputfile_path; + my $outfile_path; my $reference_fasta_path; my $retroseq_bed_path; my $stderrfile_path; @@ -56,6 +58,11 @@ sub retroseq_call { my $stdoutfile_path; my $tmpl = { + call_heterozygous => { + allow => [ undef, 0, 1 ], + store => \$call_heterozygous, + strict_type => 1, + }, filehandle => { store => \$filehandle, }, @@ -65,10 +72,10 @@ sub retroseq_call { store => \$infile_path, strict_type => 1, }, - outputfile_path => { + outfile_path => { defined => 1, required => 1, - store => \$outputfile_path, + store => \$outfile_path, strict_type => 1, }, reference_fasta_path => { @@ -103,13 +110,18 @@ sub retroseq_call { push @commands, q{-call -soft}; + if ($call_heterozygous) { + + push @commands, q{-hets}; + } + push @commands, q{-bam} . $SPACE . $infile_path; push @commands, q{-input} . $SPACE . $retroseq_bed_path; push @commands, q{-ref} . $SPACE . $reference_fasta_path; - push @commands, q{-output} . $SPACE . $outputfile_path; + push @commands, q{-output} . $SPACE . $outfile_path; push @commands, unix_standard_streams( @@ -138,7 +150,7 @@ sub retroseq_discover { ## Arguments: $filehandle => Filehandle to write to ## : $infile_path => Path to input bam file ## : $mobile_element_tsv_path => Tab separated file containing the name and path of mobile elements -## : $outputfile_path => path to the output file +## : $outfile_path => path to the output file ## : $stderrfile_path => Stderrfile path ## : $stderrfile_path_append => Append stderr info to file path ## : $stdoutfile_path => Stdoutfile path @@ -149,7 +161,7 @@ sub retroseq_discover { my $filehandle; my $infile_path; my $mobile_element_tsv_path; - my $outputfile_path; + my $outfile_path; my $stderrfile_path; my $stderrfile_path_append; my $stdoutfile_path; @@ -170,10 +182,10 @@ sub retroseq_discover { store => \$mobile_element_tsv_path, strict_type => 1, }, - outputfile_path => { + outfile_path => { defined => 1, required => 1, - store => \$outputfile_path, + store => \$outfile_path, strict_type => 1, }, stderrfile_path => { @@ -200,7 +212,7 @@ sub retroseq_discover { push @commands, q{-refTEs} . $SPACE . $mobile_element_tsv_path; - push @commands, q{-output} . $SPACE . $outputfile_path; + push @commands, q{-output} . $SPACE . $outfile_path; push @commands, unix_standard_streams( diff --git a/lib/MIP/Recipes/Analysis/Retroseq.pm b/lib/MIP/Recipes/Analysis/Retroseq.pm new file mode 100644 index 000000000..6361fe243 --- /dev/null +++ b/lib/MIP/Recipes/Analysis/Retroseq.pm @@ -0,0 +1,288 @@ +package MIP::Recipes::Analysis::Retroseq; + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Spec::Functions qw{ catdir catfile }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use utf8; +use warnings; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw{ :all }; +use Readonly; + +## MIPs lib/ +use MIP::Constants qw{ $CONTAINER_MANAGER $EQUALS $LOG_NAME $NEWLINE $SPACE $UNDERSCORE }; + +BEGIN { + + require Exporter; + use base qw{ Exporter }; + + # Functions and variables which can be optionally exported + our @EXPORT_OK = qw{ analysis_retroseq }; + +} + +sub analysis_retroseq { + +## Function : Discover and call mobile elememnts using RetroSeq. +## Returns : +## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} +## : $case_id => Family id +## : $file_info_href => File_info hash {REF} +## : $job_id_href => Job id hash {REF} +## : $parameter_href => Parameter hash {REF} +## : $profile_base_command => Submission profile base command +## : $recipe_name => Recipe name +## : $sample_id => Sample id +## : $sample_info_href => Info on samples and case hash {REF} + + my ($arg_href) = @_; + + ## Flatten argument(s) + my $active_parameter_href; + my $file_info_href; + my $job_id_href; + my $parameter_href; + my $recipe_name; + my $sample_id; + my $sample_info_href; + + ## Default(s) + my $case_id; + my $profile_base_command; + + my $tmpl = { + active_parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$active_parameter_href, + strict_type => 1, + }, + case_id => { + default => $arg_href->{active_parameter_href}{case_id}, + store => \$case_id, + strict_type => 1, + }, + file_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$file_info_href, + strict_type => 1, + }, + job_id_href => { + default => {}, + defined => 1, + required => 1, + store => \$job_id_href, + strict_type => 1, + }, + parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$parameter_href, + strict_type => 1, + }, + profile_base_command => { + default => q{sbatch}, + store => \$profile_base_command, + strict_type => 1, + }, + recipe_name => { + defined => 1, + required => 1, + store => \$recipe_name, + strict_type => 1, + }, + sample_id => { + defined => 1, + required => 1, + store => \$sample_id, + strict_type => 1, + }, + sample_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$sample_info_href, + strict_type => 1, + }, + }; + + check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; + + use MIP::File_info qw{ get_io_files parse_io_outfiles }; + use MIP::Processmanagement::Processes qw{ submit_recipe }; + use MIP::Program::Gnu::Coreutils qw{ gnu_echo }; + use MIP::Program::Retroseq qw{ retroseq_call retroseq_discover }; + use MIP::Recipe qw{ parse_recipe_prerequisites }; + use MIP::Sample_info qw{ set_recipe_metafile_in_sample_info set_recipe_outfile_in_sample_info }; + use MIP::Script::Setup_script qw{ setup_script }; + + ### PREPROCESSING: + + ## Retrieve logger object + my $log = Log::Log4perl->get_logger($LOG_NAME); + + ## Unpack parameters + ## Get the io infiles per chain and id + my %io = get_io_files( + { + id => $sample_id, + file_info_href => $file_info_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + stream => q{in}, + } + ); + + my $infile_name_prefix = $io{in}{file_name_prefix}; + my $infile_path_prefix = $io{in}{file_path_prefix}; + my $infile_suffix = $io{in}{file_suffix}; + my $infile_path = $infile_path_prefix . $infile_suffix; + + my $analysis_type = $active_parameter_href->{analysis_type}{$sample_id}; + + my %recipe = parse_recipe_prerequisites( + { + active_parameter_href => $active_parameter_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ); + + %io = ( + %io, + parse_io_outfiles( + { + chain_id => $recipe{job_id_chain}, + id => $sample_id, + file_info_href => $file_info_href, + file_name_prefixes_ref => [$infile_name_prefix], + outdata_dir => $active_parameter_href->{outdata_dir}, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ) + ); + + my $outfile_name_prefix = $io{out}{file_name_prefix}; + my $outdir_path = $io{out}{dir_path}; + my $outfile_path = $io{out}{file_path}; + my $outfile_path_prefix = $io{out}{file_path_prefix}; + + ## Filehandles + # Create anonymous filehandle + my $filehandle = IO::Handle->new(); + + ## Creates recipe directories (info & data & script), recipe script filenames and writes sbatch header + my ( $recipe_file_path, $recipe_info_path ) = setup_script( + { + active_parameter_href => $active_parameter_href, + core_number => $recipe{core_number}, + directory_id => $sample_id, + filehandle => $filehandle, + job_id_href => $job_id_href, + memory_allocation => $recipe{memory}, + process_time => $recipe{time}, + recipe_directory => $recipe_name, + recipe_name => $recipe_name, + } + ); + + ### SHELL: + + say {$filehandle} q{## } . $recipe_name; + + say {$filehandle} q{## Prepare mobile element references}; + + my @mobile_element_references; + while ( my ( $me_reference_path, $me_type ) = + each %{ $active_parameter_href->{mobile_element_reference} } ) + { + + push @mobile_element_references, $me_type . q{\t} . $me_reference_path . q{\n}; + } + + my $mobile_element_reference_file = catfile( $outdir_path, q{mobile_element_references.tsv} ); + gnu_echo( + { + enable_interpretation => 1, + filehandle => $filehandle, + no_trailing_newline => 1, + outfile_path => $mobile_element_reference_file, + strings_ref => \@mobile_element_references, + } + ); + say ${filehandle} $NEWLINE; + + say {$filehandle} q{## Discover mobile elements}; + my $discover_outfile_path = $outfile_path_prefix . q{.bed}; + retroseq_discover( + { + filehandle => $filehandle, + infile_path => $infile_path, + mobile_element_tsv_path => $mobile_element_reference_file, + outfile_path => $discover_outfile_path, + } + ); + say {$filehandle} $NEWLINE; + + say {$filehandle} q{## Call mobile elements}; + retroseq_call( + { + call_heterozygous => 1, + filehandle => $filehandle, + infile_path => $infile_path, + retroseq_bed_path => $discover_outfile_path, + outfile_path => $outfile_path, + reference_fasta_path => $active_parameter_href->{human_genome_reference}, + } + ); + say {$filehandle} $NEWLINE; + + ## Close filehandles + close $filehandle or $log->logcroak(q{Could not close filehandle}); + + if ( $recipe{mode} == 1 ) { + + ## Collect QC metadata info for later use + set_recipe_outfile_in_sample_info( + { + infile => $outfile_name_prefix, + path => $outfile_path, + recipe_name => $recipe_name, + sample_id => $sample_id, + sample_info_href => $sample_info_href, + } + ); + + submit_recipe( + { + base_command => $profile_base_command, + case_id => $case_id, + dependency_method => q{sample_to_sample}, + job_id_chain => $recipe{job_id_chain}, + job_id_href => $job_id_href, + job_reservation_name => $active_parameter_href->{job_reservation_name}, + log => $log, + max_parallel_processes_count_href => + $file_info_href->{max_parallel_processes_count}, + recipe_file_path => $recipe_file_path, + sample_id => $sample_id, + submission_profile => $active_parameter_href->{submission_profile}, + } + ); + } + return 1; +} + +1; diff --git a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm index 680f20aab..9df2fb9d5 100644 --- a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm +++ b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm @@ -128,9 +128,9 @@ sub parse_rd_dna { ## Constants Readonly my @MIP_VEP_PLUGINS => qw{ sv_vep_plugin vep_plugin }; Readonly my @ONLY_WGS_VARIANT_CALLER_RECIPES => qw{ cnvnator_ar delly_reformat tiddit }; - Readonly my @ONLY_WGS_RECIPIES => + Readonly my @ONLY_WGS_RECIPIES => qw{ chromograph_rhoviz cnvnator_ar delly_call delly_reformat expansionhunter - gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata mitodel + gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata mitodel retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; Readonly my @REMOVE_CONFIG_KEYS => qw{ associated_recipe }; @@ -491,6 +491,7 @@ sub pipeline_analyse_rd_dna { qw{ analysis_prepareforvariantannotationblock }; use MIP::Recipes::Analysis::Rankvariant qw{ analysis_rankvariant analysis_rankvariant_unaffected analysis_rankvariant_sv analysis_rankvariant_sv_unaffected }; + use MIP::Recipes::Analysis::Retroseq qw{ analysis_retroseq }; use MIP::Recipes::Analysis::Rhocall qw{ analysis_rhocall_annotate analysis_rhocall_viz }; use MIP::Recipes::Analysis::Rtg_vcfeval qw{ analysis_rtg_vcfeval }; use MIP::Recipes::Analysis::Sacct qw{ analysis_sacct }; @@ -581,7 +582,7 @@ sub pipeline_analyse_rd_dna { gatk_variantevalall => \&analysis_gatk_variantevalall, gatk_variantevalexome => \&analysis_gatk_variantevalexome, gatk_variantrecalibration => undef, # Depends on analysis type and/or number of samples - gens_generatedata => \&analysis_gens_generatedata, + gens_generatedata => \&analysis_gens_generatedata, glnexus_merge => \&analysis_glnexus, gzip_fastq => \&analysis_gzip_fastq, manta => \&analysis_manta, @@ -596,6 +597,7 @@ sub pipeline_analyse_rd_dna { prepareforvariantannotationblock => \&analysis_prepareforvariantannotationblock, qccollect_ar => \&analysis_mip_qccollect, rankvariant => undef, # Depends on sample features + retroseq => \&analysis_retroseq, rhocall_ar => \&analysis_rhocall_annotate, rhocall_viz => \&analysis_rhocall_viz, rtg_vcfeval => \&analysis_rtg_vcfeval, diff --git a/t/analysis_deepvariant.t b/t/analysis_deepvariant.t index 0815d3539..e1d4c06db 100644 --- a/t/analysis_deepvariant.t +++ b/t/analysis_deepvariant.t @@ -64,7 +64,7 @@ $active_parameter{$recipe_name} = 1; $active_parameter{recipe_core_number}{$recipe_name} = 1; $active_parameter{recipe_gpu_number}{$recipe_name} = 1; $active_parameter{recipe_time}{$recipe_name} = 1; -$active_parameter{container_manager} = q{sinularity}; +$active_parameter{container_manager} = q{singularity}; my $sample_id = $active_parameter{sample_ids}[0]; set_container_constants( diff --git a/t/analysis_retroseq.t b/t/analysis_retroseq.t new file mode 100644 index 000000000..cd7020710 --- /dev/null +++ b/t/analysis_retroseq.t @@ -0,0 +1,112 @@ +#!/usr/bin/env perl + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile }; +use FindBin qw{ $Bin }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use Test::More; +use utf8; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw { :all }; +use Modern::Perl qw{ 2018 }; +use Test::Trap; + +## MIPs lib/ +use lib catdir( dirname($Bin), q{lib} ); +use MIP::Constants qw{ $COLON $COMMA $SPACE }; +use MIP::Test::Fixtures qw{ test_add_io_for_recipe test_log test_mip_hashes }; + +BEGIN { + + use MIP::Test::Fixtures qw{ test_import }; + +### Check all internal dependency modules and imports +## Modules with import + my %perl_module = ( + q{MIP::Recipes::Analysis::Retroseq} => [qw{ analysis_retroseq }], + q{MIP::Test::Fixtures} => [qw{ test_add_io_for_recipe test_log test_mip_hashes }], + ); + + test_import( { perl_module_href => \%perl_module, } ); +} + +use MIP::Recipes::Analysis::Retroseq qw{ analysis_retroseq }; + +diag( q{Test analysis_retroseq from Retroseq.pm} + . $COMMA + . $SPACE . q{Perl} + . $SPACE + . $PERL_VERSION + . $SPACE + . $EXECUTABLE_NAME ); + +test_log( { log_name => q{MIP}, no_screen => 1, } ); + +## Given analysis parameters +my $recipe_name = q{retroseq}; +my $slurm_mock_cmd = catfile( $Bin, qw{ data modules slurm-mock.pl } ); + +my %active_parameter = test_mip_hashes( + { + mip_hash_name => q{active_parameter}, + recipe_name => $recipe_name, + } +); + +$active_parameter{$recipe_name} = 1; +$active_parameter{recipe_core_number}{$recipe_name} = 1; +$active_parameter{recipe_time}{$recipe_name} = 1; +$active_parameter{mobile_element_reference} = { catfile(qw{ path to herv_file }) => q{HERV} }; +my $sample_id = $active_parameter{sample_ids}[0]; + +my %file_info = test_mip_hashes( + { + mip_hash_name => q{file_info}, + recipe_name => $recipe_name, + } +); + +my %job_id; +my %parameter = test_mip_hashes( + { + mip_hash_name => q{recipe_parameter}, + recipe_name => $recipe_name, + } +); + +test_add_io_for_recipe( + { + file_info_href => \%file_info, + id => $sample_id, + parameter_href => \%parameter, + recipe_name => $recipe_name, + step => q{bam}, + } +); + +my %sample_info; + +my $is_ok = analysis_retroseq( + { + active_parameter_href => \%active_parameter, + file_info_href => \%file_info, + job_id_href => \%job_id, + parameter_href => \%parameter, + profile_base_command => $slurm_mock_cmd, + recipe_name => $recipe_name, + sample_id => $sample_id, + sample_info_href => \%sample_info, + } +); + +## Then return TRUE +ok( $is_ok, q{ Executed analysis recipe } . $recipe_name ); + +done_testing(); diff --git a/t/data/references/grch37_g1k_alu.bed b/t/data/references/grch37_g1k_alu.bed new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_g1k_herv.bed b/t/data/references/grch37_g1k_herv.bed new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_g1k_l1.bed b/t/data/references/grch37_g1k_l1.bed new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_g1k_sva.bed b/t/data/references/grch37_g1k_sva.bed new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_g1k_alu.bed b/t/data/references/grch38_g1k_alu.bed new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_g1k_herv.bed b/t/data/references/grch38_g1k_herv.bed new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_g1k_l1.bed b/t/data/references/grch38_g1k_l1.bed new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_g1k_sva.bed b/t/data/references/grch38_g1k_sva.bed new file mode 100644 index 000000000..e69de29bb diff --git a/t/retroseq_call.t b/t/retroseq_call.t index c3f8c568b..8bd40a444 100644 --- a/t/retroseq_call.t +++ b/t/retroseq_call.t @@ -23,16 +23,13 @@ use lib catdir( dirname($Bin), q{lib} ); use MIP::Constants qw{ $COMMA $SPACE }; use MIP::Test::Commands qw{ test_function }; - BEGIN { use MIP::Test::Fixtures qw{ test_import }; ### Check all internal dependency modules and imports ## Modules with import - my %perl_module = ( - q{MIP::Program::Retroseq} => [qw{ retroseq_call }], -); + my %perl_module = ( q{MIP::Program::Retroseq} => [qw{ retroseq_call }], ); test_import( { perl_module_href => \%perl_module, } ); } @@ -86,7 +83,7 @@ my %required_argument = ( input => q{reference.fa}, expected_output => q{-ref} . $SPACE . q{reference.fa}, }, - outputfile_path => { + outfile_path => { input => q{HappyKangaroo.vcf}, expected_output => q{-output} . $SPACE . q{HappyKangaroo.vcf}, }, @@ -94,6 +91,10 @@ my %required_argument = ( ); my %specific_argument = ( + call_heterozygous => { + input => 1, + expected_output => q{-hets}, + }, infile_path => { input => q{infile_path}, expected_output => q{-bam infile_path}, @@ -106,7 +107,7 @@ my %specific_argument = ( input => q{reference.fa}, expected_output => q{-ref} . $SPACE . q{reference.fa}, }, - outputfile_path => { + outfile_path => { input => q{HappyKangaroo.vcf}, expected_output => q{-output} . $SPACE . q{HappyKangaroo.vcf}, }, diff --git a/t/retroseq_discover.t b/t/retroseq_discover.t index 14bb21c21..6be55d5bb 100644 --- a/t/retroseq_discover.t +++ b/t/retroseq_discover.t @@ -23,16 +23,13 @@ use lib catdir( dirname($Bin), q{lib} ); use MIP::Constants qw{ $COMMA $SPACE }; use MIP::Test::Commands qw{ test_function }; - BEGIN { use MIP::Test::Fixtures qw{ test_import }; ### Check all internal dependency modules and imports ## Modules with import - my %perl_module = ( - q{MIP::Program::Retroseq} => [qw{ retroseq_discover }], -); + my %perl_module = ( q{MIP::Program::Retroseq} => [qw{ retroseq_discover }], ); test_import( { perl_module_href => \%perl_module, } ); } @@ -82,7 +79,7 @@ my %required_argument = ( input => q{input.tab}, expected_output => q{-refTEs} . $SPACE . q{input.tab}, }, - outputfile_path => { + outfile_path => { input => q{HappyKangaroo.bed}, expected_output => q{-output} . $SPACE . q{HappyKangaroo.bed}, }, @@ -98,7 +95,7 @@ my %specific_argument = ( input => q{input.tab}, expected_output => q{-refTEs} . $SPACE . q{input.tab}, }, - outputfile_path => { + outfile_path => { input => q{HappyKangaroo.bed}, expected_output => q{-output} . $SPACE . q{HappyKangaroo.bed}, }, diff --git a/templates/grch38_mip_rd_dna_config.yaml b/templates/grch38_mip_rd_dna_config.yaml index 3e3f26509..ae0141eab 100755 --- a/templates/grch38_mip_rd_dna_config.yaml +++ b/templates/grch38_mip_rd_dna_config.yaml @@ -40,6 +40,11 @@ gatk_varianteval_dbsnp: grch38_variant_-gold_standard_dbsnp-.vcf.gz gatk_varianteval_gold: grch38_mills_and_1000g_-gold_standard_indels-.vcf.gz genmod_models_reduced_penetrance_file: grch38_cust003-cmms-red-pen_-2017_FAKE-.tsv human_genome_reference: grch38_homo_sapiens_-assembly-.fasta +mobile_element_reference: + grch38_g1k_alu.bed: ALU + grch38_g1k_l1.bed: L1 + grch38_g1k_herv.bed: HERV + grch38_g1k_sva.bed: SVA nist_call_set_bed: 3.3.2: NA12878: grch38_nist_hg001_-na12878_v3.3.2-.bed diff --git a/templates/mip_install_config.yaml b/templates/mip_install_config.yaml index 1dcd2e2ef..fc4ffff6c 100644 --- a/templates/mip_install_config.yaml +++ b/templates/mip_install_config.yaml @@ -125,7 +125,7 @@ container: mip: executable: mip: - uri: docker.io/clinicalgenomics/mip:v11.1.3 + uri: docker.io/clinicalgenomics/mip:v11.2.0 multiqc: executable: multiqc: @@ -154,6 +154,10 @@ container: executable: preseq: uri: docker.io/clinicalgenomics/preseq:3.1.2 + retroseq: + executable: + retroseq: "perl /opt/conda/share/RetroSeq/bin/retroseq.pl" + uri: docker.io/clinicalgenomics/retroseq:1.5_9d4f3b5 rhocall: executable: rhocall: diff --git a/templates/mip_rd_dna_config.yaml b/templates/mip_rd_dna_config.yaml index 988a8d8ae..db3d8120e 100755 --- a/templates/mip_rd_dna_config.yaml +++ b/templates/mip_rd_dna_config.yaml @@ -23,6 +23,11 @@ sample_info_file: cluster_constant_path!/case_id!/analysis_constant_path!/case_i gatk_genotypegvcfs_ref_gvcf: grch37_gatk_merged_reference_samples.txt genmod_models_reduced_penetrance_file: grch37_cust003-cmms-red-pen_-2017-.tsv human_genome_reference: grch37_homo_sapiens_-d5-.fasta +mobile_element_reference: + grch37_g1k_alu.bed: ALU + grch37_g1k_l1.bed: L1 + grch37_g1k_herv.bed: HERV + grch37_g1k_sva.bed: SVA rank_model_file: rank_model_-v1.34-.ini sambamba_depth_bed: grch37_scout_exons_-2017-01-.bed sv_vcfanno_config: grch37_sv_vcfanno_config_-v1.4-.toml From 56a1e4227dd087df1869f7276da839d42c0e3e11 Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 4 Apr 2023 13:55:01 +0200 Subject: [PATCH 02/34] removes dead option from retroseq command --- containers/retroseq/Dockerfile | 1 - lib/MIP/Program/Retroseq.pm | 14 +------------- lib/MIP/Recipes/Analysis/Retroseq.pm | 1 - t/retroseq_call.t | 4 ---- 4 files changed, 1 insertion(+), 19 deletions(-) diff --git a/containers/retroseq/Dockerfile b/containers/retroseq/Dockerfile index 2d9aaac26..8f192f47f 100644 --- a/containers/retroseq/Dockerfile +++ b/containers/retroseq/Dockerfile @@ -28,4 +28,3 @@ WORKDIR /opt/conda/share/RetroSeq ## Make sure we're on the right commit and remove samtools check RUN git reset --hard 9d4f3b5 && \ sed -i '/RetroSeq::Utilities::checkBinary( q\[samtools\].*/d' ./bin/retroseq.pl - diff --git a/lib/MIP/Program/Retroseq.pm b/lib/MIP/Program/Retroseq.pm index 9b77e1d6f..41403d48c 100644 --- a/lib/MIP/Program/Retroseq.pm +++ b/lib/MIP/Program/Retroseq.pm @@ -34,8 +34,7 @@ sub retroseq_call { ## Function : Perl wrapper for Retroseq version 1.5 mobile element detector ## Returns : @commands -## Arguments: $call_heterozygous => Call heterogygous insertions -## : $filehandle => Filehandle to write to +## Arguments: $filehandle => Filehandle to write to ## : $infile_path => Path to input bam file ## : $outfile_path => Path to the output file ## : $reference_fasta_path => Reference genome path @@ -47,7 +46,6 @@ sub retroseq_call { my ($arg_href) = @_; ## Flatten argument(s) - my $call_heterozygous; my $filehandle; my $infile_path; my $outfile_path; @@ -58,11 +56,6 @@ sub retroseq_call { my $stdoutfile_path; my $tmpl = { - call_heterozygous => { - allow => [ undef, 0, 1 ], - store => \$call_heterozygous, - strict_type => 1, - }, filehandle => { store => \$filehandle, }, @@ -110,11 +103,6 @@ sub retroseq_call { push @commands, q{-call -soft}; - if ($call_heterozygous) { - - push @commands, q{-hets}; - } - push @commands, q{-bam} . $SPACE . $infile_path; push @commands, q{-input} . $SPACE . $retroseq_bed_path; diff --git a/lib/MIP/Recipes/Analysis/Retroseq.pm b/lib/MIP/Recipes/Analysis/Retroseq.pm index 6361fe243..4f6f540d0 100644 --- a/lib/MIP/Recipes/Analysis/Retroseq.pm +++ b/lib/MIP/Recipes/Analysis/Retroseq.pm @@ -239,7 +239,6 @@ sub analysis_retroseq { say {$filehandle} q{## Call mobile elements}; retroseq_call( { - call_heterozygous => 1, filehandle => $filehandle, infile_path => $infile_path, retroseq_bed_path => $discover_outfile_path, diff --git a/t/retroseq_call.t b/t/retroseq_call.t index 8bd40a444..46edaf52c 100644 --- a/t/retroseq_call.t +++ b/t/retroseq_call.t @@ -91,10 +91,6 @@ my %required_argument = ( ); my %specific_argument = ( - call_heterozygous => { - input => 1, - expected_output => q{-hets}, - }, infile_path => { input => q{infile_path}, expected_output => q{-bam infile_path}, From bc32e3a3dfcdcafe2b7924b7a1abac80d15a30c8 Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 4 Apr 2023 16:09:12 +0200 Subject: [PATCH 03/34] sorting and indexing retroseq output --- definitions/rd_dna_parameters.yaml | 6 +++- lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 2 +- lib/MIP/Recipes/Analysis/Retroseq.pm | 53 +++++++++++++++++++++++++--- templates/mip_install_config.yaml | 2 +- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index e27f1115b..b5f5615be 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -1261,7 +1261,11 @@ retroseq: data_type: SCALAR default: 1 file_tag: _me - outfile_suffix: ".vcf" + outfile_suffix: ".vcf.gz" + program_executables: + - retroseq.pl + - tabix + - bcftools type: recipe mobile_element_reference: associated_recipe: diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index bad85cdc8..7590cdb5b 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1157,7 +1157,7 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ); option( - q{retroseq} => ( + q{me_retroseq} => ( cmd_tags => [q{Analysis recipe switch}], documentation => q{Discover mobile elements using RetroSeq}, is => q{rw}, diff --git a/lib/MIP/Recipes/Analysis/Retroseq.pm b/lib/MIP/Recipes/Analysis/Retroseq.pm index 4f6f540d0..b81b32459 100644 --- a/lib/MIP/Recipes/Analysis/Retroseq.pm +++ b/lib/MIP/Recipes/Analysis/Retroseq.pm @@ -16,7 +16,7 @@ use autodie qw{ :all }; use Readonly; ## MIPs lib/ -use MIP::Constants qw{ $CONTAINER_MANAGER $EQUALS $LOG_NAME $NEWLINE $SPACE $UNDERSCORE }; +use MIP::Constants qw{ $DOT $LOG_NAME $NEWLINE }; BEGIN { @@ -41,6 +41,7 @@ sub analysis_retroseq { ## : $recipe_name => Recipe name ## : $sample_id => Sample id ## : $sample_info_href => Info on samples and case hash {REF} +## : $temp_directory => Temporary directory my ($arg_href) = @_; @@ -56,6 +57,7 @@ sub analysis_retroseq { ## Default(s) my $case_id; my $profile_base_command; + my $temp_directory; my $tmpl = { active_parameter_href => { @@ -115,20 +117,25 @@ sub analysis_retroseq { store => \$sample_info_href, strict_type => 1, }, + temp_directory => { + default => $arg_href->{active_parameter_href}{temp_directory}, + store => \$temp_directory, + strict_type => 1, + }, }; check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; use MIP::File_info qw{ get_io_files parse_io_outfiles }; use MIP::Processmanagement::Processes qw{ submit_recipe }; + use MIP::Program::Bcftools qw{ bcftools_sort }; use MIP::Program::Gnu::Coreutils qw{ gnu_echo }; + use MIP::Program::Htslib qw{ htslib_tabix }; use MIP::Program::Retroseq qw{ retroseq_call retroseq_discover }; use MIP::Recipe qw{ parse_recipe_prerequisites }; - use MIP::Sample_info qw{ set_recipe_metafile_in_sample_info set_recipe_outfile_in_sample_info }; + use MIP::Sample_info qw{ set_file_path_to_store set_recipe_outfile_in_sample_info }; use MIP::Script::Setup_script qw{ setup_script }; - ### PREPROCESSING: - ## Retrieve logger object my $log = Log::Log4perl->get_logger($LOG_NAME); @@ -141,6 +148,7 @@ sub analysis_retroseq { parameter_href => $parameter_href, recipe_name => $recipe_name, stream => q{in}, + temp_directory => $temp_directory, } ); @@ -170,6 +178,7 @@ sub analysis_retroseq { outdata_dir => $active_parameter_href->{outdata_dir}, parameter_href => $parameter_href, recipe_name => $recipe_name, + temp_directory => $temp_directory, } ) ); @@ -237,17 +246,40 @@ sub analysis_retroseq { say {$filehandle} $NEWLINE; say {$filehandle} q{## Call mobile elements}; + my $call_outfile_path = $outfile_path_prefix . q{_call.vcf}; retroseq_call( { filehandle => $filehandle, infile_path => $infile_path, retroseq_bed_path => $discover_outfile_path, - outfile_path => $outfile_path, + outfile_path => $call_outfile_path, reference_fasta_path => $active_parameter_href->{human_genome_reference}, } ); say {$filehandle} $NEWLINE; + my $sort_memory = $recipe{memory} - 2; + bcftools_sort( + { + filehandle => $filehandle, + infile_path => $call_outfile_path, + max_mem => $sort_memory . q{G}, + outfile_path => $outfile_path, + output_type => q{z}, + temp_directory => catdir( $temp_directory, q{bcftools_sort} ), + } + ); + say {$filehandle} $NEWLINE; + + htslib_tabix( + { + filehandle => $filehandle, + infile_path => $outfile_path, + preset => q{vcf}, + } + ); + say {$filehandle} $NEWLINE; + ## Close filehandles close $filehandle or $log->logcroak(q{Could not close filehandle}); @@ -264,6 +296,17 @@ sub analysis_retroseq { } ); + set_file_path_to_store( + { + format => q{vcf}, + id => $sample_id, + path => $outfile_path, + path_index => $outfile_path . $DOT . q{tbi}, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + } + ); + submit_recipe( { base_command => $profile_base_command, diff --git a/templates/mip_install_config.yaml b/templates/mip_install_config.yaml index fc4ffff6c..ff218d152 100644 --- a/templates/mip_install_config.yaml +++ b/templates/mip_install_config.yaml @@ -156,7 +156,7 @@ container: uri: docker.io/clinicalgenomics/preseq:3.1.2 retroseq: executable: - retroseq: "perl /opt/conda/share/RetroSeq/bin/retroseq.pl" + retroseq.pl: "perl /opt/conda/share/RetroSeq/bin/retroseq.pl" uri: docker.io/clinicalgenomics/retroseq:1.5_9d4f3b5 rhocall: executable: From 7b53ad0b100e26b2ac5d989e410ce31282c0a691 Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 4 Apr 2023 17:29:07 +0200 Subject: [PATCH 04/34] update vcf header prior to sorting --- definitions/rd_dna_parameters.yaml | 4 +++- lib/MIP/Recipes/Analysis/Retroseq.pm | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index b5f5615be..0ac5f0c0d 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -408,6 +408,7 @@ picardtools_path: - markduplicates - picardtools_collecthsmetrics - picardtools_collectmultiplemetrics + - retroseq - sv_reformat data_type: SCALAR type: path @@ -1263,9 +1264,10 @@ retroseq: file_tag: _me outfile_suffix: ".vcf.gz" program_executables: + - bcftools + - picard - retroseq.pl - tabix - - bcftools type: recipe mobile_element_reference: associated_recipe: diff --git a/lib/MIP/Recipes/Analysis/Retroseq.pm b/lib/MIP/Recipes/Analysis/Retroseq.pm index b81b32459..b8afb6584 100644 --- a/lib/MIP/Recipes/Analysis/Retroseq.pm +++ b/lib/MIP/Recipes/Analysis/Retroseq.pm @@ -131,6 +131,7 @@ sub analysis_retroseq { use MIP::Program::Bcftools qw{ bcftools_sort }; use MIP::Program::Gnu::Coreutils qw{ gnu_echo }; use MIP::Program::Htslib qw{ htslib_tabix }; + use MIP::Program::Picardtools qw{ picardtools_updatevcfsequencedictionary }; use MIP::Program::Retroseq qw{ retroseq_call retroseq_discover }; use MIP::Recipe qw{ parse_recipe_prerequisites }; use MIP::Sample_info qw{ set_file_path_to_store set_recipe_outfile_in_sample_info }; @@ -258,11 +259,24 @@ sub analysis_retroseq { ); say {$filehandle} $NEWLINE; + say {$filehandle} q{## Cleaning up the vcf}; + my $picardtools_outfile_path = $outfile_path_prefix . q{_header_fix.vcf}; + picardtools_updatevcfsequencedictionary( + { + filehandle => $filehandle, + infile_path => $discover_outfile_path, + java_jar => catfile( $active_parameter_href->{picardtools_path}, q{picard.jar} ), + outfile_path => $picardtools_outfile_path, + sequence_dictionary => $infile_path, + } + ); + say {$filehandle} $NEWLINE; + my $sort_memory = $recipe{memory} - 2; bcftools_sort( { filehandle => $filehandle, - infile_path => $call_outfile_path, + infile_path => $picardtools_outfile_path, max_mem => $sort_memory . q{G}, outfile_path => $outfile_path, output_type => q{z}, From 3373a47dbefa129894de67498d62e2bb6fd0a779 Mon Sep 17 00:00:00 2001 From: jemten Date: Wed, 5 Apr 2023 10:56:46 +0200 Subject: [PATCH 05/34] adding retroseq to install --- definitions/install_parameters.yaml | 1 + lib/MIP/Cli/Mip/Install.pm | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/definitions/install_parameters.yaml b/definitions/install_parameters.yaml index 4512614bb..d8e07bfe0 100644 --- a/definitions/install_parameters.yaml +++ b/definitions/install_parameters.yaml @@ -93,6 +93,7 @@ rd_dna: - picard - plink - python + - retroseq - rhocall - rtg-tools - sambamba diff --git a/lib/MIP/Cli/Mip/Install.pm b/lib/MIP/Cli/Mip/Install.pm index 26457a3a0..bb25d89af 100644 --- a/lib/MIP/Cli/Mip/Install.pm +++ b/lib/MIP/Cli/Mip/Install.pm @@ -143,7 +143,7 @@ sub _build_usage { qw{ arriba bedtools blobfish bootstrapann bwa bwakit bwa-mem2 cadd chanjo chromograph cnvnator cyrius deeptrio deepvariant delly expansionhunter fastqc gatk gatk4 genmod gens_preproc gffcompare glnexus hmtnote htslib manta megafusion mip mip_scripts multiqc - pdfmerger perl peddy picard plink preseq python rhocall rseqc rtg-tools salmon sambamba + pdfmerger perl peddy picard plink preseq python retroseq rhocall rseqc rtg-tools salmon sambamba smncopynumbercaller star star-fusion stranger stringtie svdb telomerecat tiddit trim-galore ucsc upd utilities varg vcf2cytosure vcfanno vep vts } ] @@ -176,7 +176,7 @@ q{Save singularity images to sif and update run instructions for offline mip exe qw{ arriba bedtools blobfish bootstrapann bwa bwakit bwa-mem2 cadd chanjo chromograph cnvnator cyrius deeptrio deepvariant delly expansionhunter fastqc gatk gatk4 genmod gens_preproc gffcompare glnexus hmtnote htslib manta megafusion mip mip_scripts multiqc - pdfmerger perl peddy picard plink preseq python rhocall rseqc rtg-tools salmon sambamba + pdfmerger perl peddy picard plink preseq python retroseq rhocall rseqc rtg-tools salmon sambamba smncopynumbercaller star star-fusion stranger stringtie svdb telomerecat tiddit trim-galore ucsc upd utilities varg vcf2cytosure vcfanno vep vts } ] From 57513bb48d7fca407082487bd02b362894adead6 Mon Sep 17 00:00:00 2001 From: jemten Date: Wed, 5 Apr 2023 15:02:25 +0200 Subject: [PATCH 06/34] fixing input path for picard --- lib/MIP/Recipes/Analysis/Retroseq.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/MIP/Recipes/Analysis/Retroseq.pm b/lib/MIP/Recipes/Analysis/Retroseq.pm index b8afb6584..572964d28 100644 --- a/lib/MIP/Recipes/Analysis/Retroseq.pm +++ b/lib/MIP/Recipes/Analysis/Retroseq.pm @@ -264,7 +264,7 @@ sub analysis_retroseq { picardtools_updatevcfsequencedictionary( { filehandle => $filehandle, - infile_path => $discover_outfile_path, + infile_path => $call_outfile_path, java_jar => catfile( $active_parameter_href->{picardtools_path}, q{picard.jar} ), outfile_path => $picardtools_outfile_path, sequence_dictionary => $infile_path, From 9dc06aa57404e8b549b01dca74e5b00fb2eb4b83 Mon Sep 17 00:00:00 2001 From: jemten Date: Wed, 5 Apr 2023 15:07:46 +0200 Subject: [PATCH 07/34] removing store command --- lib/MIP/Recipes/Analysis/Retroseq.pm | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Retroseq.pm b/lib/MIP/Recipes/Analysis/Retroseq.pm index 572964d28..e171eb3f7 100644 --- a/lib/MIP/Recipes/Analysis/Retroseq.pm +++ b/lib/MIP/Recipes/Analysis/Retroseq.pm @@ -134,7 +134,7 @@ sub analysis_retroseq { use MIP::Program::Picardtools qw{ picardtools_updatevcfsequencedictionary }; use MIP::Program::Retroseq qw{ retroseq_call retroseq_discover }; use MIP::Recipe qw{ parse_recipe_prerequisites }; - use MIP::Sample_info qw{ set_file_path_to_store set_recipe_outfile_in_sample_info }; + use MIP::Sample_info qw{ set_recipe_outfile_in_sample_info }; use MIP::Script::Setup_script qw{ setup_script }; ## Retrieve logger object @@ -310,17 +310,6 @@ sub analysis_retroseq { } ); - set_file_path_to_store( - { - format => q{vcf}, - id => $sample_id, - path => $outfile_path, - path_index => $outfile_path . $DOT . q{tbi}, - recipe_name => $recipe_name, - sample_info_href => $sample_info_href, - } - ); - submit_recipe( { base_command => $profile_base_command, From f6ebda86a914e0d2aaa5e09d29f51cc32c9b25f1 Mon Sep 17 00:00:00 2001 From: jemten Date: Wed, 5 Apr 2023 10:51:22 +0200 Subject: [PATCH 08/34] prepare bam files for drop --- definitions/rd_dna_initiation_map.yaml | 1 + definitions/rd_dna_parameters.yaml | 13 +++++++++++++ lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 11 ++++++++++- lib/MIP/Recipes/Analysis/Samtools_merge.pm | 16 +++++++++++++--- lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm | 8 +++++--- 5 files changed, 42 insertions(+), 7 deletions(-) diff --git a/definitions/rd_dna_initiation_map.yaml b/definitions/rd_dna_initiation_map.yaml index 29522e336..282dcd288 100644 --- a/definitions/rd_dna_initiation_map.yaml +++ b/definitions/rd_dna_initiation_map.yaml @@ -50,6 +50,7 @@ CHAIN_ALL: - sv_reformat - vcf2cytosure_ar - CHAIN_MOBILE_ELEMENTS: + - merge_bam_me - retroseq - CHAIN_MAIN: # PARALLEL chains, which inherit from MAIN in initiation, but are merged back to CHAIN_MAIN after execution diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 0ac5f0c0d..d2f25d404 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -226,6 +226,7 @@ recipe_core_number: gzip_fastq: 0 manta: 36 markduplicates: 13 + merge_bam_me: 5 mitodel: 1 mt_annotation: 1 multiqc_ar: 1 @@ -357,6 +358,7 @@ recipe_time: gzip_fastq: 2 manta: 30 markduplicates: 20 + merge_bam_me: 5 mitodel: 2 mt_annotation: 1 multiqc_ar: 5 @@ -1255,6 +1257,17 @@ sv_reformat_remove_genes_file: reference: reference_dir type: path ## Mobile elenment chain +merge_bam_me: + analysis_mode: sample + associated_recipe: + - mip + data_type: SCALAR + default: 1 + file_tag: "_all" + program_executables: + - samtools + outfile_suffix: ".bam" + type: recipe retroseq: analysis_mode: sample associated_recipe: diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 7590cdb5b..ab44481d1 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1157,7 +1157,16 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ); option( - q{me_retroseq} => ( + q{merge_bam_me} => ( + cmd_tags => [q{Analysis recipe switch}], + documentation => q{Prepare bam files for RetroSeq}, + is => q{rw}, + isa => enum( [ 0, 1, 2 ] ), + ) + ); + + option( + q{retroseq} => ( cmd_tags => [q{Analysis recipe switch}], documentation => q{Discover mobile elements using RetroSeq}, is => q{rw}, diff --git a/lib/MIP/Recipes/Analysis/Samtools_merge.pm b/lib/MIP/Recipes/Analysis/Samtools_merge.pm index b736caee0..0825f972a 100644 --- a/lib/MIP/Recipes/Analysis/Samtools_merge.pm +++ b/lib/MIP/Recipes/Analysis/Samtools_merge.pm @@ -475,16 +475,26 @@ sub analysis_samtools_merge_panel { ## Unpack parameters ## Get the io infiles per chain and id + + ## Special case for mobile elements + my $stream_for_input = q{in}; + my $recipe_name_for_input = $recipe_name; + if ( $recipe_name eq q{merge_bam_me} ) { + + $recipe_name_for_input = $active_parameter_href->{bwa_mem2} ? q{bwa_mem2} : q{bwa_mem}; + $stream_for_input = q{out}; + } + my %io = get_io_files( { id => $sample_id, file_info_href => $file_info_href, parameter_href => $parameter_href, - recipe_name => $recipe_name, - stream => q{in}, + recipe_name => $recipe_name_for_input, + stream => $stream_for_input, } ); - my @infile_paths = @{ $io{in}{file_paths} }; + my @infile_paths = @{ $io{$stream_for_input}{file_paths} }; my %recipe = parse_recipe_prerequisites( { diff --git a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm index 9df2fb9d5..cfe983299 100644 --- a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm +++ b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm @@ -130,8 +130,8 @@ sub parse_rd_dna { Readonly my @ONLY_WGS_VARIANT_CALLER_RECIPES => qw{ cnvnator_ar delly_reformat tiddit }; Readonly my @ONLY_WGS_RECIPIES => qw{ chromograph_rhoviz cnvnator_ar delly_call delly_reformat expansionhunter - gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata mitodel retroseq - samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; + gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata merge_bam_me mitodel + retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; Readonly my @REMOVE_CONFIG_KEYS => qw{ associated_recipe }; ## Set analysis constants @@ -496,7 +496,8 @@ sub pipeline_analyse_rd_dna { use MIP::Recipes::Analysis::Rtg_vcfeval qw{ analysis_rtg_vcfeval }; use MIP::Recipes::Analysis::Sacct qw{ analysis_sacct }; use MIP::Recipes::Analysis::Sambamba_depth qw{ analysis_sambamba_depth }; - use MIP::Recipes::Analysis::Samtools_merge qw{ analysis_samtools_merge }; + use MIP::Recipes::Analysis::Samtools_merge + qw{ analysis_samtools_merge analysis_samtools_merge_panel }; use MIP::Recipes::Analysis::Samtools_subsample_mt qw{ analysis_samtools_subsample_mt }; use MIP::Recipes::Analysis::Smncopynumbercaller qw{ analysis_smncopynumbercaller }; use MIP::Recipes::Analysis::Star_caller qw{ analysis_star_caller }; @@ -587,6 +588,7 @@ sub pipeline_analyse_rd_dna { gzip_fastq => \&analysis_gzip_fastq, manta => \&analysis_manta, markduplicates => \&analysis_markduplicates, + merge_bam_me => \&analysis_samtools_merge_panel, mitodel => \&analysis_mitodel, mt_annotation => \&analysis_mt_annotation, multiqc_ar => \&analysis_multiqc, From 0b2daa77c24e7c45acb199e3a8a0765f1143fb0a Mon Sep 17 00:00:00 2001 From: jemten Date: Wed, 5 Apr 2023 16:46:07 +0200 Subject: [PATCH 09/34] samtools view -> cp for single bam --- lib/MIP/Recipes/Analysis/Samtools_merge.pm | 25 ++++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Samtools_merge.pm b/lib/MIP/Recipes/Analysis/Samtools_merge.pm index 0825f972a..1c62d1f3e 100644 --- a/lib/MIP/Recipes/Analysis/Samtools_merge.pm +++ b/lib/MIP/Recipes/Analysis/Samtools_merge.pm @@ -463,7 +463,8 @@ sub analysis_samtools_merge_panel { use MIP::File_info qw{ get_io_files parse_io_outfiles set_merged_infile_prefix }; use MIP::Processmanagement::Processes qw{ submit_recipe }; - use MIP::Program::Samtools qw{ samtools_merge samtools_view }; + use MIP::Program::Gnu::Coreutils qw{ gnu_cp }; + use MIP::Program::Samtools qw{ samtools_merge samtools_index }; use MIP::Recipe qw{ parse_recipe_prerequisites }; use MIP::Sample_info qw{ set_recipe_outfile_in_sample_info }; use MIP::Script::Setup_script qw{ setup_script }; @@ -576,7 +577,6 @@ sub analysis_samtools_merge_panel { ## More than one file - we have something to merge if ( scalar @infile_paths > 1 ) { - ## Samtools_merge say {$filehandle} q{## Merging alignment files}; samtools_merge( { @@ -586,7 +586,7 @@ sub analysis_samtools_merge_panel { outfile_path => $outfile_path, output_format => q{bam}, referencefile_path => ${active_parameter_href}->{human_genome_reference}, - thread_number => 2, + thread_number => $core_number - 1, write_index => 1, } ); @@ -596,13 +596,20 @@ sub analysis_samtools_merge_panel { ## Only 1 infile - rename sample and index instead of merge to streamline handling of filenames downstream say {$filehandle} q{## Only one infile, rename to streamline handling of files downstream}; - samtools_view( + gnu_cp( + { + filehandle => $filehandle, + infile_path => $infile_paths[0], + outfile_path => $outfile_path, + } + ); + say {$filehandle} $NEWLINE; + + samtools_index( { - filehandle => $filehandle, - infile_path => $infile_paths[0], - outfile_path => $outfile_path, - output_format => q{bam}, - write_index => 1, + bai_format => 1, + filehandle => $filehandle, + infile_path => $outfile_path, } ); say {$filehandle} $NEWLINE; From 1c72a4c2afd404f0adfd2a17278b3eca8fc89421 Mon Sep 17 00:00:00 2001 From: jemten Date: Wed, 5 Apr 2023 18:26:16 +0200 Subject: [PATCH 10/34] changing name of prepare bam recipe --- definitions/rd_dna_initiation_map.yaml | 2 +- definitions/rd_dna_parameters.yaml | 8 ++++---- lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 2 +- lib/MIP/Recipes/Analysis/Samtools_merge.pm | 2 +- lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/definitions/rd_dna_initiation_map.yaml b/definitions/rd_dna_initiation_map.yaml index 282dcd288..e8f4e15fc 100644 --- a/definitions/rd_dna_initiation_map.yaml +++ b/definitions/rd_dna_initiation_map.yaml @@ -50,7 +50,7 @@ CHAIN_ALL: - sv_reformat - vcf2cytosure_ar - CHAIN_MOBILE_ELEMENTS: - - merge_bam_me + - me_merge_bam - retroseq - CHAIN_MAIN: # PARALLEL chains, which inherit from MAIN in initiation, but are merged back to CHAIN_MAIN after execution diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index d2f25d404..7a34af6bd 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -226,7 +226,7 @@ recipe_core_number: gzip_fastq: 0 manta: 36 markduplicates: 13 - merge_bam_me: 5 + me_merge_bam: 5 mitodel: 1 mt_annotation: 1 multiqc_ar: 1 @@ -358,7 +358,7 @@ recipe_time: gzip_fastq: 2 manta: 30 markduplicates: 20 - merge_bam_me: 5 + me_merge_bam: 5 mitodel: 2 mt_annotation: 1 multiqc_ar: 5 @@ -1256,8 +1256,8 @@ sv_reformat_remove_genes_file: mandatory: no reference: reference_dir type: path -## Mobile elenment chain -merge_bam_me: +## Mobile element chain +me_merge_bam: analysis_mode: sample associated_recipe: - mip diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index ab44481d1..0731cf1aa 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1157,7 +1157,7 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ); option( - q{merge_bam_me} => ( + q{me_merge_bam} => ( cmd_tags => [q{Analysis recipe switch}], documentation => q{Prepare bam files for RetroSeq}, is => q{rw}, diff --git a/lib/MIP/Recipes/Analysis/Samtools_merge.pm b/lib/MIP/Recipes/Analysis/Samtools_merge.pm index 1c62d1f3e..29f4aeeb1 100644 --- a/lib/MIP/Recipes/Analysis/Samtools_merge.pm +++ b/lib/MIP/Recipes/Analysis/Samtools_merge.pm @@ -480,7 +480,7 @@ sub analysis_samtools_merge_panel { ## Special case for mobile elements my $stream_for_input = q{in}; my $recipe_name_for_input = $recipe_name; - if ( $recipe_name eq q{merge_bam_me} ) { + if ( $recipe_name eq q{me_merge_bam} ) { $recipe_name_for_input = $active_parameter_href->{bwa_mem2} ? q{bwa_mem2} : q{bwa_mem}; $stream_for_input = q{out}; diff --git a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm index cfe983299..d3368405c 100644 --- a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm +++ b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm @@ -130,7 +130,7 @@ sub parse_rd_dna { Readonly my @ONLY_WGS_VARIANT_CALLER_RECIPES => qw{ cnvnator_ar delly_reformat tiddit }; Readonly my @ONLY_WGS_RECIPIES => qw{ chromograph_rhoviz cnvnator_ar delly_call delly_reformat expansionhunter - gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata merge_bam_me mitodel + gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_merge_bam mitodel retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; Readonly my @REMOVE_CONFIG_KEYS => qw{ associated_recipe }; @@ -588,7 +588,7 @@ sub pipeline_analyse_rd_dna { gzip_fastq => \&analysis_gzip_fastq, manta => \&analysis_manta, markduplicates => \&analysis_markduplicates, - merge_bam_me => \&analysis_samtools_merge_panel, + me_merge_bam => \&analysis_samtools_merge_panel, mitodel => \&analysis_mitodel, mt_annotation => \&analysis_mt_annotation, multiqc_ar => \&analysis_multiqc, From d67dc901e827f7898c1f77a95ab5780f4f84c38b Mon Sep 17 00:00:00 2001 From: jemten Date: Wed, 5 Apr 2023 18:14:56 +0200 Subject: [PATCH 11/34] merge mobile element vcfs --- definitions/rd_dna_initiation_map.yaml | 1 + definitions/rd_dna_parameters.yaml | 15 ++ lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 9 + lib/MIP/Recipes/Analysis/Svdb_merge_me.pm | 263 +++++++++++++++++++++ lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm | 6 +- t/analysis_me_merge_vcfs.t | 118 +++++++++ 6 files changed, 410 insertions(+), 2 deletions(-) create mode 100644 lib/MIP/Recipes/Analysis/Svdb_merge_me.pm create mode 100644 t/analysis_me_merge_vcfs.t diff --git a/definitions/rd_dna_initiation_map.yaml b/definitions/rd_dna_initiation_map.yaml index e8f4e15fc..201afb9da 100644 --- a/definitions/rd_dna_initiation_map.yaml +++ b/definitions/rd_dna_initiation_map.yaml @@ -52,6 +52,7 @@ CHAIN_ALL: - CHAIN_MOBILE_ELEMENTS: - me_merge_bam - retroseq + - me_merge_vcfs - CHAIN_MAIN: # PARALLEL chains, which inherit from MAIN in initiation, but are merged back to CHAIN_MAIN after execution - PARALLEL: diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 7a34af6bd..49635f846 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -227,6 +227,7 @@ recipe_core_number: manta: 36 markduplicates: 13 me_merge_bam: 5 + me_merge_vcfs: 2 mitodel: 1 mt_annotation: 1 multiqc_ar: 1 @@ -359,6 +360,7 @@ recipe_time: manta: 30 markduplicates: 20 me_merge_bam: 5 + me_merge_vcfs: 2 mitodel: 2 mt_annotation: 1 multiqc_ar: 5 @@ -1289,6 +1291,19 @@ mobile_element_reference: is_reference: 1 reference: reference_dir type: path +me_merge_vcfs: + analysis_mode: case + associated_recipe: + - mip + data_type: SCALAR + default: 1 + file_tag: _me + outfile_suffix: ".vcf.gz" + program_executables: + - bgzip + - svdb + - tabix + type: recipe ## GATK CollectReadCounts gatk_collectreadcounts: analysis_mode: sample diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 0731cf1aa..12fe00951 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1174,6 +1174,15 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ) ); + option( + q{me_merge_vcfs} => ( + cmd_tags => [q{Analysis recipe switch}], + documentation => q{Merge sample vcfs from RetroSeq}, + is => q{rw}, + isa => enum( [ 0, 1, 2 ] ), + ) + ); + option( q{mobile_element_reference} => ( cmd_tags => [q{file.vcf=TE_type}], diff --git a/lib/MIP/Recipes/Analysis/Svdb_merge_me.pm b/lib/MIP/Recipes/Analysis/Svdb_merge_me.pm new file mode 100644 index 000000000..ab4bafb3d --- /dev/null +++ b/lib/MIP/Recipes/Analysis/Svdb_merge_me.pm @@ -0,0 +1,263 @@ +package MIP::Recipes::Analysis::Svdb_merge_me; + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile devnull }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use utf8; +use warnings; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw{ :all }; +use Readonly; + +## MIPs lib/ +use MIP::Constants qw{ $DASH $LOG_NAME $NEWLINE $PIPE $SINGLE_QUOTE $SPACE }; + +BEGIN { + + require Exporter; + use base qw{ Exporter }; + + # Functions and variables which can be optionally exported + our @EXPORT_OK = qw{ analysis_me_merge_vcfs }; + +} + +sub analysis_me_merge_vcfs { + +## Function : Merge sample vcfs from RetroSeq to generate a case vcf. +## Returns : +## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} +## : $case_id => Family id +## : $file_info_href => File_info hash {REF} +## : $job_id_href => Job id hash {REF} +## : $parameter_href => Parameter hash {REF} +## : $profile_base_command => Submission profile base command +## : $recipe_name => Recipe name +## : $sample_info_href => Info on samples and case hash {REF} + + my ($arg_href) = @_; + + ## Flatten argument(s) + my $active_parameter_href; + my $file_info_href; + my $job_id_href; + my $parameter_href; + my $recipe_name; + my $sample_info_href; + + ## Default(s) + my $case_id; + my $profile_base_command; + + my $tmpl = { + active_parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$active_parameter_href, + strict_type => 1, + }, + case_id => { + default => $arg_href->{active_parameter_href}{case_id}, + store => \$case_id, + strict_type => 1, + }, + file_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$file_info_href, + strict_type => 1, + }, + job_id_href => { + default => {}, + defined => 1, + required => 1, + store => \$job_id_href, + strict_type => 1, + }, + parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$parameter_href, + strict_type => 1, + }, + profile_base_command => { + default => q{sbatch}, + store => \$profile_base_command, + strict_type => 1, + }, + recipe_name => { + defined => 1, + required => 1, + store => \$recipe_name, + strict_type => 1, + }, + sample_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$sample_info_href, + strict_type => 1, + }, + }; + check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; + + use MIP::File_info qw{ get_io_files parse_io_outfiles }; + use MIP::Program::Svdb qw{ svdb_merge }; + use MIP::Program::Htslib qw{ htslib_bgzip htslib_tabix }; + use MIP::Processmanagement::Processes qw{ submit_recipe }; + use MIP::Recipe qw{ parse_recipe_prerequisites }; + use MIP::Sample_info qw{ set_file_path_to_store set_recipe_outfile_in_sample_info }; + use MIP::Script::Setup_script qw{ setup_script }; + + ### PREPROCESSING: + + ## Retrieve logger object + my $log = Log::Log4perl->get_logger($LOG_NAME); + + my %recipe = parse_recipe_prerequisites( + { + active_parameter_href => $active_parameter_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ); + my $core_number = $recipe{core_number}; + my $memory = $recipe{memory}; + + ## Get the io infiles per chain and id + my @me_infile_paths; + + SAMPLE_ID: + foreach my $sample_id ( @{ $active_parameter_href->{sample_ids} } ) { + + ## Get the io infiles per chain and id + my %sample_io = get_io_files( + { + id => $sample_id, + file_info_href => $file_info_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + stream => q{in}, + } + ); + push @me_infile_paths, $sample_io{in}{file_path}; + } + + my %io = parse_io_outfiles( + { + chain_id => $recipe{job_id_chain}, + id => $case_id, + file_info_href => $file_info_href, + file_name_prefixes_ref => [$case_id], + outdata_dir => $active_parameter_href->{outdata_dir}, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ); + my $outfile_path = $io{out}{file_path}; + my $outfile_path_prefix = $io{out}{file_path_prefix}; + + ## Filehandles + # Create anonymous filehandle + my $filehandle = IO::Handle->new(); + + ## Creates recipe directories (info & data & script), recipe script filenames and writes sbatch header + my ( $recipe_file_path, $recipe_info_path ) = setup_script( + { + active_parameter_href => $active_parameter_href, + core_number => $core_number, + directory_id => $case_id, + filehandle => $filehandle, + job_id_href => $job_id_href, + memory_allocation => $memory, + process_time => $recipe{time}, + recipe_directory => $recipe_name, + recipe_name => $recipe_name, + } + ); + + ### SHELL: + + say {$filehandle} q{## } . $recipe_name; + + my $svdb_merge_output = $outfile_path_prefix . q{.vcf}; + svdb_merge( + { + filehandle => $filehandle, + infile_paths_ref => \@me_infile_paths, + } + ); + print ${filehandle} $PIPE . $SPACE; + + htslib_bgzip( + { + filehandle => $filehandle, + stdoutfile_path => $outfile_path, + threads => $core_number, + + } + ); + say {$filehandle} $NEWLINE; + + htslib_tabix { + ( + filehandle => $filehandle, + infile_path => $outfile_path, + ) + }; + + ## Close filehandle + close $filehandle or $log->logcroak(q{Could not close filehandle}); + + if ( $recipe{mode} == 1 ) { + + set_recipe_outfile_in_sample_info( + { + path => $outfile_path, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + } + ); + + set_file_path_to_store( + { + format => q{vcf}, + id => $case_id, + path => $outfile_path, + path_index => $outfile_path . q{.tbi}, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + } + ); + + submit_recipe( + { + base_command => $profile_base_command, + case_id => $case_id, + dependency_method => q{sample_to_case}, + log => $log, + job_id_chain => $recipe{job_id_chain}, + job_id_href => $job_id_href, + job_reservation_name => $active_parameter_href->{job_reservation_name}, + max_parallel_processes_count_href => + $file_info_href->{max_parallel_processes_count}, + recipe_file_path => $recipe_file_path, + sample_ids_ref => \@{ $active_parameter_href->{sample_ids} }, + submission_profile => $active_parameter_href->{submission_profile}, + } + ); + } + return 1; +} + +1; diff --git a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm index d3368405c..dda099ea8 100644 --- a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm +++ b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm @@ -130,8 +130,8 @@ sub parse_rd_dna { Readonly my @ONLY_WGS_VARIANT_CALLER_RECIPES => qw{ cnvnator_ar delly_reformat tiddit }; Readonly my @ONLY_WGS_RECIPIES => qw{ chromograph_rhoviz cnvnator_ar delly_call delly_reformat expansionhunter - gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_merge_bam mitodel - retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; + gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_merge_bam me_merge_vcfs + mitodel retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; Readonly my @REMOVE_CONFIG_KEYS => qw{ associated_recipe }; ## Set analysis constants @@ -504,6 +504,7 @@ sub pipeline_analyse_rd_dna { use MIP::Recipes::Analysis::Sv_annotate qw{ analysis_sv_annotate }; use MIP::Recipes::Analysis::Sv_reformat qw{ analysis_reformat_sv }; use MIP::Recipes::Analysis::Sv_combinevariantcallsets qw{ analysis_sv_combinevariantcallsets }; + use MIP::Recipes::Analysis::Svdb_merge_me qw{ analysis_me_merge_vcfs }; use MIP::Recipes::Analysis::Telomerecat qw{ analysis_telomerecat }; use MIP::Recipes::Analysis::Tiddit qw{ analysis_tiddit }; use MIP::Recipes::Analysis::Tiddit_coverage qw{ analysis_tiddit_coverage }; @@ -589,6 +590,7 @@ sub pipeline_analyse_rd_dna { manta => \&analysis_manta, markduplicates => \&analysis_markduplicates, me_merge_bam => \&analysis_samtools_merge_panel, + me_merge_vcfs => \&analysis_me_merge_vcfs, mitodel => \&analysis_mitodel, mt_annotation => \&analysis_mt_annotation, multiqc_ar => \&analysis_multiqc, diff --git a/t/analysis_me_merge_vcfs.t b/t/analysis_me_merge_vcfs.t new file mode 100644 index 000000000..787f52788 --- /dev/null +++ b/t/analysis_me_merge_vcfs.t @@ -0,0 +1,118 @@ +#!/usr/bin/env perl + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile }; +use FindBin qw{ $Bin }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use Test::More; +use utf8; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw { :all }; +use Modern::Perl qw{ 2018 }; +use Test::Trap; + +## MIPs lib/ +use lib catdir( dirname($Bin), q{lib} ); +use MIP::Constants qw{ $COLON $COMMA $SPACE }; +use MIP::Test::Fixtures qw{ test_add_io_for_recipe test_log test_mip_hashes }; + +BEGIN { + + use MIP::Test::Fixtures qw{ test_import }; + +### Check all internal dependency modules and imports +## Modules with import + my %perl_module = ( + q{MIP::Recipes::Analysis::Svdb_merge_me} => [qw{ analysis_me_merge_vcfs }], + q{MIP::Test::Fixtures} => [qw{ test_add_io_for_recipe test_log test_mip_hashes }], + ); + + test_import( { perl_module_href => \%perl_module, } ); +} + +use MIP::Recipes::Analysis::Svdb_merge_me qw{ analysis_me_merge_vcfs }; + +diag( q{Test analysis_me_merge_vcfs from Svdb_merge_me.pm} + . $COMMA + . $SPACE . q{Perl} + . $SPACE + . $PERL_VERSION + . $SPACE + . $EXECUTABLE_NAME ); + +test_log( { no_screen => 1, } ); + +## Given analysis parameters +my $recipe_name = q{me_merge_vcfs}; +my $slurm_mock_cmd = catfile( $Bin, qw{ data modules slurm-mock.pl } ); + +my %active_parameter = test_mip_hashes( + { + mip_hash_name => q{active_parameter}, + recipe_name => $recipe_name, + } +); +$active_parameter{$recipe_name} = 1; +$active_parameter{recipe_core_number}{$recipe_name} = 1; +$active_parameter{recipe_time}{$recipe_name} = 1; +my $case_id = $active_parameter{case_id}; + +my %file_info = test_mip_hashes( + { + mip_hash_name => q{file_info}, + recipe_name => $recipe_name, + } +); + +my %job_id; +my %parameter = test_mip_hashes( + { + mip_hash_name => q{recipe_parameter}, + recipe_name => $recipe_name, + } +); + +SAMPLE_ID: +foreach my $sample_id ( @{ $active_parameter{sample_ids} } ) { + + test_add_io_for_recipe( + { + file_info_href => \%file_info, + id => $sample_id, + parameter_href => \%parameter, + recipe_name => $recipe_name, + step => q{vcf}, + } + ); +} + +my %sample_info = test_mip_hashes( + { + mip_hash_name => q{qc_sample_info}, + recipe_name => $recipe_name, + } +); + +my $is_ok = analysis_me_merge_vcfs( + { + active_parameter_href => \%active_parameter, + case_id => $case_id, + file_info_href => \%file_info, + job_id_href => \%job_id, + parameter_href => \%parameter, + profile_base_command => $slurm_mock_cmd, + recipe_name => $recipe_name, + sample_info_href => \%sample_info, + } +); +## Then return TRUE +ok( $is_ok, q{ Executed analysis recipe } . $recipe_name ); + +done_testing(); From 3d1f0717ce86559d9e5b97def3a4583a41c00249 Mon Sep 17 00:00:00 2001 From: jemten Date: Thu, 6 Apr 2023 11:15:15 +0200 Subject: [PATCH 12/34] adding bnd and overlap option to svdb merge --- definitions/rd_dna_parameters.yaml | 12 ++++++++ lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 18 ++++++++++++ lib/MIP/Program/Svdb.pm | 36 ++++++++++++++++++----- lib/MIP/Recipes/Analysis/Svdb_merge_me.pm | 2 ++ t/svdb_merge.t | 12 ++++++++ 5 files changed, 73 insertions(+), 7 deletions(-) diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 49635f846..eb8ab8904 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -1304,6 +1304,18 @@ me_merge_vcfs: - svdb - tabix type: recipe +me_merge_vcfs_overlap: + associated_recipe: + - me_merge_vcfs + data_type: SCALAR + default: 0.5 + type: recipe_argument +me_merge_vcfs_bnd_distance: + associated_recipe: + - me_merge_vcfs + data_type: SCALAR + default: 150 + type: recipe_argument ## GATK CollectReadCounts gatk_collectreadcounts: analysis_mode: sample diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 12fe00951..3f895b9a2 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1183,6 +1183,24 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ) ); + option( + q{me_merge_vcfs_bnd_distance} => ( + cmd_tags => [q{Default: 150}], + documentation => q{Maximum distance between two similar BNDs}, + is => q{rw}, + isa => Num, + ) + ); + + option( + q{me_merge_vcfs_overlap} => ( + cmd_tags => [q{Default: }], + documentation => q{Overlap required to mege two events}, + is => q{rw}, + isa => Num, + ) + ); + option( q{mobile_element_reference} => ( cmd_tags => [q{file.vcf=TE_type}], diff --git a/lib/MIP/Program/Svdb.pm b/lib/MIP/Program/Svdb.pm index 14dd6282d..d5ef4ff87 100644 --- a/lib/MIP/Program/Svdb.pm +++ b/lib/MIP/Program/Svdb.pm @@ -34,10 +34,12 @@ sub svdb_merge { ## Function : Perl wrapper for writing svdb merge recipe to $filehandle or return commands array. Based on svdb 1.0.7. ## Returns : @commands -## Arguments: $filehandle => Filehandle to write to +## Arguments: $bnd_distance => Maximum distance between two similar precise breakpoints +## : $filehandle => Filehandle to write to ## : $infile_paths_ref => Infile path {REF} ## : $notag => Do not add the the VARID and set entries to the info field ## : $outfile_path => Outfile path +## : $overlap => Overlap required to merge two events ## : $pass_only => Only merge variants labeled PASS ## : $priority => Priority order of structural variant calls ## : $same_order => Across all input vcf files, the order of the sample columns are the same @@ -48,10 +50,12 @@ sub svdb_merge { my ($arg_href) = @_; ## Flatten argument(s) + my $bnd_distance; my $filehandle; my $infile_paths_ref; my $notag; my $outfile_path; + my $overlap; my $pass_only; my $priority; my $same_order; @@ -60,6 +64,11 @@ sub svdb_merge { my $stdoutfile_path; my $tmpl = { + bnd_distance => { + allow => [ undef, qr/ \A \d+ \z /sxm ], + strict_type => 1, + store => \$bnd_distance + }, filehandle => { store => \$filehandle, }, infile_paths_ref => { default => [], @@ -70,7 +79,12 @@ sub svdb_merge { }, notag => { store => \$notag, strict_type => 1, }, outfile_path => { store => \$outfile_path, strict_type => 1, }, - pass_only => { + overlap => { + allow => [ undef, qr/ \A [-]? \d+ | \d+[.]\d+ \z /sxm ], + strict_type => 1, + store => \$overlap + }, + pass_only => { allow => [ undef, 0, 1 ], store => \$pass_only, strict_type => 1, @@ -96,6 +110,18 @@ sub svdb_merge { my @commands = ( get_executable_base_command( { base_command => $BASE_COMMAND, } ), qw{ --merge } ); + if ($bnd_distance) { + + push @commands, q{--bnd_distance} . $SPACE . $bnd_distance; + } + if ($notag) { + + push @commands, q{--notag}; + } + if ($overlap) { + + push @commands, q{--overlap} . $SPACE . $overlap; + } if ($pass_only) { push @commands, q{--pass_only},; @@ -104,10 +130,6 @@ sub svdb_merge { push @commands, q{--priority} . $SPACE . $priority; } - if ($notag) { - - push @commands, q{--notag}; - } if ($same_order) { push @commands, q{--same_order}; @@ -207,7 +229,7 @@ sub svdb_query { out_allele_count_tag => { strict_type => 1, store => \$out_allele_count_tag }, out_frequency_tag => { strict_type => 1, store => \$out_frequency_tag }, overlap => { - allow => qr/ \A \d+ | d+[.]d+$ /sxm, + allow => qr/ \A [-]? \d+ | \d+[.]\d+ \z /sxm, strict_type => 1, store => \$overlap }, diff --git a/lib/MIP/Recipes/Analysis/Svdb_merge_me.pm b/lib/MIP/Recipes/Analysis/Svdb_merge_me.pm index ab4bafb3d..e4a44930a 100644 --- a/lib/MIP/Recipes/Analysis/Svdb_merge_me.pm +++ b/lib/MIP/Recipes/Analysis/Svdb_merge_me.pm @@ -193,8 +193,10 @@ sub analysis_me_merge_vcfs { my $svdb_merge_output = $outfile_path_prefix . q{.vcf}; svdb_merge( { + bnd_distance => $active_parameter_href->{me_merge_vcfs_bnd_distance}, filehandle => $filehandle, infile_paths_ref => \@me_infile_paths, + overlap => $active_parameter_href->{me_merge_vcfs_overlap}, } ); print ${filehandle} $PIPE . $SPACE; diff --git a/t/svdb_merge.t b/t/svdb_merge.t index db318a113..04e560492 100644 --- a/t/svdb_merge.t +++ b/t/svdb_merge.t @@ -44,6 +44,10 @@ diag( q{Test svdb_merge from Svdb.pm} . $SPACE . $EXECUTABLE_NAME ); +## Constants +Readonly my $BND_DISTANCE => 2_000; +Readonly my $EVENT_OVERLAP => -1; + ## Base arguments my @function_base_commands = qw{ svdb --merge }; @@ -80,6 +84,10 @@ my %required_argument = ( ); my %specific_argument = ( + bnd_distance => { + input => $BND_DISTANCE, + expected_output => q{--bnd_distance} . $SPACE . $BND_DISTANCE, + }, infile_paths_ref => { inputs_ref => [ catfile(qw{ a test infile_1 }), catfile(qw{ a test infile_2 }) ], expected_output => q{--vcf} @@ -92,6 +100,10 @@ my %specific_argument = ( input => q{1}, expected_output => q{--notag}, }, + overlap => { + input => $EVENT_OVERLAP, + expected_output => q{--overlap} . $SPACE . $EVENT_OVERLAP, + }, pass_only => { input => 1, expected_output => q{--pass_only}, From d435707c6afc81801db7ce5fdcbb53dc06727baf Mon Sep 17 00:00:00 2001 From: jemten Date: Wed, 12 Apr 2023 15:07:03 +0200 Subject: [PATCH 13/34] Annotating mobile element with swegen frq --- definitions/rd_dna_initiation_map.yaml | 1 + definitions/rd_dna_parameters.yaml | 42 +++ lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 37 ++- lib/MIP/Recipes/Analysis/Me_annotate.pm | 312 ++++++++++++++++++ lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm | 4 +- t/analysis_me_annotate.t | 122 +++++++ t/data/references/grch37_alu_swegen.vcf.gz | 0 .../references/grch37_alu_swegen.vcf.gz.tbi | 0 t/data/references/grch37_herv_swegen.vcf.gz | 0 .../references/grch37_herv_swegen.vcf.gz.tbi | 0 t/data/references/grch37_l1_swegen.vcf.gz | 0 t/data/references/grch37_l1_swegen.vcf.gz.tbi | 0 t/data/references/grch37_sva_swegen.vcf.gz | 0 .../references/grch37_sva_swegen.vcf.gz.tbi | 0 t/data/references/grch38_alu_swegen.vcf.gz | 0 .../references/grch38_alu_swegen.vcf.gz.tbi | 0 t/data/references/grch38_herv_swegen.vcf.gz | 0 .../references/grch38_herv_swegen.vcf.gz.tbi | 0 t/data/references/grch38_l1_swegen.vcf.gz | 0 t/data/references/grch38_l1_swegen.vcf.gz.tbi | 0 t/data/references/grch38_sva_swegen.vcf.gz | 0 .../references/grch38_sva_swegen.vcf.gz.tbi | 0 templates/grch38_mip_rd_dna_config.yaml | 7 + templates/mip_rd_dna_config.yaml | 7 + 24 files changed, 530 insertions(+), 2 deletions(-) create mode 100644 lib/MIP/Recipes/Analysis/Me_annotate.pm create mode 100644 t/analysis_me_annotate.t create mode 100644 t/data/references/grch37_alu_swegen.vcf.gz create mode 100644 t/data/references/grch37_alu_swegen.vcf.gz.tbi create mode 100644 t/data/references/grch37_herv_swegen.vcf.gz create mode 100644 t/data/references/grch37_herv_swegen.vcf.gz.tbi create mode 100644 t/data/references/grch37_l1_swegen.vcf.gz create mode 100644 t/data/references/grch37_l1_swegen.vcf.gz.tbi create mode 100644 t/data/references/grch37_sva_swegen.vcf.gz create mode 100644 t/data/references/grch37_sva_swegen.vcf.gz.tbi create mode 100644 t/data/references/grch38_alu_swegen.vcf.gz create mode 100644 t/data/references/grch38_alu_swegen.vcf.gz.tbi create mode 100644 t/data/references/grch38_herv_swegen.vcf.gz create mode 100644 t/data/references/grch38_herv_swegen.vcf.gz.tbi create mode 100644 t/data/references/grch38_l1_swegen.vcf.gz create mode 100644 t/data/references/grch38_l1_swegen.vcf.gz.tbi create mode 100644 t/data/references/grch38_sva_swegen.vcf.gz create mode 100644 t/data/references/grch38_sva_swegen.vcf.gz.tbi diff --git a/definitions/rd_dna_initiation_map.yaml b/definitions/rd_dna_initiation_map.yaml index 201afb9da..9942808d4 100644 --- a/definitions/rd_dna_initiation_map.yaml +++ b/definitions/rd_dna_initiation_map.yaml @@ -53,6 +53,7 @@ CHAIN_ALL: - me_merge_bam - retroseq - me_merge_vcfs + - me_annotate - CHAIN_MAIN: # PARALLEL chains, which inherit from MAIN in initiation, but are merged back to CHAIN_MAIN after execution - PARALLEL: diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index eb8ab8904..6d57b2931 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -226,6 +226,7 @@ recipe_core_number: gzip_fastq: 0 manta: 36 markduplicates: 13 + me_annotate: 2 me_merge_bam: 5 me_merge_vcfs: 2 mitodel: 1 @@ -359,6 +360,7 @@ recipe_time: gzip_fastq: 2 manta: 30 markduplicates: 20 + me_annotate: 2 me_merge_bam: 5 me_merge_vcfs: 2 mitodel: 2 @@ -1316,6 +1318,46 @@ me_merge_vcfs_bnd_distance: data_type: SCALAR default: 150 type: recipe_argument +me_annotate: + analysis_mode: case + associated_recipe: + - mip + data_type: SCALAR + default: 1 + file_tag: _me_ann + outfile_suffix: ".vcf.gz" + program_executables: + - svdb + - tabix + type: recipe +me_annotate_query_files: + associated_recipe: + - me_annotate + data_type: HASH + exists_check: file + is_reference: 1 + reference: reference_dir + type: path +me_annotate_query_overlap: + associated_recipe: + - me_annotate + data_type: SCALAR + default: -1 + type: recipe_argument +me_annotate_query_bnd_distance: + associated_recipe: + - me_annotate + data_type: SCALAR + default: 150 + type: recipe_argument +me_annotate_exons_bed: + associated_recipe: + - me_annotate + data_type: SCALAR + exists_check: file + is_reference: 1 + reference: reference_dir + type: path ## GATK CollectReadCounts gatk_collectreadcounts: analysis_mode: sample diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 3f895b9a2..1721d4704 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1195,7 +1195,7 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds option( q{me_merge_vcfs_overlap} => ( cmd_tags => [q{Default: }], - documentation => q{Overlap required to mege two events}, + documentation => q{Overlap required to merge two events}, is => q{rw}, isa => Num, ) @@ -1210,6 +1210,41 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ) ); + option( + q{me_annotate} => ( + cmd_tags => [q{Analysis recipe switch}], + documentation => q{Annotate mobile elememnt}, + is => q{rw}, + isa => enum( [ 0, 1, 2 ] ), + ) + ); + + option( + q{me_annotate_query_bnd_distance} => ( + cmd_tags => [q{Default: 150}], + documentation => q{Maximum distance between two similar BNDs}, + is => q{rw}, + isa => Num, + ) + ); + + option( + q{me_annotate_query_overlap} => ( + cmd_tags => [q{Default: }], + documentation => q{Overlap required to annotate}, + is => q{rw}, + isa => Num, + ) + ); + + option( + q{me_annotate_exons_bed} => ( + documentation => q{Exon definitions in bed format}, + is => q{rw}, + isa => Str, + ) + ); + option( q{gatk_haplotypecaller} => ( cmd_tags => [q{Analysis recipe switch}], diff --git a/lib/MIP/Recipes/Analysis/Me_annotate.pm b/lib/MIP/Recipes/Analysis/Me_annotate.pm new file mode 100644 index 000000000..7dcb2bcb5 --- /dev/null +++ b/lib/MIP/Recipes/Analysis/Me_annotate.pm @@ -0,0 +1,312 @@ +package MIP::Recipes::Analysis::Me_annotate; + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile devnull }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use utf8; +use warnings; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw{ :all }; +use Readonly; + +## MIPs lib/ +use MIP::Constants qw{ $EMPTY_STR $DOT $LOG_NAME $NEWLINE $PIPE $SPACE $UNDERSCORE }; + +BEGIN { + + require Exporter; + use base qw{ Exporter }; + + # Functions and variables which can be optionally exported + our @EXPORT_OK = qw{ analysis_me_annotate }; + +} + +sub analysis_me_annotate { + +## Function : Annotate mobile elements. +## Returns : +## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} +## : $case_id => Family id +## : $file_info_href => File_info hash {REF} +## : $job_id_href => Job id hash {REF} +## : $parameter_href => Parameter hash {REF} +## : $profile_base_command => Submission profile base command +## : $recipe_name => Recipe name +## : $sample_info_href => Info on samples and case hash {REF} + + my ($arg_href) = @_; + + ## Flatten argument(s) + my $active_parameter_href; + my $file_info_href; + my $job_id_href; + my $parameter_href; + my $recipe_name; + my $sample_info_href; + + ## Default(s) + my $case_id; + my $profile_base_command; + + my $tmpl = { + active_parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$active_parameter_href, + strict_type => 1, + }, + case_id => { + default => $arg_href->{active_parameter_href}{case_id}, + store => \$case_id, + strict_type => 1, + }, + file_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$file_info_href, + strict_type => 1, + }, + job_id_href => { + default => {}, + defined => 1, + required => 1, + store => \$job_id_href, + strict_type => 1, + }, + parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$parameter_href, + strict_type => 1, + }, + profile_base_command => { + default => q{sbatch}, + store => \$profile_base_command, + strict_type => 1, + }, + recipe_name => { + defined => 1, + required => 1, + store => \$recipe_name, + strict_type => 1, + }, + sample_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$sample_info_href, + strict_type => 1, + }, + }; + check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; + + use MIP::File_info qw{ get_io_files parse_io_outfiles }; + use MIP::Processmanagement::Processes qw{ submit_recipe }; + use MIP::Program::Bcftools qw{ bcftools_annotate }; + use MIP::Program::Gnu::Coreutils qw( gnu_mv ); + use MIP::Program::Htslib qw{ htslib_bgzip htslib_tabix }; + use MIP::Program::Svdb qw{ svdb_query }; + use MIP::Recipe qw{ parse_recipe_prerequisites }; + use MIP::Sample_info qw{ set_file_path_to_store set_recipe_outfile_in_sample_info }; + use MIP::Script::Setup_script qw{ setup_script }; + use MIP::Validate::Data qw{ %constraint }; + + ### PREPROCESSING: + + ## Retrieve logger object + my $log = Log::Log4perl->get_logger($LOG_NAME); + + my %io = get_io_files( + { + id => $case_id, + file_info_href => $file_info_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + stream => q{in}, + } + ); + my $infile_name_prefix = $io{in}{file_name_prefix}; + my $infile_path = $io{in}{file_path}; + my $infile_path_prefix = $io{in}{file_path_prefix}; + my $infile_suffix = $io{in}{file_suffix}; + + my %recipe = parse_recipe_prerequisites( + { + active_parameter_href => $active_parameter_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ); + my $core_number = $recipe{core_number}; + my $memory = $recipe{memory}; + + %io = ( + %io, + parse_io_outfiles( + { + chain_id => $recipe{job_id_chain}, + id => $case_id, + file_info_href => $file_info_href, + file_name_prefixes_ref => [$infile_name_prefix], + outdata_dir => $active_parameter_href->{outdata_dir}, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ) + ); + + my $outfile_path = $io{out}{file_path}; + my $outfile_path_prefix = $io{out}{file_path_prefix}; + my $outfile_suffix = $io{out}{file_suffix}; + + ## Filehandles + # Create anonymous filehandle + my $filehandle = IO::Handle->new(); + + ## Creates recipe directories (info & data & script), recipe script filenames and writes sbatch header + my ( $recipe_file_path, $recipe_info_path ) = setup_script( + { + active_parameter_href => $active_parameter_href, + core_number => $core_number, + directory_id => $case_id, + filehandle => $filehandle, + job_id_href => $job_id_href, + memory_allocation => $memory, + process_time => $recipe{time}, + recipe_directory => $recipe_name, + recipe_name => $recipe_name, + } + ); + + ### SHELL: + + say {$filehandle} q{## } . $recipe_name; + + my $svdb_infile_path = $infile_path; + ## Default for when there are no svdb annotation files + my $svdb_outfile_path = $infile_path; + my $alt_file_tag = $UNDERSCORE . q{svdbq}; + my $query_outfile_suffix = $DOT . q{vcf}; + my $outfile_tracker = 0; + + QUERIES: + while ( my ( $query_db_file, $query_db_tag_info ) = + each %{ $active_parameter_href->{me_annotate_query_files} } ) + { + + if ($outfile_tracker) { + + $svdb_infile_path = + $outfile_path_prefix + . $alt_file_tag + . $query_outfile_suffix + . $DOT + . $outfile_tracker; + } + $outfile_tracker++; + $svdb_outfile_path = + $outfile_path_prefix . $alt_file_tag . $query_outfile_suffix . $DOT . $outfile_tracker; + + ## Get parameters +# Split query_db_tag to decide svdb input query fields +# FORMAT: filename|OUT_FREQUENCY_INFO_KEY|OUT_ALLELE_COUNT_INFO_KEY|IN_FREQUENCY_INFO_KEY|IN_ALLELE_COUNT_INFO_KEY + my ( $query_db_tag, $out_frequency_tag_suffix, $out_allele_count_tag_suffix, + $in_frequency_tag, $in_allele_count_tag ) + = split /[|]/sxm, $query_db_tag_info; + my $out_frequency_tag = $out_frequency_tag_suffix ||= $EMPTY_STR; + my $out_allele_count_tag = $out_allele_count_tag_suffix ||= $EMPTY_STR; + + svdb_query( + { + bnd_distance => $active_parameter_href->{me_annotate_query_bnd_distance}, + dbfile_path => $query_db_file, + filehandle => $filehandle, + infile_path => $svdb_infile_path, + in_frequency_tag => $in_frequency_tag, + in_allele_count_tag => $in_allele_count_tag, + out_frequency_tag => $query_db_tag . $out_frequency_tag, + out_allele_count_tag => $query_db_tag . $out_allele_count_tag, + stdoutfile_path => $svdb_outfile_path, + overlap => $active_parameter_href->{me_annotate_query_overlap}, + } + ); + say {$filehandle} $NEWLINE; + } + + if ( not $constraint{is_gzipped}->($svdb_outfile_path) ) { + + htslib_bgzip( + { + filehandle => $filehandle, + infile_path => $svdb_outfile_path, + stdoutfile_path => $outfile_path, + threads => $core_number, + + } + ); + } + else { + + gnu_mv( + { + filehandle => $filehandle, + infile_path => $svdb_outfile_path, + outfile_path => $outfile_path, + } + ); + } + say {$filehandle} $NEWLINE; + + htslib_tabix { + ( + filehandle => $filehandle, + infile_path => $outfile_path, + ) + }; + say {$filehandle} $NEWLINE; + + ## Close filehandle + close $filehandle or $log->logcroak(q{Could not close filehandle}); + + if ( $recipe{mode} == 1 ) { + + set_recipe_outfile_in_sample_info( + { + path => $outfile_path, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + } + ); + + submit_recipe( + { + base_command => $profile_base_command, + case_id => $case_id, + dependency_method => q{sample_to_case}, + log => $log, + job_id_chain => $recipe{job_id_chain}, + job_id_href => $job_id_href, + job_reservation_name => $active_parameter_href->{job_reservation_name}, + max_parallel_processes_count_href => + $file_info_href->{max_parallel_processes_count}, + recipe_file_path => $recipe_file_path, + sample_ids_ref => \@{ $active_parameter_href->{sample_ids} }, + submission_profile => $active_parameter_href->{submission_profile}, + } + ); + } + return 1; +} + +1; diff --git a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm index dda099ea8..0c0841ce9 100644 --- a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm +++ b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm @@ -130,7 +130,7 @@ sub parse_rd_dna { Readonly my @ONLY_WGS_VARIANT_CALLER_RECIPES => qw{ cnvnator_ar delly_reformat tiddit }; Readonly my @ONLY_WGS_RECIPIES => qw{ chromograph_rhoviz cnvnator_ar delly_call delly_reformat expansionhunter - gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_merge_bam me_merge_vcfs + gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_annotate me_merge_bam me_merge_vcfs mitodel retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; Readonly my @REMOVE_CONFIG_KEYS => qw{ associated_recipe }; @@ -475,6 +475,7 @@ sub pipeline_analyse_rd_dna { use MIP::Recipes::Analysis::Gzip_fastq qw{ analysis_gzip_fastq }; use MIP::Recipes::Analysis::Manta qw{ analysis_manta }; use MIP::Recipes::Analysis::Markduplicates qw{ analysis_markduplicates }; + use MIP::Recipes::Analysis::Me_annotate qw{ analysis_me_annotate}; use MIP::Recipes::Analysis::Mip_qccollect qw{ analysis_mip_qccollect }; use MIP::Recipes::Analysis::Mip_vcfparser qw{ analysis_mip_vcfparser }; use MIP::Recipes::Analysis::Mip_vercollect qw{ analysis_mip_vercollect }; @@ -589,6 +590,7 @@ sub pipeline_analyse_rd_dna { gzip_fastq => \&analysis_gzip_fastq, manta => \&analysis_manta, markduplicates => \&analysis_markduplicates, + me_annotate => \&analysis_me_annotate, me_merge_bam => \&analysis_samtools_merge_panel, me_merge_vcfs => \&analysis_me_merge_vcfs, mitodel => \&analysis_mitodel, diff --git a/t/analysis_me_annotate.t b/t/analysis_me_annotate.t new file mode 100644 index 000000000..7aeeed6c1 --- /dev/null +++ b/t/analysis_me_annotate.t @@ -0,0 +1,122 @@ +#!/usr/bin/env perl + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile }; +use FindBin qw{ $Bin }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use Test::More; +use utf8; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw { :all }; +use Modern::Perl qw{ 2018 }; +use Readonly; +use Test::Trap; + +## MIPs lib/ +use lib catdir( dirname($Bin), q{lib} ); +use MIP::Constants qw{ $COLON $COMMA $SPACE }; +use MIP::Test::Fixtures qw{ test_add_io_for_recipe test_log test_mip_hashes }; + +BEGIN { + + use MIP::Test::Fixtures qw{ test_import }; + +### Check all internal dependency modules and imports +## Modules with import + my %perl_module = ( + q{MIP::Recipes::Analysis::Me_annotate} => [qw{ analysis_me_annotate }], + q{MIP::Test::Fixtures} => [qw{ test_add_io_for_recipe test_log test_mip_hashes }], + ); + + test_import( { perl_module_href => \%perl_module, } ); +} + +use MIP::Recipes::Analysis::Me_annotate qw{ analysis_me_annotate }; + +diag( q{Test analysis_me_annotate from Me_annotate.pm} + . $COMMA + . $SPACE . q{Perl} + . $SPACE + . $PERL_VERSION + . $SPACE + . $EXECUTABLE_NAME ); + +## Constants +Readonly my $ANNOTATION_OVERLAP => -1; +Readonly my $BND_DISTANCE => 150; + +test_log( { no_screen => 1, } ); + +## Given analysis parameters +my $recipe_name = q{me_annotate}; +my $slurm_mock_cmd = catfile( $Bin, qw{ data modules slurm-mock.pl } ); + +my %active_parameter = test_mip_hashes( + { + mip_hash_name => q{active_parameter}, + recipe_name => $recipe_name, + } +); +$active_parameter{$recipe_name} = 1; +$active_parameter{recipe_core_number}{$recipe_name} = 1; +$active_parameter{recipe_time}{$recipe_name} = 1; +$active_parameter{me_annotate_query_files} = { a_file => q{a_file|AF|AC|in_AF|in_AC|1}, }; +my $case_id = $active_parameter{case_id}; +$active_parameter{me_annotate_query_overlap} = $ANNOTATION_OVERLAP; +$active_parameter{me_annotate_query_bnd_distance} = $BND_DISTANCE; + +my %file_info = test_mip_hashes( + { + mip_hash_name => q{file_info}, + recipe_name => $recipe_name, + } +); + +my %job_id; +my %parameter = test_mip_hashes( + { + mip_hash_name => q{recipe_parameter}, + recipe_name => $recipe_name, + } +); + +test_add_io_for_recipe( + { + file_info_href => \%file_info, + id => $case_id, + parameter_href => \%parameter, + recipe_name => $recipe_name, + step => q{vcf}, + } +); + +my %sample_info = test_mip_hashes( + { + mip_hash_name => q{qc_sample_info}, + recipe_name => $recipe_name, + } +); + +my $is_ok = analysis_me_annotate( + { + active_parameter_href => \%active_parameter, + case_id => $case_id, + file_info_href => \%file_info, + job_id_href => \%job_id, + parameter_href => \%parameter, + profile_base_command => $slurm_mock_cmd, + recipe_name => $recipe_name, + sample_info_href => \%sample_info, + } +); +## Then return TRUE +ok( $is_ok, q{ Executed analysis recipe } . $recipe_name ); + +done_testing(); diff --git a/t/data/references/grch37_alu_swegen.vcf.gz b/t/data/references/grch37_alu_swegen.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_alu_swegen.vcf.gz.tbi b/t/data/references/grch37_alu_swegen.vcf.gz.tbi new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_herv_swegen.vcf.gz b/t/data/references/grch37_herv_swegen.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_herv_swegen.vcf.gz.tbi b/t/data/references/grch37_herv_swegen.vcf.gz.tbi new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_l1_swegen.vcf.gz b/t/data/references/grch37_l1_swegen.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_l1_swegen.vcf.gz.tbi b/t/data/references/grch37_l1_swegen.vcf.gz.tbi new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_sva_swegen.vcf.gz b/t/data/references/grch37_sva_swegen.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch37_sva_swegen.vcf.gz.tbi b/t/data/references/grch37_sva_swegen.vcf.gz.tbi new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_alu_swegen.vcf.gz b/t/data/references/grch38_alu_swegen.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_alu_swegen.vcf.gz.tbi b/t/data/references/grch38_alu_swegen.vcf.gz.tbi new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_herv_swegen.vcf.gz b/t/data/references/grch38_herv_swegen.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_herv_swegen.vcf.gz.tbi b/t/data/references/grch38_herv_swegen.vcf.gz.tbi new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_l1_swegen.vcf.gz b/t/data/references/grch38_l1_swegen.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_l1_swegen.vcf.gz.tbi b/t/data/references/grch38_l1_swegen.vcf.gz.tbi new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_sva_swegen.vcf.gz b/t/data/references/grch38_sva_swegen.vcf.gz new file mode 100644 index 000000000..e69de29bb diff --git a/t/data/references/grch38_sva_swegen.vcf.gz.tbi b/t/data/references/grch38_sva_swegen.vcf.gz.tbi new file mode 100644 index 000000000..e69de29bb diff --git a/templates/grch38_mip_rd_dna_config.yaml b/templates/grch38_mip_rd_dna_config.yaml index ae0141eab..866b0d372 100755 --- a/templates/grch38_mip_rd_dna_config.yaml +++ b/templates/grch38_mip_rd_dna_config.yaml @@ -40,6 +40,13 @@ gatk_varianteval_dbsnp: grch38_variant_-gold_standard_dbsnp-.vcf.gz gatk_varianteval_gold: grch38_mills_and_1000g_-gold_standard_indels-.vcf.gz genmod_models_reduced_penetrance_file: grch38_cust003-cmms-red-pen_-2017_FAKE-.tsv human_genome_reference: grch38_homo_sapiens_-assembly-.fasta +me_annotate_exons_bed: grch38_scout_exons_-2020-09-17-.bed +me_annotate_query_files: + # FORMAT: filename|OUT_FREQUENCY_INFO_KEY|OUT_ALLELE_COUNT_INFO_KEY|IN_FREQUENCY_INFO_KEY|IN_ALLELE_COUNT_INFO_KEY|USE_IN_FREQUENCY_FILTER + grch38_alu_swegen.vcf.gz: swegen_alu_|FRQ|OCC|FRQ|OCC|1 + grch38_herv_swegen.vcf.gz: swegen_herv_|FRQ|OCC|FRQ|OCC|1 + grch38_l1_swegen.vcf.gz: swegen_l1_|FRQ|OCC|FRQ|OCC|1 + grch38_sva_swegen.vcf.gz: swegen_sva_|FRQ|OCC|FRQ|OCC|1 mobile_element_reference: grch38_g1k_alu.bed: ALU grch38_g1k_l1.bed: L1 diff --git a/templates/mip_rd_dna_config.yaml b/templates/mip_rd_dna_config.yaml index db3d8120e..3679cca13 100755 --- a/templates/mip_rd_dna_config.yaml +++ b/templates/mip_rd_dna_config.yaml @@ -23,6 +23,13 @@ sample_info_file: cluster_constant_path!/case_id!/analysis_constant_path!/case_i gatk_genotypegvcfs_ref_gvcf: grch37_gatk_merged_reference_samples.txt genmod_models_reduced_penetrance_file: grch37_cust003-cmms-red-pen_-2017-.tsv human_genome_reference: grch37_homo_sapiens_-d5-.fasta +me_annotate_exons_bed: grch37_scout_exons_-2017-01-.bed +me_annotate_query_files: + # FORMAT: filename|OUT_FREQUENCY_INFO_KEY|OUT_ALLELE_COUNT_INFO_KEY|IN_FREQUENCY_INFO_KEY|IN_ALLELE_COUNT_INFO_KEY|USE_IN_FREQUENCY_FILTER + grch37_alu_swegen.vcf.gz: swegen_alu_|FRQ|OCC|FRQ|OCC|1 + grch37_herv_swegen.vcf.gz: swegen_herv_|FRQ|OCC|FRQ|OCC|1 + grch37_l1_swegen.vcf.gz: swegen_l1_|FRQ|OCC|FRQ|OCC|1 + grch37_sva_swegen.vcf.gz: swegen_sva_|FRQ|OCC|FRQ|OCC|1 mobile_element_reference: grch37_g1k_alu.bed: ALU grch37_g1k_l1.bed: L1 From c2ab43652284eaecdef3d5b7bf03e79dc6103ed5 Mon Sep 17 00:00:00 2001 From: jemten Date: Thu, 13 Apr 2023 09:42:18 +0200 Subject: [PATCH 14/34] annotate me with breakpoints in exon --- definitions/rd_dna_parameters.yaml | 2 +- lib/MIP/Program/Bcftools.pm | 8 ++++++ lib/MIP/Recipes/Analysis/Me_annotate.pm | 38 +++++++++---------------- t/bcftools_annotate.t | 4 +++ 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 6d57b2931..ebbf432a5 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -1324,7 +1324,7 @@ me_annotate: - mip data_type: SCALAR default: 1 - file_tag: _me_ann + file_tag: _ ann outfile_suffix: ".vcf.gz" program_executables: - svdb diff --git a/lib/MIP/Program/Bcftools.pm b/lib/MIP/Program/Bcftools.pm index 2276bbbed..796872053 100644 --- a/lib/MIP/Program/Bcftools.pm +++ b/lib/MIP/Program/Bcftools.pm @@ -60,6 +60,7 @@ sub bcftools_annotate { ## : $headerfile_path => File with lines which should be appended to the VCF header ## : $include => Include only sites for which the expression is true ## : $infile_path => Infile path to read from +## : mark_sites => Mark sites that are present in annotation file ## : $outfile_path => Outfile path to write to ## : $output_type => 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v] ## : $regions_ref => Regions to process {REF} @@ -80,6 +81,7 @@ sub bcftools_annotate { my $include; my $infile_path; my $headerfile_path; + my $mark_sites; my $outfile_path; my $regions_ref; my $remove_ids_ref; @@ -100,6 +102,7 @@ sub bcftools_annotate { headerfile_path => { store => \$headerfile_path, strict_type => 1, }, include => { store => \$include, strict_type => 1, }, infile_path => { store => \$infile_path, strict_type => 1, }, + mark_sites => { store => \$mark_sites, strict_type => 1, }, outfile_path => { store => \$outfile_path, strict_type => 1, }, output_type => { allow => [qw{ b u z v}], @@ -158,6 +161,11 @@ sub bcftools_annotate { push @commands, q{--include} . $SPACE . $include; } + if ($mark_sites) { + + push @commands, q{--mark-sites} . $SPACE . $mark_sites; + } + if ( @{$remove_ids_ref} ) { push @commands, q{--remove} . $SPACE . join $COMMA, @{$remove_ids_ref}; diff --git a/lib/MIP/Recipes/Analysis/Me_annotate.pm b/lib/MIP/Recipes/Analysis/Me_annotate.pm index 7dcb2bcb5..970698b7f 100644 --- a/lib/MIP/Recipes/Analysis/Me_annotate.pm +++ b/lib/MIP/Recipes/Analysis/Me_annotate.pm @@ -114,8 +114,7 @@ sub analysis_me_annotate { use MIP::File_info qw{ get_io_files parse_io_outfiles }; use MIP::Processmanagement::Processes qw{ submit_recipe }; use MIP::Program::Bcftools qw{ bcftools_annotate }; - use MIP::Program::Gnu::Coreutils qw( gnu_mv ); - use MIP::Program::Htslib qw{ htslib_bgzip htslib_tabix }; + use MIP::Program::Htslib qw{ htslib_tabix }; use MIP::Program::Svdb qw{ svdb_query }; use MIP::Recipe qw{ parse_recipe_prerequisites }; use MIP::Sample_info qw{ set_file_path_to_store set_recipe_outfile_in_sample_info }; @@ -244,28 +243,19 @@ sub analysis_me_annotate { say {$filehandle} $NEWLINE; } - if ( not $constraint{is_gzipped}->($svdb_outfile_path) ) { - - htslib_bgzip( - { - filehandle => $filehandle, - infile_path => $svdb_outfile_path, - stdoutfile_path => $outfile_path, - threads => $core_number, - - } - ); - } - else { - - gnu_mv( - { - filehandle => $filehandle, - infile_path => $svdb_outfile_path, - outfile_path => $outfile_path, - } - ); - } + my $header = + q{<(echo '##INFO=')}; + bcftools_annotate( + { + annotations_file_path => $active_parameter_href->{me_annotate_exons_bed}, + columns_name => q{CHROM,FROM,TO}, + headerfile_path => $header, + infile_path => $svdb_outfile_path, + mark_sites => q{IN_EXON}, + outfile_path => $outfile_path, + output_type => q{z}, + } + ); say {$filehandle} $NEWLINE; htslib_tabix { diff --git a/t/bcftools_annotate.t b/t/bcftools_annotate.t index 8c5d15bea..46dd9f85b 100644 --- a/t/bcftools_annotate.t +++ b/t/bcftools_annotate.t @@ -92,6 +92,10 @@ my %specific_argument = ( input => q{infile.test}, expected_output => q{infile.test}, }, + mark_sites => { + input => q{TAG}, + expected_output => q{--mark-sites TAG}, + }, outfile_path => { input => q{outfile.txt}, expected_output => q{-o outfile.txt}, From ca7438053d94b2c486b867456184329295e5876d Mon Sep 17 00:00:00 2001 From: jemten Date: Thu, 13 Apr 2023 10:13:53 +0200 Subject: [PATCH 15/34] fixing typos --- definitions/rd_dna_parameters.yaml | 2 +- lib/MIP/Recipes/Analysis/Me_annotate.pm | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index ebbf432a5..be2c428ba 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -1324,7 +1324,7 @@ me_annotate: - mip data_type: SCALAR default: 1 - file_tag: _ ann + file_tag: _ann outfile_suffix: ".vcf.gz" program_executables: - svdb diff --git a/lib/MIP/Recipes/Analysis/Me_annotate.pm b/lib/MIP/Recipes/Analysis/Me_annotate.pm index 970698b7f..5ac4bd47c 100644 --- a/lib/MIP/Recipes/Analysis/Me_annotate.pm +++ b/lib/MIP/Recipes/Analysis/Me_annotate.pm @@ -249,6 +249,7 @@ sub analysis_me_annotate { { annotations_file_path => $active_parameter_href->{me_annotate_exons_bed}, columns_name => q{CHROM,FROM,TO}, + filehandle => $filehandle, headerfile_path => $header, infile_path => $svdb_outfile_path, mark_sites => q{IN_EXON}, From 200e7c199b2121b42b51f376eca6a978a1c10226 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 14 Apr 2023 13:41:51 +0200 Subject: [PATCH 16/34] adding executable to recipe --- definitions/rd_dna_parameters.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index be2c428ba..67fad12db 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -1327,6 +1327,7 @@ me_annotate: file_tag: _ann outfile_suffix: ".vcf.gz" program_executables: + - bcftools - svdb - tabix type: recipe From e93d82ef691c65215e150d5310bacec8ed623f44 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 14 Apr 2023 13:40:23 +0200 Subject: [PATCH 17/34] adding vep for mobile_elements --- definitions/rd_dna_initiation_map.yaml | 1 + definitions/rd_dna_parameters.yaml | 15 + lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 9 + lib/MIP/Recipes/Analysis/Vep.pm | 340 ++++++++++++++++++++- lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm | 5 +- t/analysis_vep_me.t | 120 ++++++++ 6 files changed, 485 insertions(+), 5 deletions(-) create mode 100644 t/analysis_vep_me.t diff --git a/definitions/rd_dna_initiation_map.yaml b/definitions/rd_dna_initiation_map.yaml index 9942808d4..f9afd1243 100644 --- a/definitions/rd_dna_initiation_map.yaml +++ b/definitions/rd_dna_initiation_map.yaml @@ -54,6 +54,7 @@ CHAIN_ALL: - retroseq - me_merge_vcfs - me_annotate + - me_varianteffectpredictor - CHAIN_MAIN: # PARALLEL chains, which inherit from MAIN in initiation, but are merged back to CHAIN_MAIN after execution - PARALLEL: diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 67fad12db..b6f291c42 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -229,6 +229,7 @@ recipe_core_number: me_annotate: 2 me_merge_bam: 5 me_merge_vcfs: 2 + me_varianteffectpredictor: 4 mitodel: 1 mt_annotation: 1 multiqc_ar: 1 @@ -363,6 +364,7 @@ recipe_time: me_annotate: 2 me_merge_bam: 5 me_merge_vcfs: 2 + me_varianteffectpredictor: 3 mitodel: 2 mt_annotation: 1 multiqc_ar: 5 @@ -1359,6 +1361,19 @@ me_annotate_exons_bed: is_reference: 1 reference: reference_dir type: path +me_varianteffectpredictor: + analysis_mode: case + associated_recipe: + - mip + data_type: SCALAR + default: 1 + file_tag: _vep + outfile_suffix: ".vcf.gz" + program_executables: + - bcftools + - vep + - tabix + type: recipe ## GATK CollectReadCounts gatk_collectreadcounts: analysis_mode: sample diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 1721d4704..2ca1f760e 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1245,6 +1245,15 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ) ); + option( + q{me_varianteffectpredictor} => ( + cmd_tags => [q{Analysis recipe switch}], + documentation => q{Annotate mobile elememnts with VEP}, + is => q{rw}, + isa => enum( [ 0, 1, 2 ] ), + ) + ); + option( q{gatk_haplotypecaller} => ( cmd_tags => [q{Analysis recipe switch}], diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index fd3005f68..460676186 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -4,8 +4,8 @@ use 5.026; use Carp; use charnames qw{ :full :short }; use English qw{ -no_match_vars }; -use File::Basename qw{ fileparse }; -use File::Spec::Functions qw{ catfile splitpath }; +use File::Basename qw{ dirname fileparse }; +use File::Spec::Functions qw{ catfile devnull splitpath }; use open qw{ :encoding(UTF-8) :std }; use Params::Check qw{ check allow last_error }; use POSIX; @@ -19,7 +19,7 @@ use Readonly; ## MIPs lib/ use MIP::Constants - qw{ %ANALYSIS $ASTERISK $COMMA $DOT $EMPTY_STR $LOG_NAME $NEWLINE %PRIMARY_CONTIG $SPACE $UNDERSCORE }; + qw{ %ANALYSIS $ASTERISK $COMMA $DOT $EMPTY_STR $LOG_NAME $NEWLINE $PIPE %PRIMARY_CONTIG $SPACE $UNDERSCORE }; use MIP::File::Format::Vep qw{ create_vep_synonyms_file }; BEGIN { @@ -30,6 +30,7 @@ BEGIN { # Functions and variables which can be optionally exported our @EXPORT_OK = qw{ analysis_vep + analysis_vep_me analysis_vep_wgs analysis_vep_sv_wes analysis_vep_sv_wgs @@ -1398,6 +1399,339 @@ sub analysis_vep { return 1; } +sub analysis_vep_me { + +## Function : Varianteffectpredictor performs annotation of mobile elements. +## Returns : +## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} +## : $case_id => Family id +## : $filehandle => Filehandle to write to +## : $file_info_href => The file_info hash {REF} +## : $job_id_href => Job id hash {REF} +## : $parameter_href => Parameter hash {REF} +## : $profile_base_command => Submission profile base command +## : $recipe_name => Program name +## : $sample_info_href => Info on samples and case hash {REF} +## : $temp_directory => Temporary directory +## : $xargs_file_counter => The xargs file counter + + my ($arg_href) = @_; + + ## Flatten argument(s) + my $active_parameter_href; + my $file_info_href; + my $job_id_href; + my $parameter_href; + my $recipe_name; + my $sample_info_href; + + ## Default(s) + my $case_id; + my $profile_base_command; + my $temp_directory; + my $xargs_file_counter; + + my $tmpl = { + active_parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$active_parameter_href, + strict_type => 1, + }, + case_id => { + default => $arg_href->{active_parameter_href}{case_id}, + store => \$case_id, + strict_type => 1, + }, + file_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$file_info_href, + strict_type => 1, + }, + job_id_href => { + default => {}, + defined => 1, + required => 1, + store => \$job_id_href, + strict_type => 1, + }, + parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$parameter_href, + strict_type => 1, + }, + profile_base_command => { + default => q{sbatch}, + store => \$profile_base_command, + strict_type => 1, + }, + recipe_name => { + defined => 1, + required => 1, + store => \$recipe_name, + strict_type => 1, + }, + sample_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$sample_info_href, + strict_type => 1, + }, + temp_directory => { + default => $arg_href->{active_parameter_href}{temp_directory}, + store => \$temp_directory, + strict_type => 1, + }, + }; + + check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; + + use MIP::Cluster qw{ get_core_number update_memory_allocation }; + use MIP::File_info qw{ get_io_files parse_io_outfiles }; + use MIP::List qw{ get_splitted_lists }; + use MIP::Processmanagement::Processes qw{ submit_recipe }; + use MIP::Program::Bcftools qw{ bcftools_concat bcftools_view }; + use MIP::Program::Htslib qw{ htslib_tabix }; + use MIP::Program::Vep qw{ variant_effect_predictor }; + use MIP::Recipe qw{ parse_recipe_prerequisites }; + use MIP::Recipes::Analysis::Xargs qw{ xargs_command }; + use MIP::Sample_info qw{ set_recipe_metafile_in_sample_info set_recipe_outfile_in_sample_info }; + use MIP::Script::Setup_script qw{ setup_script }; + + ### PREPROCESSING: + + ## Constants + Readonly my $VEP_FORK_NUMBER => 4; + + ## Retrieve logger object + my $log = Log::Log4perl->get_logger($LOG_NAME); + + ## Unpack parameters + ## Get the io infiles per chain and id + my %io = get_io_files( + { + id => $case_id, + file_info_href => $file_info_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + stream => q{in}, + } + ); + + my $infile_name_prefix = $io{in}{file_name_prefix}; + my $infile_path_prefix = $io{in}{file_path_prefix}; + my $infile_suffix = $io{in}{file_suffix}; + my $infile_path = $infile_path_prefix . $infile_suffix; + + my $genome_reference_version = $file_info_href->{human_genome_reference_version}; + + my %recipe = parse_recipe_prerequisites( + { + active_parameter_href => $active_parameter_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ); + + %io = parse_io_outfiles( + { + chain_id => $recipe{job_id_chain}, + id => $case_id, + file_info_href => $file_info_href, + file_name_prefixes_ref => [$infile_name_prefix], + outdata_dir => $active_parameter_href->{outdata_dir}, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ); + my $outdir_path_prefix = $io{out}{dir_path_prefix}; + my $outfile_path_prefix = $io{out}{file_path_prefix}; + my $outfile_suffix = $io{out}{file_suffix}; + my $outfile_path = $io{out}{file_path}; + + ## Filehandles + # Create anonymous filehandle + my $filehandle = IO::Handle->new(); + + ## Creates recipe directories (info & data & script), recipe script filenames and writes sbatch header + my ( $recipe_file_path, $recipe_info_path ) = setup_script( + { + active_parameter_href => $active_parameter_href, + core_number => $recipe{core_number}, + directory_id => $case_id, + filehandle => $filehandle, + job_id_href => $job_id_href, + memory_allocation => $recipe{memory}, + process_time => $recipe{time}, + recipe_directory => $recipe_name, + recipe_name => $recipe_name, + temp_directory => $temp_directory, + } + ); + + ### SHELL: + + ## Get the vep synonyms file path for if required (grch38) + my $vep_synonyms_file_path = create_vep_synonyms_file( + { + outfile_path => catfile( $outdir_path_prefix, q{synonyms.tsv} ), + version => $genome_reference_version, + } + ); + + ## Varianteffectpredictor + say {$filehandle} q{## Varianteffectpredictor}; + + my $assembly_version = + $file_info_href->{human_genome_reference_source} . $genome_reference_version; + + ## Get genome source and version to be compatible with VEP + $assembly_version = _get_assembly_name( { assembly_version => $assembly_version, } ); + + # VEP custom annotations + my @custom_annotations = _get_custom_annotation_cmds( + { + vep_custom_annotation_href => $active_parameter_href->{vep_custom_annotation}, + } + ); + + ## VEP plugins + my @plugins = + _get_plugin_cmds( { vep_plugin_href => $active_parameter_href->{sv_vep_plugin}, } ); + + ## Get contigs + my ( $mt_contig_ref, $contigs_ref ) = get_splitted_lists( + { + regexp => qr/M/, + list_ref => $file_info_href->{bam_contigs}, + } + ); + + ## VEP features + my @vep_features; + + FEATURE: + foreach my $vep_feature ( @{ $active_parameter_href->{vep_features} } ) { + + # Add VEP features to the output. + push @vep_features, $vep_feature; + } + + ## Special annotation distance for the MT means that we do two calls to vep + bcftools_view( + { + filehandle => $filehandle, + targets => q{^} . $mt_contig_ref->[0], + infile_path => $infile_path, + } + ); + print {$filehandle} $PIPE . $SPACE; + + my $vep_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT.vcf}; + variant_effect_predictor( + { + assembly => $assembly_version, + cache_directory => $active_parameter_href->{vep_directory_cache}, + distance => $ANNOTATION_DISTANCE, + filehandle => $filehandle, + fork => $VEP_FORK_NUMBER, + outfile_format => q{vcf}, + outfile_path => $vep_non_mt_outfile_path, + plugins_dir_path => $active_parameter_href->{vep_plugins_dir_path}, + plugins_ref => \@plugins, + reference_path => $active_parameter_href->{human_genome_reference}, + synonyms_file_path => $vep_synonyms_file_path, + vep_features_ref => \@vep_features, + } + ); + say {$filehandle} $NEWLINE; + + bcftools_view( + { + filehandle => $filehandle, + regions_ref => [ $contigs_ref->[20], $mt_contig_ref->[0] ], + infile_path => $infile_path, + } + ); + print {$filehandle} $PIPE . $SPACE; + + ## Special case for mitochondrial contig annotation + my @mt_vep_features = map { ( $_ eq q{refseq} ) ? q{all_} . $_ : $_ } @vep_features; + variant_effect_predictor( + { + assembly => $assembly_version, + cache_directory => $active_parameter_href->{vep_directory_cache}, + distance => $ANNOTATION_DISTANCE_MT, + filehandle => $filehandle, + fork => $VEP_FORK_NUMBER, + outfile_format => q{vcf}, + plugins_dir_path => $active_parameter_href->{vep_plugins_dir_path}, + plugins_ref => \@plugins, + reference_path => $active_parameter_href->{human_genome_reference}, + synonyms_file_path => $vep_synonyms_file_path, + vep_features_ref => \@mt_vep_features, + } + ); + print {$filehandle} $PIPE . $SPACE; + + bcftools_concat( + { + filehandle => $filehandle, + infile_paths_ref => + [ $vep_non_mt_outfile_path, catfile( dirname( devnull() ), q{stdin} ) ], + outfile_path => $outfile_path, + output_type => q{z}, + threads => $recipe{core_number}, + } + ); + say {$filehandle} $NEWLINE; + + htslib_tabix( + { + filehandle => $filehandle, + infile_path => $outfile_path, + } + ); + say {$filehandle} $NEWLINE; + + close $filehandle; + + if ( $recipe{mode} == 1 ) { + + ## Collect QC metadata info for later use + set_recipe_outfile_in_sample_info( + { + path => $outfile_path, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + } + ); + submit_recipe( + { + base_command => $profile_base_command, + case_id => $case_id, + dependency_method => q{sample_to_case}, + job_id_chain => $recipe{job_id_chain}, + job_id_href => $job_id_href, + job_reservation_name => $active_parameter_href->{job_reservation_name}, + log => $log, + max_parallel_processes_count_href => + $file_info_href->{max_parallel_processes_count}, + recipe_file_path => $recipe_file_path, + sample_ids_ref => \@{ $active_parameter_href->{sample_ids} }, + submission_profile => $active_parameter_href->{submission_profile}, + } + ); + } + return 1; +} + sub _get_custom_annotation_cmds { ## Function : Build the custom annotation command per file for vep diff --git a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm index 0c0841ce9..c0d9482a4 100644 --- a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm +++ b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm @@ -131,7 +131,7 @@ sub parse_rd_dna { Readonly my @ONLY_WGS_RECIPIES => qw{ chromograph_rhoviz cnvnator_ar delly_call delly_reformat expansionhunter gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_annotate me_merge_bam me_merge_vcfs - mitodel retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; + me_varianteffectpredictor mitodel retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; Readonly my @REMOVE_CONFIG_KEYS => qw{ associated_recipe }; ## Set analysis constants @@ -513,7 +513,7 @@ sub pipeline_analyse_rd_dna { use MIP::Recipes::Analysis::Varg qw{ analysis_varg }; use MIP::Recipes::Analysis::Variant_annotation qw{ analysis_variant_annotation }; use MIP::Recipes::Analysis::Vcf2cytosure qw{ analysis_vcf2cytosure }; - use MIP::Recipes::Analysis::Vep qw{ analysis_vep_wgs }; + use MIP::Recipes::Analysis::Vep qw{ analysis_vep_me analysis_vep_wgs }; use MIP::Recipes::Build::Rd_dna qw{build_rd_dna_meta_files}; ### Pipeline specific checks @@ -593,6 +593,7 @@ sub pipeline_analyse_rd_dna { me_annotate => \&analysis_me_annotate, me_merge_bam => \&analysis_samtools_merge_panel, me_merge_vcfs => \&analysis_me_merge_vcfs, + me_varianteffectpredictor => \&analysis_vep_me, mitodel => \&analysis_mitodel, mt_annotation => \&analysis_mt_annotation, multiqc_ar => \&analysis_multiqc, diff --git a/t/analysis_vep_me.t b/t/analysis_vep_me.t new file mode 100644 index 000000000..a03be551c --- /dev/null +++ b/t/analysis_vep_me.t @@ -0,0 +1,120 @@ +#!/usr/bin/env perl + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile }; +use FindBin qw{ $Bin }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use Test::More; +use utf8; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw { :all }; +use Modern::Perl qw{ 2018 }; +use Readonly; +use Test::Trap; + +## MIPs lib/ +use lib catdir( dirname($Bin), q{lib} ); +use MIP::Constants qw{ $COLON $COMMA $SPACE }; +use MIP::Test::Fixtures qw{ test_add_io_for_recipe test_log test_mip_hashes }; + +## Constants +Readonly my $GENOME_BUILD_VERSION_38 => 38; +Readonly my $GENOME_BUILD_VERSION_20 => 20; +Readonly my $GENOME_BUILD_VERSION_19 => 19; + +BEGIN { + + use MIP::Test::Fixtures qw{ test_import }; + +### Check all internal dependency modules and imports +## Modules with import + my %perl_module = ( + q{MIP::Recipes::Analysis::Vep} => [qw{ analysis_vep_me }], + q{MIP::Test::Fixtures} => [qw{ test_add_io_for_recipe test_log test_mip_hashes }], + ); + + test_import( { perl_module_href => \%perl_module, } ); +} + +use MIP::Recipes::Analysis::Vep qw{ analysis_vep_me }; + +diag( q{Test analysis_vep_me from Vep.pm} + . $COMMA + . $SPACE . q{Perl} + . $SPACE + . $PERL_VERSION + . $SPACE + . $EXECUTABLE_NAME ); + +my $log = test_log( { log_name => q{MIP}, no_screen => 1, } ); + +## Given analysis parameters +my $recipe_name = q{me_varianteffectpredictor}; +my $slurm_mock_cmd = catfile( $Bin, qw{ data modules slurm-mock.pl } ); + +my %active_parameter = test_mip_hashes( + { + mip_hash_name => q{active_parameter}, + recipe_name => $recipe_name, + } +); +$active_parameter{$recipe_name} = 1; +$active_parameter{recipe_core_number}{$recipe_name} = 1; +$active_parameter{recipe_time}{$recipe_name} = 1; +my $case_id = $active_parameter{case_id}; +$active_parameter{vep_directory_cache} = q{a_cache}; +my %file_info = test_mip_hashes( + { + mip_hash_name => q{file_info}, + recipe_name => $recipe_name, + } +); + +$file_info{human_genome_reference_source} = q{grch}; +$file_info{human_genome_reference_version} = $GENOME_BUILD_VERSION_38; + +my %job_id; +my %parameter = test_mip_hashes( + { + mip_hash_name => q{recipe_parameter}, + recipe_name => $recipe_name, + } +); + +test_add_io_for_recipe( + { + file_info_href => \%file_info, + id => $case_id, + parameter_href => \%parameter, + recipe_name => $recipe_name, + step => q{vcf}, + } +); +$active_parameter{vep_features} = [qw{ refseq }]; + +my %sample_info; + +my $is_ok = analysis_vep_me( + { + active_parameter_href => \%active_parameter, + case_id => $case_id, + file_info_href => \%file_info, + job_id_href => \%job_id, + parameter_href => \%parameter, + profile_base_command => $slurm_mock_cmd, + recipe_name => $recipe_name, + sample_info_href => \%sample_info, + } +); + +## Then return TRUE +ok( $is_ok, q{ Executed analysis recipe } . $recipe_name ); + +done_testing(); From 8507290d4f8d681c7b0c604b5e88a1dd0aa144b2 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 14 Apr 2023 14:13:05 +0200 Subject: [PATCH 18/34] fixing filename --- lib/MIP/Recipes/Analysis/Vep.pm | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index 460676186..fa6777358 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -1526,8 +1526,9 @@ sub analysis_vep_me { my $infile_name_prefix = $io{in}{file_name_prefix}; my $infile_path_prefix = $io{in}{file_path_prefix}; - my $infile_suffix = $io{in}{file_suffix}; - my $infile_path = $infile_path_prefix . $infile_suffix; + + #my $infile_suffix = $io{in}{file_suffix}; + my $infile_path = $io{in}{file_path}; my $genome_reference_version = $file_info_href->{human_genome_reference_version}; @@ -1552,8 +1553,9 @@ sub analysis_vep_me { ); my $outdir_path_prefix = $io{out}{dir_path_prefix}; my $outfile_path_prefix = $io{out}{file_path_prefix}; - my $outfile_suffix = $io{out}{file_suffix}; - my $outfile_path = $io{out}{file_path}; + + #my $outfile_suffix = $io{out}{file_suffix}; + my $outfile_path = $io{out}{file_path}; ## Filehandles # Create anonymous filehandle @@ -1655,8 +1657,8 @@ sub analysis_vep_me { bcftools_view( { filehandle => $filehandle, - regions_ref => [ $contigs_ref->[20], $mt_contig_ref->[0] ], infile_path => $infile_path, + regions_ref => [ $contigs_ref->[20], $mt_contig_ref->[0] ], } ); print {$filehandle} $PIPE . $SPACE; @@ -1680,8 +1682,17 @@ sub analysis_vep_me { ); print {$filehandle} $PIPE . $SPACE; + bcftools_view( + { + filehandle => $filehandle, + regions_ref => [ $mt_contig_ref->[0] ], + } + ); + print {$filehandle} $PIPE . $SPACE; + bcftools_concat( { + allow_overlaps => 1, filehandle => $filehandle, infile_paths_ref => [ $vep_non_mt_outfile_path, catfile( dirname( devnull() ), q{stdin} ) ], From a1a1e4445aa66995cde4a80f1637942937b7c4c3 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 14 Apr 2023 16:09:28 +0200 Subject: [PATCH 19/34] sorting output --- definitions/rd_dna_parameters.yaml | 2 +- lib/MIP/Program/Vep.pm | 23 ++++++++---- lib/MIP/Recipes/Analysis/Vep.pm | 56 ++++++++++++++++++++++-------- t/variant_effect_predictor.t | 24 +++++-------- 4 files changed, 67 insertions(+), 38 deletions(-) diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index b6f291c42..0dd08873b 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -1371,8 +1371,8 @@ me_varianteffectpredictor: outfile_suffix: ".vcf.gz" program_executables: - bcftools - - vep - tabix + - vep type: recipe ## GATK CollectReadCounts gatk_collectreadcounts: diff --git a/lib/MIP/Program/Vep.pm b/lib/MIP/Program/Vep.pm index cb304a968..e7dd30079 100644 --- a/lib/MIP/Program/Vep.pm +++ b/lib/MIP/Program/Vep.pm @@ -39,6 +39,7 @@ sub variant_effect_predictor { ## Arguments: $assembly => Assembly version to use ## : $buffer_size => Sets the internal buffer size, corresponding to the number of variants that are read in to memory simultaneously ## : $cache_directory => VEP chache directory +## : $compress_output => Compress output ## : $custom_annotations_ref => Custom annotations {REF} ## : $distance => Modify the distance up and/or downstream between a variant and a transcript for which VEP will assign the upstream_gene_variant or downstream_gene_variant consequences ## : $filehandle => Filehandle to write to @@ -65,6 +66,7 @@ sub variant_effect_predictor { my $assembly; my $buffer_size; my $cache_directory; + my $compress_output; my $custom_annotations_ref; my $filehandle; my $infile_path; @@ -102,6 +104,11 @@ sub variant_effect_predictor { store => \$cache_directory, strict_type => 1, }, + compress_output => { + allow => [ undef, 0, 1 ], + store => \$compress_output, + strict_type => 1, + }, custom_annotations_ref => { default => [], store => \$custom_annotations_ref, @@ -225,6 +232,10 @@ sub variant_effect_predictor { push @commands, q{--dir_cache} . $SPACE . $cache_directory; } + if ($compress_output) { + + push @commands, q{--compress_output} . $SPACE . q{bgzip}; + } if ($infile_format) { push @commands, q{--format} . $SPACE . $infile_format; @@ -259,8 +270,7 @@ sub variant_effect_predictor { } if ( @{$custom_annotations_ref} ) { - push @commands, q{--custom} . $SPACE . join q{ --custom }, - @{$custom_annotations_ref}; + push @commands, q{--custom} . $SPACE . join q{ --custom }, @{$custom_annotations_ref}; } if ( @{$vep_features_ref} ) { @@ -353,11 +363,10 @@ sub variant_effect_predictor_install { store => \$species_ref, strict_type => 1, }, - stderrfile_path => { store => \$stderrfile_path, strict_type => 1, }, - stderrfile_path_append => - { store => \$stderrfile_path_append, strict_type => 1, }, - stdoutfile_path => { store => \$stdoutfile_path, strict_type => 1, }, - version => { store => \$version, strict_type => 1, }, + stderrfile_path => { store => \$stderrfile_path, strict_type => 1, }, + stderrfile_path_append => { store => \$stderrfile_path_append, strict_type => 1, }, + stdoutfile_path => { store => \$stdoutfile_path, strict_type => 1, }, + version => { store => \$version, strict_type => 1, }, }; check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index fa6777358..58ec393ea 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -1496,7 +1496,7 @@ sub analysis_vep_me { use MIP::File_info qw{ get_io_files parse_io_outfiles }; use MIP::List qw{ get_splitted_lists }; use MIP::Processmanagement::Processes qw{ submit_recipe }; - use MIP::Program::Bcftools qw{ bcftools_concat bcftools_view }; + use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; use MIP::Program::Htslib qw{ htslib_tabix }; use MIP::Program::Vep qw{ variant_effect_predictor }; use MIP::Recipe qw{ parse_recipe_prerequisites }; @@ -1526,9 +1526,7 @@ sub analysis_vep_me { my $infile_name_prefix = $io{in}{file_name_prefix}; my $infile_path_prefix = $io{in}{file_path_prefix}; - - #my $infile_suffix = $io{in}{file_suffix}; - my $infile_path = $io{in}{file_path}; + my $infile_path = $io{in}{file_path}; my $genome_reference_version = $file_info_href->{human_genome_reference_version}; @@ -1553,9 +1551,8 @@ sub analysis_vep_me { ); my $outdir_path_prefix = $io{out}{dir_path_prefix}; my $outfile_path_prefix = $io{out}{file_path_prefix}; - - #my $outfile_suffix = $io{out}{file_suffix}; - my $outfile_path = $io{out}{file_path}; + my $outfile_suffix = $io{out}{file_constant_suffix}; + my $outfile_path = $io{out}{file_path}; ## Filehandles # Create anonymous filehandle @@ -1635,11 +1632,12 @@ sub analysis_vep_me { ); print {$filehandle} $PIPE . $SPACE; - my $vep_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT.vcf}; + my $vep_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffix; variant_effect_predictor( { assembly => $assembly_version, cache_directory => $active_parameter_href->{vep_directory_cache}, + compress_output => 1, distance => $ANNOTATION_DISTANCE, filehandle => $filehandle, fork => $VEP_FORK_NUMBER, @@ -1654,6 +1652,14 @@ sub analysis_vep_me { ); say {$filehandle} $NEWLINE; + htslib_tabix( + { + filehandle => $filehandle, + infile_path => $vep_non_mt_outfile_path, + } + ); + say {$filehandle} $NEWLINE; + bcftools_view( { filehandle => $filehandle, @@ -1673,6 +1679,7 @@ sub analysis_vep_me { filehandle => $filehandle, fork => $VEP_FORK_NUMBER, outfile_format => q{vcf}, + outfile_path => q{STDOUT}, plugins_dir_path => $active_parameter_href->{vep_plugins_dir_path}, plugins_ref => \@plugins, reference_path => $active_parameter_href->{human_genome_reference}, @@ -1682,23 +1689,42 @@ sub analysis_vep_me { ); print {$filehandle} $PIPE . $SPACE; + my $vep_mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffix; bcftools_view( + { + filehandle => $filehandle, + output_type => q{z}, + outfile_path => $vep_mt_outfile_path, + targets => $mt_contig_ref->[0], + } + ); + say {$filehandle} $NEWLINE; + + htslib_tabix( { filehandle => $filehandle, - regions_ref => [ $mt_contig_ref->[0] ], + infile_path => $vep_non_mt_outfile_path, } ); - print {$filehandle} $PIPE . $SPACE; + say {$filehandle} $NEWLINE; bcftools_concat( { allow_overlaps => 1, filehandle => $filehandle, - infile_paths_ref => - [ $vep_non_mt_outfile_path, catfile( dirname( devnull() ), q{stdin} ) ], - outfile_path => $outfile_path, - output_type => q{z}, - threads => $recipe{core_number}, + infile_paths_ref => [ $vep_non_mt_outfile_path, $vep_mt_outfile_path ], + output_type => q{u}, + threads => $recipe{core_number}, + } + ); + print {$filehandle} $PIPE . $SPACE; + + bcftools_sort( + { + filehandle => $filehandle, + output_type => q{z}, + outfile_path => $outfile_path, + temp_directory => $temp_directory, } ); say {$filehandle} $NEWLINE; diff --git a/t/variant_effect_predictor.t b/t/variant_effect_predictor.t index f62f0c1b6..dd0dffd0a 100644 --- a/t/variant_effect_predictor.t +++ b/t/variant_effect_predictor.t @@ -22,16 +22,13 @@ use Readonly; use lib catdir( dirname($Bin), q{lib} ); use MIP::Constants qw{ $COMMA $SPACE }; - BEGIN { use MIP::Test::Fixtures qw{ test_import }; ### Check all internal dependency modules and imports ## Modules with import - my %perl_module = ( - q{MIP::Program::Vep} => [qw{ variant_effect_predictor }], -); + my %perl_module = ( q{MIP::Program::Vep} => [qw{ variant_effect_predictor }], ); test_import( { perl_module_href => \%perl_module, } ); } @@ -96,9 +93,11 @@ my %specific_argument = ( }, cache_directory => { input => catdir( q{test_dir}, q{test_cache_dir} ), - expected_output => q{--dir_cache} - . $SPACE - . catdir( q{test_dir}, q{test_cache_dir} ), + expected_output => q{--dir_cache} . $SPACE . catdir( q{test_dir}, q{test_cache_dir} ), + }, + compress_output => { + input => 1, + expected_output => q{--compress_output bgzip}, }, custom_annotations_ref => { inputs_ref => [ @@ -107,8 +106,7 @@ my %specific_argument = ( q{path_1key_1,file_type_1,annotation_type_1,force_report_coordinates_1} ) ], - expected_output => - q{--custom path,key,file_type,annotation_type,force_report_coordinates} + expected_output => q{--custom path,key,file_type,annotation_type,force_report_coordinates} . $SPACE . q{--custom path_1key_1,file_type_1,annotation_type_1,force_report_coordinates_1}, }, @@ -130,9 +128,7 @@ my %specific_argument = ( }, infile_path => { input => catfile( q{test_dir}, q{infile.vcf} ), - expected_output => q{--input_file} - . $SPACE - . catfile( q{test_dir}, q{infile.vcf} ), + expected_output => q{--input_file} . $SPACE . catfile( q{test_dir}, q{infile.vcf} ), }, max_sv_size => { input => 1, @@ -148,9 +144,7 @@ my %specific_argument = ( }, outfile_path => { input => catfile( q{test_dir}, q{infile.vcf} ), - expected_output => q{--output_file} - . $SPACE - . catfile( q{test_dir}, q{infile.vcf} ), + expected_output => q{--output_file} . $SPACE . catfile( q{test_dir}, q{infile.vcf} ), }, plugins_dir_path => { input => catdir(qw{ test_dir plugins }), From 9f5b92ac961b97666575347d298c0c2bdf2c9aff Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 14 Apr 2023 13:40:23 +0200 Subject: [PATCH 20/34] adding vep for mobile_elements --- lib/MIP/Recipes/Analysis/Vep.pm | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index 58ec393ea..7413e47d5 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -1496,7 +1496,6 @@ sub analysis_vep_me { use MIP::File_info qw{ get_io_files parse_io_outfiles }; use MIP::List qw{ get_splitted_lists }; use MIP::Processmanagement::Processes qw{ submit_recipe }; - use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; use MIP::Program::Htslib qw{ htslib_tabix }; use MIP::Program::Vep qw{ variant_effect_predictor }; use MIP::Recipe qw{ parse_recipe_prerequisites }; From 59ceeaa0bc52ba9797dafd5cd09ffcc4b65aa414 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 14 Apr 2023 16:09:28 +0200 Subject: [PATCH 21/34] sorting output --- lib/MIP/Recipes/Analysis/Vep.pm | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index 7413e47d5..58ec393ea 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -1496,6 +1496,7 @@ sub analysis_vep_me { use MIP::File_info qw{ get_io_files parse_io_outfiles }; use MIP::List qw{ get_splitted_lists }; use MIP::Processmanagement::Processes qw{ submit_recipe }; + use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; use MIP::Program::Htslib qw{ htslib_tabix }; use MIP::Program::Vep qw{ variant_effect_predictor }; use MIP::Recipe qw{ parse_recipe_prerequisites }; From d3af3b7b4293713411cca38c2ce9ee0a0ce14b16 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 17 Apr 2023 10:11:23 +0200 Subject: [PATCH 22/34] force indexing --- lib/MIP/Recipes/Analysis/Vep.pm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index 58ec393ea..0d7719fe0 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -1655,6 +1655,7 @@ sub analysis_vep_me { htslib_tabix( { filehandle => $filehandle, + force => 1, infile_path => $vep_non_mt_outfile_path, } ); @@ -1703,6 +1704,7 @@ sub analysis_vep_me { htslib_tabix( { filehandle => $filehandle, + force => 1, infile_path => $vep_non_mt_outfile_path, } ); @@ -1732,6 +1734,7 @@ sub analysis_vep_me { htslib_tabix( { filehandle => $filehandle, + force => 1, infile_path => $outfile_path, } ); From 697ed482767e11c211c2b13d64636179e2244a7f Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 18 Apr 2023 09:24:07 +0200 Subject: [PATCH 23/34] fix filename for MT file --- lib/MIP/Recipes/Analysis/Vep.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index 0d7719fe0..275416842 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -1705,7 +1705,7 @@ sub analysis_vep_me { { filehandle => $filehandle, force => 1, - infile_path => $vep_non_mt_outfile_path, + infile_path => $vep_mt_outfile_path, } ); say {$filehandle} $NEWLINE; From 8fe83f96bbf43360ee563115ee1231b48685efec Mon Sep 17 00:00:00 2001 From: Anders Jemt Date: Tue, 18 Apr 2023 12:54:56 +0200 Subject: [PATCH 24/34] Update lib/MIP/Cli/Mip/Analyse/Rd_dna.pm Co-authored-by: Henrik Stranneheim --- lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 2ca1f760e..9760d428b 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1248,7 +1248,7 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds option( q{me_varianteffectpredictor} => ( cmd_tags => [q{Analysis recipe switch}], - documentation => q{Annotate mobile elememnts with VEP}, + documentation => q{Annotate mobile elements with VEP}, is => q{rw}, isa => enum( [ 0, 1, 2 ] ), ) From a768cadc3a9a65e4e8539746a34e78550863eb93 Mon Sep 17 00:00:00 2001 From: jemten Date: Fri, 21 Apr 2023 16:06:00 +0200 Subject: [PATCH 25/34] removed me exon annotation --- definitions/rd_dna_parameters.yaml | 8 -------- lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 8 -------- lib/MIP/Recipes/Analysis/Me_annotate.pm | 10 ++-------- templates/grch38_mip_rd_dna_config.yaml | 1 - templates/mip_rd_dna_config.yaml | 1 - 5 files changed, 2 insertions(+), 26 deletions(-) diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 0dd08873b..41ccfd1a3 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -1353,14 +1353,6 @@ me_annotate_query_bnd_distance: data_type: SCALAR default: 150 type: recipe_argument -me_annotate_exons_bed: - associated_recipe: - - me_annotate - data_type: SCALAR - exists_check: file - is_reference: 1 - reference: reference_dir - type: path me_varianteffectpredictor: analysis_mode: case associated_recipe: diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 9760d428b..202308614 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1237,14 +1237,6 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ) ); - option( - q{me_annotate_exons_bed} => ( - documentation => q{Exon definitions in bed format}, - is => q{rw}, - isa => Str, - ) - ); - option( q{me_varianteffectpredictor} => ( cmd_tags => [q{Analysis recipe switch}], diff --git a/lib/MIP/Recipes/Analysis/Me_annotate.pm b/lib/MIP/Recipes/Analysis/Me_annotate.pm index 5ac4bd47c..4d53c9648 100644 --- a/lib/MIP/Recipes/Analysis/Me_annotate.pm +++ b/lib/MIP/Recipes/Analysis/Me_annotate.pm @@ -113,7 +113,7 @@ sub analysis_me_annotate { use MIP::File_info qw{ get_io_files parse_io_outfiles }; use MIP::Processmanagement::Processes qw{ submit_recipe }; - use MIP::Program::Bcftools qw{ bcftools_annotate }; + use MIP::Program::Bcftools qw{ bcftools_view }; use MIP::Program::Htslib qw{ htslib_tabix }; use MIP::Program::Svdb qw{ svdb_query }; use MIP::Recipe qw{ parse_recipe_prerequisites }; @@ -243,16 +243,10 @@ sub analysis_me_annotate { say {$filehandle} $NEWLINE; } - my $header = - q{<(echo '##INFO=')}; - bcftools_annotate( + bcftools_view( { - annotations_file_path => $active_parameter_href->{me_annotate_exons_bed}, - columns_name => q{CHROM,FROM,TO}, filehandle => $filehandle, - headerfile_path => $header, infile_path => $svdb_outfile_path, - mark_sites => q{IN_EXON}, outfile_path => $outfile_path, output_type => q{z}, } diff --git a/templates/grch38_mip_rd_dna_config.yaml b/templates/grch38_mip_rd_dna_config.yaml index 866b0d372..d8e323d23 100755 --- a/templates/grch38_mip_rd_dna_config.yaml +++ b/templates/grch38_mip_rd_dna_config.yaml @@ -40,7 +40,6 @@ gatk_varianteval_dbsnp: grch38_variant_-gold_standard_dbsnp-.vcf.gz gatk_varianteval_gold: grch38_mills_and_1000g_-gold_standard_indels-.vcf.gz genmod_models_reduced_penetrance_file: grch38_cust003-cmms-red-pen_-2017_FAKE-.tsv human_genome_reference: grch38_homo_sapiens_-assembly-.fasta -me_annotate_exons_bed: grch38_scout_exons_-2020-09-17-.bed me_annotate_query_files: # FORMAT: filename|OUT_FREQUENCY_INFO_KEY|OUT_ALLELE_COUNT_INFO_KEY|IN_FREQUENCY_INFO_KEY|IN_ALLELE_COUNT_INFO_KEY|USE_IN_FREQUENCY_FILTER grch38_alu_swegen.vcf.gz: swegen_alu_|FRQ|OCC|FRQ|OCC|1 diff --git a/templates/mip_rd_dna_config.yaml b/templates/mip_rd_dna_config.yaml index 3679cca13..d448ec9bf 100755 --- a/templates/mip_rd_dna_config.yaml +++ b/templates/mip_rd_dna_config.yaml @@ -23,7 +23,6 @@ sample_info_file: cluster_constant_path!/case_id!/analysis_constant_path!/case_i gatk_genotypegvcfs_ref_gvcf: grch37_gatk_merged_reference_samples.txt genmod_models_reduced_penetrance_file: grch37_cust003-cmms-red-pen_-2017-.tsv human_genome_reference: grch37_homo_sapiens_-d5-.fasta -me_annotate_exons_bed: grch37_scout_exons_-2017-01-.bed me_annotate_query_files: # FORMAT: filename|OUT_FREQUENCY_INFO_KEY|OUT_ALLELE_COUNT_INFO_KEY|IN_FREQUENCY_INFO_KEY|IN_ALLELE_COUNT_INFO_KEY|USE_IN_FREQUENCY_FILTER grch37_alu_swegen.vcf.gz: swegen_alu_|FRQ|OCC|FRQ|OCC|1 From abb410b0a091109726339dab08d66775fb350276 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 11:03:01 +0200 Subject: [PATCH 26/34] filter mobile elements --- definitions/rd_dna_initiation_map.yaml | 1 + definitions/rd_dna_parameters.yaml | 21 + lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 19 + lib/MIP/Recipes/Analysis/Me_filter.pm | 499 +++++++++++++++++++++ lib/MIP/Recipes/Analysis/Vep.pm | 5 +- lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm | 6 +- t/analysis_me_filter.t | 117 +++++ 7 files changed, 664 insertions(+), 4 deletions(-) create mode 100644 lib/MIP/Recipes/Analysis/Me_filter.pm create mode 100644 t/analysis_me_filter.t diff --git a/definitions/rd_dna_initiation_map.yaml b/definitions/rd_dna_initiation_map.yaml index f9afd1243..0a2235ba1 100644 --- a/definitions/rd_dna_initiation_map.yaml +++ b/definitions/rd_dna_initiation_map.yaml @@ -55,6 +55,7 @@ CHAIN_ALL: - me_merge_vcfs - me_annotate - me_varianteffectpredictor + - me_filter - CHAIN_MAIN: # PARALLEL chains, which inherit from MAIN in initiation, but are merged back to CHAIN_MAIN after execution - PARALLEL: diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 41ccfd1a3..29eb40ca9 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -227,6 +227,7 @@ recipe_core_number: manta: 36 markduplicates: 13 me_annotate: 2 + me_filter: 2 me_merge_bam: 5 me_merge_vcfs: 2 me_varianteffectpredictor: 4 @@ -362,6 +363,7 @@ recipe_time: manta: 30 markduplicates: 20 me_annotate: 2 + me_filter: 1 me_merge_bam: 5 me_merge_vcfs: 2 me_varianteffectpredictor: 3 @@ -1366,6 +1368,25 @@ me_varianteffectpredictor: - tabix - vep type: recipe +me_filter: + analysis_mode: case + associated_recipe: + - mip + data_type: SCALAR + default: 1 + file_tag: _filter + outfile_suffix: ".vcf.gz" + program_executables: + - bcftools + - mip + - tabix + type: recipe +me_filter_frequency_threshold: + associated_recipe: + - me_filter + data_type: SCALAR + default: 0.1 + type: recipe_argument ## GATK CollectReadCounts gatk_collectreadcounts: analysis_mode: sample diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 202308614..88d3cfafd 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1246,6 +1246,25 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ) ); + option( + q{me_filter} => ( + cmd_tags => [q{Analysis recipe switch}], + documentation => q{Filter mobile elements}, + is => q{rw}, + isa => enum( [ 0, 1, 2 ] ), + ) + ); + + option( + q{me_filter_frequency_threshold} => ( + cmd_tags => [q{Default: 0.1}], + documentation => + q{Threshold frequency for variants to be filtered out, set to 0 to disable}, + is => q{rw}, + isa => Num, + ) + ); + option( q{gatk_haplotypecaller} => ( cmd_tags => [q{Analysis recipe switch}], diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm new file mode 100644 index 000000000..9bc064245 --- /dev/null +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -0,0 +1,499 @@ +package MIP::Recipes::Analysis::Me_filter; + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile devnull }; +use List::Util qw{ first}; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use utf8; +use warnings; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw{ :all }; +use Readonly; + +## MIPs lib/ +use MIP::Constants qw{ %ANALYSIS $DOT $DOUBLE_QUOTE $LOG_NAME $NEWLINE $PIPE $SPACE $UNDERSCORE }; + +BEGIN { + + require Exporter; + use base qw{ Exporter }; + + # Functions and variables which can be optionally exported + our @EXPORT_OK = qw{ analysis_me_filter }; + +} + +## Constants +Readonly my $ANNOTATION_DISTANCE => $ANALYSIS{ANNOTATION_DISTANCE}; +Readonly my $ANNOTATION_DISTANCE_MT => $ANALYSIS{ANNOTATION_DISTANCE_MT}; +Readonly my $MT_CONTIG_ID_REGEXP => q{MT | M | chrM}; + +sub analysis_me_filter { + +## Function : Vcfparser performs parsing of varianteffectpredictor annotated ws SV variants +## Returns : +## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} +## : $case_id => Family id +## : $filehandle => Sbatch filehandle to write to +## : $file_info_href => File info hash {REF} +## : $job_id_href => Job id hash {REF} +## : $parameter_href => Parameter hash {REF} +## : $profile_base_command => Submission profile base command +## : $recipe_name => Program name +## : $sample_info_href => Info on samples and case hash {REF} +## : $temp_directory => Temporary directory +## : $xargs_file_counter => The xargs file counter + + my ($arg_href) = @_; + + ## Flatten argument(s) + my $active_parameter_href; + my $file_info_href; + my $job_id_href; + my $parameter_href; + my $recipe_name; + my $sample_info_href; + + ## Default(s) + my $case_id; + my $profile_base_command; + my $temp_directory; + my $xargs_file_counter; + + my $tmpl = { + active_parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$active_parameter_href, + strict_type => 1, + }, + case_id => { + default => $arg_href->{active_parameter_href}{case_id}, + store => \$case_id, + strict_type => 1, + }, + file_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$file_info_href, + strict_type => 1, + }, + job_id_href => { + default => {}, + defined => 1, + required => 1, + store => \$job_id_href, + strict_type => 1, + }, + parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$parameter_href, + strict_type => 1, + }, + profile_base_command => { + default => q{sbatch}, + store => \$profile_base_command, + strict_type => 1, + }, + recipe_name => { + defined => 1, + required => 1, + store => \$recipe_name, + strict_type => 1, + }, + sample_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$sample_info_href, + strict_type => 1, + }, + temp_directory => { + default => $arg_href->{active_parameter_href}{temp_directory}, + store => \$temp_directory, + strict_type => 1, + }, + xargs_file_counter => { + allow => qr{ \A\d+\z }xsm, + default => 0, + store => \$xargs_file_counter, + strict_type => 1, + }, + }; + + check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; + + use MIP::Analysis qw{ get_vcf_parser_analysis_suffix }; + use MIP::File_info qw{ get_io_files parse_io_outfiles }; + use MIP::List qw{ check_element_exist_hash_of_array }; + use MIP::Processmanagement::Processes qw{ submit_recipe }; + use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; + use MIP::Program::Htslib qw{ htslib_tabix }; + use MIP::Program::Mip qw{ mip_vcfparser }; + use MIP::Recipe qw{ parse_recipe_prerequisites }; + use MIP::Sample_info + qw{ set_file_path_to_store set_gene_panel set_recipe_outfile_in_sample_info }; + use MIP::Script::Setup_script qw{ setup_script}; + + ### PREPROCESSING: + + ## Retrieve logger object + my $log = Log::Log4perl->get_logger($LOG_NAME); + + ## Unpack parameters + my %io = get_io_files( + { + id => $case_id, + file_info_href => $file_info_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + stream => q{in}, + temp_directory => $temp_directory, + } + ); + my $infile_name_prefix = $io{in}{file_name_prefix}; + my $infile_path = $io{in}{file_path}; + + my %recipe = parse_recipe_prerequisites( + { + active_parameter_href => $active_parameter_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ); + + my @vcfparser_analysis_types = get_vcf_parser_analysis_suffix( + { + vcfparser_outfile_count => $active_parameter_href->{sv_vcfparser_outfile_count}, + } + ); + + ## Set and get the io files per chain, id and stream + my @set_outfile_name_prefixes = + map { $infile_name_prefix . $_ } @vcfparser_analysis_types; + %io = ( + %io, + parse_io_outfiles( + { + chain_id => $recipe{job_id_chain}, + id => $case_id, + file_info_href => $file_info_href, + file_name_prefix => $infile_name_prefix, + iterators_ref => \@vcfparser_analysis_types, + outdata_dir => $active_parameter_href->{outdata_dir}, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + temp_directory => $temp_directory, + } + ) + ); + my $outdir_path = $io{out}{dir_path}; + my $outfile_path_prefix = $io{out}{file_path_prefix}; + my @outfile_paths = @{ $io{out}{file_paths} }; + my %outfile_path = %{ $io{out}{file_path_href} }; + my @outfile_suffixes = @{ $io{out}{file_suffixes} }; + + ## Filehandles + my $filehandle = IO::Handle->new(); + + ## Creates recipe directories (info & data & script), recipe script filenames and writes sbatch header + my ( $recipe_file_path, $recipe_info_path ) = setup_script( + { + active_parameter_href => $active_parameter_href, + core_number => $recipe{core_number}, + directory_id => $case_id, + filehandle => $filehandle, + job_id_href => $job_id_href, + memory_allocation => $recipe{memory}, + process_time => $recipe{time}, + recipe_directory => $recipe_name, + recipe_name => $recipe_name, + temp_directory => $temp_directory, + } + ); + + my $mt_contig = first { / $MT_CONTIG_ID_REGEXP /xms } @{ $file_info_href->{contigs} }; + my @select_feature_annotation_columns; + my $select_file; + my $select_file_matching_column; + my $non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffixes[0]; + my $select_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffixes[1]; + my $mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffixes[0]; + my $select_mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffixes[1]; + + if ( $active_parameter_href->{sv_vcfparser_select_file} ) { + + ## List of genes to analyse separately + $select_file = catfile( $active_parameter_href->{sv_vcfparser_select_file} ); + + ## Column of HGNC Symbol in select file ("-sf") + $select_file_matching_column = + $active_parameter_href->{sv_vcfparser_select_file_matching_column}; + + if ( exists $active_parameter_href->{sv_vcfparser_select_feature_annotation_columns} ) { + + @select_feature_annotation_columns = + @{ $active_parameter_href->{sv_vcfparser_select_feature_annotation_columns} }; + } + } + + my $exclude_filter = _build_bcftools_frequency_filter( + { + frequency_threshold => $active_parameter_href->{me_filter_frequency_threshold}, + me_annotate_query_info_href => $active_parameter_href->{me_annotate_query_files}, + } + ); + + say {$filehandle} q{## Vcfparser non MT contigs}; + bcftools_view( + { + exclude => $exclude_filter, + filehandle => $filehandle, + infile_path => $infile_path, + targets => q{^} . $mt_contig, + } + ); + print {$filehandle} $PIPE . $SPACE; + + mip_vcfparser( + { + filehandle => $filehandle, + infile_path => catfile( dirname( devnull() ), q{stdin} ), + log_file_path => catfile( $outdir_path, q{vcfparser} . q{.log} ), + padding => $ANNOTATION_DISTANCE, + parse_vep => $active_parameter_href->{sv_varianteffectpredictor}, + per_gene => $active_parameter_href->{sv_vcfparser_per_gene}, + pli_values_file_path => $active_parameter_href->{vcfparser_pli_score_file}, + range_feature_annotation_columns_ref => + \@{ $active_parameter_href->{sv_vcfparser_range_feature_annotation_columns} }, + range_feature_file_path => $active_parameter_href->{sv_vcfparser_range_feature_file}, + select_feature_annotation_columns_ref => \@select_feature_annotation_columns, + stdoutfile_path => $non_mt_outfile_path, + select_feature_file_path => $select_non_mt_outfile_path, + select_feature_matching_column => $select_file_matching_column, + select_outfile => $select_non_mt_outfile_path, + variant_type => q{sv}, + } + ); + say {$filehandle} $NEWLINE; + + say {$filehandle} q{## Vcfparser MT}; + bcftools_view( + { + exclude => $exclude_filter, + filehandle => $filehandle, + infile_path => $infile_path, + regions_ref => [$mt_contig], + } + ); + print {$filehandle} $PIPE . $SPACE; + + mip_vcfparser( + { + filehandle => $filehandle, + infile_path => catfile( dirname( devnull() ), q{stdin} ), + log_file_path => catfile( $outdir_path, q{vcfparser} . q{.log} ), + padding => $ANNOTATION_DISTANCE_MT, + parse_vep => $active_parameter_href->{me_varianteffectpredictor}, + per_gene => $active_parameter_href->{sv_vcfparser_per_gene}, + pli_values_file_path => $active_parameter_href->{vcfparser_pli_score_file}, + range_feature_annotation_columns_ref => + \@{ $active_parameter_href->{sv_vcfparser_range_feature_annotation_columns} }, + range_feature_file_path => $active_parameter_href->{sv_vcfparser_range_feature_file}, + select_feature_annotation_columns_ref => \@select_feature_annotation_columns, + stdoutfile_path => $mt_outfile_path, + select_feature_file_path => $select_mt_outfile_path, + select_feature_matching_column => $select_file_matching_column, + select_outfile => $select_mt_outfile_path, + variant_type => q{sv}, + } + ); + say {$filehandle} $NEWLINE; + + ### Special case: replace all clinical mitochondrial variants with research mitochondrial variants + $select_mt_outfile_path = + $active_parameter_href->{sv_vcfparser_add_all_mt_var} + ? $mt_outfile_path + : $select_mt_outfile_path; + + ## Concatenate MT variants with the rest + my @file_sets = ( + { + files_to_concat_ref => [ $non_mt_outfile_path, $mt_outfile_path ], + outfile => $outfile_paths[0], + }, + { + files_to_concat_ref => [ $select_non_mt_outfile_path, $select_mt_outfile_path ], + outfile => $outfile_paths[1], + }, + ); + + foreach my $outfile_set (@file_sets) { + + bcftools_concat( + { + allow_overlaps => 1, + filehandle => $filehandle, + infile_paths_ref => $outfile_set->{files_to_concat_ref}, + output_type => q{u}, + threads => $recipe{core_number}, + } + ); + print {$filehandle} $PIPE . $SPACE; + + bcftools_sort( + { + filehandle => $filehandle, + output_type => q{z}, + outfile_path => $outfile_set->{outfile}, + temp_directory => $temp_directory, + } + ); + say {$filehandle} $NEWLINE; + + htslib_tabix( + { + filehandle => $filehandle, + force => 1, + infile_path => $outfile_set->{outfile}, + } + ); + say {$filehandle} $NEWLINE; + } + + close $filehandle; + + if ( $recipe{mode} == 1 ) { + + ## Collect QC metadata info for later use + set_recipe_outfile_in_sample_info( + { + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + path => $outfile_paths[0], + } + ); + + my %gene_panels = ( + range_file => q{sv_vcfparser_range_feature_file}, + select_file => q{sv_vcfparser_select_file}, + ); + GENE_PANEL: + while ( my ( $gene_panel_key, $gene_panel_file ) = each %gene_panels ) { + + ## Collect databases(s) from a potentially merged gene panel file and adds them to sample_info + set_gene_panel( + { + aggregate_gene_panel_file => $active_parameter_href->{$gene_panel_file}, + aggregate_gene_panels_key => $gene_panel_key, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + } + ); + } + + OUTFILE: + while ( my ( $file_key, $outfile ) = each %outfile_path ) { + + my $metafile_tag = $file_key =~ m/selected/xms ? q{clinical} : q{research}; + + set_file_path_to_store( + { + format => q{vcf}, + id => $case_id, + path => $outfile, + path_index => $outfile . $DOT . q{tbi}, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + tag => $metafile_tag, + } + ); + } + + submit_recipe( + { + base_command => $profile_base_command, + case_id => $case_id, + dependency_method => q{sample_to_case}, + job_id_chain => $recipe{job_id_chain}, + job_id_href => $job_id_href, + job_reservation_name => $active_parameter_href->{job_reservation_name}, + log => $log, + max_parallel_processes_count_href => + $file_info_href->{max_parallel_processes_count}, + recipe_file_path => $recipe_file_path, + sample_ids_ref => \@{ $active_parameter_href->{sample_ids} }, + submission_profile => $active_parameter_href->{submission_profile}, + } + ); + } + return 1; +} + +sub _build_bcftools_frequency_filter { + +## Function : Build the frequency filter command +## Returns : +## Arguments: $frequency_threshold => Frequency annotation to use in filtering +## : $me_annotate_query_info_href => SVDB query files used to annotate ME {REF} + + my ($arg_href) = @_; + + ## Flatten argument(s) + my $frequency_threshold; + my $me_annotate_query_info_href; + + my $tmpl = { + frequency_threshold => { + required => 1, + store => \$frequency_threshold, + strict_type => 1, + }, + me_annotate_query_info_href => { + default => {}, + required => 1, + store => \$me_annotate_query_info_href, + } + }; + + check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; + + ## Skip if no threshold is set + return if ( not $frequency_threshold ); + + ## Find annotations to use for filtering + my @frequency_filters; + my $frequency_expression = $SPACE . q{>} . $SPACE . $frequency_threshold; + + ANNOTATION: + while ( my ( $annotation_file, $annotation_tags ) = each %{$me_annotate_query_info_href} ) { + + my ( $tag_prefix, $fq_suffix, $cnt_suffix, $file_fq_tag, $file_cnt_tag, $use_in_filter ) = + split /[|]/sxm, $annotation_tags; + + next ANNOTATION if ( not $use_in_filter ); + + push @frequency_filters, q{INFO/} . $tag_prefix . $file_fq_tag . $frequency_expression; + } + + ## build filter + my $exclude_filter = + $DOUBLE_QUOTE + . join( $SPACE . $PIPE . $SPACE . q{INFO/}, @frequency_filters ) + . $DOUBLE_QUOTE; + + return $exclude_filter; +} + +1; diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index 275416842..6089a97c4 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -43,6 +43,7 @@ Readonly my $ANNOTATION_DISTANCE => $ANALYSIS{ANNOTATION_DISTANCE}; Readonly my $ANNOTATION_DISTANCE_MT => $ANALYSIS{ANNOTATION_DISTANCE_MT}; Readonly my $BUFFER_SIZE => 20_000; Readonly my $BUFFER_SIZE_SV => 100; +Readonly my $CONTIG_20 => 20; sub analysis_vep_wgs { @@ -1626,8 +1627,8 @@ sub analysis_vep_me { bcftools_view( { filehandle => $filehandle, - targets => q{^} . $mt_contig_ref->[0], infile_path => $infile_path, + targets => q{^} . $mt_contig_ref->[0], } ); print {$filehandle} $PIPE . $SPACE; @@ -1665,7 +1666,7 @@ sub analysis_vep_me { { filehandle => $filehandle, infile_path => $infile_path, - regions_ref => [ $contigs_ref->[20], $mt_contig_ref->[0] ], + regions_ref => [ $contigs_ref->[$CONTIG_20], $mt_contig_ref->[0] ], } ); print {$filehandle} $PIPE . $SPACE; diff --git a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm index c0d9482a4..087ef99ce 100644 --- a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm +++ b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm @@ -130,7 +130,7 @@ sub parse_rd_dna { Readonly my @ONLY_WGS_VARIANT_CALLER_RECIPES => qw{ cnvnator_ar delly_reformat tiddit }; Readonly my @ONLY_WGS_RECIPIES => qw{ chromograph_rhoviz cnvnator_ar delly_call delly_reformat expansionhunter - gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_annotate me_merge_bam me_merge_vcfs + gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_annotate me_filter me_merge_bam me_merge_vcfs me_varianteffectpredictor mitodel retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; Readonly my @REMOVE_CONFIG_KEYS => qw{ associated_recipe }; @@ -475,7 +475,8 @@ sub pipeline_analyse_rd_dna { use MIP::Recipes::Analysis::Gzip_fastq qw{ analysis_gzip_fastq }; use MIP::Recipes::Analysis::Manta qw{ analysis_manta }; use MIP::Recipes::Analysis::Markduplicates qw{ analysis_markduplicates }; - use MIP::Recipes::Analysis::Me_annotate qw{ analysis_me_annotate}; + use MIP::Recipes::Analysis::Me_annotate qw{ analysis_me_annotate }; + use MIP::Recipes::Analysis::Me_filter qw{ analysis_me_filter }; use MIP::Recipes::Analysis::Mip_qccollect qw{ analysis_mip_qccollect }; use MIP::Recipes::Analysis::Mip_vcfparser qw{ analysis_mip_vcfparser }; use MIP::Recipes::Analysis::Mip_vercollect qw{ analysis_mip_vercollect }; @@ -591,6 +592,7 @@ sub pipeline_analyse_rd_dna { manta => \&analysis_manta, markduplicates => \&analysis_markduplicates, me_annotate => \&analysis_me_annotate, + me_filter => \&analysis_me_filter, me_merge_bam => \&analysis_samtools_merge_panel, me_merge_vcfs => \&analysis_me_merge_vcfs, me_varianteffectpredictor => \&analysis_vep_me, diff --git a/t/analysis_me_filter.t b/t/analysis_me_filter.t new file mode 100644 index 000000000..59ddfff1a --- /dev/null +++ b/t/analysis_me_filter.t @@ -0,0 +1,117 @@ +#!/usr/bin/env perl + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile }; +use FindBin qw{ $Bin }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use Test::More; +use utf8; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw { :all }; +use Modern::Perl qw{ 2018 }; +use Readonly; + +## MIPs lib/ +use lib catdir( dirname($Bin), q{lib} ); +use MIP::Constants qw{ $COLON $COMMA $SPACE }; +use MIP::Test::Fixtures qw{ test_add_io_for_recipe test_log test_mip_hashes }; + +BEGIN { + + use MIP::Test::Fixtures qw{ test_import }; + +### Check all internal dependency modules and imports +## Modules with import + my %perl_module = ( + q{MIP::Recipes::Analysis::Me_filter} => [qw{ analysis_me_filter }], + q{MIP::Test::Fixtures} => [qw{ test_add_io_for_recipe test_log test_mip_hashes }], + ); + + test_import( { perl_module_href => \%perl_module, } ); +} + +use MIP::Recipes::Analysis::Me_filter qw{ analysis_me_filter }; + +diag( q{Test analysis_me_filter from Me_filter.pm} + . $COMMA + . $SPACE . q{Perl} + . $SPACE + . $PERL_VERSION + . $SPACE + . $EXECUTABLE_NAME ); + +Readonly my $THRESHOLD => 0.05; + +my $log = test_log( { log_name => q{MIP}, no_screen => 1, } ); + +## Given analysis parameters +my $recipe_name = q{me_filter}; +my $slurm_mock_cmd = catfile( $Bin, qw{ data modules slurm-mock.pl } ); + +my %active_parameter = test_mip_hashes( + { + mip_hash_name => q{active_parameter}, + recipe_name => $recipe_name, + } +); +$active_parameter{$recipe_name} = 1; +$active_parameter{recipe_core_number}{$recipe_name} = 1; +$active_parameter{recipe_time}{$recipe_name} = 1; +my $case_id = $active_parameter{case_id}; +$active_parameter{sv_vcfparser_outfile_count} = 2; +@{ $active_parameter{sv_vcfparser_select_feature_annotation_columns} } = ( 1, 2 ); +$active_parameter{sv_vcfparser_select_file} = + catfile( $Bin, qw{ data 643594-miptest aggregated_gene_panel_test.txt } ); +$active_parameter{me_filter_frequency_threshold} = $THRESHOLD; + +my %file_info = test_mip_hashes( + { + mip_hash_name => q{file_info}, + recipe_name => $recipe_name, + } +); + +my %job_id; +my %parameter = test_mip_hashes( + { + mip_hash_name => q{recipe_parameter}, + recipe_name => $recipe_name, + } +); + +test_add_io_for_recipe( + { + file_info_href => \%file_info, + id => $case_id, + parameter_href => \%parameter, + recipe_name => $recipe_name, + step => q{vcf}, + } +); + +my %sample_info; + +my $is_ok = analysis_me_filter( + { + active_parameter_href => \%active_parameter, + case_id => $case_id, + file_info_href => \%file_info, + job_id_href => \%job_id, + parameter_href => \%parameter, + profile_base_command => $slurm_mock_cmd, + recipe_name => $recipe_name, + sample_info_href => \%sample_info, + } +); + +## Then return TRUE +ok( $is_ok, q{ Executed analysis recipe } . $recipe_name ); + +done_testing(); From 72a0fa070e10f732fa2e50a4e51103d3392673d2 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 12:51:37 +0200 Subject: [PATCH 27/34] fixing filter expression --- lib/MIP/Recipes/Analysis/Me_filter.pm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index 9bc064245..cefb48e5a 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -489,9 +489,7 @@ sub _build_bcftools_frequency_filter { ## build filter my $exclude_filter = - $DOUBLE_QUOTE - . join( $SPACE . $PIPE . $SPACE . q{INFO/}, @frequency_filters ) - . $DOUBLE_QUOTE; + $DOUBLE_QUOTE . join( $SPACE . $PIPE . $SPACE, @frequency_filters ) . $DOUBLE_QUOTE; return $exclude_filter; } From 98b41a54ee650c4ede2ce601a787bf396a0e4d6d Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 13:20:56 +0200 Subject: [PATCH 28/34] add correct select file --- lib/MIP/Recipes/Analysis/Me_filter.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index cefb48e5a..e55cea4e5 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -280,7 +280,7 @@ sub analysis_me_filter { range_feature_file_path => $active_parameter_href->{sv_vcfparser_range_feature_file}, select_feature_annotation_columns_ref => \@select_feature_annotation_columns, stdoutfile_path => $non_mt_outfile_path, - select_feature_file_path => $select_non_mt_outfile_path, + select_feature_file_path => $select_file, select_feature_matching_column => $select_file_matching_column, select_outfile => $select_non_mt_outfile_path, variant_type => q{sv}, From c6e988c23f8c7149a8a0495358fb2f13528abe08 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 13:33:11 +0200 Subject: [PATCH 29/34] fixinng select file for 2nd pass --- lib/MIP/Recipes/Analysis/Me_filter.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index e55cea4e5..63a10e9e7 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -313,7 +313,7 @@ sub analysis_me_filter { range_feature_file_path => $active_parameter_href->{sv_vcfparser_range_feature_file}, select_feature_annotation_columns_ref => \@select_feature_annotation_columns, stdoutfile_path => $mt_outfile_path, - select_feature_file_path => $select_mt_outfile_path, + select_feature_file_path => $select_file, select_feature_matching_column => $select_file_matching_column, select_outfile => $select_mt_outfile_path, variant_type => q{sv}, From b74a44f4f9f38b7a3559c6f39d45f32e250e7d88 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 17:18:05 +0200 Subject: [PATCH 30/34] remove gz ending from vcfparser output --- lib/MIP/Recipes/Analysis/Me_filter.pm | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index 63a10e9e7..ec90a3525 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -18,7 +18,8 @@ use autodie qw{ :all }; use Readonly; ## MIPs lib/ -use MIP::Constants qw{ %ANALYSIS $DOT $DOUBLE_QUOTE $LOG_NAME $NEWLINE $PIPE $SPACE $UNDERSCORE }; +use MIP::Constants + qw{ %ANALYSIS $DOT $DOUBLE_QUOTE $EMPTY_STR $LOG_NAME $NEWLINE $PIPE $SPACE $UNDERSCORE }; BEGIN { @@ -137,6 +138,7 @@ sub analysis_me_filter { use MIP::Analysis qw{ get_vcf_parser_analysis_suffix }; use MIP::File_info qw{ get_io_files parse_io_outfiles }; use MIP::List qw{ check_element_exist_hash_of_array }; + use MIP::File::Path qw{ remove_file_path_suffix }; use MIP::Processmanagement::Processes qw{ submit_recipe }; use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; use MIP::Program::Htslib qw{ htslib_tabix }; @@ -227,10 +229,12 @@ sub analysis_me_filter { my @select_feature_annotation_columns; my $select_file; my $select_file_matching_column; - my $non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffixes[0]; - my $select_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffixes[1]; - my $mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffixes[0]; - my $select_mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffixes[1]; + my ( $no_gz_file_ending, $select_no_gz_file_ending ) = + map { s/[.]gz\z/$EMPTY_STR/r } @outfile_suffixes; + my $non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $no_gz_file_ending; + my $select_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $select_no_gz_file_ending; + my $mt_outfile_path = $outfile_path_prefix . q{_MT} . $no_gz_file_ending; + my $select_mt_outfile_path = $outfile_path_prefix . q{_MT} . $select_no_gz_file_ending; if ( $active_parameter_href->{sv_vcfparser_select_file} ) { From 7a35a52f751580f29589291dd04d903890c807af Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 18:31:39 +0200 Subject: [PATCH 31/34] compress and index vcfparser output --- lib/MIP/Recipes/Analysis/Me_filter.pm | 40 ++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index ec90a3525..eea94e29c 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -141,7 +141,7 @@ sub analysis_me_filter { use MIP::File::Path qw{ remove_file_path_suffix }; use MIP::Processmanagement::Processes qw{ submit_recipe }; use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; - use MIP::Program::Htslib qw{ htslib_tabix }; + use MIP::Program::Htslib qw{ htslib_bgzip htslib_tabix }; use MIP::Program::Mip qw{ mip_vcfparser }; use MIP::Recipe qw{ parse_recipe_prerequisites }; use MIP::Sample_info @@ -331,18 +331,50 @@ sub analysis_me_filter { ? $mt_outfile_path : $select_mt_outfile_path; + my @files_to_compress_and_index = ( + $mt_outfile_path, $non_mt_outfile_path, + $select_mt_outfile_path, $select_non_mt_outfile_path, + ); + + say {$filehandle} q{## Compress and index vcfparser output}; + FILE_TO_COMPRESS_AND_INDEX: + foreach my $file_to_compress_and_index (@files_to_compress_and_index) { + + htslib_bgzip( + { + infile_path => $file_to_compress_and_index, + force => 1, + filehandle => $filehandle, + threads => $recipe{core_number}, + } + ); + say {$filehandle} $NEWLINE; + + htslib_tabix( + { + filehandle => $filehandle, + force => 1, + infile_path => $file_to_compress_and_index . q{.gz}, + } + ); + say {$filehandle} $NEWLINE; + } + ## Concatenate MT variants with the rest my @file_sets = ( { - files_to_concat_ref => [ $non_mt_outfile_path, $mt_outfile_path ], + files_to_concat_ref => [ $non_mt_outfile_path . q{.gz}, $mt_outfile_path . q{.gz} ], outfile => $outfile_paths[0], }, { - files_to_concat_ref => [ $select_non_mt_outfile_path, $select_mt_outfile_path ], - outfile => $outfile_paths[1], + files_to_concat_ref => + [ $select_non_mt_outfile_path . q{.gz}, $select_mt_outfile_path . q{.gz} ], + outfile => $outfile_paths[1], }, ); + say {$filehandle} q{## Concatenate, sort and index}; + OUTFILE_SET: foreach my $outfile_set (@file_sets) { bcftools_concat( From 47fb949fbde806682376d44d62933f257934f778 Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 25 Apr 2023 09:07:18 +0200 Subject: [PATCH 32/34] compress before renaming --- lib/MIP/Recipes/Analysis/Me_filter.pm | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index eea94e29c..b5b84b774 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -325,18 +325,13 @@ sub analysis_me_filter { ); say {$filehandle} $NEWLINE; - ### Special case: replace all clinical mitochondrial variants with research mitochondrial variants - $select_mt_outfile_path = - $active_parameter_href->{sv_vcfparser_add_all_mt_var} - ? $mt_outfile_path - : $select_mt_outfile_path; + say {$filehandle} q{## Compress and index vcfparser output}; my @files_to_compress_and_index = ( $mt_outfile_path, $non_mt_outfile_path, $select_mt_outfile_path, $select_non_mt_outfile_path, ); - say {$filehandle} q{## Compress and index vcfparser output}; FILE_TO_COMPRESS_AND_INDEX: foreach my $file_to_compress_and_index (@files_to_compress_and_index) { @@ -360,6 +355,12 @@ sub analysis_me_filter { say {$filehandle} $NEWLINE; } + ### Special case: replace all clinical mitochondrial variants with research mitochondrial variants + $select_mt_outfile_path = + $active_parameter_href->{sv_vcfparser_add_all_mt_var} + ? $mt_outfile_path + : $select_mt_outfile_path; + ## Concatenate MT variants with the rest my @file_sets = ( { @@ -374,6 +375,7 @@ sub analysis_me_filter { ); say {$filehandle} q{## Concatenate, sort and index}; + OUTFILE_SET: foreach my $outfile_set (@file_sets) { From 3849464450558499c2ac72c6c6c465b622f9b69d Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 25 Apr 2023 11:38:58 +0200 Subject: [PATCH 33/34] Updating recipe description --- lib/MIP/Recipes/Analysis/Me_filter.pm | 2 +- lib/MIP/Recipes/Analysis/Mip_vcfparser.pm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index b5b84b774..a4a970b4d 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -38,7 +38,7 @@ Readonly my $MT_CONTIG_ID_REGEXP => q{MT | M | chrM}; sub analysis_me_filter { -## Function : Vcfparser performs parsing of varianteffectpredictor annotated ws SV variants +## Function : Filters mobile elements on frequency and split the vcf in aclinical and research vcf using vcfparser. ## Returns : ## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} ## : $case_id => Family id diff --git a/lib/MIP/Recipes/Analysis/Mip_vcfparser.pm b/lib/MIP/Recipes/Analysis/Mip_vcfparser.pm index b46a1adca..5c4b216bf 100644 --- a/lib/MIP/Recipes/Analysis/Mip_vcfparser.pm +++ b/lib/MIP/Recipes/Analysis/Mip_vcfparser.pm @@ -1131,7 +1131,7 @@ sub analysis_mip_vcfparser_sv_wes { sub analysis_mip_vcfparser_sv_wgs { -## Function : Vcfparser performs parsing of varianteffectpredictor annotated ws SV variants +## Function : Vcfparser performs parsing of varianteffectpredictor annotated wgs SV variants ## Returns : ## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} ## : $case_id => Family id From da95004ceb8917915d43ace29d36234b1f3befbc Mon Sep 17 00:00:00 2001 From: jemten Date: Thu, 27 Apr 2023 10:28:50 +0200 Subject: [PATCH 34/34] updating expansionhunter database --- CHANGELOG.md | 4 ++ definitions/rd_dna_parameters.yaml | 2 +- ...ansionhunter_variant_catalog_-5.0.0-.json} | 0 .../mip_download_rd_dna_config_-1.0-.yaml | 45 +++++++------------ 4 files changed, 20 insertions(+), 31 deletions(-) rename t/data/references/{grch37_expansionhunter_variant_catalog_-4.0.2-.json => grch37_expansionhunter_variant_catalog_-5.0.0-.json} (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68397452c..5b32ce1b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ This project adheres to [Semantic Versioning](http://semver.org/). - Adds retroseq for mobile element detection +### Databases + +- expansionhunter variant catalog: v4.0.2 -> v5.0.0 + ### Tools - RetroSeq: 9d4f3b5 diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 29eb40ca9..d048e6d2b 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -880,7 +880,7 @@ expansionhunter_variant_catalog_file_path: associated_recipe: - expansionhunter data_type: SCALAR - default: grch37_expansionhunter_variant_catalog_-4.0.2-.json + default: grch37_expansionhunter_variant_catalog_-5.0.0-.json exists_check: file is_reference: 1 reference: reference_dir diff --git a/t/data/references/grch37_expansionhunter_variant_catalog_-4.0.2-.json b/t/data/references/grch37_expansionhunter_variant_catalog_-5.0.0-.json similarity index 100% rename from t/data/references/grch37_expansionhunter_variant_catalog_-4.0.2-.json rename to t/data/references/grch37_expansionhunter_variant_catalog_-5.0.0-.json diff --git a/templates/mip_download_rd_dna_config_-1.0-.yaml b/templates/mip_download_rd_dna_config_-1.0-.yaml index e5daeb2e6..585795270 100644 --- a/templates/mip_download_rd_dna_config_-1.0-.yaml +++ b/templates/mip_download_rd_dna_config_-1.0-.yaml @@ -41,9 +41,8 @@ reference: - 20150915 - 20200310 expansionhunter: - - 3.0.1 - - 3.1.2 - 4.0.2 + - 5.0.0 gatk_mitochondrial_ref: - v0_m_amb - v0_m_ann @@ -445,20 +444,6 @@ reference_feature: url_prefix: https://raw.githubusercontent.com/dellytools/delly/master/excludeTemplates/ expansionhunter: grch37: - 3.0.1: - file: variant_catalog_grch37.json - file_check: variant_catalog_grch37.json.md5 - outfile: grch37_expansionhunter_variant_catalog_-3.0.1-.json - outfile_check: grch37_expansionhunter_variant_catalog_-3.0.1-.json.md5 - outfile_check_method: md5sum - url_prefix: https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/rare-disease/disease_loci/ExpansionHunter-v3.0.1/ - 3.1.2: - file: variant_catalog_grch37.json - file_check: variant_catalog_grch37.json.md5 - outfile: grch37_expansionhunter_variant_catalog_-3.1.2-.json - outfile_check: grch37_expansionhunter_variant_catalog_-3.1.2-.json.md5 - outfile_check_method: md5sum - url_prefix: https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/rare-disease/disease_loci/ExpansionHunter-v3.1.2/ 4.0.2: file: variant_catalog_grch37.json file_check: variant_catalog_grch37.json.md5 @@ -466,21 +451,14 @@ reference_feature: outfile_check: grch37_expansionhunter_variant_catalog_-4.0.2-.json.md5 outfile_check_method: md5sum url_prefix: https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/rare-disease/disease_loci/ExpansionHunter-v4.0.2/ - grch38: - 3.0.1: - file: variant_catalog_grch38.json - file_check: variant_catalog_grch38.json.md5 - outfile: grch38_expansionhunter_variant_catalog_-3.0.1-.json - outfile_check: grch38_expansionhunter_variant_catalog_-3.0.1-.json.md5 - outfile_check_method: md5sum - url_prefix: https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/rare-disease/disease_loci/ExpansionHunter-v3.0.1/ - 3.1.2: - file: variant_catalog_hg38.json - file_check: variant_catalog_hg38.json.md5 - outfile: grch38_expansionhunter_variant_catalog_-3.1.2-.json - outfile_check: grch38_expansionhunter_variant_catalog_-3.1.2-.json.md5 + 5.0.0: + file: variant_catalog_grch37.json + file_check: variant_catalog_grch37.json.md5 + outfile: grch37_expansionhunter_variant_catalog_-5.0.0-.json + outfile_check: grch37_expansionhunter_variant_catalog_-5.0.0-.json.md5 outfile_check_method: md5sum - url_prefix: https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/rare-disease/disease_loci/ExpansionHunter-v3.1.2/ + url_prefix: https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/rare-disease/disease_loci/ExpansionHunter-v5.0.0/ + grch38: 4.0.2: file: variant_catalog_hg38.json file_check: variant_catalog_hg38.json.md5 @@ -488,6 +466,13 @@ reference_feature: outfile_check: grch38_expansionhunter_variant_catalog_-4.0.2-.json.md5 outfile_check_method: md5sum url_prefix: https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/rare-disease/disease_loci/ExpansionHunter-v4.0.2/ + 5.0.0: + file: variant_catalog_hg38.json + file_check: variant_catalog_hg38.json.md5 + outfile: grch38_expansionhunter_variant_catalog_-5.0.0-.json + outfile_check: grch38_expansionhunter_variant_catalog_-5.0.0-.json.md5 + outfile_check_method: md5sum + url_prefix: https://raw.githubusercontent.com/Clinical-Genomics/reference-files/master/rare-disease/disease_loci/ExpansionHunter-v5.0.0/ gatk_mitochondrial_ref: grch38: v0_m_amb: