From abb410b0a091109726339dab08d66775fb350276 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 11:03:01 +0200 Subject: [PATCH 1/8] filter mobile elements --- definitions/rd_dna_initiation_map.yaml | 1 + definitions/rd_dna_parameters.yaml | 21 + lib/MIP/Cli/Mip/Analyse/Rd_dna.pm | 19 + lib/MIP/Recipes/Analysis/Me_filter.pm | 499 +++++++++++++++++++++ lib/MIP/Recipes/Analysis/Vep.pm | 5 +- lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm | 6 +- t/analysis_me_filter.t | 117 +++++ 7 files changed, 664 insertions(+), 4 deletions(-) create mode 100644 lib/MIP/Recipes/Analysis/Me_filter.pm create mode 100644 t/analysis_me_filter.t diff --git a/definitions/rd_dna_initiation_map.yaml b/definitions/rd_dna_initiation_map.yaml index f9afd1243..0a2235ba1 100644 --- a/definitions/rd_dna_initiation_map.yaml +++ b/definitions/rd_dna_initiation_map.yaml @@ -55,6 +55,7 @@ CHAIN_ALL: - me_merge_vcfs - me_annotate - me_varianteffectpredictor + - me_filter - CHAIN_MAIN: # PARALLEL chains, which inherit from MAIN in initiation, but are merged back to CHAIN_MAIN after execution - PARALLEL: diff --git a/definitions/rd_dna_parameters.yaml b/definitions/rd_dna_parameters.yaml index 41ccfd1a3..29eb40ca9 100755 --- a/definitions/rd_dna_parameters.yaml +++ b/definitions/rd_dna_parameters.yaml @@ -227,6 +227,7 @@ recipe_core_number: manta: 36 markduplicates: 13 me_annotate: 2 + me_filter: 2 me_merge_bam: 5 me_merge_vcfs: 2 me_varianteffectpredictor: 4 @@ -362,6 +363,7 @@ recipe_time: manta: 30 markduplicates: 20 me_annotate: 2 + me_filter: 1 me_merge_bam: 5 me_merge_vcfs: 2 me_varianteffectpredictor: 3 @@ -1366,6 +1368,25 @@ me_varianteffectpredictor: - tabix - vep type: recipe +me_filter: + analysis_mode: case + associated_recipe: + - mip + data_type: SCALAR + default: 1 + file_tag: _filter + outfile_suffix: ".vcf.gz" + program_executables: + - bcftools + - mip + - tabix + type: recipe +me_filter_frequency_threshold: + associated_recipe: + - me_filter + data_type: SCALAR + default: 0.1 + type: recipe_argument ## GATK CollectReadCounts gatk_collectreadcounts: analysis_mode: sample diff --git a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm index 202308614..88d3cfafd 100644 --- a/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm +++ b/lib/MIP/Cli/Mip/Analyse/Rd_dna.pm @@ -1246,6 +1246,25 @@ q{Default: hgvs, symbol, numbers, sift, polyphen, humdiv, domains, protein, ccds ) ); + option( + q{me_filter} => ( + cmd_tags => [q{Analysis recipe switch}], + documentation => q{Filter mobile elements}, + is => q{rw}, + isa => enum( [ 0, 1, 2 ] ), + ) + ); + + option( + q{me_filter_frequency_threshold} => ( + cmd_tags => [q{Default: 0.1}], + documentation => + q{Threshold frequency for variants to be filtered out, set to 0 to disable}, + is => q{rw}, + isa => Num, + ) + ); + option( q{gatk_haplotypecaller} => ( cmd_tags => [q{Analysis recipe switch}], diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm new file mode 100644 index 000000000..9bc064245 --- /dev/null +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -0,0 +1,499 @@ +package MIP::Recipes::Analysis::Me_filter; + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile devnull }; +use List::Util qw{ first}; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use utf8; +use warnings; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw{ :all }; +use Readonly; + +## MIPs lib/ +use MIP::Constants qw{ %ANALYSIS $DOT $DOUBLE_QUOTE $LOG_NAME $NEWLINE $PIPE $SPACE $UNDERSCORE }; + +BEGIN { + + require Exporter; + use base qw{ Exporter }; + + # Functions and variables which can be optionally exported + our @EXPORT_OK = qw{ analysis_me_filter }; + +} + +## Constants +Readonly my $ANNOTATION_DISTANCE => $ANALYSIS{ANNOTATION_DISTANCE}; +Readonly my $ANNOTATION_DISTANCE_MT => $ANALYSIS{ANNOTATION_DISTANCE_MT}; +Readonly my $MT_CONTIG_ID_REGEXP => q{MT | M | chrM}; + +sub analysis_me_filter { + +## Function : Vcfparser performs parsing of varianteffectpredictor annotated ws SV variants +## Returns : +## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} +## : $case_id => Family id +## : $filehandle => Sbatch filehandle to write to +## : $file_info_href => File info hash {REF} +## : $job_id_href => Job id hash {REF} +## : $parameter_href => Parameter hash {REF} +## : $profile_base_command => Submission profile base command +## : $recipe_name => Program name +## : $sample_info_href => Info on samples and case hash {REF} +## : $temp_directory => Temporary directory +## : $xargs_file_counter => The xargs file counter + + my ($arg_href) = @_; + + ## Flatten argument(s) + my $active_parameter_href; + my $file_info_href; + my $job_id_href; + my $parameter_href; + my $recipe_name; + my $sample_info_href; + + ## Default(s) + my $case_id; + my $profile_base_command; + my $temp_directory; + my $xargs_file_counter; + + my $tmpl = { + active_parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$active_parameter_href, + strict_type => 1, + }, + case_id => { + default => $arg_href->{active_parameter_href}{case_id}, + store => \$case_id, + strict_type => 1, + }, + file_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$file_info_href, + strict_type => 1, + }, + job_id_href => { + default => {}, + defined => 1, + required => 1, + store => \$job_id_href, + strict_type => 1, + }, + parameter_href => { + default => {}, + defined => 1, + required => 1, + store => \$parameter_href, + strict_type => 1, + }, + profile_base_command => { + default => q{sbatch}, + store => \$profile_base_command, + strict_type => 1, + }, + recipe_name => { + defined => 1, + required => 1, + store => \$recipe_name, + strict_type => 1, + }, + sample_info_href => { + default => {}, + defined => 1, + required => 1, + store => \$sample_info_href, + strict_type => 1, + }, + temp_directory => { + default => $arg_href->{active_parameter_href}{temp_directory}, + store => \$temp_directory, + strict_type => 1, + }, + xargs_file_counter => { + allow => qr{ \A\d+\z }xsm, + default => 0, + store => \$xargs_file_counter, + strict_type => 1, + }, + }; + + check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; + + use MIP::Analysis qw{ get_vcf_parser_analysis_suffix }; + use MIP::File_info qw{ get_io_files parse_io_outfiles }; + use MIP::List qw{ check_element_exist_hash_of_array }; + use MIP::Processmanagement::Processes qw{ submit_recipe }; + use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; + use MIP::Program::Htslib qw{ htslib_tabix }; + use MIP::Program::Mip qw{ mip_vcfparser }; + use MIP::Recipe qw{ parse_recipe_prerequisites }; + use MIP::Sample_info + qw{ set_file_path_to_store set_gene_panel set_recipe_outfile_in_sample_info }; + use MIP::Script::Setup_script qw{ setup_script}; + + ### PREPROCESSING: + + ## Retrieve logger object + my $log = Log::Log4perl->get_logger($LOG_NAME); + + ## Unpack parameters + my %io = get_io_files( + { + id => $case_id, + file_info_href => $file_info_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + stream => q{in}, + temp_directory => $temp_directory, + } + ); + my $infile_name_prefix = $io{in}{file_name_prefix}; + my $infile_path = $io{in}{file_path}; + + my %recipe = parse_recipe_prerequisites( + { + active_parameter_href => $active_parameter_href, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + } + ); + + my @vcfparser_analysis_types = get_vcf_parser_analysis_suffix( + { + vcfparser_outfile_count => $active_parameter_href->{sv_vcfparser_outfile_count}, + } + ); + + ## Set and get the io files per chain, id and stream + my @set_outfile_name_prefixes = + map { $infile_name_prefix . $_ } @vcfparser_analysis_types; + %io = ( + %io, + parse_io_outfiles( + { + chain_id => $recipe{job_id_chain}, + id => $case_id, + file_info_href => $file_info_href, + file_name_prefix => $infile_name_prefix, + iterators_ref => \@vcfparser_analysis_types, + outdata_dir => $active_parameter_href->{outdata_dir}, + parameter_href => $parameter_href, + recipe_name => $recipe_name, + temp_directory => $temp_directory, + } + ) + ); + my $outdir_path = $io{out}{dir_path}; + my $outfile_path_prefix = $io{out}{file_path_prefix}; + my @outfile_paths = @{ $io{out}{file_paths} }; + my %outfile_path = %{ $io{out}{file_path_href} }; + my @outfile_suffixes = @{ $io{out}{file_suffixes} }; + + ## Filehandles + my $filehandle = IO::Handle->new(); + + ## Creates recipe directories (info & data & script), recipe script filenames and writes sbatch header + my ( $recipe_file_path, $recipe_info_path ) = setup_script( + { + active_parameter_href => $active_parameter_href, + core_number => $recipe{core_number}, + directory_id => $case_id, + filehandle => $filehandle, + job_id_href => $job_id_href, + memory_allocation => $recipe{memory}, + process_time => $recipe{time}, + recipe_directory => $recipe_name, + recipe_name => $recipe_name, + temp_directory => $temp_directory, + } + ); + + my $mt_contig = first { / $MT_CONTIG_ID_REGEXP /xms } @{ $file_info_href->{contigs} }; + my @select_feature_annotation_columns; + my $select_file; + my $select_file_matching_column; + my $non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffixes[0]; + my $select_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffixes[1]; + my $mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffixes[0]; + my $select_mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffixes[1]; + + if ( $active_parameter_href->{sv_vcfparser_select_file} ) { + + ## List of genes to analyse separately + $select_file = catfile( $active_parameter_href->{sv_vcfparser_select_file} ); + + ## Column of HGNC Symbol in select file ("-sf") + $select_file_matching_column = + $active_parameter_href->{sv_vcfparser_select_file_matching_column}; + + if ( exists $active_parameter_href->{sv_vcfparser_select_feature_annotation_columns} ) { + + @select_feature_annotation_columns = + @{ $active_parameter_href->{sv_vcfparser_select_feature_annotation_columns} }; + } + } + + my $exclude_filter = _build_bcftools_frequency_filter( + { + frequency_threshold => $active_parameter_href->{me_filter_frequency_threshold}, + me_annotate_query_info_href => $active_parameter_href->{me_annotate_query_files}, + } + ); + + say {$filehandle} q{## Vcfparser non MT contigs}; + bcftools_view( + { + exclude => $exclude_filter, + filehandle => $filehandle, + infile_path => $infile_path, + targets => q{^} . $mt_contig, + } + ); + print {$filehandle} $PIPE . $SPACE; + + mip_vcfparser( + { + filehandle => $filehandle, + infile_path => catfile( dirname( devnull() ), q{stdin} ), + log_file_path => catfile( $outdir_path, q{vcfparser} . q{.log} ), + padding => $ANNOTATION_DISTANCE, + parse_vep => $active_parameter_href->{sv_varianteffectpredictor}, + per_gene => $active_parameter_href->{sv_vcfparser_per_gene}, + pli_values_file_path => $active_parameter_href->{vcfparser_pli_score_file}, + range_feature_annotation_columns_ref => + \@{ $active_parameter_href->{sv_vcfparser_range_feature_annotation_columns} }, + range_feature_file_path => $active_parameter_href->{sv_vcfparser_range_feature_file}, + select_feature_annotation_columns_ref => \@select_feature_annotation_columns, + stdoutfile_path => $non_mt_outfile_path, + select_feature_file_path => $select_non_mt_outfile_path, + select_feature_matching_column => $select_file_matching_column, + select_outfile => $select_non_mt_outfile_path, + variant_type => q{sv}, + } + ); + say {$filehandle} $NEWLINE; + + say {$filehandle} q{## Vcfparser MT}; + bcftools_view( + { + exclude => $exclude_filter, + filehandle => $filehandle, + infile_path => $infile_path, + regions_ref => [$mt_contig], + } + ); + print {$filehandle} $PIPE . $SPACE; + + mip_vcfparser( + { + filehandle => $filehandle, + infile_path => catfile( dirname( devnull() ), q{stdin} ), + log_file_path => catfile( $outdir_path, q{vcfparser} . q{.log} ), + padding => $ANNOTATION_DISTANCE_MT, + parse_vep => $active_parameter_href->{me_varianteffectpredictor}, + per_gene => $active_parameter_href->{sv_vcfparser_per_gene}, + pli_values_file_path => $active_parameter_href->{vcfparser_pli_score_file}, + range_feature_annotation_columns_ref => + \@{ $active_parameter_href->{sv_vcfparser_range_feature_annotation_columns} }, + range_feature_file_path => $active_parameter_href->{sv_vcfparser_range_feature_file}, + select_feature_annotation_columns_ref => \@select_feature_annotation_columns, + stdoutfile_path => $mt_outfile_path, + select_feature_file_path => $select_mt_outfile_path, + select_feature_matching_column => $select_file_matching_column, + select_outfile => $select_mt_outfile_path, + variant_type => q{sv}, + } + ); + say {$filehandle} $NEWLINE; + + ### Special case: replace all clinical mitochondrial variants with research mitochondrial variants + $select_mt_outfile_path = + $active_parameter_href->{sv_vcfparser_add_all_mt_var} + ? $mt_outfile_path + : $select_mt_outfile_path; + + ## Concatenate MT variants with the rest + my @file_sets = ( + { + files_to_concat_ref => [ $non_mt_outfile_path, $mt_outfile_path ], + outfile => $outfile_paths[0], + }, + { + files_to_concat_ref => [ $select_non_mt_outfile_path, $select_mt_outfile_path ], + outfile => $outfile_paths[1], + }, + ); + + foreach my $outfile_set (@file_sets) { + + bcftools_concat( + { + allow_overlaps => 1, + filehandle => $filehandle, + infile_paths_ref => $outfile_set->{files_to_concat_ref}, + output_type => q{u}, + threads => $recipe{core_number}, + } + ); + print {$filehandle} $PIPE . $SPACE; + + bcftools_sort( + { + filehandle => $filehandle, + output_type => q{z}, + outfile_path => $outfile_set->{outfile}, + temp_directory => $temp_directory, + } + ); + say {$filehandle} $NEWLINE; + + htslib_tabix( + { + filehandle => $filehandle, + force => 1, + infile_path => $outfile_set->{outfile}, + } + ); + say {$filehandle} $NEWLINE; + } + + close $filehandle; + + if ( $recipe{mode} == 1 ) { + + ## Collect QC metadata info for later use + set_recipe_outfile_in_sample_info( + { + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + path => $outfile_paths[0], + } + ); + + my %gene_panels = ( + range_file => q{sv_vcfparser_range_feature_file}, + select_file => q{sv_vcfparser_select_file}, + ); + GENE_PANEL: + while ( my ( $gene_panel_key, $gene_panel_file ) = each %gene_panels ) { + + ## Collect databases(s) from a potentially merged gene panel file and adds them to sample_info + set_gene_panel( + { + aggregate_gene_panel_file => $active_parameter_href->{$gene_panel_file}, + aggregate_gene_panels_key => $gene_panel_key, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + } + ); + } + + OUTFILE: + while ( my ( $file_key, $outfile ) = each %outfile_path ) { + + my $metafile_tag = $file_key =~ m/selected/xms ? q{clinical} : q{research}; + + set_file_path_to_store( + { + format => q{vcf}, + id => $case_id, + path => $outfile, + path_index => $outfile . $DOT . q{tbi}, + recipe_name => $recipe_name, + sample_info_href => $sample_info_href, + tag => $metafile_tag, + } + ); + } + + submit_recipe( + { + base_command => $profile_base_command, + case_id => $case_id, + dependency_method => q{sample_to_case}, + job_id_chain => $recipe{job_id_chain}, + job_id_href => $job_id_href, + job_reservation_name => $active_parameter_href->{job_reservation_name}, + log => $log, + max_parallel_processes_count_href => + $file_info_href->{max_parallel_processes_count}, + recipe_file_path => $recipe_file_path, + sample_ids_ref => \@{ $active_parameter_href->{sample_ids} }, + submission_profile => $active_parameter_href->{submission_profile}, + } + ); + } + return 1; +} + +sub _build_bcftools_frequency_filter { + +## Function : Build the frequency filter command +## Returns : +## Arguments: $frequency_threshold => Frequency annotation to use in filtering +## : $me_annotate_query_info_href => SVDB query files used to annotate ME {REF} + + my ($arg_href) = @_; + + ## Flatten argument(s) + my $frequency_threshold; + my $me_annotate_query_info_href; + + my $tmpl = { + frequency_threshold => { + required => 1, + store => \$frequency_threshold, + strict_type => 1, + }, + me_annotate_query_info_href => { + default => {}, + required => 1, + store => \$me_annotate_query_info_href, + } + }; + + check( $tmpl, $arg_href, 1 ) or croak q{Could not parse arguments!}; + + ## Skip if no threshold is set + return if ( not $frequency_threshold ); + + ## Find annotations to use for filtering + my @frequency_filters; + my $frequency_expression = $SPACE . q{>} . $SPACE . $frequency_threshold; + + ANNOTATION: + while ( my ( $annotation_file, $annotation_tags ) = each %{$me_annotate_query_info_href} ) { + + my ( $tag_prefix, $fq_suffix, $cnt_suffix, $file_fq_tag, $file_cnt_tag, $use_in_filter ) = + split /[|]/sxm, $annotation_tags; + + next ANNOTATION if ( not $use_in_filter ); + + push @frequency_filters, q{INFO/} . $tag_prefix . $file_fq_tag . $frequency_expression; + } + + ## build filter + my $exclude_filter = + $DOUBLE_QUOTE + . join( $SPACE . $PIPE . $SPACE . q{INFO/}, @frequency_filters ) + . $DOUBLE_QUOTE; + + return $exclude_filter; +} + +1; diff --git a/lib/MIP/Recipes/Analysis/Vep.pm b/lib/MIP/Recipes/Analysis/Vep.pm index 275416842..6089a97c4 100644 --- a/lib/MIP/Recipes/Analysis/Vep.pm +++ b/lib/MIP/Recipes/Analysis/Vep.pm @@ -43,6 +43,7 @@ Readonly my $ANNOTATION_DISTANCE => $ANALYSIS{ANNOTATION_DISTANCE}; Readonly my $ANNOTATION_DISTANCE_MT => $ANALYSIS{ANNOTATION_DISTANCE_MT}; Readonly my $BUFFER_SIZE => 20_000; Readonly my $BUFFER_SIZE_SV => 100; +Readonly my $CONTIG_20 => 20; sub analysis_vep_wgs { @@ -1626,8 +1627,8 @@ sub analysis_vep_me { bcftools_view( { filehandle => $filehandle, - targets => q{^} . $mt_contig_ref->[0], infile_path => $infile_path, + targets => q{^} . $mt_contig_ref->[0], } ); print {$filehandle} $PIPE . $SPACE; @@ -1665,7 +1666,7 @@ sub analysis_vep_me { { filehandle => $filehandle, infile_path => $infile_path, - regions_ref => [ $contigs_ref->[20], $mt_contig_ref->[0] ], + regions_ref => [ $contigs_ref->[$CONTIG_20], $mt_contig_ref->[0] ], } ); print {$filehandle} $PIPE . $SPACE; diff --git a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm index c0d9482a4..087ef99ce 100644 --- a/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm +++ b/lib/MIP/Recipes/Pipeline/Analyse_rd_dna.pm @@ -130,7 +130,7 @@ sub parse_rd_dna { Readonly my @ONLY_WGS_VARIANT_CALLER_RECIPES => qw{ cnvnator_ar delly_reformat tiddit }; Readonly my @ONLY_WGS_RECIPIES => qw{ chromograph_rhoviz cnvnator_ar delly_call delly_reformat expansionhunter - gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_annotate me_merge_bam me_merge_vcfs + gatk_collectreadcounts gatk_denoisereadcounts gens_generatedata me_annotate me_filter me_merge_bam me_merge_vcfs me_varianteffectpredictor mitodel retroseq samtools_subsample_mt smncopynumbercaller star_caller telomerecat_ar tiddit }; Readonly my @REMOVE_CONFIG_KEYS => qw{ associated_recipe }; @@ -475,7 +475,8 @@ sub pipeline_analyse_rd_dna { use MIP::Recipes::Analysis::Gzip_fastq qw{ analysis_gzip_fastq }; use MIP::Recipes::Analysis::Manta qw{ analysis_manta }; use MIP::Recipes::Analysis::Markduplicates qw{ analysis_markduplicates }; - use MIP::Recipes::Analysis::Me_annotate qw{ analysis_me_annotate}; + use MIP::Recipes::Analysis::Me_annotate qw{ analysis_me_annotate }; + use MIP::Recipes::Analysis::Me_filter qw{ analysis_me_filter }; use MIP::Recipes::Analysis::Mip_qccollect qw{ analysis_mip_qccollect }; use MIP::Recipes::Analysis::Mip_vcfparser qw{ analysis_mip_vcfparser }; use MIP::Recipes::Analysis::Mip_vercollect qw{ analysis_mip_vercollect }; @@ -591,6 +592,7 @@ sub pipeline_analyse_rd_dna { manta => \&analysis_manta, markduplicates => \&analysis_markduplicates, me_annotate => \&analysis_me_annotate, + me_filter => \&analysis_me_filter, me_merge_bam => \&analysis_samtools_merge_panel, me_merge_vcfs => \&analysis_me_merge_vcfs, me_varianteffectpredictor => \&analysis_vep_me, diff --git a/t/analysis_me_filter.t b/t/analysis_me_filter.t new file mode 100644 index 000000000..59ddfff1a --- /dev/null +++ b/t/analysis_me_filter.t @@ -0,0 +1,117 @@ +#!/usr/bin/env perl + +use 5.026; +use Carp; +use charnames qw{ :full :short }; +use English qw{ -no_match_vars }; +use File::Basename qw{ dirname }; +use File::Spec::Functions qw{ catdir catfile }; +use FindBin qw{ $Bin }; +use open qw{ :encoding(UTF-8) :std }; +use Params::Check qw{ allow check last_error }; +use Test::More; +use utf8; +use warnings qw{ FATAL utf8 }; + +## CPANM +use autodie qw { :all }; +use Modern::Perl qw{ 2018 }; +use Readonly; + +## MIPs lib/ +use lib catdir( dirname($Bin), q{lib} ); +use MIP::Constants qw{ $COLON $COMMA $SPACE }; +use MIP::Test::Fixtures qw{ test_add_io_for_recipe test_log test_mip_hashes }; + +BEGIN { + + use MIP::Test::Fixtures qw{ test_import }; + +### Check all internal dependency modules and imports +## Modules with import + my %perl_module = ( + q{MIP::Recipes::Analysis::Me_filter} => [qw{ analysis_me_filter }], + q{MIP::Test::Fixtures} => [qw{ test_add_io_for_recipe test_log test_mip_hashes }], + ); + + test_import( { perl_module_href => \%perl_module, } ); +} + +use MIP::Recipes::Analysis::Me_filter qw{ analysis_me_filter }; + +diag( q{Test analysis_me_filter from Me_filter.pm} + . $COMMA + . $SPACE . q{Perl} + . $SPACE + . $PERL_VERSION + . $SPACE + . $EXECUTABLE_NAME ); + +Readonly my $THRESHOLD => 0.05; + +my $log = test_log( { log_name => q{MIP}, no_screen => 1, } ); + +## Given analysis parameters +my $recipe_name = q{me_filter}; +my $slurm_mock_cmd = catfile( $Bin, qw{ data modules slurm-mock.pl } ); + +my %active_parameter = test_mip_hashes( + { + mip_hash_name => q{active_parameter}, + recipe_name => $recipe_name, + } +); +$active_parameter{$recipe_name} = 1; +$active_parameter{recipe_core_number}{$recipe_name} = 1; +$active_parameter{recipe_time}{$recipe_name} = 1; +my $case_id = $active_parameter{case_id}; +$active_parameter{sv_vcfparser_outfile_count} = 2; +@{ $active_parameter{sv_vcfparser_select_feature_annotation_columns} } = ( 1, 2 ); +$active_parameter{sv_vcfparser_select_file} = + catfile( $Bin, qw{ data 643594-miptest aggregated_gene_panel_test.txt } ); +$active_parameter{me_filter_frequency_threshold} = $THRESHOLD; + +my %file_info = test_mip_hashes( + { + mip_hash_name => q{file_info}, + recipe_name => $recipe_name, + } +); + +my %job_id; +my %parameter = test_mip_hashes( + { + mip_hash_name => q{recipe_parameter}, + recipe_name => $recipe_name, + } +); + +test_add_io_for_recipe( + { + file_info_href => \%file_info, + id => $case_id, + parameter_href => \%parameter, + recipe_name => $recipe_name, + step => q{vcf}, + } +); + +my %sample_info; + +my $is_ok = analysis_me_filter( + { + active_parameter_href => \%active_parameter, + case_id => $case_id, + file_info_href => \%file_info, + job_id_href => \%job_id, + parameter_href => \%parameter, + profile_base_command => $slurm_mock_cmd, + recipe_name => $recipe_name, + sample_info_href => \%sample_info, + } +); + +## Then return TRUE +ok( $is_ok, q{ Executed analysis recipe } . $recipe_name ); + +done_testing(); From 72a0fa070e10f732fa2e50a4e51103d3392673d2 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 12:51:37 +0200 Subject: [PATCH 2/8] fixing filter expression --- lib/MIP/Recipes/Analysis/Me_filter.pm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index 9bc064245..cefb48e5a 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -489,9 +489,7 @@ sub _build_bcftools_frequency_filter { ## build filter my $exclude_filter = - $DOUBLE_QUOTE - . join( $SPACE . $PIPE . $SPACE . q{INFO/}, @frequency_filters ) - . $DOUBLE_QUOTE; + $DOUBLE_QUOTE . join( $SPACE . $PIPE . $SPACE, @frequency_filters ) . $DOUBLE_QUOTE; return $exclude_filter; } From 98b41a54ee650c4ede2ce601a787bf396a0e4d6d Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 13:20:56 +0200 Subject: [PATCH 3/8] add correct select file --- lib/MIP/Recipes/Analysis/Me_filter.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index cefb48e5a..e55cea4e5 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -280,7 +280,7 @@ sub analysis_me_filter { range_feature_file_path => $active_parameter_href->{sv_vcfparser_range_feature_file}, select_feature_annotation_columns_ref => \@select_feature_annotation_columns, stdoutfile_path => $non_mt_outfile_path, - select_feature_file_path => $select_non_mt_outfile_path, + select_feature_file_path => $select_file, select_feature_matching_column => $select_file_matching_column, select_outfile => $select_non_mt_outfile_path, variant_type => q{sv}, From c6e988c23f8c7149a8a0495358fb2f13528abe08 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 13:33:11 +0200 Subject: [PATCH 4/8] fixinng select file for 2nd pass --- lib/MIP/Recipes/Analysis/Me_filter.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index e55cea4e5..63a10e9e7 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -313,7 +313,7 @@ sub analysis_me_filter { range_feature_file_path => $active_parameter_href->{sv_vcfparser_range_feature_file}, select_feature_annotation_columns_ref => \@select_feature_annotation_columns, stdoutfile_path => $mt_outfile_path, - select_feature_file_path => $select_mt_outfile_path, + select_feature_file_path => $select_file, select_feature_matching_column => $select_file_matching_column, select_outfile => $select_mt_outfile_path, variant_type => q{sv}, From b74a44f4f9f38b7a3559c6f39d45f32e250e7d88 Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 17:18:05 +0200 Subject: [PATCH 5/8] remove gz ending from vcfparser output --- lib/MIP/Recipes/Analysis/Me_filter.pm | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index 63a10e9e7..ec90a3525 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -18,7 +18,8 @@ use autodie qw{ :all }; use Readonly; ## MIPs lib/ -use MIP::Constants qw{ %ANALYSIS $DOT $DOUBLE_QUOTE $LOG_NAME $NEWLINE $PIPE $SPACE $UNDERSCORE }; +use MIP::Constants + qw{ %ANALYSIS $DOT $DOUBLE_QUOTE $EMPTY_STR $LOG_NAME $NEWLINE $PIPE $SPACE $UNDERSCORE }; BEGIN { @@ -137,6 +138,7 @@ sub analysis_me_filter { use MIP::Analysis qw{ get_vcf_parser_analysis_suffix }; use MIP::File_info qw{ get_io_files parse_io_outfiles }; use MIP::List qw{ check_element_exist_hash_of_array }; + use MIP::File::Path qw{ remove_file_path_suffix }; use MIP::Processmanagement::Processes qw{ submit_recipe }; use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; use MIP::Program::Htslib qw{ htslib_tabix }; @@ -227,10 +229,12 @@ sub analysis_me_filter { my @select_feature_annotation_columns; my $select_file; my $select_file_matching_column; - my $non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffixes[0]; - my $select_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $outfile_suffixes[1]; - my $mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffixes[0]; - my $select_mt_outfile_path = $outfile_path_prefix . q{_MT} . $outfile_suffixes[1]; + my ( $no_gz_file_ending, $select_no_gz_file_ending ) = + map { s/[.]gz\z/$EMPTY_STR/r } @outfile_suffixes; + my $non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $no_gz_file_ending; + my $select_non_mt_outfile_path = $outfile_path_prefix . q{_non_MT} . $select_no_gz_file_ending; + my $mt_outfile_path = $outfile_path_prefix . q{_MT} . $no_gz_file_ending; + my $select_mt_outfile_path = $outfile_path_prefix . q{_MT} . $select_no_gz_file_ending; if ( $active_parameter_href->{sv_vcfparser_select_file} ) { From 7a35a52f751580f29589291dd04d903890c807af Mon Sep 17 00:00:00 2001 From: jemten Date: Mon, 24 Apr 2023 18:31:39 +0200 Subject: [PATCH 6/8] compress and index vcfparser output --- lib/MIP/Recipes/Analysis/Me_filter.pm | 40 ++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index ec90a3525..eea94e29c 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -141,7 +141,7 @@ sub analysis_me_filter { use MIP::File::Path qw{ remove_file_path_suffix }; use MIP::Processmanagement::Processes qw{ submit_recipe }; use MIP::Program::Bcftools qw{ bcftools_concat bcftools_sort bcftools_view }; - use MIP::Program::Htslib qw{ htslib_tabix }; + use MIP::Program::Htslib qw{ htslib_bgzip htslib_tabix }; use MIP::Program::Mip qw{ mip_vcfparser }; use MIP::Recipe qw{ parse_recipe_prerequisites }; use MIP::Sample_info @@ -331,18 +331,50 @@ sub analysis_me_filter { ? $mt_outfile_path : $select_mt_outfile_path; + my @files_to_compress_and_index = ( + $mt_outfile_path, $non_mt_outfile_path, + $select_mt_outfile_path, $select_non_mt_outfile_path, + ); + + say {$filehandle} q{## Compress and index vcfparser output}; + FILE_TO_COMPRESS_AND_INDEX: + foreach my $file_to_compress_and_index (@files_to_compress_and_index) { + + htslib_bgzip( + { + infile_path => $file_to_compress_and_index, + force => 1, + filehandle => $filehandle, + threads => $recipe{core_number}, + } + ); + say {$filehandle} $NEWLINE; + + htslib_tabix( + { + filehandle => $filehandle, + force => 1, + infile_path => $file_to_compress_and_index . q{.gz}, + } + ); + say {$filehandle} $NEWLINE; + } + ## Concatenate MT variants with the rest my @file_sets = ( { - files_to_concat_ref => [ $non_mt_outfile_path, $mt_outfile_path ], + files_to_concat_ref => [ $non_mt_outfile_path . q{.gz}, $mt_outfile_path . q{.gz} ], outfile => $outfile_paths[0], }, { - files_to_concat_ref => [ $select_non_mt_outfile_path, $select_mt_outfile_path ], - outfile => $outfile_paths[1], + files_to_concat_ref => + [ $select_non_mt_outfile_path . q{.gz}, $select_mt_outfile_path . q{.gz} ], + outfile => $outfile_paths[1], }, ); + say {$filehandle} q{## Concatenate, sort and index}; + OUTFILE_SET: foreach my $outfile_set (@file_sets) { bcftools_concat( From 47fb949fbde806682376d44d62933f257934f778 Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 25 Apr 2023 09:07:18 +0200 Subject: [PATCH 7/8] compress before renaming --- lib/MIP/Recipes/Analysis/Me_filter.pm | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index eea94e29c..b5b84b774 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -325,18 +325,13 @@ sub analysis_me_filter { ); say {$filehandle} $NEWLINE; - ### Special case: replace all clinical mitochondrial variants with research mitochondrial variants - $select_mt_outfile_path = - $active_parameter_href->{sv_vcfparser_add_all_mt_var} - ? $mt_outfile_path - : $select_mt_outfile_path; + say {$filehandle} q{## Compress and index vcfparser output}; my @files_to_compress_and_index = ( $mt_outfile_path, $non_mt_outfile_path, $select_mt_outfile_path, $select_non_mt_outfile_path, ); - say {$filehandle} q{## Compress and index vcfparser output}; FILE_TO_COMPRESS_AND_INDEX: foreach my $file_to_compress_and_index (@files_to_compress_and_index) { @@ -360,6 +355,12 @@ sub analysis_me_filter { say {$filehandle} $NEWLINE; } + ### Special case: replace all clinical mitochondrial variants with research mitochondrial variants + $select_mt_outfile_path = + $active_parameter_href->{sv_vcfparser_add_all_mt_var} + ? $mt_outfile_path + : $select_mt_outfile_path; + ## Concatenate MT variants with the rest my @file_sets = ( { @@ -374,6 +375,7 @@ sub analysis_me_filter { ); say {$filehandle} q{## Concatenate, sort and index}; + OUTFILE_SET: foreach my $outfile_set (@file_sets) { From 3849464450558499c2ac72c6c6c465b622f9b69d Mon Sep 17 00:00:00 2001 From: jemten Date: Tue, 25 Apr 2023 11:38:58 +0200 Subject: [PATCH 8/8] Updating recipe description --- lib/MIP/Recipes/Analysis/Me_filter.pm | 2 +- lib/MIP/Recipes/Analysis/Mip_vcfparser.pm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/MIP/Recipes/Analysis/Me_filter.pm b/lib/MIP/Recipes/Analysis/Me_filter.pm index b5b84b774..a4a970b4d 100644 --- a/lib/MIP/Recipes/Analysis/Me_filter.pm +++ b/lib/MIP/Recipes/Analysis/Me_filter.pm @@ -38,7 +38,7 @@ Readonly my $MT_CONTIG_ID_REGEXP => q{MT | M | chrM}; sub analysis_me_filter { -## Function : Vcfparser performs parsing of varianteffectpredictor annotated ws SV variants +## Function : Filters mobile elements on frequency and split the vcf in aclinical and research vcf using vcfparser. ## Returns : ## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} ## : $case_id => Family id diff --git a/lib/MIP/Recipes/Analysis/Mip_vcfparser.pm b/lib/MIP/Recipes/Analysis/Mip_vcfparser.pm index b46a1adca..5c4b216bf 100644 --- a/lib/MIP/Recipes/Analysis/Mip_vcfparser.pm +++ b/lib/MIP/Recipes/Analysis/Mip_vcfparser.pm @@ -1131,7 +1131,7 @@ sub analysis_mip_vcfparser_sv_wes { sub analysis_mip_vcfparser_sv_wgs { -## Function : Vcfparser performs parsing of varianteffectpredictor annotated ws SV variants +## Function : Vcfparser performs parsing of varianteffectpredictor annotated wgs SV variants ## Returns : ## Arguments: $active_parameter_href => Active parameters for this analysis hash {REF} ## : $case_id => Family id