From 1bc625f447cb4809a407ed768f6c1fab53113669 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Wed, 25 Jan 2023 14:18:33 +0000 Subject: [PATCH 1/8] initial commit --- nextflow/nf_config/nextflow.config | 2 ++ nextflow/nf_modules/run_vep_chros.nf | 2 +- nextflow/nf_modules/split_into_chros.nf | 17 ++++++++++++++++- nextflow/workflows/run_vep.nf | 4 ++-- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/nextflow/nf_config/nextflow.config b/nextflow/nf_config/nextflow.config index 184740511..b975d9ee2 100644 --- a/nextflow/nf_config/nextflow.config +++ b/nextflow/nf_config/nextflow.config @@ -38,3 +38,5 @@ profiles { //params.chros_file = "$PWD/examples/clinvar-testset/chros.txt" params.vep_config = "$PWD/nf_config/vep.ini" params.output_prefix = "" +params.split_by_region = 0 +params.region_size = 1000000 diff --git a/nextflow/nf_modules/run_vep_chros.nf b/nextflow/nf_modules/run_vep_chros.nf index e3ce4635d..33a027ae7 100644 --- a/nextflow/nf_modules/run_vep_chros.nf +++ b/nextflow/nf_modules/run_vep_chros.nf @@ -45,7 +45,7 @@ process chrosVEP { } else { """ - vep -i ${vcfFile} -o ${prefix}-${vcfFile} --vcf --compress_output bgzip --format vcf --config ${vep_config} + vep -i ${vcfFile} -o ${prefix}-${vcfFile} --vcf --compress_output bgzip --format vcf --config ${vep_config} tabix -p vcf ${prefix}-${vcfFile} """ } diff --git a/nextflow/nf_modules/split_into_chros.nf b/nextflow/nf_modules/split_into_chros.nf index ef96bc46b..c6d936b37 100644 --- a/nextflow/nf_modules/split_into_chros.nf +++ b/nextflow/nf_modules/split_into_chros.nf @@ -28,13 +28,28 @@ process splitVCF { val(chr) path(vcf) path(vcf_index) + val(split_by_region) + val(region_size) output: - tuple path("${prefix}.${chr}.vcf.gz"), path("${prefix}.${chr}.vcf.gz.tbi") + tuple path("${prefix}.${chr}.*vcf.gz"), path("${prefix}.${chr}.*vcf.gz.tbi"), emit: files script: """ bcftools view -r ${chr} ${vcf} -o ${prefix}.${chr}.vcf.gz -O z bcftools index -t ${prefix}.${chr}.vcf.gz + + if [[ 
${split_by_region} ]]; then + bcftools query -f'%CHROM\t%POS\n' ${prefix}.${chr}.vcf.gz | split -l ${region_size} + + for file in x*; do + bcftools view -T \${file} -Oz ${prefix}.${chr}.vcf.gz > ${prefix}.${chr}.\${file}.vcf.gz + bcftools index -t ${prefix}.${chr}.\${file}.vcf.gz + done + + rm ${prefix}.${chr}.vcf.gz + rm ${prefix}.${chr}.vcf.gz.tbi + rm x* + fi """ } diff --git a/nextflow/workflows/run_vep.nf b/nextflow/workflows/run_vep.nf index caf793d0c..e8c526032 100644 --- a/nextflow/workflows/run_vep.nf +++ b/nextflow/workflows/run_vep.nf @@ -97,7 +97,7 @@ log.info params.chros readChrVCF(params.vcf, vcf_index) chr = readChrVCF.out.splitText().map{it -> it.trim()} } - splitVCF(chr, params.vcf, vcf_index) - chrosVEP(splitVCF.out, params.vep_config) + splitVCF(chr, params.vcf, vcf_index, params.split_by_region, params.region_size) + chrosVEP(splitVCF.out.files.transpose(), params.vep_config) mergeVCF(chrosVEP.out.vcfFile.collect(), chrosVEP.out.indexFile.collect()) } From d70cb05a635ad4fd8df3036c95c5f29759371d47 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Mon, 30 Jan 2023 09:26:49 +0000 Subject: [PATCH 2/8] Allow for large number of unordered files to merge --- nextflow/nf_modules/merge_chros_VCF.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nextflow/nf_modules/merge_chros_VCF.nf b/nextflow/nf_modules/merge_chros_VCF.nf index 5deecfb77..4af01cf6f 100644 --- a/nextflow/nf_modules/merge_chros_VCF.nf +++ b/nextflow/nf_modules/merge_chros_VCF.nf @@ -43,8 +43,9 @@ process mergeVCF { script: """ - bcftools concat ${ vcfFiles } -Oz -o temp-${ mergedVCF}.vcf.gz - bcftools sort -Oz temp-${ mergedVCF}.vcf.gz -o ${ mergedVCF}.vcf.gz + mkdir -p temp + bcftools concat -a ${ vcfFiles } -Oz -o temp-${ mergedVCF}.vcf.gz + bcftools sort -T temp -Oz temp-${ mergedVCF}.vcf.gz -o ${ mergedVCF}.vcf.gz bcftools index -t ${ mergedVCF}.vcf.gz """ } From 396370c1da3c1e5b24495117df817b9c33961d16 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain 
Date: Fri, 10 Feb 2023 19:26:29 +0000 Subject: [PATCH 3/8] Add support for resume --- nextflow/nf_config/nextflow.config | 4 ++-- nextflow/nf_modules/merge_chros_VCF.nf | 2 ++ nextflow/workflows/run_vep.nf | 21 +++++++++++++-------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/nextflow/nf_config/nextflow.config b/nextflow/nf_config/nextflow.config index b975d9ee2..5b5aceee8 100644 --- a/nextflow/nf_config/nextflow.config +++ b/nextflow/nf_config/nextflow.config @@ -12,9 +12,9 @@ profiles { lsf { process.executor = 'lsf' - process.memory = '5GB' + process.memory = '10GB' process.cpus = 1 - process.clusterOptions = '-R "select[mem>5000] rusage[mem=5000]" -M5000' + process.clusterOptions = '-R "select[mem>10000] rusage[mem=10000]" -M10000' singularity { enabled = true autoMounts = true diff --git a/nextflow/nf_modules/merge_chros_VCF.nf b/nextflow/nf_modules/merge_chros_VCF.nf index 4af01cf6f..cb0e1c0ec 100644 --- a/nextflow/nf_modules/merge_chros_VCF.nf +++ b/nextflow/nf_modules/merge_chros_VCF.nf @@ -32,6 +32,8 @@ process mergeVCF { cpus params.cpus container "${params.singularity_dir}/bcftools.sif" + + cache 'lenient' input: diff --git a/nextflow/workflows/run_vep.nf b/nextflow/workflows/run_vep.nf index e8c526032..035a3584b 100644 --- a/nextflow/workflows/run_vep.nf +++ b/nextflow/workflows/run_vep.nf @@ -59,12 +59,14 @@ if(check_bgzipped.exitValue()){ exit 1, "The specified VCF file is not bgzipped: ${params.vcf}" } -def sout = new StringBuilder(), serr = new StringBuilder() -check_parsing = "$params.singularity_dir/vep.sif tabix -p vcf -f $params.vcf".execute() -check_parsing.consumeProcessOutput(sout, serr) -check_parsing.waitFor() -if( serr ){ - exit 1, "The specified VCF file has issues in parsing: $serr" +if ( !params.skip_check ){ + def sout = new StringBuilder(), serr = new StringBuilder() + check_parsing = "$params.singularity_dir/vep.sif tabix -p vcf -f $params.vcf".execute() + check_parsing.consumeProcessOutput(sout, serr) + 
check_parsing.waitFor() + if( serr ){ + exit 1, "The specified VCF file has issues in parsing: $serr" + } } vcf_index = "${params.vcf}.tbi" @@ -86,7 +88,7 @@ log.info params.chros if (params.chros){ log.info 'Reading chromosome names from list' chr_str = params.chros.toString() - chr = Channel.of(chr_str.split(',')) + chr = Channel.of(chr_str.split(',')).toSortedList() } else if (params.chros_file) { log.info 'Reading chromosome names from file' @@ -97,7 +99,10 @@ log.info params.chros readChrVCF(params.vcf, vcf_index) chr = readChrVCF.out.splitText().map{it -> it.trim()} } + chr.view() splitVCF(chr, params.vcf, vcf_index, params.split_by_region, params.region_size) - chrosVEP(splitVCF.out.files.transpose(), params.vep_config) + chan = splitVCF.out.files.transpose() + chan.view() + chrosVEP(chan, params.vep_config) mergeVCF(chrosVEP.out.vcfFile.collect(), chrosVEP.out.indexFile.collect()) } From 16412312bd2721c390551674b5f6ea4fb5702028 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Mon, 13 Mar 2023 16:29:06 +0000 Subject: [PATCH 4/8] Update module and file names --- nextflow/nf_modules/{merge_chros_VCF.nf => merge_VCF.nf} | 0 nextflow/nf_modules/{read_chros_VCF.nf => read_VCF.nf} | 0 nextflow/nf_modules/{run_vep_chros.nf => run_vep.nf} | 0 nextflow/nf_modules/{split_into_chros.nf => split_VCF.nf} | 0 nextflow/workflows/run_vep.nf | 8 ++++---- 5 files changed, 4 insertions(+), 4 deletions(-) rename nextflow/nf_modules/{merge_chros_VCF.nf => merge_VCF.nf} (100%) rename nextflow/nf_modules/{read_chros_VCF.nf => read_VCF.nf} (100%) rename nextflow/nf_modules/{run_vep_chros.nf => run_vep.nf} (100%) rename nextflow/nf_modules/{split_into_chros.nf => split_VCF.nf} (100%) diff --git a/nextflow/nf_modules/merge_chros_VCF.nf b/nextflow/nf_modules/merge_VCF.nf similarity index 100% rename from nextflow/nf_modules/merge_chros_VCF.nf rename to nextflow/nf_modules/merge_VCF.nf diff --git a/nextflow/nf_modules/read_chros_VCF.nf b/nextflow/nf_modules/read_VCF.nf 
similarity index 100% rename from nextflow/nf_modules/read_chros_VCF.nf rename to nextflow/nf_modules/read_VCF.nf diff --git a/nextflow/nf_modules/run_vep_chros.nf b/nextflow/nf_modules/run_vep.nf similarity index 100% rename from nextflow/nf_modules/run_vep_chros.nf rename to nextflow/nf_modules/run_vep.nf diff --git a/nextflow/nf_modules/split_into_chros.nf b/nextflow/nf_modules/split_VCF.nf similarity index 100% rename from nextflow/nf_modules/split_into_chros.nf rename to nextflow/nf_modules/split_VCF.nf diff --git a/nextflow/workflows/run_vep.nf b/nextflow/workflows/run_vep.nf index 035a3584b..29c7ee492 100644 --- a/nextflow/workflows/run_vep.nf +++ b/nextflow/workflows/run_vep.nf @@ -17,10 +17,10 @@ params.chros="" params.chros_file="" // module imports -include { splitVCF } from '../nf_modules/split_into_chros.nf' -include { mergeVCF } from '../nf_modules/merge_chros_VCF.nf' -include { chrosVEP } from '../nf_modules/run_vep_chros.nf' -include { readChrVCF } from '../nf_modules/read_chros_VCF.nf' +include { splitVCF } from '../nf_modules/split_VCF.nf' +include { mergeVCF } from '../nf_modules/merge_VCF.nf' +include { chrosVEP } from '../nf_modules/run_vep.nf' +include { readChrVCF } from '../nf_modules/read_VCF.nf' // print usage if (params.help) { From 82410807c6e71b895b58051e190c4b7b35f141b2 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Mon, 13 Mar 2023 17:26:14 +0000 Subject: [PATCH 5/8] Update the parameter name to more sensible one --- nextflow/nf_config/nextflow.config | 3 +-- nextflow/nf_modules/split_VCF.nf | 7 +++---- nextflow/workflows/run_vep.nf | 9 +++------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/nextflow/nf_config/nextflow.config b/nextflow/nf_config/nextflow.config index 5b5aceee8..0ce8f91bc 100644 --- a/nextflow/nf_config/nextflow.config +++ b/nextflow/nf_config/nextflow.config @@ -38,5 +38,4 @@ profiles { //params.chros_file = "$PWD/examples/clinvar-testset/chros.txt" params.vep_config = 
"$PWD/nf_config/vep.ini" params.output_prefix = "" -params.split_by_region = 0 -params.region_size = 1000000 +params.bin_size = 0 diff --git a/nextflow/nf_modules/split_VCF.nf b/nextflow/nf_modules/split_VCF.nf index c6d936b37..42b51e401 100644 --- a/nextflow/nf_modules/split_VCF.nf +++ b/nextflow/nf_modules/split_VCF.nf @@ -28,8 +28,7 @@ process splitVCF { val(chr) path(vcf) path(vcf_index) - val(split_by_region) - val(region_size) + val(bin_size) output: tuple path("${prefix}.${chr}.*vcf.gz"), path("${prefix}.${chr}.*vcf.gz.tbi"), emit: files @@ -39,8 +38,8 @@ process splitVCF { bcftools view -r ${chr} ${vcf} -o ${prefix}.${chr}.vcf.gz -O z bcftools index -t ${prefix}.${chr}.vcf.gz - if [[ ${split_by_region} ]]; then - bcftools query -f'%CHROM\t%POS\n' ${prefix}.${chr}.vcf.gz | split -l ${region_size} + if [[ ${bin_size} ]]; then + bcftools query -f'%CHROM\t%POS\n' ${prefix}.${chr}.vcf.gz | split -l ${bin_size} for file in x*; do bcftools view -T \${file} -Oz ${prefix}.${chr}.vcf.gz > ${prefix}.${chr}.\${file}.vcf.gz diff --git a/nextflow/workflows/run_vep.nf b/nextflow/workflows/run_vep.nf index 29c7ee492..93f500bdd 100644 --- a/nextflow/workflows/run_vep.nf +++ b/nextflow/workflows/run_vep.nf @@ -88,7 +88,7 @@ log.info params.chros if (params.chros){ log.info 'Reading chromosome names from list' chr_str = params.chros.toString() - chr = Channel.of(chr_str.split(',')).toSortedList() + chr = Channel.of(chr_str.split(',')) } else if (params.chros_file) { log.info 'Reading chromosome names from file' @@ -99,10 +99,7 @@ log.info params.chros readChrVCF(params.vcf, vcf_index) chr = readChrVCF.out.splitText().map{it -> it.trim()} } - chr.view() - splitVCF(chr, params.vcf, vcf_index, params.split_by_region, params.region_size) - chan = splitVCF.out.files.transpose() - chan.view() - chrosVEP(chan, params.vep_config) + splitVCF(chr, params.vcf, vcf_index, params.bin_size) + chrosVEP(splitVCF.out.files.transpose(), params.vep_config) 
mergeVCF(chrosVEP.out.vcfFile.collect(), chrosVEP.out.indexFile.collect()) } From 5f296fd40f2957bcde8f9aa0d21339c448c44f19 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Mon, 13 Mar 2023 17:34:34 +0000 Subject: [PATCH 6/8] Edit Readme --- nextflow/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow/README.md b/nextflow/README.md index baab50d4b..a678a25a5 100644 --- a/nextflow/README.md +++ b/nextflow/README.md @@ -75,6 +75,8 @@ The following config files are used and can be modified depending on user requir --chros_file LIST_OF_CHROS_FILE Path to file containing list of chromosomes --cpus INT Number of CPUs to use. Default 1. --output_prefix FILENAME_PREFIX Output filename prefix. The generated output file will have name .vcf.gz + --skip_check [0,1] If set, skips the tabix index check for the input VCF; this can be done when an index file already exists. It enables the first module to load from cache if -resume is used. Default: 0 + --bin_size SIZE If given, each per-chromosome file is split further so that each resulting file contains roughly SIZE variants (the last file may contain fewer). This enables faster runs at the expense of more jobs. By default the input file is only split by chromosome (SIZE=0). Default: 0 ``` NB: File paths are expected to be absolute paths. 
From e33452ea4917e97865ced8b68293acc3cc970ce4 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Mon, 13 Mar 2023 21:42:01 +0000 Subject: [PATCH 7/8] Restore nf config value --- nextflow/nf_config/nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow/nf_config/nextflow.config b/nextflow/nf_config/nextflow.config index 0ce8f91bc..104ed6bc9 100644 --- a/nextflow/nf_config/nextflow.config +++ b/nextflow/nf_config/nextflow.config @@ -12,9 +12,9 @@ profiles { lsf { process.executor = 'lsf' - process.memory = '10GB' + process.memory = '5GB' process.cpus = 1 - process.clusterOptions = '-R "select[mem>10000] rusage[mem=10000]" -M10000' + process.clusterOptions = '-R "select[mem>5000] rusage[mem=5000]" -M5000' singularity { enabled = true autoMounts = true From 6998e7eea9d4a5fb35eefce7526118db3a0b3773 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Thu, 16 Mar 2023 09:36:02 +0000 Subject: [PATCH 8/8] Remove bcftools header line --- nextflow/nf_modules/merge_VCF.nf | 4 ++-- nextflow/nf_modules/split_VCF.nf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nextflow/nf_modules/merge_VCF.nf b/nextflow/nf_modules/merge_VCF.nf index cb0e1c0ec..fbe78b2e2 100644 --- a/nextflow/nf_modules/merge_VCF.nf +++ b/nextflow/nf_modules/merge_VCF.nf @@ -46,8 +46,8 @@ process mergeVCF { script: """ mkdir -p temp - bcftools concat -a ${ vcfFiles } -Oz -o temp-${ mergedVCF}.vcf.gz + bcftools concat --no-version -a ${ vcfFiles } -Oz -o temp-${ mergedVCF}.vcf.gz bcftools sort -T temp -Oz temp-${ mergedVCF}.vcf.gz -o ${ mergedVCF}.vcf.gz - bcftools index -t ${ mergedVCF}.vcf.gz + bcftools index -t ${ mergedVCF}.vcf.gz """ } diff --git a/nextflow/nf_modules/split_VCF.nf b/nextflow/nf_modules/split_VCF.nf index 42b51e401..3c355c6dd 100644 --- a/nextflow/nf_modules/split_VCF.nf +++ b/nextflow/nf_modules/split_VCF.nf @@ -35,14 +35,14 @@ process splitVCF { script: """ - bcftools view -r ${chr} ${vcf} -o ${prefix}.${chr}.vcf.gz -O 
z + bcftools view --no-version -r ${chr} ${vcf} -o ${prefix}.${chr}.vcf.gz -O z bcftools index -t ${prefix}.${chr}.vcf.gz if [[ ${bin_size} ]]; then bcftools query -f'%CHROM\t%POS\n' ${prefix}.${chr}.vcf.gz | split -l ${bin_size} for file in x*; do - bcftools view -T \${file} -Oz ${prefix}.${chr}.vcf.gz > ${prefix}.${chr}.\${file}.vcf.gz + bcftools view --no-version -T \${file} -Oz ${prefix}.${chr}.vcf.gz > ${prefix}.${chr}.\${file}.vcf.gz bcftools index -t ${prefix}.${chr}.\${file}.vcf.gz done