diff --git a/nextflow/README.md b/nextflow/README.md
index baab50d4b..a678a25a5 100644
--- a/nextflow/README.md
+++ b/nextflow/README.md
@@ -75,6 +75,8 @@ The following config files are used and can be modified depending on user requir
   --chros_file LIST_OF_CHROS_FILE Path to file containing list of chromosomes
   --cpus INT Number of CPUs to use. Default 1.
   --output_prefix FILENAME_PREFIX Output filename prefix. The generated output file will have name .vcf.gz
+  --skip_check [0,1] If set, skips the tabix index check of the input VCF. Use this when an index file already exists. It also allows the first module to load from cache when -resume is used. Default: 0
+  --bin_size SIZE If given, each chromosome file is further split into files of at most SIZE variant positions each. Enables faster runs at the expense of more jobs. By default the input file is only split by chromosome (SIZE=0). Default: 0
 ```
 
 NB: File paths are expected to be absolute paths.
diff --git a/nextflow/nf_config/nextflow.config b/nextflow/nf_config/nextflow.config
index 184740511..104ed6bc9 100644
--- a/nextflow/nf_config/nextflow.config
+++ b/nextflow/nf_config/nextflow.config
@@ -38,3 +38,5 @@ profiles {
 //params.chros_file = "$PWD/examples/clinvar-testset/chros.txt"
 params.vep_config = "$PWD/nf_config/vep.ini"
 params.output_prefix = ""
+params.bin_size = 0
+params.skip_check = 0
diff --git a/nextflow/nf_modules/merge_chros_VCF.nf b/nextflow/nf_modules/merge_VCF.nf
similarity index 72%
rename from nextflow/nf_modules/merge_chros_VCF.nf
rename to nextflow/nf_modules/merge_VCF.nf
index 5deecfb77..fbe78b2e2 100644
--- a/nextflow/nf_modules/merge_chros_VCF.nf
+++ b/nextflow/nf_modules/merge_VCF.nf
@@ -32,6 +32,8 @@ process mergeVCF {
 
   cpus params.cpus
   container "${params.singularity_dir}/bcftools.sif"
+
+  cache 'lenient'
 
   input:
 
@@ -43,8 +45,9 @@ process mergeVCF {
 
   script:
   """
-  bcftools concat ${ vcfFiles } -Oz -o temp-${ mergedVCF}.vcf.gz
-  bcftools sort -Oz temp-${ mergedVCF}.vcf.gz -o ${ mergedVCF}.vcf.gz
-  bcftools index -t ${ mergedVCF}.vcf.gz
+  mkdir -p temp
+  bcftools concat --no-version -a ${ vcfFiles } -Oz -o temp-${ mergedVCF}.vcf.gz
+  bcftools sort -T temp -Oz temp-${ mergedVCF}.vcf.gz -o ${ mergedVCF}.vcf.gz
+  bcftools index -t ${ mergedVCF}.vcf.gz
   """
 }
diff --git a/nextflow/nf_modules/read_chros_VCF.nf b/nextflow/nf_modules/read_VCF.nf
similarity index 100%
rename from nextflow/nf_modules/read_chros_VCF.nf
rename to nextflow/nf_modules/read_VCF.nf
diff --git a/nextflow/nf_modules/run_vep_chros.nf b/nextflow/nf_modules/run_vep.nf
similarity index 91%
rename from nextflow/nf_modules/run_vep_chros.nf
rename to nextflow/nf_modules/run_vep.nf
index e3ce4635d..33a027ae7 100644
--- a/nextflow/nf_modules/run_vep_chros.nf
+++ b/nextflow/nf_modules/run_vep.nf
@@ -45,7 +45,7 @@ process chrosVEP {
   }
   else {
     """
-    vep -i ${vcfFile} -o ${prefix}-${vcfFile} --vcf --compress_output bgzip --format vcf --config ${vep_config}
+    vep -i ${vcfFile} -o ${prefix}-${vcfFile} --vcf --compress_output bgzip --format vcf --config ${vep_config}
     tabix -p vcf ${prefix}-${vcfFile}
     """
   }
diff --git a/nextflow/nf_modules/split_VCF.nf b/nextflow/nf_modules/split_VCF.nf
new file mode 100644
index 000000000..3c355c6dd
--- /dev/null
+++ b/nextflow/nf_modules/split_VCF.nf
@@ -0,0 +1,57 @@
+#!/usr/bin/env nextflow
+
+/*
+ * Script to split a multi-chromosome VCF into single-chromosome VCFs (optionally binned)
+ */
+
+nextflow.enable.dsl=2
+
+// defaults
+prefix = "out"
+params.outdir = ""
+params.cpus = 1
+
+process splitVCF {
+  /*
+  Function to split a multi-chromosome VCF into single-chromosome VCFs, optionally
+  sub-splitting each chromosome into bins of at most bin_size variant positions
+  Returns
+  -------
+  Emits (as 'files') a tuple of two path lists for the chromosome:
+      1) The bgzipped VCF file(s): one per chromosome, or one per bin if bin_size > 0
+      2) The tabix index for each of those VCF files
+  */
+  cpus params.cpus
+  container "${params.singularity_dir}/bcftools.sif"
+
+  input:
+  val(chr)
+  path(vcf)
+  path(vcf_index)
+  val(bin_size)
+
+  output:
+  tuple path("${prefix}.${chr}.*vcf.gz"), path("${prefix}.${chr}.*vcf.gz.tbi"), emit: files
+
+  script:
+  """
+  bcftools view --no-version -r ${chr} ${vcf} -o ${prefix}.${chr}.vcf.gz -O z
+  bcftools index -t ${prefix}.${chr}.vcf.gz
+
+  # bin_size 0 (the default) means split by chromosome only. Compare numerically:
+  # a bare [[ 0 ]] is a non-empty-string test (always true) and 'split -l 0' is an error.
+  if [[ "${bin_size}" -gt 0 ]]; then
+    # uniq: records sharing a position must land in one bin, else -T copies them into two
+    bcftools query -f'%CHROM\t%POS\n' ${prefix}.${chr}.vcf.gz | uniq | split -l ${bin_size}
+
+    for file in x*; do
+      bcftools view --no-version -T \${file} -Oz ${prefix}.${chr}.vcf.gz > ${prefix}.${chr}.\${file}.vcf.gz
+      bcftools index -t ${prefix}.${chr}.\${file}.vcf.gz
+    done
+
+    rm ${prefix}.${chr}.vcf.gz
+    rm ${prefix}.${chr}.vcf.gz.tbi
+    rm x*
+  fi
+  """
+}
diff --git a/nextflow/nf_modules/split_into_chros.nf b/nextflow/nf_modules/split_into_chros.nf
deleted file mode 100644
index ef96bc46b..000000000
--- a/nextflow/nf_modules/split_into_chros.nf
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env nextflow
-
-/*
- * Script to split a multi-chromosome VCF into single-chromosome VCFs
- */
-
-nextflow.enable.dsl=2
-
-// defaults
-prefix = "out"
-params.outdir = ""
-params.cpus = 1
-
-process splitVCF {
-  /*
-  Function to split a multi-chromosome VCF into single chromosome VCF
-
-  Returns
-  -------
-  Returns 2 files per chromosome:
-      1) A VCF format file for each splitted chromosome
-      2) A tabix index for that VCF
-  */
-  cpus params.cpus
-  container "${params.singularity_dir}/bcftools.sif"
-
-  input:
-  val(chr)
-  path(vcf)
-  path(vcf_index)
-
-  output:
-  tuple path("${prefix}.${chr}.vcf.gz"), path("${prefix}.${chr}.vcf.gz.tbi")
-
-  script:
-  """
-  bcftools view -r ${chr} ${vcf} -o ${prefix}.${chr}.vcf.gz -O z
-  bcftools index -t ${prefix}.${chr}.vcf.gz
-  """
-}
diff --git a/nextflow/workflows/run_vep.nf b/nextflow/workflows/run_vep.nf
index caf793d0c..93f500bdd 100644
--- a/nextflow/workflows/run_vep.nf
+++ b/nextflow/workflows/run_vep.nf
@@ -17,10 +17,10 @@ params.chros=""
 params.chros_file=""
 
 // module imports
-include { splitVCF } from '../nf_modules/split_into_chros.nf'
-include { mergeVCF } from '../nf_modules/merge_chros_VCF.nf'
-include { chrosVEP } from '../nf_modules/run_vep_chros.nf'
-include { readChrVCF } from '../nf_modules/read_chros_VCF.nf'
+include { splitVCF } from '../nf_modules/split_VCF.nf'
+include { mergeVCF } from '../nf_modules/merge_VCF.nf'
+include { chrosVEP } from '../nf_modules/run_vep.nf'
+include { readChrVCF } from '../nf_modules/read_VCF.nf'
 
 // print usage
 if (params.help) {
@@ -59,12 +59,14 @@ if(check_bgzipped.exitValue()){
   exit 1, "The specified VCF file is not bgzipped: ${params.vcf}"
 }
 
-def sout = new StringBuilder(), serr = new StringBuilder()
-check_parsing = "$params.singularity_dir/vep.sif tabix -p vcf -f $params.vcf".execute()
-check_parsing.consumeProcessOutput(sout, serr)
-check_parsing.waitFor()
-if( serr ){
-  exit 1, "The specified VCF file has issues in parsing: $serr"
+if ( !params.skip_check ){
+  def sout = new StringBuilder(), serr = new StringBuilder()
+  check_parsing = "$params.singularity_dir/vep.sif tabix -p vcf -f $params.vcf".execute()
+  check_parsing.consumeProcessOutput(sout, serr)
+  check_parsing.waitFor()
+  if( serr ){
+    exit 1, "The specified VCF file has issues in parsing: $serr"
+  }
 }
 
 vcf_index = "${params.vcf}.tbi"
@@ -97,7 +99,7 @@ log.info params.chros
     readChrVCF(params.vcf, vcf_index)
     chr = readChrVCF.out.splitText().map{it -> it.trim()}
   }
-  splitVCF(chr, params.vcf, vcf_index)
-  chrosVEP(splitVCF.out, params.vep_config)
+  splitVCF(chr, params.vcf, vcf_index, params.bin_size)
+  chrosVEP(splitVCF.out.files.transpose(), params.vep_config)
   mergeVCF(chrosVEP.out.vcfFile.collect(), chrosVEP.out.indexFile.collect())
 }