Merge pull request #1376 from nakib103/nextflow_split_opt

Ensembl · Mar 27, 2023 · 2e26b23 · 2e26b23
2 parents 1c9cef7 + 6998e7e
commit 2e26b23
Show file tree

Hide file tree

Showing 8 changed files with 78 additions and 56 deletions.
diff --git a/nextflow/README.md b/nextflow/README.md
@@ -75,6 +75,8 @@ The following config files are used and can be modified depending on user requir
   --chros_file LIST_OF_CHROS_FILE   Path to file containing list of chromosomes
   --cpus INT                        Number of CPUs to use. Default 1.
   --output_prefix FILENAME_PREFIX   Output filename prefix. The generated output file will have name <output_prefix>.vcf.gz
+  --skip_check [0,1]                If set, skips the tabix index check for the input VCF; use this when an index file already exists. It enables the first module to load from cache if -resume is used. Default: 0
+  --bin_size SIZE                   If given, the input is split further so that each file contains exactly SIZE variants. Enables faster runs at the expense of more jobs. By default the input file is only split by chromosome (SIZE=0). Default: 0
 ```
 NB: File paths are expected to be absolute paths.
 

diff --git a/nextflow/nf_config/nextflow.config b/nextflow/nf_config/nextflow.config
@@ -38,3 +38,4 @@ profiles {
 //params.chros_file = "$PWD/examples/clinvar-testset/chros.txt"
 params.vep_config = "$PWD/nf_config/vep.ini"
 params.output_prefix = ""
+params.bin_size = 0
diff --git a/nextflow/nf_modules/merge_chros_VCF.nf → nextflow/nf_modules/merge_VCF.nf b/nextflow/nf_modules/merge_chros_VCF.nf → nextflow/nf_modules/merge_VCF.nf
@@ -32,6 +32,8 @@ process mergeVCF {
 
   cpus params.cpus
   container "${params.singularity_dir}/bcftools.sif"
+
+  cache 'lenient'
 
 
   input:
@@ -43,8 +45,9 @@ process mergeVCF {
 
   script: 
   """
-  bcftools concat ${ vcfFiles } -Oz -o temp-${ mergedVCF}.vcf.gz
-  bcftools sort -Oz temp-${ mergedVCF}.vcf.gz -o ${ mergedVCF}.vcf.gz 
-  bcftools  index -t ${ mergedVCF}.vcf.gz
+  mkdir -p temp
+  bcftools concat --no-version -a ${ vcfFiles } -Oz -o temp-${ mergedVCF}.vcf.gz
+  bcftools sort -T temp -Oz temp-${ mergedVCF}.vcf.gz -o ${ mergedVCF}.vcf.gz 
+  bcftools index -t ${ mergedVCF}.vcf.gz
   """
 }
diff --git a/nextflow/nf_modules/read_chros_VCF.nf → nextflow/nf_modules/read_VCF.nf b/nextflow/nf_modules/read_chros_VCF.nf → nextflow/nf_modules/read_VCF.nf
diff --git a/nextflow/nf_modules/run_vep_chros.nf → nextflow/nf_modules/run_vep.nf b/nextflow/nf_modules/run_vep_chros.nf → nextflow/nf_modules/run_vep.nf
@@ -45,7 +45,7 @@ process chrosVEP {
   }
   else {
     """
-    vep -i ${vcfFile} -o ${prefix}-${vcfFile} --vcf --compress_output bgzip --format vcf --config ${vep_config} 
+    vep -i ${vcfFile} -o ${prefix}-${vcfFile} --vcf --compress_output bgzip --format vcf --config ${vep_config}
     tabix -p vcf ${prefix}-${vcfFile}
     """	
   }

diff --git a/nextflow/nf_modules/split_VCF.nf b/nextflow/nf_modules/split_VCF.nf
@@ -0,0 +1,54 @@
+#!/usr/bin/env nextflow
+
+/* 
+ * Script to split a multi-chromosome VCF into single-chromosome VCFs
+ */
+
+nextflow.enable.dsl=2
+
+// defaults
+prefix = "out"
+params.outdir = ""
+params.cpus = 1
+
+process splitVCF {
+  /*
+  Split a multi-chromosome VCF into per-chromosome VCFs and, optionally,
+  into fixed-size bins within each chromosome.
+
+  Inputs
+  ------
+  chr       : name of the chromosome/region to extract (passed to `bcftools view -r`)
+  vcf       : bgzipped input VCF
+  vcf_index : tabix index of the input VCF (staged so that region queries work)
+  bin_size  : number of variant positions per bin; 0 (or empty) disables
+              binning, so only the per-chromosome file is produced
+
+  Returns
+  -------
+  Emits (as `files`) a tuple of two path lists:
+      1) the bgzipped VCF(s) produced for this chromosome
+      2) the tabix index of each of those VCFs
+  Without binning there is exactly one VCF/index pair per chromosome; with
+  binning there is one pair per bin and the whole-chromosome file is removed.
+  */
+  cpus params.cpus
+  container "${params.singularity_dir}/bcftools.sif"
+
+  input:
+  val(chr)
+  path(vcf)
+  path(vcf_index)
+  val(bin_size)
+
+  output:
+  tuple path("${prefix}.${chr}.*vcf.gz"), path("${prefix}.${chr}.*vcf.gz.tbi"), emit: files
+
+  script:
+  """
+  bcftools view --no-version -r ${chr} ${vcf} -o ${prefix}.${chr}.vcf.gz -O z
+  bcftools index -t ${prefix}.${chr}.vcf.gz
+
+  # Numeric test: a plain non-empty-string test is also true for 0, which would
+  # send the default (no binning) into the branch below and make split fail.
+  if [[ "${bin_size}" -gt 0 ]]; then
+    # Keep the split pieces in their own directory so the glob below can never
+    # pick up (or delete) a staged input whose name happens to start with x.
+    mkdir -p bins
+
+    # uniq: a position carrying several records must be listed once only,
+    # otherwise it can straddle two bins and its records get emitted twice.
+    bcftools query -f'%CHROM\t%POS\n' ${prefix}.${chr}.vcf.gz | uniq | split -l ${bin_size} - bins/x
+
+    n_bins=0
+    for file in bins/x*; do
+      # No variants in this region: the glob stays unexpanded, nothing to do.
+      [[ -e \${file} ]] || continue
+
+      bin=\$(basename \${file})
+      bcftools view --no-version -T \${file} -Oz ${prefix}.${chr}.vcf.gz > ${prefix}.${chr}.\${bin}.vcf.gz
+      bcftools index -t ${prefix}.${chr}.\${bin}.vcf.gz
+      n_bins=\$((n_bins + 1))
+    done
+
+    # Drop the whole-chromosome file only when bins replace it; an empty
+    # region keeps its (empty) per-chromosome VCF as the single output.
+    if [[ \${n_bins} -gt 0 ]]; then
+      rm ${prefix}.${chr}.vcf.gz
+      rm ${prefix}.${chr}.vcf.gz.tbi
+    fi
+    rm -r bins
+  fi
+  """
+}
diff --git a/nextflow/nf_modules/split_into_chros.nf b/nextflow/nf_modules/split_into_chros.nf
diff --git a/nextflow/workflows/run_vep.nf b/nextflow/workflows/run_vep.nf
@@ -17,10 +17,10 @@ params.chros=""
 params.chros_file=""
 
 // module imports
-include { splitVCF } from '../nf_modules/split_into_chros.nf' 
-include { mergeVCF } from '../nf_modules/merge_chros_VCF.nf'  
-include { chrosVEP } from '../nf_modules/run_vep_chros.nf'
-include { readChrVCF } from '../nf_modules/read_chros_VCF.nf'
+include { splitVCF } from '../nf_modules/split_VCF.nf' 
+include { mergeVCF } from '../nf_modules/merge_VCF.nf'  
+include { chrosVEP } from '../nf_modules/run_vep.nf'
+include { readChrVCF } from '../nf_modules/read_VCF.nf'
 
  // print usage
 if (params.help) {
@@ -59,12 +59,14 @@ if(check_bgzipped.exitValue()){
   exit 1, "The specified VCF file is not bgzipped: ${params.vcf}"
 }
 
-def sout = new StringBuilder(), serr = new StringBuilder()
-check_parsing = "$params.singularity_dir/vep.sif tabix -p vcf -f $params.vcf".execute()
-check_parsing.consumeProcessOutput(sout, serr)
-check_parsing.waitFor()
-if( serr ){
-  exit 1, "The specified VCF file has issues in parsing: $serr"
+if ( !params.skip_check ){
+  def sout = new StringBuilder(), serr = new StringBuilder()
+  check_parsing = "$params.singularity_dir/vep.sif tabix -p vcf -f $params.vcf".execute()
+  check_parsing.consumeProcessOutput(sout, serr)
+  check_parsing.waitFor()
+  if( serr ){
+    exit 1, "The specified VCF file has issues in parsing: $serr"
+  }
 }
 vcf_index = "${params.vcf}.tbi"
 
@@ -97,7 +99,7 @@ log.info params.chros
     readChrVCF(params.vcf, vcf_index)
     chr = readChrVCF.out.splitText().map{it -> it.trim()}
   }
-  splitVCF(chr, params.vcf, vcf_index)
-  chrosVEP(splitVCF.out, params.vep_config)
+  splitVCF(chr, params.vcf, vcf_index, params.bin_size)
+  chrosVEP(splitVCF.out.files.transpose(), params.vep_config)
   mergeVCF(chrosVEP.out.vcfFile.collect(), chrosVEP.out.indexFile.collect())
 }