feat: QC (#6)

* added qc standard * added_qc * reduce conda envs * fixed linting and formatting * changed .config in testing and del normFactor log * fixed formatting * fixed spp * added samtools to env * fixed qc and peak calling for both se and pe
DavideBrex · Nov 22, 2023 · b956ba0 · b956ba0
1 parent d05a8dc
commit b956ba0
Show file tree

Hide file tree

Showing 21 changed files with 1,281 additions and 47 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,7 @@ report/**
 Dockerfile
 config/samples_sheet_ER.csv
 config/samples_sheet_bap1.csv
+dag.svg
+report.html
+config/config_bap1.yaml
+config/samples_sheet_bap1Short.csv
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -12,6 +12,9 @@ trimming: true
 
 aligner: bowtie
 
+#type of normalization to perform to generate the bigwig files (one from RAW, Orlando , RX-Input)
+normalization_type: "RX-Input"
+
 resources:
   ref: 
     #if bowtie index is not available leave only "" , it will be created
@@ -50,17 +53,18 @@ params:
     memory: 1
   deeptools:
     #change this according to: https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html
-    effective_genome_length: 2701495761
+    effective_genome_length: 11000000
     #read extension, only for SE
     read_extension: 75
   peakCalling:
-    #specify the ucsc genome name (hg38, mm10, dm6, etc.)
-    genome: "hg38"
     chrom_sizes: "data/hg38.chromsizes"
     macs2:
-      pvalue: 1e-5
+      pvalue: 0.05
     epic2:
-      fdr: 0.0005
+      #effective genome fraction, change this according to https://github.com/biocore-ntnu/epic2/blob/master/epic2/effective_sizes/hg38_50.txt
+      # in the github it is indicated as Effective genome size, and you need to know the read length in  order to set it
+      egf: 0.869 
+      fdr: 0.05
     edd:
       fdr: 0.05
 

diff --git a/config/config.yaml b/config/config.yaml
@@ -12,6 +12,9 @@ trimming: true
 
 aligner: bowtie
 
+#type of normalization to perform to generate the bigwig files (one from RAW, Orlando , RX-Input)
+normalization_type: "RX-Input"
+
 resources:
   ref: 
     #if bowtie index is not available leave only "" , it will be created
@@ -54,13 +57,14 @@ params:
     #read extension, only for SE
     read_extension: 75
   peakCalling:
-    #specify the ucsc genome name (hg38, mm10, dm6, etc.)
-    genome: "hg38"
     chrom_sizes: ".test/data/hg38.chromsizes"
     macs2:
-      pvalue: 1e-5
+      pvalue: 0.05
     epic2:
-      fdr: 0.0005
+      #effective genome fraction, change this according to https://github.com/biocore-ntnu/epic2/blob/master/epic2/effective_sizes/hg38_50.txt
+      # in the github it is indicated as Effective genome size, and you need to know the read length in  order to set it
+      egf: 0.869 
+      fdr: 0.05
     edd:
       fdr: 0.05
 

diff --git a/config/samples_sheet.csv b/config/samples_sheet.csv
@@ -3,8 +3,8 @@ H3K4me3_untreated,1,H3K4me3,Input_untreated,1,narrow,.test/data/fastq/H3K4me3_un
 H3K4me3_untreated,1,H3K4me3,Input_untreated,1,narrow,.test/data/fastq/H3K4me3_untreated-1_Homo_sapiens_L2.fastq.gz,
 H3K4me3_untreated,2,H3K4me3,Input_untreated,1,narrow,.test/data/fastq/H3K4me3_untreated-2_Homo_sapiens_L1.fastq.gz,
 H3K4me3_untreated,2,H3K4me3,Input_untreated,1,narrow,.test/data/fastq/H3K4me3_untreated-2_Homo_sapiens_L2.fastq.gz,
-H3K4me3_EGF,1,H3K4me3,Input_EGF,1,narrow,.test/data/fastq/H3K4me3_EGF-1_Homo_sapiens.fastq.gz,
-H3K4me3_EGF,2,H3K4me3,Input_EGF,2,narrow,.test/data/fastq/H3K4me3_EGF-2_Homo_sapiens.fastq.gz,
+H3K4me3_EGF,1,H3K4me3,Input_EGF,1,broad,.test/data/fastq/H3K4me3_EGF-1_Homo_sapiens.fastq.gz,
+H3K4me3_EGF,2,H3K4me3,Input_EGF,2,broad,.test/data/fastq/H3K4me3_EGF-2_Homo_sapiens.fastq.gz,
 H3K9me2_untreated,1,H3K9me2,Input_untreated,1,very-broad,.test/data/fastq/H3K9me2_untreated-1_Homo_sapiens.fastq.gz,
 H3K9me2_EGF,1,H3K9me2,Input_EGF,1,very-broad,.test/data/fastq/H3K9me2_EGF-1_Homo_sapiens.fastq.gz,
 Input_untreated,1,,,,,.test/data/fastq/Input-untreated-1_Homo_sapiens_ChIP-Seq.fastq.gz,

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -37,6 +37,7 @@ include: "rules/trim.smk"
 include: "rules/align.smk"
 include: "rules/createBigWigs.smk"
 include: "rules/callPeaks.smk"
+include: "rules/qc.smk"
 
 
 rule all:

diff --git a/workflow/envs/Renv.yaml b/workflow/envs/Renv.yaml
@@ -0,0 +1,10 @@
+name: Renv
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults  ## block user/system channels
+dependencies:
+  - r-base=4.1.0
+  - r-spp
+  - samtools
+  - gawk
diff --git a/workflow/envs/pysam.yaml b/workflow/envs/pysam.yaml
diff --git a/workflow/envs/qc.yaml b/workflow/envs/qc.yaml
@@ -0,0 +1,8 @@
+channels:
+    - conda-forge
+    - bioconda
+    - defaults  
+dependencies:
+    - fastqc
+    - multiqc
+    - deeptools
diff --git a/workflow/envs/epic2.yaml → workflow/envs/various.yaml b/workflow/envs/epic2.yaml → workflow/envs/various.yaml
@@ -1,7 +1,11 @@
-name: epic2
+name: various
 channels:
   - conda-forge
   - bioconda
 dependencies:
   - epic2
+  - pysam
+  - chromap
+  - bedtools
+  - chip-r
 
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
@@ -69,7 +69,7 @@ if config["aligner"] == "bowtie":
                 outdir
             ),
         benchmark:
-            "{}results/.benchmarks/{{id}}.align.benchmark.txt".format(outdir)
+            "{}results/.benchmarks/{{id}}_spike.align.benchmark.txt".format(outdir)
         shell:
             """
             bowtie -p {threads} {params.bowtie} -x {params.index} {params.inputsel} 2> {log.align} \
@@ -90,8 +90,10 @@ rule clean_spike:
         sample_ref="{}results/bam/{{id}}.clean.bam".format(outdir),
         sample_spike="{}results/bam_spike/{{id}}_spike.clean.bam".format(outdir),
     conda:
-        "../envs/pysam.yaml"
+        "../envs/various.yaml"
     log:
         "{}results/logs/spike/{{id}}.removeSpikeDups".format(outdir),
+    benchmark:
+        "{}results/.benchmarks/{{id}}.clean_spike.benchmark.txt".format(outdir)
     script:
         "../scripts/remove_spikeDups.py"
diff --git a/workflow/rules/callPeaks.smk b/workflow/rules/callPeaks.smk
@@ -24,6 +24,8 @@ rule macs2_callNarrowPeak:
         + " --pvalue "
         + config["params"]["peakCalling"]["macs2"]["pvalue"]
         + " --keep-dup all",
+    benchmark:
+        "{}results/.benchmarks/{{sample}}.macs2.benchmark.txt".format(outdir)
     wrapper:
         "v2.6.0/bio/macs2/callpeak"
 
@@ -38,22 +40,30 @@ rule epic2_callBroadPeaks:
         broadPeaks="{}results/peakCalling/epic2/{{sample}}_broadPeaks.bed".format(
             outdir
         ),
+        summary="{}results/logs/peakCalling/epic2/{{sample}}_summary.txt".format(outdir),
     log:
         "{}results/logs/peakCalling/epic2/{{sample}}_callpeak.log".format(outdir),
     params:
         fdr=config["params"]["peakCalling"]["epic2"]["fdr"],
-        genome=config["params"]["peakCalling"]["genome"],
+        egf=config["params"]["peakCalling"]["epic2"]["egf"],
+        chrom_size=config["params"]["peakCalling"]["chrom_sizes"],
+    benchmark:
+        "{}results/.benchmarks/{{sample}}.epic2.benchmark.txt".format(outdir)
     conda:
-        "../envs/epic2.yaml"
+        "../envs/various.yaml"
     shell:
         """
         epic2 -t {input.treatment} \
               -c {input.control} \
               --guess-bampe \
               -fdr {params.fdr} \
-              --genome {params.genome} \
+              --chromsizes {params.chrom_size} \
+              --effective-genome-fraction {params.egf} \
               --output {output.broadPeaks} \
               2> {log}
+        
+        # Count the number of peaks and create a summary file
+        echo "Total Peaks: $(wc -l < {output.broadPeaks})" > {output.summary}
         """
 
 
@@ -67,13 +77,16 @@ rule edd_callVeryBroadPeaks:
         veryBroadPeaks="{}results/peakCalling/edd/{{sample}}/{{sample}}_peaks.bed".format(
             outdir
         ),
+        summary="{}results/logs/peakCalling/edd/{{sample}}_summary.txt".format(outdir),
     log:
         "{}results/logs/peakCalling/edd/{{sample}}_callpeak.log".format(outdir),
     params:
         chrom_size=config["params"]["peakCalling"]["chrom_sizes"],
         fdr=config["params"]["peakCalling"]["edd"]["fdr"],
         blacklist=config["resources"]["ref"]["blacklist"],
         output_dir="{}results/peakCalling/edd/{{sample}}".format(outdir),
+    benchmark:
+        "{}results/.benchmarks/{{sample}}.edd.benchmark.txt".format(outdir)
     conda:
         "../envs/edd.yaml"
     threads: 8
@@ -88,4 +101,6 @@ rule edd_callVeryBroadPeaks:
             2> {log}
         mv {params.output_dir}/log.txt {params.output_dir}/{wildcards.sample}_runlog.txt
         mv {params.output_dir}/{wildcards.sample}_runlog.txt $(dirname {log})
+        # Count the number of peaks and create a summary file
+        echo "Total Peaks: $(wc -l < {output.veryBroadPeaks})" > {output.summary}
         """