Merge pull request #417 from tovahmarkowitz/activeDev

ChIP-seq multiQC improvements, and a few other minor details
CCBR · May 2, 2019 · 6349c7a · 6349c7a
2 parents 282b258 + 66d2cb0
commit 6349c7a
Show file tree

Hide file tree

Showing 6 changed files with 93 additions and 47 deletions.
diff --git a/Results-template/Scripts/createtable b/Results-template/Scripts/createtable
@@ -2,7 +2,7 @@
 '''
 Author:		Skyler Kuhn
 Date created:	08/18/2018
-Last modified:	08/23/2018
+Last modified:	04/26/2019 by Tovah Markowitz: markowitzte@nih.gov
 Email:		kuhnsa@nih.gov
 
 About: This program takes standard input from the concatenation of each *qc.metric file.The results 
@@ -30,10 +30,14 @@ def file2table():
             pass
 
     df = pd.DataFrame(tabledict)
+    df.index.name = 'SampleName'
+    df.reset_index(inplace=True)
     #print(df[['NSC', 'FRiP', 'PCB1', 'PCB2', 'RSC']])  #re-order columns
     #cols = df.columns.tolist() # view df columns names
     #orderedcols = ordercolumns(cols)
-    print(df.to_string())
+    #print(df.to_string())
+    print(df[['SampleName', 'NReads', 'NMappedReads', 'NUniqMappedReads', 'NRF', 'PBC1', 'PBC2', 'FragmentLength', 'NSC', 'RSC', 'Qtag']].to_string(index=False,justify="left"))
+
 
 if __name__ == '__main__':
     file2table()

diff --git a/Results-template/Scripts/filterMetrics b/Results-template/Scripts/filterMetrics
@@ -33,6 +33,9 @@ About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py,
 	9.) Find NSC, RSC, Qtag
 	  # awk '{print $(NF-2),$(NF-1),$(NF)}' H3k4me3_gran_1.sorted.Q5DD.ppqt | ./filterMetrics H3k4me3_gran_1 ppqt
 
+	10.) Find the Fragment Length
+	  # awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | awk -F ',' '{print $1}' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
+
 Python version(s):	2.7 or 3.X
 
 '''
@@ -61,7 +64,9 @@ def getmetadata(type):
                 metadata = 'NMappedReads'
 	elif type == 'unreads':
                 metadata = 'NUniqMappedReads'
-	return metadata	
+	elif type == 'fragLen':
+		metadata = ['FragmentLength']
+	return metadata
 
 
 def filteredData(sample, ftype):
@@ -75,7 +80,7 @@ def filteredData(sample, ftype):
 			else:
 				v1, v2 = linelist[0], linelist[1]
 				print("{}\t{}\t{}".format(sample, mtypes, add(v1,v2)))
-		elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf':
+		elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf' or ftype == 'fragLen':
 			mtypes = getmetadata(ftype)
 			for i in range(len(linelist)):
 				print("{}\t{}\t{}".format(sample, mtypes[i], linelist[i]))

diff --git a/Results-template/Scripts/jaccard_score.py b/Results-template/Scripts/jaccard_score.py
@@ -20,7 +20,9 @@
 
 def split_infiles(infiles):
     # breaks the infile string with space-delimited file names and creates a list
-    infileList = infiles.split(" ")
+    infileList = infiles.strip("\'").strip('\"').split(" ")
+    if len(infileList) == 1:
+        infileList = infileList[0].split(";")
     return(infileList)
 
 def loop_jaccard(infileList, genomefile):
@@ -32,6 +34,7 @@ def loop_jaccard(infileList, genomefile):
     out = []
     for z in range(nfiles):
         fileA = infileList[z]
+        print("fileA is: " + fileA) 
         a = BedTool(fileA)
         a = a.sort(g=genomefile)
         for y in range(z+1,nfiles):
@@ -68,7 +71,7 @@ def main():
 
     parser = optparse.OptionParser(description=desc)
 
-    parser.add_option('-i', dest='infiles', default='', help='A space-delimited list of \
+    parser.add_option('-i', dest='infiles', default='', help='A space- or semicolon-delimited list of \
 input files for analysis.')
     parser.add_option('-o', dest='outfile', default='', help='The name of the output file \
 where all the jaccard score information will be saved.')

diff --git a/Results-template/Scripts/ngsqc_plot.py b/Results-template/Scripts/ngsqc_plot.py
@@ -4,12 +4,14 @@
 Name: ngsqc_plot.py
 Created by: Tovah Markowitz
 Date: 1/3/19
+Updated: 4/18/19
 
 Purpose: To take a number of NGSQC_report.txt files and convert into a figure.
 Currently designed to work in case where all files have been moved from their original
 subdirectories and the filename includes information of about the source data. All input
 files must be in the noted directory (not recursive), end with ".NGSQC_report.txt", and
-contain 'ext'. Output figures are named based upon 'ext'.
+contain 'ext'. Output figures are named based upon 'ext'. Now also creates a single output
+table that can be read into the multiqc report.
 """
 
 ##########################################
@@ -54,28 +56,44 @@ def create_plot(ngsqcAll, ext, outfileName):
 	plt.savefig(outfileName)
 	plt.close("all")
 
+def write_ngsqc(ngsqcAll,directory,ext):
+        """to write a set of tables of the ngsqc data used in the image for incorporation
+        into multiqc
+        """
+        for sample in ngsqcAll:
+            if ext == "":
+                outfile = directory + "/" + sample[0] + ".NGSQC.txt"
+            else:
+                outfile = directory + "/" + sample[0] + "." + ext + ".NGSQC.txt"
+            f = open(outfile, 'w')
+            f.write( "\n".join( str(sample[1][0][i]) + "," + str(sample[1][1][i]) for i in range(len(sample[1][0])) ) )
+            f.close()
+
 def all_ngsqc(directory, ext):
-	""" To get the ngsqc data for all NGSQC_report.txt files within a single folder
-	 assumes that all files have been moved from their original subdirectories and 
-	 filename include information about the source of the data
-	"""
-	files = os.listdir(directory)
-	files2 = [ file for file in files if re.search("NGSQC_report.txt",file) ]
-	files3 = [ file for file in files if re.search(ext,file) ]
-	ngsqcAll = [ ]
-	regex = r"(.*)." + re.escape(ext) + r".NGSQC_report.txt"
-	for file in files3:
-		tmp = re.search(regex,file)
-		ngsqcAll.append([ tmp.group(1), read_ngsqc(directory + "/" + file) ])
-	return ngsqcAll
+        """ To get the ngsqc data for all NGSQC_report.txt files within a single folder
+        assumes that all files have been moved from their original subdirectories and 
+        filename include information about the source of the data
+        """
+        files = os.listdir(directory)
+        files2 = [ file for file in files if re.search("NGSQC_report.txt",file) ]
+        files3 = [ file for file in files2 if re.search(ext,file) ]
+        ngsqcAll = [ ]
+        if ext == "":
+            regex = r"(.*).NGSQC_report.txt"
+        else:
+            regex = r"(.*)." + re.escape(ext) + r".NGSQC_report.txt"
+        for file in files3:
+            tmp = re.search(regex,file)
+            ngsqcAll.append([ tmp.group(1), read_ngsqc(directory + "/" + file) ])
+        return ngsqcAll
 
 def name_outimage(directory, ext, group):
 	"""this version uses ext to create the output file name
 	"""
 	if group == "":
-		outfileName =  directory + "/NGSQC." + ext + ".pdf"
+		outfileName =  directory + "/NGSQC." + ext + ".png"
 	else:
-		outfileName =  directory + "/" + group + ".NGSQC." + ext + ".pdf"
+		outfileName =  directory + "/" + group + ".NGSQC." + ext + ".png"
 	return outfileName
 
 #############################################
@@ -106,6 +124,7 @@ def main():
 
 	outfileName = name_outimage(directory, ext, group)
 	ngsqcAll = all_ngsqc(directory, ext)
+	write_ngsqc(ngsqcAll, directory, ext)
 	create_plot(ngsqcAll, ext, outfileName)
 
 #dir='/Users/markowitzte/Desktop/ngsqc'

diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
@@ -146,15 +146,15 @@ if se == 'yes' :
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             # ngsqc
-            expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.pdf"),group=groups),
+            expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
             # preseq
             expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
             expand(join(workpath,"QC","{name}.nrf"), name=samples),
             expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
-            join(workpath,"QCTable.txt"),
+            join(workpath,"QC","QCTable.txt"),
+
 
-
     rule trim_se: # actually trim, filter polyX and remove black listed reads
         input:
             infq=join(workpath,"{name}.R1.fastq.gz"),
@@ -188,7 +188,7 @@ java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq VALIDATION_S
 pigz -p 16 ${{sample}}.cutadapt.noBL.fastq;
 mv ${{sample}}.cutadapt.noBL.fastq.gz {output.outfq};
             """
-            
+
     rule kraken_se:
         input:
             fq = join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
@@ -327,15 +327,15 @@ if pe == 'yes':
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             # ngsqc
-            expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.pdf"),group=groups),
+            expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
             # preseq
             expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
             expand(join(workpath,"QC","{name}.nrf"), name=samples),
             expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
-            join(workpath,"QCTable.txt"),
+            join(workpath,"QC","QCTable.txt"),
+
 
-
     rule trim_pe: # trim, remove PolyX and remove BL reads
         input:
             file1=join(workpath,"{name}.R1.fastq.gz"),
@@ -445,7 +445,6 @@ samtools flagstat {output.outbam2} > {output.flagstat2}
             out6=join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), 
         params:
             rname='pl:dedup',
-            batch='--mem=98g --time=10:00:00 --gres=lscratch:800',
             picardver=config['bin'][pfamily]['tool_versions']['PICARDVER'],
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
             javaram='16g',
@@ -482,7 +481,6 @@ rule ppqt:
         pdf= join(workpath,bam_dir,"{name}.sorted{ext}pdf"),
     params:
         rname="pl:ppqt",
-        batch='--mem=24g --time=10:00:00 --gres=lscratch:800',
         samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         rver=config['bin'][pfamily]['tool_versions']['RVER'],
     run:
@@ -507,7 +505,6 @@ rule bam2bw:
         outbw=join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),
     params:
         rname="pl:bam2bw",
-        batch='--mem=24g --time=10:00:00 --gres=lscratch:800',
         reflen=config['references'][pfamily]['REFLEN'],
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
     run:
@@ -547,9 +544,9 @@ rule deeptools_prep:
         bam=expand(join(workpath,bam_dir,"{name}.{ext}.bam"),name=samples,ext=extensions2),
         bw2=expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput)
     output:
-        expand(join(workpath,bw_dir,"{ext}.deeptools_prep"),ext=extensions),
-        expand(join(workpath,bam_dir,"{group}.{ext}.deeptools_prep"),group=groups,ext=extensions2),
-        expand(join(workpath,bw_dir,"{group2}.{ext2}.deeptools_prep"), zip, group2=deepgroups,ext2=deepexts),
+        temp(expand(join(workpath,bw_dir,"{ext}.deeptools_prep"),ext=extensions)),
+        temp(expand(join(workpath,bam_dir,"{group}.{ext}.deeptools_prep"),group=groups,ext=extensions2)),
+        temp(expand(join(workpath,bw_dir,"{group2}.{ext2}.deeptools_prep"), zip, group2=deepgroups,ext2=deepexts)),
     params:
         rname="pl:deeptools_prep",
         batch="--mem=10g --time=1:00:00",
@@ -612,10 +609,13 @@ rule deeptools_QC:
         cmd2="plotCorrelation -in "+output.npz+" -o "+output.heatmap+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
 	cmd3="plotCorrelation -in "+output.npz+" -o "+output.scatter+" -c 'spearman' -p 'scatterplot' --skipZeros --removeOutliers"
         cmd4="plotPCA -in "+output.npz+" -o "+output.pca
+        cmd5="plotCorrelation -in "+output.npz+" -o "+join(workpath,deeptools_dir,"spearman_heatmap."+wildcards.ext+"_mqc.png")+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
 	shell(commoncmd+cmd1)
         shell(commoncmd+cmd2)
 	shell(commoncmd+cmd3)
 	shell(commoncmd+cmd4)
+        if "Q5DD" in wildcards.ext:
+            shell(commoncmd+cmd5)
 
 rule deeptools_fingerprint:
     input:
@@ -626,15 +626,15 @@ rule deeptools_fingerprint:
     params:
         rname="pl:deeptools_fingerprint",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
-	batch="--cpus-per-task=8"
+	nthreads="8"
     run:
         import re
         commoncmd="module load {params.deeptoolsver}; module load python;"
         listfile=list(map(lambda z:z.strip().split(),open(input.prep,'r').readlines()))
         ext=listfile[0][0]
         bams=listfile[1]
         labels=listfile[2]
-        cmd="plotFingerprint -b "+" ".join(bams)+" --labels "+" ".join(labels)+" -p 4 --skipZeros --outQualityMetrics "+output.metrics+" --plotFile "+output.image 
+        cmd="plotFingerprint -b "+" ".join(bams)+" --labels "+" ".join(labels)+" -p "+params.nthreads+" --skipZeros --outQualityMetrics "+output.metrics+" --plotFile "+output.image 
         if se == "yes":
             cmd+=" -e 200"
         shell(commoncmd+cmd)
@@ -649,15 +649,15 @@ rule deeptools_fingerprint_Q5DD:
     params:
         rname="pl:deeptools_fingerprint_Q5DD",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
-	batch="--cpus-per-task=8"
+	nthreads="8"
     run:
         import re
         commoncmd="module load {params.deeptoolsver}; module load python;"
         listfile=list(map(lambda z:z.strip().split(),open(input[0],'r').readlines()))
         ext=listfile[0][0]
         bams=listfile[1]
         labels=listfile[2]
-        cmd="plotFingerprint -b "+" ".join(bams)+" --labels "+" ".join(labels)+" -p 4 --skipZeros --outQualityMetrics "+output.metrics+" --plotFile "+output.image+" --outRawCounts "+output.raw
+        cmd="plotFingerprint -b "+" ".join(bams)+" --labels "+" ".join(labels)+" -p "+params.nthreads+" --skipZeros --outQualityMetrics "+output.metrics+" --plotFile "+output.image+" --outRawCounts "+output.raw
         if se == "yes":
             cmd+=" -e 200"
         shell(commoncmd+cmd)
@@ -676,7 +676,8 @@ rule deeptools_genes:
     params:
         rname="pl:deeptools_genes",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
-        prebed=config['references'][pfamily]['GENEINFO']
+        prebed=config['references'][pfamily]['GENEINFO'],
+        nthreads="16"
     run:
         import re
         commoncmd="module load {params.deeptoolsver}; module load python;"
@@ -685,8 +686,8 @@ rule deeptools_genes:
         bws=listfile[1]
         labels=listfile[2]
         cmd1="awk -v OFS='\t' -F'\t' '{{print $1, $2, $3, $5, \".\", $4}}' "+params.prebed+" > "+output.bed
-        cmd2="computeMatrix scale-regions -S "+" ".join(bws)+" -R "+output.bed+" -p 4 --upstream 1000 --regionBodyLength 2000 --downstream 1000 --skipZeros -o "+output.metamat+" --samplesLabel "+" ".join(labels)
-        cmd3="computeMatrix reference-point -S "+" ".join(bws)+" -R "+output.bed+" -p 4 --referencePoint TSS --upstream 3000 --downstream 3000 --skipZeros -o "+output.TSSmat+" --samplesLabel "+" ".join(labels)
+        cmd2="computeMatrix scale-regions -S "+" ".join(bws)+" -R "+output.bed+" -p "+params.nthreads+" --upstream 1000 --regionBodyLength 2000 --downstream 1000 --skipZeros -o "+output.metamat+" --samplesLabel "+" ".join(labels)
+        cmd3="computeMatrix reference-point -S "+" ".join(bws)+" -R "+output.bed+" -p "+params.nthreads+" --referencePoint TSS --upstream 3000 --downstream 3000 --skipZeros -o "+output.TSSmat+" --samplesLabel "+" ".join(labels)
         cmd4="plotHeatmap -m "+output.metamat+" -out "+output.metaheat+" --colorMap 'PuOr_r' --yAxisLabel 'average RPGC' --regionsLabel 'genes' --legendLocation 'none'"
         cmd5="plotHeatmap -m "+output.TSSmat+" -out "+output.TSSheat+" --colorMap 'RdBu_r' --yAxisLabel 'average RPGC' --regionsLabel 'genes' --legendLocation 'none'"
         cmd6="plotProfile -m "+output.metamat+" -out "+output.metaline+" --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-right"
@@ -851,7 +852,7 @@ rule ngsqc_plot:
     input:
         ngsqc=expand(join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples),
     output:
-        out=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.pdf"),group=groups),
+        png=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
     params:
         rname="pl:ngsqc_plot",
         script=join(workpath,"Scripts","ngsqc_plot.py"),
@@ -867,9 +868,10 @@ rule ngsqc_plot:
             for label in labels:
                 cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".sorted.Q5DD.NGSQC_report.txt " + group + "; " 
             cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'sorted.Q5DD' -g '" + group + "'; "
-            cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5DD.pdf " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5DD.pdf" + "; "
+            cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5DD.png " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5DD.png" + "; "
+        cmd4 = "mv */*.sorted.Q5DD.NGSQC.txt " + workpath + "/QC; "
         shell(commoncmd1)
-        shell(commoncmd2 + cmd1 + cmd2 + cmd3)
+        shell(commoncmd2 + cmd1 + cmd2 + cmd3 + cmd4)
 
 rule QCstats:
     input:
@@ -896,6 +898,8 @@ grep 'mapped (' {input.ddflagstat} | awk '{{print $1,$3}}' | {params.filterColla
 cat {input.nrf} | {params.filterCollate} {wildcards.name} nrf >> {output.sampleQCfile}
 # NSC, RSC, Qtag
 awk '{{print $(NF-2),$(NF-1),$NF}}' {input.ppqt} | {params.filterCollate} {wildcards.name} ppqt >> {output.sampleQCfile}
+# Fragment Length
+awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile}
             """
 
 rule QCTable:
@@ -906,7 +910,7 @@ rule QCTable:
         inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)),
         filterCollate=join(workpath,"Scripts","createtable"),
     output:
-        qctable=join(workpath,"QCTable.txt"),
+        qctable=join(workpath,"QC","QCTable.txt"),
     threads: 16
     shell: """
 cat {params.inputstring} | {params.filterCollate} > {output.qctable}
@@ -919,10 +923,12 @@ rule multiqc:
         expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples),
         expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples),
 	expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
-        join(workpath,"QCTable.txt"),
+        join(workpath,"QC","QCTable.txt"),
         join(workpath,"rawQC"),
         join(workpath,"QC"),
         expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab"),group=groups),
+        expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+        join(workpath,deeptools_dir,"spearman_heatmap.sorted.Q5DD.RPGC.pdf"),
     output:
         join(workpath,"Reports","multiqc_report.html")
     params:

diff --git a/cluster.json b/cluster.json
@@ -107,6 +107,15 @@
     "deeptools": {
         "mem": "24g"
     },
+    "deeptools_fingerprint": {
+        "threads":"8"
+    },
+    "deeptools_fingerprint_Q5DD": {
+        "threads":"8"
+    },
+    "deeptools_genes": {
+        "threads":"16"
+    },
     "deeptools_prep": {
         "mem": "4g",
         "threads": "2"