
ChIP-seq updates: estimated fragment length and more (#450)
* improving estimated fragment length

* est fraglen; diffbind

* DiffBind and cleanup

* ppqt changes

* ppqt improvements and a few other details

* scRNASeq changes Feb 2020 (#438)

* scRNASeq changes Feb 2020

* March updates

* Merging activeDev for v4.0.1 release (#444)

* Fix sequenza error

* Scrnaseq (#437)

Scrnaseq: miRNA-seq changes and scRNA-seq changes

* Symlink counts matrix if "counts/" exists in user's rawdata directory

* Detect if user is starting from a raw counts matrix

* Updating pop-up box dialogue after detection of new counts matrix filetype

* Find differential expression metadata in "/path/to/rawdata/counts/"

* Readability: checking for counts matrix outside conditional statement

* Adding "--alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" to the first and second pass of STAR

* Explicitly loading 'python/2.7': default version of python changing to '3.7'

Co-authored-by: Mayank  Tandon <mtandon09@gmail.com>
Co-authored-by: wong-nw <38703762+wong-nw@users.noreply.github.com>
Co-authored-by: jlac <justin.lack@nih.gov>

Co-authored-by: Skyler Kuhn <kuhnsa2@mymail.vcu.edu>
Co-authored-by: Mayank  Tandon <mtandon09@gmail.com>
Co-authored-by: jlac <justin.lack@nih.gov>
Co-authored-by: skchronicles <kuhnsa2@vcu.edu>

* improving ppqt, diffbind, frip, and jaccard

* fixed the InitialChIPseqQC.snakefile conflict

* fastqc bug fixed

* more memory for BWA in ChIP-seq pipeline

* reverting ngsqc_plot to a version that functions

Co-authored-by: wong-nw <38703762+wong-nw@users.noreply.github.com>
Co-authored-by: Skyler Kuhn <kuhnsa2@mymail.vcu.edu>
Co-authored-by: Mayank  Tandon <mtandon09@gmail.com>
Co-authored-by: jlac <justin.lack@nih.gov>
Co-authored-by: skchronicles <kuhnsa2@vcu.edu>
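
One of the merged scRNA-seq commits above adds "--alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" to both STAR passes. As a rough sketch of where those flags sit in an alignment call (the index, FASTQ names, and every other parameter below are placeholders, not values from the pipeline):

```python
import subprocess

# Minimal illustrative STAR invocation; only the two flags named in the commit
# message come from this commit, everything else is a placeholder.
star_cmd = [
    "STAR",
    "--genomeDir", "star_index",                     # placeholder index directory
    "--readFilesIn", "sample_R1.fastq.gz", "sample_R2.fastq.gz",
    "--readFilesCommand", "zcat",
    "--alignEndsProtrude", "10", "ConcordantPair",   # added in this commit
    "--peOverlapNbasesMin", "10",                    # added in this commit
]
subprocess.run(star_cmd, check=True)
```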
6 people committed Jun 17, 2020
1 parent 8701b49 commit 2082431
Showing 8 changed files with 371 additions and 433 deletions.
56 changes: 18 additions & 38 deletions Results-template/Scripts/DiffBind_pipeliner.Rmd
@@ -45,6 +45,7 @@ suppressMessages(library(DiffBind))
## Read in sample sheet information and peak information
```{r samples, echo=FALSE, warning=FALSE,message=FALSE}
samples <- dba(sampleSheet=csvfile)
consensus <- dba.peakset(samples,consensus=DBA_CONDITION)
print(samples)
```

@@ -53,22 +54,22 @@ print(samples)
## Plot raw information about the peaks
### Correlation heatmap: Only peaks
```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
plot(samples,main="")
try(plot(samples,main=""),silent=TRUE)
```

### PCA: Only peaks
```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"}
dba.plotPCA(samples,DBA_CONDITION)
try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
```

### Overlapping peak counts
```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center"}
```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5}
if (nrow(samples$samples) < 5) {
dba.plotVenn(samples,1:nrow(samples$samples))
} else {
dba.plotVenn(samples,samples$masks[[3]])
dba.plotVenn(samples,samples$masks[[4]])
dba.plotVenn(samples,samples$masks$Replicate.1)
dba.plotVenn(consensus,consensus$masks$Consensus)
}
```

@@ -103,17 +104,17 @@ print(DBdataCounts)
## Plot raw information about all analyzed peaks
### Correlation heatmap: Peaks and reads
```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
plot(DBdataCounts, main="")
try(plot(DBdataCounts, main=""),silent=TRUE)
```

### Heatmap: Average signal across each peak
```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
dba.plotHeatmap(DBdataCounts,correlations=FALSE)
try(dba.plotHeatmap(DBdataCounts,correlations=FALSE),silent=TRUE)
```

### PCA: Peaks and reads
```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
dba.plotPCA(DBdataCounts,DBA_CONDITION)
try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE)
```

## Associate individual samples with the different contrasts
@@ -131,67 +132,46 @@ DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
```

```{r, echo=FALSE, warning=FALSE,message=FALSE}
nsamples <- sum(DBdatacontrast$contrasts[[1]]$group1) + sum(DBdatacontrast$contrasts[[1]]$group2)
DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2)
#sum(DBAnalysisDeseq2$contrasts[[1]]$DESeq2$de$padj < 0.05)
nDeseq2 <- length(DBReportDeseq2)
nDeseq2P <- sum(DBReportDeseq2$Conc > 0)
nDeseq2N <- sum(DBReportDeseq2$Conc < 0)
DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER)
nEdgeR <- length(DBReportEdgeR)
nEdgeRP <- sum(DBReportEdgeR$Conc > 0)
nEdgeRN <- sum(DBReportEdgeR$Conc < 0)
```

### PCA: DeSeq2
```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
if (nDeseq2 > nsamples) {
dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2)
}
try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE)
```

### PCA: EdgeR
```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
if (nEdgeR > nsamples) {
dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER)
}
try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE)
```

### MANorm: (left) Deseq2, (right) EdgeR
```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
par(mfcol=c(1,2))
dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2)
dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER)
try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

### Volcano plot: DeSeq2
```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
if (nDeseq2 > 0) {
dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2)
}
try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
```

### Volcano plot: EdgeR
```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
if (nEdgeR > 0) {
dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER)
}
try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

### Boxplots: (left) Deseq2, (right) EdgeR
```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
par(mfcol=c(1,2))
if ((nDeseq2N > 1) & (nDeseq2P > 1)) {
dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2)
if (length(DBReportDeseq2) > 0) {
try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
} else {
if ((nEdgeRN > 1) & (nEdgeRP > 1)) {
plot(0,type='n',axes=FALSE,ann=FALSE)
}
}
if ((nEdgeRN > 1) & (nEdgeRP > 1)) {
dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER)
plot(0,type='n',axes=FALSE,ann=FALSE)
}
try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

## Differentially bound peaks: Deseq2 output
22 changes: 18 additions & 4 deletions Results-template/Scripts/filterMetrics
@@ -2,7 +2,7 @@
'''
Author: Skyler Kuhn
Date created: 08/18/2018
Last modified: 08/23/2018
Last modified: 09/10/2019 by Tovah Markowitz markowitzte@nih.gov
Email: kuhnsa@nih.gov
About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py, ngsqc,
@@ -34,7 +34,7 @@ About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py,
# awk '{print $(NF-2),$(NF-1),$(NF)}' H3k4me3_gran_1.sorted.Q5DD.ppqt | ./filterMetrics H3k4me3_gran_1 ppqt
10.) Find the Fragment Length
# awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | awk -F ',' '{print $1}' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
# awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | sed -e 's/,/ /g' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
Python version(s): 2.7 or 3.X
@@ -65,11 +65,15 @@ def getmetadata(type):
elif type == 'unreads':
metadata = 'NUniqMappedReads'
elif type == 'fragLen':
metadata = ['FragmentLength']
metadata = 'FragmentLength'
return metadata


def filteredData(sample, ftype):
"""
Data grabbed by the awk or grep commands in the above example cases becomes
variable 'line' and gets split by space to make 'linelist'
"""
for line in sys.stdin:
linelist = line.strip().split()
if 'reads' in ftype:
@@ -80,7 +84,17 @@
else:
v1, v2 = linelist[0], linelist[1]
print("{}\t{}\t{}".format(sample, mtypes, add(v1,v2)))
elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf' or ftype == 'fragLen':
elif ftype == 'fragLen':
mtypes = getmetadata(ftype)
extenders = []
for ppqt_value in linelist:
if int(ppqt_value) > 150:
extenders.append(ppqt_value)
if len(extenders) > 0:
print("{}\t{}\t{}".format(sample, mtypes, extenders[0]))
else:
print("{}\t{}\t{}".format(sample, mtypes, linelist[0]))
elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf':
mtypes = getmetadata(ftype)
for i in range(len(linelist)):
print("{}\t{}\t{}".format(sample, mtypes[i], linelist[i]))
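For reference, the new fragLen branch above keeps the first cross-correlation estimate longer than 150 bp (presumably to skip short, read-length artifacts) and falls back to the first estimate otherwise. A standalone sketch of that selection with made-up ppqt values; only the 150 bp cutoff comes from the commit:

```python
def pick_fragment_length(ppqt_estimates):
    """Sketch of the selection added to filterMetrics: return the first
    estimate above 150 bp, or the first estimate if none qualifies."""
    extenders = [v for v in ppqt_estimates if int(v) > 150]
    return extenders[0] if extenders else ppqt_estimates[0]

# Hypothetical ppqt estimates where the top-ranked value is a short artifact:
print(pick_fragment_length(["50", "185", "240"]))  # -> "185"
print(pick_fragment_length(["100", "120"]))        # -> "100"
```
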
86 changes: 59 additions & 27 deletions Results-template/Scripts/frip_plot.py
@@ -4,6 +4,7 @@
Name: frip_plot.py
Created by: Tovah Markowitz
Date: 5/14/19
Updated: 1/16/20 to allow more control of the peaktool name and splitting of the bedfile name, also adjusted for when nplots is 1 for the scatter plot
Purpose: To create visually appealing plots of FRiP scores
Currently only works with python/3.5
@@ -68,30 +69,40 @@ def clip_bamfile_name(bamfile):
condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1])
return( sample, condition )

def clip_bedfile_name(bedfile):
def clip_bedfile_name(bedfile,filetype):
"""
clip bed file name for table/plotting purposes; assumes file
naming system matches that of Pipeliner
"""
toolused = bedfile.split("/")[-3]
sample = bedfile.split("/")[-2]
if filetype == "":
toolused = bedfile.split("/")[-3]
sample = bedfile.split("/")[-2]
else:
toolused = filetype
sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
return( toolused, sample )

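A side note on the name clean-up above: str.strip("_peaks") removes any of the characters '_', 'p', 'e', 'a', 'k', 's' from both ends of the string rather than the literal suffix, so sample names ending in those letters could lose trailing characters. A sketch of literal-suffix trimming, as a hypothetical helper that is not part of this commit:

```python
def trim_suffix(name, suffixes=("_broadpeaks", "_peaks")):
    # Drop a literal trailing suffix; str.strip() would instead remove any of
    # the characters in its argument from both ends of the string.
    for suffix in suffixes:
        if name.endswith(suffix):
            return name[: -len(suffix)]
    return name

print(trim_suffix("Sample1_peaks"))    # -> "Sample1"
print(trim_suffix("wce_broadpeaks"))   # -> "wce"
```
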
def process_files(bamfiles, bedfiles, genome):
def process_files(bamfiles, bedfiles, genome, filetypes):
"""
this is the main function to take in list of input files and
put out an array containing key file name information, read
counts, and FRiP scores
"""
bamfileL = split_infiles(bamfiles)
bedfileL = split_infiles(bedfiles)
filetypesL = split_infiles(filetypes)
out = [[ "bedtool", "bedsample", "bamsample", "bamcondition",
"n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
for bam in bamfileL:
nreads = count_reads_in_bam(bam)
(bamsample, condition) = clip_bamfile_name(bam)
for bed in bedfileL:
(bedtool, bedsample) = clip_bedfile_name(bed)
for i in range(len(bedfileL)):
bed = bedfileL[i]
if len(filetypesL) > 1:
filetype = filetypesL[i]
else:
filetype = filetypesL[0]
(bedtool, bedsample) = clip_bedfile_name(bed,filetype)
noverlaps = count_reads_in_bed(bam, bed, genome)
frip = calculate_frip(nreads, noverlaps)
nbases = measure_bedfile_coverage(bed, genome) / 1000000
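
The loop above pairs every BAM with every peak file and reduces the two counts to a FRiP score. Purely as a reminder of the arithmetic (the counting itself is done by the script's count_reads_in_bam and count_reads_in_bed helpers), a minimal sketch:

```python
def frip_score(total_reads, reads_in_peaks):
    """Fraction of Reads in Peaks: reads overlapping peaks / total mapped reads.
    Illustrative only; frip_plot.py derives both counts from the BAM and BED files."""
    return float(reads_in_peaks) / float(total_reads)

print(frip_score(2000000, 240000))  # -> 0.12
```
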
@@ -121,13 +132,17 @@ def create_barplot(out2,outbar):
used to create the peak file, and the panels are the different
peak calling tools
"""
sns.set(style="whitegrid", palette="Set1", font_scale=1.5)
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
sns.set(style="whitegrid", palette="Set1", font_scale=0.75)
if len(out2.bedtool.unique()) > 1:
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
col="bedtool", data=out2, kind="bar", col_wrap=2)
else:
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
col="bedtool", data=out2, kind="bar")
bp.set_axis_labels("Bam File", 'Fraction Reads in Peaks (FRiP)')
bp.set_titles("{col_name}")
bp._legend.set_title("Peak File")
bp.set_xticklabels(rotation=10)
bp.set_xticklabels(rotation=70)
#plt.show(bp)
plt.savefig(outbar, bbox_inches='tight')
plt.close("all")
Expand All @@ -141,31 +156,45 @@ def create_scatter(out2, outscatter):
"""
bams= out2.loc[:,'bamsample'].unique()
nplots= len(bams)
sns.set(style="whitegrid", palette="Set1")
f, axes = plt.subplots(1, nplots, sharey=True)
for bi in range(nplots-1):
tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
nplotsmid= int(nplots/2)
sns.set(style="whitegrid", palette="Set1",font_scale=0.75)
if nplots > 1:
f, axes = plt.subplots(1, nplots, sharey=True, sharex=True)
for bi in range(nplots-1):
tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
hue="bedsample", style="bedtool", ax=axes[bi],
markers=['o','s','v','X'], legend=False)
axes[bi].set(xlabel="# bases in peaks (M)",
if bi == nplotsmid:
axes[bi].set(xlabel="# bases in peaks (M)",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[bi].set_title( bams[bi] )
axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
axes[bi].grid(b=True, which='minor', linestyle="--",
else:
axes[bi].set(xlabel="",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor")
axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
axes[bi].grid(b=True, which='minor', linestyle="--",
color="gray", linewidth=0.5)
tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
style="bedtool", ax=axes[nplots-1],
markers=['o','s','v','X'])
axes[nplots-1].set(xlabel="# bases in peaks (M)",
axes[nplots-1].set(xlabel="",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[nplots-1].set_title( bams[nplots-1] )
axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
axes[nplots-1].grid(b=True, which='minor', linestyle="--",
axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor")
axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
axes[nplots-1].grid(b=True, which='minor', linestyle="--",
color="gray", linewidth=0.5)
else:
bp = sns.scatterplot(data=out2, x="n_basesM", y="FRiP", hue="bedsample",
style="bedtool", markers=['o','s','v','X'])
bp.set(xlabel="", ylabel='Fraction Reads in Peaks (FRiP)')
bp.set_title( bams[0], rotation=70, rotation_mode="anchor")
bp.get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
bp.grid(b=True, which='major', color="gray", linewidth=1)
bp.grid(b=True, which='minor', linestyle="--", color="gray", linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
#plt.show()
plt.savefig(outscatter, bbox_inches='tight')
@@ -198,14 +227,17 @@ def main():
size of every chromosome.')
parser.add_option('-o', dest='outroot', default='',
help='The root name of the multiple output files. Default: ""')
parser.add_option('-t', dest='filetypes', default='', help='A space- \
or semicolon-delimited list of input file sources/types. Default: ""')

(options,args) = parser.parse_args()
bedfiles = options.peakfiles
bamfiles = options.bamfiles
genomefile = options.genomefile
outroot = options.outroot
filetypes = options.filetypes

out2 = process_files(bamfiles, bedfiles, genomefile)
out2 = process_files(bamfiles, bedfiles, genomefile, filetypes)
(outtable, outbar, outscatter) = create_outfile_names(outroot)
write_table(out2, outtable)
create_barplot(out2,outbar)
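As a usage illustration of the new filetypes argument: the file names and the "macsNarrow" labels below are placeholders, the space-delimited form follows the -t help text, and process_files/write_table are the functions shown in this diff.

```python
from frip_plot import process_files, write_table

# Hypothetical inputs: two BAMs scored against two peak files whose tool of
# origin is supplied explicitly via filetypes instead of the directory layout.
out = process_files(
    bamfiles="sampleA.sorted.Q5DD.bam sampleB.sorted.Q5DD.bam",
    bedfiles="sampleA_peaks.bed sampleB_peaks.bed",
    genome="hg38.genome",
    filetypes="macsNarrow macsNarrow",
)
write_table(out, "FRiP_table.txt")
```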