
ChIP-seq updates: estimated fragment length and more (#450)
* improving estimated fragment length

* est fraglen; diffbind

* DiffBind and cleanup

* ppqt changes

* ppqt improvements and a few other details

* scRNASeq changes Feb 2020 (#438)

* scRNASeq changes Feb 2020

* March updates

* Merging activeDev for v4.0.1 release (#444)

* Fix sequenza error

* Scrnaseq (#437)

Scrnaseq: miRNA-seq changes and scRNA-seq changes

* Symlink counts matrix if "counts/" exists in user's rawdata directory

* Detect if user is starting from a raw counts matrix

* Updating pop-up box dialogue after detection of new counts matrix filetype

* Find differential expression metadata in "/path/to/rawdata/counts/"

* Readability: checking for counts matrix outside conditional statement

* Adding "--alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" to the first and second pass of STAR

* Explicitly loading 'python/2.7': default version of python changing to '3.7'

Co-authored-by: Mayank  Tandon <mtandon09@gmail.com>
Co-authored-by: wong-nw <38703762+wong-nw@users.noreply.github.com>
Co-authored-by: jlac <justin.lack@nih.gov>

Co-authored-by: Skyler Kuhn <kuhnsa2@mymail.vcu.edu>
Co-authored-by: Mayank  Tandon <mtandon09@gmail.com>
Co-authored-by: jlac <justin.lack@nih.gov>
Co-authored-by: skchronicles <kuhnsa2@vcu.edu>

* improving ppqt, diffbind, frip, and jaccard

* fixed the InitialChIPseqQC.snakefile conflict

* fastqc bug fixed

* more memory for BWA in ChIP-seq pipeline

* reverting ngsqc_plot to a version that functions

Co-authored-by: wong-nw <38703762+wong-nw@users.noreply.github.com>
Co-authored-by: Skyler Kuhn <kuhnsa2@mymail.vcu.edu>
Co-authored-by: Mayank  Tandon <mtandon09@gmail.com>
Co-authored-by: jlac <justin.lack@nih.gov>
Co-authored-by: skchronicles <kuhnsa2@vcu.edu>
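
One of the merged scRNA-seq commits above adds "--alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" to both STAR passes. As a rough sketch of where those flags sit in an alignment call (the index, FASTQ names, and every other parameter below are placeholders, not values from the pipeline):

```python
import subprocess

# Minimal illustrative STAR invocation; only the two flags named in the commit
# message come from this commit, everything else is a placeholder.
star_cmd = [
    "STAR",
    "--genomeDir", "star_index",                     # placeholder index directory
    "--readFilesIn", "sample_R1.fastq.gz", "sample_R2.fastq.gz",
    "--readFilesCommand", "zcat",
    "--alignEndsProtrude", "10", "ConcordantPair",   # added in this commit
    "--peOverlapNbasesMin", "10",                    # added in this commit
]
subprocess.run(star_cmd, check=True)
```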
6 people committed Jun 17, 2020
1 parent 8701b49 commit 2082431
Showing 8 changed files with 371 additions and 433 deletions.
56 changes: 18 additions & 38 deletions Results-template/Scripts/DiffBind_pipeliner.Rmd
@@ -45,6 +45,7 @@ suppressMessages(library(DiffBind))
## Read in sample sheet information and peak information
```{r samples, echo=FALSE, warning=FALSE,message=FALSE}
samples <- dba(sampleSheet=csvfile)
consensus <- dba.peakset(samples,consensus=DBA_CONDITION)
print(samples)
```

@@ -53,22 +54,22 @@ print(samples)
## Plot raw information about the peaks
### Correlation heatmap: Only peaks
```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
plot(samples,main="")
try(plot(samples,main=""),silent=TRUE)
```

### PCA: Only peaks
```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"}
dba.plotPCA(samples,DBA_CONDITION)
try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
```

### Overlapping peak counts
```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center"}
```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5}
if (nrow(samples$samples) < 5) {
dba.plotVenn(samples,1:nrow(samples$samples))
} else {
dba.plotVenn(samples,samples$masks[[3]])
dba.plotVenn(samples,samples$masks[[4]])
dba.plotVenn(samples,samples$masks$Replicate.1)
dba.plotVenn(consensus,consensus$masks$Consensus)
}
```

@@ -103,17 +104,17 @@ print(DBdataCounts)
## Plot raw information about all analyzed peaks
### Correlation heatmap: Peaks and reads
```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
plot(DBdataCounts, main="")
try(plot(DBdataCounts, main=""),silent=TRUE)
```

### Heatmap: Average signal across each peak
```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
dba.plotHeatmap(DBdataCounts,correlations=FALSE)
try(dba.plotHeatmap(DBdataCounts,correlations=FALSE),silent=TRUE)
```

### PCA: Peaks and reads
```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
dba.plotPCA(DBdataCounts,DBA_CONDITION)
try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE)
```

## Associate individual samples with the different contrasts
@@ -131,67 +132,46 @@ DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
```

```{r, echo=FALSE, warning=FALSE,message=FALSE}
nsamples <- sum(DBdatacontrast$contrasts[[1]]$group1) + sum(DBdatacontrast$contrasts[[1]]$group2)
DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2)
#sum(DBAnalysisDeseq2$contrasts[[1]]$DESeq2$de$padj < 0.05)
nDeseq2 <- length(DBReportDeseq2)
nDeseq2P <- sum(DBReportDeseq2$Conc > 0)
nDeseq2N <- sum(DBReportDeseq2$Conc < 0)
DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER)
nEdgeR <- length(DBReportEdgeR)
nEdgeRP <- sum(DBReportEdgeR$Conc > 0)
nEdgeRN <- sum(DBReportEdgeR$Conc < 0)
```

### PCA: DeSeq2
```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
if (nDeseq2 > nsamples) {
dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2)
}
try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE)
```

### PCA: EdgeR
```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
if (nEdgeR > nsamples) {
dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER)
}
try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE)
```

### MANorm: (left) Deseq2, (right) EdgeR
```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
par(mfcol=c(1,2))
dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2)
dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER)
try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

### Volcano plot: DeSeq2
```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
if (nDeseq2 > 0) {
dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2)
}
try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
```

### Volcano plot: EdgeR
```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
if (nEdgeR > 0) {
dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER)
}
try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

### Boxplots: (left) Deseq2, (right) EdgeR
```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
par(mfcol=c(1,2))
if ((nDeseq2N > 1) & (nDeseq2P > 1)) {
dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2)
if (length(DBReportDeseq2) > 0) {
try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
} else {
if ((nEdgeRN > 1) & (nEdgeRP > 1)) {
plot(0,type='n',axes=FALSE,ann=FALSE)
}
}
if ((nEdgeRN > 1) & (nEdgeRP > 1)) {
dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER)
plot(0,type='n',axes=FALSE,ann=FALSE)
}
try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
```

## Differentially bound peaks: Deseq2 output
22 changes: 18 additions & 4 deletions Results-template/Scripts/filterMetrics
@@ -2,7 +2,7 @@
'''
Author: Skyler Kuhn
Date created: 08/18/2018
Last modified: 08/23/2018
Last modified: 09/10/2019 by Tovah Markowitz markowitzte@nih.gov
Email: kuhnsa@nih.gov
About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py, ngsqc,
@@ -34,7 +34,7 @@ About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py,
# awk '{print $(NF-2),$(NF-1),$(NF)}' H3k4me3_gran_1.sorted.Q5DD.ppqt | ./filterMetrics H3k4me3_gran_1 ppqt
10.) Find the Fragment Length
# awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | awk -F ',' '{print $1}' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
# awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | sed -e 's/,/ /g' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
Python version(s): 2.7 or 3.X
@@ -65,11 +65,15 @@ def getmetadata(type):
elif type == 'unreads':
metadata = 'NUniqMappedReads'
elif type == 'fragLen':
metadata = ['FragmentLength']
metadata = 'FragmentLength'
return metadata


def filteredData(sample, ftype):
"""
Data grabbed by the awk or grep commands in the above example cases becomes
variable 'line' and gets split by space to make 'linelist'
"""
for line in sys.stdin:
linelist = line.strip().split()
if 'reads' in ftype:
@@ -80,7 +84,17 @@
else:
v1, v2 = linelist[0], linelist[1]
print("{}\t{}\t{}".format(sample, mtypes, add(v1,v2)))
elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf' or ftype == 'fragLen':
elif ftype == 'fragLen':
mtypes = getmetadata(ftype)
extenders = []
for ppqt_value in linelist:
if int(ppqt_value) > 150:
extenders.append(ppqt_value)
if len(extenders) > 0:
print("{}\t{}\t{}".format(sample, mtypes, extenders[0]))
else:
print("{}\t{}\t{}".format(sample, mtypes, linelist[0]))
elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf':
mtypes = getmetadata(ftype)
for i in range(len(linelist)):
print("{}\t{}\t{}".format(sample, mtypes[i], linelist[i]))
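For reference, the new fragLen branch above keeps the first cross-correlation estimate longer than 150 bp (presumably to skip short, read-length artifacts) and falls back to the first estimate otherwise. A standalone sketch of that selection with made-up ppqt values; only the 150 bp cutoff comes from the commit:

```python
def pick_fragment_length(ppqt_estimates):
    """Sketch of the selection added to filterMetrics: return the first
    estimate above 150 bp, or the first estimate if none qualifies."""
    extenders = [v for v in ppqt_estimates if int(v) > 150]
    return extenders[0] if extenders else ppqt_estimates[0]

# Hypothetical ppqt estimates where the top-ranked value is a short artifact:
print(pick_fragment_length(["50", "185", "240"]))  # -> "185"
print(pick_fragment_length(["100", "120"]))        # -> "100"
```
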
86 changes: 59 additions & 27 deletions Results-template/Scripts/frip_plot.py
@@ -4,6 +4,7 @@
Name: frip_plot.py
Created by: Tovah Markowitz
Date: 5/14/19
Updated: 1/16/20 to allow more control of the peaktool name and splitting of the bedfile name, also adjusted for when nplots is 1 for the scatter plot
Purpose: To create visually appealing plots of FRiP scores
Currently only works with python/3.5
@@ -68,30 +69,40 @@ def clip_bamfile_name(bamfile):
condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1])
return( sample, condition )

def clip_bedfile_name(bedfile):
def clip_bedfile_name(bedfile,filetype):
"""
clip bed file name for table/plotting purposes; assumes file
naming system matches that of Pipeliner
"""
toolused = bedfile.split("/")[-3]
sample = bedfile.split("/")[-2]
if filetype == "":
toolused = bedfile.split("/")[-3]
sample = bedfile.split("/")[-2]
else:
toolused = filetype
sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
return( toolused, sample )

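A side note on the name clean-up above: str.strip("_peaks") removes any of the characters '_', 'p', 'e', 'a', 'k', 's' from both ends of the string rather than the literal suffix, so sample names ending in those letters could lose trailing characters. A sketch of literal-suffix trimming, as a hypothetical helper that is not part of this commit:

```python
def trim_suffix(name, suffixes=("_broadpeaks", "_peaks")):
    # Drop a literal trailing suffix; str.strip() would instead remove any of
    # the characters in its argument from both ends of the string.
    for suffix in suffixes:
        if name.endswith(suffix):
            return name[: -len(suffix)]
    return name

print(trim_suffix("Sample1_peaks"))    # -> "Sample1"
print(trim_suffix("wce_broadpeaks"))   # -> "wce"
```
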
def process_files(bamfiles, bedfiles, genome):
def process_files(bamfiles, bedfiles, genome, filetypes):
"""
this is the main function to take in list of input files and
put out an array containing key file name information, read
counts, and FRiP scores
"""
bamfileL = split_infiles(bamfiles)
bedfileL = split_infiles(bedfiles)
filetypesL = split_infiles(filetypes)
out = [[ "bedtool", "bedsample", "bamsample", "bamcondition",
"n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
for bam in bamfileL:
nreads = count_reads_in_bam(bam)
(bamsample, condition) = clip_bamfile_name(bam)
for bed in bedfileL:
(bedtool, bedsample) = clip_bedfile_name(bed)
for i in range(len(bedfileL)):
bed = bedfileL[i]
if len(filetypesL) > 1:
filetype = filetypesL[i]
else:
filetype = filetypesL[0]
(bedtool, bedsample) = clip_bedfile_name(bed,filetype)
noverlaps = count_reads_in_bed(bam, bed, genome)
frip = calculate_frip(nreads, noverlaps)
nbases = measure_bedfile_coverage(bed, genome) / 1000000
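
The loop above pairs every BAM with every peak file and reduces the two counts to a FRiP score. Purely as a reminder of the arithmetic (the counting itself is done by the script's count_reads_in_bam and count_reads_in_bed helpers), a minimal sketch:

```python
def frip_score(total_reads, reads_in_peaks):
    """Fraction of Reads in Peaks: reads overlapping peaks / total mapped reads.
    Illustrative only; frip_plot.py derives both counts from the BAM and BED files."""
    return float(reads_in_peaks) / float(total_reads)

print(frip_score(2000000, 240000))  # -> 0.12
```
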
@@ -121,13 +132,17 @@ def create_barplot(out2,outbar):
used to create the peak file, and the panels are the different
peak calling tools
"""
sns.set(style="whitegrid", palette="Set1", font_scale=1.5)
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
sns.set(style="whitegrid", palette="Set1", font_scale=0.75)
if len(out2.bedtool.unique()) > 1:
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
col="bedtool", data=out2, kind="bar", col_wrap=2)
else:
bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
col="bedtool", data=out2, kind="bar")
bp.set_axis_labels("Bam File", 'Fraction Reads in Peaks (FRiP)')
bp.set_titles("{col_name}")
bp._legend.set_title("Peak File")
bp.set_xticklabels(rotation=10)
bp.set_xticklabels(rotation=70)
#plt.show(bp)
plt.savefig(outbar, bbox_inches='tight')
plt.close("all")
Expand All @@ -141,31 +156,45 @@ def create_scatter(out2, outscatter):
"""
bams= out2.loc[:,'bamsample'].unique()
nplots= len(bams)
sns.set(style="whitegrid", palette="Set1")
f, axes = plt.subplots(1, nplots, sharey=True)
for bi in range(nplots-1):
tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
nplotsmid= int(nplots/2)
sns.set(style="whitegrid", palette="Set1",font_scale=0.75)
if nplots > 1:
f, axes = plt.subplots(1, nplots, sharey=True, sharex=True)
for bi in range(nplots-1):
tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
hue="bedsample", style="bedtool", ax=axes[bi],
markers=['o','s','v','X'], legend=False)
axes[bi].set(xlabel="# bases in peaks (M)",
if bi == nplotsmid:
axes[bi].set(xlabel="# bases in peaks (M)",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[bi].set_title( bams[bi] )
axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
axes[bi].grid(b=True, which='minor', linestyle="--",
else:
axes[bi].set(xlabel="",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor")
axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
axes[bi].grid(b=True, which='minor', linestyle="--",
color="gray", linewidth=0.5)
tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ]
sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
style="bedtool", ax=axes[nplots-1],
markers=['o','s','v','X'])
axes[nplots-1].set(xlabel="# bases in peaks (M)",
axes[nplots-1].set(xlabel="",
ylabel='Fraction Reads in Peaks (FRiP)')
axes[nplots-1].set_title( bams[nplots-1] )
axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
axes[nplots-1].grid(b=True, which='minor', linestyle="--",
axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor")
axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
axes[nplots-1].grid(b=True, which='minor', linestyle="--",
color="gray", linewidth=0.5)
else:
bp = sns.scatterplot(data=out2, x="n_basesM", y="FRiP", hue="bedsample",
style="bedtool", markers=['o','s','v','X'])
bp.set(xlabel="", ylabel='Fraction Reads in Peaks (FRiP)')
bp.set_title( bams[0], rotation=70, rotation_mode="anchor")
bp.get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
bp.grid(b=True, which='major', color="gray", linewidth=1)
bp.grid(b=True, which='minor', linestyle="--", color="gray", linewidth=0.5)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
#plt.show()
plt.savefig(outscatter, bbox_inches='tight')
@@ -198,14 +227,17 @@ def main():
size of every chromosome.')
parser.add_option('-o', dest='outroot', default='',
help='The root name of the multiple output files. Default: ""')
parser.add_option('-t', dest='filetypes', default='', help='A space- \
or semicolon-delimited list of input file sources/types. Default: ""')

(options,args) = parser.parse_args()
bedfiles = options.peakfiles
bamfiles = options.bamfiles
genomefile = options.genomefile
outroot = options.outroot
filetypes = options.filetypes

out2 = process_files(bamfiles, bedfiles, genomefile)
out2 = process_files(bamfiles, bedfiles, genomefile, filetypes)
(outtable, outbar, outscatter) = create_outfile_names(outroot)
write_table(out2, outtable)
create_barplot(out2,outbar)
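As a usage illustration of the new filetypes argument: the file names and the "macsNarrow" labels below are placeholders, the space-delimited form follows the -t help text, and process_files/write_table are the functions shown in this diff.

```python
from frip_plot import process_files, write_table

# Hypothetical inputs: two BAMs scored against two peak files whose tool of
# origin is supplied explicitly via filetypes instead of the directory layout.
out = process_files(
    bamfiles="sampleA.sorted.Q5DD.bam sampleB.sorted.Q5DD.bam",
    bedfiles="sampleA_peaks.bed sampleB_peaks.bed",
    genome="hg38.genome",
    filetypes="macsNarrow macsNarrow",
)
write_table(out, "FRiP_table.txt")
```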