Skip to content
This repository has been archived by the owner on Oct 23, 2023. It is now read-only.

Commit

Permalink
Merge pull request #417 from tovahmarkowitz/activeDev
Browse files Browse the repository at this point in the history
ChIP-seq multiQC improvements, and a few other minor details
  • Loading branch information
skchronicles committed May 2, 2019
2 parents 282b258 + 66d2cb0 commit 6349c7a
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 47 deletions.
8 changes: 6 additions & 2 deletions Results-template/Scripts/createtable
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
'''
Author: Skyler Kuhn
Date created: 08/18/2018
Last modified: 08/23/2018
Last modified: 04/26/2019 by Tovah Markowitz: markowitzte@nih.gov
Email: kuhnsa@nih.gov
About: This program takes standard input from the concatenation of each *qc.metric file.The results
Expand Down Expand Up @@ -30,10 +30,14 @@ def file2table():
pass

df = pd.DataFrame(tabledict)
df.index.name = 'SampleName'
df.reset_index(inplace=True)
#print(df[['NSC', 'FRiP', 'PCB1', 'PCB2', 'RSC']]) #re-order columns
#cols = df.columns.tolist() # view df columns names
#orderedcols = ordercolumns(cols)
print(df.to_string())
#print(df.to_string())
print(df[['SampleName', 'NReads', 'NMappedReads', 'NUniqMappedReads', 'NRF', 'PBC1', 'PBC2', 'FragmentLength', 'NSC', 'RSC', 'Qtag']].to_string(index=False,justify="left"))


if __name__ == '__main__':
file2table()
Expand Down
9 changes: 7 additions & 2 deletions Results-template/Scripts/filterMetrics
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py,
9.) Find NSC, RSC, Qtag
# awk '{print $(NF-2),$(NF-1),$(NF)}' H3k4me3_gran_1.sorted.Q5DD.ppqt | ./filterMetrics H3k4me3_gran_1 ppqt
10.) Find the Fragment Length
# awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | awk -F ',' '{print $1}' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
Python version(s): 2.7 or 3.X
'''
Expand Down Expand Up @@ -61,7 +64,9 @@ def getmetadata(type):
metadata = 'NMappedReads'
elif type == 'unreads':
metadata = 'NUniqMappedReads'
return metadata
elif type == 'fragLen':
metadata = ['FragmentLength']
return metadata


def filteredData(sample, ftype):
Expand All @@ -75,7 +80,7 @@ def filteredData(sample, ftype):
else:
v1, v2 = linelist[0], linelist[1]
print("{}\t{}\t{}".format(sample, mtypes, add(v1,v2)))
elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf':
elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf' or ftype == 'fragLen':
mtypes = getmetadata(ftype)
for i in range(len(linelist)):
print("{}\t{}\t{}".format(sample, mtypes[i], linelist[i]))
Expand Down
7 changes: 5 additions & 2 deletions Results-template/Scripts/jaccard_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@

def split_infiles(infiles):
# breaks the infile string with space-delimited file names and creates a list
infileList = infiles.split(" ")
infileList = infiles.strip("\'").strip('\"').split(" ")
if len(infileList) == 1:
infileList = infileList[0].split(";")
return(infileList)

def loop_jaccard(infileList, genomefile):
Expand All @@ -32,6 +34,7 @@ def loop_jaccard(infileList, genomefile):
out = []
for z in range(nfiles):
fileA = infileList[z]
print("fileA is: " + fileA)
a = BedTool(fileA)
a = a.sort(g=genomefile)
for y in range(z+1,nfiles):
Expand Down Expand Up @@ -68,7 +71,7 @@ def main():

parser = optparse.OptionParser(description=desc)

parser.add_option('-i', dest='infiles', default='', help='A space-delimited list of \
parser.add_option('-i', dest='infiles', default='', help='A space- or semicolon-delimited list of \
input files for analysis.')
parser.add_option('-o', dest='outfile', default='', help='The name of the output file \
where all the jaccard score information will be saved.')
Expand Down
51 changes: 35 additions & 16 deletions Results-template/Scripts/ngsqc_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
Name: ngsqc_plot.py
Created by: Tovah Markowitz
Date: 1/3/19
Updated: 4/18/19
Purpose: To take a number of NGSQC_report.txt files and convert into a figure.
Currently designed to work in case where all files have been moved from their original
subdirectories and the filename includes information of about the source data. All input
files must be in the noted directory (not recursive), end with ".NGSQC_report.txt", and
contain 'ext'. Output figures are named based upon 'ext'.
contain 'ext'. Output figures are named based upon 'ext'. Now also creates a single output
table that can be read into the multiqc report.
"""

##########################################
Expand Down Expand Up @@ -54,28 +56,44 @@ def create_plot(ngsqcAll, ext, outfileName):
plt.savefig(outfileName)
plt.close("all")

def write_ngsqc(ngsqcAll,directory,ext):
"""to write a set of tables of the ngsqc data used in the image for incorporation
into multiqc
"""
for sample in ngsqcAll:
if ext == "":
outfile = directory + "/" + sample[0] + ".NGSQC.txt"
else:
outfile = directory + "/" + sample[0] + "." + ext + ".NGSQC.txt"
f = open(outfile, 'w')
f.write( "\n".join( str(sample[1][0][i]) + "," + str(sample[1][1][i]) for i in range(len(sample[1][0])) ) )
f.close()

def all_ngsqc(directory, ext):
""" To get the ngsqc data for all NGSQC_report.txt files within a single folder
assumes that all files have been moved from their original subdirectories and
filename include information about the source of the data
"""
files = os.listdir(directory)
files2 = [ file for file in files if re.search("NGSQC_report.txt",file) ]
files3 = [ file for file in files if re.search(ext,file) ]
ngsqcAll = [ ]
regex = r"(.*)." + re.escape(ext) + r".NGSQC_report.txt"
for file in files3:
tmp = re.search(regex,file)
ngsqcAll.append([ tmp.group(1), read_ngsqc(directory + "/" + file) ])
return ngsqcAll
""" To get the ngsqc data for all NGSQC_report.txt files within a single folder
assumes that all files have been moved from their original subdirectories and
filename include information about the source of the data
"""
files = os.listdir(directory)
files2 = [ file for file in files if re.search("NGSQC_report.txt",file) ]
files3 = [ file for file in files2 if re.search(ext,file) ]
ngsqcAll = [ ]
if ext == "":
regex = r"(.*).NGSQC_report.txt"
else:
regex = r"(.*)." + re.escape(ext) + r".NGSQC_report.txt"
for file in files3:
tmp = re.search(regex,file)
ngsqcAll.append([ tmp.group(1), read_ngsqc(directory + "/" + file) ])
return ngsqcAll

def name_outimage(directory, ext, group):
"""this version uses ext to create the output file name
"""
if group == "":
outfileName = directory + "/NGSQC." + ext + ".pdf"
outfileName = directory + "/NGSQC." + ext + ".png"
else:
outfileName = directory + "/" + group + ".NGSQC." + ext + ".pdf"
outfileName = directory + "/" + group + ".NGSQC." + ext + ".png"
return outfileName

#############################################
Expand Down Expand Up @@ -106,6 +124,7 @@ def main():

outfileName = name_outimage(directory, ext, group)
ngsqcAll = all_ngsqc(directory, ext)
write_ngsqc(ngsqcAll, directory, ext)
create_plot(ngsqcAll, ext, outfileName)

#dir='/Users/markowitzte/Desktop/ngsqc'
Expand Down
56 changes: 31 additions & 25 deletions Rules/InitialChIPseqQC.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -146,15 +146,15 @@ if se == 'yes' :
expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
# ngsqc
expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.pdf"),group=groups),
expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
# preseq
expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
# QC Table
expand(join(workpath,"QC","{name}.nrf"), name=samples),
expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
join(workpath,"QCTable.txt"),
join(workpath,"QC","QCTable.txt"),



rule trim_se: # actually trim, filter polyX and remove black listed reads
input:
infq=join(workpath,"{name}.R1.fastq.gz"),
Expand Down Expand Up @@ -188,7 +188,7 @@ java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq VALIDATION_S
pigz -p 16 ${{sample}}.cutadapt.noBL.fastq;
mv ${{sample}}.cutadapt.noBL.fastq.gz {output.outfq};
"""

rule kraken_se:
input:
fq = join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
Expand Down Expand Up @@ -327,15 +327,15 @@ if pe == 'yes':
expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
# ngsqc
expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.pdf"),group=groups),
expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
# preseq
expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
# QC Table
expand(join(workpath,"QC","{name}.nrf"), name=samples),
expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
join(workpath,"QCTable.txt"),
join(workpath,"QC","QCTable.txt"),



rule trim_pe: # trim, remove PolyX and remove BL reads
input:
file1=join(workpath,"{name}.R1.fastq.gz"),
Expand Down Expand Up @@ -445,7 +445,6 @@ samtools flagstat {output.outbam2} > {output.flagstat2}
out6=join(workpath,bam_dir,"{name}.bwa.Q5.duplic"),
params:
rname='pl:dedup',
batch='--mem=98g --time=10:00:00 --gres=lscratch:800',
picardver=config['bin'][pfamily]['tool_versions']['PICARDVER'],
samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
javaram='16g',
Expand Down Expand Up @@ -482,7 +481,6 @@ rule ppqt:
pdf= join(workpath,bam_dir,"{name}.sorted{ext}pdf"),
params:
rname="pl:ppqt",
batch='--mem=24g --time=10:00:00 --gres=lscratch:800',
samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
rver=config['bin'][pfamily]['tool_versions']['RVER'],
run:
Expand All @@ -507,7 +505,6 @@ rule bam2bw:
outbw=join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),
params:
rname="pl:bam2bw",
batch='--mem=24g --time=10:00:00 --gres=lscratch:800',
reflen=config['references'][pfamily]['REFLEN'],
deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
run:
Expand Down Expand Up @@ -547,9 +544,9 @@ rule deeptools_prep:
bam=expand(join(workpath,bam_dir,"{name}.{ext}.bam"),name=samples,ext=extensions2),
bw2=expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput)
output:
expand(join(workpath,bw_dir,"{ext}.deeptools_prep"),ext=extensions),
expand(join(workpath,bam_dir,"{group}.{ext}.deeptools_prep"),group=groups,ext=extensions2),
expand(join(workpath,bw_dir,"{group2}.{ext2}.deeptools_prep"), zip, group2=deepgroups,ext2=deepexts),
temp(expand(join(workpath,bw_dir,"{ext}.deeptools_prep"),ext=extensions)),
temp(expand(join(workpath,bam_dir,"{group}.{ext}.deeptools_prep"),group=groups,ext=extensions2)),
temp(expand(join(workpath,bw_dir,"{group2}.{ext2}.deeptools_prep"), zip, group2=deepgroups,ext2=deepexts)),
params:
rname="pl:deeptools_prep",
batch="--mem=10g --time=1:00:00",
Expand Down Expand Up @@ -612,10 +609,13 @@ rule deeptools_QC:
cmd2="plotCorrelation -in "+output.npz+" -o "+output.heatmap+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
cmd3="plotCorrelation -in "+output.npz+" -o "+output.scatter+" -c 'spearman' -p 'scatterplot' --skipZeros --removeOutliers"
cmd4="plotPCA -in "+output.npz+" -o "+output.pca
cmd5="plotCorrelation -in "+output.npz+" -o "+join(workpath,deeptools_dir,"spearman_heatmap."+wildcards.ext+"_mqc.png")+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
shell(commoncmd+cmd1)
shell(commoncmd+cmd2)
shell(commoncmd+cmd3)
shell(commoncmd+cmd4)
if "Q5DD" in wildcards.ext:
shell(commoncmd+cmd5)

rule deeptools_fingerprint:
input:
Expand All @@ -626,15 +626,15 @@ rule deeptools_fingerprint:
params:
rname="pl:deeptools_fingerprint",
deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
batch="--cpus-per-task=8"
nthreads="8"
run:
import re
commoncmd="module load {params.deeptoolsver}; module load python;"
listfile=list(map(lambda z:z.strip().split(),open(input.prep,'r').readlines()))
ext=listfile[0][0]
bams=listfile[1]
labels=listfile[2]
cmd="plotFingerprint -b "+" ".join(bams)+" --labels "+" ".join(labels)+" -p 4 --skipZeros --outQualityMetrics "+output.metrics+" --plotFile "+output.image
cmd="plotFingerprint -b "+" ".join(bams)+" --labels "+" ".join(labels)+" -p "+params.nthreads+" --skipZeros --outQualityMetrics "+output.metrics+" --plotFile "+output.image
if se == "yes":
cmd+=" -e 200"
shell(commoncmd+cmd)
Expand All @@ -649,15 +649,15 @@ rule deeptools_fingerprint_Q5DD:
params:
rname="pl:deeptools_fingerprint_Q5DD",
deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
batch="--cpus-per-task=8"
nthreads="8"
run:
import re
commoncmd="module load {params.deeptoolsver}; module load python;"
listfile=list(map(lambda z:z.strip().split(),open(input[0],'r').readlines()))
ext=listfile[0][0]
bams=listfile[1]
labels=listfile[2]
cmd="plotFingerprint -b "+" ".join(bams)+" --labels "+" ".join(labels)+" -p 4 --skipZeros --outQualityMetrics "+output.metrics+" --plotFile "+output.image+" --outRawCounts "+output.raw
cmd="plotFingerprint -b "+" ".join(bams)+" --labels "+" ".join(labels)+" -p "+params.nthreads+" --skipZeros --outQualityMetrics "+output.metrics+" --plotFile "+output.image+" --outRawCounts "+output.raw
if se == "yes":
cmd+=" -e 200"
shell(commoncmd+cmd)
Expand All @@ -676,7 +676,8 @@ rule deeptools_genes:
params:
rname="pl:deeptools_genes",
deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
prebed=config['references'][pfamily]['GENEINFO']
prebed=config['references'][pfamily]['GENEINFO'],
nthreads="16"
run:
import re
commoncmd="module load {params.deeptoolsver}; module load python;"
Expand All @@ -685,8 +686,8 @@ rule deeptools_genes:
bws=listfile[1]
labels=listfile[2]
cmd1="awk -v OFS='\t' -F'\t' '{{print $1, $2, $3, $5, \".\", $4}}' "+params.prebed+" > "+output.bed
cmd2="computeMatrix scale-regions -S "+" ".join(bws)+" -R "+output.bed+" -p 4 --upstream 1000 --regionBodyLength 2000 --downstream 1000 --skipZeros -o "+output.metamat+" --samplesLabel "+" ".join(labels)
cmd3="computeMatrix reference-point -S "+" ".join(bws)+" -R "+output.bed+" -p 4 --referencePoint TSS --upstream 3000 --downstream 3000 --skipZeros -o "+output.TSSmat+" --samplesLabel "+" ".join(labels)
cmd2="computeMatrix scale-regions -S "+" ".join(bws)+" -R "+output.bed+" -p "+params.nthreads+" --upstream 1000 --regionBodyLength 2000 --downstream 1000 --skipZeros -o "+output.metamat+" --samplesLabel "+" ".join(labels)
cmd3="computeMatrix reference-point -S "+" ".join(bws)+" -R "+output.bed+" -p "+params.nthreads+" --referencePoint TSS --upstream 3000 --downstream 3000 --skipZeros -o "+output.TSSmat+" --samplesLabel "+" ".join(labels)
cmd4="plotHeatmap -m "+output.metamat+" -out "+output.metaheat+" --colorMap 'PuOr_r' --yAxisLabel 'average RPGC' --regionsLabel 'genes' --legendLocation 'none'"
cmd5="plotHeatmap -m "+output.TSSmat+" -out "+output.TSSheat+" --colorMap 'RdBu_r' --yAxisLabel 'average RPGC' --regionsLabel 'genes' --legendLocation 'none'"
cmd6="plotProfile -m "+output.metamat+" -out "+output.metaline+" --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-right"
Expand Down Expand Up @@ -851,7 +852,7 @@ rule ngsqc_plot:
input:
ngsqc=expand(join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples),
output:
out=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.pdf"),group=groups),
png=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
params:
rname="pl:ngsqc_plot",
script=join(workpath,"Scripts","ngsqc_plot.py"),
Expand All @@ -867,9 +868,10 @@ rule ngsqc_plot:
for label in labels:
cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".sorted.Q5DD.NGSQC_report.txt " + group + "; "
cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'sorted.Q5DD' -g '" + group + "'; "
cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5DD.pdf " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5DD.pdf" + "; "
cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5DD.png " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5DD.png" + "; "
cmd4 = "mv */*.sorted.Q5DD.NGSQC.txt " + workpath + "/QC; "
shell(commoncmd1)
shell(commoncmd2 + cmd1 + cmd2 + cmd3)
shell(commoncmd2 + cmd1 + cmd2 + cmd3 + cmd4)

rule QCstats:
input:
Expand All @@ -896,6 +898,8 @@ grep 'mapped (' {input.ddflagstat} | awk '{{print $1,$3}}' | {params.filterColla
cat {input.nrf} | {params.filterCollate} {wildcards.name} nrf >> {output.sampleQCfile}
# NSC, RSC, Qtag
awk '{{print $(NF-2),$(NF-1),$NF}}' {input.ppqt} | {params.filterCollate} {wildcards.name} ppqt >> {output.sampleQCfile}
# Fragment Length
awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile}
"""

rule QCTable:
Expand All @@ -906,7 +910,7 @@ rule QCTable:
inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)),
filterCollate=join(workpath,"Scripts","createtable"),
output:
qctable=join(workpath,"QCTable.txt"),
qctable=join(workpath,"QC","QCTable.txt"),
threads: 16
shell: """
cat {params.inputstring} | {params.filterCollate} > {output.qctable}
Expand All @@ -919,10 +923,12 @@ rule multiqc:
expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples),
expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples),
expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
join(workpath,"QCTable.txt"),
join(workpath,"QC","QCTable.txt"),
join(workpath,"rawQC"),
join(workpath,"QC"),
expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab"),group=groups),
expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
join(workpath,deeptools_dir,"spearman_heatmap.sorted.Q5DD.RPGC.pdf"),
output:
join(workpath,"Reports","multiqc_report.html")
params:
Expand Down
9 changes: 9 additions & 0 deletions cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,15 @@
"deeptools": {
"mem": "24g"
},
"deeptools_fingerprint": {
"threads":"8"
},
"deeptools_fingerprint_Q5DD": {
"threads":"8"
},
"deeptools_genes": {
"threads":"16"
},
"deeptools_prep": {
"mem": "4g",
"threads": "2"
Expand Down

0 comments on commit 6349c7a

Please sign in to comment.