Skip to content
This repository has been archived by the owner on Oct 23, 2023. It is now read-only.

Commit

Permalink
adding ngsqc plot feature
Browse files Browse the repository at this point in the history
  • Loading branch information
tovahmarkowitz committed Jan 9, 2019
1 parent f765527 commit 097280f
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 5 deletions.
117 changes: 117 additions & 0 deletions Results-template/Scripts/ngsqc_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python3

"""
Name: ngsqc_plot.py
Created by: Tovah Markowitz
Date: 1/3/19
Purpose: To take a number of NGSQC_report.txt files and convert into a figure.
Currently designed to work in case where all files have been moved from their original
subdirectories and the filename includes information of about the source data. All input
files must be in the noted directory (not recursive), end with ".NGSQC_report.txt", and
contain 'ext'. Output figures are named based upon 'ext'.
"""

##########################################
# Modules
import matplotlib
matplotlib.use('Agg')
import optparse
import os
import re
import matplotlib.pyplot as plt

##########################################
# Functions

def read_ngsqc(ngsqcFile):
"""Purpose: read in ngsqc file and extract similarity QCi data
"""
f = open(ngsqcFile, 'r')
ngsqc = f.readlines()
f.close()
ngsqc2 = [[],[]]
for i in range( len(ngsqc) ):
if re.search( "<", ngsqc[i] ):
tmp = ngsqc[i].strip()
tmp2 = re.search( "RCI < (.*)%: (.*)", tmp )
ngsqc2[0].append(float(tmp2.group(1)))
ngsqc2[1].append(float(tmp2.group(2)))
return ngsqc2

def create_plot(ngsqcAll, ext, outfileName):
""" save data as a figure.
s90 is subsampling 90% and S50 is subsampling 50%.
RCI (Read count intensity)
"""
for ind in ngsqcAll:
plt.plot(ind[1][0],ind[1][1],'.-',label=ind[0])
plt.xlabel('percent RCI difference')
plt.ylabel('Similarity QCi (s90/s50)')
plt.title(ext)
plt.legend()
# plt.show()
plt.savefig(outfileName)
plt.close("all")

def all_ngsqc(directory, ext):
""" To get the ngsqc data for all NGSQC_report.txt files within a single folder
assumes that all files have been moved from their original subdirectories and
filename include information about the source of the data
"""
files = os.listdir(directory)
files2 = [ file for file in files if re.search("NGSQC_report.txt",file) ]
files3 = [ file for file in files if re.search(ext,file) ]
ngsqcAll = [ ]
regex = r"(.*)." + re.escape(ext) + r".NGSQC_report.txt"
for file in files3:
tmp = re.search(regex,file)
ngsqcAll.append([ tmp.group(1), read_ngsqc(directory + "/" + file) ])
return ngsqcAll

def name_outimage(directory, ext, group):
"""this version uses ext to create the output file name
"""
if group == "":
outfileName = directory + "/NGSQC." + ext + ".pdf"
else:
outfileName = directory + "/" + group + ".NGSQC." + ext + ".pdf"
return outfileName

#############################################
# Main

def main():
desc="""
To take a number of NGSQC_report.txt files and convert into a figure.
Currently designed to work in case where all files have been moved from their original
subdirectories and the filename includes information of about the source data. All input
files must be in the noted directory (not recursive), end with ".NGSQC_report.txt", and
contain 'ext'. Output figures are named based upon 'ext'.
"""

parser = optparse.OptionParser(description=desc)

parser.add_option('-d', dest='directory', default='', help='The name of the directory \
with the NGSQC_report.txt files.')
parser.add_option('-e', dest='ext', default='', help='The string found in all \
NGSQC_report.txt files. This string must be directly before ".NGSQC_report.txt".')
parser.add_option('-g', dest='group', default='', help='Any additional naming to add \
the output file name.')

(options,args) = parser.parse_args()
directory = options.directory
ext = options.ext
group = options.group

outfileName = name_outimage(directory, ext, group)
ngsqcAll = all_ngsqc(directory, ext)
create_plot(ngsqcAll, ext, outfileName)

#dir='/Users/markowitzte/Desktop/ngsqc'
#ext="sorted.Q5MDD"

if __name__ == '__main__':
main()


54 changes: 49 additions & 5 deletions Rules/InitialChIPseqQC.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ if se == 'yes' :
expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
# ngsqc
expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5MDD.pdf"),group=groups),
# preseq
expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
# QC Table
Expand Down Expand Up @@ -319,7 +321,7 @@ samtools flagstat {output.outbam2} > {output.flagstat2}
folder=join(workpath,bam_dir),
genomefile=config['references'][pfamily]['REFLEN']
shell: """
if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID ;fi
if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi
cd /lscratch/$SLURM_JOBID;
module load {params.macsver};
module load {params.samtoolsver};
Expand Down Expand Up @@ -796,15 +798,13 @@ rule NRF:
rver=config['bin'][pfamily]['tool_versions']['RVER'],
preseqver=config['bin'][pfamily]['tool_versions']['PRESEQVER'],
nrfscript=join(workpath,"Scripts","atac_nrf.py "),

output:
preseq=join(workpath,"QC","{name}.preseq.dat"),
preseqlog=join(workpath,"QC","{name}.preseq.log"),
nrf=join(workpath,"QC","{name}.nrf"),
threads: 16
shell: """
module load {params.preseqver};
preseq lc_extrap -P -B -o {output.preseq} {input.bam} -seed 12345 -v -l 100000000000 2> {output.preseqlog}
python {params.nrfscript} {output.preseqlog} > {output.nrf}
"""
Expand Down Expand Up @@ -882,6 +882,52 @@ module load {params.perlver};
--aligner bowtie2 --force {input}
"""

rule ngsqc:
input:
tagAlign=join(workpath,bam_dir,"{name}.sorted.Q5MDD.tagAlign.gz")
output:
file=join(workpath,"QC","{name}.sorted.Q5MDD.NGSQC_report.txt"),
params:
rname="pl:ngsqc",
bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'],
ngsqc=config['bin'][pfamily]['NGSQC'],
genomefile=config['references'][pfamily]['REFLEN']
shell: """
module load {params.bedtoolsver};
if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi
cd /lscratch/$SLURM_JOBID;
cp {input.tagAlign} tmptagAlign.gz;
gzip -d tmptagAlign.gz;
{params.ngsqc} -v -o tmpOut tmptagAlign {params.genomefile};
mv tmpOut/NGSQC_report.txt {output.file}
"""

rule ngsqc_plot:
input:
ngsqc=expand(join(workpath,"QC","{name}.sorted.Q5MDD.NGSQC_report.txt"),name=samples),
output:
out=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5MDD.pdf"),group=groups),
params:
rname="pl:ngsqc_plot",
script=join(workpath,"Scripts","ngsqc_plot.py"),
run:
commoncmd1 = "if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID ;fi"
commoncmd2 = "cd /lscratch/$SLURM_JOBID;"
cmd1 = ""
cmd2 = ""
cmd3 = ""
for group in groups:
labels = [ sample for sample in samples if sample in groupdatawinput[group] ]
cmd1 = cmd1 + "mkdir " + group + "; "
for label in labels:
cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".sorted.Q5MDD.NGSQC_report.txt " + group + "; "
cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'sorted.Q5MDD' -g '" + group + "'; "
cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5MDD.pdf " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5MDD.pdf" + "; "
shell(commoncmd1)
shell(commoncmd2)
shell(cmd1)
shell(cmd2)
shell(cmd3)

rule QCstats:
input:
Expand All @@ -893,7 +939,6 @@ rule QCstats:
params:
rname='pl:QCstats',
filterCollate=join(workpath,"Scripts","filterMetrics"),

output:
sampleQCfile=join(workpath,"QC","{name}.qcmetrics"),
threads: 16
Expand All @@ -918,7 +963,6 @@ rule QCTable:
rname='pl:QCTable',
inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)),
filterCollate=join(workpath,"Scripts","createtable"),

output:
qctable=join(workpath,"QCTable.txt"),
threads: 16
Expand Down
1 change: 1 addition & 0 deletions standard-bin.json
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@
"KRONATOOLSVER": "kronatools/2.7",
"MULTIQC":"multiqc/1.4",
"NGSPLOTVER": "ngsplot/2.63",
"NGSQC": "/data/CCBR_Pipeliner/db/PipeDB/ChIPseq/NGSQC_linux_x86_64",
"PERLVER": "perl/5.24.3",
"PICARDVER": "picard/2.17.11",
"PRESEQVER": "preseq/2.0.3",
Expand Down

0 comments on commit 097280f

Please sign in to comment.