Merge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…

…tiveDev
CCBR · Oct 15, 2018 · 8a9987e · 8a9987e
2 parents c96c899 + 419bc33
commit 8a9987e
Show file tree

Hide file tree

Showing 6 changed files with 164 additions and 21 deletions.
diff --git a/Results-template/Scripts/filterSampleTable.py b/Results-template/Scripts/filterSampleTable.py
@@ -0,0 +1,115 @@
+##############################################################################################
+# Description:  Creates a new sampletable.txt for differential expression analysis which
+#		is necessary because a user may remove samples from their groups.tab file
+#		after running QC. The new sampletable.txt file is created from the
+#		groups.tab file that is provided when running DE half of the pipeline.
+#		Also a corresponding RawCounts_RSEM_genes.txt file is created which only
+#		contains information for samples that that are specificed in groups.tab.
+#		This file is written to DEG_{group1}-{group2}_{mincpm}_{minsample} folder.
+#		It is necessary that the order of the groups listed in sampletable.txt and
+#		and RawCounts_RSEM_genes_filtered.txt match. If they do not the reported
+#		direction (sign: postive or negative) of each gene's fold-change will be
+#		opposite of what it should be. This program also addresses that problem.
+# USAGE:	python filterSampleTable.py -s 'rbfoxKO_1,rbfoxKO_2,control_1,control_2'\
+#					    -g 'KO,KO,WT,WT' -l 'KO_1,KO_2,WT_1,WT_2' \
+#					    -r 'RawCountFile_RSEM_genes.txt' \
+#					    -outsample 'outfolder/sampletable.txt'\
+#					    -outraw 'outfolder/RawCountFile_RSEM_genes.txt'
+##############################################################################################
+
+from __future__ import print_function, division
+import os, sys
+import argparse  # module load python/3.5
+
+
+
+
+def filteredRawCounts(rawcountsfilename, samples, outrawfilename):
+	'''Generator that yields rawcounts information for samples in the current DEG groups.tab file.
+	Helper function checks formating of the headers between the newly generated sampletable.txt and
+	input RawCounts_RSEM_genes.txt file. If the order of groups formatting differs, it fixes it to match
+	the ordering in sampletable.txt (fail-safe to ensure later referential integrity at DE).
+	'''
+	def formatted(headerlist, samples, linelist):
+		headerIndexes = {}
+		symbol = linelist[0]
+		# Create Dictionary for mapping samplename in header of RawCounts_RSEM_genes.txt to its position 
+		for i in range(len(headerlist)):
+			headerIndexes[headerlist[i]] = i
+		indexlist = []
+		formattedlist = [symbol]
+		for sample in samples:
+			try:
+				indexlist.append(headerIndexes[sample])
+				formattedlist.append(linelist[headerIndexes[sample]])
+			except KeyError:
+				raise Exception('Key Error: Failed to find sample {} in RawCounts_RSEM_genes.txt.\nDid you add an additional sample to groups.tab?'.format(sample))
+		return formattedlist
+
+	rawfh = open(rawcountsfilename, 'r')
+	headerlist = [ fn.split('/')[-1].split('.')[0] for fn in next(rawfh).strip().split('\t')]
+
+	newheader = '\t'.join([headerlist[0]]+ samples) + '\n'
+	print(newheader)
+	outfh = open(outrawfilename, 'w')
+	outfh.write(newheader)
+	for line in  rawfh:
+		linelist = line.strip().split('\t')
+		formattedlist = formatted(headerlist, samples, linelist)
+		#print(formattedlist)
+		outfh.write('\t'.join(formattedlist) + '\n')
+	rawfh.close()
+	outfh.close()
+
+def SampleTable(samples, groups, labels,contrasts):
+	'''Generator that yields sample, group, and label info from the current DEG groups.tab file'''
+	for i in range(len(samples)):
+		if groups[i] in contrasts:
+			yield samples[i], groups[i], labels[i]
+
+
+
+if __name__ == '__main__':
+
+	# USAGE: python filterSampleTable.py -c 'KO WT' -s 'rbfoxKO_1,rbfoxKO_2,rbfoxKO_3,control_1,control_2,control_3' -g 'KO,KO,KO,WT,WT,WT' -l 'KO_1,KO_2,KO_3,WT_1,WT_2,WT_3' -r 'RawCountFile_RSEM_genes.txt' -outsample 'outfolder/sampletable.txt' -outraw 'outfolder/RawCountFile_RSEM_genes.txt'
+	# Parse Command-line arguments
+	parser = argparse.ArgumentParser(description='Filters sampletable.txt, cannot always use the sampletable that is generated in QC, sometimes DE is run with less samples.')
+	parser.add_argument('-r','--inputRawCounts', type=str, required=True, help='Input RawCounts_RSEM_genes.txt file that is generated QC: (required)')
+	parser.add_argument('-outraw','--outputRawCounts', type=str, required=True, help='Output RawCounts_RSEM_genes.txt filename (required)')
+	parser.add_argument('-outsample','--outputSampleTable', type=str, required=True, help='Output RawCounts_RSEM_genes.txt filename (required)')
+	parser.add_argument('-s','--samples', type=str, required=True, help='samples string from the run.json file  (required)')
+	parser.add_argument('-c','--contrast', type=str, required=True, help='contrast string (required)')
+	parser.add_argument('-g','--groups', type=str, required=True, help='groups string from the run.json file (required)')
+	parser.add_argument('-l','--labels', type=str, required=True, help='labels string from the run.json file (required)')
+	args = parser.parse_args()
+
+	sampleslist = args.samples.split(',')
+	groupslist = args.groups.split(',')
+	labelslist = args.labels.split(',')
+	contrastlist = args.contrast.split(' ')
+	print(sampleslist)
+	print(groupslist)
+	print(labelslist)
+
+
+	#Creating the new sampletable.txt file based off of the current DEG groups.tab information
+	samptablefh = open(args.outputSampleTable, 'w')
+	samptablefh.write('sampleName\tfileName\tcondition\tlabel\n')
+	samplesincontrast = []
+	groupset = []
+	for sample, group, label in SampleTable(sampleslist, groupslist, labelslist, contrastlist):
+		print('{0}\t{0}\t{1}\t{2}'.format(sample, group, label))
+		samplesincontrast.append(sample)
+		groupset.append(group)
+		samptablefh.write('{0}\t{0}\t{1}\t{2}\n'.format(sample, group, label))
+	samptablefh.close()
+
+	if len(set(groupset)) < 2:
+		raise Exception('Error in groups.tab file. Contrast does not contain two groups!')
+
+	#Creating new RawCounts_RSEM_genes.txt file based off of the current DEG groups.tab information
+	rawfn = args.inputRawCounts
+	outrawfn = args.outputRawCounts
+	filteredRawCounts(rawfn, samplesincontrast, outrawfn)
+
+
diff --git a/Results-template/Scripts/filtersamples.R b/Results-template/Scripts/filtersamples.R
@@ -8,22 +8,21 @@ CPM_CUTOFF <- args[4]
 MINSAMPLES <- args[5]
 SAMPLETABLE <- args[6]
 RAWCOUNTSTABLE <- args[7]
-OUTSAMPLETABLE <- args[8]
-FILTEREDRAWCOUNTSTABLE <- args[9]
+FILTEREDRAWCOUNTSTABLE <- args[8]
 
 x=read.table(SAMPLETABLE,header = T,sep="\t")
 samples=(x$condition==G1 | x$condition==G2)
 
 x1=x[samples,]
-write.table(x1,file=OUTSAMPLETABLE,sep="\t",col.names=T,quote=F,row.names = F)
+#write.table(x1,file=OUTSAMPLETABLE,sep="\t",col.names=T,quote=F,row.names = F)
 
 
 y=read.table(RAWCOUNTSTABLE,header = T,sep = "\t")
 row.names(y)=y$symbol
 y=y[,-which(names(y) %in% ("symbol"))]
-y=y[,samples]
+#y=y[,samples]
 y=ceiling(y)
-colnames(y)=x1$label
+#colnames(y)=x1$label
 k=rowSums(cpm(y)>CPM_CUTOFF)>MINSAMPLES
 # table(k)
 y=y[k,]

diff --git a/Results-template/Scripts/pcacall.R b/Results-template/Scripts/pcacall.R
@@ -1,14 +1,15 @@
-# Example Usgae: Rscript pcacall.R 'DEG_cntrl-test_0.5_2' 'STAR_files/sampletable.txt' 'DEG_cntrl-test_0.5_2/RawCountFile_RSEM_genes_filtered.txt' 'hg19seDEG' 'Enter CCBR Project Description and Notes here.'
+# Example Usgae: Rscript pcacall.R 'DEG_cntrl-test_0.5_2' 'outfilename.html'  'STAR_files/sampletable.txt' 'DEG_cntrl-test_0.5_2/RawCountFile_RSEM_genes_filtered.txt' 'hg19seDEG' 'Enter CCBR Project Description and Notes here.'
 ## grab args
 args <- commandArgs(trailingOnly = TRUE)
 DIR <- args[1]
+outHtml <- args[2]
 # Sys.setenv(RSTUDIO_PANDOC="/Applications/RStudio.app/Contents/MacOS/pandoc")
 setwd(DIR) # new 
-rmarkdown::render("PcaReport.Rmd", params = list(
+rmarkdown::render("PcaReport.Rmd",output_file=outHtml, params = list(
     folder = args[1],
-    sampleinfo = args[2],
-    data = args[3],
-    projectId = args[4],
-    projectDesc = args[5]
+    sampleinfo = args[3],
+    data = args[4],
+    projectId = args[5],
+    projectDesc = args[6]
   ))
 
diff --git a/Rules/initialqcrnaseq.snakefile b/Rules/initialqcrnaseq.snakefile
@@ -1,9 +1,10 @@
 from snakemake.utils import R
 from os.path import join
-configfile: "run.json"
-
 from os import listdir
 
+configfile: "run.json"
+samples=config['project']['groups']['rsamps']
+
 def check_existence(filename):
   if not os.path.exists(filename):
     exit("File: %s does not exists!"%(filename))
@@ -653,18 +654,20 @@ rule samplecondition:
    params: 
     rname='pl:samplecondition',
     batch='--mem=4g --time=10:00:00', 
+    pathprefix=join(workpath,star_dir),
     groups=config['project']['groups']['rgroups'],
     labels=config['project']['groups']['rlabels'],
+    allsamples=config['project']['groups']['rsamps'],
     gtffile=config['references'][pfamily]['GTFFILE']
    run:
         with open(output.out1, "w") as out:
             out.write("sampleName\tfileName\tcondition\tlabel\n")
             i=0
             for f in input.files:
-                out.write("%s\t"  % f)
-                out.write("%s\t"  % f)
-                out.write("%s\t" % params.groups[i])
-                out.write("%s\n" % params.labels[i])                
+                out.write("{}/{}.star.count.txt\t".format(params.pathprefix, params.allsamples[i]))
+                out.write("{}/{}.star.count.txt\t".format(params.pathprefix, params.allsamples[i]))
+                out.write("{}\t".format(params.groups[i]))
+                out.write("{}\n".format(params.labels[i])) 
                 i=i+1
             out.close()
 

diff --git a/Rules/rnaseq.snakefile b/Rules/rnaseq.snakefile
@@ -77,6 +77,7 @@ if config['project']['DEG'] == "yes" and config['project']['TRIM'] == "yes":
     input:
       #Initialize DEG (Filter raw counts matrix by cpm and min sample thresholds)
       expand(join(workpath,"DEG_{deg_dir}", "RawCountFile_RSEM_genes_filtered.txt"), deg_dir=contrasts_w_cpm_cutoff_list),
+      expand(join(workpath,"DEG_{deg_dir}", "RawCountFile_RSEM_genes.txt"), deg_dir=contrasts_w_cpm_cutoff_list),
 
       #EBSeq
       expand(join(workpath,"DEG_{deg_dir}", "EBSeq_isoform_completed.txt"), deg_dir=contrasts_w_cpm_cutoff_list),
@@ -104,17 +105,25 @@ rule init_deg:
     rawcountstab=join(workpath,"DEG_ALL","RawCountFile_RSEM_genes.txt"),
   output:
     sampletable=join(workpath,"DEG_{group1}-{group2}_{mincpm}_{minsample}","sampletable.txt"),
+    rawcounts=join(workpath,"DEG_{group1}-{group2}_{mincpm}_{minsample}","RawCountFile_RSEM_genes.txt"),
     filteredcountstabs=join(workpath,"DEG_{group1}-{group2}_{mincpm}_{minsample}","RawCountFile_RSEM_genes_filtered.txt"),
   params:
     rname="pl:init_deg_dir",
     rscript=join(workpath,"Scripts","filtersamples.R"),
     rver=config['bin'][pfamily]['tool_versions']['RVER'],
+    filterscript=join(workpath,"Scripts","filterSampleTable.py"),
+    pythonver=config['bin'][pfamily]['tool_versions']['PYTHONVER'],
+    groups=",".join(config['project']['groups']['rgroups']),
+    labels=",".join(config['project']['groups']['rlabels']),
+    allsamples=",".join(config['project']['groups']['rsamps']),
     outdir=join(workpath,"DEG_{group1}-{group2}_{mincpm}_{minsample}"),
   shell:"""
 if [ ! -d {params.outdir} ]; then mkdir {params.outdir} ;fi
 cd {params.outdir}
 module load {params.rver}
-Rscript {params.rscript} {params.outdir} {wildcards.group1} {wildcards.group2} {wildcards.mincpm} {wildcards.minsample} {input.sampletable} {input.rawcountstab} {output.sampletable} {output.filteredcountstabs}
+module load {params.pythonver}
+python {params.filterscript} -c '{wildcards.group1} {wildcards.group2}' -s '{params.allsamples}' -g '{params.groups}' -l '{params.labels}' -r '{input.rawcountstab}' -outsample '{output.sampletable}' -outraw '{output.rawcounts}'
+Rscript {params.rscript} {params.outdir} {wildcards.group1} {wildcards.group2} {wildcards.mincpm} {wildcards.minsample} {output.sampletable} {output.rawcounts} {output.filteredcountstabs}
 """
 
 rule EBSeq_isoform:
@@ -261,7 +270,7 @@ rule pca:
     file1=join(workpath,star_dir,"sampletable.txt"),
     file2=join(workpath,"DEG_{dtype}","RawCountFile_RSEM_genes_filtered.txt"),
   output: 
-    join(workpath,"DEG_{dtype}","PcaReport.html")
+    outhtml=join(workpath,"DEG_{dtype}","PcaReport.html")
   params: 
     rname='pl:pca',
     batch='--mem=24g --time=10:00:00',
@@ -278,7 +287,7 @@ cp {params.rscript1} {params.outdir}
 cp {params.rscript2} {params.outdir}
 cd {params.outdir}
 module load {params.rver}
-Rscript pcacall.R '{params.outdir}' '{input.file1}' '{input.file2}' '{params.projectId}' '{params.projDesc}'
+Rscript pcacall.R '{params.outdir}' '{output.outhtml}'  '{input.file1}' '{input.file2}' '{params.projectId}' '{params.projDesc}'
 """
 
 rule vennDiagram:
@@ -303,3 +312,4 @@ cd {params.outdir}
 module load {params.rver}
 Rscript {params.rscript} --limma '{input.limmafile}' --edgeR '{input.edgeRfile}' --DESeq2 '{input.deseqfile}'
 """
+
diff --git a/gui/frame.py b/gui/frame.py
@@ -352,7 +352,17 @@ def init_work_dir( self ):
             showinfo( "Success", "The work directory has successfully initialized!")
         else :
             showerror( "Symlink failed", "" )
-
+
+    def popup_warning(self, filename):
+        try:
+            fname = self.workpath.get() + '/' + filename
+            fh = open(fname, 'r')
+            return False
+        except IOError:
+            print("Could not find required file {} in your working directory:{}".format(filename, self.workpath.get()))
+            showerror("Error","{0} file not found!\nDid you initialize the working directory?\nDid you copy {0} into the working directory?".format(filename))
+            return True
+
     def popup_window( self, text, filename ) :
         top = Toplevel()
 
@@ -691,6 +701,11 @@ def readpaste( self, ftype, comments ):
         return
 
     def dryrun( self ) :
+        pl = self.pipeline_name
+        print("Drying-running: {}".format(pl))
+        if pl == 'RNAseq':
+            if self.popup_warning('groups.tab') or self.popup_warning('contrasts.tab'):
+                return
         self.makejson("none")
         #self.run_button.config( state='active' )
         return self.cmd( "--dryrun -p -s %s/Snakefile --rerun-incomplete -d %s" % ( self.workpath.get(), self.workpath.get()))