Skip to content
This repository has been archived by the owner on Oct 23, 2023. It is now read-only.

Commit

Permalink
Merge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…
Browse files Browse the repository at this point in the history
…tiveDev
  • Loading branch information
jlac committed Oct 15, 2018
2 parents c96c899 + 419bc33 commit 8a9987e
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 21 deletions.
115 changes: 115 additions & 0 deletions Results-template/Scripts/filterSampleTable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
##############################################################################################
# Description: Creates a new sampletable.txt for differential expression analysis which
# is necessary because a user may remove samples from their groups.tab file
# after running QC. The new sampletable.txt file is created from the
# groups.tab file that is provided when running DE half of the pipeline.
# Also a corresponding RawCounts_RSEM_genes.txt file is created which only
# contains information for samples that that are specificed in groups.tab.
# This file is written to DEG_{group1}-{group2}_{mincpm}_{minsample} folder.
# It is necessary that the order of the groups listed in sampletable.txt and
# and RawCounts_RSEM_genes_filtered.txt match. If they do not the reported
# direction (sign: postive or negative) of each gene's fold-change will be
# opposite of what it should be. This program also addresses that problem.
# USAGE: python filterSampleTable.py -s 'rbfoxKO_1,rbfoxKO_2,control_1,control_2'\
# -g 'KO,KO,WT,WT' -l 'KO_1,KO_2,WT_1,WT_2' \
# -r 'RawCountFile_RSEM_genes.txt' \
# -outsample 'outfolder/sampletable.txt'\
# -outraw 'outfolder/RawCountFile_RSEM_genes.txt'
##############################################################################################

from __future__ import print_function, division
import os, sys
import argparse # module load python/3.5




def filteredRawCounts(rawcountsfilename, samples, outrawfilename):
'''Generator that yields rawcounts information for samples in the current DEG groups.tab file.
Helper function checks formating of the headers between the newly generated sampletable.txt and
input RawCounts_RSEM_genes.txt file. If the order of groups formatting differs, it fixes it to match
the ordering in sampletable.txt (fail-safe to ensure later referential integrity at DE).
'''
def formatted(headerlist, samples, linelist):
headerIndexes = {}
symbol = linelist[0]
# Create Dictionary for mapping samplename in header of RawCounts_RSEM_genes.txt to its position
for i in range(len(headerlist)):
headerIndexes[headerlist[i]] = i
indexlist = []
formattedlist = [symbol]
for sample in samples:
try:
indexlist.append(headerIndexes[sample])
formattedlist.append(linelist[headerIndexes[sample]])
except KeyError:
raise Exception('Key Error: Failed to find sample {} in RawCounts_RSEM_genes.txt.\nDid you add an additional sample to groups.tab?'.format(sample))
return formattedlist

rawfh = open(rawcountsfilename, 'r')
headerlist = [ fn.split('/')[-1].split('.')[0] for fn in next(rawfh).strip().split('\t')]

newheader = '\t'.join([headerlist[0]]+ samples) + '\n'
print(newheader)
outfh = open(outrawfilename, 'w')
outfh.write(newheader)
for line in rawfh:
linelist = line.strip().split('\t')
formattedlist = formatted(headerlist, samples, linelist)
#print(formattedlist)
outfh.write('\t'.join(formattedlist) + '\n')
rawfh.close()
outfh.close()

def SampleTable(samples, groups, labels,contrasts):
'''Generator that yields sample, group, and label info from the current DEG groups.tab file'''
for i in range(len(samples)):
if groups[i] in contrasts:
yield samples[i], groups[i], labels[i]



if __name__ == '__main__':

# USAGE: python filterSampleTable.py -c 'KO WT' -s 'rbfoxKO_1,rbfoxKO_2,rbfoxKO_3,control_1,control_2,control_3' -g 'KO,KO,KO,WT,WT,WT' -l 'KO_1,KO_2,KO_3,WT_1,WT_2,WT_3' -r 'RawCountFile_RSEM_genes.txt' -outsample 'outfolder/sampletable.txt' -outraw 'outfolder/RawCountFile_RSEM_genes.txt'
# Parse Command-line arguments
parser = argparse.ArgumentParser(description='Filters sampletable.txt, cannot always use the sampletable that is generated in QC, sometimes DE is run with less samples.')
parser.add_argument('-r','--inputRawCounts', type=str, required=True, help='Input RawCounts_RSEM_genes.txt file that is generated QC: (required)')
parser.add_argument('-outraw','--outputRawCounts', type=str, required=True, help='Output RawCounts_RSEM_genes.txt filename (required)')
parser.add_argument('-outsample','--outputSampleTable', type=str, required=True, help='Output RawCounts_RSEM_genes.txt filename (required)')
parser.add_argument('-s','--samples', type=str, required=True, help='samples string from the run.json file (required)')
parser.add_argument('-c','--contrast', type=str, required=True, help='contrast string (required)')
parser.add_argument('-g','--groups', type=str, required=True, help='groups string from the run.json file (required)')
parser.add_argument('-l','--labels', type=str, required=True, help='labels string from the run.json file (required)')
args = parser.parse_args()

sampleslist = args.samples.split(',')
groupslist = args.groups.split(',')
labelslist = args.labels.split(',')
contrastlist = args.contrast.split(' ')
print(sampleslist)
print(groupslist)
print(labelslist)


#Creating the new sampletable.txt file based off of the current DEG groups.tab information
samptablefh = open(args.outputSampleTable, 'w')
samptablefh.write('sampleName\tfileName\tcondition\tlabel\n')
samplesincontrast = []
groupset = []
for sample, group, label in SampleTable(sampleslist, groupslist, labelslist, contrastlist):
print('{0}\t{0}\t{1}\t{2}'.format(sample, group, label))
samplesincontrast.append(sample)
groupset.append(group)
samptablefh.write('{0}\t{0}\t{1}\t{2}\n'.format(sample, group, label))
samptablefh.close()

if len(set(groupset)) < 2:
raise Exception('Error in groups.tab file. Contrast does not contain two groups!')

#Creating new RawCounts_RSEM_genes.txt file based off of the current DEG groups.tab information
rawfn = args.inputRawCounts
outrawfn = args.outputRawCounts
filteredRawCounts(rawfn, samplesincontrast, outrawfn)


9 changes: 4 additions & 5 deletions Results-template/Scripts/filtersamples.R
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,21 @@ CPM_CUTOFF <- args[4]
MINSAMPLES <- args[5]
SAMPLETABLE <- args[6]
RAWCOUNTSTABLE <- args[7]
OUTSAMPLETABLE <- args[8]
FILTEREDRAWCOUNTSTABLE <- args[9]
FILTEREDRAWCOUNTSTABLE <- args[8]

x=read.table(SAMPLETABLE,header = T,sep="\t")
samples=(x$condition==G1 | x$condition==G2)

x1=x[samples,]
write.table(x1,file=OUTSAMPLETABLE,sep="\t",col.names=T,quote=F,row.names = F)
#write.table(x1,file=OUTSAMPLETABLE,sep="\t",col.names=T,quote=F,row.names = F)


y=read.table(RAWCOUNTSTABLE,header = T,sep = "\t")
row.names(y)=y$symbol
y=y[,-which(names(y) %in% ("symbol"))]
y=y[,samples]
#y=y[,samples]
y=ceiling(y)
colnames(y)=x1$label
#colnames(y)=x1$label
k=rowSums(cpm(y)>CPM_CUTOFF)>MINSAMPLES
# table(k)
y=y[k,]
Expand Down
13 changes: 7 additions & 6 deletions Results-template/Scripts/pcacall.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
# Example Usgae: Rscript pcacall.R 'DEG_cntrl-test_0.5_2' 'STAR_files/sampletable.txt' 'DEG_cntrl-test_0.5_2/RawCountFile_RSEM_genes_filtered.txt' 'hg19seDEG' 'Enter CCBR Project Description and Notes here.'
# Example Usgae: Rscript pcacall.R 'DEG_cntrl-test_0.5_2' 'outfilename.html' 'STAR_files/sampletable.txt' 'DEG_cntrl-test_0.5_2/RawCountFile_RSEM_genes_filtered.txt' 'hg19seDEG' 'Enter CCBR Project Description and Notes here.'
## grab args
args <- commandArgs(trailingOnly = TRUE)
DIR <- args[1]
outHtml <- args[2]
# Sys.setenv(RSTUDIO_PANDOC="/Applications/RStudio.app/Contents/MacOS/pandoc")
setwd(DIR) # new
rmarkdown::render("PcaReport.Rmd", params = list(
rmarkdown::render("PcaReport.Rmd",output_file=outHtml, params = list(
folder = args[1],
sampleinfo = args[2],
data = args[3],
projectId = args[4],
projectDesc = args[5]
sampleinfo = args[3],
data = args[4],
projectId = args[5],
projectDesc = args[6]
))

15 changes: 9 additions & 6 deletions Rules/initialqcrnaseq.snakefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from snakemake.utils import R
from os.path import join
configfile: "run.json"

from os import listdir

configfile: "run.json"
samples=config['project']['groups']['rsamps']

def check_existence(filename):
if not os.path.exists(filename):
exit("File: %s does not exists!"%(filename))
Expand Down Expand Up @@ -653,18 +654,20 @@ rule samplecondition:
params:
rname='pl:samplecondition',
batch='--mem=4g --time=10:00:00',
pathprefix=join(workpath,star_dir),
groups=config['project']['groups']['rgroups'],
labels=config['project']['groups']['rlabels'],
allsamples=config['project']['groups']['rsamps'],
gtffile=config['references'][pfamily]['GTFFILE']
run:
with open(output.out1, "w") as out:
out.write("sampleName\tfileName\tcondition\tlabel\n")
i=0
for f in input.files:
out.write("%s\t" % f)
out.write("%s\t" % f)
out.write("%s\t" % params.groups[i])
out.write("%s\n" % params.labels[i])
out.write("{}/{}.star.count.txt\t".format(params.pathprefix, params.allsamples[i]))
out.write("{}/{}.star.count.txt\t".format(params.pathprefix, params.allsamples[i]))
out.write("{}\t".format(params.groups[i]))
out.write("{}\n".format(params.labels[i]))
i=i+1
out.close()

Expand Down
16 changes: 13 additions & 3 deletions Rules/rnaseq.snakefile
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ if config['project']['DEG'] == "yes" and config['project']['TRIM'] == "yes":
input:
#Initialize DEG (Filter raw counts matrix by cpm and min sample thresholds)
expand(join(workpath,"DEG_{deg_dir}", "RawCountFile_RSEM_genes_filtered.txt"), deg_dir=contrasts_w_cpm_cutoff_list),
expand(join(workpath,"DEG_{deg_dir}", "RawCountFile_RSEM_genes.txt"), deg_dir=contrasts_w_cpm_cutoff_list),

#EBSeq
expand(join(workpath,"DEG_{deg_dir}", "EBSeq_isoform_completed.txt"), deg_dir=contrasts_w_cpm_cutoff_list),
Expand Down Expand Up @@ -104,17 +105,25 @@ rule init_deg:
rawcountstab=join(workpath,"DEG_ALL","RawCountFile_RSEM_genes.txt"),
output:
sampletable=join(workpath,"DEG_{group1}-{group2}_{mincpm}_{minsample}","sampletable.txt"),
rawcounts=join(workpath,"DEG_{group1}-{group2}_{mincpm}_{minsample}","RawCountFile_RSEM_genes.txt"),
filteredcountstabs=join(workpath,"DEG_{group1}-{group2}_{mincpm}_{minsample}","RawCountFile_RSEM_genes_filtered.txt"),
params:
rname="pl:init_deg_dir",
rscript=join(workpath,"Scripts","filtersamples.R"),
rver=config['bin'][pfamily]['tool_versions']['RVER'],
filterscript=join(workpath,"Scripts","filterSampleTable.py"),
pythonver=config['bin'][pfamily]['tool_versions']['PYTHONVER'],
groups=",".join(config['project']['groups']['rgroups']),
labels=",".join(config['project']['groups']['rlabels']),
allsamples=",".join(config['project']['groups']['rsamps']),
outdir=join(workpath,"DEG_{group1}-{group2}_{mincpm}_{minsample}"),
shell:"""
if [ ! -d {params.outdir} ]; then mkdir {params.outdir} ;fi
cd {params.outdir}
module load {params.rver}
Rscript {params.rscript} {params.outdir} {wildcards.group1} {wildcards.group2} {wildcards.mincpm} {wildcards.minsample} {input.sampletable} {input.rawcountstab} {output.sampletable} {output.filteredcountstabs}
module load {params.pythonver}
python {params.filterscript} -c '{wildcards.group1} {wildcards.group2}' -s '{params.allsamples}' -g '{params.groups}' -l '{params.labels}' -r '{input.rawcountstab}' -outsample '{output.sampletable}' -outraw '{output.rawcounts}'
Rscript {params.rscript} {params.outdir} {wildcards.group1} {wildcards.group2} {wildcards.mincpm} {wildcards.minsample} {output.sampletable} {output.rawcounts} {output.filteredcountstabs}
"""

rule EBSeq_isoform:
Expand Down Expand Up @@ -261,7 +270,7 @@ rule pca:
file1=join(workpath,star_dir,"sampletable.txt"),
file2=join(workpath,"DEG_{dtype}","RawCountFile_RSEM_genes_filtered.txt"),
output:
join(workpath,"DEG_{dtype}","PcaReport.html")
outhtml=join(workpath,"DEG_{dtype}","PcaReport.html")
params:
rname='pl:pca',
batch='--mem=24g --time=10:00:00',
Expand All @@ -278,7 +287,7 @@ cp {params.rscript1} {params.outdir}
cp {params.rscript2} {params.outdir}
cd {params.outdir}
module load {params.rver}
Rscript pcacall.R '{params.outdir}' '{input.file1}' '{input.file2}' '{params.projectId}' '{params.projDesc}'
Rscript pcacall.R '{params.outdir}' '{output.outhtml}' '{input.file1}' '{input.file2}' '{params.projectId}' '{params.projDesc}'
"""

rule vennDiagram:
Expand All @@ -303,3 +312,4 @@ cd {params.outdir}
module load {params.rver}
Rscript {params.rscript} --limma '{input.limmafile}' --edgeR '{input.edgeRfile}' --DESeq2 '{input.deseqfile}'
"""

17 changes: 16 additions & 1 deletion gui/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,17 @@ def init_work_dir( self ):
showinfo( "Success", "The work directory has successfully initialized!")
else :
showerror( "Symlink failed", "" )


def popup_warning(self, filename):
try:
fname = self.workpath.get() + '/' + filename
fh = open(fname, 'r')
return False
except IOError:
print("Could not find required file {} in your working directory:{}".format(filename, self.workpath.get()))
showerror("Error","{0} file not found!\nDid you initialize the working directory?\nDid you copy {0} into the working directory?".format(filename))
return True

def popup_window( self, text, filename ) :
top = Toplevel()

Expand Down Expand Up @@ -691,6 +701,11 @@ def readpaste( self, ftype, comments ):
return

def dryrun( self ) :
pl = self.pipeline_name
print("Drying-running: {}".format(pl))
if pl == 'RNAseq':
if self.popup_warning('groups.tab') or self.popup_warning('contrasts.tab'):
return
self.makejson("none")
#self.run_button.config( state='active' )
return self.cmd( "--dryrun -p -s %s/Snakefile --rerun-incomplete -d %s" % ( self.workpath.get(), self.workpath.get()))
Expand Down

0 comments on commit 8a9987e

Please sign in to comment.