Skip to content
This repository has been archived by the owner on Oct 23, 2023. It is now read-only.

Commit

Permalink
Merge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…
Browse files Browse the repository at this point in the history
…tiveDev
  • Loading branch information
jlac committed Mar 18, 2019
2 parents 6673df4 + 5ebe126 commit 02191ea
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 43 deletions.
19 changes: 16 additions & 3 deletions Results-template/Scripts/filtersamples.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,24 @@ x=read.table(SAMPLETABLE,header = T,sep="\t")

samples=(x$condition==G1 | x$condition==G2)
x1=x[samples,]

g1_samples=(x1$condition==G1)
ng1=min(length(g1_samples[g1_samples==TRUE]),MINSAMPLES)
g2_samples=(x1$condition==G2)
ng2=min(length(g2_samples[g2_samples==TRUE]),MINSAMPLES)


if (MINSAMPLES<0.5) {
MINSAMPLES=0.5
}
if (MINSAMPLES<1) {
ng1=max(1,floor(length(g1_samples[g1_samples==TRUE])*MINSAMPLES))
ng2=max(1,floor(length(g2_samples[g2_samples==TRUE])*MINSAMPLES))
}
if (MINSAMPLES>=1){
ng1=min(length(g1_samples[g1_samples==TRUE]),MINSAMPLES)
ng2=min(length(g2_samples[g2_samples==TRUE]),MINSAMPLES)
}

ng1
ng2

y=read.table(RAWCOUNTSTABLE,header = T,sep = "\t")
y=y[,samples]
Expand Down
101 changes: 68 additions & 33 deletions Results-template/Scripts/rsemcounts.R
Original file line number Diff line number Diff line change
@@ -1,64 +1,99 @@
library('reshape')
library('ggplot2')
library('edgeR')
library('DESeq2')
library('tidyverse')

writegzfile <- function(m,f) {
m=as.data.frame(m)
m$id=rownames(m)
m=separate(data=m,col=id,into=c('ensID','geneName'),sep="\\|",remove=TRUE)
m=m %>% select('ensID','geneName',everything())
write.table(m,file=gzfile(f),sep="\t",row.names = FALSE,quote=F)
}

args <- commandArgs(trailingOnly = TRUE)
DIR <- args[1]
FILES <- args[2]
MINCOUNT <- args[3]
MINSAMPLES <- args[4]
ANNOTATE <- args[5]
ANNOTATE <- args[3]
SAMPLETABLE <- args[4]

#SAMPLETABLE="fullsampletable.txt"
#DIR="~/Desktop/Temp/ccbr842/RNASeq/RSEM_filtering"
#FILES=c("iv_4T1C1_1.RSEM.genes.results iv_4T1C1_2.RSEM.genes.results iv_NF1_1.RSEM.genes.results iv_NF1_2.RSEM.genes.results iv_TSC1_1.RSEM.genes.results iv_TSC1_2.RSEM.genes.results iv_Tgfbr_2_1.RSEM.genes.results iv_Tgfbr_2_2.RSEM.genes.results")
#ANNOTATE="annotate.genes.txt"

MINCOUNT=0.5
MINSAMPLES=0.5


setwd(DIR)
x=read.table(SAMPLETABLE,header = T,sep="\t")
myfiles=as.character(unlist(strsplit(FILES, split=" ")))
res=read.delim(myfiles[1],header=T)[,c(1,5)]
# colnames(res)[1]="gene"
res=read.delim(myfiles[1],header=T)[,c(1,5)]
colnames(res)[2]=as.character(myfiles[1])
# remove the last 5 statistics lines ...
# nr=dim(res)[1]
# res=res[-c((nr-4):nr),]
#
for(i in seq(2, length(myfiles), by = 1))
{{
temp=read.delim(myfiles[i],header=T)[,c(1,5)]
#colnames(temp)[1]="gene"
colnames(temp)[2]=as.character(myfiles[i])
res=merge(res,temp)
}}

gene_name=read.delim(ANNOTATE,header=F,sep=" ")
res2=merge(gene_name,res,by.x=1,by.y=1)
res3=cbind(symbol=paste(res2[,1],"|",res2[,3],sep=""),res2[,-c(1,2,3,4,5)])
write.table(as.data.frame(res3),file="RawCountFile_RSEM_genes.txt",sep="\t",row.names=F)
#
mydata=read.delim("RawCountFile_RSEM_genes.txt",row.names=1)
# rounding
mydata=round(mydata)
val1=as.numeric(MINCOUNT)
val2=as.numeric(MINSAMPLES)
# cat(val1," ", val2, "checking..\n",file="check.txt")
## filter <- apply(mydata, 1, function(x) length(x[x>val1])>=val2)
## res=mydata[filter,]
tot=colSums(mydata)
val1=(val1/max(tot))*1e6
# cat(val1," ", val2, "checking..\n",file="check2.txt")
filter <- apply(cpm(mydata), 1, function(x) length(x[x>val1])>=val2)
res=mydata[filter,]
write.table(as.data.frame(res),file="RawCountFile_RSEM_genes_filtered.txt",sep="\t",col.names=NA,quote=F)
png("RSEM_HistBeforenormFilter.png")
df.m <- melt(as.data.frame(res))
print(ggplot(df.m) + geom_density(aes(x = value, colour = variable)) + labs(x = NULL) + theme(legend.position='top') + scale_x_log10())
dev.off()
write.table(as.data.frame(res3),file="RawCountFile_RSEM_genes.txt",sep="\t",row.names=F,quote = F)
mydata=read.delim("RawCountFile_RSEM_genes.txt",row.names=1,check.names=FALSE,header=T)
print(colnames(mydata))
colnames(mydata)=gsub('\\..*$','',colnames(mydata))
print(colnames(mydata))
colnames(mydata)=gsub('.*/','',colnames(mydata))
print(colnames(mydata))
mydata=ceiling(mydata)
writegzfile(cpm(mydata),"RSEM_CPM_counts.txt.gz")

groups=levels(x$condition)
G1=groups[1]
g1_samples=(x$condition==G1)
ng1=max(1,floor(length(g1_samples[g1_samples==TRUE])*MINSAMPLES))
CPM_CUTOFF=MINCOUNT
mydata1=mydata[,g1_samples]
k_g1=rowSums(cpm(mydata1)>CPM_CUTOFF)>=ng1
k=k_g1
table(k)

for(i in seq(2,length(levels(x$condition)))){
Gi=groups[i]
gi_samples=(x$condition==Gi)
ngi=max(1,floor(length(gi_samples[gi_samples==TRUE])*MINSAMPLES))
mydatai=mydata[,gi_samples]
k_gi=rowSums(cpm(mydatai)>CPM_CUTOFF)>=ngi
k=k|k_gi
print(table(k))
}

res=mydata[k,]
res2=res
res2$symbol=rownames(res2)
res2=res2 %>% select('symbol',everything())
write.table(res2,file="RawCountFile_RSEM_genes_filtered.txt",row.names = F,quote = F,sep="\t")
# png("RSEM_HistBeforenormFilter.png")
# df.m <- melt(as.data.frame(res))
# print(ggplot(df.m) + geom_density(aes(x = value, colour = variable)) + labs(x = NULL) + theme(legend.position='top') + scale_x_log10())
# dev.off()
y = DGEList(counts=res)
## Normalization TMM ------------------------------------------------------------
## method = =c("TMM","RLE","upperquartile","none")
y <- calcNormFactors(y,method="TMM")
ndata= cpm(y,log=FALSE,normalized.lib.sizes=TRUE)
## save it
write.table(ndata,file="RSEM_CPM_TMM_counts.txt",sep="\t",col.names=NA,quote=F)
writegzfile(ndata,"RSEM_CPM_TMM_counts.txt.gz")
## unfiltered normalization
y2 = DGEList(counts=mydata)
y2 <- calcNormFactors(y2,method="TMM")
ndata2= cpm(y2,log=FALSE,normalized.lib.sizes=TRUE)
## save it
write.table(ndata2,file="RSEM_CPM_TMM_unfiltered_counts.txt",sep="\t",col.names=NA)

writegzfile(ndata2,"RSEM_CPM_TMM_unfiltered_counts.txt.gz")
rlogres=rlog(as.matrix(res),blind=TRUE)
rownames(rlogres)=rownames(res)
writegzfile(rlogres,"RSEM_rlog_counts.txt.gz")
11 changes: 5 additions & 6 deletions Rules/initialqcrnaseq.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
rl=int(open(join(input.qcdir,"readlength.txt")).readlines()[0].strip())-1
dbrl=sorted(list(map(lambda x:int(re.findall("genes-(\d+)",x)[0]),glob.glob(params.stardir+'*/',recursive=False))))
bestdbrl=next(x[1] for x in enumerate(dbrl) if x[1] >= rl)
cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" "+input.file2+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". --outSAMtype BAM Unsorted"
cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" "+input.file2+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". --outSAMtype BAM Unsorted"
shell(cmd)
A=open(join(workpath,"run.json"),'r')
a=eval(A.read())
Expand Down Expand Up @@ -490,7 +490,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
rl=int(open(join(input.qcdir,"readlength.txt")).readlines()[0].strip())-1
dbrl=sorted(list(map(lambda x:int(re.findall("genes-(\d+)",x)[0]),glob.glob(params.stardir+'*/',recursive=False))))
bestdbrl=next(x[1] for x in enumerate(dbrl) if x[1] >= rl)
cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". --outSAMtype BAM Unsorted"
cmd="cd {star_dir}; module load "+params.starver+"; STAR --genomeDir "+params.stardir+str(bestdbrl)+" --outFilterIntronMotifs "+params.filterintronmotifs+" --outSAMstrandField "+params.samstrandfield+" --outFilterType "+params.filtertype+" --outFilterMultimapNmax "+str(params.filtermultimapnmax)+" --alignSJoverhangMin "+str(params.alignsjoverhangmin)+" --alignSJDBoverhangMin "+str(params.alignsjdboverhangmin)+" --outFilterMismatchNmax "+str(params.filtermismatchnmax)+" --outFilterMismatchNoverLmax "+str(params.filtermismatchnoverlmax)+" --alignIntronMin "+str(params.alignintronmin)+" --alignIntronMax "+str(params.alignintronmax)+" --alignMatesGapMax "+str(params.alignmatesgapmax)+" --clip3pAdapterSeq "+params.adapter1+" "+params.adapter2+" --readFilesIn "+input.file1+" --readFilesCommand zcat --runThreadN "+str(threads)+" --outFileNamePrefix "+params.prefix+". --outSAMtype BAM Unsorted"
shell(cmd)
A=open(join(workpath,"run.json"),'r')
a=eval(A.read())
Expand Down Expand Up @@ -580,7 +580,7 @@ rule samplecondition:
out.write("sampleName\tfileName\tcondition\tlabel\n")
i=0
for f in input.files:
out.write("{}/{}.star.count.txt\t".format(params.pathprefix, params.allsamples[i]))
out.write("{}\t".format(params.allsamples[i]))
out.write("{}/{}.star.count.txt\t".format(params.pathprefix, params.allsamples[i]))
out.write("{}\t".format(params.groups[i]))
out.write("{}\n".format(params.labels[i]))
Expand Down Expand Up @@ -817,21 +817,20 @@ python {params.pythonscript} {params.annotate} {degall_dir} {degall_dir}
rule rsemcounts:
input:
files=expand(join(workpath,degall_dir,"{name}.RSEM.genes.results"), name=samples),
sampletable=join(workpath,star_dir,"sampletable.txt")
output:
join(workpath,degall_dir,"RawCountFile_RSEM_genes_filtered.txt"),
params:
rname='pl:rsemcounts',
batch='--mem=8g --time=10:00:00',
outdir=join(workpath,degall_dir),
mincount=config['project']['MINCOUNTGENES'],
minsamples=config['project']['MINSAMPLES'],
annotate=config['references'][pfamily]['ANNOTATE'],
rver=config['bin'][pfamily]['tool_versions']['RVER'],
rscript=join(workpath,"Scripts","rsemcounts.R")
shell: """
cd {params.outdir}
module load {params.rver}
Rscript {params.rscript} '{params.outdir}' '{input.files}' '{params.mincount}' '{params.minsamples}' '{params.annotate}'
Rscript {params.rscript} '{params.outdir}' '{input.files}' '{params.annotate}' '{input.sampletable}'
"""


Expand Down
17 changes: 16 additions & 1 deletion gui/rnaseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,14 +255,29 @@ def makejson(self, *args):
cont_cpm_cutoff=[]
cont_mincount=[]
## lbl=[]
## contrasts.tab can have 2,3 or 4 columns
# col 1 and 2 are group1 and group2 respectively
# col 3 is CPM cutoff value ... if not provided 0.5 will be used
# col 4 is either min number of sample per group that need to satisfy the cpm cutoff criteria or fraction of samples that need
# to satisfy the criteria... can be integer >=1 ... or fraction >=0.5 and <1
for x in f:
xsplit=x.split()
if len(xsplit) == 4:
cont.append(xsplit[0])
cont.append(xsplit[1])
cont_cpm_cutoff.append(xsplit[2])
cont_mincount.append(xsplit[3])

elif len(xsplit) == 3:
cont.append(xsplit[0])
cont.append(xsplit[1])
cont_cpm_cutoff.append(xsplit[2])
cont_mincount.append("0.5")
elif len(xsplit) == 2:
cont.append(xsplit[0])
cont.append(xsplit[1])
cont_cpm_cutoff.append("0.5")
cont_mincount.append("0.5")

D["rcontrasts"]=cont
D["rcontrasts_cpm_cutoff"]=cont_cpm_cutoff
D["rcontrasts_min_counts"]=cont_mincount
Expand Down

0 comments on commit 02191ea

Please sign in to comment.