This is a test Jupyter Notebook for Dr. Byrum's RNASeq Analysis Pipeline

To install R in Jupyter Notebook, use the following command from the Anaconda terminal.
conda install -c r r-irkernel

In [1]:
library(edgeR)



Loading required package: limma



The data has been trimmed, aligned to the reference genome, and counts matrix generated. The first step of Differential Expression Analysis is to load the sample information and the counts matrix.
nf-core RNAseq pipeline.

<img src ="https://raw.githubusercontent.com/nf-core/rnaseq/3.8.1/docs/images/nf-core-rnaseq_metro_map_grey.png" alt ="nf-core">

In [3]:
# Load the sample sheet (columns: sample, group, batch) and display it
targets <- read.csv(
  "targets.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
targets

# Load the count matrix; the first CSV column supplies the row names
GenewiseCounts <- read.csv(
  "htseq_counts_12.csv",
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
head(GenewiseCounts)

sample,group,batch
<chr>,<chr>,<chr>
D1,CON,b1
D2,CON,b1
D3,CON,b1
D4,CON,b1
D5,CON,b1
D6,CON,b2
E1,EPZ,b1
E2,EPZ,b1
E3,EPZ,b1
E4,EPZ,b1


Unnamed: 0_level_0,D1,D2,D3,D4,D5,D6,E1,E2,E3,E4,E5,E6
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
Adora2b,0,2,0,2,0,2,8,3,1,5,7,40
Adora2a,731,1036,749,913,761,334,825,1298,1321,1137,1096,617
Gm20741,0,0,0,0,0,0,0,0,0,0,0,0
Gm4340,0,0,0,0,0,0,0,0,0,0,0,0
Gm20744,0,0,0,0,0,0,0,0,0,0,0,0
Gm20743,0,0,0,0,0,0,0,0,0,0,0,0


<b> We must define the sample group as factors and set the levels option so the group is not changed to alphabetical order. Also, define the batch column. </b>

In [5]:
# Turn the design variables into factors. Levels are taken in order of first
# appearance in the sample sheet, so they are not re-sorted alphabetically.
group_levels <- unique(targets$group)
targets$group <- factor(targets$group, levels = group_levels)
table(targets$group)
# levels(targets$group)

batch_levels <- unique(targets$batch)
targets$batch <- factor(as.character(targets$batch), levels = batch_levels)
table(targets$batch)
# levels(targets$batch)

# Gather the candidate model factors in one data.frame keyed by sample name
my_targets <- data.frame(group = targets$group, batch = targets$batch)
rownames(my_targets) <- targets$sample
# head(my_targets)

# table(my_targets$group)
# table(my_targets$batch)


CON EPZ 
  6   6 


b1 b2 
10  2 

<b> DGEList() = Creates a DGEList object from a table of counts (rows=features, columns=samples), group indicator for each column, library size (optional) and a table of feature annotation (optional).</b>

In [8]:
# Create the edgeR DGEList from the count matrix.
#
# `group` is attached to the count columns purely by position, so stop early
# if the sample sheet is not in the same order as the columns of the counts.
if (!identical(as.character(targets$sample), colnames(GenewiseCounts))) {
  stop(
    "targets$sample must match colnames(GenewiseCounts), in the same order",
    call. = FALSE
  )
}

y <- DGEList(
  counts = GenewiseCounts,
  group = targets$group,
  genes = rownames(GenewiseCounts),
  remove.zeros = FALSE
)
# Give the single annotation column created from the row names a stable name
colnames(y$genes)[1] <- "gene_id"

# Quick look at the three components of the object
head(y$counts)[, 1:5]
head(y$genes)
head(y$samples)


Unnamed: 0,D1,D2,D3,D4,D5
Adora2b,0,2,0,2,0
Adora2a,731,1036,749,913,761
Gm20741,0,0,0,0,0
Gm4340,0,0,0,0,0
Gm20744,0,0,0,0,0
Gm20743,0,0,0,0,0


Unnamed: 0_level_0,gene_id
Unnamed: 0_level_1,<chr>
Adora2b,Adora2b
Adora2a,Adora2a
Gm20741,Gm20741
Gm4340,Gm4340
Gm20744,Gm20744
Gm20743,Gm20743


Unnamed: 0_level_0,group,lib.size,norm.factors
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>
D1,CON,22385619,1
D2,CON,28092434,1
D3,CON,19372856,1
D4,CON,27283589,1
D5,CON,23735343,1
D6,CON,17215483,1


[1] "used DGEList()"


<b> Load the gene annotation from the Mus musculus database and add to the DGEList object </b>

In [9]:
library(AnnotationDbi, quietly=TRUE)
library(org.Mm.eg.db, quietly=TRUE)


Attaching package: 'BiocGenerics'


The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following object is masked from 'package:limma':

    plotMA


The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs


The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconducto

In [13]:
# org.Mm.eg.db
# keytypes(org.Mm.eg.db)

# The row names of y$genes are used as SYMBOL keys. For each annotation
# column, keep only the first hit when a key maps to several values.
gene_symbols <- rownames(y$genes)
for (anno_col in c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME")) {
  y$genes[[anno_col]] <- mapIds(
    org.Mm.eg.db,
    keys = gene_symbols,
    keytype = "SYMBOL",
    column = anno_col,
    multiVals = "first"
  )
}

# final imported data
# head(y$counts)[, 1:5]
head(y$samples)
head(y$genes)


'select()' returned 1:many mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns



Unnamed: 0_level_0,group,lib.size,norm.factors
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>
D1,CON,22385619,1
D2,CON,28092434,1
D3,CON,19372856,1
D4,CON,27283589,1
D5,CON,23735343,1
D6,CON,17215483,1


Unnamed: 0_level_0,gene_id,ENSEMBL,ENTREZID,SYMBOL,GENENAME
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
Adora2b,Adora2b,ENSMUSG00000018500,11541,Adora2b,adenosine A2b receptor
Adora2a,Adora2a,ENSMUSG00000020178,11540,Adora2a,adenosine A2a receptor
Gm20741,Gm20741,ENSMUSG00000116636,433047,Gm20741,keratin associated protein pseudogene
Gm4340,Gm4340,ENSMUSG00000090854,100043292,Gm4340,predicted gene 4340
Gm20744,Gm20744,ENSMUSG00000109145,434205,Gm20744,"predicted gene, 20744"
Gm20743,Gm20743,ENSMUSG00000104117,433374,Gm20743,"predicted gene, 20743"


<b> Filter genes that contain zeros in all samples </b>

In [17]:
## [2] REMOVE GENES WITH ZERO COUNTS

# Total count per gene, summed over all samples
gene_totals <- rowSums(y$counts)

# Genes whose total is zero, i.e. no reads in any sample
zeros <- gene_totals == 0
table(zeros)

# Keep genes with at least one count in at least one sample
keep <- gene_totals > 0
table(keep)

# keep.lib.sizes = FALSE recomputes the library sizes after subsetting
y <- y[keep, , keep.lib.sizes = FALSE]
dim(y)

[1] "[2022-08-08 12:15:05] removing genes with 0 counts in all samples... "



FALSE 
16593 

zeros
FALSE 
16593 

keep
 TRUE 
16593 

<b>Dealing with Sequencing Bias: </b>Check raw values such as number of groups, samples, genes to make sure the object size is correct. Check the library size for each sample, which indicates the depth of sequencing. Calculate counts per million.  

<p>Differences in the number of reads between samples are due to incidental variation in how much of each library is loaded onto the flowcell and sequenced. </p>

<p>When loading a multiplexed RNAseq experiment into the flowcell, one quantifies the DNA (the initial RNA is reverse transcribed to DNA) amount for each library, "normalizes" the libraries (i.e., dilutes all libraries to the same DNA concentration), and loads the same amount of each library into the flowcell. In an ideal world, all libraries would have the same number of reads, and then no library size normalization would be necessary for the analysis - this is rarely the case, though, and there is substantial reads number variation between libraries. https://www.biostars.org/p/349881/#350181</p>

<p> Several biases are at play in an RNA-seq experiment: <b>library size, gene length, RNA population composition for each condition, and gene GC composition.</b>
Two of these biases can be disregarded if you only compare the same gene across conditions, because they are inherent to the gene: <b>gene length and gene GC composition</b></p>
    
<p><b>Gene length:</b> The raw counts of two genes cannot be compared directly if gene A is twice as long as gene B. Because of its length, the longer gene has a higher chance of being sequenced than the shorter one, so at the same expression level the longer gene will collect more reads than the shorter one.</p>

<p><b>Gene GC composition:</b> For two genes with different GC content, the one whose GC content is closest to 40% is more likely to be sequenced. </p>
<p>The other biases are "technical biases", due to your sample and sequencing method.
<b>Library size: </b> The best-known bias. Suppose you create two libraries for two conditions with the same RNA composition. The second library performs much better than the first one: you get 12 000 000 reads for condition A and 36 000 000 reads for condition B. You will then observe three times (36 000 000/12 000 000 = 3) more reads for each RNA in condition B than in condition A. </p>

<img src="https://i.ibb.co/Ld4YH9m/condition-A.png" alt="condition A">

<img src="https://i.ibb.co/sgF0PSj/condition-B.png" alt="condition B">
  
    
<p>Apart from the differences in library depth, an additional problem is that RNA-seq libraries frequently contain different amounts of different RNA types. A simple example: you may have more rRNA in one sample than in another (let's say 1% vs 20%). If you do not take this into account, it would look as if the majority of protein-coding genes were downregulated, simply because they would receive a smaller fraction of the reads. Such effects are handled by inter-library normalization, an analysis step built into all the major DE tool workflows. You can read more about this problem here. </p>

To reduce these biases, there are many methods for normalizing RNA-seq data.

<p>Those which I call naive ones:</p>

<ul>
    <li>Total count</li>
    <li>Upper Quartile </li>
    <li>RPKM (Reads Per Kilobase per Million, which is not robust enough for cross-condition experiments, pub4 &amp; pub5)</li>
</ul>

Those with a statistical basis:

For composition bias between libraries
<ul>
  <li>RLE method (Relative log Expression) like DESeq2 </li>
  <li>TMM method (Trimmed Mean of M values) like edgeR </li>
</ul>
    In addition, the most commonly used statistical model for gene counts:

<ul> 
    <li>negative binomial distribution (edgeR, DESeq2)</li>
<li>Add to that a multiple testing correction, to report strongly differentially expressed genes (DESeq2)</li>
</ul>
    Library size is the major bias and can be handled in DESeq2 using the <b> sizeFactor </b>

In [20]:
## RAW VARIABLES ----
## Snapshot of the data before any expression filtering or normalization.
y.raw <- y

# Basic dimensions of the experiment
ngroups <- length(unique(y$samples$group))
ngroups
nsamples <- ncol(y)
nsamples
ngenes <- nrow(y)
ngenes

# Per-sample table (group, lib.size, norm.factors)
raw_lib <- y$samples
head(raw_lib)

# Counts and their CPM / log2-CPM transforms
raw_counts <- y$counts
# dim(raw_counts)
raw_cpm <- cpm(y, prior.count = 2)
# dim(raw_cpm)
raw_lcpm <- cpm(y, log = TRUE, prior.count = 2)
# dim(raw_lcpm)

# Gene annotation table
raw_anno <- y$genes
head(raw_anno)

# log2 of (2 / mean library size in millions)
mean_lib_millions <- mean(y$samples$lib.size) * 1e-06
raw_L <- log2(2 / mean_lib_millions)
# raw_L

Unnamed: 0_level_0,group,lib.size,norm.factors
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>
D1,CON,22385619,1
D2,CON,28092434,1
D3,CON,19372856,1
D4,CON,27283589,1
D5,CON,23735343,1
D6,CON,17215483,1


Unnamed: 0_level_0,gene_id,ENSEMBL,ENTREZID,SYMBOL,GENENAME
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
Adora2b,Adora2b,ENSMUSG00000018500,11541,Adora2b,adenosine A2b receptor
Adora2a,Adora2a,ENSMUSG00000020178,11540,Adora2a,adenosine A2a receptor
Gm20745,Gm20745,ENSMUSG00000111841,434412,Gm20745,"predicted gene, 20745"
Rrp1,Rrp1,ENSMUSG00000061032,18114,Rrp1,ribosomal RNA processing 1
Fam50a,Fam50a,ENSMUSG00000001962,108160,Fam50a,"family with sequence similarity 50, member A"
Gm4349,Gm4349,,100043305,Gm4349,"SET domain, bifurcated 1 pseudogene"


<b>Within-Sample Normalization = Counts per Million </b> Read count refers to the number of reads that align to a particular region. Counts per million (CPM) are read counts divided by the total number of mapped reads in the sample and multiplied by one million. Transcripts per million (TPM) is a measure of the proportion of transcripts in a pool of RNA.

In [21]:
# Sample-by-sample correlation matrices of the raw log-CPM values.
# use = "all.obs" makes cor() fail if any value is missing.
pearson.raw_lcpm <- cor(raw_lcpm, method = "pearson", use = "all.obs")
spearman.raw_lcpm <- cor(raw_lcpm, method = "spearman", use = "all.obs")

# Variance of each gene (row) across samples
raw_var <- matrixStats::rowVars(raw_lcpm)

# scale() works column-wise, so transpose, standardize, and transpose back to
# get each gene centred and scaled across samples
raw_scale <- t(scale(t(raw_lcpm)))

# The scaled matrix must still line up row-for-row with annotation and counts
stopifnot(
  rownames(raw_scale) == rownames(y$genes),
  rownames(raw_scale) == rownames(y$counts)
)

In [None]:
## Library sizes in millions of reads: mean, median, min. and max.
lib_sizes <- y$samples$lib.size
L <- mean(lib_sizes) * 1e-06
M <- median(lib_sizes) * 1e-06
Mn <- min(lib_sizes) * 1e-06
Mx <- max(lib_sizes) * 1e-06
round(c(L, M, Mn, Mx), 1)

# Number of samples in the smallest group
min_group <- min(table(targets$group)) # 6
min_group

# CPM value that corresponds to a raw count of 10 in the smallest library
cpm_of_10 <- cpm(10, min(lib_sizes))
cpm_of_10
min_cpm <- round(as.numeric(cpm_of_10), 2)
min_cpm # 0.58

# Keep a gene when its CPM exceeds min_cpm in at least min_group samples
n_samples_above <- rowSums(cpm(y) > min_cpm)
keep <- n_samples_above >= min_group
table(keep)
# FALSE  TRUE
#  5136 11457

# keep.lib.sizes = FALSE causes the library sizes to be recomputed after the
# filtering. This is generally recommended, although the effect on the
# downstream analysis is usually small.
y <- y[keep, , keep.lib.sizes = FALSE]
dim(y)

In [None]:
## FILTER VARIABLES (y) ----
## Snapshot of the data after expression filtering, before normalization.
y.filter <- y
filter_lib <- y$samples

filter_ngenes <- nrow(y)
filter_ngenes

# Gene annotation that survived the filter
filter_anno <- y$genes
head(filter_anno)
head(filter_anno)[1:5, ]
dim(filter_anno)

# Counts and their CPM / log2-CPM transforms
filter_counts <- y$counts
filter_cpm <- cpm(y, prior.count = 2)
filter_lcpm <- cpm(y, log = TRUE, prior.count = 2)

# log2 of (2 / mean library size in millions)
filter_L <- log2(2 / (mean(y$samples$lib.size) * 1e-6))
filter_L

# Sample-by-sample correlations; use = "all.obs" fails on missing values
pearson.filter_lcpm <- cor(filter_lcpm, method = "pearson", use = "all.obs")
spearman.filter_lcpm <- cor(filter_lcpm, method = "spearman", use = "all.obs")

# Per-gene variance, and per-gene standardized log-CPM (scale() is column-wise,
# hence the double transpose)
filter_var <- matrixStats::rowVars(filter_lcpm)
filter_scale <- t(scale(t(filter_lcpm)))

print("Saved filtered variables")

# The scaled matrix must still line up row-for-row with annotation and counts
stopifnot(
  rownames(filter_scale) == rownames(y$genes),
  rownames(filter_scale) == rownames(y$counts)
)

<b>Between sample normalization - Trimmed mean of M values (TMM) </b>
<p><b> CPM "normalization" </b> accounts for library size differences between samples, and produces normalized values that can be compared on an absolute scale (e.g., for filtering). <b>TMM normalization </b> accounts for composition bias, and computes normalization factors for comparing between libraries on a relative scale. CPM normalization doesn't account for composition bias, and TMM normalization doesn't produce normalized values. Thus, you need both steps in the analysis pipeline. https://support.bioconductor.org/p/69433/ </p>

In [None]:
##-------------------------------
##     TMM NORMALIZATION (y)
##-------------------------------

# Compute TMM scaling factors; they are stored in y$samples$norm.factors and
# the counts themselves are left untouched.
y <- calcNormFactors(y, method = "TMM")
head(y$samples)

##---------------------------
##   NORM VARIABLES (y)
##---------------------------
y.norm <- y

# Effective library size = raw library size * TMM normalization factor
effective_lib_size <- y$samples$lib.size * y$samples$norm.factors

# Sample table with the raw size kept as old.lib.size and the effective size
# reported as lib.size
norm_lib <- cbind(y$samples, effective_lib_size)
colnames(norm_lib) <- c("group", "old.lib.size", "norm.factors", "lib.size")
head(norm_lib)

norm_factor <- y$samples$norm.factors
norm_anno <- y$genes
norm_counts <- y$counts
norm_cpm <- cpm(y, prior.count = 2, normalized.lib.sizes = TRUE)
norm_lcpm <- cpm(y, prior.count = 2, normalized.lib.sizes = TRUE, log = TRUE)

# log2 of (2 / mean effective library size in millions).
# mean() has no normalized.lib.sizes argument (it was silently ignored), so
# the normalization factors have to be applied explicitly here.
norm_L <- log2(2 / (mean(effective_lib_size) * 1e-06))
norm_L

# Sample-by-sample correlations; use = "all.obs" fails on missing values
pearson.norm_lcpm <- cor(norm_lcpm, use = "all.obs", method = "pearson")
spearman.norm_lcpm <- cor(norm_lcpm, use = "all.obs", method = "spearman")

norm_var <- matrixStats::rowVars(norm_lcpm) # row variance per gene
norm_scale <- t(scale(t(norm_lcpm))) # scaled lcpm values mean=0, std=1

# The scaled matrix must still line up row-for-row with annotation and counts
stopifnot(rownames(norm_scale) == rownames(y$genes))
stopifnot(rownames(norm_scale) == rownames(y$counts))

<b> Limma voom analysis</b>

In [None]:
# Design matrix without an intercept: one column per group level, plus one
# column for every batch level after the first.
design <- model.matrix(~ 0 + group + batch, data = my_targets)

# Name the columns from the factor levels instead of hard-coding them, so the
# labels cannot get out of step with the level order taken from targets.csv
# (which would silently flip the direction of the contrast).
colnames(design) <- c(
  levels(my_targets$group),
  levels(my_targets$batch)[-1]
)
design

In [None]:
# Single contrast of interest: EPZ minus CON, using the design column names
contrasts <- makeContrasts(
  EPZ_vs_CON = EPZ - CON,
  levels = colnames(design)
)
contrasts

In [None]:
# voom-transform the counts (no plot here)
v <- voom(y, design = design, plot = FALSE)

# Fit the linear model, then re-express it in terms of the contrast
vfit <- contrasts.fit(lmFit(v, design), contrasts = contrasts)

# Empirical Bayes moderation of the fitted model
results <- eBayes(vfit)

In [None]:
# Write a two-panel figure to 01_voom.png
png(
  file.path(".", "01_voom.png"),
  units = "in", width = 10, height = 5, res = 1000, pointsize = 12
)
par(mfrow = c(1, 2))
# Panel 1: plot drawn by voom itself (voom is re-run here only for the plot)
voom(y, design = design, normalize.method = "none", plot = TRUE, save.plot = TRUE)
# Panel 2: plotSA() of the final eBayes fit
plotSA(results, main = "Final Model")
dev.off()

In [None]:
# Full result table for the contrast: every gene, in the original row order.
# `number` is spelled out rather than relying on partial matching of `n`.
stats <- topTable(
  results,
  coef = 1, number = Inf, adjust.method = "BH", sort.by = "none",
  p.value = 1, lfc = 0
)
dim(stats)
# Keep the full table on the fit object so later plotting code can reach it
results$stats <- stats

# Genes passing adjusted p-value <= 0.05 and |log2 fold-change| >= 1
sig <- topTable(
  results,
  coef = 1, number = Inf, adjust.method = "BH", sort.by = "none",
  p.value = 0.05, lfc = 1
)
dim(sig)

# Up / down / not-significant call for every gene at the same thresholds
dt <- decideTests(
  results,
  method = "separate", adjust.method = "BH", p.value = 0.05, lfc = 1
)
# summary(dt)
#        EPZ_vs_CON
# Down            7
# NotSig      10738
# Up            712

In [None]:
# Save a histogram of the unadjusted p-values for the contrast to PNG
png(
  file.path(".", paste0("05_", "EPZ_vs_CON", "_p-value.histogram.png")),
  units = "in", width = 6, height = 5, res = 1000, pointsize = 12
)

# Fill colour of the bars (the inline histogram cell reuses this variable)
grayblue <- "#b0ddf5"

hist(
  results$stats$P.Value,
  breaks = 100,
  col = grayblue,
  main = paste0("P-value Histogram: ", "EPZ vs. CON"),
  xlab = "P.Value",
  ylab = "frequency",
  font = 1, cex.main = 1, las = 1
)

dev.off()

In [None]:
     # Inline copy of the p-value histogram; `grayblue` must already exist
     # (it is assigned in the cell that writes the PNG version)
     hist(
       results$stats$P.Value,
       breaks = 100,
       col = grayblue,
       main = paste0("P-value Histogram: ", "EPZ vs. CON"),
       xlab = "P.Value",
       ylab = "frequency",
       font = 1, cex.main = 1, las = 1
     )


In [None]:
# Static MD plot for contrast column i, written to PNG.
i <- 1

png(
  file.path(".", paste0("07_", "EPZ_vs_CON", "_MD.plot.png")),
  units = "in", width = 5, height = 5, res = 1000, pointsize = 10
)

##----------------------------------------------------------------------------
op <- par(no.readonly = TRUE)  # save current par settings; restored below
# par(mfrow = c(4, 5))         # multiple plots
par(mar = c(5, 5, 4, 1))       # inner margin (bottom, left, top, right)
par(mgp = c(3, 1, 0))          # axis margin (labels, ticks, line)
par(oma = c(2, 1, 0, 1))       # outer margin (bottom, left, top, right)
##----------------------------------------------------------------------------

plotMD(
  results, column = i, cex = 1, las = 1,
  status = dt[, i], values = c(1, -1), col = c("red", "blue"),
  legend = FALSE, # the labelled legend() below replaces plotMD's own legend
  main = "",
  xlab = expression(paste("avg. ", log[2]," (counts-per-million)",sep="")),
  ylab = expression(paste(log[2]," (fold-change)",sep="")),
  cex.axis = 1.1, cex.lab = 1.2
  # , ylim = c(-11, 13)
  # , xlim = c(2, 14)
)
title(main = paste0("MD plot: ", "EPZ vs. CON"))
legend(
  "topright", c("up", "not sig.", "down"),
  inset = 0, pch = 19,
  box.col = "black", box.lwd = 1, bg = "white",
  col = c("red", "black", "blue"), ncol = 1, cex = 1
)

# text(x=12, y=-3, label=paste("up =", summary(dt)[3], sep=" "), adj=0, col="red", cex=0.9, font=1)
# text(x=12, y=-4, label=paste("not sig. =", summary(dt)[2], sep=" "), adj=0, col="black", cex=0.9, font=1)
# text(x=12, y=-5, label=paste("down =", summary(dt)[1], sep=" "), adj=0, col="blue", cex=0.9, font=1)

# Dashed guides at log2 fold-change of -1 and +1
abline(h = c(-1, 1), col = "black", lty = 2, lwd = 1)

# Put back the graphical parameters saved above, then close the PNG device
par(op)
dev.off()


In [None]:
# Inline MD plot for contrast column i.
i <- 1

##----------------------------------------------------------------------------
op <- par(no.readonly = TRUE)  # save current par settings; restored below
# par(mfrow = c(4, 5))         # multiple plots
par(mar = c(5, 5, 4, 1))       # inner margin (bottom, left, top, right)
par(mgp = c(3, 1, 0))          # axis margin (labels, ticks, line)
par(oma = c(2, 1, 0, 1))       # outer margin (bottom, left, top, right)
##----------------------------------------------------------------------------

plotMD(
  results, column = i, cex = 1, las = 1,
  status = dt[, i], values = c(1, -1), col = c("red", "blue"),
  legend = FALSE, # the labelled legend() below replaces plotMD's own legend
  main = "",
  xlab = expression(paste("avg. ", log[2]," (counts-per-million)",sep="")),
  ylab = expression(paste(log[2]," (fold-change)",sep="")),
  cex.axis = 1.1, cex.lab = 1.2
  # , ylim = c(-11, 13)
  # , xlim = c(2, 14)
)
title(main = paste0("MD plot: ", "EPZ vs. CON"))
legend(
  "topright", c("up", "not sig.", "down"),
  inset = 0, pch = 19,
  box.col = "black", box.lwd = 1, bg = "white",
  col = c("red", "black", "blue"), ncol = 1, cex = 1
)

# text(x=12, y=-3, label=paste("up =", summary(dt)[3], sep=" "), adj=0, col="red", cex=0.9, font=1)
# text(x=12, y=-4, label=paste("not sig. =", summary(dt)[2], sep=" "), adj=0, col="black", cex=0.9, font=1)
# text(x=12, y=-5, label=paste("down =", summary(dt)[1], sep=" "), adj=0, col="blue", cex=0.9, font=1)

# Dashed guides at log2 fold-change of -1 and +1
abline(h = c(-1, 1), col = "black", lty = 2, lwd = 1)

# Put back the saved graphical parameters so the margin changes above do not
# carry over to later plots
par(op)



In [None]:
library(Glimma, quietly = TRUE)

# Interactive MD plot, written as HTML under ./MD-plots.
# `i` (contrast column) and `dt` (significance calls) come from earlier cells.
glMDPlot(
  results,
  coef = i,
  counts = norm_lcpm,
  anno = results$genes,
  groups = my_targets$group,
  samples = colnames(norm_lcpm),
  status = dt[, i],
  transform = FALSE, # counts are passed as norm_lcpm, already on the log scale
  # Plain title; the old paste(..., "", sep = " ") produced a double space
  # after the colon and a trailing space.
  main = "MD Plot: EPZ vs. CON",
  xlab = "avg. log2 (counts-per-million)",
  ylab = "log2 (fold-change)",
  side.xlab = "group",
  side.ylab = "expression (norm_lcpm)",
  side.log = FALSE,
  # side.gridstep = ifelse(!transform || side.log, FALSE, 0.5),
  p.adj.method = "BH",
  jitter = 30,
  side.main = "SYMBOL",
  display.columns = colnames(results$genes),
  cols = c("#00bfff", "#858585", "#ff3030"),
  # sample.cols = colors[my_targets$group],
  path = file.path("."),
  folder = "MD-plots",
  html = paste0("EPZ_vs_CON", "_MD.plot"),
  launch = FALSE
)

In [None]:
# Static volcano plot (log2 fold-change vs. -log10 adjusted p-value) to PNG.
png(
  file.path(".", paste0("08_", "EPZ_vs_CON", "_volcano.plot.png")),
  units = "in", width = 6, height = 6, res = 1000, pointsize = 12
)

# All genes in black first
with(
  results$stats,
  plot(
    logFC, -log10(adj.P.Val), # pch = 20, las = 1,
    pch = 21, bg = "black", col = "black", lwd = 0.9, cex = 1,
    # Plain title; the old paste(..., "", sep = " ") produced a double space
    # after the colon and a trailing space.
    main = "Volcano Plot: EPZ vs. CON",
    # The stray empty first argument of paste() has been removed
    xlab = expression(paste(log[2], " (fold-change)", sep = "")),
    # xlab = "logFC",
    ylab = expression(paste("-", log[10], " (adj. p-value)", sep = "")),
    # ylab = "-log10 (P.Value)",
    # ylim = c(0, 8),
    xlim = c(-2, 7),
    cex.axis = 1.1, cex.lab = 1.2
  )
)
# grid()

# Overplot genes with adj. p <= 0.05: red for logFC >= 1, blue for logFC <= -1
with(
  subset(results$stats, adj.P.Val <= 0.05 & logFC >= 1),
  points(logFC, -log10(adj.P.Val),
         pch = 21, bg = "firebrick2", col = "firebrick2", lwd = 1, cex = 1.1)
)
with(
  subset(results$stats, adj.P.Val <= 0.05 & logFC <= -1),
  points(logFC, -log10(adj.P.Val),
         pch = 21, bg = "blue", col = "blue", lwd = 1, cex = 1.1)
)

# Dashed guides at the fold-change and adjusted p-value cut-offs
abline(v = c(-1, 1), col = "black", lty = 2, lwd = 1)
abline(h = -log10(0.05), col = "black", lty = 2, lwd = 1)

# text(x=-14, y=8, label=paste("up = ", summary(dt)[3], sep=" "), adj=0, col="red", cex=0.8, font=1)
# text(x=-14, y=7.5, label=paste("not sig = ", summary(dt)[2], sep=" "), adj=0, col="black", cex=0.8, font=1)
# text(x=-14, y=7, label=paste("down = ", summary(dt)[1], sep=" "), adj=0, col="blue", cex=0.8, font=1)

dev.off()

In [None]:
# Interactive volcano plot, written as HTML under ./Volcano-plots.
# `i` (contrast column) and `dt` (significance calls) come from earlier cells.
glXYPlot(
  x = results$stats$logFC,
  # -log10() to match the static volcano plot (was -log(x, 10))
  y = -log10(results$stats$adj.P.Val),
  counts = norm_lcpm,
  groups = my_targets$group,
  samples = rownames(my_targets),
  status = dt[, i],
  transform = FALSE, # counts are passed as norm_lcpm, already on the log scale
  anno = results$genes,
  display.columns = colnames(results$genes),
  xlab = "log2 (fold-change)",
  ylab = "-log10 (adj. p-value)",
  side.main = "SYMBOL",
  side.xlab = "group",
  side.ylab = "expression (norm_lcpm)",
  side.log = FALSE,
  # sample.cols = colors[my_targets$group],
  cols = c("#00bfff", "#858585", "#ff3030"),
  p.adj.method = "BH",
  jitter = 30,
  path = file.path("."),
  folder = "Volcano-plots",
  html = paste0("EPZ_vs_CON", "_volcano.plot"),
  # Plain title; the old paste(..., "", sep = " ") produced a double space
  # after the colon and a trailing space.
  main = "Volcano Plot: EPZ vs. CON",
  launch = FALSE
)