This is a collection of wrappers to the Wilcoxon test to run competitive gene set tests and comparison between imbalanced datasets.
library(devtools)
install_github("miccec/yaGST")
library(yaGST)
data("rankedList")
# generate a random data set of dimension 100
geneSet <- sample(head(names(rankedList), 5000), 100)
ans <- mwwGST(rankedList, geneSet, moreDetails = TRUE)
ans
plot(ans)
# generate a second gene set
geneSet <- sample(tail(names(rankedList), 5000), 100)
ans <- mwwGST(rankedList, geneSet, moreDetails = TRUE)
plot(ans)
You may need to download the collection of gene-sets, for example from the Molecular Signatures Database at the Broad Institute
In case the output of the analysis has to be saved in an .xls file, you need to install the WriteXLS package.
load the GO collections
library(yaGST)
GO <- gmt2GO("~/pathToYourFiles/h.all.v6.0.symbols.gmt")
length(GO)
load the ranked list
data("rankedList")
head(rankedList)
tail(rankedList)
run the mwwGST
ans <- lapply(GO, function(x) mwwGST(rankedList, x, minLenGeneSet = 15, verbose = FALSE))
Now, arrange the results in a table
actualGeneSetSize <- unlist(sapply(ans, function(x) x$actualGeneSetCount))
originalGeneSetSize <- unlist(sapply(ans, function(x) x$originalGeneSetCount))
pValue <- unlist(sapply(ans, function(x) x$p.value))
qValue <- p.adjust(pValue)
nes <- unlist(sapply(ans, function(x) x$nes))
pu <- unlist(sapply(ans, function(x) x$pu))
log2.pu <- unlist(sapply(ans, function(x) x$log.pu))
result_table <- cbind(originalGeneSetSize, actualGeneSetSize,
round(nes, 4), round(pu, 4), round(log2.pu, 4), round(pValue, 6), round(qValue, 6))
colnames(result_table) <- c("originalGeneSetSize", "actualGeneSetSize", "NES", "pu", "log2_pu", "pValue", "qValue")
result_table <- result_table[order(result_table[, "NES"], decreasing = TRUE),]
If needed, the table can be stored in an .xls file as follows
library(WriteXLS)
gst_results <- as.data.frame(result_table)
WriteXLS("gst_results", ExcelFileName = "~/pathToYourFiles/aNameForTheAnalysis.xls", row.names = TRUE)
filter and plot the results
qValue_treshold <- 0.05
NES_treshold <- 0.6
selected <- which((result_table[, "NES"] > NES_treshold) & (result_table[, "qValue"] < qValue_treshold))
result_table[selected,]
first_geneSet <- names(selected)[1]
plot(ans[[first_geneSet]], rankedList = rankedList, main = first_geneSet)
data("rankedList")
positive_gs <- sample(head(names(rankedList), 10000), 200)
negative_gs <- sample(tail(names(rankedList), 10000), 200)
ans <- mwwExtGST(rankedList, positive_gs, negative_gs, moreDetails = TRUE)
ans
plot(ans)
nr <- 100; nc <- 1000
# generate a data-matrix with nr samples, and nc features
exprData <- matrix(rpois(nc * nr, 100), nrow = nr, ncol = nc)
colnames(exprData) <- paste0("feat", 1:nc)
rownames(exprData) <- paste0("sam", 1:nr)
# increase the first 3 samples (minority set) of 10\% of the original intensity
# of the first 30 features (later the gene-set)
exprData[1, 1:30] <- exprData[1, 1:30]* runif(30, min = 1, max = 1.10)
exprData[2, 1:30] <- exprData[1, 1:30]* runif(30, min = 1, max = 1.10)
exprData[3, 1:30] <- exprData[1, 1:30]* runif(30, min = 1, max = 1.10)
samples_of_interest <- rownames(exprData)[1:3] # minority set
# running in parallel
library(doParallel)
# adjust the number of CPUs as needed
cl <- makePSOCKcluster(3)
clusterApply(cl, floor(runif(length(cl), max = 10000000)), set.seed)
registerDoParallel(cl)
ans_eeMWW <- eeMWW(exprData, samples_of_interest)
stopCluster(cl)
# set the gene-set and run the enrichment analysis
geneSet <- colnames(exprData)[1:30]
(tmp <- mwwGST(ans_eeMWW, geneSet))
plot(tmp, rankedList = ans_eeMWW)