In [1]:
library(GO.db)
library(topGO)
library(GOSim)
library(org.Sc.sgd.db)
library(igraph)

Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, cbind, colnames, do.call,
    duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
    is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
    paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
    Reduce, rownames, sapply, setdiff, sort, table, tapply, union,
    unique, unsplit, which, which.max, which.min

Loading required package: Biobase
Welcome to Bioconductor

 

In [2]:
file <- "yeast_uetz"

ont <- "BP"
p <- 0.1
init <- 1

db <- org.Sc.sgd.db
mapping <- "org.Sc.sgd.db"
ID <- "ENSEMBL"

##load all community gene lists
setwd(sprintf("/home/david/Documents/ghsom/%s_hierarchy_communities_%s_%s", file, p, init))

setOntology(ont, loadIC=TRUE)
setEvidenceLevel(evidences="all", organism=org.Sc.sgdORGANISM, gomap=org.Sc.sgdGO)

initializing GOSim package ...
-> retrieving GO information for all available genes for organism 'human' in GO database
-> filtering GO terms according to evidence levels 'all'
-> loading files with information content for corresponding GO category (human)
finished.
-> loading files with information content for corresponding GO category (human)
-> retrieving GO information for all available genes for organism 'Saccharomyces cerevisiae' in GO database
-> filtering GO terms according to evidence levels 'all'


In [3]:
generateMap <- function(filename){
    map <- as.matrix(read.csv(filename, sep=",", header = F))
    communities <- map[,1]
    map <- map[,2:ncol(map)]
    rownames(map) <- communities
    colnames(map) <- communities
    return (map)
}

In [4]:
#background gene list
backgroundFilename <- "all_genes.txt"
allGenes <- scan(backgroundFilename, character())

#shortest path files
shortestPathFiles  <- list.files(pattern="*shortest_path*")

#shortest paths list
shortestPaths <- sapply(shortestPathFiles, generateMap)
names(shortestPaths) <- sapply(names(shortestPaths), function(name) strsplit(name, "_")[[1]][[1]])

#communitiy assignemtns
assignments <- as.matrix(read.csv("assignment_matrix.csv", sep=",", header=F))
rownames(assignments) <- allGenes
colnames <- sapply(1:ncol(assignments), function(i) as.character(i-1))
colnames(assignments) <- colnames

In [5]:
getDepth <- function(com) {
    return(which(apply(assignments, 2, function(i) any(i == com))))
}

getGenes <- function(com){
    return(names(which(assignments[,getDepth(com)] == com)))
}

getSubCommunities <- function(com){
    return(try(as.character(unique(assignments[getGenes(com), getDepth(com) + 1]))))
}

getSuperCommunity <- function(com){
    return(try(as.character(unique(assignments[getGenes(com), getDepth(com) - 1]))))
}

getShortestPath <- function(com){
    return (try(shortestPaths[[com]]))
}

In [6]:
allGenesInDB <- keys(db)
allGenes <- allGenes[allGenes %in% allGenesInDB]
enrichmentResults <- sapply(1:max(assignments), function(i) {

    genesOfInterest <- getGenes(i)
    genesOfInterest <- genesOfInterest[genesOfInterest %in% allGenesInDB]
    GOenrichment(genesOfInterest, allGenesInDB, cutoff=0.05, method="weight01")
}
)


Building most specific GOs .....
	( 2909 GO terms found. )

Build GO DAG topology ..........
	( 5064 GO terms and 11404 relations. )

Annotating nodes ...............
	( 6419 genes annotated to the GO terms. )

			 -- Weight01 Algorithm -- 

		 the algorithm is scoring 1848 nontrivial nodes
		 parameters: 
			 test statistic: fisher

	 Level 16:	3 nodes to be scored	(0 eliminated genes)

	 Level 15:	12 nodes to be scored	(0 eliminated genes)

	 Level 14:	33 nodes to be scored	(13 eliminated genes)

	 Level 13:	63 nodes to be scored	(116 eliminated genes)

	 Level 12:	93 nodes to be scored	(406 eliminated genes)

	 Level 11:	131 nodes to be scored	(804 eliminated genes)

	 Level 10:	190 nodes to be scored	(1400 eliminated genes)

	 Level 9:	232 nodes to be scored	(2023 eliminated genes)

	 Level 8:	234 nodes to be scored	(2704 eliminated genes)

	 Level 7:	245 nodes to be scored	(3775 eliminated genes)

	 Level 6:	254 nodes to be scored	(4476 eliminated genes)

	 Level 5:	195 nodes to 

In [16]:
rownames(enrichmentResults) <- c("terms","p-values","genes")
colnames(enrichmentResults) <- 2:max(assignments)

In [17]:
communitySimilarity <- function(community) {
    termSims <- getTermSim(termlist = names(community), method = "Lin", verbose = F)
    if (length(termSims) > 1) {
        return(mean(termSims[upper.tri(termSims)]))
    } else {
        return (NaN)
    }
}

In [19]:
communitySimilarity(enrichmentResults[["p-values", 27]])

In [20]:
getGenes(27)

In [21]:
layerSimilarity <- function(layer) {
    pvalueList <- enrichmentResults["p-values", unique(assignments[,layer][assignments[,layer] != -1]) - 1]
    communitiesSimilarity <- sapply(pvalueList, communitySimilarity)
    communitiesSimilarity <- communitiesSimilarity[!is.na(communitiesSimilarity)]
    return(mean(communitiesSimilarity))
}

In [22]:
layerMeanSimilarities <- sapply(colnames, layerSimilarity)

“argument is not numeric or logical: returning NA”

ERROR: Error in if (term1 == term2) {: missing value where TRUE/FALSE needed


In [None]:
layerMeanSimilarities

In [12]:
geneCommunities <- sapply(1:max(assignments), function (i) getGenes(i)[getGenes(i) %in% allGenesInDB])

In [15]:
getSubCommunities(6)

In [14]:
as.list(org.Sc.sgdPATH[geneCommunities[[6]]])

In [23]:
as.list(org.Sc.sgdPATH2ORF[["00330"]]) %in% allGenes

In [21]:
length(allGenes)

In [104]:
geneCommunities[[1]]

In [27]:
allGenes <- allGenes[allGenes%in% allGenesInDB]

In [28]:
GOenrichment(allGenes, allGenesInDB, cutoff = 0.01, method = "weight01")


Building most specific GOs .....
	( 2909 GO terms found. )

Build GO DAG topology ..........
	( 5064 GO terms and 11404 relations. )

Annotating nodes ...............
	( 6419 genes annotated to the GO terms. )

			 -- Weight01 Algorithm -- 

		 the algorithm is scoring 1848 nontrivial nodes
		 parameters: 
			 test statistic: fisher

	 Level 16:	3 nodes to be scored	(0 eliminated genes)

	 Level 15:	12 nodes to be scored	(0 eliminated genes)

	 Level 14:	33 nodes to be scored	(13 eliminated genes)

	 Level 13:	63 nodes to be scored	(116 eliminated genes)

	 Level 12:	93 nodes to be scored	(406 eliminated genes)

	 Level 11:	131 nodes to be scored	(804 eliminated genes)

	 Level 10:	190 nodes to be scored	(1400 eliminated genes)

	 Level 9:	232 nodes to be scored	(2023 eliminated genes)

	 Level 8:	234 nodes to be scored	(2704 eliminated genes)

	 Level 7:	245 nodes to be scored	(3775 eliminated genes)

	 Level 6:	254 nodes to be scored	(4476 eliminated genes)

	 Level 5:	195 nodes to 

Unnamed: 0,go_id,Term,Definition
131,GO:0000050,urea cycle,"The sequence of reactions by which arginine is synthesized from ornithine, then cleaved to yield urea and regenerate ornithine. The overall reaction equation is NH3 + CO2 + aspartate + 3 ATP + 2 H2O = urea + fumarate + 2 ADP + 2 phosphate + AMP + diphosphate."
459,GO:0000184,"nuclear-transcribed mRNA catabolic process, nonsense-mediated decay","The nonsense-mediated decay pathway for nuclear-transcribed mRNAs degrades mRNAs in which an amino-acid codon has changed to a nonsense codon; this prevents the translation of such mRNAs into truncated, and potentially harmful, proteins."
845,GO:0000290,deadenylation-dependent decapping of nuclear-transcribed mRNA,Cleavage of the 5'-cap of a nuclear mRNA triggered by shortening of the poly(A) tail to below a minimum functional length.
1283,GO:0000723,telomere maintenance,"Any process that contributes to the maintenance of proper telomeric length and structure by affecting and monitoring the activity of telomeric proteins, the length of telomeric DNA and the replication and repair of the DNA. These processes includes those that shorten, lengthen, replicate and repair the telomeric DNA sequences."
1296,GO:0000730,DNA recombinase assembly,"The aggregation, arrangement and bonding together of strand exchange proteins (recombinases) into higher order oligomers on single-stranded DNA."
2134,GO:0001558,regulation of cell growth,"Any process that modulates the frequency, rate, extent or direction of cell growth."
6237,GO:0051017,actin filament bundle assembly,The assembly of actin filament bundles; actin filaments are on the same axis but may be oriented with the same or opposite polarities and may be packed with different levels of tightness.
15538,GO:0006468,protein phosphorylation,The process of introducing a phosphate group on to a protein.
15892,GO:0006607,NLS-bearing protein import into nucleus,"The directed movement of a protein bearing a nuclear localization signal (NLS) from the cytoplasm into the nucleus, across the nuclear membrane."
16867,GO:0006999,nuclear pore organization,"A process that is carried out at the cellular level which results in the assembly, arrangement of constituent parts, or disassembly of the nuclear pore."


In [1]:
## try http:// if https:// URLs are not supported
source("https://bioconductor.org/biocLite.R")
biocLite("ReactomePA")

Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
BioC_mirror: https://bioconductor.org
Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.2 (2016-10-31).
Installing package(s) ‘ReactomePA’
also installing the dependencies ‘rappdirs’, ‘graphite’

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
Old packages: 'AnnotationHub', 'assertthat', 'backports', 'BiocParallel',
  'broom', 'cluster', 'colorspace', 'corpcor', 'curl', 'data.table', 'DBI',
  'digest', 'forcats', 'ggplot2', 'IRanges', 'jsonlite', 'KEGGREST', 'lattice',
  'Matrix', 'matrixStats', 'mgcv', 'nlme', 'openssl', 'pbdZMQ', 'pbkrtest',
  'psych', 'Rcpp', 'RcppEigen', 'readr', 'repr', 'rmarkdown', 'rprojroot',
  'S4Vectors', 'selectr', 'shiny', 'sourcetools', 'SparseM', 'stringi',
  'stringr', 'survival', 'tibble', 'tidyr', 'tidyverse', 'XML', 'xml2',
  'XVector', 'zoo'


In [23]:
library(ReactomePA)
data(geneList)
de <- names(geneList)[abs(geneList) > 1.5]
head(de)

Loading required package: DOSE
DOSE v3.0.10  For help: https://guangchuangyu.github.io/DOSE

If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609

ReactomePA v1.18.1  For help: https://guangchuangyu.github.io/ReactomePA

If you use ReactomePA in published research, please cite:
Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for reactome pathway analysis and visualization. Molecular BioSystems 2016, 12(2):477-479


In [27]:
length(de)

In [30]:
x <- enrichPathway(gene=de, pvalueCutoff=0.05, readable=T)
nrow(as.data.frame(x))

In [31]:
entrezCommunities <- sapply(1:max(assignments), function(i){
    orfs <- getGenes(i)
    orfs <- orfs[orfs%in%allGenesInDB]
    return(as.character(org.Sc.sgdENTREZID[orfs]))
})

In [33]:
length(entrezCommunities[[1]])

In [26]:
pathwayEnrichments <- sapply(entrezCommunities[2:length(entrezCommunities)],
                             function(i) enrichPathway(gene=i, organism = "yeast", universe = entrezCommunities[[1]], 
                                                                            pvalueCutoff = 0.05))

ERROR: Error in .testForValidCols(x, cols): Invalid columns: SYMBOL. Please use the columns method to see a listing of valid arguments.


In [34]:
orfs <- getGenes(2)
orfs <- orfs[orfs%in%allGenesInDB]
entrez <- as.character(org.Sc.sgdENTREZID[orfs])

In [35]:
orfs

In [36]:
entrez

In [37]:
allGenes <- allGenes[allGenes%in%allGenesInDB]
entrezAll <- as.character(org.Sc.sgdENTREZID[allGenes])
entrezAllDB <- as.character(org.Sc.sgdENTREZID[allGenesInDB])

In [38]:
x <- enrichPathway(gene = entrezAll, organism = "yeast", universe = entrezAllDB)

In [39]:
head(as.data.frame(x))

Unnamed: 0,ID,Description,GeneRatio,BgRatio,pvalue,p.adjust,qvalue,geneID,Count
5719840,5719840,Regulation of AMPK activity via LKB1,4/49,10/1145,0.000515426,0.03685296,0.03309577,852088/856749/852763/852664,4
5719841,5719841,Energy dependent regulation of mTOR by LKB1-AMPK,4/49,10/1145,0.000515426,0.03685296,0.03309577,852088/856749/852763/852664,4


In [40]:
unionAllGenes <- scan(character(), file="../yeast_union_all_genes.txt")

In [41]:
unionAllGenes <- unionAllGenes[unionAllGenes%in%allGenesInDB]

In [42]:
x <- enrichPathway(gene = as.character(org.Sc.sgdENTREZID[unionAllGenes]), organism = "yeast", universe = entrezAllDB)

In [44]:
head(as.data.frame(x))

ID,Description,GeneRatio,BgRatio,pvalue,p.adjust,qvalue,geneID,Count


In [2]:
library(WGCNA)

Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: ‘fastcluster’

The following object is masked from ‘package:stats’:

    hclust




*
*  Package WGCNA 1.51 loaded.
*
*    Important note: It appears that your system supports multi-threading,
*    but it is not enabled within WGCNA in R. 
*    To allow multi-threading within WGCNA with all available cores, use 
*
*          allowWGCNAThreads()
*
*    within R. Use disableWGCNAThreads() to disable threading if necessary.
*    Alternatively, set the following environment variable on your system:
*
*          ALLOW_WGCNA_THREADS=<number_of_processors>
*
*    for example 
*
*          ALLOW_WGCNA_THREADS=8
*
*    To set the environment variable in linux bash shell, type 
*
*           export ALLOW_WGCNA_THREADS=8
*
*     before running R. Other operating systems or shells will
*     have a similar command to achieve the same aim.
*





Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor

