In [1]:
library(GO.db)
library(topGO)
library(GOSim)
library(org.Sc.sgd.db)
library(igraph)

Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, cbind, colnames, do.call,
    duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
    is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
    paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
    Reduce, rownames, sapply, setdiff, sort, table, tapply, union,
    unique, unsplit, which, which.max, which.min

Loading required package: Biobase
Welcome to Bioconductor

 

In [2]:
file <- "yeast_uetz"

ont <- "BP"
p <- 0.5
init <- 1

db <- org.Sc.sgd.db
mapping <- "org.Sc.sgd.db"
ID <- "ENSEMBL"

##load all community gene lists
setwd(sprintf("/home/david/Documents/ghsom/%s_hierarchy_communities_%s_%s", file, p, init))

setOntology(ont, loadIC=TRUE)
setEvidenceLevel(evidences="all",organism=org.Sc.sgdORGANISM, gomap=org.Sc.sgdGO)

initializing GOSim package ...
-> retrieving GO information for all available genes for organism 'human' in GO database
-> filtering GO terms according to evidence levels 'all'
-> loading files with information content for corresponding GO category (human)
finished.
-> loading files with information content for corresponding GO category (human)
-> retrieving GO information for all available genes for organism 'Saccharomyces cerevisiae' in GO database
-> filtering GO terms according to evidence levels 'all'


In [3]:
generateMap <- function(filename){
    map <- as.matrix(read.csv(filename, sep=",", header = F))
    communities <- map[,1]
    map <- map[,2:ncol(map)]
    rownames(map) <- communities
    colnames(map) <- communities
    return (map)
}

In [4]:
#background gene list
backgroundFilename <- "all_genes.txt"
allGenes <- scan(backgroundFilename, character())

#shortest path files
shortestPathFiles  <- list.files(pattern="*shortest_path*")

#shortest paths list
shortestPaths <- sapply(shortestPathFiles, generateMap)
names(shortestPaths) <- sapply(names(shortestPaths), function(name) strsplit(name, "_")[[1]][[1]])

#communitiy assignemtns
assignments <- as.matrix(read.csv("assignment_matrix.txt", sep=",", header=F))
rownames(assignments) <- allGenes
colnames <- sapply(1:ncol(assignments), function(i) as.character(i-1))
colnames(assignments) <- colnames

In [5]:
getDepth <- function(com) {
    return(which(apply(assignments, 2, function(i) any(i == com))))
}

getGenes <- function(com){
    return(names(which(assignments[,getDepth(com)] == com)))
}

getSubCommunities <- function(com){
    return(try(as.character(unique(assignments[getGenes(com), getDepth(com) + 1]))))
}

getSuperCommunity <- function(com){
    return(try(as.character(unique(assignments[getGenes(com), getDepth(com) - 1]))))
}

getShortestPath <- function(com){
    return (try(shortestPaths[[com]]))
}

In [26]:
enrichmentResults <- sapply(2:max(assignments), function(i) 
    GOenrichment(getGenes(i), allGenes, cutoff=0.05, method="weight01"))


Building most specific GOs .....
	( 572 GO terms found. )

Build GO DAG topology ..........
	( 1848 GO terms and 4082 relations. )

Annotating nodes ...............
	( 252 genes annotated to the GO terms. )

			 -- Weight01 Algorithm -- 

		 the algorithm is scoring 561 nontrivial nodes
		 parameters: 
			 test statistic: fisher

	 Level 14:	1 nodes to be scored	(0 eliminated genes)

	 Level 13:	10 nodes to be scored	(0 eliminated genes)

	 Level 12:	23 nodes to be scored	(6 eliminated genes)

	 Level 11:	33 nodes to be scored	(21 eliminated genes)

	 Level 10:	45 nodes to be scored	(49 eliminated genes)

	 Level 9:	53 nodes to be scored	(74 eliminated genes)

	 Level 8:	61 nodes to be scored	(89 eliminated genes)

	 Level 7:	64 nodes to be scored	(118 eliminated genes)

	 Level 6:	92 nodes to be scored	(159 eliminated genes)

	 Level 5:	86 nodes to be scored	(182 eliminated genes)

	 Level 4:	58 nodes to be scored	(205 eliminated genes)

	 Level 3:	26 nodes to be scored	(221 eliminat

In [27]:
rownames(enrichmentResults) <- c("terms","p-values","genes")
colnames(enrichmentResults) <- 2:max(assignments)

In [28]:
enrichmentResults["terms", "6"]

Unnamed: 0,go_id,Term,Definition
2441,GO:0001731,formation of translation preinitiation complex,"The joining of the small ribosomal subunit, ternary complex, and mRNA."
14824,GO:0045454,cell redox homeostasis,Any process that maintains the redox environment of a cell or compartment within a cell.
35727,GO:0030950,establishment or maintenance of actin cytoskeleton polarity,"Any cellular process that results in the specification, formation or maintenance of polarized actin-based cytoskeletal structures."
36891,GO:0031505,fungal-type cell wall organization,"A process that is carried out at the cellular level which results in the assembly, arrangement of constituent parts, or disassembly of the fungal-type cell wall."
44024,GO:0034503,protein localization to nucleolar rDNA repeats,"Any process in which a protein is transported to, or maintained at, the rDNA repeats on a chromosome in the nucleolus."
54648,GO:0045143,homologous chromosome segregation,"The cell cycle process in which replicated homologous chromosomes are organized and then physically separated and apportioned to two sets during the first division of the meiotic cell cycle. Each replicated chromosome, composed of two sister chromatids, aligns at the cell equator, paired with its homologous partner; this pairing off, referred to as synapsis, permits genetic recombination. One homolog (both sister chromatids) of each morphologic type goes into each of the resulting chromosome sets."
78651,GO:0070550,rDNA condensation,"The process in which the chromatin structure of the rDNA repeats is compacted. In S. cerevisiae, condensation and resolution of the rDNA occurs during anaphase."
79446,GO:0070941,eisosome assembly,"The aggregation, arrangement and bonding together of a set of components to form an eisosome, a cell part that is composed of the eisosome membrane or MCC domain, a furrow-like plasma membrane sub-domain and associated integral transmembrane proteins, and the proteins (eisosome filaments) that form a scaffolding lattice on the cytoplasmic face."
83423,GO:0090002,establishment of protein localization to plasma membrane,The directed movement of a protein to a specific location in the plasma membrane.


In [34]:
enrichmentResults[["p-values", "14"]]

In [35]:
enrichmentResults[["p-values", "15"]]

In [39]:
mean(getTermSim(termlist = names(enrichmentResults[["p-values", "6"]]), method = "Resnik", verbose = F))

In [38]:
mean(getTermSim(termlist = names(enrichmentResults[["p-values", "14"]]), method = "Resnik", verbose = F))

In [37]:
mean(getTermSim(termlist = names(enrichmentResults[["p-values", "15"]]), method = "Resnik", verbose = F))

In [43]:
enrichmentResults["p-values",]

In [46]:
communitySimilarity <- function(community) {
    termSims <- getTermSim(termlist = names(community), method = "Resnik", verbose = F)
    if (length(geneSims) > 1) {
        return(mean(termSims[upper.tri(termSims)]))
    } else {
        return (NaN)
    }
}

In [53]:
enrichmentResults[["p-values", "5"]]

In [54]:
communitySimilarity(enrichmentResults[["p-values", "5"]])

ERROR: Error in communitySimilarity(enrichmentResults[["p-values", "5"]]): object 'geneSims' not found


In [49]:
layerSimilarity <- function(layer) {
    p-valueList <- enrichmentResults["p-values", unique(assignments[,layer][assignments[,layer] != -1])]
    communitiesSimilarity <- sapply(p-valueList, communitySimilarity)
    communitiesSimilarity <- communitiesSimilarity[!is.na(communitiesSimilarity)]
    return(mean(communitiesSimilarity))
}

In [None]:
layerMeanSimilarities <- sapply(colnames, layerSimilarity)