In [1]:
#libraries
library(GO.db)
library(topGO)
library(org.Sc.sgd.db)

Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, cbind, colnames, do.call,
    duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
    is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
    paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
    Reduce, rownames, sapply, setdiff, sort, table, tapply, union,
    unique, unsplit, which, which.max, which.min

Loading required package: Biobase
Welcome to Bioconductor

 

In [2]:
# background gene universe: the distinct identifiers read from Y2H_union.txt
setwd('/home/david/Documents/ghsom')
allGenes <- unique(scan("Y2H_union.txt", what = character()))

In [3]:
## read community_0.txt, community_1.txt, ... until the first missing file
setwd("/home/david/Documents/ghsom/union_communities_04")

g <- list()
numCom <- 0
repeat {
    # files are numbered from 0, list slots from 1: g[[k]] holds community_<k-1>.txt
    filename <- sprintf("community_%s.txt", numCom)
    if (!file.exists(filename)) break
    numCom <- numCom + 1
    g[[numCom]] <- scan(filename, character())
}
# number of community files that were loaded
numCom

In [4]:
# pairwise distances between neurons, read as a header-less comma-separated table
shortest.path <- read.csv(file = "shortest_path.csv", header = FALSE, sep = ",")

In [5]:
# p-value threshold a GO term must beat on BOTH Fisher tests to be kept
cutOff <- 0.005

# per-community results, indexed the same way as `g`
geneLists <- vector("list", numCom)
GOdataObjects <- vector("list", numCom)
resultFishers <- vector("list", numCom)
resultFisher.elims <- vector("list", numCom)
results <- vector("list", numCom)
gos <- vector("list", numCom)

# perform enrichment analyses
# seq_len() gives zero iterations when numCom is 0 (1:0 would iterate over 1 and 0);
# the index is named `com` so it does not shadow base::c
for (com in seq_len(numCom)) {

    # factor of interesting genes: 1 = gene belongs to this community, 0 = background only
    geneList <- factor(as.integer(allGenes %in% g[[com]]))
    names(geneList) <- allGenes
    geneLists[[com]] <- geneList

    # construct topGO object
    GOdata <- new("topGOdata", description = sprintf("topGO object for community %s", com),
                  ontology = "BP", allGenes = geneList,
                  annotationFun = annFUN.org, mapping = "org.Sc.sgd.db",
                  ID = "ENSEMBL", nodeSize = 10)
    GOdataObjects[[com]] <- GOdata

    # fishers exact test classic
    resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
    resultFishers[[com]] <- resultFisher

    # fishers exact test elimination
    resultFisher.elim <- runTest(GOdata, algorithm = "elim", statistic = "fisher")
    resultFisher.elims[[com]] <- resultFisher.elim

    # tabulate results (top 20 terms ordered by the classic test)
    allRes <- GenTable(GOdata, classicFisher = resultFisher,
                       elimFisher = resultFisher.elim,
                       orderBy = "classicFisher", ranksOf = "elimFisher", topNodes = 20)
    results[[com]] <- allRes

    # go terms below cutOff on both tests.
    # GenTable() returns the p-value columns as formatted character strings
    # (e.g. "1.2e-05", "< 1e-30"), so `classicFisher < cutOff` on the table would be
    # a lexical string comparison. Take the numeric p-values from the test results
    # instead, restricted to the terms that made it into the table.
    topTerms <- as.character(allRes$GO.ID)
    pClassic <- score(resultFisher)[topTerms]
    pElim <- score(resultFisher.elim)[topTerms]
    # which() drops terms whose score is missing rather than yielding NA entries
    gos[[com]] <- topTerms[which(pClassic < cutOff & pElim < cutOff)]
}


Building most specific GOs .....
	( 1870 GO terms found. )

Build GO DAG topology ..........
	( 3862 GO terms and 8744 relations. )

Annotating nodes ...............
	( 1913 genes annotated to the GO terms. )

			 -- Classic Algorithm -- 

		 the algorithm is scoring 590 nontrivial nodes
		 parameters: 
			 test statistic: fisher

			 -- Elim Algorithm -- 

		 the algorithm is scoring 590 nontrivial nodes
		 parameters: 
			 test statistic: fisher
			 cutOff: 0.01

	 Level 14:	4 nodes to be scored	(0 eliminated genes)

	 Level 13:	3 nodes to be scored	(0 eliminated genes)

	 Level 12:	6 nodes to be scored	(0 eliminated genes)

	 Level 11:	13 nodes to be scored	(0 eliminated genes)

	 Level 10:	18 nodes to be scored	(0 eliminated genes)

	 Level 9:	45 nodes to be scored	(0 eliminated genes)

	 Level 8:	57 nodes to be scored	(20 eliminated genes)

	 Level 7:	81 nodes to be scored	(72 eliminated genes)

	 Level 6:	111 nodes to be scored	(110 eliminated genes)

	 Level 5:	118 nodes to be 

In [None]:
## semantic similarity: GO annotation data (biological process, ENSEMBL keys)
library(GOSemSim)
scGO <- godata('org.Sc.sgd.db', keytype="ENSEMBL", ont="BP")

GOSemSim v2.0.4  For help: https://guangchuangyu.github.io/GOSemSim

If you use GOSemSim in published research, please cite:
Guangchuang Yu, Fei Li, Yide Qin, Xiaochen Bo, Yibo Wu, Shengqi Wang. GOSemSim: an R package for measuring semantic similarity among GO terms and gene products Bioinformatics 2010, 26(7):976-978


In [None]:
# One slot per unordered pair of communities. Pairs where either community has no
# enriched GO terms are skipped, so only the first `completed` slots get filled.
nPairs <- (numCom * (numCom - 1)) / 2
distances <- numeric(nPairs)
semSims <- numeric(nPairs)

completed <- 0

# seq_len() gives zero iterations when numCom is 0 (1:0 would iterate over 1 and 0)
for (c1 in seq_len(numCom)) {

    if (length(gos[[c1]]) == 0) next

    for (c2 in c1:numCom) {

        if (c1 == c2) next

        if (length(gos[[c2]]) == 0) next

        completed <- completed + 1

        # semantic similarity between the enriched GO term sets of the two communities
        # (alternative kept for reference: clusterSim() on the gene lists g[[c1]], g[[c2]])
        semSims[completed] <- mgoSim(gos[[c1]], gos[[c2]], semData=scGO, measure="Wang", combine="BMA")
        distances[completed] <- shortest.path[c1, c2]

        print(sprintf("Completed: %s", completed))
    }
}

# Drop the unused preallocated slots. Both vectors are cut with the SAME index so
# that element i of `distances` and element i of `semSims` still describe the same
# pair; filtering each vector by its own `> 0` mask could misalign them or leave
# them with different lengths.
filled <- seq_len(completed)
distances <- distances[filled]
semSims <- semSims[filled]

# remove pairs with a missing value in either measure, again with one shared mask
paired <- !is.na(distances) & !is.na(semSims)
distances <- distances[paired]
semSims <- semSims[paired]

In [None]:
# scatter of map distance (x) against semantic similarity (y), one point per community pair
plot(x = distances, y = semSims, ylab="Semantic Similarity", xlab="Distance on Map")

In [39]:
# display the semantic similarity values that were kept
semSims

In [40]:
# correlation (cor() default method) between map distance and semantic similarity
cor(x = distances, y = semSims)