/
bioMart_2021.Rmd
138 lines (105 loc) · 6.24 KB
/
bioMart_2021.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
---
title: "Get gene length and pathways files 2020"
output: html_document
---
```{r}
base_directory <-"/Users/abouchakra/Dropbox/Uoft/UOFT/projects/5-Gene_Length_pathway"
current_working_directory <- file.path(base_directory, "3-R_directpathwayfiles_mathematica")
data_directory <- file.path(current_working_directory,"data2020")
gl_filename <- paste(data_directory, "sessioninfo.txt",sep="/")
write.table(as.matrix(sessionInfo()),gl_filename,col.name=TRUE,sep="\t",row.names=FALSE,quote=FALSE,append=FALSE)
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install(version = "3.10")
#BiocManager::install(c("GenomicFeatures", "AnnotationDbi"))
#BiocManager::install("biomaRt")
#install.packages(c("reticulate", "rlang", "spatstat", "tidyr", "vctrs"))
library("biomaRt")
#listMarts()
#listMarts(host="plants.ensembl.org")
#listMarts(host="metazoa.ensembl.org")
#listMarts(host="fungi.ensembl.org")
#listMarts("bacteria.ensembl.org")
#listMarts(host="protists.ensembl.org")
```
```{r}
pathwaysOpt<-1 #if you need go and pathways for the species too choose 1
#databases location
hostList<-c("ensembl.org","plants.ensembl.org","metazoa.ensembl.org","fungi.ensembl.org","protists.ensembl.org")
#databases
biomartList<-c("ensembl","plants_mart","metazoa_mart","fungi_mart","protists_mart")
#for(bm in 1:(length(biomartList))){
bm=4
mart = useMart(biomart = biomartList[bm],
host = hostList[bm])
en_Alldatasets<-listDatasets(mart)
#All_Spcs_Ensembl
gl_filename <- paste(data_directory, paste(biomartList[bm], "datasets.txt",sep="_"),sep="/")
write.table(en_Alldatasets[,1],gl_filename,col.name=TRUE,sep="\t",row.names=FALSE,quote=FALSE,append=FALSE)
mm<-55 #79 = humans #26 or 39 in meta worm, 106 mouse 50 fruitfly, 54 zebrafish, 70 ggallus, 103 mmulatta, 165 yeast, 179 sscrofa,195 fugu, 203 xtropicalis
#5 "agossypii", 27 "kpastoris", 55 "ylipolytica",
#for(mm in 142:(length(en_Alldatasets[,1]))){
#make sure it connects here. Might have to run below command multiple times.
mart = useMart(biomart=biomartList[bm], host = hostList[bm],dataset=en_Alldatasets[mm,1])
#,'transcript_length'
#'entrezgene',
#Using biomart get all transcripts for the given species.
genes = getBM(attributes = c('ensembl_gene_id','ensembl_transcript_id', 'external_gene_name', 'start_position','end_position', 'transcript_start', 'transcript_end','transcript_length'), filters=list(biotype='protein_coding',transcript_biotype = 'protein_coding'), mart=mart);
#use ensembl gene ids to get rid of duplcates - do you want to get rid of duplicates?
# there is a huge difference between number of transcripts and number of genes. If the
# analysis is gene centric then you will be throwing out a lot of information.
#currently duplicates in transcripts only are removed
genes <- unique(genes[which(!duplicated(genes[,2])),])
#genes$description = gsub("\\[Source.*", "", genes$description);
genes$gene_length <- genes$end_position - genes$start_position + 1
genes$transcript_lengthC <- genes$transcript_end - genes$transcript_start + 1
genes<- genes[order(genes[,1]),]
#need to replace empty with something
#is.null?
genes[which(genes[,1] == ""),1] <- "NULL"
genes[which(genes[,3] == ""),3] <- "NULL"
#
#All_Spcs_Ensembl
gl_filename <- paste(data_directory, paste(biomartList[bm],"All_Spcs_Ensembl",sep="_"), paste(en_Alldatasets[mm,1], "genelength.txt",sep="_"),sep="/")
write.table(genes,gl_filename,col.name=TRUE,sep="\t",row.names=FALSE,quote=FALSE,append=FALSE)
#The following section gets the go annotations and pathway information we need from ensembl
if(pathwaysOpt==1){
#ensembl_transcript_id'
go_annotation_bp <- getBM(attributes = c( 'ensembl_gene_id',"go_id","name_1006", "namespace_1003","go_linkage_type"),
filters=list(biotype='protein_coding',transcript_biotype = 'protein_coding')
, mart=mart);
#get just the go biological process subset
go_annotation_bp <- go_annotation_bp[which(go_annotation_bp$namespace_1003 == "biological_process"),]
#all transcripts are part of the go annotation and doesnt actually distinguish transcripts from one another...transcript version is too noisy
#gl_filename <- paste(data_directory, "All_Spcs_Pathways", paste(en_Alldatasets[mm,1], "gene to_go_pathway.csv",sep="_"),sep="/")
#write.table(go_annotation_bp[,c(1:2)],file=gl_filename,col.name=TRUE,sep="\t",row.names=FALSE,quote=FALSE)
#saves go_id and pathway name into one file
newgoid_name<-subset(go_annotation_bp,select = c("go_id","name_1006"))
newgoid_name<-unique(newgoid_name[which(!duplicated(newgoid_name[,1])),])
names(newgoid_name)<-c("go_id","pathway_name")
gl_filename <- paste(data_directory, "All_Spcs_Pathways", paste(en_Alldatasets[mm,1], "goid_to_go_pathway_name.csv",sep="_"),sep="/")
write.csv(newgoid_name,file=gl_filename,row.names=FALSE)
#,col.name=TRUE,sep="\t",row.names=FALSE,quote=FALSE)
#compute the unique pathway sets
go_pathway_sets <- aggregate(go_annotation_bp[,1:5],by = list(go_annotation_bp$go_id),FUN = function(x){list(unique(x))})
#i would like to save the goid to ensembl gene set data and the clean_pathways does that
gene.per.goid <- sapply(go_pathway_sets$ensembl_gene_id,length)
clean_go_pathway_sets <- go_pathway_sets
clean_go_pathway_sets$flat_ensembl_geneid<-unlist(lapply(go_pathway_sets$ensembl_gene_id, function(x){paste0(x,collapse = " ")} ))
clean_go_pathway_sets<-subset(clean_go_pathway_sets,select = c("Group.1","flat_ensembl_geneid"))
names(clean_go_pathway_sets)<-c("go_id","ensembl_geneid")
#save clean_go_pathways
gl_filename <- paste(data_directory, "All_Spcs_Pathways", paste(en_Alldatasets[mm,1], "goid_to_gene.csv",sep="_"),sep="/")
write.csv(clean_go_pathway_sets,file=gl_filename,row.names=FALSE)
#,col.name=TRUE,sep="\t",row.names=FALSE,quote=FALSE)
#gl_filename <- paste(data_directory, "All_Spcs_Pathways", paste(en_Alldatasets[mm,1], "goid_to_go_pathway_name.csv",sep="_"),sep="/")
#write.csv(go_pathway_sets[,c(3:4)],file=gl_filename)
#,col.name=TRUE,sep="\t",row.names=FALSE,quote=FALSE)
#gl_filename <- paste(data_directory, "All_Spcs_Pathways", paste(en_Alldatasets[mm,1], "go_pathway_sets.RData",sep="_"),sep="/")
#save(go_pathway_sets, file=gl_filename)
}#pathway
#to save memory
go_annotation_bp<-c()
#}#for datasets
#}#for databases
```