## Objective: To identify genotype-phenotype trait association in rice
### Develop a workflow to identify genes indirectly associated with a certain rice phenotype (grain number) using EKP and visualize them in an interactive knowledge graph.

In [3]:
library(dplyr)
library(tidyr)
library(sqldf)
library(splitstackshape)
library(stringr)
library(compare)


#### load EKP API

In [4]:
source("..//src/EuretosInfrastructure.R")




Retrieving page 0
Retrieving page 1
Retrieving page 2
Retrieving page 3
Retrieving page 4
Retrieving page 5
Retrieving page 6
Retrieving page 7
Retrieving page 8
Retrieving page 9
Retrieving page 10
Retrieving page 11


In [5]:
#### qtaro.abr.affrc.go.jp/qtab/table
# Make the use-case data directory the working directory; the relative file
# names used by later cells (CSV inputs, the .rda cache) resolve against it.
# NOTE(review): the path is specific to the author's home directory layout.
setwd("~/odex4all_usecases/ODEX4all-UseCases/Bayer/data")

#### Data downloaded from the QTARO database, located at qtaro.abr.affrc.go.jp/qtab/table
#### This file can be replaced by one containing other gene symbols.

In [6]:
# Read the QTARO gene table from the working directory; later cells use its
# `gene_symbol` and `character_major` columns.
rice_genes <-read.csv("GeneInformationTable_Qtaro.csv",header=TRUE)

#### Select only genes annotated with a morphological trait. The EKP concept ids associated with the candidate traits below are dynamic (snapshot date: 08-05-2017)
#### "grain size" (EKP concept id : 5899980)
#### "grain thickness" (EKP concept id  :5900661)
#### "grain number" (EKP concept id (rice specific) :4343608)
#### "kernel number" (EKP concept id:5900190)
#### "GRNB" (EKP concept:5900394)
#### "fruit number" (EKP concept:5900077)
#### "grain number per plant" (EKP concept (exact): 5900828)
#### "GN" (EKP concept:(vague many hits within EKP))
##### Note: The traits listed above were not specific within EKP; for this prototype, Grain Number is considered. The workflow is, however, not restricted to this trait alone!

In [8]:
# Keep only the rows whose major character is "Morphological trait", then
# replace `rice_genes` with the unique, lower-cased gene symbols of those rows
# (a plain character vector).
rice_genes <- rice_genes %>%
  select(gene_symbol, character_major) %>%
  filter(character_major == "Morphological trait")
rice_genes <- unique(tolower(as.character(rice_genes[, "gene_symbol"])))


In [9]:
# Preview the first few gene symbols that survived the filtering above.
head(rice_genes)

## Step 1a : Get the starting concept identifiers

In [27]:
# NOTE(review): `start` is not assigned in any cell visible in this notebook;
# the code for Step 1a (looking up the concept identifiers of the rice genes)
# appears to be missing. Unless the sourced infrastructure file defines it,
# `start` resolves to the `stats::start` function here — confirm.
head(start)


## Step 1b: Get the ending concept identifiers for "Grain Number"

In [11]:
# Flatten the trait lookup result and keep only the element named
# "content.id", i.e. the EKP ID of Grain Number.
end <- unlist(getTraitEKPID())["content.id"]





## Step 2: Get Indirect relationships between "rice genes"(start) and "grain number"(end)

In [None]:
# Query the indirect relationships between the start and end concepts, then
# store the result in an .rda file in the working directory.
# NOTE(review): presumably cached so the query does not have to be re-run;
# the next code cell restores the object with load().
genes2GrainNumber<-getIndirectRelation(start,end)
save(genes2GrainNumber, file = "genes2GrainNumber.rda")



#### load the file from the disk


In [12]:
# Restore the objects stored in the .rda file (resolved against the working
# directory) into the current environment.
load("genes2GrainNumber.rda")


### Formatting and data cleaning

In [29]:
# Turn the JSON result into a matrix so its columns can be cleaned in place.
dfs <- genes2GrainNumber %>%
  getTableFromJson() %>%
  as.matrix()

# For both columns: drop every non-alphanumeric character, then every
# lower-case "c".
# NOTE(review): presumably this strips the c(...) wrapper of deparsed vectors;
# it also removes any other "c" and the separators between multiple values —
# confirm against what getTableFromJson() actually returns.
dfs[, "Predicate"] <- dfs[, "Predicate"] %>%
  str_replace_all("[^[:alnum:]]", "") %>%
  str_replace_all("c", "")
dfs[, "Publications"] <- dfs[, "Publications"] %>%
  str_replace_all("[^[:alnum:]]", "") %>%
  str_replace_all("c", "")

# Back to a data frame, keeping the cleaned columns as character vectors.
dfs <- data.frame(dfs, stringsAsFactors = FALSE)

# Suppresses all warnings for the rest of the R session, not only this cell.
options(warn = -1)

### Step 3: Map human-readable triples from the reference database
### reference list is collected from EKP

In [14]:
# Load the reference predicate list, keep its second and third columns and
# name them `pred` (predicate id) and `names` (readable predicate label).
pred <- read.csv("Reference_Predicate_List.csv", header = TRUE)[, c(2, 3)]
names(pred) <- c("pred", "names")


In [16]:
# Preview the first rows of the predicate id -> label lookup table.
head(pred)

pred,names
10773733,gene product does not have associated anatomy
10773734,gene product is not element in pathway
10773735,gene product malfunction is not associated with
10773736,gene product does not play role in biological process
10773737,gene product does not have biochemical function
10773738,gene product is not biomarker type


### Step 4: Generate output in the form of triples

In [33]:
# Left-join every row of `dfs` with the reference list on the predicate id.
# All `dfs` rows are kept; the joined `pred`/`names` columns are NA where the
# id has no match in the reference list.
predicate_name<-sqldf('select * from dfs left join pred on pred.pred=dfs.Predicate')

In [35]:
# Assemble one row per relationship: subject name, readable predicate label,
# object name, publications and score.
# NOTE(review): `subject_name` and `object_name` are not created in any cell
# visible in this notebook — confirm where they come from.
tripleName <- cbind(
  subject_name[, "name"],
  as.character(predicate_name[, "names"]),
  object_name[, "name"],
  dfs[, "Publications"],
  dfs[, "Score"]
)

In [36]:
# Label the five columns in the order they were bound together.
colnames(tripleName)<-c("Subject","Predicate","Object","Provenance","Score")

In [18]:
# Preview the first entries of individual columns of the triple matrix.
head(tripleName[,"Subject"])


In [19]:
head(tripleName[,"Predicate"])

In [20]:
head(tripleName[,"Object"])

In [25]:
head(tripleName[,"Score"])


### Step 5: Write output to a file

In [22]:
# Write the triples as a comma-separated file with a header row and without
# row names.
# NOTE(review): the output path is absolute and user-specific (under src/),
# unlike the relative paths used for the inputs.
write.table(tripleName,file="~/odex4all_usecases/ODEX4all-UseCases/Bayer/src/ConceptsRelatedGrainNumberTriples.csv",sep=",",row.names = FALSE)