## Objective: To identify genotype-phenotype trait association in yeast
### Develop a workflow to identify genes indirectly associated with a certain yeast phenotype (butanol tolerance) using EKP and visualize them in an interactive knowledge graph using SPOT


### Load the API scripts with login credentials


In [1]:
library(dplyr)
library(tidyr)
library(sqldf)
library(splitstackshape)
library(stringr)
library(compare)
setwd("/home/anandgavai/AARestructure/ODEX4all-UseCases/DSM/src")


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
Loading required package: DBI
Loading required package: data.table
------------------------------------------------------------------------------
data.table + dplyr code now lives in dtplyr.
Please library(dtplyr)!
------------------------------------------------------------------------------

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, last


Attaching package: ‘compare’

The following object is masked from ‘package:base’:

    isTRUE



### Load the API scripts with login credentials

In [27]:
source("/home/anandgavai/AARestructure/ODEX4all-UseCases/DSM/src/EuretosInfrastructure.R")
options(warn=-1)





Retrieving page 0
Retrieving page 1
Retrieving page 2
Retrieving page 3
Retrieving page 4
Retrieving page 5
Retrieving page 6
Retrieving page 7
Retrieving page 8
Retrieving page 9
Retrieving page 10
Retrieving page 11


### DSM workflow starts here:
### Load Input data provided by DSM this data consists of a list of yeast genes and a list of terms that represent butanol tolerance

In [4]:
yeast_genes<-read.csv("yeast_genes_sgdID.csv",header=TRUE)

## Step 1a : Get the starting concept identifiers

In [5]:
query = "/external/concepts/search"
start<-getConceptID(as.character(yeast_genes[,1]))


















In [26]:
head(start)

Id,EKP_Concept_Id,name
s000004214,4042749,ucc1 (saccharomyces cerevisiae s288c)
s000005850,4044156,pro2 (saccharomyces cerevisiae s288c)
s000003573,4043331,snx4 (saccharomyces cerevisiae s288c)
s000004640,4044162,msn2 (saccharomyces cerevisiae s288c)
s000001086,4045917,dog1 (saccharomyces cerevisiae s288c)
s000005393,4044097,mse1 (saccharomyces cerevisiae s288c)


## Step 1b: Get the ending concept identifiers for "resistance to chemicals"

In [6]:
end <- unlist(getResistanceEKPID())
end<-end["content.id"] #EKP ID of resistance to chemicals




In [8]:
head(end)

#### Note: The concept representing "resistance to chemicals" within EKP is indicated by its content.id

## Step 1c: Get the ending concept identifiers for "butanol tolerance"

In [9]:
end2<- unlist(getButanolID())
end2<-end2["content.id"] # EKP ID of butanol





In [10]:
head(end2)

#### Note: The concept representing "butanol tolerance" within EKP is indicated by its content.id

## Step 2a: Get Indirect relationships between "yeast genes"(start) and "resistance to chemicals"(end)

In [None]:
resistance2Chemicals<-getIndirectRelation(start,end)

In [33]:
load(file="resistance2Chemicals.rda")

In [37]:
head(resistance2Chemicals)

## Step 2b: Get Indirect relationships between "yeast genes"(start) and "resistance to butanol"(end)

In [None]:
resistance2Butanol<-getIndirectRelation(start,end2)

In [34]:
load(file="resistance2Butanol.rda")

In [38]:
head(resistance2Butanol)

### Formatting and data cleaning

In [35]:
dfs1<-as.matrix(getTableFromJson(resistance2Chemicals))
dfs1[,"Predicate"]<-str_replace_all(dfs1[,"Predicate"], "[^[:alnum:]]","")
dfs1[,"Predicate"]<-str_replace_all(dfs1[,"Predicate"], "c","")
dfs1[,"Publications"]<-str_replace_all(dfs1[,"Publications"], "[^[:alnum:]]","")
dfs1[,"Publications"]<-str_replace_all(dfs1[,"Publications"], "c","")
dfs1<- data.frame(dfs1, stringsAsFactors=FALSE)

### Formatting and data cleaning

In [41]:
dfs2<-as.matrix(getTableFromJson(resistance2Butanol))
dfs2[,"Predicate"]<-str_replace_all(dfs2[,"Predicate"], "[^[:alnum:]]","")
dfs2[,"Predicate"]<-str_replace_all(dfs2[,"Predicate"], "c","")
dfs2[,"Publications"]<-str_replace_all(dfs2[,"Publications"], "[^[:alnum:]]","")
dfs2[,"Publications"]<-str_replace_all(dfs2[,"Publications"], "c","")
dfs2<- data.frame(dfs2, stringsAsFactors=FALSE)

### Step 3: Intersect "resistance to chemicals" and "1-butanol" concepts

In [42]:
comparison <- compare(dfs1,dfs2,allowAll=TRUE)
dfs<-comparison$tM

In [43]:
head(dfs)

Subject,Predicate,Object,Publications,Score
1018089,9324406,4247440,210120024,11.8198
1018089,9324406,4247440,210209071,11.8198
1018089,9324406,4247440,210213483,11.8198
1018089,9324406,4247440,210225739,11.8198
1018089,9324410,4247440,210225186,11.8198
1018089,9324411,4247440,210219161,11.8198


In [44]:
dim(dfs)

### Step 4: Map human redable triples from the reference database 
### reference list is collected from EKP

In [None]:
pred<-read.csv("Reference_Predicate_List.csv",header=TRUE)
pred<-pred[,c(2,3)]
colnames(pred)<-c("pred","names")


subject_name<-getConceptName(dfs[,"Subject"])
dfs<-cbind(dfs,subject_name[,2])

object_name<-getConceptName(dfs[,"Object"])
dfs<-cbind(dfs,object_name[,2])

predicate_name<-sqldf('select * from dfs left join pred on pred.pred=dfs.Predicate')

tripleName<-cbind(subject_name[,"name"],as.character(predicate_name[,"names"]),object_name[,"name"],dfs[,"Publications"],dfs[,"Score"])
colnames(tripleName)<-c("Subject","Predicate","Object","Provenance","Score")

In [47]:
head(tripleName)

Subject,Predicate,Object,Provenance,Score
tellurite,is adjacent to,resistance to chemicals,210120024,11.8198
tellurite,is adjacent to,resistance to chemicals,210209071,11.8198
tellurite,is adjacent to,resistance to chemicals,210213483,11.8198
tellurite,is adjacent to,resistance to chemicals,210225739,11.8198
tellurite,is functionally related to,resistance to chemicals,210225186,11.8198
tellurite,affects,resistance to chemicals,210219161,11.8198


In [48]:
dim(tripleName)

### Step 4: Write output to a file and Vizualize these in Triple Viewer/Spot

In [34]:
write.table(tripleName,file="./triple.csv",sep=";",row.names=FALSE)