## Notebook for extracting graph from tsv

We have extracted data in /data/drug_interactions.tsv with the following fields:

- drug_interaction_id: id of drug A
- name: name of drug A
- description: interaction info of drug A with drug B
- drugbank_id: id of drug B

Now we want to build a graph whose nodes are the drugs, with one edge for each drug_interaction_id–drugbank_id pair (i.e. one edge per row of the tsv).

In [1]:
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

Intitializing Scala interpreter ...

Spark Web UI available at http://a61e81c06b19:4040
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1603063223840)
SparkSession available as 'spark'


import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD


In [2]:
// Load the tab-separated interaction file as an RDD of raw text lines.
val lines = sc.textFile("/home/jovyan/work/data/drug_interactions.tsv")
// The first line holds the column names; keep it around so it can be filtered out.
val header = lines.first()
// Data rows only: every line that is not identical to the header line.
val data = lines.filter(_ != header)

lines: org.apache.spark.rdd.RDD[String] = /home/jovyan/work/data/drug_interactions.tsv MapPartitionsRDD[1] at textFile at <console>:30
header: String = drug_interaction_id	name	description	drugbank_id
data: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at <console>:33


In [3]:
// Peek at the first two rows of `data` (the file contents with the header line filtered out).
data.take(2)

res0: Array[String] = Array(DB06605	Apixaban	Apixaban may increase the anticoagulant activities of Lepirudin.	DB00001, DB06695	Dabigatran etexilate	Dabigatran etexilate may increase the anticoagulant activities of Lepirudin.	DB00001)


In [4]:
// Split every data row on tabs into its fields:
// (drug_interaction_id, name, description, drugbank_id).
val lines = data.map(line => line.split("\t"))
// One (drug id, drug name) pair per distinct drug found in the drug A columns (fields 0 and 1).
// NOTE(review): a drug that only ever appears in the drugbank_id column (drug B) gets no entry
// here -- confirm that the dataset lists every drug as drug A at least once.
val drugs = lines.map(line => (line(0), line(1))).distinct()
// Give every drug a Long index that serves as its graph node id.
// distinct() shuffles, and Spark does not guarantee that zipWithIndex assigns the same indices
// if such an RDD is re-evaluated. Several later cells derive from drugsWIndex, so it is cached
// to let them share one index assignment instead of each recomputing it.
val drugsWIndex = drugs.zipWithIndex().cache()

lines: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[3] at map at <console>:31
drugs: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[7] at distinct at <console>:32
drugsWIndex: org.apache.spark.rdd.RDD[((String, String), Long)] = ZippedWithIndexRDD[8] at zipWithIndex at <console>:33


In [63]:
// Show the first five rows after splitting each one into its tab-separated fields.
lines.take(5)

res38: Array[Array[String]] = Array(Array(DB06605, Apixaban, Apixaban may increase the anticoagulant activities of Lepirudin., DB00001), Array(DB06695, Dabigatran etexilate, Dabigatran etexilate may increase the anticoagulant activities of Lepirudin., DB00001), Array(DB01254, Dasatinib, The risk or severity of bleeding and hemorrhage can be increased when Dasatinib is combined with Lepirudin., DB00001), Array(DB01609, Deferasirox, The risk or severity of gastrointestinal bleeding can be increased when Lepirudin is combined with Deferasirox., DB00001), Array(DB01586, Ursodeoxycholic acid, The risk or severity of bleeding and bruising can be increased when Lepirudin is combined with Ursodeoxycholic acid., DB00001))


In [5]:
// Sample four of the distinct (drug id, drug name) pairs.
drugs.take(4)

res1: Array[(String, String)] = Array((DB00492,Fosinopril), (DB04826,Thenalidine), (DB01395,Drospirenone), (DB13557,Thiopropazate))


In [6]:
// Sample four drugs together with the index that zipWithIndex assigned to each.
drugsWIndex.take(4)

res2: Array[((String, String), Long)] = Array(((DB00492,Fosinopril),0), ((DB04826,Thenalidine),1), ((DB01395,Drospirenone),2), ((DB13557,Thiopropazate),3))


In [11]:
// each vertex: (node id, (drug_interaction_id of drug A, name of drug A))
// TODO could add some more info here to each vertex like number of drugs it interacts with, protein bindings etc?

In [84]:
// Flip each (drug, index) pair into vertex form: (node id, (drug id, drug name)).
val vertexRDD = drugsWIndex.map { case (drug, nodeId) => (nodeId, drug) }

vertexRDD: org.apache.spark.rdd.RDD[(Long, (String, String))] = MapPartitionsRDD[112] at map at <console>:30


In [85]:
// Inspect one vertex: (node id, (drug id, drug name)).
vertexRDD.take(1)

res47: Array[(Long, (String, String))] = Array((0,(DB00492,Fosinopril)))


In [42]:
// Drug A id (field 0) of every row, paired with that row's position in the dataset.
val originalDrugIDs = lines.map(_(0)).zipWithIndex()

originalDrugIDs: org.apache.spark.rdd.RDD[(String, Long)] = ZippedWithIndexRDD[26] at zipWithIndex at <console>:30


In [43]:
// Inspect one (drug A id, row index) pair.
originalDrugIDs.take(1)

res24: Array[(String, Long)] = Array((DB06605,0))


In [40]:
// Drug B id (field 3, drugbank_id) of every row, paired with that row's position in the dataset.
// The row index is what later lets drug A and drug B of the same row be matched up again.
val interactionDrugIDs = lines.map(_(3)).zipWithIndex()

interactionDrugIDs: org.apache.spark.rdd.RDD[(String, Long)] = ZippedWithIndexRDD[24] at zipWithIndex at <console>:31


In [41]:
// Inspect one (drug B id, row index) pair.
interactionDrugIDs.take(1)

res23: Array[(String, Long)] = Array((DB00001,0))


In [20]:
// Lookup from drug id to node id: drop the drug name and keep (drug id, node id).
val drug2NodeMap = drugsWIndex.map { case ((drugId, _), nodeId) => (drugId, nodeId) }
// Inspect one entry of the lookup.
drug2NodeMap.take(1)

drug2NodeMap: org.apache.spark.rdd.RDD[(String, Long)] = MapPartitionsRDD[11] at map at <console>:30
res9: Array[(String, Long)] = Array((DB00492,0))


In [64]:
// Resolve the drug B id of every row to its node id.
// Joining on the drug id yields (drug id, (node id, row index)); the result is re-keyed by row
// index as (row index, (drug id, node id)) and sorted so rows come back in the order of the
// original dataset (the row index was added to interactionDrugIDs with zipWithIndex).
val drugBids = drug2NodeMap
  .join(interactionDrugIDs)
  .map { case (drugId, (nodeId, rowIdx)) => (rowIdx, (drugId, nodeId)) }
  .sortByKey()

drugBids: org.apache.spark.rdd.RDD[(Long, (String, Long))] = ShuffledRDD[70] at sortByKey at <console>:37


In [55]:
// First five drug B entries: (row index, (drug B id, node id)).
drugBids.take(5)

res32: Array[(Long, (String, Long))] = Array((0,(DB00001,3022)), (1,(DB00001,3022)), (2,(DB00001,3022)), (3,(DB00001,3022)), (4,(DB00001,3022)))


In [65]:
// Resolve the drug A id of every row to its node id.
// Joining on the drug id yields (drug id, (node id, row index)); the result is re-keyed by row
// index as (row index, (drug id, node id)) and sorted so rows come back in the order of the
// original dataset (the row index was added to originalDrugIDs with zipWithIndex).
val drugAids = drug2NodeMap
  .join(originalDrugIDs)
  .map { case (drugId, (nodeId, rowIdx)) => (rowIdx, (drugId, nodeId)) }
  .sortByKey()

drugAids: org.apache.spark.rdd.RDD[(Long, (String, Long))] = ShuffledRDD[77] at sortByKey at <console>:37


In [66]:
// First five drug A entries: (row index, (drug A id, node id)).
drugAids.take(5)

res39: Array[(Long, (String, Long))] = Array((0,(DB06605,220)), (1,(DB06695,2363)), (2,(DB01254,3253)), (3,(DB01609,2748)), (4,(DB01586,448)))


In [61]:
// Sanity check: the drug ids and row indices here should line up with drugAids.take(5).
originalDrugIDs.take(5) // matches with above

res36: Array[(String, Long)] = Array((DB06605,0), (DB06695,1), (DB01254,2), (DB01609,3), (DB01586,4))


In [69]:
// Pair up drug A and drug B of the same row by joining on the row index, giving
// (row index, ((drug A id, node A), (drug B id, node B))), then order by row index.
val edgeData = {
  val pairedByRow = drugAids.join(drugBids)
  pairedByRow.sortByKey()
}

edgeData: org.apache.spark.rdd.RDD[(Long, ((String, Long), (String, Long)))] = ShuffledRDD[89] at sortByKey at <console>:33


In [70]:
// First five joined rows: (row index, ((drug A id, node A), (drug B id, node B))).
edgeData.take(5)

res41: Array[(Long, ((String, Long), (String, Long)))] = Array((0,((DB06605,220),(DB00001,3022))), (1,((DB06695,2363),(DB00001,3022))), (2,((DB01254,3253),(DB00001,3022))), (3,((DB01609,2748),(DB00001,3022))), (4,((DB01586,448),(DB00001,3022))))


In [81]:
// With node ids resolved for both ends of every interaction (one per tsv row),
// build one edge per row from drug A's node to drug B's node.
// The row index and the drug id strings are discarded; every edge carries the same label.
val edgeRDD = edgeData.map { case (_, ((_, srcNode), (_, dstNode))) =>
  Edge(srcNode, dstNode, "interaction")
}



edgeRDD: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[String]] = MapPartitionsRDD[99] at map at <console>:35


In [82]:
// First five edges: Edge(node A, node B, label).
edgeRDD.take(5)

res46: Array[org.apache.spark.graphx.Edge[String]] = Array(Edge(220,3022,interaction), Edge(2363,3022,interaction), Edge(3253,3022,interaction), Edge(2748,3022,interaction), Edge(448,3022,interaction))


In [86]:
// Assemble the property graph: vertices carry (drug id, drug name), edges carry a String label.
val graph = Graph(vertices = vertexRDD, edges = edgeRDD)

graph: org.apache.spark.graphx.Graph[(String, String),String] = org.apache.spark.graphx.impl.GraphImpl@36582ef0


In [87]:
// Persist the graph as text: vertices and edges are written to separate output paths.
val verticesPath = "/home/jovyan/work/data/vertices.txt"
val edgesPath = "/home/jovyan/work/data/edges.txt"
graph.vertices.saveAsTextFile(verticesPath)
graph.edges.saveAsTextFile(edgesPath)