#### Import BTE modules

In [2]:
# Data handling
import pandas as pandas
# BioThings Explorer (BTE) query, visualisation and ID-hint helpers
from biothings_explorer.query.predict import Predict
from biothings_explorer.query.visualize import display_graph
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# NOTE(review): presumably patches the event loop so BTE's async calls can run
# inside the notebook's already-running loop -- confirm against nest_asyncio docs
import nest_asyncio
nest_asyncio.apply()
# Graph and plotting libraries
import networkx as nx
import matplotlib.pyplot as plt

%matplotlib inline
# Silence every warning for the rest of the notebook
import warnings
warnings.filterwarnings("ignore") 

# Shared Hint instance; its query() method is used below to look up identifiers
ht = Hint()

ModuleNotFoundError: No module named 'data_tools'

### Find representation of VAMP2 in BTE

In [None]:
# Look up VAMP2 by its Ensembl gene ID (ENSG00000220205) and keep the first
# "Gene" hit as the query input object
vamp2_hits = ht.query("ENSG00000220205")
vamp2 = vamp2_hits["Gene"][0]
vamp2

### Biological Process related to VAMP2
Find all the **BiologicalProcess** entities directly connected to the gene **VAMP2** (no intermediate nodes, i.e. `intermediate_nodes=[]`).

In [None]:
# Direct VAMP2 -> BiologicalProcess query (empty list = no intermediate nodes)
fc_BP = FindConnection(
    input_obj=vamp2,
    output_obj="BiologicalProcess",
    intermediate_nodes=[],
)

In [None]:
# Run the VAMP2 -> BiologicalProcess query.
# Passing verbose=True displays all the steps BTE takes to find the connection.
fc_BP.connect()

In [None]:
# Collect the query results as a table (df_BP) and display it
df_BP=fc_BP.display_table_view ()
df_BP

In [None]:
# Count how many times each Biological Process name appears in the results
df_BP["output_name"].value_counts()

In [None]:
# Count occurrences of each Biological Process ID; the index of this Series
# holds the unique IDs that are fed to bp_to_cs later on
bp_output_id = df_BP["output_id"].value_counts()
bp_output_id

###  Query Path: BP>Genes>ChemicalSubstances>Disease
We want to find all the **Genes** associated with each **BiologicalProcess** connected to VAMP2 (previously retrieved). We also want to find the **ChemicalSubstances** connecting each gene to the **Diseases** of interest (epilepsy, autism, intellectual disability).

For loop: 
1. Genes connected to the Biological Process (**bp_output_id**) 
    - Representation of Biological Process 
    - Find Genes directly connected to Biological Process. No intermediate nodes. 
    - Filter. Select only the Biological Process that have less than "x" genes related (**threshold_gene**)
    
2. Find intermediate ChemicalSubstance connecting Genes and diseases of interest (**diseases_id**)
    - Representation of genes selected
    - Representation of Diseases (diseases_id)

In [None]:
def bp_to_cs(bp_output_id, threshold_gene, diseases_id):
    """Trace BiologicalProcess -> Gene -> ChemicalSubstance -> Disease paths.

    For every GO biological process in ``bp_output_id`` the directly
    connected genes are queried; processes whose result table has more than
    ``threshold_gene`` rows are dropped. Each remaining gene is then
    connected to every disease in ``diseases_id`` through one intermediate
    ChemicalSubstance node.

    Parameters
    ----------
    bp_output_id : pandas.Series
        Series whose index holds the biological-process identifiers.
        Only identifiers starting with "GO" are used, after their first
        three characters are stripped.
    threshold_gene : int
        Maximum number of rows a BP -> Gene result table may have for the
        biological process to be kept.
    diseases_id : iterable of str
        Disease identifiers passed to ``ht.query``.

    Returns
    -------
    pandas.DataFrame
        Concatenated Gene -> ChemicalSubstance -> Disease tables, each with
        the leading columns ``BiologicalProcess``, ``pred_BP_Gene`` and
        ``pred_source`` describing the BP -> Gene step. An empty DataFrame
        is returned when no biological process passes the filter or when
        there is no gene/disease pair to query.
    """
    # Clean ID results: keep the GO identifiers only and drop their first
    # three characters; the hint lookup below is done on the shortened ID.
    go_ids = [
        bp_id[3:]
        for bp_id in bp_output_id.index.tolist()
        if bp_id.startswith("GO")
    ]

    # Genes connected to each Biological Process (direct BP -> Gene edges)
    bp_gene_tables = []
    for go_id in go_ids:
        fc = FindConnection(input_obj=ht.query(go_id)["BiologicalProcess"][0],
                            output_obj="Gene",
                            intermediate_nodes=[])
        fc.connect()
        bp_gene_table = fc.display_table_view()

        # Filter BP based on the threshold_gene argument
        if len(bp_gene_table) <= threshold_gene:
            bp_gene_tables.append(bp_gene_table)

    # pandas.concat raises ValueError on an empty list, so bail out early
    if not bp_gene_tables:
        return pandas.DataFrame()

    # Concatenate tables BP-Genes
    df = pandas.concat(bp_gene_tables)

    # Resolve every disease once instead of once per gene row.
    # NOTE(review): assumes FindConnection does not mutate the objects it is
    # given, so one hint object can be shared across queries -- confirm.
    disease_objs = [ht.query(disease)['Disease'][0] for disease in diseases_id]
    if not disease_objs:
        return pandas.DataFrame()

    # Chemical substances linking each retrieved gene to each disease
    # (Gene -> ChemicalSubstance -> Disease)
    final_table = []
    for bp, pred_type, source, gene_id in zip(df["input"], df["pred1"],
                                              df["pred1_pubmed"], df["output_name"]):
        gene_obj = ht.query(gene_id)['Gene'][0]
        for disease_obj in disease_objs:
            cs = FindConnection(input_obj=gene_obj,
                                output_obj=disease_obj,
                                intermediate_nodes="ChemicalSubstance")
            cs.connect()
            table_results_cs = cs.display_table_view()
            # Prepend the BP -> Gene provenance to every result row
            table_results_cs.insert(0, 'BiologicalProcess', bp)  # biological process
            table_results_cs.insert(1, 'pred_BP_Gene', pred_type)  # association type
            table_results_cs.insert(2, 'pred_source', source)  # source of the prediction

            final_table.append(table_results_cs)

    # Same empty-list guard as above (e.g. the BP -> Gene tables had no rows)
    if not final_table:
        return pandas.DataFrame()

    # Concatenate tables Gene-Disease-CS
    return pandas.concat(final_table)

#### Run for loop 

In [None]:
# Diseases of interest, as MONDO identifiers:
#   MONDO:0005027 -> Epilepsy
#   MONDO:0005260 -> Autism
#   MONDO:0001071 -> Intellectual disability
diseases_id = [
    "MONDO:0005027",
    "MONDO:0005260",
    "MONDO:0001071",
]

# Run bp_to_cs, keeping only the Biological Processes whose BP -> Gene
# result table has 10 rows or fewer
test = bp_to_cs(bp_output_id, threshold_gene=10, diseases_id=diseases_id)

In [None]:
# Display the combined table returned by bp_to_cs
test

## Cytoscape 
Documentation https://dash.plotly.com/cytoscape

### Data wrangling for Cytoscape 

In [None]:
# Columns kept from `test`, mapped to the names used from here on.
# The order of this mapping is the column order of df_entities.
entity_columns = {
    "BiologicalProcess": "BiologicalProcess",
    "pred_BP_Gene": "pred_BP_Gene",
    "pred_source": "source_BP_Gene",
    "input": "Gene",
    "pred1": "pred_Gene_CS",
    "pred1_pubmed": "source_Gene_CS",
    "node1_name": "ChemicalSubstance",
    "pred2": "pred_CS_Dis",
    "pred2_pubmed": "source_CS_Dis",
    "output_name": "Disease",
}

# Select the columns of interest, then rename them in one pass
df_entities = test[list(entity_columns)].rename(columns=entity_columns)

df_entities.head(10)

In [None]:
# Convert df_entities into a list of rows (one inner list per row, values in
# column order) for the flattening step in the next cells
my_list_entities = df_entities.values.tolist()

In [None]:
#my_list_entities

In [None]:
# Flatten each 10-column row of my_list_entities
#   (BP, pred, pubmed, Gene, pred, pubmed, ChemicalSubstance, pred, pubmed, Disease)
# into three source -> target edges: BP->Gene, Gene->CS and CS->Disease.

# Lists to store elements of interest
source = []
target = []
association_type = []
number_pred_pubmed = []
pred_pubmed = []

for row in my_list_entities:
    # Each edge starts at an entity column (0, 3, 6) and is followed by its
    # association type, its pubmed IDs and the next entity
    for start in (0, 3, 6):
        source.append(row[start])
        association_type.append(row[start + 1])
        pred_pubmed.append(row[start + 2])
        target.append(row[start + 3])

# Number of pubmed IDs per edge.
# IDs are counted as comma-separated items of a string; anything that is not
# a non-empty string (None, NaN, "") counts as 0 instead of raising.
# NOTE(review): assumes pubmed IDs arrive as one comma-separated string -- confirm.
for result in pred_pubmed:
    if isinstance(result, str) and result.strip():
        number_pred_pubmed.append(result.count(',') + 1)
    else:
        number_pred_pubmed.append(0)

# Create data frame with generated lists
d = {
    'source': source,
    'target': target,
    "association_type": association_type,
    "pred_pubmed": pred_pubmed,
    "number_pred_pubmed": number_pred_pubmed,
}
df = pandas.DataFrame(data=d)

df.head(10)

In [None]:
# Keep only the first 500 connections so the Cytoscape test graph stays small
df = df.iloc[:500]

### Cytoscape: Elements = [ ]
We need to create **elements []** list of dictionaries for Cytoscape. This include the dictionaries for the **"nodes"** and **"edges"**
#### This is an example of the structure required for the Cytoscape elements list:
```
elements = [
#Nodes 
{ "data": {"id": "BP1", "name": "BP1"}, 
"classes": "BiologicalProcess" }, #This variable will later help us to manipulate each class independently 
{ "data": {"id": "Gene1", "name": "Gene1"}, 
"classes": "Gene" },
....

#Edges
{'data': {'source': "BP1", 
'target': "Gene1", 
'label': "related_to", 
"weight": 2}, #Number of pubmed_id 
"group": "edges" }

]
```

##### Cytoscape: Nodes. 
We first are going to create the dictionaries of nodes, which includes each of our entities (BP, Gene, CSub, Disease). We are going to iterate through the elements on source and target column in our new data frame

In [None]:
# Get the unique source entities (value_source) together with the position of
# the first occurrence of each one in the column (indx_source)
import numpy as np
value_source, indx_source = np.unique(df["source"].values, return_index = True)

In [None]:
# Get the unique target entities (value_target) together with the position of
# the first occurrence of each one in the column (indx_target)
value_target, indx_target = np.unique(df["target"].values, return_index = True)

In [None]:
# Cytoscape node format.
# The class of a source entity is taken from the position of its first
# occurrence in df, modulo 3:
#   0 (0,3,6,...) -> BiologicalProcess
#   1 (1,4,7,...) -> Gene
#   2 (2,5,8,...) -> ChemicalSubstance
source_class_by_remainder = {
    0: "BiologicalProcess",
    1: "Gene",
    2: "ChemicalSubstance",
}

my_nodes = [
    {
        "data": {"id": name, "name": name},
        "classes": source_class_by_remainder[position % 3],
    }
    for position, name in zip(indx_source, value_source)
]

# Targets first seen at remainder 2 (2,5,8,...) are the Disease nodes;
# every other target is skipped here
my_nodes.extend(
    {
        "data": {"id": name, "name": name},
        "classes": "Disease",
    }
    for position, name in zip(indx_target, value_target)
    if position % 3 == 2
)

my_nodes[0:5]

##### Cytoscape: Edges
Then we create the dictionaries of edges. Each dictionary includes the source and target elements. We also include the association type between the source and the target, and the number of PubMed IDs supporting this association.

In [None]:
# Build one Cytoscape edge per unique (source, target, association type,
# number of pred_pubmed) combination, keeping the first occurrence.
# A set of already-seen keys replaces the quadratic "not in my_edges[:j]" scan,
# and the loop variables no longer overwrite the `source`/`target` lists.
# NOTE(review): requires hashable values in the four columns -- confirm.

my_edges = []
seen_edges = set()
for src, tgt, asso, pred_num in zip(df["source"], df["target"],
                                    df["association_type"], df["number_pred_pubmed"]):
    edge_key = (src, tgt, asso, pred_num)
    if edge_key in seen_edges:
        continue  # duplicate edge
    seen_edges.add(edge_key)
    my_edges.append({
        'data': {
            'source': src,  # Source
            'target': tgt,  # Target
            'label': [asso],  # Association type
            "weight": [pred_num],  # Number of pred_pubmed ids
        },
        "group": "edges",
    })

my_edges[0:5]

##### Concatenate my_nodes and  my_edges lists
Finally, we concatenate both list into one. 

In [None]:
# Cytoscape elements list: all nodes first, then all edges
my_list_elements = [*my_nodes, *my_edges]

##### Save into file and run it to PyCharm
We save the list into a JSON file and export it into PyCharm.

In [None]:
# Save the elements list to a file using json
import json
from pathlib import Path

output_path = Path("../results/vamp2_500_connections.txt")
# Create the results directory when missing; opening the file would otherwise
# fail with FileNotFoundError
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as fp:
    json.dump(my_list_elements, fp)

## Conclusions and caveats

This notebook demonstrated the use of BioThings Explorer to retrieve potential new Chemical Substances to treat VAMP2 deficiency. The list of chemical substances retrieved in this notebook includes compounds already proposed in this article https://onlinelibrary.wiley.com/doi/abs/10.1002/humu.24109?af=R. <br>
The list of chemical substances is still long; the next step is to filter the results based on some restrictions (e.g. FDA-approved treatments).