### Script Overview
This script creates a toy dataset from the INDRA COVID-19 model, hosted on emmaa.indra.bio.

EMMAA assembles this graph on a daily basis via a cron job that pulls in literature, performs named-entity recognition (NER), and trains a new ML model.
It incorporates daily updates from CORD-19, also searches the Internet, and runs about six text-mining systems on the results.

The script converts the graph to BEL format via the pybel library.
The pybel library can then be used to further process the graph and generate the toy dataset outputs.

In [None]:
import sys
import os

In [None]:
#sys.path.insert(0,"/home/USERNAME/pybel/src/") # If you are using a local version of the file

# Put the local triples directory on the module search path
# (attempt to read in pybel.io.triples.api), then show the resulting path.
# NOTE(review): sys.path entries are searched for top-level modules, so this
# directory exposes `api` as a top-level module rather than as
# `pybel.io.triples.api` - confirm this is what is wanted.
triples_api_dir = "/home/USERNAME/pybel/src/pybel/io/triples/"
sys.path.append(triples_api_dir)
print(sys.path)

In [None]:
from pybel.io.triples import api

In [None]:
# not working - IGNORE
import imp
imp.find_module("pybel")
import pybel
#pybel.__path__
triples_api = imp.load_source('api', "/home/lani_lichtenstein/pybel/src/pybel/io/triples/api.py")


In [None]:
#!python3 -m pip install indra --no-cache-dir --user

In [None]:
import getpass
import os
import sys
import time

import matplotlib.pyplot as plt
import pandas as pd
import pykeen
import torch
from pykeen.pipeline import pipeline
import pybel
import pybel_tools
import indra


%matplotlib inline

In [None]:
# Python interpreter version used for this run
print(sys.version)

In [None]:
# Current local time when the cell is executed
print(time.asctime())

In [None]:
# Login name of the user running the notebook
print(getpass.getuser())

In [None]:
# Installed PyKEEN version, requested with its git hash
print(pykeen.get_version(with_git_hash=True))

In [None]:
# Installed PyBEL version, requested with its git hash
print(pybel.get_version(with_git_hash=True))

In [None]:
# Load the EMMAA 'covid19' model (INDRA) as a PyBEL graph, pinned to one date.
# Model dashboard: https://emmaa.indra.bio/dashboard/covid19?tab=model

emmaa_model_name = 'covid19'
emmaa_model_date = "2020-04-23-17-44-57"
pybel_covid_graph = pybel.from_emmaa(emmaa_model_name, date=emmaa_model_date)

In [None]:
# Summarise the converted graph as a quick sanity check of what was loaded.
pybel_covid_graph.summarize() # summarise 

#### Approach A - Generate Triples

One approach to generating a toy dataset is to generate triples. 
Triples can be used to generate knowledge graph embeddings. 
They also contain grounded source and target identifiers, as well as detailed relation descriptions. 

This information is not obtained using Approach B - Generate Toy Dataset with Raw Text and Evidence

In [None]:
import pybel.io.tsv.api

# Extract the triples from the graph.
# presumably one (source, relation, target) entry per edge - TODO confirm against pybel
triples=pybel.io.tsv.api.get_triples(pybel_covid_graph)

In [None]:
import numpy as np
# Convert the triples into a NumPy array before tabulating them
triples = np.array(triples)

In [None]:
# One DataFrame row per triple; columns get default integer labels
triples_df=pd.DataFrame(triples)

In [None]:
# Write the triples as tab-separated text, without index column or header row
triples_df.to_csv("indra_covid_toy_dataset_triples.csv",index=False,sep="\t",header=False)

#### Approach B - Generate Toy Dataset with Raw Text and Evidence

In [None]:
# Shorter alias for the COVID graph, used by the cells that follow.
pybel_graph = pybel_covid_graph

# EXPLORE
# Build the list of relation types from the keys of the relation counts.
relations_pybel = pybel.struct.summary.count_relations(pybel_graph)
relations = list(relations_pybel.keys())

print("There are " + str(len(relations)) + " relation types.")
relations  # not as detailed as the relation descriptions in the triples

In [None]:
import logging
from pybel.dsl import BaseConcept
from tqdm import tqdm
#from pybel.io.triples import api

# Build one table row per graph edge: source name, target name, relation,
# evidence text and citation. Any field an edge does not provide is stored
# as the placeholder string 'NaN'.
column_list = ["Source", "Target", "Relation", "Evidence", "Citation"]

# Placeholder written when a node or edge lacks a given field.
MISSING = 'NaN'

# Rows are collected in a plain list and the frame is built once after the
# loop: DataFrame.append copied the whole frame on every call (quadratic in
# the number of edges) and was removed in pandas 2.0.
rows = []

for u, v, data in tqdm(pybel_graph.edges(data=True)):

    # See also to_triple(u, v, data):
    # https://github.com/pybel/pybel/blob/master/src/pybel/io/triples/api.py

    # The name is read only from BaseConcept nodes; any other node type
    # keeps the placeholder.
    source = u.name if isinstance(u, BaseConcept) else MISSING
    target = v.name if isinstance(v, BaseConcept) else MISSING

    # Every field is looked up afresh for each edge, so an edge without a
    # citation gets the placeholder instead of raising NameError (first edge)
    # or silently reusing the previous edge's citation.
    evidence = data.get('evidence', MISSING)  # look also at pybel.has_edge_evidence()
    relation = data.get('relation', MISSING)
    citation = data.get('citation', MISSING)

    # Read from the 'annotations' key (previously read from 'relation' by
    # mistake). Not part of column_list, so it is not written to indra_df;
    # kept for interactive inspection only.
    annotations = data.get('annotations', MISSING)

    rows.append([source, target, relation, evidence, citation])

indra_df = pd.DataFrame(rows, columns=column_list)


In [None]:
# explore
# A notebook cell only displays its final expression by default, so the
# shape is printed explicitly; as a bare expression it was evaluated and
# silently discarded.
print(indra_df.shape)
indra_df.head()

In [None]:
# Write the edge table as tab-separated text, without index column or header row
# (the column names in column_list are therefore not part of the file).
indra_df.to_csv("indra_covid_toy_dataset_raw_evidence.csv",index=False,sep="\t",header=False)

In [None]:
# filter for high belief score.. 
# ask Ben or John Bachman, Ben Gyori
# filter statements with ontology- e.g. chebi as this is a toy graph


In [None]:
# to add triples - read in api.py module and use to_triple
#https://github.com/pybel/pybel/blob/master/src/pybel/io/triples/api.py
