In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy.integrate import odeint
import networkx as nx
from pybel.examples import sialic_acid_graph as sag
import pybel as pb
import json
import time
import csv
from pybel.io.jupyter import to_jupyter
import torch
import pyro
import pandas as pd
import numpy as np
pyro.set_rng_seed(101)

In [2]:
# Show frames in full (no row/column/width truncation) so long BEL statements stay readable.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "no limit"; the legacy value -1 is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)

In [3]:
import covid19kg

In [4]:
# Fetch the graph through covid19kg.get_graph() and print its summary statistics.
import covid19kg
graph = covid19kg.get_graph()
graph.summarize()

Covid19KG v0.0.1-dev
Number of Nodes: 3954
Number of Edges: 9484
Number of Citations: 185
Number of Authors: 950
Network Density: 6.07E-04
Number of Components: 29


In [5]:
# Exploration only: list the attribute names of the covid19kg module.
dir(covid19kg)

['AUTHORS',
 'BELMetadata',
 'BELRepository',
 'HERE',
 'VERSION',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'get_graph',
 'get_graphs',
 'get_summary_df',
 'main',
 'metadata',
 'os',
 'repository',
 'serialize_authors']

In [6]:
# Alternative input file: 'Epithelial Innate Immune Activation-2.0-Hs.jgf'
# The context manager closes the handle as soon as the JSON document has been parsed.
with open('COVID19.jgf', encoding='utf-8') as f:
    bel_statements = json.load(f)

In [7]:
def _bel_function(term):
    """Return the text of ``term`` that precedes its first '(' (the BEL function name).

    Returns '' when the term starts with '(' (a nested statement) or contains no
    '(' at all, instead of silently chopping the last character off the term.
    """
    head, paren, _ = term.partition('(')
    return head if paren else ''


def get_information(jgf_file):
    """Collect the causal edges of a JGF document into a flat DataFrame.

    Parameters
    ----------
    jgf_file : dict
        Parsed JGF document; edges are read from ``jgf_file['graph']['edges']``
        and each edge must provide 'source', 'target' and 'relation'.

    Returns
    -------
    pandas.DataFrame
        One row per edge whose relation contains 'crease' (increases, decreases,
        directlyIncreases, directlyDecreases), with the columns
        'parents', 'children', 'types', 'statements', 'parent_types' and
        'children_types'. 'statements' is 'parent*relation*child'; the two
        ``*_types`` columns hold the text before the first '(' of each term.
        The frame is empty (but has all columns) when no edge qualifies.
    """
    columns = ['parents', 'children', 'types', 'statements',
               'parent_types', 'children_types']
    records = []

    for edge in jgf_file['graph']['edges']:
        relation = edge['relation']
        # Same filter as before: 'crease' must occur after the first character.
        if relation.find('crease') <= 0:
            continue
        parent = edge['source']
        child = edge['target']
        records.append({
            'parents': parent,
            'children': child,
            'types': relation,
            'statements': parent + '*' + relation + '*' + child,
            'parent_types': _bel_function(parent),
            'children_types': _bel_function(child),
        })

    # Build the frame once; passing `columns` keeps the layout stable even with no rows.
    return pd.DataFrame(records, columns=columns)

In [8]:
# Edge table extracted from the JGF document loaded above.
df = get_information(bel_statements)

In [9]:
# Preview the first rows of the edge table.
df.head()

Unnamed: 0,parents,children,types,statements,parent_types,children_types
0,path(MESH:COVID-19),path(MESH:Hypoalbuminemia),increases,path(MESH:COVID-19)*increases*path(MESH:Hypoalbuminemia),path,path
1,p(HGNC:JAK3),path(MESH:Lymphopenia),decreases,p(HGNC:JAK3)*decreases*path(MESH:Lymphopenia),p,path
2,"path(SDIS:""Liver Injury"")",p(HGNC:CRP),increases,"path(SDIS:""Liver Injury"")*increases*p(HGNC:CRP)",path,p
3,"bp(GO:""inflammatory response"")",p(HGNC:CRP),increases,"bp(GO:""inflammatory response"")*increases*p(HGNC:CRP)",bp,p
4,a(SCHEM:fibrate),p(HGNC:CRP),decreases,a(SCHEM:fibrate)*decreases*p(HGNC:CRP),a,p


In [25]:
# Distinct values found in the parent_types column.
df['parent_types'].unique()

array(['path', 'p', 'bp', 'a', 'act'], dtype=object)

In [26]:
# Distinct values found in the children_types column.
df['children_types'].unique()

array(['path', 'p', 'sec', 'act', 'tloc', '', 'bp', 'r', 'pop'],
      dtype=object)

In [10]:
# Sanity check: print the type obtained when one column is taken from a filtered frame.
hypoxia_elf2_statement = '(bp(GO:"response to hypoxia") increases r(HGNC:ELF2))'
matching_rows = df[df['children'] == hypoxia_elf2_statement]
print(type(matching_rows['children']))

<class 'pandas.core.series.Series'>


In [16]:
## identifying clusters
# One row per (child, relation, child type); the parents of that child and their
# types are gathered into two parallel lists. Named aggregation replaces the slower
# groupby.apply(...).apply(pd.Series) and yields properly named columns directly.
new_df = df.groupby(['children', 'types', 'children_types']).agg(
    parents=('parents', list),
    parents_types=('parent_types', list),
)


In [22]:
# Move the three group keys out of the index into ordinary columns,
# then give all five columns explicit names.
new_df = new_df.reset_index()
new_df.columns = [
    'children',
    'types',
    'children_types',
    'parents',
    'parents_types',
]

In [23]:
# Display the clustered table (one row per child / relation / child type).
new_df

Unnamed: 0,children,types,children_types,parents,parents_types
0,"(bp(GO:""response to hypoxia"") increases r(HGNC:ELF2))",decreases,,[p(HGNC:ANGPT2)],[p]
1,"(bp(GO:""response to hypoxia"") increases r(HGNC:TEK))",decreases,,[p(HGNC:ANGPT2)],[p]
2,"act(complex(SCOMP:""Nfkb Complex""), ma(tscript))",increases,act,[p(HGNC:CRP)],[p]
3,"act(p(HGNC:BAD), ma(cat))",decreases,act,[p(HGNC:IL7)],[p]
4,"act(p(HGNC:TEK), ma(kin))",directlyDecreases,act,[p(HGNC:ANGPT2)],[p]
5,"act(p(HGNC:TEK), ma(kin))",directlyIncreases,act,[p(HGNC:ANGPT2)],[p]
6,"act(p(HGNC:TIE1), ma(kin))",directlyDecreases,act,[p(HGNC:ANGPT2)],[p]
7,"bp(GO:""B cell activation"")",increases,bp,[p(HGNC:IL7)],[p]
8,"bp(GO:""B cell differentiation"")",increases,bp,[p(HGNC:IL7)],[p]
9,"bp(GO:""B cell proliferation"")",increases,bp,[p(HGNC:IL7)],[p]


## Jeremy's guide for dealing with node types

Let's talk about Abundance -> Process first. If you have multiple Abundance parents, then the function could simply be a weighted sum of abundances followed by an activation function to determine whether the process is on or off. In the simple case where you have two abundance parents, the Parent-Child function resembles an OR gate if the weights are large, because either abundance (or both) is sufficient to activate the process. If the weights are small, then the Parent-Child function resembles an AND gate, because you need both parents in high abundance.


7:20
Let's talk about Process -> Transformation next. In this case, if the process is active, then the transformation is a continuous variable. If the process is inactive, then the transformation cannot occur, so it is zero.


7:21
Let's talk about (Abundance*, Process) -> Transformation.  In this case, the transformation is a potentially nonlinear function of abundance if the Process is active, but zero otherwise.


7:23
If Transformation -> Abundance, then we have continuous in and continuous out, but the mapping may be nonlinear.


7:24
Process -> Abundance implies that the abundance is continuous if the Process is active and 0 otherwise.


7:25
All of these rules are flipped if the causal relation is `decreases` instead of `increases`.


7:26
If Abundance -> Transformation we also have (potentially nonlinear) continuous to continuous.


In [None]:
def get_distribution(node_type):
    """Return the prior distribution used for a node of the given BEL type.

    Parameters
    ----------
    node_type : str
        BEL function / modifier abbreviation, e.g. 'p', 'bp', 'act', 'tloc'.

    Returns
    -------
    A ``dist.LogNormal(0, 1)`` for continuous (positive-valued) node types, or a
    two-outcome ``dist.Categorical`` with equal probabilities for binary ones.

    Raises
    ------
    ValueError
        If ``node_type`` is not one of the listed types (previously this fell
        through and returned None).
    """
    continuous_types = (
        # abundances: need a positive continuous distribution, hence lognormal
        'a', 'r', 'm', 'g', 'p', 'composite', 'complex', 'pop',
        # activities are continuous
        'act', 'molecularActivity', 'chap', 'pep', 'ribo',
        # reactions should be continuous, so starting with lognormal
        'reaction', 'degradation',
        # transport category
        'tloc', 'sec', 'surf', 'tscript', 'tport',
    )
    binary_types = (
        # processes are on/off; path is a pathology process.
        # 'p' is intentionally absent here: it was already handled as an abundance.
        'bp', 'path',
        # these activity modifiers are binary
        'gtp', 'cat', 'kin', 'phos',
    )

    if node_type in continuous_types:
        return dist.LogNormal(torch.tensor([0.0]), torch.tensor([1.0]))
    if node_type in binary_types:
        # Categorical accepts either probs or logits, never both; one probability
        # vector with two equal entries encodes the on/off prior.
        return dist.Categorical(probs=torch.tensor([0.5, 0.5]))
    raise ValueError('unknown node type: {!r}'.format(node_type))

In [14]:
import pyro.distributions as dist

def scm(graph):
    """Skeleton of the structural causal model over the clustered edge table.

    Parameters
    ----------
    graph : table with 'children' and 'parents_types' columns
        Each 'parents_types' entry is expected to be a list with one type per
        parent of the corresponding child.

    Returns
    -------
    None. The per-parent assignment is not implemented yet; the function only
    walks the table so that the cell is valid, runnable Python.
    """
    for child, parent_types in zip(graph['children'], graph['parents_types']):
        for parent_type in parent_types:
            # TODO: assign(child, ...) -- attach a distribution for this parent/child pair.
            pass
        
        
#     cloudy = pyro.sample('cloudy', dist.Bernoulli(0.3))
#     cloudy = 'cloudy' if cloudy.item() == 1.0 else 'sunny'
#     mean_temp = {'cloudy': 55.0, 'sunny': 75.0}[cloudy]
#     scale_temp = {'cloudy': 10.0, 'sunny': 15.0}[cloudy]
#     temp = pyro.sample('temp', dist.Normal(mean_temp, scale_temp))
#     return cloudy, temp.item()

## To do:
1. The `types` column holds the causal relationship (increases / decreases / directlyIncreases / directlyDecreases)
2. we need to group parent and children type according to CBN legend
3. Start building SCM using the prior distribution guide Jeremy gave us
4. Train a linear model to confirm if the threshold / distribution works on the sample generated from SCM
5. Automate this process and test it on different BEL models we have

## Future scope
1. Counterfactuals and interventions