# Importing Python Libraries

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statistics import mean,mode,median 
import csv
from tqdm import tqdm
import time
%matplotlib inline


Bad key "text.kerning_factor" on line 4 in
D:\Downloads\Anaconda3\envs\sgraph\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


# Loading Data

In [2]:
# Read the tab-separated interaction table into a DataFrame and preview its
# first rows (presumably a SIGNOR export, judging by the SIGNOR_ID column).
dataset = pd.read_csv("all_data_19_07_19.tsv", sep="\t")
dataset.head()

Unnamed: 0,ENTITYA,TYPEA,IDA,DATABASEA,ENTITYB,TYPEB,IDB,DATABASEB,EFFECT,MECHANISM,...,MODIFICATIONA,MODASEQ,MODIFICATIONB,MODBSEQ,PMID,DIRECT,NOTES,ANNOTATOR,SENTENCE,SIGNOR_ID
0,cabazitaxel,chemical,CHEBI:63584,ChEBI,TUBA4A,protein,P68366,UNIPROT,down-regulates activity,binding,...,,,,,21770474,YES,,miannu,"Among these, larotaxel (XRP9881, formerly RPR1...",SIGNOR-259340
1,cabazitaxel,chemical,CHEBI:63584,ChEBI,TUBB1,protein,Q9H4B7,UNIPROT,down-regulates activity,binding,...,,,,,21770474,YES,,miannu,"Among these, larotaxel (XRP9881, formerly RPR1...",SIGNOR-259341
2,docetaxel anhydrous,chemical,CHEBI:4672,ChEBI,TUBA4A,protein,P68366,UNIPROT,down-regulates activity,binding,...,,,,,23337758,YES,,miannu,Tubulin exists in the cell as dimers of α and ...,SIGNOR-259342
3,docetaxel anhydrous,chemical,CHEBI:4672,ChEBI,TUBB1,protein,Q9H4B7,UNIPROT,down-regulates activity,binding,...,,,,,23337758,YES,,miannu,Tubulin exists in the cell as dimers of α and ...,SIGNOR-259343
4,eribulin mesylate,chemical,CHEBI:70710,ChEBI,TUBA4A,protein,P68366,UNIPROT,down-regulates activity,binding,...,,,,,16940412,YES,,miannu,The complex marine natural product halichondri...,SIGNOR-259344


# Data Wrangling and Cleaning

## Trim 1: Extract Relevant Columns

In [3]:
# Keep only the columns naming the two entities, their types, and the
# mechanism / effect / directness of the interaction between them.
df = dataset[[
    'ENTITYA',
    'TYPEA',
    'ENTITYB',
    'TYPEB',
    'MECHANISM',
    'EFFECT',
    'DIRECT',
]]
df.head()

Unnamed: 0,ENTITYA,TYPEA,ENTITYB,TYPEB,MECHANISM,EFFECT,DIRECT
0,cabazitaxel,chemical,TUBA4A,protein,binding,down-regulates activity,YES
1,cabazitaxel,chemical,TUBB1,protein,binding,down-regulates activity,YES
2,docetaxel anhydrous,chemical,TUBA4A,protein,binding,down-regulates activity,YES
3,docetaxel anhydrous,chemical,TUBB1,protein,binding,down-regulates activity,YES
4,eribulin mesylate,chemical,TUBA4A,protein,binding,down-regulates activity,YES


In [4]:
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appended a stray "None" line to the output.
df.info()
print('-------------------------------')
print(df['TYPEA'].value_counts())
print('-------------------------------')
print(df['TYPEB'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22787 entries, 0 to 22786
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ENTITYA    22787 non-null  object
 1   TYPEA      22787 non-null  object
 2   ENTITYB    22787 non-null  object
 3   TYPEB      22787 non-null  object
 4   MECHANISM  21163 non-null  object
 5   EFFECT     22787 non-null  object
 6   DIRECT     22787 non-null  object
dtypes: object(7)
memory usage: 1.2+ MB
None
-------------------------------
protein           18463
chemical           2150
complex             907
proteinfamily       655
smallmolecule       378
stimulus             99
fusion protein       82
mirna                32
phenotype            21
Name: TYPEA, dtype: int64
-------------------------------
protein           20412
complex            1155
phenotype           624
proteinfamily       442
smallmolecule       111
mirna                20
fusion protein       16
chemical              

We further trim our dataset by limiting our entities to either proteins, complexes, or protein families.

In [5]:
# Keep an interaction only when BOTH partners are one of the relevant entity types.
relevant_types = ['protein','complex','proteinfamily']
df = df[df['TYPEA'].isin(relevant_types) & df['TYPEB'].isin(relevant_types)]

In [6]:
# DataFrame.info() prints its own report and returns None; calling it without
# print() avoids the spurious "None" line in the cell output.
df.info()
print('-------------------------------')
print(df['TYPEA'].value_counts())
print('-------------------------------')
print(df['TYPEB'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19289 entries, 10 to 22786
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ENTITYA    19289 non-null  object
 1   TYPEA      19289 non-null  object
 2   ENTITYB    19289 non-null  object
 3   TYPEB      19289 non-null  object
 4   MECHANISM  18461 non-null  object
 5   EFFECT     19289 non-null  object
 6   DIRECT     19289 non-null  object
dtypes: object(7)
memory usage: 1.2+ MB
None
-------------------------------
protein          17836
complex            828
proteinfamily      625
Name: TYPEA, dtype: int64
-------------------------------
protein          17805
complex           1074
proteinfamily      410
Name: TYPEB, dtype: int64


We further trim our dataset by limiting our connections to direct connections

In [7]:
# Keep only the interactions flagged as direct.
df = df[df['DIRECT'] == 'YES']
# DataFrame.info() prints its own report and returns None; calling it without
# print() avoids the spurious "None" line in the cell output.
df.info()
print('-------------------------------')
print(df['TYPEA'].value_counts())
print('-------------------------------')
print(df['TYPEB'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16260 entries, 10 to 22786
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ENTITYA    16260 non-null  object
 1   TYPEA      16260 non-null  object
 2   ENTITYB    16260 non-null  object
 3   TYPEB      16260 non-null  object
 4   MECHANISM  16151 non-null  object
 5   EFFECT     16260 non-null  object
 6   DIRECT     16260 non-null  object
dtypes: object(7)
memory usage: 1016.2+ KB
None
-------------------------------
protein          15072
complex            623
proteinfamily      565
Name: TYPEA, dtype: int64
-------------------------------
protein          14875
complex           1032
proteinfamily      353
Name: TYPEB, dtype: int64


Lastly, we remove vagueness from EFFECT by removing all interactions with an unknown effect.

In [8]:
# Drop interactions whose effect is recorded as 'unknown'.
df = df[df['EFFECT'] != 'unknown']
# DataFrame.info() prints its own report and returns None; calling it without
# print() avoids the spurious "None" line in the cell output.
df.info()
print('-------------------------------')
print(df['EFFECT'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15378 entries, 11 to 22786
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ENTITYA    15378 non-null  object
 1   TYPEA      15378 non-null  object
 2   ENTITYB    15378 non-null  object
 3   TYPEB      15378 non-null  object
 4   MECHANISM  15271 non-null  object
 5   EFFECT     15378 non-null  object
 6   DIRECT     15378 non-null  object
dtypes: object(7)
memory usage: 961.1+ KB
None
-------------------------------
up-regulates                                  5070
up-regulates activity                         4047
down-regulates                                2820
down-regulates activity                       2036
form complex                                   572
down-regulates quantity by destabilization     293
up-regulates quantity by expression            223
up-regulates quantity by stabilization         135
down-regulates quantity by repression           68
up

## Simplifying Column Values

For the purpose of this study, we will be simplifying our effects column by generalizing the different types to either up-regulating, down-regulating, or complex-forming.

In [9]:
# Collapse the detailed EFFECT labels into 'up', 'down' or 'complex'.
# The column is rebuilt with assign() rather than replaced in place: `df` is a
# filtered selection of an earlier frame, so an in-place replace on df['EFFECT']
# is chained assignment (SettingWithCopyWarning, and a silent no-op once pandas
# Copy-on-Write is active).
df = df.assign(
    EFFECT=df['EFFECT'].replace(
        regex={r'(up-regulates).*$': 'up', r'(down-regulates).*$': 'down','form complex':'complex'}
    )
)
df['EFFECT'].value_counts()

up         9541
down       5265
complex     572
Name: EFFECT, dtype: int64

# Loading Graph via NetworkX

In [10]:
# Build the interaction network: one node per entity, one edge per
# ENTITYA/ENTITYB pair, carrying the MECHANISM and EFFECT columns as attributes.
# NOTE(review): nx.Graph is undirected and keeps a single edge per node pair, so
# direction and repeated pairs in the table are presumably collapsed - confirm
# this is intended.
df_graph = df[['ENTITYA', 'ENTITYB', 'MECHANISM', 'EFFECT']]
Graph = nx.from_pandas_edgelist(
    df_graph,
    source='ENTITYA',
    target='ENTITYB',
    edge_attr=['MECHANISM', 'EFFECT'],
    create_using=nx.Graph(),
)
# Edge-attribute lookup tables keyed by edge tuple, used by the trimming code.
G_mech = nx.get_edge_attributes(Graph, 'MECHANISM')
G_effect = nx.get_edge_attributes(Graph, 'EFFECT')

## Network Trimming with respect to effect

We trim and retain edges within a network with respect to the following condition statements:

* The **succeeding/preceding node** must have the **same type of regulation** as the node of interest, else trim it off **unless** \
* The connection is a **binding mechanism** \
* The **succeeding/preceding node** is a **complex** \
* **Parameters are easily configured**

To initialize our trimming function, we build an initial sapling that contains only the seed node and the edges directly attached to it; the trimmed network is then grown outward from this sapling.

In [17]:
def build_sappling(seed):
    """Create the starting tree for ``seed`` together with its state table.

    The tree is a new undirected graph holding every edge that the notebook-level
    ``Graph`` reports for ``seed``. The state table maps a node name to a list of
    labels: it starts as ``{seed: ['root']}`` and, for each of those edges, the
    edge's EFFECT (looked up in the notebook-level ``G_effect``) is stored under
    the first node of whichever orientation of the edge is a key in ``G_effect``.

    NOTE(review): networkx reports the edges of ``seed`` as ``(seed, neighbour)``,
    so whenever ``G_effect`` holds the edge in that orientation the entry written
    is the seed's own, presumably replacing ``['root']`` - confirm whether the
    neighbour was the intended key.

    Parameters
    ----------
    seed : node of ``Graph`` to start from.

    Returns
    -------
    (G, states) : the starting tree and the node -> list-of-labels dictionary.
    """
    seed_edges = set(Graph.edges(seed))

    G = nx.Graph()
    G.add_edges_from(Graph.edges(seed))

    states = {seed: ['root']}
    for first, second in seed_edges:
        if (first, second) in G_effect:
            # Effect is keyed by the edge exactly as reported.
            states[first] = [G_effect[(first, second)]]
        else:
            # Effect is keyed by the opposite orientation of the edge.
            states[second] = [G_effect[(second, first)]]

    return G, states

In [18]:
# Start the trimmed network from FOXM1 and show the edges it begins with.
sappling, states = build_sappling('FOXM1')
sappling.edges

EdgeView([('FOXM1', 'CDK2'), ('FOXM1', 'CHEK2'), ('FOXM1', 'CDK1'), ('FOXM1', 'CDK4'), ('FOXM1', 'CyclinB/CDK1'), ('FOXM1', 'PLK1')])

In [19]:
def remove_copies(branches, tree=None):
    """Return the branches that are not already edges of the tree.

    An edge counts as already present when the tree holds it in either
    orientation, i.e. ``(u, v)`` or ``(v, u)``. The previous test,
    ``(branch1 or branch2) not in ...``, always reduced to ``branch1`` alone
    (a non-empty tuple is truthy), so reversed duplicates were never filtered.

    Parameters
    ----------
    branches : iterable of 2-tuples ``(u, v)`` to filter.
    tree : object exposing an iterable ``edges`` attribute of 2-tuples
        (for example a networkx graph). Defaults to the notebook-level
        ``sappling`` so existing one-argument calls keep working.

    Returns
    -------
    list of the branches, in input order, that are absent from the tree.
    """
    if tree is None:
        tree = sappling
    # Build the lookup set once instead of once per branch.
    existing = set(tree.edges)
    return [
        branch
        for branch in branches
        if branch not in existing and (branch[1], branch[0]) not in existing
    ]


def grow_tree(root,tree_branches,states):
    """Extend ``root`` by one level, starting from the given branches.

    For every distinct branch, the branch's second node (the "bud") is expanded
    if it has an entry in ``states``. Each edge of the bud in the notebook-level
    ``Graph`` that survives ``remove_copies`` is looked up in ``G_effect`` and
    ``G_mech`` (trying the edge as given, then reversed) and is added to ``root``
    when any of the following holds:

    * its effect is one of the bud's recorded states,
    * ``'complex'`` is one of the bud's recorded states,
    * ``'complex'`` occurs in the edge's effect,
    * its mechanism equals ``'binding'``.

    When an edge is added, its effect is recorded in ``states`` under the second
    node of whichever orientation matched the lookup tables.

    Parameters
    ----------
    root : graph that is grown in place.
    tree_branches : iterable of edge tuples to expand from.
    states : dict mapping node -> list of effect labels; updated in place.

    Returns
    -------
    (edge_diff, new_edges) : the number of edges added to ``root`` and the set
    of those edges. The count is also printed.
    """
    edges_before = set(root.edges())
    # presumably gives earlier output time to flush before the progress bar is
    # drawn - TODO confirm
    time.sleep(1.0)

    for branch in tqdm(set(tree_branches)):
        bud = branch[1]
        try:
            bud_state = states[bud]
        except KeyError:
            # Nodes without a recorded state are not expanded.
            continue

        for pb in remove_copies(set(Graph.edges(bud))):
            try:
                state, mech, node = G_effect[pb], G_mech[pb], pb[1]
            except KeyError:
                # The lookup tables hold this edge in the opposite orientation.
                flipped = (pb[1], pb[0])
                state, mech, node = G_effect[flipped], G_mech[flipped], pb[0]

            keep = (
                state in bud_state
                or 'complex' in bud_state
                or 'complex' in state
                or mech == 'binding'
            )
            if not keep:
                continue

            root.add_edge(pb[0], pb[1])
            if node in states.keys():
                # Merge the new label into the labels already recorded.
                states[node] = list(set(states[node]).union(set([state])))
            else:
                states[node] = [state]

    edges_after = set(root.edges())
    new_edges = edges_after.difference(edges_before)
    edge_diff = len(edges_after) - len(edges_before)
    time.sleep(1.0)
    print('# of new edges:',edge_diff)
    return edge_diff, new_edges
                           
#grow_tree(sappling,sappling.edges(),states)
#states

In [20]:
# Grow the tree level by level until a pass adds no new edges. The first pass
# expands the sapling's own edges; each later pass expands only the edges the
# previous pass added.
lvl = 0
edge_diff = 1
new_edges = sappling.edges()
while edge_diff != 0:
    print('-------------')
    print(f'@ Level {lvl}:')
    edge_diff, new_edges = grow_tree(sappling, new_edges, states)
    lvl += 1

-------------
@ Level 0:


100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 193.51it/s]


# of new edges: 205
-------------
@ Level 1:


100%|███████████████████████████████████████████████████████████████████████████████| 205/205 [00:01<00:00, 110.97it/s]


# of new edges: 1186
-------------
@ Level 2:


100%|██████████████████████████████████████████████████████████████████████████████| 1186/1186 [00:46<00:00, 25.44it/s]


# of new edges: 2751
-------------
@ Level 3:


100%|██████████████████████████████████████████████████████████████████████████████| 2751/2751 [02:50<00:00, 16.14it/s]


# of new edges: 2448
-------------
@ Level 4:


100%|██████████████████████████████████████████████████████████████████████████████| 2448/2448 [02:19<00:00, 17.56it/s]


# of new edges: 777
-------------
@ Level 5:


100%|████████████████████████████████████████████████████████████████████████████████| 777/777 [00:35<00:00, 22.06it/s]


# of new edges: 155
-------------
@ Level 6:


100%|████████████████████████████████████████████████████████████████████████████████| 155/155 [00:03<00:00, 39.26it/s]


# of new edges: 14
-------------
@ Level 7:


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 93.31it/s]


# of new edges: 0


In [None]:
# Report node and edge counts of the full network next to the trimmed one.
print(
    'Original Network Logistics:',
    '----------------------------------------',
    f'Total Nodes:{Graph.number_of_nodes()}',
    f'Total Edges:{Graph.number_of_edges()}',
    '----------------------------------------',
    'Trimmed Network Logistics:',
    '----------------------------------------',
    f'Total Nodes:{sappling.number_of_nodes()}',
    f'Total Edges:{sappling.number_of_edges()}',
    '----------------------------------------',
    sep='\n',
)