In [1]:
import os
import re

import networkx as nx
import pandas as pd

# Relationships between child and parent PSI-MI terms (2021-11-12)
### Each child MI term and its direct (nearest) parent MI terms

In [2]:
# mi_relation maps each MI term id to the list of MI ids named on its
# "is_a:" lines; child collects every term id in the order it is read.
mi_relation = {}
child = []
parent_pattern = re.compile('MI:[0-9]+')
with open(r'mi.owl') as handle:
    for line in handle:
        if line.startswith('id: MI'):
            # "id: MI:0004" -> the second space-separated field is the id.
            child.append(line.strip().split(" ")[1])
            continue
        if not line.startswith('is_a: MI'):
            continue
        # An is_a line belongs to the most recently read term id.
        parent_id = parent_pattern.search(line.strip()).group()
        mi_relation.setdefault(child[-1], []).append(parent_id)

In [3]:
# Spot check: the is_a parents recorded for MI:0071.
mi_relation['MI:0071']

['MI:0013', 'MI:0091']

### MI terms that should be excluded

In [4]:
# Terms dropped when building the child -> parent edge table:
# molecular interaction, interaction detection method, and experimental
# interaction detection.
mi_remove = ['MI:0000', 'MI:0001', 'MI:0045']

# Tip terms that find_ancestor must not report: inference, interaction
# prediction, unspecified method, etc.
# NOTE: every element needs its own trailing comma. A missing comma makes
# Python silently concatenate two adjacent string literals into one bogus id
# (e.g. 'MI:0686MI:0300'), so neither term would be excluded.
mi_remove2 = [
    'MI:0362', 'MI:0063', 'MI:0686',
    'MI:0300', 'MI:0590', 'MI:0500', 'MI:2233', 'MI:1149', 'MI:0353',
    'MI:1045', 'MI:0954', 'MI:0444', 'MI:0346', 'MI:0495', 'MI:0003',
    'MI:0333', 'MI:0116', 'MI:1064', 'MI:0190', 'MI:0313', 'MI:0640',
    'MI:0647', 'MI:0002',
]

# Stricter exclusion list: everything in mi_remove plus further terms.
# Derived from mi_remove so the two lists cannot drift apart.
mi_remove3 = mi_remove + [
    'MI:0401', 'MI:0013', 'MI:0254', 'MI:0428', 'MI:1088', 'MI:0255', 'MI:0090',
]

In [5]:
# Keep every (child, parent) pair whose two ends are both outside mi_remove.
kept_pairs = [
    (term, parent_term)
    for term, direct_parents in mi_relation.items()
    for parent_term in direct_parents
    if term not in mi_remove and parent_term not in mi_remove
]
children = [term for term, _ in kept_pairs]
grandparents = [parent_term for _, parent_term in kept_pairs]

# One row per retained edge; the parent column is named 'grandparent' here.
mi_group = pd.DataFrame({'child': children, 'grandparent': grandparents})
mi_group.head()

Unnamed: 0,child,grandparent
0,MI:0004,MI:0091
1,MI:0004,MI:0400
2,MI:0005,MI:0810
3,MI:0006,MI:0019
4,MI:0007,MI:0019


In [6]:
# (rows, columns): one row per child -> parent pair kept in mi_group.
mi_group.shape

(1592, 2)

In [7]:
# Same edge table as mi_group, but filtered with the stricter mi_remove3 list.
children, parents = [], []
for ch, direct_parents in mi_relation.items():
    for pa in direct_parents:
        # Keep the pair only when neither end is an excluded term.
        if ch not in mi_remove3 and pa not in mi_remove3:
            children.append(ch)
            parents.append(pa)

mi_group2 = pd.DataFrame({'child': children, 'parent': parents})
mi_group2.head()

Unnamed: 0,child,parent
0,MI:0004,MI:0091
1,MI:0004,MI:0400
2,MI:0005,MI:0810
3,MI:0006,MI:0019
4,MI:0007,MI:0019


In [8]:
# (rows, columns): one row per child -> parent pair kept in mi_group2.
mi_group2.shape

(1524, 2)

### The function: find all the tip nodes of a given source (2021-11-13)

In [9]:
def find_ancestor(G, child, exclude=None):
    """Return the tip (top-most) nodes reachable from ``child`` in ``G``.

    Starting at ``child``, edges are followed from each node to its
    out-neighbours until nodes without any out-neighbour are reached; those
    end nodes are the "tips". With a graph built from child -> parent edges
    the tips are the highest ancestors of ``child`` present in the graph.

    Parameters
    ----------
    G : networkx.DiGraph
        Graph to walk. Only ``node in G`` and ``G.neighbors(node)`` are used.
    child : node
        Node to start from.
    exclude : iterable of nodes, optional
        Tips that must be left out of the result. Defaults to the
        module-level ``mi_remove2`` list. Excluded nodes are still walked
        through when they are not tips.

    Returns
    -------
    list
        Tip nodes without duplicates, in the order they are first reached
        (depth-first, following the graph's neighbour order). Returns ``[]``
        when ``child`` is not in ``G``. If ``child`` has no out-neighbour it
        is its own tip.
    """
    if exclude is None:
        exclude = mi_remove2
    excluded = set(exclude)

    if child not in G:
        return []

    tips = []
    # Each node is expanded once. This keeps the walk linear on terms that
    # share ancestors and guarantees termination if the graph has a cycle.
    visited = {child}
    stack = [child]
    while stack:
        node = stack.pop()
        # A self-loop does not count as having a parent.
        node_parents = [p for p in G.neighbors(node) if p != node]
        if not node_parents:
            if node not in excluded:
                tips.append(node)
            continue
        # Push in reverse so the first listed parent is explored first.
        for parent in reversed(node_parents):
            if parent not in visited:
                visited.add(parent)
                stack.append(parent)
    return tips

### Find all the tip and second last tip nodes of a given source (2021-11-14)

In [10]:
# Directed graphs with one edge per table row, pointing from the 'child'
# column to the parent column. create_using selects the graph type to build;
# each call gets its own fresh DiGraph instance.
G = nx.from_pandas_edgelist(
    mi_group,
    source='child',
    target='grandparent',
    create_using=nx.DiGraph(),
)
G2 = nx.from_pandas_edgelist(
    mi_group2,
    source='child',
    target='parent',
    create_using=nx.DiGraph(),
)

In [11]:
# Spot check: tip ancestors of MI:0004 in G2 (the graph built from mi_group2).
find_ancestor(G2, "MI:0004")

['MI:0091', 'MI:0400']

In [12]:
# MI:0001 is listed in mi_remove, so no edge touching it was kept in mi_group
# and it is not a node of G; find_ancestor returns [] for nodes absent from
# the graph.
find_ancestor(G, "MI:0001")

[]

In [13]:
# Working directory of the kernel; the relative path 'mi.owl' opened earlier
# in this notebook is resolved against it.
print(os.getcwd())

/home/tang/Desktop/Poxviridae-human/data/PSI-MI


In [14]:
# Current working directory, displayed as the cell's value.
os.getcwd()

'/home/tang/Desktop/Poxviridae-human/data/PSI-MI'