In [1]:
import pandas as pd
import re
import gzip

from gene_id2symbol import id2name
from collections import Counter

# Human PPI data integration from five databases
## 1. Raw data extraction
### MatrixDB

In [2]:
# Column accumulators for the MatrixDB records (one entry per parsed line).
gene_A, gene_B = [], []
method, pubmed, interaction_type = [], [], []

with gzip.open(r'../data/PPIs_from_databases/matrixdb_human.tab.gz', 'rt') as f:
    for row in f:
        col = row.strip().split('\t')

        # Interactor fields are split on ':' or '-'; the second piece is
        # passed to id2name for the symbol lookup.
        id_a = re.split('[:-]', col[0])[1]
        gene_A.append(id2name(id_a))
        id_b = re.split('[:-]', col[1])[1]
        gene_B.append(id2name(id_b))

        # Remove the psi-mi prefix and the quote characters around the term name.
        detection = col[6].replace('psi-mi:"', '')
        detection = detection.replace('"("', '(').replace('")', ')')
        method.append(detection)

        # group(2) is the "pubmed:<digits>" token; because the leading group is
        # greedy, the last such token in the field is the one captured.
        pubmed_match = re.search(r'(.*)(pubmed:[0-9]+)(\D*)(.*)', col[8])
        pubmed.append(pubmed_match.group(2))

        kind = col[11].replace('psi-mi:"', '')
        interaction_type.append(kind.replace('"', ''))

matrixdb = pd.DataFrame({
    'gene_A': gene_A,
    'gene_B': gene_B,
    'interaction_detection_method': method,
    'pubmed_id': pubmed,
    'interaction_type': interaction_type,
    'database': ['MatrixDB'] * len(gene_A),
})

### BioGRID

In [3]:
# Column accumulators for the BioGRID records.
gene_A, gene_B = [], []
method, pubmed, interaction_type = [], [], []

biogrid_path = r'../data/PPIs_from_databases/BIOGRID-ORGANISM-Homo_sapiens-3.5.182.mitab.txt.gz'
with gzip.open(biogrid_path, 'rt') as f:
    for row in f:
        # Lines starting with "#ID" are skipped (column header).
        if row.startswith("#ID"):
            continue

        col = row.split("\t")

        # Keep a record only when both taxonomy fields are exactly 'taxid:9606'.
        if not (col[9] == 'taxid:9606' and col[10] == 'taxid:9606'):
            continue

        entrez_a = col[0].replace('entrez gene/locuslink:', '')
        gene_A.append(id2name(entrez_a))
        entrez_b = col[1].replace('entrez gene/locuslink:', '')
        gene_B.append(id2name(entrez_b))

        method.append(col[6].replace('psi-mi:"', '').replace('"', ''))
        # The publication field is stored as-is for this source.
        pubmed.append(col[8])
        interaction_type.append(col[11].replace('psi-mi:"', '').replace('"', ''))

biogrid = pd.DataFrame({
    'gene_A': gene_A,
    'gene_B': gene_B,
    'interaction_detection_method': method,
    'pubmed_id': pubmed,
    'interaction_type': interaction_type,
    'database': ['BioGRID'] * len(gene_A),
})

### IntAct

In [4]:
# Column accumulators for the IntAct records.
gene_A, gene_B = [], []
method, pubmed, interaction_type = [], [], []

with gzip.open(r'../data/PPIs_from_databases/intact_human.txt.gz', 'rt') as f:
    for row in f:
        col = row.strip().split('\t')

        # Records whose publication field contains "pubmed:unassigned<digits>"
        # are skipped entirely.
        if re.search(r'pubmed:unassigned[0-9]+', col[8]):
            continue

        # Second piece of the interactor field after splitting on ':' or '-'.
        id_a = re.split('[:-]', col[0])[1]
        gene_A.append(id2name(id_a))
        id_b = re.split('[:-]', col[1])[1]
        gene_B.append(id2name(id_b))

        method.append(col[6].replace('psi-mi:"', '').replace('"', ''))

        # group(2) is the "pubmed:<digits>" token of the publication field.
        pubmed_match = re.search(r'(.*)(pubmed:[0-9]+)(\D*)(.*)', col[8])
        pubmed.append(pubmed_match.group(2))

        interaction_type.append(col[11].replace('psi-mi:"', '').replace('"', ''))

intact = pd.DataFrame({
    'gene_A': gene_A,
    'gene_B': gene_B,
    'interaction_detection_method': method,
    'pubmed_id': pubmed,
    'interaction_type': interaction_type,
    'database': ['IntAct'] * len(gene_A),
})

### MINT

In [5]:
# Column accumulators for the MINT records.
gene_A, gene_B = [], []
method, pubmed, interaction_type = [], [], []

# Exact taxonomy string both interactors must carry for a record to be kept.
human_taxid = 'taxid:9606(human)|taxid:9606(Homo sapiens)'

with gzip.open(r'../data/PPIs_from_databases/MINT_human.gz', 'rt') as f:
    for row in f:
        col = row.strip().split('\t')

        if col[9] != human_taxid or col[10] != human_taxid:
            continue

        # Second piece of the interactor field after splitting on ':' or '-'.
        id_a = re.split('[:-]', col[0])[1]
        gene_A.append(id2name(id_a))
        id_b = re.split('[:-]', col[1])[1]
        gene_B.append(id2name(id_b))

        method.append(col[6].replace('psi-mi:"', '').replace('"', ''))

        # group(2) is the "pubmed:<digits>" token of the publication field.
        pubmed_match = re.search(r'(.*)(pubmed:[0-9]+)(\D*)(.*)', col[8])
        pubmed.append(pubmed_match.group(2))

        interaction_type.append(col[11].replace('psi-mi:"', '').replace('"', ''))

mint = pd.DataFrame({
    'gene_A': gene_A,
    'gene_B': gene_B,
    'interaction_detection_method': method,
    'pubmed_id': pubmed,
    'interaction_type': interaction_type,
    'database': ['MINT'] * len(gene_A),
})

### DIP

In [6]:
# Column accumulators for the DIP records.
gene_A, gene_B = [], []
method, pubmed, interaction_type = [], [], []

with gzip.open(r'../data/PPIs_from_databases/Hsapi20170205.txt.gz', 'rt') as f:
    for row in f:
        col = row.strip().split("\t")

        # Lines with fewer than 15 tab-separated fields are ignored.
        if len(col) < 15:
            continue

        # Both taxonomy fields must be exactly 'taxid:9606(Homo sapiens)'.
        if not (col[9] == 'taxid:9606(Homo sapiens)' and col[10] == 'taxid:9606(Homo sapiens)'):
            continue

        # Both interactor fields must mention "uniprotkb".
        if not (bool(re.search("uniprotkb", col[0])) and bool(re.search("uniprotkb", col[1]))):
            continue

        # Whatever follows the first "uniprotkb:" marker is looked up with id2name.
        uniprot_a = re.split('uniprotkb:', col[0])[1]
        gene_A.append(id2name(uniprot_a))
        uniprot_b = re.split('uniprotkb:', col[1])[1]
        gene_B.append(id2name(uniprot_b))

        # Method and interaction type are kept verbatim here; '|'-separated
        # values are split apart later in this cell.
        method.append(col[6])
        # Drop the "|pubmed:DIP-<digits>" entries from the publication field.
        pubmed.append(re.sub(r'\|pubmed:DIP-[0-9]+', '', col[8]))
        interaction_type.append(col[11])

dip = pd.DataFrame({
    'gene_A': gene_A,
    'gene_B': gene_B,
    'interaction_detection_method': method,
    'pubmed_id': pubmed,
    'interaction_type': interaction_type,
    'database': ['DIP'] * len(gene_A),
})

def _split_pipe_values(frame, column):
    """Return ``frame`` with ``column`` split on '|' into one row per value.

    The split values are stacked into a single Series that keeps the original
    row label, then joined back onto the remaining columns, so a row with
    several '|'-separated values is repeated once per value.
    """
    pieces = frame[column].str.split('|', expand = True).stack()
    pieces = pieces.reset_index(level=1, drop=True).rename(column)
    return frame.drop(column, axis=1).join(pieces)


# Each multi-valued column is expanded together with one companion column.
dip2 = _split_pipe_values(dip[['gene_A', 'interaction_detection_method']],
                          'interaction_detection_method')
dip3 = _split_pipe_values(dip[['gene_B', 'pubmed_id']], 'pubmed_id')
dip4 = _split_pipe_values(dip[['interaction_type', 'database']], 'interaction_type')

# NOTE(review): the column-wise concat lines the three frames up by row label;
# this presumably relies on every record having the same number of '|' entries
# in the method, pubmed and interaction-type fields - TODO confirm.
dip5 = pd.concat([dip2, dip3, dip4], axis=1)

column_order = ['gene_A', 'gene_B', 'interaction_detection_method', 'pubmed_id',
                'interaction_type', 'database']
dip5 = dip5[column_order]

## 2. Integration

In [7]:
"""
Interactions without corresponding symbols were removed;
Interactions that genes interact with themselves were not included
"""
# Stack the five per-database tables (DIP in its expanded form).
integrated_ppi = pd.concat([matrixdb, biogrid, intact, mint, dip5])

# Rows where either gene is 0 are dropped (presumably id2name's "no symbol"
# marker - confirm against gene_id2symbol), then exact duplicates are removed.
has_both_symbols = (integrated_ppi['gene_A'] != 0) & (integrated_ppi['gene_B'] != 0)
integrated_ppi2 = integrated_ppi[has_both_symbols].drop_duplicates()

# Self-interactions (gene_A equal to gene_B) are excluded.
is_heteromeric = integrated_ppi2['gene_A'] != integrated_ppi2['gene_B']
integrated_ppi2 = integrated_ppi2[is_heteromeric]

In [8]:
# Size of the merged table after the symbol, duplicate and self-interaction filters.
integrated_ppi2.shape

(879242, 6)

In [9]:
"""
Records with the uninformative PSI-MI experimental interaction detection were discarded;
Records with invalid methods, genetic interactions and PTMs were also removed
"""
# Detection-method labels that are rejected outright.
rejected_methods = ['MI:0045(experimental interaction detection)',
                    'MI:0686(unspecified method)']
method_ok = ~integrated_ppi2['interaction_detection_method'].isin(rejected_methods)
integrated_ppi3 = integrated_ppi2[method_ok]
integrated_ppi3 = integrated_ppi3[integrated_ppi3['interaction_type'] != 'MI:0218()']

# Only these three interaction types are carried forward.
accepted_types = ['MI:0407(direct interaction)',
                  'MI:0914(association)',
                  'MI:0915(physical association)']
integrated_ppi4 = integrated_ppi3[integrated_ppi3['interaction_type'].isin(accepted_types)]

# Positions 0-3 and 5 are kept, i.e. the interaction_type column (position 4)
# is dropped before de-duplicating.
kept_positions = [0, 1, 2, 3, 5]
integrated_ppi4 = integrated_ppi4.iloc[:, kept_positions].drop_duplicates()

In [10]:
# Size after the detection-method / interaction-type filters and column selection.
integrated_ppi4.shape

(807203, 5)

In [11]:
"""Classify the detection method into three categories: 'invalid', 'binary' and 'non-binary"""
# Annotation table for detection methods; three of its columns are used here.
detection_method = pd.read_csv(r'../data/Interaction_detection_method.txt', sep="\t")

method_category = detection_method[
    ['Interaction Detection Method', 'Binary call', 'Parent MI']
].rename(columns={
    'Interaction Detection Method': 'interaction_detection_method',
    'Binary call': 'binary_call',
    'Parent MI': 'parent_mi',
})

# Left merge: every PPI record is kept and annotated where its method is listed.
integrated_ppi5 = integrated_ppi4.merge(method_category, how='left',
                                        on='interaction_detection_method')

# Drop records whose method is categorised as 'invalid'.
not_invalid = integrated_ppi5['binary_call'].ne('invalid')
integrated_ppi5 = integrated_ppi5[not_invalid]

In [12]:
# Size after attaching the method categories and dropping 'invalid' methods.
integrated_ppi5.shape

(807050, 7)

In [13]:
"""Remove no protein-coding genes from the PPIs"""
# Single-column list of gene names read without a header row.
pro_coding = pd.read_csv(r'../data/protein_coding_genes.txt', sep="\t", header=None)
pro_coding.columns = ['Gene']

# One inner merge per interactor column: a record survives only when both
# gene_A and gene_B appear in the list.
for interactor in ('gene_A', 'gene_B'):
    integrated_ppi5 = integrated_ppi5.merge(pro_coding, how='inner',
                                            left_on=interactor, right_on='Gene')

# The merges append the matched 'Gene' columns on the right; keep the first seven.
integrated_ppi5 = integrated_ppi5.iloc[:, list(range(7))]

In [14]:
# Size after keeping only pairs whose two genes are in the protein-coding list.
integrated_ppi5.shape

(693568, 7)

In [15]:
"""Divide the PPIs into high-through (HT) and low-through (LT)"""
# Make the cell safe to re-run: the insert further down raises if a
# 'throughput' column from an earlier execution is still present.
integrated_ppi5 = integrated_ppi5.drop(columns='throughput', errors='ignore')

# Unique (gene_A, gene_B, publication) records ...
ppi_pubmed = integrated_ppi5[['gene_A', 'gene_B', 'pubmed_id']].drop_duplicates()

# ... plus their mirror images (B-A), so that every pair reported by a
# publication is present in both orientations exactly once.
ppi_pubmed2 = pd.DataFrame({'gene_A': ppi_pubmed['gene_B'].tolist(),
                            'gene_B': ppi_pubmed['gene_A'].tolist(),
                            'pubmed_id': ppi_pubmed['pubmed_id'].tolist()})
ppi_pubmed2 = pd.concat([ppi_pubmed, ppi_pubmed2], axis = 0, ignore_index=True)
ppi_pubmed2 = ppi_pubmed2.drop_duplicates()

# Records per publication in the mirrored table. Counter is imported in the
# notebook's first cell.
pubmed_num = Counter(ppi_pubmed2['pubmed_id'].tolist())

# Per-publication summary (half of the mirrored count); not used further in
# this cell, kept for inspection.
pubmed_id = list(pubmed_num)
number = [int(count / 2) for count in pubmed_num.values()]

# A publication is high-throughput when it has more than 200 mirrored records;
# since each pair is counted in both orientations this corresponds to more
# than 100 distinct gene pairs.
HT_list = [pmid for pmid, count in pubmed_num.items() if count > 200]

# Set membership keeps the per-row lookup constant-time instead of scanning
# HT_list once for every record.
ht_lookup = set(HT_list)
throughput = ["HT" if pmid in ht_lookup else "LT"
              for pmid in integrated_ppi5['pubmed_id'].tolist()]

integrated_ppi5.insert(5, 'throughput', throughput)

# Position 2 (interaction_detection_method) is dropped; the remaining columns
# are gene_A, gene_B, pubmed_id, database, throughput, binary_call, parent_mi.
integrated_ppi6 = integrated_ppi5.iloc[:,[0,1,3,4,5,6,7]]
integrated_ppi6 = integrated_ppi6.drop_duplicates()

# Mirror the annotated table as well so both orientations carry the same
# annotations.
integrated_ppi7 = pd.DataFrame({'gene_A': integrated_ppi6['gene_B'].tolist(),
                                'gene_B': integrated_ppi6['gene_A'].tolist(),
                                'pubmed_id': integrated_ppi6['pubmed_id'].tolist(),
                                'database': integrated_ppi6['database'].tolist(),
                                'throughput': integrated_ppi6['throughput'].tolist(),
                                'binary_call': integrated_ppi6['binary_call'].tolist(),
                                'parent_mi': integrated_ppi6['parent_mi'].tolist()})

integrated_ppi7 = pd.concat([integrated_ppi6, integrated_ppi7], axis = 0, ignore_index=True)
integrated_ppi7 = integrated_ppi7.drop_duplicates()

In [16]:
# Size of the mirrored table (records present in both A-B and B-A orientation).
integrated_ppi7.shape

(1261508, 7)

In [17]:
"""
Combine the same interaction from different literature;
Combine the same interaction from different experimental scale;
Combine the same interaction from different database;
Join the same interaction divided into different method category;
Join the same interaction used different method (MI-PSI)
"""
def comb(df):
    """Concatenate the values of ``df`` into a single ';'-separated string."""
    separator = ';'
    return separator.join(df.values)

def inter_comb(name, ppi_table=None):
    """Collapse one attribute column to a single row per (gene_A, gene_B) pair.

    Parameters
    ----------
    name : str
        Column whose distinct values are joined with ';' for each pair.
    ppi_table : pandas.DataFrame, optional
        Table containing 'gene_A', 'gene_B' and ``name``. Defaults to the
        notebook-level ``integrated_ppi7`` so existing ``inter_comb(name)``
        calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene_A', 'gene_B' and ``name`` with a fresh 0..n-1 index,
        ordered by the grouped keys.
    """
    if ppi_table is None:
        ppi_table = integrated_ppi7

    # Distinct (pair, value) combinations only, so a value is never repeated
    # inside the joined string.
    pairs = ppi_table[['gene_A', 'gene_B', name]].drop_duplicates()
    combined = pairs.groupby(['gene_A', 'gene_B'])[name].agg(';'.join)

    # reset_index turns the (gene_A, gene_B) group keys back into columns.
    return combined.reset_index()

# Start from an empty key table and right-merge one combined attribute column
# at a time, so every gene pair ends up on a single row.
integrated_ppi8 = pd.DataFrame({'gene_A':[], 'gene_B':[]})
attribute_columns = integrated_ppi7.columns[2:8].to_list()
for name in attribute_columns:
    ppi = inter_comb(name)
    integrated_ppi8 = integrated_ppi8.merge(ppi, how='right',
                                            on=['gene_A', 'gene_B'])

In [18]:
# Size after combining the attribute columns per (gene_A, gene_B) pair.
integrated_ppi8.shape

(619466, 7)

In [19]:
"""
Identification of the relationship between child and parent PSI-MI terms, and the redundant terms were removed.
"""
# Map each 'Parent MI' term to its 'Parent3 MI' term, ignoring rows where the
# two columns hold the same value.
parentMI = detection_method[['Parent MI', 'Parent3 MI']]
differs_from_parent3 = parentMI['Parent MI'].ne(parentMI['Parent3 MI'])
parentMI = parentMI[differs_from_parent3].drop_duplicates()
parent_mi = {term: ancestor
             for term, ancestor in zip(parentMI['Parent MI'], parentMI['Parent3 MI'])}

# Same construction for the 'Parent2 MI' column.
parentMI2 = detection_method[['Parent MI', 'Parent2 MI']]
differs_from_parent2 = parentMI2['Parent MI'].ne(parentMI2['Parent2 MI'])
parentMI2 = parentMI2[differs_from_parent2].drop_duplicates()
parent_mi2 = {term: ancestor
              for term, ancestor in zip(parentMI2['Parent MI'], parentMI2['Parent2 MI'])}

def _drop_redundant_parents(x, child_to_parent):
    """Remove terms from a ';'-joined string that are the mapped parent of another term.

    Parameters
    ----------
    x : str
        One term, or several terms joined with ';'.
    child_to_parent : dict
        Maps a term to its parent term.

    Returns
    -------
    str
        ``x`` unchanged when it holds a single term; otherwise the terms in
        their original order, minus every term that ``child_to_parent`` names
        as the parent of another term present in ``x``.
    """
    if ';' not in x:
        return x
    terms = x.split(';')
    present = set(terms)
    redundant = {child_to_parent[term] for term in terms
                 if term in child_to_parent and child_to_parent[term] in present}
    return ';'.join(term for term in terms if term not in redundant)


def mi_remove(x, child_to_parent=None):
    """Drop redundant parent terms from ``x`` using the ``parent_mi`` mapping.

    ``child_to_parent`` may be supplied to override the notebook-level
    ``parent_mi`` dictionary; by default that dictionary is used, so
    ``Series.apply(mi_remove)`` behaves as before.
    """
    if child_to_parent is None:
        child_to_parent = parent_mi
    return _drop_redundant_parents(x, child_to_parent)


def mi_remove2(x, child_to_parent=None):
    """Drop redundant parent terms from ``x`` using the ``parent_mi2`` mapping.

    ``child_to_parent`` may be supplied to override the notebook-level
    ``parent_mi2`` dictionary; by default that dictionary is used, so
    ``Series.apply(mi_remove2)`` behaves as before.
    """
    if child_to_parent is None:
        child_to_parent = parent_mi2
    return _drop_redundant_parents(x, child_to_parent)
    
# Two clean-up passes over the joined parent_mi strings: mi_remove first,
# then mi_remove2 on its result.
integrated_ppi8['parent_mi'] = (
    integrated_ppi8['parent_mi'].apply(mi_remove).apply(mi_remove2)
)

In [20]:
# Shape check after rewriting the parent_mi column.
integrated_ppi8.shape

(619466, 7)

In [21]:
"""Remove the replicated interaction pairs (retain A-B, but not B-A)"""
interaction = []
gene_A = []
gene_B = []
pubmed_id = []
database = []
throughput = []

# Pairs already kept, stored as (gene_A, gene_B) tuples. A set gives
# constant-time lookups, and tuples avoid the ambiguity of an 'A-B' string
# key when a gene symbol itself contains '-'.
seen_pairs = set()

# Columns are addressed by position, as before: 0 gene_A, 1 gene_B,
# 2 pubmed_id, 3 database, 4 throughput, 6 parent_mi.
selected_columns = [integrated_ppi8.iloc[:, position] for position in (0, 1, 2, 3, 4, 6)]

for first, second, pmids, sources, scale, parent_terms in zip(*selected_columns):
    # The mirrored orientation was kept earlier: skip this one.
    if (second, first) in seen_pairs:
        continue
    # Pairs with a single parent_mi entry whose throughput is exactly "HT"
    # are discarded.
    if len(parent_terms.split(';')) == 1 and scale == "HT":
        continue

    seen_pairs.add((first, second))
    interaction.append(first + '-' + second)
    gene_A.append(first)
    gene_B.append(second)
    pubmed_id.append(pmids)
    database.append(sources)
    throughput.append(scale)

integrated_ppi9 = pd.DataFrame({'Gene_A': gene_A,
                                'Gene_B': gene_B,
                                'Pubmed_id': pubmed_id,
                                'Database': database,
                                'Throughput': throughput})

In [22]:
# Size of the final, de-duplicated interaction table.
integrated_ppi9.shape

(78261, 5)

In [23]:
"""Supplementary table 1"""
# Tab-separated export of the full table, without the row index.
result_path = '../result/HUMPPI-2022'
integrated_ppi9.to_csv(result_path, sep='\t', index=False)

In [24]:
""""""
# Tab-separated export of the two gene columns only, with no header row and
# no row index.
gene_pairs = integrated_ppi9[['Gene_A', 'Gene_B']]
gene_pairs.to_csv('../intermediate/integrated_ppi.txt',
                  sep='\t', header=None, index=False)