In [2]:
from src.databases.network import Network
from src.literature_mining.paper import Paper
import pandas as pd
from src.entities.protein import Protein
from src.entities.protein_protein import ProteinProtein
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Build one combined MADS-vs-MADS network out of the three source databases.
intact = Network(db='IntAct', version='2024-02-14', type='MADS_vs_MADS')
biogrid = Network(db='BioGRID', version='4.4.233', type='MADS_vs_MADS')
pplappisite = Network(db='PlaPPISite', type='MADS_vs_MADS')

# Extend every network with negatives (the displayed rows with Interaction == 0).
# Each call reports its own progress bar.
intact, biogrid, pplappisite = [
    network.add_negatives_per_species()
    for network in (intact, biogrid, pplappisite)
]

# Tag every row with the database it came from, so the origin is still
# known after the frames have been merged.
for network, source_label in (
    (intact, 'IntAct'),
    (biogrid, 'BioGRID'),
    (pplappisite, 'PlaPPISite'),
):
    network.df['From'] = source_label

# Fold the three networks together, in the order IntAct -> BioGRID -> PlaPPISite.
databases = intact.merge(biogrid)
databases = databases.merge(pplappisite)
databases.df

100%|██████████| 6/6 [00:00<00:00, 111.72it/s]
100%|██████████| 2/2 [00:00<00:00, 126.55it/s]
100%|██████████| 2/2 [00:00<00:00, 81.71it/s]


Unnamed: 0,A,B,A-B,Species_A,Species_B,Interaction,From
0,A0A1D6I854,A0A1D6I854,A0A1D6I854-A0A1D6I854,4577,4577,1,IntAct
1,P17839,A2RVQ5,A2RVQ5-P17839,3702,3702,1,IntAct
2,Q5XXN7,A2RVQ5,A2RVQ5-Q5XXN7,3702,3702,1,IntAct
3,Q5XXL4,A2RVQ5,A2RVQ5-Q5XXL4,3702,3702,1,IntAct
4,O22456,A2RVQ5,A2RVQ5-O22456,3702,3702,1,IntAct
...,...,...,...,...,...,...,...
4930,Q9M2K8,Q5XXE7,Q9M2K8-Q5XXE7,3702,3702,0,PlaPPISite
4937,Q9M2K8,Q1PEU3,Q9M2K8-Q1PEU3,3702,3702,0,PlaPPISite
4940,Q9M2K8,Q5XXN8,Q9M2K8-Q5XXN8,3702,3702,0,PlaPPISite
4945,Q9M2K8,Q5XXN7,Q9M2K8-Q5XXN7,3702,3702,0,PlaPPISite


In [9]:
# Analyze papers
# Collects the interactions reported by the literature-mining step into one
# DataFrame. The displayed result has the columns A, B, Interaction, A-B and
# From, where From holds a citation label such as "Ai (2017)".
# NOTE(review): the progress bar shows 62 items taking about a minute —
# presumably one item per paper; confirm against Paper.analyse_papers.
df = Paper.analyse_papers()
df

100%|██████████| 62/62 [01:00<00:00,  1.02it/s]


Unnamed: 0,A,B,Interaction,A-B,From
0,A0A1B2YKT1,A0A1B2YKT1,1.0,A0A1B2YKT1-A0A1B2YKT1,Ai (2017)
15,A0A1B2YKT5,A0A1B2YKT1,1.0,A0A1B2YKT1-A0A1B2YKT5,Ai (2017)
20,A0A1B2YKT7,A0A1B2YKT1,1.0,A0A1B2YKT1-A0A1B2YKT7,Ai (2017)
10,A0A1B2YKT8,A0A1B2YKT1,1.0,A0A1B2YKT1-A0A1B2YKT8,Ai (2017)
5,A0A1B2YKU0,A0A1B2YKT1,1.0,A0A1B2YKT1-A0A1B2YKU0,Ai (2017)
...,...,...,...,...,...
0,S5U5N5,S5U5N5,1.0,S5U5N5-S5U5N5,Zhao (2021)
2,S5YHG7,S5U5N5,1.0,S5U5N5-S5YHG7,Zhao (2021)
4,W8CRN8,S5U5N5,0.0,S5U5N5-W8CRN8,Zhao (2021)
3,W8CSC4,S5U5N5,0.0,S5U5N5-W8CSC4,Zhao (2021)


In [19]:
# Spot-check: show the IntAct row(s) stored under the key 'Q38840-A2RVQ5'.
intact.df.loc[intact.df['A-B'].eq('Q38840-A2RVQ5')]

Unnamed: 0,A,B,A-B,Species_A,Species_B,Interaction,From
1441,Q38840,A2RVQ5,Q38840-A2RVQ5,3702,3702,0,IntAct


In [16]:
# Stack the database pairs on top of the literature pairs, discard the species
# columns, and keep a single row per 'A-B' key. The paper rows are concatenated
# last, so keep='last' lets a paper entry replace a database entry with the
# same key.
# NOTE(review): the displayed frames contain keys in both orientations
# (e.g. 'A2RVQ5-P17839' but also 'Q38840-A2RVQ5'), so the same pair written
# in reverse order is NOT collapsed by this de-duplication — confirm whether
# that is intended.
merged = (
    pd.concat([databases.df, df])
    .drop(columns=['Species_A', 'Species_B'])
    .drop_duplicates(subset='A-B', keep='last')
)
merged

Unnamed: 0,A,B,A-B,Interaction,From
0,A0A1D6I854,A0A1D6I854,A0A1D6I854-A0A1D6I854,1.0,IntAct
2,Q5XXN7,A2RVQ5,A2RVQ5-Q5XXN7,1.0,IntAct
3,Q5XXL4,A2RVQ5,A2RVQ5-Q5XXL4,1.0,IntAct
7,A2RVQ5,P29386,A2RVQ5-P29386,1.0,IntAct
15,A2RVQ5,Q9SA07,A2RVQ5-Q9SA07,1.0,IntAct
...,...,...,...,...,...
0,S5U5N5,S5U5N5,S5U5N5-S5U5N5,1.0,Zhao (2021)
2,S5YHG7,S5U5N5,S5U5N5-S5YHG7,1.0,Zhao (2021)
4,W8CRN8,S5U5N5,S5U5N5-W8CRN8,0.0,Zhao (2021)
3,W8CSC4,S5U5N5,S5U5N5-W8CSC4,0.0,Zhao (2021)


In [5]:
# Check that all interactors are MIKC proteins and the mutations are processed correctly
# Every identifier appearing on either side of a literature-derived pair is
# passed to Protein(); identifiers whose construction fails are printed
# together with the reason, so they can be fixed before anything is saved.
interactors = set(df['A']) | set(df['B'])
# Iterate in a stable order (key=str tolerates a stray non-string value) so the
# printed report is reproducible between runs.
for i in sorted(interactors, key=str):
    try:
        Protein(i)
    except Exception as error:
        # Catch Exception rather than using a bare except: a bare except would
        # also swallow KeyboardInterrupt/SystemExit and hide why it failed.
        print(f'{i}: {type(error).__name__}: {error}')

In [6]:
# Persist one Protein object for every identifier collected in `interactors`.
for accession in interactors:
    protein = Protein(accession)
    protein.save()

In [13]:
# Save ProteinProtein objects + interaction + origin
# Requires `merged` from the merge cell above; run that cell first.
#
# Each row of `merged` already carries its own label and origin, so they are
# read straight from the row. Re-looking the row up through a re-sorted key
# fails for keys stored in unsorted order (e.g. 'Q38840-A2RVQ5'), and it scans
# the whole frame once per pair.

# Writing was disabled in the original cell; it stays off by default.
# Set to True to actually store the interactions.
PERSIST_PPIS = False

pair_rows = zip(merged['A-B'], merged['Interaction'], merged['From'])
for pair_key, interaction, origin in tqdm(pair_rows, total=len(merged)):
    # The key is '<id>-<id>'; unpacking raises ValueError if an id itself
    # contains '-', exactly as the original split did.
    p1, p2 = pair_key.split('-')
    ppi = ProteinProtein(Protein(p1), Protein(p2))
    if not PERSIST_PPIS:
        # Dry run: only verify that every pair can be constructed.
        continue
    # Interaction is a float after the concat (1.0 / 0.0); store it as 0/1.
    ppi.interaction = 1 if interaction > 0 else 0
    ppi.origin = origin
    ppi.save()

NameError: name 'merged' is not defined