# Merge sources

This notebook gathers the data from each source (online databases, literature mining, and scoring) separately and then joins them into a single table. That table can later be transformed into an object-oriented database to which additional metadata will be added.

In [1]:
# Third-party modules
import pandas as pd
from tqdm import tqdm
from multitax import NcbiTx

# Custom modules
from src.misc import path
from src.misc.logger import logger
from src.sources.paper import Paper
#from src.sources.scoring import Scoring
#from src.entities.protein import Protein
from src.databases.network import Network
from src.entities.interactor import Interactor
#from src.entities.protein_protein import ProteinProtein

In [4]:
# Taxonomy handle used by the cells below to translate NCBI taxon IDs into names (via name_lineage)
# NOTE(review): NcbiTx() is called without any file arguments, so multitax presumably fetches the
# NCBI taxonomy dump itself (network access needed, result may change between runs) - confirm
ncbi_tx = NcbiTx()

### Database source

In [35]:
# Load the MADS_vs_MADS network of every online database
intact = Network(db = 'IntAct', version = '2025-03-28', type = 'MADS_vs_MADS', standarized = True)
biogrid = Network(db = 'BioGRID', version = '4.4.246', type = 'MADS_vs_MADS', standarized = True)
pplappisite = Network(db = 'PlaPPISite', type = 'MADS_vs_MADS', standarized = True)

# Negatives are deliberately NOT added here (no add_negatives_per_species() call on any network)

# Tag every row with its source: 'From' holds the database name and
# 'Interaction' holds a fresh [(database, [1])] list per row
for _network, _label in ((intact, 'IntAct'), (biogrid, 'BioGRID'), (pplappisite, 'PlaPPISite')):
    _network.df['From'] = _label
    _network.df['Interaction'] = [[(_label, [1])] for _ in range(len(_network.df))]

# Join the three networks into one
databases = Network.merge(intact, biogrid, pplappisite)

# Keep a single row per 'Seq' value (first occurrence wins)
databases.df.drop_duplicates(subset = 'Seq', inplace = True)

# Add, rename and reorder columns
for _column in ('bioID_A', 'bioID_B'):
    databases.df[_column] = ''
databases.df.rename(
    columns = {
        'A': 'UniProtID_A',
        'B': 'UniProtID_B',
        'Species_A': 'TaxonID_A',
        'Species_B': 'TaxonID_B',
        'Seq_A': 'Sequence_A',
        'Seq_B': 'Sequence_B',
    },
    inplace = True,
)

# Resolve every taxon ID that appears on either side to the last entry of its name lineage
all_taxonIDs = pd.concat([databases.df['TaxonID_A'], databases.df['TaxonID_B']]).unique()
taxonID2species = {
    taxonID: ncbi_tx.name_lineage(str(taxonID))[-1]
    for taxonID in all_taxonIDs
}
databases.df['Species_A'] = databases.df['TaxonID_A'].map(taxonID2species)
databases.df['Species_B'] = databases.df['TaxonID_B'].map(taxonID2species)

# Final column layout (any other column is dropped)
databases.df = databases.df[[
    'bioID_A', 'bioID_B',
    'UniProtID_A', 'UniProtID_B',
    'Sequence_A', 'Sequence_B',
    'Interaction', 'From',
    'TaxonID_A', 'TaxonID_B',
    'Species_A', 'Species_B',
]]

print(f'Number of interactions: {databases.df.shape[0]}')

Number of interactions: 440


### Literature mining source

In [2]:
# Build the literature-mining table: one interaction DataFrame per paper, joined into one frame

# Validate filestems -> must be in the OnHold folder
Paper.validate_filestems()

# Process papers (every file in the 'Excels' folder is expected to be named '<Author>_<Year>')
excel_folder = path.LITERATUREMINING / 'Excels'
excel_filenames = sorted(excel_folder.iterdir())
interaction_dfs = []
for excel_filename in tqdm(excel_filenames):
    author, year = excel_filename.stem.split('_')
    paper = Paper(author, year)
    paper.process()
    df = paper.interaction_df()
    interaction_dfs.append(df)

# Concatenate once at the end instead of growing the frame inside the loop (which re-copies
# every previous row on each iteration); ignore_index gives a fresh 0..n-1 index.
# An empty folder still yields an empty DataFrame, since pd.concat([]) would raise.
if interaction_dfs:
    mining_database = pd.concat(interaction_dfs, ignore_index = True)
else:
    mining_database = pd.DataFrame()

  0%|          | 0/88 [00:00<?, ?it/s]paper.py INFO AbrahamJuarez (2020) has 4 interactions
paper.py INFO Ai (2017) has 15 interactions
paper.py INFO Alvarez-Buylla (2010) has 4 interactions
  3%|▎         | 3/88 [00:00<00:03, 26.92it/s]paper.py INFO Bartlett (2016) has 144 interactions
paper.py INFO Chang (2009) has 10 interactions
paper.py INFO Chi (2017) has 3 interactions
  7%|▋         | 6/88 [00:00<00:07, 10.70it/s]paper.py INFO Ciannamea (2006) has 169 interactions
paper.py INFO Cseke (2007) has 5 interactions
  9%|▉         | 8/88 [00:01<00:12,  6.64it/s]paper.py INFO Cui (2010) has 15 interactions
paper.py INFO Davies (1996) has 19 interactions
 11%|█▏        | 10/88 [00:01<00:09,  8.50it/s]paper.py INFO Davies (1999) has 8 interactions
paper.py INFO DeBruijn (2018) has 4 interactions
paper.py INFO Dirks-Mulder (2019) has 105 interactions
 15%|█▍        | 13/88 [00:01<00:08,  8.66it/s]paper.py INFO Favaro (2002) has 42 interactions
paper.py INFO Ferrario (2003) has 70 interact

In [5]:
# Annotate both interactors with taxon ID and species name.
# The first two characters of a bioID are looked up as the species abbreviation
# listed in the 'Species' sheet of the guidelines workbook.
guidelines = pd.read_excel(path.LITERATUREMINING / 'Guidelines.xlsx', sheet_name = 'Species')
prefix2taxonID = dict(zip(guidelines['Abbreviation'], guidelines['NCBI']))
taxonID2name = {
    taxonID: ncbi_tx.name_lineage(str(taxonID))[-1]
    for taxonID in prefix2taxonID.values()
}

# Taxon IDs first, species names second, so the new columns keep the order
# TaxonID_A, TaxonID_B, Species_A, Species_B
for _taxon_column, _bioID_column in (('TaxonID_A', 'bioID_A'), ('TaxonID_B', 'bioID_B')):
    mining_database[_taxon_column] = mining_database[_bioID_column].str[:2].map(prefix2taxonID)
for _species_column, _taxon_column in (('Species_A', 'TaxonID_A'), ('Species_B', 'TaxonID_B')):
    mining_database[_species_column] = mining_database[_taxon_column].map(taxonID2name)

mining_database

Unnamed: 0,bioID_A,bioID_B,UniProtID_A,UniProtID_B,Sequence_A,Sequence_B,Interaction,From,TaxonID_A,TaxonID_B,Species_A,Species_B
0,ZmSI1,ZmSTS1,Q9M6N9,Q9AR51,MGRGKIEIKRIENATNRQVTYSKRRTGIMKKARELTVLCDAQVAII...,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,"[(AbrahamJuarez_2020, [1])]",AbrahamJuarez (2020),4577,4577,Zea mays,Zea mays
1,ZmSTS1_G81D,ZmSTS1,Q9AR51,Q9AR51,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,"[(AbrahamJuarez_2020, [1])]",AbrahamJuarez (2020),4577,4577,Zea mays,Zea mays
2,ZmSTS1_G81D,ZmSTS1_G81D,Q9AR51,Q9AR51,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,"[(AbrahamJuarez_2020, [1])]",AbrahamJuarez (2020),4577,4577,Zea mays,Zea mays
3,ZmSTS1,ZmSTS1,Q9AR51,Q9AR51,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,"[(AbrahamJuarez_2020, [0])]",AbrahamJuarez (2020),4577,4577,Zea mays,Zea mays
4,TePI,TePI,A0A1B2YKT1,A0A1B2YKT1,MGRGKIEIKRIENTSNRQVTYSKRKNGIIKKAKEITVLCDANVSLV...,MGRGKIEIKRIENTSNRQVTYSKRKNGIIKKAKEITVLCDANVSLV...,"[(Ai_2017, [1])]",Ai (2017),13708,13708,Tagetes erecta,Tagetes erecta
...,...,...,...,...,...,...,...,...,...,...,...,...
4325,PmSEP4,PmSEP4,,,MGRGKVELKRIENKINQQVTFAKRRNGLLKKAYELSVLCDAEVALI...,MGRGKVELKRIENKINQQVTFAKRRNGLLKKAYELSVLCDAEVALI...,"[(Zhou_2017, [1])]",Zhou (2017),102107,102107,Prunus mume,Prunus mume
4326,PmAG,PmSEP1,A7UGU4,,MAYENKSMSLDSPQRKLGRGKIEIKRIENTTNRQVTFCKRRNGLLK...,MGRGRVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCDAEVALI...,"[(Zhou_2017, [0])]",Zhou (2017),102107,102107,Prunus mume,Prunus mume
4327,PmAG,PmSEP2,A7UGU4,,MAYENKSMSLDSPQRKLGRGKIEIKRIENTTNRQVTFCKRRNGLLK...,MGRGRVELKRIENKINRQVTFAKRRNGLLKKAYELSILCDAEVALI...,"[(Zhou_2017, [1])]",Zhou (2017),102107,102107,Prunus mume,Prunus mume
4328,PmAG,PmSEP3,A7UGU4,,MAYENKSMSLDSPQRKLGRGKIEIKRIENTTNRQVTFCKRRNGLLK...,MGRGRVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCEAEVALI...,"[(Zhou_2017, [1])]",Zhou (2017),102107,102107,Prunus mume,Prunus mume


In [39]:
# Summary Papers table: one row per paper with its number of interactions and the
# taxon IDs / species found on either side of those interactions
df = mining_database
records = []
for paper in df['From'].unique():
    # 'From' is formatted as '<Author> (<Year>)'
    author, year = paper.split(' (')
    year = year[:-1]

    # Filter the rows of this paper once and reuse the result
    paper_rows = df[df['From'] == paper]
    n_interactions = paper_rows.shape[0]
    species = set(
        paper_rows['Species_A'].unique().tolist()
        + paper_rows['Species_B'].unique().tolist()
    )
    taxonIDs = set(
        paper_rows['TaxonID_A'].unique().tolist()
        + paper_rows['TaxonID_B'].unique().tolist()
    )

    # Sets have no stable iteration order; sort the joined values so the exported
    # file is identical between runs (taxon IDs are ordered by their text form)
    records.append([
        author,
        year,
        n_interactions,
        ', '.join(sorted(map(str, taxonIDs))),
        ', '.join(sorted(species)),
    ])

# Build the table in one go instead of concatenating one row at a time
summary = pd.DataFrame(records, columns = ['Author', 'Year', 'Interactions', 'Taxon IDs', 'Species'])
summary = summary.sort_values(['Author', 'Year']).reset_index(drop = True)
summary.to_excel(path.LITERATUREMINING / 'Summary_Papers.xlsx', index = False)
