# Merge sources

Notebook to gather the data from each source (online databases, literature mining, and scoring) separately and then join them together in a single table that can be transformed into an object-oriented database to which additional metadata will be added

In [1]:
# Third-party modules
import pandas as pd
from tqdm import tqdm
from multitax import NcbiTx

# Custom modules
from src.misc import path
from src.entities.ppi import PPI
from src.misc.logger import logger
from src.sources.paper import Paper
from src.sources.scoring import Scoring
from src.entities.protein import Protein
from src.databases.network import Network
from src.entities.interactor import Interactor
logger.setLevel(20)

In [2]:
ncbi_tx = NcbiTx()

### Database source

In [3]:
# Merge networks from database
intact = Network(db = 'IntAct', version = '2025-03-28', type = 'MADS_vs_MADS', standarized = True)
biogrid = Network(db = 'BioGRID', version = '4.4.246', type = 'MADS_vs_MADS', standarized = True)
plappisite = Network(db = 'PlaPPISite', type = 'MADS_vs_MADS', standarized = True)

# NOT add negatives
#intact.add_negatives_per_species()
#biogrid.add_negatives_per_species()
#pplappisite.add_negatives_per_species()

# Add 'From' column
intact.df['Origin'] = 'IntAct'
biogrid.df['Origin'] = 'BioGRID'
plappisite.df['Origin'] = 'PlaPPISite'

# Add 'Interaction' column
intact.df['Interaction'] = [[1] for _ in range(intact.df.shape[0])]
biogrid.df['Interaction'] = [[1] for _ in range(biogrid.df.shape[0])]
plappisite.df['Interaction'] = [[1] for _ in range(plappisite.df.shape[0])]

# Merge
databases = Network.merge(intact, biogrid, plappisite)

# Remove duplicates
databases.df.drop_duplicates('Seq', inplace = True)

# Add, remove and reorder coulumns
databases.df['bioID_A'] = ''
databases.df['bioID_B'] = ''
databases.df.rename(columns = {'A': 'UniProtID_A', 'B': 'UniProtID_B'}, inplace = True)
databases.df.rename(columns = {'Species_A': 'TaxonID_A', 'Species_B': 'TaxonID_B'}, inplace = True)
databases.df.rename(columns = {'Seq_A': 'Sequence_A', 'Seq_B': 'Sequence_B'}, inplace = True)
taxonID2species = {taxonID:ncbi_tx.name_lineage(str(taxonID))[-1] for taxonID in pd.concat([databases.df['TaxonID_A'], databases.df['TaxonID_B']]).unique()}
databases.df['Species_A'] = databases.df['TaxonID_A'].map(taxonID2species)
databases.df['Species_B'] = databases.df['TaxonID_B'].map(taxonID2species)
databases.df = databases.df[['bioID_A', 'bioID_B', 'UniProtID_A', 'UniProtID_B', 'Sequence_A', 'Sequence_B', 'Interaction', 'Origin', 'TaxonID_A', 'TaxonID_B', 'Species_A', 'Species_B']]

# Reorder A,B proteins based on alphabetical order of their sequences
for i, row in databases.df.iterrows():
    bioA, bioB = row['bioID_A'], row['bioID_B']
    uniprotA, uniprotB = row['UniProtID_A'], row['UniProtID_B']
    seqA, seqB = row['Sequence_A'], row['Sequence_B']
    taxonA, taxonB = row['TaxonID_A'], row['TaxonID_B']
    speciesA, speciesB = row['Species_A'], row['Species_B']
    if sorted([seqA, seqB]) != [seqA, seqB]:
        databases.df.at[i, 'bioID_A'] = bioB
        databases.df.at[i, 'bioID_B'] = bioA
        databases.df.at[i, 'UniProtID_A'] = uniprotB
        databases.df.at[i, 'UniProtID_B'] = uniprotA
        databases.df.at[i, 'Sequence_A'] = seqB
        databases.df.at[i, 'Sequence_B'] = seqA
        databases.df.at[i, 'TaxonID_A'] = taxonB
        databases.df.at[i, 'TaxonID_B'] = taxonA
        databases.df.at[i, 'Species_A'] = speciesB
        databases.df.at[i, 'Species_B'] = speciesA

print(f'Number of interactions: {databases.df.shape[0]}')

Number of interactions: 440


### Literature mining source

In [4]:
# Initialize mining database
mining_database = pd.DataFrame()

# Validate filestems -> must be in the OnHold folder
Paper.validate_filestems()

# Process papers
excel_folder = path.LITERATUREMINING / 'Excels'
excel_filenames = sorted(list(excel_folder.iterdir()))
for excel_filename in tqdm(excel_filenames):
    author, year = excel_filename.stem.split('_')
    paper = Paper(author, year)
    paper.process()
    df = paper.interaction_df()
    mining_database = pd.concat([mining_database, df])

# Restart index
mining_database.reset_index(drop = True, inplace = True)

  0%|          | 0/88 [00:00<?, ?it/s]paper.py INFO AbrahamJuarez (2020) has 4 interactions
paper.py INFO Ai (2017) has 15 interactions
paper.py INFO Alvarez-Buylla (2010) has 4 interactions
paper.py INFO Bartlett (2016) has 144 interactions
  5%|▍         | 4/88 [00:00<00:04, 18.44it/s]paper.py INFO Chang (2009) has 10 interactions
paper.py INFO Chi (2017) has 3 interactions
paper.py INFO Ciannamea (2006) has 169 interactions
  8%|▊         | 7/88 [00:00<00:04, 16.76it/s]paper.py INFO Cseke (2007) has 5 interactions
paper.py INFO Cui (2010) has 15 interactions
paper.py INFO Davies (1996) has 19 interactions
paper.py INFO Davies (1999) has 8 interactions
paper.py INFO DeBruijn (2018) has 4 interactions
 14%|█▎        | 12/88 [00:00<00:02, 25.60it/s]paper.py INFO Dirks-Mulder (2019) has 102 interactions
paper.py INFO Favaro (2002) has 23 interactions
paper.py INFO Ferrario (2003) has 70 interactions
 17%|█▋        | 15/88 [00:00<00:04, 17.73it/s]paper.py INFO Galimba (2012) has 4 intera

In [5]:
# Add species columns
guidelines = pd.read_excel(path.LITERATUREMINING / 'Guidelines.xlsx', sheet_name = 'Species')
prefix2taxonID = dict(zip(guidelines['Abbreviation'], guidelines['NCBI']))
taxonID2name = {taxonID: ncbi_tx.name_lineage(str(taxonID))[-1] for taxonID in prefix2taxonID.values()}
mining_database['TaxonID_A'] = mining_database['bioID_A'].str[:2].map(prefix2taxonID)
mining_database['TaxonID_B'] = mining_database['bioID_B'].str[:2].map(prefix2taxonID)
mining_database['Species_A'] = mining_database['TaxonID_A'].map(taxonID2name)
mining_database['Species_B'] = mining_database['TaxonID_B'].map(taxonID2name)
mining_database

Unnamed: 0,bioID_A,bioID_B,UniProtID_A,UniProtID_B,Sequence_A,Sequence_B,Interaction,Origin,TaxonID_A,TaxonID_B,Species_A,Species_B
0,ZmSI1,ZmSTS1,Q9M6N9,Q9AR51,MGRGKIEIKRIENATNRQVTYSKRRTGIMKKARELTVLCDAQVAII...,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,[1],AbrahamJuarez (2020),4577,4577,Zea mays,Zea mays
1,ZmSTS1_G81D,ZmSTS1,Q9AR51,Q9AR51,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,[1],AbrahamJuarez (2020),4577,4577,Zea mays,Zea mays
2,ZmSTS1_G81D,ZmSTS1_G81D,Q9AR51,Q9AR51,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,[1],AbrahamJuarez (2020),4577,4577,Zea mays,Zea mays
3,ZmSTS1,ZmSTS1,Q9AR51,Q9AR51,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,MGRGKIEIKRIENSTNRQVTFSKRRNGILKKAREISVLCDAEVGVV...,[0],AbrahamJuarez (2020),4577,4577,Zea mays,Zea mays
4,TePI,TePI,A0A1B2YKT1,A0A1B2YKT1,MGRGKIEIKRIENTSNRQVTYSKRKNGIIKKAKEITVLCDANVSLV...,MGRGKIEIKRIENTSNRQVTYSKRKNGIIKKAKEITVLCDANVSLV...,[1],Ai (2017),13708,13708,Tagetes erecta,Tagetes erecta
...,...,...,...,...,...,...,...,...,...,...,...,...
4234,PmSEP4,PmSEP4,,,MGRGKVELKRIENKINQQVTFAKRRNGLLKKAYELSVLCDAEVALI...,MGRGKVELKRIENKINQQVTFAKRRNGLLKKAYELSVLCDAEVALI...,[1],Zhou (2017),102107,102107,Prunus mume,Prunus mume
4235,PmAG,PmSEP1,A7UGU4,,MAYENKSMSLDSPQRKLGRGKIEIKRIENTTNRQVTFCKRRNGLLK...,MGRGRVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCDAEVALI...,[0],Zhou (2017),102107,102107,Prunus mume,Prunus mume
4236,PmAG,PmSEP2,A7UGU4,,MAYENKSMSLDSPQRKLGRGKIEIKRIENTTNRQVTFCKRRNGLLK...,MGRGRVELKRIENKINRQVTFAKRRNGLLKKAYELSILCDAEVALI...,[1],Zhou (2017),102107,102107,Prunus mume,Prunus mume
4237,PmAG,PmSEP3,A7UGU4,,MAYENKSMSLDSPQRKLGRGKIEIKRIENTTNRQVTFCKRRNGLLK...,MGRGRVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCEAEVALI...,[1],Zhou (2017),102107,102107,Prunus mume,Prunus mume


In [6]:
# Summary Papers table
summary = pd.DataFrame(columns = ['Author', 'Year', 'Interactions', 'Taxon IDs', 'Species'])
df = mining_database
for paper in df['Origin'].unique():
    author, year = paper.split(' (')
    year = year[:-1]
    n_interactions = df[df['Origin'] == paper].shape[0]
    speciesA = df[df['Origin'] == paper]['Species_A'].unique().tolist()
    speciesB = df[df['Origin'] == paper]['Species_B'].unique().tolist()
    species = set(speciesA + speciesB)
    taxonIDsA = df[df['Origin'] == paper]['TaxonID_A'].unique().tolist()
    taxonIDsB = df[df['Origin'] == paper]['TaxonID_B'].unique().tolist()
    taxonIDs = set(taxonIDsA + taxonIDsB)
    taxonIDs = map(str, taxonIDs)
    summary = pd.concat([
        summary, 
        pd.DataFrame(
            [[author, year, n_interactions, ', '.join(taxonIDs), ', '.join(species)]], 
            columns = summary.columns)], 
            ignore_index = True)
summary.sort_values(['Author', 'Year'], inplace = True, )
summary.reset_index(drop = True, inplace = True)
summary.to_excel(path.LITERATUREMINING / 'Summary_Papers.xlsx', index = False)


### Scoring source

In [7]:
# Scoring database
scoring_df = pd.DataFrame()

for file in path.SCORING.iterdir():
    print(file.name)
    
    if 'LEDGE.xlsx' in file.name: continue

    scoring = Scoring(file.stem)
    df = scoring.process()
    scoring_df = pd.concat([scoring_df, df])

# Restart index
scoring_df.reset_index(drop = True, inplace = True)

SOC1&AGL14_MUT.xlsx
AGL14&.xlsx
AG&STK1.xlsx
SEP3&SEP4.xlsx
SOC1&AGL19.xlsx
SOC1&AGL14.xlsx
AGL17&AGL21.xlsx
AGL16&ANR1.xlsx
LEDGE.xlsx
SOC1&AGL42.xlsx
SEP1&SEP2.xlsx
AP1&CAL.xlsx


### Merge sources

In [8]:
source = pd.concat([scoring_df, mining_database, databases.df])
source.reset_index(drop = True, inplace = True)
source['Interaction'] = source['Interaction'].apply(lambda x: [x])
source

Unnamed: 0,bioID_A,bioID_B,UniProtID_A,UniProtID_B,Sequence_A,Sequence_B,Interaction,Origin,TaxonID_A,TaxonID_B,Species_A,Species_B
0,AG,AGL14δ58S,P17839,,MAYQSELGGDSSPLRKSGRGKIEIKRIENTTNRQVTFCRRRNGLLK...,MVRGKTEMKRIENATSRQVTFSKRRNGLLKKAFELSVLCDAEVALI...,"[[0, 0]]",SOC1&AGL14_MUT,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana
1,SHP1,AGL14δ58S,P29381,,MEEGGSSHDAESSKKLGRGKIEIKRIENTTNRQVTFCKRRNGLLKK...,MVRGKTEMKRIENATSRQVTFSKRRNGLLKKAFELSVLCDAEVALI...,"[[0, 0]]",SOC1&AGL14_MUT,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana
2,SEP1,AGL14δ58S,P29382,,MGRGRVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCDAEVALI...,MVRGKTEMKRIENATSRQVTFSKRRNGLLKKAFELSVLCDAEVALI...,"[[AUTO, 1]]",SOC1&AGL14_MUT,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana
3,SEP4-1,AGL14δ58S,P29383,,MGRGKVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCDAEIALL...,MVRGKTEMKRIENATSRQVTFSKRRNGLLKKAFELSVLCDAEVALI...,"[[0, 0]]",SOC1&AGL14_MUT,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana
4,SEP4-2,AGL14δ58S,P29383-2,,MGRGKVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCDAEIALL...,MVRGKTEMKRIENATSRQVTFSKRRNGLLKKAFELSVLCDAEVALI...,"[[0, 0]]",SOC1&AGL14_MUT,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana
...,...,...,...,...,...,...,...,...,...,...,...,...
7432,,,P29383,P29383,MGRGKVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCDAEIALL...,MGRGKVELKRIENKINRQVTFAKRRNGLLKKAYELSVLCDAEIALL...,[[1]],BioGRID,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana
7433,,,Q38876,Q38876,MGRGRVQLKRIENKINRQVTFSKRRSGLLKKAHEISVLCDAEVALI...,MGRGRVQLKRIENKINRQVTFSKRRSGLLKKAHEISVLCDAEVALI...,[[1]],BioGRID,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana
7434,,,P17839,P29385,MAYQSELGGDSSPLRKSGRGKIEIKRIENTTNRQVTFCKRRNGLLK...,MEGGASNEVAESSKKIGRGKIEIKRIENTTNRQVTFCKRRNGLLKK...,[[1]],PlaPPISite,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana
7435,,,P29381,P29385,MEEGGSSHDAESSKKLGRGKIEIKRIENTTNRQVTFCKRRNGLLKK...,MEGGASNEVAESSKKIGRGKIEIKRIENTTNRQVTFCKRRNGLLKK...,[[1]],PlaPPISite,3702,3702,Arabidopsis thaliana,Arabidopsis thaliana


In [9]:
# Remove redundancies between papers, studies and sources
redundancy = []
drop_col = []
for idx, row in source.iterrows():

    # Concatenate sequences
    seqs = '='.join([row['Sequence_A'], row['Sequence_B']])

    # No redundancy
    if seqs not in redundancy:
        redundancy.append(seqs)
        drop_col.append(False)
    # Redundancy
    else:
        redundancy.append('')
        drop_col.append(True)

        # Calculate original index
        original_idx = redundancy.index(seqs)

        # Update values
        source.at[original_idx, 'Interaction'] = source.at[original_idx, 'Interaction'] + source.at[idx, 'Interaction']
        source.at[original_idx, 'Origin'] = source.at[original_idx, 'Origin'] + '|' + source.at[idx, 'Origin']

# Drop redundancies
source = source[~pd.Series(drop_col)]
source.reset_index(drop = True, inplace = True)
# Split 'Origin' column
source['Origin'] = source['Origin'].apply(lambda x: x.split('|'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
# Drop interactions with only 'AUTO', 'NLW' or 'ND' values
drop_col = []
for idx, row in source.iterrows():
    interaction = [item for lst in row['Interaction'] for item in lst]
    if 1 in interaction or 0 in interaction or 'NC' in interaction:
        drop_col.append(False)
    else:
        drop_col.append(True)

source = source[~pd.Series(drop_col)]
source.reset_index(drop = True, inplace = True)

In [11]:
# Remove interactions with non-canonical aminoacids
source = source[~source['Sequence_A'].str.contains('[^ACDEFGHIKLMNPQRSTVWY]', regex = True)]
source = source[~source['Sequence_B'].str.contains('[^ACDEFGHIKLMNPQRSTVWY]', regex = True)]
source.reset_index(drop = True, inplace = True)

In [12]:
# Identify errors in sequences like '*', '-' or non-canonical amino acids like 'X
aminoacids = set('ACDEFGHIKLMNPQRSTVWY')
for idx, row in tqdm(source.iterrows(), total=len(source)):
    seqA = row['Sequence_A']
    seqB = row['Sequence_B']

    if set(seqA) - aminoacids:
        logger.warning(f'Non canonical amino acids in row {idx}: {set(seqA) - aminoacids}')
    if set(seqB) - aminoacids:
        logger.warning(f'Non canonical amino acids in row {idx}: {set(seqB) - aminoacids}')


100%|██████████| 5798/5798 [00:00<00:00, 11403.54it/s]


In [13]:
# Remove specific interactions
banned = [
        'SlRIN',            # It has 2 K domains
        'SlTM3',            # Does not have an M domain
        'ZaMADS70',         # Does not have an M domain
        'SlMBP13',          # Not a MIKC protein
        'TtAG1-del3-del13', # Deletion of kink between K1 and K2
        'Ta42G17',          # Does not have K2 and K3 domains
        'Ta57H08',          # Does not have an M domain
        ]
source = source[~source['bioID_A'].isin(banned)]
source = source[~source['bioID_B'].isin(banned)]
source.reset_index(drop = True, inplace = True)

### Save objects

In [14]:
# Add to Protein and ProteinProtein classes
for idx, row in tqdm(source.iterrows(), total=len(source)):
    
    # Add protein A
    if not Protein.in_database(row['Sequence_A']):
        p_A = Protein.new(
            bioID = row['bioID_A'],
            uniprotID = row['UniProtID_A'],
            seq = row['Sequence_A'],
            taxonID = int(row['TaxonID_A']),
            species = row['Species_A'],
            domains = Interactor(row['UniProtID_A']).domains
        )
        p_A.pickle()

    # Add protein B
    if not Protein.in_database(row['Sequence_B']):
        p_B = Protein.new(
            bioID = row['bioID_B'],
            uniprotID = row['UniProtID_B'],
            seq = row['Sequence_B'],
            taxonID = int(row['TaxonID_B']),
            species = row['Species_B'],
            domains = Interactor(row['UniProtID_B']).domains
        )
        p_B.pickle()
    
    # Add protein-protein interaction
    ppi = PPI.new(
        p1 = Protein(row['Sequence_A']),
        p2 = Protein(row['Sequence_B']),
        interaction = row['Interaction'],
        origin = row['Origin']
    )
    ppi.pickle()

# Sanity check -> all proteins are the unique ones
unique_proteins = set(source['Sequence_A']) | set(source['Sequence_B'])
in_database_proteins = list(path.PROTEIN.iterdir())
assert len(unique_proteins) == len(in_database_proteins), 'Not all proteins are unique'

100%|██████████| 5724/5724 [00:09<00:00, 581.59it/s]
