In [1]:
# Built-in modules
import pprint
from collections import Counter

# Third-party modules
import numpy as np
import pandas as pd
import sklearn as sk

# Custom modules
from src.entities.ppi import PPI
from src.misc.logger import logger
from src.entities.protein import Protein

In [2]:
# BioGRID, PlaPPISite and IntAct PPIs
plappisite, biogrid, intact, total = 0, 0, 0, 0
for ppi in PPI.iterate():
    if 'PlaPPISite' in ppi.origin or 'BioGRID' in ppi.origin or 'IntAct' in ppi.origin:
        total += 1
    if 'PlaPPISite' in ppi.origin:
        plappisite += 1
    elif 'BioGRID' in ppi.origin:
        biogrid += 1
    elif 'IntAct' in ppi.origin:
        intact += 1

logger.info(f'Total PPIs in databases: {total}')
logger.info(f'PlaPPISite: {plappisite}, BioGRID: {biogrid}, IntAct: {intact}')

100%|██████████| 5724/5724 [00:09<00:00, 614.68it/s] 
612109363.py INFO Total PPIs in databases: 437
612109363.py INFO PlaPPISite: 3, BioGRID: 38, IntAct: 396


In [3]:
# Total proteins and PPIs
prots = [p for p in Protein.iterate()]
ppis = [ppi for ppi in PPI.iterate()]
logger.info(f'Total proteins: {len(prots)}, Total PPIs: {len(ppis)}')

100%|██████████| 661/661 [00:00<00:00, 7030.96it/s]
100%|██████████| 5724/5724 [00:16<00:00, 347.22it/s] 
2422032191.py INFO Total proteins: 661, Total PPIs: 5724


In [4]:
# Origin of PPIs
single_origin = 0
multiple_origin = 0
origins = []
for ppi in PPI.iterate():
    if len(ppi.origin) == 1:
        single_origin += 1
        continue
    multiple_origin += 1
    has_database = lambda x: any(y in ('BioGRID', 'PlaPPISite', 'IntAct') for y in x )
    has_mining = lambda x: any('(' in y for y in x)
    has_independent = lambda x: any('&' in y for y in x)
    n_origins = has_database(ppi.origin) + has_mining(ppi.origin) + has_independent(ppi.origin)
    origins.append(n_origins)
origins = Counter(origins)
logger.info(f'Single origin: {single_origin}, Multiple origin: {multiple_origin}')
logger.info(f'Origins: {origins}')

100%|██████████| 5724/5724 [00:06<00:00, 834.17it/s] 
861304860.py INFO Single origin: 4607, Multiple origin: 1117
861304860.py INFO Origins: Counter({1: 661, 2: 387, 3: 69})


In [5]:
# Positive and negative PPIs
labels = [ppi.interaction for ppi in PPI.iterate(interact=True)]
positive = sum(labels)
negative = len(labels) - positive
logger.info(f'Positive PPIs: {positive}, Negative PPIs: {negative}')
ratio = negative / positive
logger.info(f'Ratio of negative PPIs: {ratio:.2f}')

100%|██████████| 5724/5724 [00:07<00:00, 765.23it/s] 
663214831.py INFO Positive PPIs: 1920, Negative PPIs: 3582
663214831.py INFO Ratio of negative PPIs: 1.87


In [10]:
# Zhang et al 2018 batch effects
zhang2018_arabidopsis = [
    1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,0,1,1,0,0,0,1,0,0,1,1,1,0,1,
    1,0,0,0,0,0,0,0,0,1,1,1,1,0,
    0,0,0,0,0,0,0,0,0,1,0,0,0,
    0,0,0,0,0,1,0,1,0,0,0,0,
    1,0,0,0,0,0,0,0,1,0,1,
    0,0,0,0,0,0,0,0,1,0,
    0,1,0,0,1,0,0,0,0,
    0,0,0,0,1,0,0,1,
    0,0,0,0,0,0,1,
    0,0,0,0,0,0,
    0,1,1,0,1,
    0,0,1,1,
    0,0,1,
    0,0,
    0
    ]
folter_arabidopsis = [
    0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
    0,0,0,1,0,0,0,1,0,0,1,0,1,0,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,1,1,0,1,0,1,0,0,
    0,0,0,1,0,0,1,0,1,0,1,
    0,0,1,0,0,1,0,0,0,1,
    0,1,0,0,1,0,0,0,0,
    0,0,0,0,1,1,1,1,
    0,0,0,0,0,0,1,
    0,0,0,0,0,1,
    0,1,1,0,1,
    0,0,0,1,
    1,0,1,
    0,1,
    1
]

accuracy = sk.metrics.accuracy_score(zhang2018_arabidopsis, folter_arabidopsis)
balanced_accuracy = sk.metrics.balanced_accuracy_score(zhang2018_arabidopsis, folter_arabidopsis)
logger.info(f'Accuracy Zhang2018 (true) vs Folter (predicted): {accuracy}')
logger.info(f'Balanced accuracy Zhang2018 (true) vs Folter (predicted): {balanced_accuracy}')
accuracy = sk.metrics.accuracy_score(folter_arabidopsis, zhang2018_arabidopsis)
balanced_accuracy = sk.metrics.balanced_accuracy_score(folter_arabidopsis, zhang2018_arabidopsis)
logger.info(f'Accuracy Folter (true) vs Zhang2018 (predicted): {accuracy}')
logger.info(f'Balanced accuracy Folter (true) vs Zhang2018 (predicted): {balanced_accuracy}')
logger.info('-'*50)

zhang2018_tomato = [
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
    0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,
    0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,
    0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,0,
    0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,1,0,1,1,1,0,0,1,
    0,0,0,0,1,0,1,1,0,0,0,0,
    0,0,0,0,0,0,1,0,0,0,0,
    0,0,1,0,0,1,0,1,0,0,
    0,1,0,0,0,0,0,0,0,
    1,0,1,0,1,0,0,0,
    0,0,0,1,0,0,0,
    0,0,1,1,0,1,
    1,1,1,1,1,
    0,1,1,1,
    0,0,1,
    0,0,
    0
]

leseberg_tomato = [
    0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,
    0,0,0,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,
    0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,
    1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,1,1,1,1,1,1,1,0,1,
    0,0,0,1,1,0,1,1,1,1,0,1,
    0,0,0,1,0,1,1,1,1,0,0,
    0,0,1,0,1,1,1,1,0,0,
    0,1,0,1,0,1,1,0,1,
    0,1,1,0,1,1,1,1,
    0,0,0,1,1,0,1,
    1,0,1,1,1,1,
    0,1,1,1,1,
    1,0,1,1,
    1,1,1,
    0,0,
    1
]

accuracy = sk.metrics.accuracy_score(zhang2018_tomato, leseberg_tomato)
balanced_accuracy = sk.metrics.balanced_accuracy_score(zhang2018_tomato, leseberg_tomato)
logger.info(f'Accuracy Zhang2018 (true) vs Leseberg (predicted): {accuracy}')
logger.info(f'Balanced accuracy Zhang2018 (true) vs Leseberg (predicted): {balanced_accuracy}')
accuracy = sk.metrics.accuracy_score(leseberg_tomato, zhang2018_tomato)
balanced_accuracy = sk.metrics.balanced_accuracy_score(leseberg_tomato, zhang2018_tomato)
logger.info(f'Accuracy Leseberg (true) vs Zhang2018 (predicted): {accuracy}')
logger.info(f'Balanced accuracy Leseberg (true) vs Zhang2018 (predicted): {balanced_accuracy}')

1483523831.py INFO Accuracy Zhang2018 (true) vs Folter (predicted): 0.7973856209150327
1483523831.py INFO Balanced accuracy Zhang2018 (true) vs Folter (predicted): 0.7290823211875843
1483523831.py INFO Accuracy Folter (true) vs Zhang2018 (predicted): 0.7973856209150327
1483523831.py INFO Balanced accuracy Folter (true) vs Zhang2018 (predicted): 0.7330663615560641
1483523831.py INFO --------------------------------------------------
1483523831.py INFO Accuracy Zhang2018 (true) vs Leseberg (predicted): 0.7445887445887446
1483523831.py INFO Balanced accuracy Zhang2018 (true) vs Leseberg (predicted): 0.7425381903642774
1483523831.py INFO Accuracy Leseberg (true) vs Zhang2018 (predicted): 0.7445887445887446
1483523831.py INFO Balanced accuracy Leseberg (true) vs Zhang2018 (predicted): 0.6698765432098766


In [8]:
# Classify PPI interactions
ppi_types = {
    'Single origin | single value | 1 or 0': 0,
    'Single origin | single value | NC': 0,
    'Single origin | single value | AUTO, NLW, ND, nan': 0,
    'Single origin | two values | 1, 0': 0,
    'Single origin | two values | NC, NC': 0,
    'Single origin | multiple values | _ < average < _ ': 0,
    'Single origin | multiple values | avergae > _': 0,
    'Single origin | multiple values | _ < average': 0,
    'Multiple origins | All Isas | ?': 0,
    'Multiple origins | All Isas | no ?': 0,
    'Multiple origins | agreement': 0,
    'Multiple origins | disagreement | majority vote': 0,
    'Multiple origins | disagreement | no majority': 0,
}

def score_single_origin_interactions(values):
    # Fully disregard nan, 'AUTO', 'ND', 'NLW', as they are not useful
    values = [v for v in values if v not in (None, 'AUTO', 'ND', 'NLW') and not pd.isna(v)]
    if values == []: return '?'
    # Consider 'NC' as 0.5, as it is sth in between 0 and 1
    values = [0.5 if v == 'NC' else v for v in values]
    # Single value -> return it if != 0.5
    if len(values) == 1:
        if values[0] == 0.5:
            return '?'
        return int(values[0])
    # Two values -> [1, 0] = 1 and [NC, NC] = ?
    if len(values) == 2:
        if 1 in values:
            return 1
        elif 0 in values:
            return 0
        elif values == [0.5, 0.5]:
            return '?'
        else:
            raise ValueError(f'Unexpected values: {values}')
    # Multiple values -> average
    average = sum(values) / len(values)
    if average > 3/5:
        return 1
    elif average < 2/5:
        return 0
    else:
        return '?'

for ppi in PPI.iterate():
    if len(ppi.origin) == 1:
        values = ppi.interaction[0]
        values = [v for v in values if v not in (None, 'AUTO', 'ND', 'NLW') and not pd.isna(v)]
        values = [0.5 if v == 'NC' else v for v in values]
        if values == []:
            ppi_types['Single origin | single value |  AUTO, NLW, ND, nan'] += 1
        elif values == [0.5]:
            ppi_types['Single origin | single value | NC'] += 1
        elif len(values) == 1 and values[0] in (0, 1):
            ppi_types['Single origin | single value | 1 or 0'] += 1
        elif len(values) == 2:
            if values == [0.5, 0.5]:
                ppi_types['Single origin | two values | NC, NC'] += 1
            else:
                ppi_types['Single origin | two values | 1, 0'] += 1
        elif len(values) > 2:
            average = sum(values) / len(values)
            if average < 2/5:
                ppi_types['Single origin | multiple values | _ < average'] += 1
            elif average > 3/5:
                ppi_types['Single origin | multiple values | avergae > _'] += 1
            else:
                ppi_types['Single origin | multiple values | _ < average < _ '] += 1
    if len(ppi.origin) > 1:
        origin2interactions = {origin:interaction for origin, interaction in zip(ppi.origin, ppi.interaction)}
        scoring_origin = [origin for origin in ppi.origin if '&' in origin]
        origin2interactions['scoring'] = []
        for k in scoring_origin:
            if k in scoring_origin:
                origin2interactions['scoring'].extend(origin2interactions[k])
                del origin2interactions[k]
        if origin2interactions['scoring'] == []:
            del origin2interactions['scoring']
        if len(origin2interactions) == 1 and 'scoring' in origin2interactions:
            #ppi_types['Multiple origins | All Isas'] += 1
            scored = score_single_origin_interactions(origin2interactions['scoring'])
            if scored == '?':
                ppi_types['Multiple origins | All Isas | ?'] += 1
            else:
                ppi_types['Multiple origins | All Isas | no ?'] += 1
            
        else:
            scored = [score_single_origin_interactions(interactions) for interactions in origin2interactions.values()]
            if len(set(scored)) == 1:
                ppi_types['Multiple origins | agreement'] += 1
                continue

            scored = [s for s in scored if s != '?']
            freqs = list(Counter(scored).values())
            is_balanced = lambda x: max(x) == min(x) 
            if len(set(scored)) == 1 or not is_balanced(freqs):
                ppi_types['Multiple origins | disagreement | majority vote'] += 1
            else:
                ppi_types['Multiple origins | disagreement | no majority'] += 1

ppis = [ppi for ppi in PPI.iterate()]
assert sum(ppi_types.values()) == len(ppis), 'The sum of the PPI types does not match the total number of PPIs.'
pprint.pp(ppi_types)

# Batch effects
total = ppi_types['Multiple origins | agreement'] + ppi_types['Multiple origins | disagreement | majority vote'] + ppi_types['Multiple origins | disagreement | no majority']
disagree = ppi_types['Multiple origins | disagreement | majority vote'] + ppi_types['Multiple origins | disagreement | no majority']
batch_effects = disagree / total * 100
logger.info(f'Batch effects: {batch_effects * 100:.2f}%')
max_acc = 100 - batch_effects + batch_effects / 2
logger.info(f'Maximum accuracy: {max_acc:.2f}%')

# Non-determined PPIs
non_determined = ppi_types['Single origin | single value | AUTO, NLW, ND, nan'] + \
                 ppi_types['Single origin | single value | NC'] + \
                 ppi_types['Single origin | two values | NC, NC'] + \
                 ppi_types['Single origin | multiple values | _ < average < _ '] + \
                 ppi_types['Multiple origins | All Isas | ?'] + \
                 ppi_types[ 'Multiple origins | disagreement | no majority']
logger.info(f'PPIs without a clear label: {non_determined} -> {non_determined / len(ppis) * 100:.2f}%')

100%|██████████| 5724/5724 [00:11<00:00, 494.63it/s]
100%|██████████| 5724/5724 [00:02<00:00, 1940.27it/s]
757271456.py INFO Batch effects: 3859.06%
757271456.py INFO Maximum accuracy: 80.70%
757271456.py INFO PPIs without a clear label: 222 -> 3.88%


{'Single origin | single value | 1 or 0': 2428,
 'Single origin | single value | NC': 8,
 'Single origin | single value | AUTO, NLW, ND, nan': 0,
 'Single origin | two values | 1, 0': 2115,
 'Single origin | two values | NC, NC': 0,
 'Single origin | multiple values | _ < average < _ ': 5,
 'Single origin | multiple values | avergae > _': 16,
 'Single origin | multiple values | _ < average': 35,
 'Multiple origins | All Isas | ?': 52,
 'Multiple origins | All Isas | no ?': 469,
 'Multiple origins | agreement': 366,
 'Multiple origins | disagreement | majority vote': 73,
 'Multiple origins | disagreement | no majority': 157}


In [9]:
species = set()
for ppi in PPI.iterate():
    species.add(ppi.p1.taxonID)
    species.add(ppi.p2.taxonID)
logger.info(f'Total species: {len(species)}')

100%|██████████| 5724/5724 [00:12<00:00, 466.66it/s]
2665704710.py INFO Total species: 74
