In [2]:
"""
https://wiki.geneontology.org/index.php/Guide_to_GO_Evidence_Codes

TODO
add arg parser
transform into a script
add more evidence codes
use GOA parser?
"""

import pandas as pd
import numpy as np
import networkx
# conda install -c biobuilds obonet
import obonet

In [3]:
data_dir = "/home/damiano/Projects/CAFA-evaluator_data"

# The OBO must have "ontology: IDPO" header (first line)
graph = obonet.read_obo("{}/go_21_oct_2022.obo".format(data_dir))
# graph.nodes(data=True)
df_ont = pd.DataFrame([[node[0], node[1]['namespace'], node[1]['name']] for node in graph.nodes(data=True)], columns=['term', 'namespace', 'name'])
df_ont

Unnamed: 0,term,namespace,name
0,GO:0000001,biological_process,mitochondrion inheritance
1,GO:0000002,biological_process,mitochondrial genome maintenance
2,GO:0000003,biological_process,reproduction
3,GO:0000006,molecular_function,high-affinity zinc transmembrane transporter a...
4,GO:0000007,molecular_function,low-affinity zinc ion transmembrane transporte...
...,...,...,...
43324,GO:2001314,biological_process,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...
43325,GO:2001315,biological_process,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...
43326,GO:2001316,biological_process,kojic acid metabolic process
43327,GO:2001317,biological_process,kojic acid biosynthetic process


In [4]:
# Create the ancestors dictionary
ancestors_dict = {}
for node in graph.nodes(data=True):
    # print(node[0], networkx.descendants(graph, node[0]), node[1].get('is_a'))
    ancestors_dict[node[0]] = networkx.descendants(graph, node[0])
ancestors_dict

{'GO:0000001': {'GO:0006996',
  'GO:0007005',
  'GO:0008150',
  'GO:0009987',
  'GO:0016043',
  'GO:0048308',
  'GO:0048311',
  'GO:0051179',
  'GO:0051640',
  'GO:0051646',
  'GO:0071840'},
 'GO:0000002': {'GO:0006996',
  'GO:0007005',
  'GO:0008150',
  'GO:0009987',
  'GO:0016043',
  'GO:0071840'},
 'GO:0000003': {'GO:0008150'},
 'GO:0000006': {'GO:0000041',
  'GO:0003674',
  'GO:0005215',
  'GO:0005385',
  'GO:0006810',
  'GO:0006811',
  'GO:0006812',
  'GO:0006829',
  'GO:0008150',
  'GO:0008324',
  'GO:0009987',
  'GO:0015075',
  'GO:0015318',
  'GO:0022857',
  'GO:0022890',
  'GO:0030001',
  'GO:0034220',
  'GO:0046873',
  'GO:0046915',
  'GO:0051179',
  'GO:0051234',
  'GO:0055085',
  'GO:0071577',
  'GO:0098655',
  'GO:0098660',
  'GO:0098662'},
 'GO:0000007': {'GO:0000041',
  'GO:0003674',
  'GO:0005215',
  'GO:0005385',
  'GO:0006810',
  'GO:0006811',
  'GO:0006812',
  'GO:0006829',
  'GO:0008150',
  'GO:0008324',
  'GO:0009987',
  'GO:0015075',
  'GO:0015318',
  'GO:0022857'

In [5]:
# Parse Swiss-Prot annotations, filter by evidence codes and add ancestors
# EXP, IDA, IMP, IGI, IEP, TAS, IC
eco_mapping = [('EXP', 'ECO:0000269'), ('IDA', 'ECO:0000314'), ('IPI', 'ECO:0000353'), ('IMP', 'ECO:0000315'),
               ('IGI', 'ECO:0000316'), ('IEP', 'ECO:0000270'), ('HTP', 'ECO:0006056'), ('HDA', 'ECO:0007005'),
               ('HMP', 'ECO:0007001'), ('HGI', 'ECO:0007003'), ('HEP', 'ECO:0007007'), ('ISS', 'ECO:0000250'),
               ('ISO', 'ECO:0000266'), ('ISA', 'ECO:0000247'), ('ISM', 'ECO:0000255'), ('IGC', 'ECO:0000317'),
               ('IBA', 'ECO:0000318'), ('IBD', 'ECO:0000319'), ('IKR', 'ECO:0000320'), ('IRD', 'ECO:0000321')]
valid_eco = set([eco for ec, eco in eco_mapping if ec in ['EXP', 'IDA', 'IMP', 'IGI', 'IEP', 'TAS', 'IC']])

reference_file = "{}/uniprot_sprot_go.tsv".format(data_dir)

reference_dataset = {}
with open(reference_file) as f:
    for line in f:
        acc, term, eco = line.strip().split()
        if eco in valid_eco and term in ancestors_dict:
            reference_dataset.setdefault(acc, set()).add(term)
            reference_dataset[acc].update(ancestors_dict[term])

In [6]:
ia = {}  # {term: [observed ancestors]
for acc in reference_dataset:
    for term in df_ont['term']:
        parents = set(graph.successors(term))
        if parents.issubset(reference_dataset[acc]):
            ia.setdefault(term, [0, 0])
            ia[term][0] += 1
        parents.add(term)
        if parents.issubset(reference_dataset[acc]):
            ia.setdefault(term, [0, 0])
            ia[term][1] += 1
print(ia)

{'GO:0000003': [55765, 5758], 'GO:0000035': [20593, 5], 'GO:0000149': [8602, 76], 'GO:0000322': [7354, 258], 'GO:0000323': [7354, 848], 'GO:0000325': [7354, 141], 'GO:0000407': [24268, 85], 'GO:0000408': [11459, 14], 'GO:0000417': [11459, 8], 'GO:0000922': [610, 192], 'GO:0000943': [13788, 2], 'GO:0000974': [11459, 121], 'GO:0001098': [8602, 68], 'GO:0001114': [11459, 0], 'GO:0001115': [3210, 3], 'GO:0001505': [7270, 363], 'GO:0001533': [15424, 51], 'GO:0001534': [422, 4], 'GO:0001772': [15424, 49], 'GO:0001775': [46847, 1366], 'GO:0001784': [36, 15], 'GO:0001846': [8602, 19], 'GO:0001848': [8602, 25], 'GO:0001891': [15424, 62], 'GO:0001906': [46847, 431], 'GO:0001917': [44938, 72], 'GO:0001918': [8602, 1], 'GO:0001965': [8602, 20], 'GO:0001968': [8602, 30], 'GO:0002039': [8602, 14], 'GO:0002046': [8602, 3], 'GO:0002096': [5867, 2], 'GO:0002133': [11459, 2], 'GO:0002162': [8602, 7], 'GO:0002167': [11459, 0], 'GO:0002177': [2245, 19], 'GO:0002193': [11459, 3], 'GO:0002376': [55765, 3557

In [7]:
df_prob = pd.DataFrame([[k, *v] for k, v in ia.items()], columns=['term', 'co_occurring_parents', 'co_occurring'])
df_prob['p_cond'] = df_prob['co_occurring'] / df_prob['co_occurring_parents']
df_prob['ia'] = -np.log2(df_prob['p_cond'])
df_prob.to_csv('{}/information_content.tsv'.format(data_dir), sep='\t', index=False)
df_prob

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,term,co_occurring_parents,co_occurring,p_cond,ia
0,GO:0000003,55765,5758,0.103255,3.275720
1,GO:0000035,20593,5,0.000243,12.007938
2,GO:0000149,8602,76,0.008835,6.822529
3,GO:0000322,7354,258,0.035083,4.833086
4,GO:0000323,7354,848,0.115311,3.116393
...,...,...,...,...,...
37581,GO:1904126,1,0,0.000000,inf
37582,GO:1904934,1,1,1.000000,-0.000000
37583,GO:0080169,1,1,1.000000,-0.000000
37584,GO:0021899,1,1,1.000000,-0.000000
