In [1]:

def parse_obo(obo_file):
    """
    Parse the ontology (OBO format) and store into a
    dictionary, exclude obsolete terms
    
    graph = {
        <term_id>: {
            id: <term_id>, 
            name: <definition>, 
            is_a: [<parent_id>, ...] 
            is_obsolete: False, 
            namespace: <namespace>
        }
    }
    """
    graph = {}  # { term_id : term_object }
    obj = {}  # { id: term_id, name: definition, is_a: list_of_parents, is_obsolete: True, namespace: namespace }
    with open(obo_file) as f:
        for line in f:
            line = line.strip().split(": ")
            if line and len(line) > 1:
                # print(line)
                k, v = line[:2]
                if k == "id" and v.startswith("GO:"):
                    obj["id"] = v
                elif k == "name":
                    obj["def"] = v
                elif k == "is_a" and v.startswith("GO:"):
                    obj.setdefault("is_a", []).append(v.split()[0])
                elif k == "is_obsolete":
                    obj["is_obsolete"] = True
                elif k == "namespace":
                    obj["namespace"] = v
            else:
                if obj.get("id") and not obj.get("is_obsolete"):
                    if "is_a" not in obj:
                        obj["is_root"] = True
                    graph[obj["id"]] = obj
                    # print(obj)
                obj = {}
    return graph

# Gene ontology OBO file is available at
# http://geneontology.org/docs/download-ontology/
graph = parse_obo("data/go.obo")
graph

roots = set()    
    for node in graph:
        if graph[node].get("is_root"):
            roots.add(node)
# roots

{'GO:0000001': {'id': 'GO:0000001',
  'def': 'mitochondrion inheritance',
  'namespace': 'biological_process',
  'is_a': ['GO:0048308', 'GO:0048311']},
 'GO:0000002': {'id': 'GO:0000002',
  'def': 'mitochondrial genome maintenance',
  'namespace': 'biological_process',
  'is_a': ['GO:0007005']},
 'GO:0000003': {'id': 'GO:0000003',
  'def': 'reproduction',
  'namespace': 'biological_process',
  'is_a': ['GO:0008150']},
 'GO:0000006': {'id': 'GO:0000006',
  'def': 'high-affinity zinc transmembrane transporter activity',
  'namespace': 'molecular_function',
  'is_a': ['GO:0005385']},
 'GO:0000007': {'id': 'GO:0000007',
  'def': 'low-affinity zinc ion transmembrane transporter activity',
  'namespace': 'molecular_function',
  'is_a': ['GO:0005385']},
 'GO:0000009': {'id': 'GO:0000009',
  'def': 'alpha-1,6-mannosyltransferase activity',
  'namespace': 'molecular_function',
  'is_a': ['GO:0000030']},
 'GO:0000010': {'id': 'GO:0000010',
  'def': 'trans-hexaprenyltranstransferase activity',
  

In [10]:
def get_ancestors(graph, roots):
    """
    Build a dictionary of ancestors
    and calculate terms depth (shortest path)
    """
    depth = {}  # { term : depth }
    ancestors = {}  # { term : list_of_ancestor_terms }
    for node in graph:
        c = 0
        node_ancestors = []
        node_parents = graph[node].get("is_a")

        # Loop parents levels (parents of parents) until no more parents
        while node_parents:
            c += 1

            # Set root
            if node not in depth and roots.intersection(set(node_parents)):
                depth[node] = c

            # Add ancestors
            node_ancestors.extend(node_parents)

            # Update the list of parents (1 level up)
            node_parents = [term for parent in node_parents for term in graph[parent].get("is_a", [])]

        ancestors[node] = set(node_ancestors)
    return ancestors, depth


ancestors, depth = get_ancestors(graph, roots)
# roots
# depth
ancestors

{'GO:0000001': {'GO:0006996',
  'GO:0007005',
  'GO:0008150',
  'GO:0009987',
  'GO:0016043',
  'GO:0048308',
  'GO:0048311',
  'GO:0051179',
  'GO:0051640',
  'GO:0051641',
  'GO:0051646',
  'GO:0071840'},
 'GO:0000002': {'GO:0006996',
  'GO:0007005',
  'GO:0008150',
  'GO:0009987',
  'GO:0016043',
  'GO:0071840'},
 'GO:0000003': {'GO:0008150'},
 'GO:0000006': {'GO:0003674',
  'GO:0005215',
  'GO:0005385',
  'GO:0008324',
  'GO:0015075',
  'GO:0015318',
  'GO:0022857',
  'GO:0022890',
  'GO:0046873',
  'GO:0046915'},
 'GO:0000007': {'GO:0003674',
  'GO:0005215',
  'GO:0005385',
  'GO:0008324',
  'GO:0015075',
  'GO:0015318',
  'GO:0022857',
  'GO:0022890',
  'GO:0046873',
  'GO:0046915'},
 'GO:0000009': {'GO:0000030',
  'GO:0003674',
  'GO:0003824',
  'GO:0016740',
  'GO:0016757',
  'GO:0016758'},
 'GO:0000010': {'GO:0003674',
  'GO:0003824',
  'GO:0004659',
  'GO:0016740',
  'GO:0016765'},
 'GO:0000011': {'GO:0006996',
  'GO:0007033',
  'GO:0008150',
  'GO:0009987',
  'GO:0016043',
 

In [6]:
def get_children(ancestors):
    children = {}  # { node : list_of_children }, leaf terms are not keys
    for node in ancestors:
        for ancestor in ancestors[node]:
            children.setdefault(ancestor, set()).add(node)
    return children

children = get_children(ancestors)
children

{'GO:0051646': {'GO:0000001',
  'GO:0019896',
  'GO:0034640',
  'GO:0034642',
  'GO:0034643',
  'GO:0047497',
  'GO:0048311',
  'GO:0048312',
  'GO:0051654',
  'GO:0051659',
  'GO:0098939',
  'GO:0098957',
  'GO:0098958',
  'GO:0098959',
  'GO:0098972'},
 'GO:0051641': {'GO:0000001',
  'GO:0000054',
  'GO:0000055',
  'GO:0000056',
  'GO:0000132',
  'GO:0000743',
  'GO:0000745',
  'GO:0001766',
  'GO:0001844',
  'GO:0002175',
  'GO:0003425',
  'GO:0006404',
  'GO:0006405',
  'GO:0006406',
  'GO:0006407',
  'GO:0006408',
  'GO:0006409',
  'GO:0006605',
  'GO:0006606',
  'GO:0006607',
  'GO:0006610',
  'GO:0006611',
  'GO:0006612',
  'GO:0006613',
  'GO:0006614',
  'GO:0006616',
  'GO:0006620',
  'GO:0006622',
  'GO:0006623',
  'GO:0006625',
  'GO:0006626',
  'GO:0006839',
  'GO:0006853',
  'GO:0006886',
  'GO:0006888',
  'GO:0006893',
  'GO:0006895',
  'GO:0006896',
  'GO:0006904',
  'GO:0006913',
  'GO:0007034',
  'GO:0007038',
  'GO:0007041',
  'GO:0007079',
  'GO:0007080',
  'GO:00070

# Exercise

1. How many different ancestors and leaf terms?

2. How many ancestors and leaf terms?

3. How many terms are children of GO:0016791?

4. How many terms for each sub-ontology?

5. Which is the term with the largest number of ancestors?

6. How many leaf terms for each sub-ontology?

7. How many terms at minimum depth 2 (1 node between the root and the term)

8. How many terms for each branch at depth 1?

In [8]:
# How many different ancestors and leaf terms?
ancestors_ids = set([ancestor for node in ancestors for ancestor in ancestors[node]])
print("Ancestors", len(ancestors_ids))  # number of ancestors
print("Leaves", len(set(graph.keys()) - ancestors_ids))  # number of leaf terms

# How many ancestors and leaf terms?
print("Ancestors", len(children))  # number of ancestors
print("Leaves", len(set(graph.keys()) - set(children.keys())))  # number of leaf terms

# How many terms are children of GO:0016791?

# Counting from ancestors
print("Children GO:0016791", len([node for node in ancestors if "GO:0016791" in ancestors[node]]))  # number of children

# The same but querying children dictionary
print("Children GO:0016791", len(children["GO:0016791"]), graph["GO:0016791"]["def"])  # number of children

# How many terms for each sub-ontology?
for root in roots:
    print("Children", root, graph[root], len(children[root]))

# Which is the term with the largest number of ancestors?
node = sorted([(len(ancestors[node]), node) for node in ancestors])[-1]
print("Larger set of ancestors", node, graph[node[1]])

# How many leaf terms for each sub-ontology?
for root in roots:
    print(graph[root]["namespace"], len(set(children[root]) - set(children.keys())))  # number of leaf terms

# How many terms at minimum depth 2 (1 node between the root and the term)
print(len([node for node in depth if depth[node] == 2]))

# How many terms for each branch at depth 1?
print(len([node for node in depth if depth[node] == 1]))

Ancestors 16424
Leaves 27367
Ancestors 16424
Leaves 27367
Children GO:0016791 177
Children GO:0016791 177 phosphatase activity
Children GO:0008150 {'id': 'GO:0008150', 'def': 'biological_process', 'namespace': 'biological_process', 'is_root': True} 28437
Children GO:0005575 {'id': 'GO:0005575', 'def': 'cellular_component', 'namespace': 'cellular_component', 'is_root': True} 4182
Children GO:0003674 {'id': 'GO:0003674', 'def': 'molecular_function', 'namespace': 'molecular_function', 'is_root': True} 11169
Larger set of ancestors (88, 'GO:0071042') {'id': 'GO:0071042', 'def': 'nuclear polyadenylation-dependent mRNA catabolic process', 'namespace': 'biological_process', 'is_a': ['GO:0071047']}
biological_process 14981
cellular_component 3225
molecular_function 9161
1409
55


In [25]:
#!/usr/binx/env python

import gzip


def gen_block(file_name):
    """
    Genrator function that parses GOA GAF files (https://www.ebi.ac.uk/GOA/downloads)
    The generator yields a block of lines corresponding to the same protein
    UniProtKB       A0A024R1R8      hCG_2014768             GO:0002181      PMID:21873635   IBA     PANTHER:PTN002008372|SGD:S000007246     P       HCG2014768, isoform CRA_a       hCG_2014768     protein taxon:9606      20171102        GO_Central
    UniProtKB       A0A024RBG1      NUDT4B          GO:0003723      GO_REF:0000037  IEA     UniProtKB-KW:KW-0694    F       Diphosphoinositol polyphosphate phosphohydrolase NUDT4B NUDT4B  protein taxon:9606      20191109        UniProt
    UniProtKB       A0A024RBG1      NUDT4B          GO:0005829      GO_REF:0000052  IDA             C       Diphosphoinositol polyphosphate phosphohydrolase NUDT4B NUDT4B  protein taxon:9606      20161204        HPA
    """
    with gzip.open(file_name) as f:
        name, old_name = None, None
        chunk = []
        for line in f:
            line = line.decode()
            if line and line[0] != "!":
                _, name, _, _, term, _, ec, _, namespace, protein_name = line.split("\t")[:10]
                if old_name and chunk and name != old_name:
                    yield (old_name, set(chunk))  # return a set as there can be repetitions, i.e. the same term with different evidence codes
                    chunk = []
                old_name = name
                chunk.append(term)
        # Last line
        if old_name:
            yield (old_name, set(chunk))

# https://www.ebi.ac.uk/GOA/downloads
annotations_generator = gen_block("data/goa_human.gaf.gz")

for acc, annotations in annotations_generator:
    print(acc, annotations)

A0A024R1R8 {'GO:0002181'}
A0A024RBG1 {'GO:1901911', 'GO:0005634', 'GO:0046872', 'GO:0034432', 'GO:0003723', 'GO:0050072', 'GO:1901907', 'GO:0071543', 'GO:0005737', 'GO:0034431', 'GO:1901909', 'GO:0052842', 'GO:0008486', 'GO:0000298', 'GO:0052840', 'GO:0005829'}
A0A075B6H5 {'GO:0005886', 'GO:0007166'}
A0A075B6H7 {'GO:0006955', 'GO:0002250', 'GO:0005886', 'GO:0019814', 'GO:0005615'}
A0A075B6H8 {'GO:0006955', 'GO:0002250', 'GO:0005886', 'GO:0019814', 'GO:0005615'}
A0A075B6H9 {'GO:0006955', 'GO:0002250', 'GO:0005886', 'GO:0019814', 'GO:0005615'}
A0A075B6I0 {'GO:0006955', 'GO:0002250', 'GO:0005886', 'GO:0019814', 'GO:0005615'}
A0A075B6I1 {'GO:0006955', 'GO:0002250', 'GO:0005886', 'GO:0019814', 'GO:0005615'}
A0A075B6I3 {'GO:0006955', 'GO:0002250', 'GO:0005886', 'GO:0019814', 'GO:0005615'}
A0A075B6I4 {'GO:0006955', 'GO:0002250', 'GO:0005886', 'GO:0019814', 'GO:0005615'}
A0A075B6I6 {'GO:0006955', 'GO:0002250', 'GO:0005886', 'GO:0019814', 'GO:0005615'}
A0A075B6I7 {'GO:0006955', 'GO:0002250', 'G

O00422 {'GO:0000381', 'GO:0045892', 'GO:0005634', 'GO:0008380', 'GO:0061574', 'GO:0005515', 'GO:0048025', 'GO:0016607', 'GO:0016604', 'GO:0005654', 'GO:0035145', 'GO:0000118', 'GO:0003723', 'GO:0043065', 'GO:0003714', 'GO:0006357', 'GO:0006397', 'GO:0005829'}
O00423 {'GO:0015630', 'GO:0007405', 'GO:0048471', 'GO:0000226', 'GO:0005515', 'GO:0002244', 'GO:0007420', 'GO:0008017', 'GO:0005874', 'GO:0015631', 'GO:0005875', 'GO:0005509', 'GO:1990023', 'GO:0007052', 'GO:0072686', 'GO:0005829', 'GO:0097431'}
O00425 {'GO:0009653', 'GO:0005634', 'GO:0010468', 'GO:0005515', 'GO:0045182', 'GO:0048027', 'GO:0007399', 'GO:0051028', 'GO:0005737', 'GO:0003729', 'GO:0001817', 'GO:0051252', 'GO:0006412', 'GO:0003730', 'GO:0003723', 'GO:0017148', 'GO:0005829'}
O00429 {'GO:0070585', 'GO:0090141', 'GO:0048471', 'GO:0005737', 'GO:1903146', 'GO:0090023', 'GO:0005789', 'GO:0000266', 'GO:0061025', 'GO:0050714', 'GO:0007029', 'GO:0010468', 'GO:0090200', 'GO:1904666', 'GO:0016559', 'GO:0030672', 'GO:0005739', 'G

O60645 {'GO:0030496', 'GO:0030667', 'GO:0048471', 'GO:0005515', 'GO:0000145', 'GO:0000149', 'GO:0015031', 'GO:0042734', 'GO:0006887', 'GO:0045296', 'GO:0030426', 'GO:0005794', 'GO:0051601', 'GO:0005829'}
O60656 {'GO:0042803', 'GO:0016021', 'GO:0052697', 'GO:0042573', 'GO:0006805', 'GO:0019899', 'GO:0015020', 'GO:0005783', 'GO:0005789', 'GO:0052695', 'GO:0052696', 'GO:0001972', 'GO:0051552'}
O60658 {'GO:0046872', 'GO:0071364', 'GO:0060548', 'GO:0006355', 'GO:0004114', 'GO:0007165', 'GO:0070062', 'GO:0001934', 'GO:0070374', 'GO:0006198', 'GO:1903206', 'GO:0019900', 'GO:0047555', 'GO:0004115', 'GO:0005829'}
O60662 {'GO:0005856', 'GO:0031430', 'GO:0005737', 'GO:0005886', 'GO:0045661', 'GO:0005789', 'GO:0030239', 'GO:0031463', 'GO:0006941', 'GO:0035914', 'GO:0031143', 'GO:0005654', 'GO:0033017', 'GO:0001726', 'GO:0016567', 'GO:0005515', 'GO:0045214', 'GO:0005829', 'GO:0048741', 'GO:2000291', 'GO:2001014'}
O60663 {'GO:0005634', 'GO:0046872', 'GO:0000981', 'GO:0045944', 'GO:0005515', 'GO:0006

P00540 {'GO:0004672', 'GO:0000212', 'GO:0051296', 'GO:0106310', 'GO:0006325', 'GO:0005515', 'GO:0004712', 'GO:0000165', 'GO:0007165', 'GO:0005737', 'GO:0004709', 'GO:0004674', 'GO:0043410', 'GO:0040020', 'GO:0046777', 'GO:1902103', 'GO:0005524', 'GO:0005829'}
P00558 {'GO:0045121', 'GO:0006094', 'GO:0004618', 'GO:0030855', 'GO:0031639', 'GO:0005515', 'GO:0043531', 'GO:0016310', 'GO:0047134', 'GO:0016020', 'GO:0070062', 'GO:0006096', 'GO:0071456', 'GO:0016525', 'GO:0005615', 'GO:0005524', 'GO:0005829'}
P00568 {'GO:0006165', 'GO:0046940', 'GO:0001520', 'GO:0009142', 'GO:0005737', 'GO:0046034', 'GO:0006172', 'GO:0046033', 'GO:0070062', 'GO:0015949', 'GO:0004017', 'GO:0004550', 'GO:0005524', 'GO:0005829'}
P00709 {'GO:0006915', 'GO:0000139', 'GO:0042742', 'GO:0007165', 'GO:0005615', 'GO:0007267', 'GO:0005509', 'GO:0005989', 'GO:0004461', 'GO:0005796'}
P00734 {'GO:0051480', 'GO:0030194', 'GO:0005576', 'GO:0005886', 'GO:0070945', 'GO:0007596', 'GO:0008360', 'GO:0005788', 'GO:0005615', 'GO:0008

P0CG47 {'GO:0019941', 'GO:0005634', 'GO:0031315', 'GO:0061136', 'GO:0005737', 'GO:0051881', 'GO:0005886', 'GO:0031982', 'GO:0031398', 'GO:0005789', 'GO:0097009', 'GO:0005615', 'GO:0047497', 'GO:0008585', 'GO:0021888', 'GO:0005654', 'GO:0005739', 'GO:0048812', 'GO:0072520', 'GO:1901214', 'GO:1902255', 'GO:0043025', 'GO:0031625', 'GO:0016567', 'GO:0007144', 'GO:0030666', 'GO:0005515', 'GO:0070062', 'GO:0005829', 'GO:1902527', 'GO:0007141', 'GO:0031386', 'GO:0010008', 'GO:0043005', 'GO:0060613', 'GO:0005741'}
P0CG48 {'GO:0019941', 'GO:0005634', 'GO:0031315', 'GO:0005737', 'GO:0002020', 'GO:0005886', 'GO:0031982', 'GO:0005789', 'GO:0005615', 'GO:0005654', 'GO:0031625', 'GO:0016567', 'GO:0030666', 'GO:0005515', 'GO:0070062', 'GO:0005829', 'GO:0031386', 'GO:0010008', 'GO:0003723', 'GO:0005741'}
P0CH99 {'GO:0005634', 'GO:0046872', 'GO:0000981', 'GO:0005515', 'GO:0000977', 'GO:0001227', 'GO:0000122', 'GO:0006357'}
P0CI00 {'GO:0005634', 'GO:0046872', 'GO:0000981', 'GO:0000977', 'GO:0001227', 'G

P21399 {'GO:0005737', 'GO:0051539', 'GO:0046872', 'GO:0006417', 'GO:0006879', 'GO:0005739', 'GO:0006099', 'GO:0009791', 'GO:0051538', 'GO:0005515', 'GO:0047780', 'GO:0070062', 'GO:0030350', 'GO:0005794', 'GO:0010040', 'GO:0005829', 'GO:0050892', 'GO:0006101', 'GO:0003723', 'GO:0005783', 'GO:0003994'}
P21439 {'GO:0005737', 'GO:0032376', 'GO:0005886', 'GO:0015629', 'GO:0046581', 'GO:0045121', 'GO:0042626', 'GO:2001140', 'GO:0099038', 'GO:0140359', 'GO:0016021', 'GO:0005548', 'GO:0006629', 'GO:0005654', 'GO:0099040', 'GO:0061092', 'GO:0055085', 'GO:0055088', 'GO:0090554', 'GO:0030136', 'GO:0005524', 'GO:0005925', 'GO:1903413', 'GO:0005515', 'GO:1901557', 'GO:0070062', 'GO:0045332', 'GO:0005887', 'GO:0016324', 'GO:0005829', 'GO:0042908', 'GO:0016020', 'GO:0042910', 'GO:0090555', 'GO:0032782'}
P21452 {'GO:0006936', 'GO:0005886', 'GO:0007217', 'GO:0004995', 'GO:0014057', 'GO:0035106', 'GO:0033685', 'GO:0007588', 'GO:0070474', 'GO:0005515', 'GO:0070459', 'GO:0061827', 'GO:0051602', 'GO:004311

P36543 {'GO:0016787', 'GO:0000221', 'GO:0005768', 'GO:0005515', 'GO:0051117', 'GO:1902600', 'GO:0005902', 'GO:0016324', 'GO:0070062', 'GO:0016469', 'GO:0005765', 'GO:0016241', 'GO:0046961', 'GO:0005829'}
P36544 {'GO:1904645', 'GO:0005892', 'GO:0050893', 'GO:0007165', 'GO:0005886', 'GO:0022848', 'GO:1905144', 'GO:0032720', 'GO:0050808', 'GO:1902004', 'GO:0045766', 'GO:1905906', 'GO:0016021', 'GO:0015643', 'GO:0042391', 'GO:0001934', 'GO:1901214', 'GO:0007614', 'GO:0035094', 'GO:0001540', 'GO:0043410', 'GO:1902991', 'GO:0060079', 'GO:0006874', 'GO:2000463', 'GO:0007613', 'GO:0097061', 'GO:0005262', 'GO:0050890', 'GO:0098815', 'GO:0005515', 'GO:1902430', 'GO:0017081', 'GO:0042166', 'GO:0034220', 'GO:0070374', 'GO:0007268', 'GO:0045211', 'GO:0045202', 'GO:0050877', 'GO:0005216', 'GO:0008284', 'GO:0005887', 'GO:0001666', 'GO:1900273', 'GO:0070588', 'GO:0042803', 'GO:0007271', 'GO:0007611', 'GO:0044853', 'GO:0051247', 'GO:0015464', 'GO:0140059', 'GO:0006811', 'GO:1905920', 'GO:0030594', 'GO:

P53355 {'GO:0002834', 'GO:0071346', 'GO:0005634', 'GO:1990722', 'GO:0071447', 'GO:0005737', 'GO:0005886', 'GO:0010506', 'GO:0015629', 'GO:0035556', 'GO:0043065', 'GO:1904094', 'GO:0043280', 'GO:0005524', 'GO:0002357', 'GO:0006915', 'GO:0004672', 'GO:0043066', 'GO:0008625', 'GO:0106310', 'GO:0005515', 'GO:0005516', 'GO:0004674', 'GO:0046777', 'GO:0006468', 'GO:0017148', 'GO:0005525', 'GO:0042802', 'GO:0004683', 'GO:0042981', 'GO:0004712', 'GO:0010508', 'GO:0017075', 'GO:2000310', 'GO:0097190'}
P53365 {'GO:0005938', 'GO:0070273', 'GO:0005737', 'GO:0005886', 'GO:0030032', 'GO:0006886', 'GO:0045296', 'GO:0007264', 'GO:0005543', 'GO:0001726', 'GO:0030036', 'GO:0005515', 'GO:0034315', 'GO:0031267', 'GO:0030742', 'GO:0005829', 'GO:0005525', 'GO:0042802', 'GO:0140090', 'GO:0031529', 'GO:0032588', 'GO:0019904', 'GO:0000423', 'GO:0034497'}
P53367 {'GO:0000139', 'GO:0070273', 'GO:0005515', 'GO:0006886', 'GO:0034315', 'GO:0032588', 'GO:0050708', 'GO:1905280', 'GO:0019904', 'GO:0005543', 'GO:000582

P67812 {'GO:0008233', 'GO:0005515', 'GO:0016021', 'GO:0006465', 'GO:0005787', 'GO:0005789', 'GO:0004252'}
P67870 {'GO:0005634', 'GO:0051101', 'GO:0005576', 'GO:0016055', 'GO:0005956', 'GO:0007165', 'GO:0005737', 'GO:0005886', 'GO:0034622', 'GO:0010862', 'GO:0018107', 'GO:0046872', 'GO:0019887', 'GO:0043537', 'GO:0080163', 'GO:0005654', 'GO:0061629', 'GO:0003682', 'GO:0034774', 'GO:0008285', 'GO:0005515', 'GO:0033211', 'GO:0070062', 'GO:0004674', 'GO:0031519', 'GO:0006468', 'GO:0005829', 'GO:0042802', 'GO:0005102', 'GO:0032927', 'GO:1904813', 'GO:0019904', 'GO:0061154'}
P67936 {'GO:0005884', 'GO:0008307', 'GO:0006936', 'GO:0005856', 'GO:0002102', 'GO:0051015', 'GO:0046982', 'GO:0030863', 'GO:0005515', 'GO:0070062', 'GO:0005509', 'GO:0001725', 'GO:0007015', 'GO:0005829', 'GO:0001649', 'GO:0042802', 'GO:0042803', 'GO:0016020', 'GO:0005862', 'GO:0005925'}
P68032 {'GO:0000146', 'GO:0005884', 'GO:0005737', 'GO:0030017', 'GO:0005869', 'GO:0005615', 'GO:0030175', 'GO:0090131', 'GO:0031674', 'G

Q12933 {'GO:0005164', 'GO:0043120', 'GO:0034351', 'GO:0043123', 'GO:0005938', 'GO:0051091', 'GO:0005174', 'GO:0007250', 'GO:0023019', 'GO:0000151', 'GO:0007165', 'GO:0034622', 'GO:0019899', 'GO:0002726', 'GO:0031996', 'GO:0019901', 'GO:0048255', 'GO:0043254', 'GO:0045121', 'GO:0008270', 'GO:0071732', 'GO:0097300', 'GO:0097057', 'GO:0051092', 'GO:0005654', 'GO:1903265', 'GO:0002947', 'GO:0031625', 'GO:0031435', 'GO:0070059', 'GO:0051865', 'GO:0097400', 'GO:2001238', 'GO:0004842', 'GO:0005515', 'GO:0032743', 'GO:1990604', 'GO:0030674', 'GO:0070534', 'GO:0030163', 'GO:1901215', 'GO:0044877', 'GO:0005829', 'GO:0065003', 'GO:0046625', 'GO:1903721', 'GO:0012506', 'GO:0042802', 'GO:0035631', 'GO:0042981', 'GO:0043507', 'GO:0043122', 'GO:0009898', 'GO:0034976', 'GO:0046330', 'GO:0033209', 'GO:0002637', 'GO:0019903'}
Q12934 {'GO:0005938', 'GO:0005515', 'GO:0005212', 'GO:0005737', 'GO:0048469', 'GO:0005886', 'GO:0008150', 'GO:0005882', 'GO:0045109', 'GO:0070307', 'GO:0005200'}
Q12946 {'GO:000563

Q15596 {'GO:0005634', 'GO:0030374', 'GO:0045475', 'GO:0000978', 'GO:0046983', 'GO:0005737', 'GO:0000122', 'GO:0000785', 'GO:0001162', 'GO:0016922', 'GO:0006355', 'GO:0005654', 'GO:0003682', 'GO:0010906', 'GO:0005515', 'GO:0017162', 'GO:0032870', 'GO:1904017', 'GO:0005667', 'GO:0032991', 'GO:0003713', 'GO:0045944', 'GO:0016604', 'GO:0032922', 'GO:0019904', 'GO:0032570'}
Q15599 {'GO:0043495', 'GO:0008022', 'GO:0005634', 'GO:0042802', 'GO:0012505', 'GO:0005515', 'GO:0005102', 'GO:0031799', 'GO:0005886', 'GO:0070062', 'GO:0019902', 'GO:0072659', 'GO:0031800', 'GO:0008013', 'GO:0045296', 'GO:0016324', 'GO:0065003', 'GO:0005925'}
Q155Q3 {'GO:0003779', 'GO:0005856', 'GO:0043679', 'GO:0030177', 'GO:0021869', 'GO:0032956', 'GO:0050772', 'GO:0060070', 'GO:0043025', 'GO:0031435', 'GO:0070507', 'GO:0021799', 'GO:0005515', 'GO:0043015', 'GO:0045665', 'GO:0032991', 'GO:0005829', 'GO:0021695', 'GO:0046330', 'GO:0019904', 'GO:0005925'}
Q15612 {'GO:0007608', 'GO:0004930', 'GO:0016021', 'GO:0005886', 'G

Q5SW24 {'GO:0007162', 'GO:0002244', 'GO:0070097', 'GO:0005737', 'GO:0003382', 'GO:0051018', 'GO:0008134', 'GO:0043588', 'GO:0005080', 'GO:0008013', 'GO:1900108', 'GO:0072061'}
Q5SW79 {'GO:0120103', 'GO:0005814', 'GO:0005515', 'GO:0005874', 'GO:0005813', 'GO:0072686', 'GO:0005829'}
Q5SW96 {'GO:0055037', 'GO:0030159', 'GO:0071345', 'GO:0005886', 'GO:1905581', 'GO:0035612', 'GO:0042982', 'GO:0005883', 'GO:0009925', 'GO:0031623', 'GO:0050750', 'GO:0035650', 'GO:0030276', 'GO:1904707', 'GO:0048260', 'GO:0030669', 'GO:0001540', 'GO:1903076', 'GO:1905602', 'GO:0034383', 'GO:0035591', 'GO:0005515', 'GO:0006898', 'GO:0042632', 'GO:0005829', 'GO:0030424', 'GO:0005769', 'GO:0001784', 'GO:0030301', 'GO:0009898', 'GO:0090118', 'GO:0035615', 'GO:0090205', 'GO:0005546', 'GO:0043393', 'GO:0008203'}
Q5SWA1 {'GO:0019888', 'GO:0042542', 'GO:1903898', 'GO:0005515', 'GO:0034976', 'GO:0070262', 'GO:0032516', 'GO:0006983', 'GO:0005783', 'GO:0000164', 'GO:1903912', 'GO:0001933'}
Q5SWH9 {'GO:0005515', 'GO:0016

Q6UXH9 {'GO:0005509', 'GO:0005576'}
Q6UXI7 {'GO:0030198', 'GO:0062023', 'GO:0005576', 'GO:0021510', 'GO:0005539', 'GO:0010811', 'GO:0005614'}
Q6UXI9 {'GO:0005576', 'GO:0097195', 'GO:0030485', 'GO:0005201', 'GO:0005178', 'GO:0030511', 'GO:0045184', 'GO:0071356', 'GO:0001657', 'GO:0007160', 'GO:0005604', 'GO:0062023', 'GO:0070062', 'GO:0070374', 'GO:0005509', 'GO:0045987', 'GO:0010811', 'GO:0030198', 'GO:2000721', 'GO:0033631', 'GO:0016020', 'GO:0001658', 'GO:0010694', 'GO:0045669'}
Q6UXK2 {'GO:0045773', 'GO:0005515', 'GO:0016021', 'GO:0005886', 'GO:0009986'}
Q6UXK5 {'GO:0051965', 'GO:0005515', 'GO:0016021', 'GO:0031012', 'GO:0005615'}
Q6UXL0 {'GO:0048873', 'GO:0032703', 'GO:0002765', 'GO:0005515', 'GO:0032753', 'GO:0005886', 'GO:0019221', 'GO:0042130', 'GO:0004896', 'GO:0042015', 'GO:0032733', 'GO:0005887', 'GO:0032689', 'GO:0002437', 'GO:0001808'}
Q6UXM1 {'GO:0005515', 'GO:0016021', 'GO:0032474', 'GO:0005886', 'GO:0031012', 'GO:0030659', 'GO:0005615'}
Q6UXN2 {'GO:0005576', 'GO:0045087'

Q86Z14 {'GO:0005104', 'GO:0005515', 'GO:0004553', 'GO:0016021', 'GO:0005886', 'GO:0017134', 'GO:0008543', 'GO:0090080', 'GO:0008284', 'GO:0005975'}
Q86Z20 {'GO:0042802', 'GO:0005515', 'GO:2000146', 'GO:0005737', 'GO:0003674', 'GO:0090630', 'GO:0035024'}
Q86Z23 {'GO:0048147', 'GO:0042802', 'GO:0005515', 'GO:0045599', 'GO:0070373', 'GO:0005581', 'GO:0005615'}
Q8HWS3 {'GO:0005634', 'GO:0000978', 'GO:0001228', 'GO:0000977', 'GO:0000785', 'GO:0003309', 'GO:0003311', 'GO:0000981', 'GO:0042593', 'GO:0090104', 'GO:0031018', 'GO:0045893', 'GO:0005515', 'GO:0035774', 'GO:0006357', 'GO:0003310', 'GO:0045944', 'GO:0050796', 'GO:0000976'}
Q8IU54 {'GO:0005576', 'GO:0051607', 'GO:0045345', 'GO:0005615', 'GO:0002829', 'GO:0032729', 'GO:0032003', 'GO:0043381', 'GO:0045893', 'GO:0005125', 'GO:0008285', 'GO:0045892', 'GO:0032002', 'GO:0032696', 'GO:0046427', 'GO:0045087', 'GO:0032714', 'GO:0042531', 'GO:0007259', 'GO:0005102', 'GO:0050778', 'GO:0045581'}
Q8IU57 {'GO:0005515', 'GO:0051607', 'GO:0016021', 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Q9H5X1 {'GO:0046872', 'GO:0097428', 'GO:0007059', 'GO:0005515', 'GO:0005654', 'GO:0097361', 'GO:0106035', 'GO:0016226', 'GO:0005829'}
Q9H5Y7 {'GO:0051965', 'GO:0001964', 'GO:0090102', 'GO:0005886', 'GO:0035264', 'GO:1905606', 'GO:0007409', 'GO:0007416', 'GO:0008344', 'GO:0009986', 'GO:0060007', 'GO:0002093', 'GO:0005887', 'GO:0021562', 'GO:0031223', 'GO:0007601', 'GO:0007605', 'GO:0060384', 'GO:0002088'}
Q9H5Z1 {'GO:0000398', 'GO:0003724', 'GO:0016887', 'GO:0071013', 'GO:0003723', 'GO:0001701', 'GO:0005524'}
Q9H5Z6 {'GO:0005515', 'GO:0005739', 'GO:0005654'}
Q9H609 {'GO:0005634', 'GO:0046872', 'GO:0000981', 'GO:0043565', 'GO:0005515', 'GO:0006357'}
Q9H611 {'GO:0043139', 'GO:0051974', 'GO:0000781', 'GO:0000287', 'GO:0000002', 'GO:0032508', 'GO:0016887', 'GO:0005654', 'GO:0010521', 'GO:0005739', 'GO:0006310', 'GO:0005524', 'GO:0042162', 'GO:0006281', 'GO:0051880', 'GO:0007004', 'GO:0005657', 'GO:0017116', 'GO:0006260', 'GO:0032211', 'GO:0033678', 'GO:0032204'}
Q9H628 {'GO:0005525', 'GO:00

Q9NXW2 {'GO:0016020', 'GO:0030544', 'GO:0051085', 'GO:0036503', 'GO:0034622', 'GO:0030176', 'GO:0030433', 'GO:0005789', 'GO:0071218', 'GO:0005783', 'GO:0031965'}
Q9NXW9 {'GO:0003779', 'GO:0005737', 'GO:0032451', 'GO:0016491', 'GO:0046872', 'GO:0035511', 'GO:0005730', 'GO:0006325', 'GO:0070938', 'GO:0080111', 'GO:0035516', 'GO:0030496', 'GO:0005515', 'GO:0006482', 'GO:0070989', 'GO:0036090', 'GO:0031032', 'GO:1902275', 'GO:0016706'}
Q9NXX6 {'GO:0005634', 'GO:0006281', 'GO:0005515', 'GO:0016604', 'GO:0030915', 'GO:0005654', 'GO:2001022'}
Q9NXZ1 {'GO:0034472', 'GO:0016604', 'GO:0005654', 'GO:0032039'}
Q9NXZ2 {'GO:0003724', 'GO:0005515', 'GO:0016887', 'GO:0003723', 'GO:0005524'}
Q9NY12 {'GO:0031429', 'GO:0005515', 'GO:0090661', 'GO:0000454', 'GO:0000781', 'GO:0005654', 'GO:0005697', 'GO:0001650', 'GO:0007004', 'GO:0034513', 'GO:0003723', 'GO:0072589', 'GO:0070034'}
Q9NY15 {'GO:0030666', 'GO:0005044', 'GO:0007155', 'GO:0005515', 'GO:0015035', 'GO:0005886', 'GO:0007267', 'GO:0006898', 'GO:00

Q9UM73 {'GO:0007165', 'GO:0038061', 'GO:0004713', 'GO:0005886', 'GO:0048666', 'GO:0007169', 'GO:0097009', 'GO:0060159', 'GO:0033674', 'GO:0016310', 'GO:0051092', 'GO:0018108', 'GO:1900006', 'GO:0043410', 'GO:0005524', 'GO:0050995', 'GO:0045664', 'GO:0005515', 'GO:0036269', 'GO:0070062', 'GO:0042127', 'GO:0046777', 'GO:0005887', 'GO:0004704', 'GO:0032991', 'GO:0042802', 'GO:0030534', 'GO:0042981', 'GO:0004712', 'GO:0004714', 'GO:0090648', 'GO:0021766', 'GO:0043235'}
Q9UM82 {'GO:0007283', 'GO:0030159', 'GO:0005515', 'GO:0010803', 'GO:0005737', 'GO:0050727', 'GO:1990108', 'GO:0070536', 'GO:0072520', 'GO:0001650', 'GO:1990381', 'GO:0060544', 'GO:0005654', 'GO:0044877', 'GO:0070266'}
Q9UMD9 {'GO:0030198', 'GO:0005604', 'GO:0030020', 'GO:0062023', 'GO:0005576', 'GO:0005515', 'GO:0008544', 'GO:0005911', 'GO:0005201', 'GO:0005886', 'GO:0005887', 'GO:0031012', 'GO:0005788', 'GO:0005581', 'GO:0030056', 'GO:0007160', 'GO:0031581', 'GO:0005615'}
Q9UMF0 {'GO:0007155', 'GO:0005515', 'GO:0006909', 'G

In [28]:
# How many proteins are directly annotated with 
# "regulation of kinase activity" (GO:0043549)
# and "mitochondrion" (GO:0005739)?

proteins_counts = {}  # { term : number_of_proteins }
annotations_generator = gen_block("data/goa_human.gaf.gz")
for acc, annotations in annotations_generator:
    for term in annotations:
        proteins_counts.setdefault(term, 0)
        proteins_counts[term] += 1
print(graph["GO:0043549"]["def"], proteins_counts.get("GO:0043549"))
print(graph["GO:0005739"]["def"], proteins_counts.get("GO:0005739"))

regulation of kinase activity 6
mitochondrion 1318


In [29]:
# How many proteins are "regulation of kinase activity" (GO:0043549)?
# According to the "true path rule" you have to count direct annotations
# plus proteins annotated with children of that term

proteins = {}  # { term : proteins_annotated_with_term } It contains proteins annotated directly with the term or its children
annotations_generator = gen_block("data/goa_human.gaf.gz")
for acc, annotations in annotations_generator:
    # Copy direct annotations
    terms = copy.copy(annotations)
    # Add ancestors
    for term in annotations:
        terms.update(ancestors.get(term, set()))
    # For each term add protein accession to proteins dict
    for term in terms:
        proteins.setdefault(term, set()).add(acc)
print(graph["GO:0043549"]["def"], len(proteins["GO:0043549"]))
print(graph["GO:0005739"]["def"], len(proteins["GO:0005739"]))

regulation of kinase activity 748
mitochondrion 1318


In [30]:
# Which are the 5 most abundant "biological process" terms 
# in mitochondrial proteins (GO:0005739 mitochondrion)?

terms_count = {}  # { term : count } count within the mitochondrial proteins set
annotations_generator = gen_block("data/goa_human.gaf.gz")
for acc, annotations in annotations_generator:
    if acc in proteins["GO:0005739"]:
        # Copy direct annotations
        terms = copy.copy(annotations)
        # Add ancestors
        for term in annotations:
            terms.update(ancestors.get(term, set()))
        # For each term add protein accession to proteins dict
        for term in terms:
            terms_count.setdefault(term, 0)
            terms_count[term] += 1

# Sort by count and filter by biological_process namespace
data = sorted([(k, v) for k, v in terms_count.items()], key=lambda x: x[1], reverse=True)
for (k, v) in list(filter(lambda x: graph[x[0]]["namespace"] == "biological_process", data))[:20]:
    print(k, v, graph[k]["def"])

KeyError: 'GO:1902586'

In [None]:

# *** Which are the top 20 most enriched terms in mitochondrial proteins (annotated with GO:0005739)?
#     Measure the ratio between a term in mitochondrial proteins and in the rest of the human proteins
#     and select those with the higher "fold-increase" (ratio)
terms_set = {}  # { term : count }  mitochondrial proteins
terms_rest = {}  #  { term : count }  other proteins
proteins_set = 0  # number of mitochondrial proteins
proteins_rest = 0  # number of remaining proteins
annotations_generator = gen_block("data/goa_human.gaf.gz")
for acc, annotations in annotations_generator:
    # Copy direct annotations
    terms = copy.copy(annotations)
    # Add ancestors
    for term in annotations:
        terms.update(ancestors.get(term, set()))
    # For each term add protein accession to proteins dict

    if acc in proteins["GO:0005739"]:
        proteins_set += 1
        for term in terms:
            terms_set.setdefault(term, 0)
            terms_set[term] += 1
    else:
        proteins_rest += 1
        for term in terms:
            terms_rest.setdefault(term, 0)
            terms_rest[term] += 1

data = []
for term in terms_set:
    ratio_set = (terms_set[term] + 1) / proteins_set  # add pseudo count
    ratio_rest = terms_rest.get(term, 1) / proteins_rest  # add pseudo count
    fold_increase = ratio_set / ratio_rest
    data.append((term, terms_set[term], terms_rest.get(term, 0), ratio_set, ratio_rest, fold_increase, graph[term]["namespace"], graph[term]["def"]))
for ele in sorted(data, key=lambda x: x[5], reverse=True)[:20]:
    print("{} {} {} {:.2g} {:.2g} {:.2g} {} {}".format(*ele))