# Ontology overview
This notebook contains code for converting go.obo file into dataframe object and export it as csv. There is also a function `check_all_paths`, which accepts as arguments previously loaded with obonet graph and list of gene ontology (GO) id's. Function looks for relations between all nodes in list and returns all of it in that format: [[(node1, node2, relation), node1, node2, relation], [...]].

For next improvments, there is need to:
- Convert code into object paradigm, and use ex. fastAPI for comunication with frontend
- Optimize `check_all_paths` function and use multithreading

In [1]:
import obonet
import networkx as nx
import pandas as pd

graph = obonet.read_obo('go.obo')

nodes_df = pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient='index').reset_index()
nodes_df = nodes_df.rename(columns={'index': 'id'})
print("Nodes DataFrame Head:")
print(nodes_df.head())

edges_df = nx.to_pandas_edgelist(graph)
print("Edges DataFrame Head:")
print(edges_df.head())


print("Final Nodes DataFrame:")
print(nodes_df.head())
print("Final Edges DataFrame:")
print(edges_df.head())

Nodes DataFrame Head:
           id                                               name  \
0  GO:0000001                          mitochondrion inheritance   
1  GO:0000002                   mitochondrial genome maintenance   
2  GO:0000006  high-affinity zinc transmembrane transporter a...   
3  GO:0000007  low-affinity zinc ion transmembrane transporte...   
4  GO:0000009             alpha-1,6-mannosyltransferase activity   

            namespace                                                def  \
0  biological_process  "The distribution of mitochondria, including t...   
1  biological_process  "The maintenance of the structure and integrit...   
2  molecular_function  "Enables the transfer of zinc ions (Zn2+) from...   
3  molecular_function  "Enables the transfer of a solute or solutes f...   
4  molecular_function  "Catalysis of the transfer of a mannose residu...   

                                             synonym  \
0             ["mitochondrial inheritance" EXACT []]   


In [2]:
nodes_df[['id', 'name', 'namespace', 'def', 'is_a', 'relationship']].head()

Unnamed: 0,id,name,namespace,def,is_a,relationship
0,GO:0000001,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...","[GO:0048308, GO:0048311]",
1,GO:0000002,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",[GO:0007005],
2,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...",[GO:0005385],
3,GO:0000007,low-affinity zinc ion transmembrane transporte...,molecular_function,"""Enables the transfer of a solute or solutes f...",[GO:0005385],
4,GO:0000009,"alpha-1,6-mannosyltransferase activity",molecular_function,"""Catalysis of the transfer of a mannose residu...",[GO:0000030],


In [3]:
nodes_df[['id', 'name', 'namespace', 'def', 'is_a', 'relationship']].to_csv('go_nodes.csv', index=False)

In [4]:
def check_all_shortest_paths(graph, ontology_ids):
    found_paths_with_relations = []
    for i, node1 in enumerate(ontology_ids):
        for node2 in ontology_ids[i+1:]:
            if nx.has_path(graph, node1, node2):
                path = nx.shortest_path(graph, node1, node2)
                path_with_relations = []
                
                for j in range(len(path) - 1):
                    start_node = path[j]
                    end_node = path[j + 1]
                    relations = graph[start_node][end_node]
                    relation_type = next(iter(relations), None)
                    path_with_relations.append((start_node, end_node, relation_type))
                
                found_paths_with_relations.append(path_with_relations)
             
    return found_paths_with_relations

In [5]:
ontology_ids = nodes_df['id'].sample(100).tolist()
found_paths_with_relations = check_all_shortest_paths(graph, ontology_ids)

import json
with open('found_paths_with_relations.json', 'w') as f:
    json.dump(found_paths_with_relations, f, indent=4)
found_paths_with_relations

[[('GO:0098960', 'GO:0030594', 'is_a')],
 [('GO:0030126', 'GO:0030663', 'part_of'),
  ('GO:0030663', 'GO:0030660', 'is_a')]]

In [6]:
import pandas as pd

def get_connections(df, start_node):
    connections = []
    visited = set()
    stack = [start_node]

    while stack:
        current_node = stack.pop()
        if current_node in visited:
            continue
        visited.add(current_node)
        
        row = df.loc[df['id'] == current_node]
        if row.empty:
            continue
        
        is_a_values = row['is_a'].values[0]
        if type(is_a_values) != float:
            for target_node in is_a_values:
                connections.append((current_node, target_node, 'is_a'))
                stack.append(target_node)
        
        relationship_values = row['relationship'].values[0]
        if type(relationship_values) != float:
            for relationship in relationship_values:
                rel_type, target_node = relationship.split(' ')
                connections.append((current_node, target_node, rel_type))
                stack.append(target_node)
    
    return connections

connections = []
for i in range(len(found_paths_with_relations)):
    a = get_connections(nodes_df, found_paths_with_relations[i][0][0])
    connections.extend(a)
print(connections)
print(len(connections))
connections = set(connections) # remove duplicates if there is any
print(len(connections))


[('GO:0098960', 'GO:0030594', 'is_a'), ('GO:0098960', 'GO:0045211', 'occurs_in'), ('GO:0098960', 'GO:0099565', 'part_of'), ('GO:0099565', 'GO:0007166', 'is_a'), ('GO:0099565', 'GO:0050877', 'is_a'), ('GO:0099565', 'GO:0098794', 'occurs_in'), ('GO:0099565', 'GO:0007268', 'part_of'), ('GO:0007268', 'GO:0098916', 'is_a'), ('GO:0098916', 'GO:0099537', 'is_a'), ('GO:0099537', 'GO:0099536', 'is_a'), ('GO:0099536', 'GO:0007267', 'is_a'), ('GO:0099536', 'GO:0045202', 'occurs_in'), ('GO:0045202', 'GO:0030054', 'is_a'), ('GO:0030054', 'GO:0110165', 'is_a'), ('GO:0110165', 'GO:0005575', 'is_a'), ('GO:0007267', 'GO:0007154', 'is_a'), ('GO:0007267', 'GO:0023052', 'is_a'), ('GO:0023052', 'GO:0050789', 'is_a'), ('GO:0050789', 'GO:0065007', 'is_a'), ('GO:0050789', 'GO:0008150', 'regulates'), ('GO:0065007', 'GO:0008150', 'is_a'), ('GO:0007154', 'GO:0009987', 'is_a'), ('GO:0009987', 'GO:0008150', 'is_a'), ('GO:0098794', 'GO:0110165', 'is_a'), ('GO:0098794', 'GO:0045202', 'part_of'), ('GO:0050877', 'GO:0

In [7]:
import json
with open('connections.json', 'w') as f:
    json.dump(list(connections), f, indent=4)

In [9]:
import sqlite3

conn = sqlite3.connect('viz4go_backend/go_terms.db')
query = "SELECT * FROM go_terms LIMIT 1000"
df = pd.read_sql_query(query, conn)
conn.close()

df

Unnamed: 0,id,name,namespace,def,is_a,relationship
0,GO:0000001,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...","GO:0048308,GO:0048311",
1,GO:0000002,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",GO:0007005,
2,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...",GO:0005385,
3,GO:0000007,low-affinity zinc ion transmembrane transporte...,molecular_function,"""Enables the transfer of a solute or solutes f...",GO:0005385,
4,GO:0000009,"alpha-1,6-mannosyltransferase activity",molecular_function,"""Catalysis of the transfer of a mannose residu...",GO:0000030,
...,...,...,...,...,...,...
995,GO:0001937,negative regulation of endothelial cell prolif...,biological_process,"""Any process that stops, prevents, or reduces ...","GO:0001936,GO:0050680",negatively_regulates GO:0001935
996,GO:0001938,positive regulation of endothelial cell prolif...,biological_process,"""Any process that activates or increases the r...","GO:0001936,GO:0050679",positively_regulates GO:0001935
997,GO:0001939,female pronucleus,cellular_component,"""The pronucleus originating from the ovum that...",GO:0045120,
998,GO:0001940,male pronucleus,cellular_component,"""The pronucleus originating from the spermatoz...",GO:0045120,


In [16]:
go_terms = set()
for term1, term2, _ in connections:
    go_terms.add(term1)
    go_terms.add(term2)
print(len(go_terms))

placeholders = ', '.join('?' for _ in go_terms)
query = f"""
SELECT DISTINCT id, name, namespace, def, is_a, relationship
FROM go_terms
WHERE id IN ({placeholders})
"""

conn = sqlite3.connect('viz4go_backend/go_terms.db')

df = pd.read_sql_query(query, conn, params=list(go_terms))

conn.close()

df

64


Unnamed: 0,id,name,namespace,def,is_a,relationship
0,GO:0003008,system process,biological_process,"""A multicellular organismal process carried ou...",GO:0032501,
1,GO:0003674,molecular_function,molecular_function,"""A molecular process that can be carried out b...",,
2,GO:0005575,cellular_component,cellular_component,"""A location, relative to cellular compartments...",,
3,GO:0005622,intracellular anatomical structure,cellular_component,"""A component of a cell contained within (but n...",GO:0110165,
4,GO:0005737,cytoplasm,cellular_component,"""The contents of a cell excluding the plasma m...",GO:0110165,part_of GO:0005622
...,...,...,...,...,...,...
59,GO:0098960,postsynaptic neurotransmitter receptor activity,molecular_function,"""Neurotransmitter receptor activity occurring ...",GO:0030594,"occurs_in GO:0045211,part_of GO:0099565"
60,GO:0099536,synaptic signaling,biological_process,"""Cell-cell signaling to, from or within a syna...",GO:0007267,occurs_in GO:0045202
61,GO:0099537,trans-synaptic signaling,biological_process,"""Cell-cell signaling in either direction acros...",GO:0099536,
62,GO:0099565,"chemical synaptic transmission, postsynaptic",biological_process,"""The part of synaptic transmission occurring i...","GO:0007166,GO:0050877","occurs_in GO:0098794,part_of GO:0007268"


In [19]:
# convert df into json
import json
df_json = json.loads(df.to_json(orient='records'))
with open('go_terms.json', 'w') as f:
    json.dump(df_json, f, indent=4)