# Ontology overview
This notebook converts the `go.obo` file into a pandas DataFrame and exports it as CSV. It also defines the function `check_all_shortest_paths`, which takes a graph previously loaded with obonet and a list of Gene Ontology (GO) ids. The function looks for a shortest path between each pair of ids in the list and returns every path it finds as a list of `(node1, node2, relation)` edges, in this format: `[[(node1, node2, relation), (node2, node3, relation), ...], [...]]`.

Planned improvements:
- Convert the code to an object-oriented design and use e.g. FastAPI for communication with the frontend
- Optimize the `check_all_shortest_paths` function and use multithreading

In [2]:
import obonet
import networkx as nx
import pandas as pd

# Read the GO .obo file with obonet; the result is used as a networkx graph below.
graph = obonet.read_obo('viz4go_backend/data/go.obo')

# One row per node: the per-node attribute dicts become columns and the node
# key (moved out of the index) becomes the 'id' column.
nodes_df = (
    pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient='index')
    .reset_index()
    .rename(columns={'index': 'id'})
)

# One row per edge of the graph.
edges_df = nx.to_pandas_edgelist(graph)

print("Nodes DataFrame Head:")
print(nodes_df.head())
print("Edges DataFrame Head:")
print(edges_df.head())

# Neither frame is modified after it is built, so these previews repeat the
# ones printed just above.
print("Final Nodes DataFrame:")
print(nodes_df.head())
print("Final Edges DataFrame:")
print(edges_df.head())

Nodes DataFrame Head:
           id                                               name  \
0  GO:0000001                          mitochondrion inheritance   
1  GO:0000002                   mitochondrial genome maintenance   
2  GO:0000006  high-affinity zinc transmembrane transporter a...   
3  GO:0000007  low-affinity zinc ion transmembrane transporte...   
4  GO:0000009             alpha-1,6-mannosyltransferase activity   

            namespace                                                def  \
0  biological_process  "The distribution of mitochondria, including t...   
1  biological_process  "The maintenance of the structure and integrit...   
2  molecular_function  "Enables the transfer of zinc ions (Zn2+) from...   
3  molecular_function  "Enables the transfer of a solute or solutes f...   
4  molecular_function  "Catalysis of the transfer of a mannose residu...   

                                             synonym  \
0             ["mitochondrial inheritance" EXACT []]   


In [2]:
# Preview the first rows of the term metadata ('id', 'name', 'namespace', 'def')
# together with the two relation columns ('is_a', 'relationship').
nodes_df[['id', 'name', 'namespace', 'def', 'is_a', 'relationship']].head()

Unnamed: 0,id,name,namespace,def,is_a,relationship
0,GO:0000001,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...","[GO:0048308, GO:0048311]",
1,GO:0000002,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",[GO:0007005],
2,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...",[GO:0005385],
3,GO:0000007,low-affinity zinc ion transmembrane transporte...,molecular_function,"""Enables the transfer of a solute or solutes f...",[GO:0005385],
4,GO:0000009,"alpha-1,6-mannosyltransferase activity",molecular_function,"""Catalysis of the transfer of a mannose residu...",[GO:0000030],


In [3]:
# Write the selected node columns to go_nodes.csv in the working directory;
# index=False keeps the DataFrame's row index out of the file.
nodes_df[['id', 'name', 'namespace', 'def', 'is_a', 'relationship']].to_csv('go_nodes.csv', index=False)

In [3]:
def check_all_shortest_paths(graph, ontology_ids, both_directions=False):
    """Find a shortest path, annotated with edge relations, between pairs of ids.

    Every pair of positions in ``ontology_ids`` is visited once, in list order.
    By default only the direction ``earlier id -> later id`` is searched, so on
    a directed graph the result depends on the order of ``ontology_ids``. Pass
    ``both_directions=True`` to also search ``later id -> earlier id`` (meant
    for directed graphs; on an undirected graph it would report each path twice).

    Args:
        graph: networkx graph supporting ``graph[u][v]`` lookups (the notebook
            passes the graph returned by ``obonet.read_obo``).
        ontology_ids: Sequence of node ids; it is sliced, so it must be a
            list-like sequence rather than a set or generator.
        both_directions: When True, the reverse direction of every pair of
            distinct ids is searched as well. Defaults to False.

    Returns:
        list: One entry per path found. Each entry is a list of
        ``(start_node, end_node, relation_type)`` tuples, one per edge along
        the path. ``relation_type`` is the first key of ``graph[start][end]``
        (or None when that mapping is empty); the notebook output shows
        relation names such as 'is_a' for the obonet graph.

    Raises:
        networkx.NodeNotFound: Propagated from ``nx.shortest_path`` when an id
            is not a node of ``graph``.
    """
    found_paths_with_relations = []
    for i, node1 in enumerate(ontology_ids):
        for node2 in ontology_ids[i + 1:]:
            directions = [(node1, node2)]
            if both_directions and node1 != node2:
                directions.append((node2, node1))

            for source, target in directions:
                # A single search per direction: nx.has_path would run the
                # same shortest-path search once more just to answer yes/no.
                try:
                    path = nx.shortest_path(graph, source, target)
                except nx.NetworkXNoPath:
                    continue

                path_with_relations = []
                for start_node, end_node in zip(path, path[1:]):
                    relations = graph[start_node][end_node]
                    # Only the first key is reported, even if the mapping
                    # between the two nodes holds several.
                    relation_type = next(iter(relations), None)
                    path_with_relations.append((start_node, end_node, relation_type))

                found_paths_with_relations.append(path_with_relations)

    return found_paths_with_relations

In [7]:
# Fixed seed so that the sampled ids, and therefore the exported paths, are
# the same on every run of the notebook.
ontology_ids = nodes_df['id'].sample(100, random_state=42).tolist()
found_paths_with_relations = check_all_shortest_paths(graph, ontology_ids)

import json
# The (start, end, relation) tuples are written as JSON arrays.
with open('found_paths_with_relations.json', 'w') as f:
    json.dump(found_paths_with_relations, f, indent=4)
found_paths_with_relations

[[('GO:0061413', 'GO:0000429', 'is_a'),
  ('GO:0000429', 'GO:0006357', 'is_a'),
  ('GO:0006357', 'GO:0006355', 'is_a'),
  ('GO:0006355', 'GO:0010468', 'is_a'),
  ('GO:0010468', 'GO:0010556', 'is_a'),
  ('GO:0010556', 'GO:0031326', 'is_a'),
  ('GO:0031326', 'GO:0031323', 'is_a')],
 [('GO:1902138', 'GO:0046189', 'is_a'), ('GO:0046189', 'GO:0018958', 'is_a')],
 [('GO:1902138', 'GO:0034312', 'is_a'),
  ('GO:0034312', 'GO:0034311', 'is_a'),
  ('GO:0034311', 'GO:0019751', 'is_a'),
  ('GO:0019751', 'GO:0006066', 'is_a')],
 [('GO:0002729', 'GO:0002720', 'is_a'),
  ('GO:0002720', 'GO:0001819', 'is_a'),
  ('GO:0001819', 'GO:0001817', 'is_a'),
  ('GO:0001817', 'GO:0010468', 'is_a'),
  ('GO:0010468', 'GO:0010556', 'is_a'),
  ('GO:0010556', 'GO:0031326', 'is_a'),
  ('GO:0031326', 'GO:0031323', 'is_a')],
 [('GO:0032343', 'GO:0032341', 'is_a'),
  ('GO:0032341', 'GO:0034308', 'is_a'),
  ('GO:0034308', 'GO:0006066', 'is_a')],
 [('GO:0000454', 'GO:0031118', 'is_a'),
  ('GO:0031118', 'GO:0000154', 'is_a'

In [18]:
def get_connections(df, start_nodes):
    """Collect every ontology edge reachable from ``start_nodes``.

    Starting from the given ids, the 'is_a' and 'relationship' entries of each
    visited row are followed until no unvisited id is left. Ids that have no
    row in ``df`` are skipped.

    Args:
        df: DataFrame with the columns 'id', 'is_a' and 'relationship'.
            A relation cell may be a list of entries, a comma-separated
            string of entries, or None/NaN for "no entries". 'is_a' entries
            are target ids; 'relationship' entries look like
            ``"<relation> <target id>"``. When an id occurs in several rows,
            the first row is used.
        start_nodes: Iterable of ids to start from. It is copied, so the
            caller's object is left untouched.

    Returns:
        list: ``(source_id, target_id, relation)`` tuples in discovery order;
        'is_a' edges carry the relation ``'is_a'``.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
        ValueError: If a 'relationship' entry has fewer than two
            whitespace-separated parts.
    """
    def _as_items(cell):
        # None and float NaN both mean "nothing stored in this cell".
        if cell is None or isinstance(cell, float):
            return []
        # Comma-separated text, as stored in the go_terms SQLite table.
        if isinstance(cell, str):
            return [item.strip() for item in cell.split(',') if item.strip()]
        return cell

    # Index the rows once instead of scanning the whole frame for every
    # visited node; setdefault keeps the first row of a duplicated id.
    rows_by_id = {}
    for node_id, is_a, relationship in zip(df['id'], df['is_a'], df['relationship']):
        rows_by_id.setdefault(node_id, (is_a, relationship))

    connections = []
    visited = set()
    # Work on a copy: popping from the caller's list would empty it.
    stack = list(start_nodes)

    while stack:
        current_node = stack.pop()
        if current_node in visited:
            continue
        visited.add(current_node)

        if current_node not in rows_by_id:
            continue
        is_a_values, relationship_values = rows_by_id[current_node]

        for target_node in _as_items(is_a_values):
            connections.append((current_node, target_node, 'is_a'))
            stack.append(target_node)

        for relationship in _as_items(relationship_values):
            # Split on any run of whitespace; only the first two parts are used.
            rel_type, target_node = relationship.split()[:2]
            connections.append((current_node, target_node, rel_type))
            stack.append(target_node)

    return connections


# Collect every edge reachable from the start terms. The repeated start id is
# harmless: get_connections skips nodes it has already visited.
connections = get_connections(nodes_df, ['GO:0030126','GO:0030126','GO:0000454'])

print(connections)
print(len(connections))
# Drop duplicate edges, if any. From here on `connections` is a set, not a list.
connections = set(connections) # remove duplicates if there is any
print(len(connections))


[('GO:0000454', 'GO:0031118', 'is_a'), ('GO:0031118', 'GO:0000154', 'is_a'), ('GO:0031118', 'GO:0001522', 'is_a'), ('GO:0001522', 'GO:0009451', 'is_a'), ('GO:0009451', 'GO:0016070', 'is_a'), ('GO:0009451', 'GO:0043412', 'is_a'), ('GO:0043412', 'GO:0043170', 'is_a'), ('GO:0043170', 'GO:0008152', 'is_a'), ('GO:0008152', 'GO:0008150', 'is_a'), ('GO:0016070', 'GO:0090304', 'is_a'), ('GO:0090304', 'GO:0006139', 'is_a'), ('GO:0090304', 'GO:0043170', 'is_a'), ('GO:0006139', 'GO:0044238', 'is_a'), ('GO:0044238', 'GO:0008152', 'is_a'), ('GO:0000154', 'GO:0006364', 'is_a'), ('GO:0000154', 'GO:0009451', 'is_a'), ('GO:0006364', 'GO:0006396', 'is_a'), ('GO:0006364', 'GO:0016072', 'is_a'), ('GO:0006364', 'GO:0042254', 'part_of'), ('GO:0042254', 'GO:0022613', 'is_a'), ('GO:0022613', 'GO:0044085', 'is_a'), ('GO:0044085', 'GO:0071840', 'is_a'), ('GO:0071840', 'GO:0009987', 'is_a'), ('GO:0009987', 'GO:0008150', 'is_a'), ('GO:0016072', 'GO:0016070', 'is_a'), ('GO:0006396', 'GO:0044238', 'is_a'), ('GO:000

In [7]:
import json

# `connections` was turned into a set in an earlier cell, so its iteration
# order is arbitrary; sort the edges so the exported file is stable across runs.
with open('connections.json', 'w') as f:
    json.dump(sorted(connections), f, indent=4)

In [12]:
import sqlite3
from contextlib import closing

query = "SELECT * FROM go_terms"

# closing() guarantees conn.close() runs even if the query raises; a bare
# `with conn:` on a sqlite3 connection would only manage the transaction.
with closing(sqlite3.connect('viz4go_backend/go_terms.db')) as conn:
    df = pd.read_sql_query(query, conn)

df

Unnamed: 0,id,name,namespace,def,is_a,relationship
0,GO:0000001,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...","GO:0048308,GO:0048311",
1,GO:0000002,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",GO:0007005,
2,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...",GO:0005385,
3,GO:0000007,low-affinity zinc ion transmembrane transporte...,molecular_function,"""Enables the transfer of a solute or solutes f...",GO:0005385,
4,GO:0000009,"alpha-1,6-mannosyltransferase activity",molecular_function,"""Catalysis of the transfer of a mannose residu...",GO:0000030,
...,...,...,...,...,...,...
42088,GO:2001313,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways involving...","GO:0006040,GO:0006793,GO:0009225",
42089,GO:2001314,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways resulting...","GO:0009227,GO:0046348,GO:2001313",
42090,GO:2001315,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways resulting...","GO:0009226,GO:0046349,GO:2001313",
42091,GO:2001316,kojic acid metabolic process,biological_process,"""The chemical reactions and pathways involving...","GO:0034308,GO:0042180,GO:0120254",


In [10]:
# Count which Python types occur in the 'relationship' column of df
# (one count per distinct type).
df['relationship'].apply(type).value_counts()

relationship
<class 'str'>    1000
Name: count, dtype: int64

In [16]:
from contextlib import closing

# Every GO id that appears at either end of a connection. A comprehension is
# used so that the notebook-level `_` is not rebound by a loop variable.
go_terms = {term for term1, term2, _ in connections for term in (term1, term2)}
print(len(go_terms))

# One '?' placeholder per id; the ids themselves are passed as bound
# parameters. NOTE: SQLite limits the number of bound parameters per statement,
# so a very large id set would have to be queried in chunks.
placeholders = ', '.join('?' for _ in go_terms)
query = f"""
SELECT DISTINCT id, name, namespace, def, is_a, relationship
FROM go_terms
WHERE id IN ({placeholders})
"""

# closing() guarantees the connection is closed even if the query raises.
with closing(sqlite3.connect('viz4go_backend/go_terms.db')) as conn:
    df = pd.read_sql_query(query, conn, params=list(go_terms))

df

64


Unnamed: 0,id,name,namespace,def,is_a,relationship
0,GO:0003008,system process,biological_process,"""A multicellular organismal process carried ou...",GO:0032501,
1,GO:0003674,molecular_function,molecular_function,"""A molecular process that can be carried out b...",,
2,GO:0005575,cellular_component,cellular_component,"""A location, relative to cellular compartments...",,
3,GO:0005622,intracellular anatomical structure,cellular_component,"""A component of a cell contained within (but n...",GO:0110165,
4,GO:0005737,cytoplasm,cellular_component,"""The contents of a cell excluding the plasma m...",GO:0110165,part_of GO:0005622
...,...,...,...,...,...,...
59,GO:0098960,postsynaptic neurotransmitter receptor activity,molecular_function,"""Neurotransmitter receptor activity occurring ...",GO:0030594,"occurs_in GO:0045211,part_of GO:0099565"
60,GO:0099536,synaptic signaling,biological_process,"""Cell-cell signaling to, from or within a syna...",GO:0007267,occurs_in GO:0045202
61,GO:0099537,trans-synaptic signaling,biological_process,"""Cell-cell signaling in either direction acros...",GO:0099536,
62,GO:0099565,"chemical synaptic transmission, postsynaptic",biological_process,"""The part of synaptic transmission occurring i...","GO:0007166,GO:0050877","occurs_in GO:0098794,part_of GO:0007268"


In [19]:
# Convert df into JSON: DataFrame.to_json serialises the rows as a list of
# records, json.loads turns that text back into Python objects, and json.dump
# writes them to go_terms.json with a 4-space indent.
import json
df_json = json.loads(df.to_json(orient='records'))
with open('go_terms.json', 'w') as f:
    json.dump(df_json, f, indent=4)

In [1]:
import pandas as pd
import re

def extract_go_terms_from_csv(file_path: str) -> list:
    """
    Extract GO terms from a CSV file.

    The columns are scanned in file order. The first column holding at least
    one value of the form ``GO:`` followed by exactly seven digits is used,
    and only the values of that column that have this form are returned.

    Args:
        file_path (str): Path to the CSV file (handed to ``pd.read_csv``).

    Returns:
        list: Unique GO terms from the detected column, in order of first
            appearance; an empty list when no column contains a GO term.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Regular expression recognising a GO term (the whole value must match)
    go_term_pattern = re.compile(r'^GO:\d{7}$')

    for col in df.columns:
        # Keep only the values that really are GO terms, so headers repeated
        # in the data, free text or missing values in the same column are
        # not returned as terms.
        matching = [value for value in df[col].astype(str) if go_term_pattern.match(value)]
        if matching:
            # dict.fromkeys removes duplicates while keeping first-seen order
            return list(dict.fromkeys(matching))

    # No column contains a GO term
    return []

# Example usage of the function.
# Note: this rebinds the notebook-level name `go_terms` to the returned list.
file_path = 'viz4go_backend/data/deepfri_test_data/1_BP.csv'  # Change to the actual path of the file
go_terms = extract_go_terms_from_csv(file_path)
print("Znalezione GO termy:", go_terms)


Znalezione GO termy: ['GO:0043412', 'GO:0006464', 'GO:0036211', 'GO:0051716', 'GO:0006950']
