<a href="https://colab.research.google.com/github/lbdlab/NGS-LBD/blob/master/notebooks/HPO_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install neo4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting neo4j
  Downloading neo4j-4.4.3.tar.gz (90 kB)
[K     |████████████████████████████████| 90 kB 3.4 MB/s 
Building wheels for collected packages: neo4j
  Building wheel for neo4j (setup.py) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-4.4.3-py3-none-any.whl size=116069 sha256=bada22dc1e0385d9e764a0655c492675aeaf7e535a114f708b6d5b131d61fae1
  Stored in directory: /root/.cache/pip/wheels/db/dd/76/acacd519878f133f2f869aec70db548d89e04013209c3c62bc
Successfully built neo4j
Installing collected packages: neo4j
Successfully installed neo4j-4.4.3


In [2]:
import os
from getpass import getpass

import pandas as pd
from neo4j import GraphDatabase

In [3]:
# Connect to Neo4j.
# Connection settings come from the environment so that credentials are not
# committed with the notebook; the password is prompted for when it is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://212.235.239.171:22333")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def read_query(query, database="semmed432202", params=None):
    """Run a Cypher query and return all result records as a DataFrame.

    Despite the name, this notebook also uses it for statements that write
    (``SET``) and then return a summary.

    Args:
        query: Cypher statement to execute.
        database: Name of the database the session is opened against.
        params: Optional mapping of query parameters; ``None`` means no
            parameters.

    Returns:
        pandas.DataFrame with one row per record and the result keys as
        column names.
    """
    # A None default avoids sharing a single mutable dict between calls.
    query_params = {} if params is None else params
    with driver.session(database=database) as session:
        result = session.run(query, query_params)
        # Rows are collected while the session is still open.
        rows = [record.values() for record in result]
        return pd.DataFrame(rows, columns=result.keys())

In [4]:
# Collect the distinct UMLS identifiers (the part after the ':' in n.id)
# from the "hpo" database into a single list-valued `result` column.
hpo_umls_query = """
MATCH (n:UMLS) 
RETURN collect(distinct split(n.id, ':')[1]) AS result
"""
umls_from_hpo = read_query(hpo_umls_query, database="hpo")

In [5]:
# Number of UMLS identifiers collected in the single `result` cell.
len(umls_from_hpo['result'].iloc[0])

12868

In [6]:
# Sanity check: count all nodes in read_query's default database.
node_count_query = """
MATCH (n) RETURN count(*)
"""
read_query(node_count_query)

Unnamed: 0,count(*)
0,344513571


In [8]:
# Flag every Concept whose CUI exactly equals one of the HPO-derived UMLS ids
# (sets c.is_hpo), then tally the labels carried by the flagged nodes.
hpo_cuis = umls_from_hpo['result'].to_list()[0]
flag_exact_cui_query = """
UNWIND $data AS row
MATCH (c:Concept)
WHERE c.cui = row
SET c.is_hpo = True
WITH c
UNWIND labels(c) AS label
RETURN label, count(*) AS count
ORDER BY count DESC
"""
get_labels = read_query(flag_exact_cui_query, params={'data': hpo_cuis})

In [21]:
# Show the five most frequent labels among the flagged nodes.
get_labels.head(n=5)

Unnamed: 0,label,count
0,Concept,4939
1,dsyn,1919
2,fndg,1449
3,cgab,648
4,neop,428


In [22]:
# Flag Concepts whose pipe-delimited CUI contains any HPO-derived UMLS id
# (sets c.is_hpo), then tally the labels carried by the flagged nodes.
hpo_umls_ids = umls_from_hpo['result'].to_list()[0]
flag_piped_cui_query = """
MATCH (c:Concept)
WITH c, split(c.cui, "|") AS ids
WHERE any(id IN ids WHERE id IN $data)
SET c.is_hpo = True
WITH c
UNWIND labels(c) AS label
RETURN label, count(*) AS count
ORDER BY count DESC
"""
get_labels = read_query(flag_piped_cui_query, params={'data': hpo_umls_ids})

In [23]:
# Display the label/count table returned by the preceding query.
get_labels

Unnamed: 0,label,count
0,Concept,4940
1,dsyn,1919
2,fndg,1449
3,cgab,648
4,neop,428
5,patf,313
6,anab,303
7,sosy,297
8,mobd,112
9,acab,80


In [None]:
"""
Human genes only
Do something with the Entrez Gene dataset
"""

In [10]:
# Prompt for a file upload through the Colab runtime (requires google.colab).
# The result is later indexed by filename and its value decoded as UTF-8.
from google.colab import files
uploaded = files.upload()

Saving entrez_gene_ids_human_only.tsv to entrez_gene_ids_human_only.tsv


In [11]:
# Parse the uploaded TSV into a list of gene identifiers, one per line.
# splitlines() handles both "\n" and "\r\n" endings, and blank lines are
# dropped so a trailing newline does not add an empty-string id to the
# `$data` parameter of the Cypher queries that consume this list.
# NOTE(review): assumes one identifier per line with no header row — confirm
# against the uploaded file.
human_genes = [
    line.strip()
    for line in uploaded['entrez_gene_ids_human_only.tsv'].decode("utf-8").splitlines()
    if line.strip()
]

In [13]:
# Directly matched nodes, i.e. an exact match on CUI against the human gene ids.
# NOTE(review): this sets the same `is_hpo` property that the HPO cells set —
# confirm that flagging gene nodes with it is intended.
exact_gene_query = """
UNWIND $data AS row
MATCH (c:Concept)
WHERE c.cui = row
SET c.is_hpo = True
RETURN labels(c), count(*) AS count
ORDER BY count DESC
"""
entrez_genes = read_query(exact_gene_query, params={'data': human_genes})

In [14]:
# Display the per-label-set counts returned by the preceding query.
entrez_genes

Unnamed: 0,labels(c),count
0,"[aapp, Concept, gngm]",14785
1,"[Concept, gngm]",2859
2,"[aapp, Concept]",526


In [None]:
"""
Split CUI
Let's see how many of these genes are in SemMedDB
"""

In [24]:
# Pipe-split match for human Entrez genes: among gngm/aapp nodes, flag those
# whose pipe-delimited CUI contains any of the human gene ids.
# NOTE(review): this sets the same `is_hpo` property that the HPO cells set —
# confirm that flagging gene nodes with it is intended.
piped_gene_query = """
MATCH (c)
WHERE c:gngm OR c:aapp
WITH c, split(c.cui, "|") AS ids
WHERE any(id IN ids WHERE id IN $data)
SET c.is_hpo = True
RETURN labels(c), count(*) AS count
ORDER BY count DESC
"""
entrez_genes = read_query(piped_gene_query, params={'data': human_genes})

In [25]:
# Display the per-label-set counts returned by the preceding query.
entrez_genes

Unnamed: 0,labels(c),count
0,"[aapp, Concept, gngm]",36105
1,"[aapp, Concept]",8999
2,"[Concept, gngm]",8264
3,"[aapp, Concept, gngm, bacs]",1843
4,"[aapp, Concept, gngm, enzy]",856
...,...,...
63,"[aapp, Concept, gngm, strd]",1
64,"[aapp, Concept, gngm, orch]",1
65,"[phsu, aapp, Concept, gngm, lipd]",1
66,"[phsu, Concept, gngm, bacs]",1
