In [1]:
import sys, os, io, json
from neo4j import GraphDatabase
from py2neo import Graph
from pathlib import Path
from pandas import DataFrame
import pandas as pd
import networkx as nx

# Both clients point at the same local Neo4j Bolt endpoint:
# `graph` is the py2neo handle used for the Cypher cells in this notebook,
# `driver` is the neo4j driver, created with auth=None.
NEO4J_BOLT_URI = 'bolt://localhost:7687'

graph = Graph(NEO4J_BOLT_URI)
driver = GraphDatabase.driver(NEO4J_BOLT_URI, auth=None)


In [6]:
# Store the number of distinct descendants (num_desc) on every onto class that
# has at least one class below it via `biolink:broad_match`, then report the
# largest count. Classes with no descendants are not matched by this query and
# therefore receive no num_desc property here.

num_desc_cypher = """
MATCH (ancestor:`ols:Class`)<-[:`biolink:broad_match`*1..]-(subclass:`ols:Class`)
WITH ancestor, count(DISTINCT subclass) AS num_desc
SET ancestor.num_desc = num_desc
RETURN max(num_desc) AS max_num_desc
"""
num_desc_records = graph.run(num_desc_cypher).data()
df = DataFrame(num_desc_records)

# The query returns a single aggregate row; take its only value.
max_num_desc = df['max_num_desc'].iloc[0]
print(max_num_desc)



3475900


In [14]:
# For all onto classes set ic to a normalised value based on the number of descendants where 0 descendants = 1 ic
# upper level classes like owl:Thing, bfo continuant will have very low ic scores (< 0.02)
#
# The descendant-count query only matches classes that have at least one
# descendant, so leaf classes carry no num_desc property. Without coalesce the
# arithmetic on the missing value yields null and `SET cl.ic = null` leaves the
# leaf without any ic; coalesce(..., 0) gives leaves the intended ic of 1.0.
# max_num_desc is sent as a float so the division is floating-point.

df = DataFrame(graph.run("""
MATCH (cl:`ols:Class`)
SET cl.ic = 1.0 - (coalesce(cl.num_desc, 0)/$max_num_desc)
""", {
    'max_num_desc': float(max_num_desc)
}).data())



In [None]:

# Create an index named `ic` on GraphNode.ic so the `n.ic < ...` filters on
# GraphNode can use it. IF NOT EXISTS makes the cell safe to re-run: without
# it, executing the notebook a second time fails because the index is already
# there. NOTE(review): requires a Neo4j version that supports
# `CREATE INDEX ... IF NOT EXISTS` - confirm against the deployed server.
df = DataFrame(graph.run("""
CREATE INDEX ic IF NOT EXISTS FOR (n:GraphNode) ON (n.ic)
""").data())

# Which nodes have an IC score of less than 0.5?

In [24]:

# List the first name and ic of every GraphNode whose ic is below the
# threshold, lowest ic first. The threshold is passed as a query parameter
# rather than baked into the Cypher text, and the explicit ORDER BY makes the
# row order of the printed table deterministic (Cypher gives no ordering
# guarantee without it); name breaks ties between equal ic values.
df = DataFrame(graph.run("""
MATCH (n:GraphNode) WHERE n.ic < $max_ic
RETURN n.`grebi:name`[0] AS name, n.ic AS ic
ORDER BY ic, name
""", {
    'max_ic': 0.5
}).data())

print(df.to_markdown())

|    | name                          |          ic |
|---:|:------------------------------|------------:|
|  0 | entity                        | 0           |
|  1 | Thing                         | 2.47418e-05 |
|  2 | entity                        | 2.56049e-05 |
|  3 | experimental factor           | 0.00913087  |
|  4 | bfo:continuant                | 0.0175767   |
|  5 | bfo:independent_continuant    | 0.0793748   |
|  6 | material entity               | 0.0793815   |
|  7 | object                        | 0.0867643   |
|  8 | biological entity             | 0.0906039   |
|  9 | organismal entity             | 0.251327    |
| 10 | obi:organism                  | 0.253235    |
| 11 | obo:ncbitaxon.owl             | 0.253238    |
| 12 | cellular organisms or viruses | 0.259099    |
| 13 | NCBI_taxonomy:131567          | 0.33141     |
| 14 | Archaea or Eukaryota          | 0.493794    |
| 15 | Eukaryota                     | 0.498043    |


In [28]:

# List the first name and ic of every GraphNode whose ic is below the
# threshold, lowest ic first. The threshold is passed as a query parameter
# rather than baked into the Cypher text, and the explicit ORDER BY makes the
# row order of the printed table deterministic (Cypher gives no ordering
# guarantee without it); name breaks ties between equal ic values.
df = DataFrame(graph.run("""
MATCH (n:GraphNode) WHERE n.ic < $max_ic
RETURN n.`grebi:name`[0] AS name, n.ic AS ic
ORDER BY ic, name
""", {
    'max_ic': 0.8
}).data())

print(df.to_markdown())

|    | name                                               |          ic |
|---:|:---------------------------------------------------|------------:|
|  0 | entity                                             | 0           |
|  1 | Thing                                              | 2.47418e-05 |
|  2 | entity                                             | 2.56049e-05 |
|  3 | experimental factor                                | 0.00913087  |
|  4 | bfo:continuant                                     | 0.0175767   |
|  5 | bfo:independent_continuant                         | 0.0793748   |
|  6 | material entity                                    | 0.0793815   |
|  7 | object                                             | 0.0867643   |
|  8 | biological entity                                  | 0.0906039   |
|  9 | organismal entity                                  | 0.251327    |
| 10 | obi:organism                                       | 0.253235    |
| 11 | obo:ncbitaxon.owl              