In [1]:
pip install rdflib

Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib)
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hDownloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-7.0.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
"""Disease Ontology

Structure:
    1. Imports, Variables, Functions
    2. Load data
"""

# 1. Imports, Variables, Functions
# imports
from rdflib import Graph

# variables
data_path = "../data/DiseaseOntology/doid.owl"


# functions
def load_ontology(path):
    """Load the ontology from the given path."""
    g = Graph()
    g.parse(path, format="xml")
    return g


def get_diseases(g):
    """Retrieve diseases from the ontology."""
    qres = g.query(
        """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?label WHERE {
            ?subject rdfs:label ?label .
        }
        """
    )
    diseases = [row.label.toPython() for row in qres]
    return diseases


# 2. Load data
graph = load_ontology(data_path)
diseases = get_diseases(graph)
print(diseases)

['definition', 'definition source', 'has_ontology_root_term', 'term replaced by', 'DO_AGR_slim', 'DO_CFDE_slim', 'DO_FlyBase_slim', 'DO_GXD_slim', 'DO_IEDB_slim', 'DO_MGI_slim', 'DO_RAD_slim', 'DO_cancer_slim', 'DO_infectious_disease_slim', 'DO_rare_slim', 'GOLD', 'NCIthesaurus', 'TopNodes_DOcancerslim', 'gram-negative_bacterial_infectious_disease', 'gram-positive_bacterial_infectious_disease', 'sexually_transmitted_infectious_disease', 'tick-borne_infectious_disease', 'zoonotic_infectious_disease', 'dc:date', 'description', 'title', 'dc:type', 'license', 'subset_property', 'auto-generated-by', 'created_by', 'creation_date', 'date', 'default-namespace', 'has_alternative_id', 'has_broad_synonym', 'database_cross_reference', 'has_exact_synonym', 'has_narrow_synonym', 'has_obo_format_version', 'has_obo_namespace', 'has_related_synonym', 'id', 'in_subset', 'saved-by', 'comment', 'rdfs:isDefinedBy', 'owl:deprecated', 'owl:versionInfo', 'has broader match', 'has close match', 'has exact matc

In [8]:
import rdflib

# Load the OWL file
data_path = "../data/DiseaseOntology/doid.owl"
g = rdflib.Graph()
g.parse(data_path, format="xml")

# Print the number of triples
print(f"Number of triples: {len(g)}")

# Print all triples
for s, p, o in g:
    print(f"Subject: {s}")
    print(f"Predicate: {p}")
    print(f"Object: {o}")
    print("")

# Optionally, inspect specific elements, like classes or properties
# List all classes
classes = set(g.subjects(rdflib.RDF.type, rdflib.OWL.Class))
print("Classes:")
for cls in classes:
    print(cls)

# List all object properties
properties = set(g.subjects(rdflib.RDF.type, rdflib.OWL.ObjectProperty))
print("\nObject Properties:")
for prop in properties:
    print(prop)

In [4]:
len(diseases)

13924

In [6]:
# 1. Imports, Variables, Functions

# imports
import re

# variables
data_path = "../data/DiseaseOntology/doid.obo"


# functions
def extract_disease_names_from_obo(file_path):
    """
    Extracts disease names from an OBO formatted file.

    Args:
    - file_path (str): Path to the OBO file.

    Returns:
    - List[str]: A list of disease names.
    """

    # Open and read the content of the OBO file
    with open(file_path, "r") as f:
        content = f.read()

    # The OBO format divides entries using '[Term]'. We split the content based on this to get individual entries.
    terms = content.split("[Term]")

    disease_names = []  # List to store extracted disease names

    # Iterate over each term/entry
    for term in terms:
        # Use a regular expression to search for the line that starts with 'name: '
        # This line contains the name of the disease.
        match = re.search(r"name: (.+)", term)

        # If a match is found (i.e., the term has a name), extract it and add to the list
        if match:
            disease_name = match.group(
                1
            )  # The actual name is captured in the first group of the regex
            disease_names.append(disease_name)

    return disease_names


# 2. Load data

# Call the function to extract disease names and store them in the 'disease_names' variable
disease_names = extract_disease_names_from_obo(data_path)

# Print the extracted disease names
print(disease_names)

['angiosarcoma', 'pterygium', 'disease of metabolism', 'shrimp allergy', 'aspirin allergy', 'benzylpenicillin allergy', 'amoxicillin allergy', 'ceftriaxone allergy', 'carbamazepine allergy', 'abacavir allergy', 'isoniazide allergy', 'lidocaine allergy', 'mepivacaine allergy', 'phenobarbital allergy', 'phenytoin allergy', 'ranitidine allergy', 'corticosteroid allergy', 'sulfonamide allergy', 'sulfamethoxazole allergy', 'suprofen allergy', 'thiopental allergy', 'D-mannitol allergy', 'cefotaxime allergy', 'cephalosporin allergy', 'amodiaquine allergy', 'cefaclor allergy', 'ceftazidime allergy', 'cefuroxime allergy', 'chlorhexidine allergy', 'cyclophosphamide allergy', 'succinylcholine allergy', 'trimethoprim allergy', 'cefixime allergy', 'diclofenac allergy', 'carbapenem allergy', 'piperacillin allergy', 'rocuronium allergy', 'sulfasalazine allergy', 'tubocurarine allergy', 'aztreonam allergy', 'meropenem allergy', 'hexamethylene diisocyanate allergic asthma', 'isocyanates allergic asthma

In [1]:
pip install obonet networkx pyvis


Note: you may need to restart the kernel to use updated packages.


In [7]:
import obonet
import networkx as nx
from pyvis.network import Network
from IPython.display import IFrame

# Load the OBO file
data_path = "../data/DiseaseOntology/doid.obo"
graph = obonet.read_obo(data_path)

# Convert to a NetworkX graph
G = nx.DiGraph(graph)

# Visualize the graph using pyvis with in-line CDN resources
net = Network(
    height="750px",
    width="100%",
    bgcolor="#222222",
    font_color="white",
    cdn_resources="in_line",
)
net.from_nx(G)
net.save_graph("disease_ontology_obo.html")

# Display the HTML file
IFrame("disease_ontology_obo.html", width="100%", height="750px")

In [13]:
G = nx.DiGraph(graph)

# Print basic information about the graph
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Choose an example node (first node in the graph)
example_node = list(G.nodes)[0]
node_data = G.nodes[example_node]

# Print information about the example node
print(f"\nExample Node ID: {example_node}")
print(f"Node Data: {node_data}")

# Get edges associated with the example node
edges = list(G.edges(example_node, data=True))

# Print edges associated with the example node
print(f"\nEdges associated with the example node:")
for edge in edges:
    print(edge)

# Additional: print a sample of labels (terms) for other nodes
print("\nSample node labels (terms):")
sample_labels = {
    node: data.get("name", "No label") for node, data in list(G.nodes(data=True))
}
for node, label in sample_labels.items():
    print(f"{node}: {label}")

Number of nodes: 11386
Number of edges: 15809

Example Node ID: DOID:0001816
Node Data: {'name': 'angiosarcoma', 'alt_id': ['DOID:267', 'DOID:4508'], 'def': '"A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels." [url:http\\://en.wikipedia.org/wiki/Hemangiosarcoma, url:https\\://en.wikipedia.org/wiki/Angiosarcoma, url:https\\://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&ns=ncit&code=C3088, url:https\\://www.ncbi.nlm.nih.gov/pubmed/23327728]', 'subset': ['DO_cancer_slim', 'NCIthesaurus'], 'synonym': ['"hemangiosarcoma" EXACT []'], 'xref': ['ICDO:9120/3', 'MESH:D006394', 'NCI:C3088', 'NCI:C9275', 'SNOMEDCT_US_2023_03_01:39000009', 'UMLS_CUI:C0018923', 'UMLS_CUI:C0854893'], 'is_a': ['DOID:175']}

Edges associated with the example node:
('DOID:0001816', 'DOID:175', {})

Sample node labels (terms):
DOID:0001816: angiosarcoma
DOID:0002116: pterygium
DOID:0014667: disease of metabolism
DOID:0040001: shrimp allerg

In [29]:
# mapping between ids and names

# Create a dictionary to hold the mapping
id_to_name = {}

# Iterate through the nodes in the graph
for node, data in G.nodes(data=True):
    # Get the primary ID and name
    primary_id = node
    name = data.get("name", "No name")

    # Add primary ID to the mapping
    id_to_name[primary_id] = name

    # If there are alternative IDs, add them to the mapping as well
    alt_ids = data.get("alt_id", [])
    for alt_id in alt_ids:
        id_to_name[alt_id] = name

In [20]:
G.edges(example_node)

OutEdgeDataView([('DOID:0001816', 'DOID:175')])

In [15]:
sample_labels["DOID:175"]

'vascular cancer'

In [21]:
# Function to check the shortest path distance between two nodes
def check_distance(graph, node1, node2):
    try:
        distance = nx.shortest_path_length(graph, source=node1, target=node2)
        print(f"The shortest path distance between {node1} and {node2} is: {distance}")
    except nx.NetworkXNoPath:
        print(f"There is no path between {node1} and {node2}")


# Example nodes (replace with your specific nodes)
node1 = "DOID:0050156"  # Example node 1
node2 = "DOID:0050161"  # Example node 2

# Check the distance between the example nodes
check_distance(G, node1, node2)

The shortest path distance between DOID:0050156 and DOID:0050161 is: 4
