In [1]:
pip install rdflib

Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-7.0.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
"""Disease Ontology

Structure:
    1. Imports, Variables, Functions
    2. Load data
"""

# 1. Imports, Variables, Functions
# imports
from rdflib import Graph

# variables
data_path = "../data/doid.owl"

# functions
def load_ontology(path):
    """Load the ontology from the given path."""
    g = Graph()
    g.parse(path, format="xml")
    return g

def get_diseases(g):
    """Retrieve diseases from the ontology."""
    qres = g.query(
        """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?label WHERE {
            ?subject rdfs:label ?label .
        }
        """
    )
    diseases = [row.label.toPython() for row in qres]
    return diseases

# 2. Load data
graph = load_ontology(data_path)
diseases = get_diseases(graph)
print(diseases)


['definition', 'definition source', 'has_ontology_root_term', 'term replaced by', 'DO_AGR_slim', 'DO_CFDE_slim', 'DO_FlyBase_slim', 'DO_GXD_slim', 'DO_IEDB_slim', 'DO_MGI_slim', 'DO_RAD_slim', 'DO_cancer_slim', 'DO_infectious_disease_slim', 'DO_rare_slim', 'GOLD', 'NCIthesaurus', 'TopNodes_DOcancerslim', 'gram-negative_bacterial_infectious_disease', 'gram-positive_bacterial_infectious_disease', 'sexually_transmitted_infectious_disease', 'tick-borne_infectious_disease', 'zoonotic_infectious_disease', 'dc:date', 'description', 'title', 'dc:type', 'license', 'subset_property', 'auto-generated-by', 'created_by', 'creation_date', 'date', 'default-namespace', 'has_alternative_id', 'has_broad_synonym', 'database_cross_reference', 'has_exact_synonym', 'has_narrow_synonym', 'has_obo_format_version', 'has_obo_namespace', 'has_related_synonym', 'id', 'in_subset', 'saved-by', 'comment', 'rdfs:isDefinedBy', 'owl:deprecated', 'owl:versionInfo', 'has broader match', 'has close match', 'has exact matc

In [19]:
len(diseases)

13924

In [21]:
# 1. Imports, Variables, Functions

# imports
import re 

# variables
data_path = "../data/doid.obo"

# functions
def extract_disease_names_from_obo(file_path):
    """
    Extracts disease names from an OBO formatted file.

    Args:
    - file_path (str): Path to the OBO file.

    Returns:
    - List[str]: A list of disease names.
    """
    
    # Open and read the content of the OBO file
    with open(file_path, 'r') as f:
        content = f.read()

    # The OBO format divides entries using '[Term]'. We split the content based on this to get individual entries.
    terms = content.split('[Term]')
    
    disease_names = []  # List to store extracted disease names
    
    # Iterate over each term/entry
    for term in terms:
        # Use a regular expression to search for the line that starts with 'name: '
        # This line contains the name of the disease.
        match = re.search(r'name: (.+)', term)
        
        # If a match is found (i.e., the term has a name), extract it and add to the list
        if match:
            disease_name = match.group(1)  # The actual name is captured in the first group of the regex
            disease_names.append(disease_name)

    return disease_names

# 2. Load data

# Call the function to extract disease names and store them in the 'disease_names' variable
disease_names = extract_disease_names_from_obo(data_path)

# Print the extracted disease names
print(disease_names)


['angiosarcoma', 'pterygium', 'disease of metabolism', 'shrimp allergy', 'aspirin allergy', 'benzylpenicillin allergy', 'amoxicillin allergy', 'ceftriaxone allergy', 'carbamazepine allergy', 'abacavir allergy', 'isoniazide allergy', 'lidocaine allergy', 'mepivacaine allergy', 'phenobarbital allergy', 'phenytoin allergy', 'ranitidine allergy', 'corticosteroid allergy', 'sulfonamide allergy', 'sulfamethoxazole allergy', 'suprofen allergy', 'thiopental allergy', 'D-mannitol allergy', 'cefotaxime allergy', 'cephalosporin allergy', 'amodiaquine allergy', 'cefaclor allergy', 'ceftazidime allergy', 'cefuroxime allergy', 'chlorhexidine allergy', 'cyclophosphamide allergy', 'succinylcholine allergy', 'trimethoprim allergy', 'cefixime allergy', 'diclofenac allergy', 'carbapenem allergy', 'piperacillin allergy', 'rocuronium allergy', 'sulfasalazine allergy', 'tubocurarine allergy', 'aztreonam allergy', 'meropenem allergy', 'hexamethylene diisocyanate allergic asthma', 'isocyanates allergic asthma