# Description

## Introduction

The main goal of the **Description** module is to provide the means to enrich the PropaPhen ontology in order to obtain a domain-specific ontology for a given phenomenon.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to local code need no kernel restart).
%load_ext autoreload
%autoreload 2

## Libraries

### Installation

In [107]:
#!pip install owlready2
#!pip install tqdm
!pip install pkt_kg

Defaulting to user installation because normal site-packages is not writeable
Collecting pkt_kg
  Obtaining dependency information for pkt_kg from https://files.pythonhosted.org/packages/b8/39/1bc3fa878665e0c94b6cbb897419b6ddd3a7023db041f01fe0d7c1b99969/pkt_kg-3.1.2-py3-none-any.whl.metadata
  Downloading pkt_kg-3.1.2-py3-none-any.whl.metadata (20 kB)
Collecting argparse (from pkt_kg)
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Collecting Cython>=0.29.14 (from pkt_kg)
  Obtaining dependency information for Cython>=0.29.14 from https://files.pythonhosted.org/packages/a3/97/dcf27a5708e8e121788d9e5c32df04763cf5063dd5e4320f21543dbdeed8/Cython-3.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached Cython-3.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting networkx (from pkt_kg)
  Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading types_requests-2.31.0.20240106-py3-none-any.whl (14 kB)
Downloading urllib3-2.1.0-py3-none-any.whl (104 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.6/104.6 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading msgpack-1.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (534 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m534.8/534.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m31m13.6 MB/s[0m eta [36m0:00:01[0m
[?25hDownloading protobuf-4.25.1-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manyli

### Standard

In [2]:
from owlready2 import *
import pandas as pd
from tqdm import tqdm

### Custom

## Globals

In [3]:
# Ontology that this notebook enriches.
path_propaphen = "../PropaPhen/PropaPhen.owl"

# UMLS inputs: the relationship table and the Semantic Network definitions.
path_to_mrrel, path_to_srdef = "data/MRREL.RRF", "data/SRDEF"

# WorldKG inputs: the ontology file plus the node and edge tables.
path_to_worldkg_ontology = "data/WorldKG_Ontolgy.owl"
path_to_worldkg_nodes, path_to_worldkg_edges = (
    "data/worldkg_nodes.csv",
    "data/worldkg_edges.csv",
)

In [38]:
# Destinations of the ontologies serialized by this notebook.
path_save_umlsonto, path_save_worldkg = "data/umlsonto.owl", "data/worldkg.owl"

## Ontologies

### PropaPhen

In [103]:
# Obtain the ontology object for the local PropaPhen file (loading is a separate step).
propaphen = get_ontology(f"file://{path_propaphen}")

In [104]:
# Load the PropaPhen ontology and keep working with the object returned by load().
propaphen = propaphen.load()   

In [7]:
# Index the PropaPhen classes by their Python-side class name.
propaphen_classes = list(propaphen.classes())
propaphen_dict_classes = {
    onto_class.__name__: onto_class for onto_class in propaphen_classes
}

### UMLS


---

The main UMLS concepts are ([ref](https://www.nlm.nih.gov/research/umls/new_users/online_learning/Meta_005.html)):
 - Concept Unique Identifier (CUI)
 - Lexical (term) Unique Identifier (LUI)
 - String Unique Identifier (SUI)
 - Atom Unique Identifier (AUI)

In addition to these identifiers, UMLS also includes a Semantic Network that consists of (1) a set of broad subject categories, or **Semantic Types**, that provide a consistent categorization of all concepts represented in the UMLS Metathesaurus, and (2) a set of useful and important relationships, or **Semantic Relations**, that exist between Semantic Types.

---
This subsection describes the aforementioned concepts and retrieves all properties found in the UMLS 2023AB release.

In [8]:
# Ontology object under which the UMLS classes below are declared.
umlsonto = get_ontology("https://w3id.org/def/umls")
# Adding concepts/classes
with umlsonto:
    # Common parent of every UMLS-specific class declared here.
    class UMLSEntity(Thing):
        pass
    # Concept Unique Identifier.
    class CUI(UMLSEntity):
        pass
    # Lexical (term) Unique Identifier.
    class LUI(UMLSEntity):
        pass
    # String Unique Identifier.
    class SUI(UMLSEntity):
        pass
    # Atom Unique Identifier.
    class AUI(UMLSEntity):
        pass
    # Semantic Type of the UMLS Semantic Network.
    class SemanticType(UMLSEntity):
        pass
    # Declare the five subclasses as mutually disjoint.
    AllDisjoint([CUI, LUI, SUI, AUI, SemanticType])

In [9]:
# Display the classes currently defined in the UMLS ontology.
list(umlsonto.classes())

[umls.UMLSEntity, umls.CUI, umls.LUI, umls.SUI, umls.AUI, umls.SemanticType]

#### Adding CUI/AUI relationships

In [10]:
# Four independent, initially empty buckets of relation labels, one per
# combination of endpoint kinds (AUI/AUI, CUI/CUI, AUI/CUI, CUI/AUI).
aui_rel, cui_rel, aui_cui_rel, cui_aui_rel = (set() for _ in range(4))

In [11]:
# Scan MRREL once and file each row's relation label (column 3) into exactly
# one bucket, chosen from the endpoint kinds found in columns 2 and 6.
# The second test used to be a plain `if`, so AUI/AUI rows were also filed
# under aui_cui_rel and rows with only column 6 == 'AUI' were also filed under
# cui_rel. A single if/elif chain keeps the four buckets exclusive per row.
# NOTE(review): the earlier version bound line[4] as start_node and line[0] as
# end_node without using them; confirm that the cui_aui_rel / aui_cui_rel
# naming matches the intended relation direction.
with open(path_to_mrrel, mode='r', encoding='utf-8') as mrrel:
    for line in tqdm(mrrel):
        fields = line.split("|")
        first_kind, rel_label, second_kind = fields[2], fields[3], fields[6]
        if second_kind == 'AUI' and first_kind == 'AUI':
            aui_rel.add(rel_label)
        elif second_kind == 'AUI':
            cui_aui_rel.add(rel_label)
        elif first_kind == 'AUI':
            aui_cui_rel.add(rel_label)
        else:
            cui_rel.add(rel_label)

55685992it [01:06, 840386.72it/s]


In [12]:
# Relation labels that may have a CUI / an AUI on the domain or range side.
cui_domain = cui_rel.union(cui_aui_rel)
aui_domain = aui_cui_rel.union(aui_rel)
cui_range = cui_rel.union(aui_cui_rel)
aui_range = aui_rel.union(cui_aui_rel)

In [13]:
# Every collected relation label except the single-digit strings "0".."9".
single_digit_labels = {str(digit) for digit in range(10)}
collected_labels = cui_rel | cui_aui_rel | aui_cui_rel | aui_rel
all_rel = list(collected_labels - single_digit_labels)

In [14]:
# Declare one object property per relation label. CUI and/or AUI are listed
# as domain and range depending on which lookup sets contain the label.
with umlsonto:
    for rel_name in all_rel:
        domain_classes = [
            onto_class
            for onto_class, labels in ((CUI, cui_domain), (AUI, aui_domain))
            if rel_name in labels
        ]
        range_classes = [
            onto_class
            for onto_class, labels in ((CUI, cui_range), (AUI, aui_range))
            if rel_name in labels
        ]
        addRelation = type(
            rel_name,
            (ObjectProperty,),
            {'domain': domain_classes, 'range': range_classes},
        )

#### Adding Semantic Network relationships

In [15]:
# Read SRDEF as a '|'-separated table without a header row, keeping only columns 0 and 1.
semanticrels = pd.read_csv(path_to_srdef,sep='|', header=None, usecols=[0,1])

In [16]:
# Column 1 values of the rows whose column 0 equals 'RL'.
is_relation_row = semanticrels[0] == 'RL'
semanticrelslist = semanticrels.loc[is_relation_row, 1].tolist()

In [17]:
# Declare every Semantic Network relation as an object property that links
# SemanticType to SemanticType.
with umlsonto:
    for relation_name in semanticrelslist:
        addRelation = type(
            relation_name,
            (ObjectProperty,),
            dict(domain=[SemanticType], range=[SemanticType]),
        )

#### isa type

In [18]:
# Declare the "isa" object property between semantic types.
with umlsonto:
    addRelation = type(
        "isa",
        (ObjectProperty,),
        dict(domain=[SemanticType], range=[SemanticType]),
    )

#### CUI and Semantic Network

In [19]:
# Declare the single "STY" object property that links a CUI to a SemanticType.
with umlsonto:
    addRelation = type(
        "STY",
        (ObjectProperty,),
        dict(domain=[CUI], range=[SemanticType]),
    )

In [36]:
# Serialize the UMLS ontology to path_save_umlsonto in RDF/XML format.
umlsonto.save(file=path_save_umlsonto,format="rdfxml")

### World-KG

In [22]:
# Obtain the ontology object for the local WorldKG file (loading is a separate step).
worldkg = get_ontology(f"file://{path_to_worldkg_ontology}")

In [24]:
# Load the WorldKG ontology and keep working with the object returned by load().
worldkg = worldkg.load()

In [25]:
# Index the WorldKG classes by their Python-side class name.
worldkg_dict_classes = {
    onto_class.__name__: onto_class for onto_class in worldkg.classes()
}

In [26]:
# Load the WorldKG node table.
# NOTE(review): the recorded output echoes this line, which suggests pandas
# emitted a warning here (possibly about mixed column dtypes) — confirm and,
# if so, pass explicit dtypes.
kgnodes = pd.read_csv(path_to_worldkg_nodes)

  kgnodes = pd.read_csv(path_to_worldkg_nodes)


In [27]:
# Map each node id (column "id:ID") to its label (column ":LABEL").
dict_label = dict(zip(kgnodes["id:ID"], kgnodes[":LABEL"]))

In [28]:
# Resolve every distinct node label to a WorldKG class, using the part of the
# label after its last ':' as the class name.
dict_str_to_class = {
    label: worldkg_dict_classes[label.rsplit(":", 1)[-1]]
    for label in set(dict_label.values())
}

In [29]:
# Load the WorldKG edge table as a DataFrame.
# NOTE(review): kgedges is not referenced by any other visible cell; the edge
# file is read again line by line further down — confirm it is still needed.
kgedges = pd.read_csv(path_to_worldkg_edges)

In [30]:
# For every distinct node label, start with an empty set of property names
# seen on the domain side and on the range side.
node_labels = set(dict_label.values())
wkgdomain = {label: set() for label in node_labels}
wkgrange = {label: set() for label in node_labels}

In [31]:
# Walk the edge file line by line: column 2 is the property name, columns 0 and
# 1 are node ids whose labels receive the property on the domain/range side.
all_rel = set()
with open(path_to_worldkg_edges, mode='r', encoding='utf-8') as edges_file:
    for row_number, raw_row in enumerate(tqdm(edges_file)):
        # The first line is skipped (header row).
        if row_number == 0:
            continue
        cells = raw_row.split(",")
        edge_property = cells[2].replace("\n", "")
        all_rel.add(edge_property)
        domain_label = dict_label[cells[0]]
        range_label = dict_label[cells[1]]
        wkgdomain[domain_label].add(edge_property)
        wkgrange[range_label].add(edge_property)

2228040it [00:03, 724546.97it/s]


In [32]:
# Declare one object property per WorldKG edge property. Its domain and range
# are the classes of the node labels recorded for that property in wkgdomain
# and wkgrange respectively.
with worldkg:
    for rel_name in all_rel:
        domain_classes = [
            dict_str_to_class[label]
            for label, seen_props in wkgdomain.items()
            if rel_name in seen_props
        ]
        range_classes = [
            dict_str_to_class[label]
            for label, seen_props in wkgrange.items()
            if rel_name in seen_props
        ]
        addRelation = type(
            rel_name,
            (ObjectProperty,),
            {'domain': domain_classes, 'range': range_classes},
        )

In [39]:
# Serialize the extended WorldKG ontology to path_save_worldkg in RDF/XML format.
worldkg.save(file=path_save_worldkg,format="rdfxml")

## Semi-Automatic Alignment

In [97]:
# Make PropaPhen import the UMLS and WorldKG ontologies.
# The membership test keeps the cell idempotent: re-running it no longer
# appends the same ontology a second time.
for imported_onto in (umlsonto, worldkg):
    if imported_onto not in propaphen.imported_ontologies:
        propaphen.imported_ontologies.append(imported_onto)

In [106]:
# Look up entities whose IRI matches the wildcard pattern "*City".
propaphen.search(iri="*City")

[ontology.City, schema.City, schema.capitalCity]

In [138]:
import glob
# import needed libraries
from pkt_kg.utils import merges_ontologies

# Set-up input variables.
# The output path is the directory plus the file name. The file name already
# starts with '/', so the directory must not end with one: 'resources/' gave
# 'resources//PropaPhenPlus.owl'.
# NOTE(review): pkt_kg appears to compare this path against glob results to
# detect an existing merged file, which a double slash would defeat — confirm
# against the installed version.
write_location = 'resources'
merged_ontology_file = '/PropaPhenPlus.owl'
# NOTE(review): the save cells of this notebook write to data/, not to
# data/saved/ — confirm how the files end up in data/saved/.
ontology_repository = glob.glob('data/saved/*.owl')

In [139]:
# Display the ontology files found by the glob pattern.
ontology_repository

['data/saved/umlsonto.owl',
 'data/saved/PropaPhen.owl',
 'data/saved/worldkg.owl']

In [140]:
# merge the ontologies
# A copy of the list is passed because the merge consumes its input (the
# recorded run ended with "pop from empty list"). This leaves
# ontology_repository intact so the cell can be re-run.
merges_ontologies(list(ontology_repository), write_location, merged_ontology_file)

Merging Ontologies: worldkg.owl, PropaPhen.owl
2024-01-08 18:52:22,178 ERROR (CommandRunner:4641) could not parse:data/saved/PropaPhen.owl
org.semanticweb.owlapi.model.UnloadableImportException: Could not load imported ontology: <http://www.opengis.net/ont/geosparql> Cause: Problem parsing http://www.opengis.net/ont/geosparql
Could not parse ontology.  Either a suitable parser could not be found, or parsing failed.  See parser logs below for explanation.
The following parsers were tried:
1) org.semanticweb.owlapi.rdf.rdfxml.parser.RDFXMLParser@340b9973
2) org.semanticweb.owlapi.owlxml.parser.OWLXMLParser@20a14b55
3) org.semanticweb.owlapi.functional.parser.OWLFunctionalSyntaxOWLParser@5d2a4eed
4) org.semanticweb.owlapi.rio.RioParserImpl : org.semanticweb.owlapi.formats.RioTurtleDocumentFormatFactory@95fd655c
5) org.semanticweb.owlapi.manchestersyntax.parser.ManchesterOWLSyntaxOntologyParser@553f1d75
6) org.semanticweb.owlapi.rio.RioParserImpl : org.semanticweb.owlapi.formats.NQuadsDocu

[Fatal Error] :1:50: White spaces are required between publicId and systemId.


IndexError: pop from empty list

## Individual Instantiation in Neo4j

## Ontology + Individual in Neo4j