## LogMap disease landscape matching analysis
- first thing we want to check is how to get xrefs for known mappings

In [1]:
import polars as pl
from textdistance import levenshtein
seed = 101  # fixed seed handed to DataFrame.sample() further down so the spot-check sample is reproducible

## load in names that are not found 

## load novel mappings and add a column for edit similarity

In [2]:
def normalized_edit_similarity(x):
    """Return the case-insensitive normalized Levenshtein similarity of two names.

    Args:
        x: Mapping (for example one struct row) holding the entries
            ``'source name'`` and ``'target name'``.

    Returns:
        The result of ``levenshtein.normalized_similarity`` applied to the two
        upper-cased names, or ``None`` when either name is missing.
    """
    source_name = x['source name']
    target_name = x['target name']
    # A missing label has no ``.upper()``; report "no score" instead of raising
    # so a single unnamed class does not abort scoring the whole table.
    if source_name is None or target_name is None:
        return None
    return levenshtein.normalized_similarity(source_name.upper(), target_name.upper())

In [3]:
# Read the novel mappings, score every source/target name pair with the
# edit-similarity helper, and keep only the columns used in the review below.
novel_maps = (
    pl.read_csv(
        'output/logmap/disease_landscape/full_analysis/semra_novel_mappings.tsv',
        separator='\t',
    )
    .with_columns(
        pl.struct(['source name', 'target name'])
        .map_elements(normalized_edit_similarity, return_dtype=pl.Float32)
        .alias('edit_similarity')
    )
    .select(
        'source prefix',
        'target prefix',
        'source identifier',
        'target identifier',
        'source name',
        'target name',
        'confidence',
        'edit_similarity',
    )
)

## remove cases where a class is mapped to itself

In [4]:
# Drop rows whose source and target identifiers are the same string.
novel_maps = novel_maps.remove(
    pl.col('source identifier').eq(pl.col('target identifier'))
)

In [5]:
# Rows with a similarity of exactly 1 are set aside as direct matches
# (around 42,000 according to the original note) ...
direct_matches = novel_maps.filter(pl.col("edit_similarity") == 1)
# ... and removed from the table that is reviewed by hand.
novel_maps = novel_maps.remove(pl.col("edit_similarity") == 1)

## let's spot-check some mappings that do not have exact lexical matches

In [6]:
# Draw ten rows with the fixed seed so the same sample comes back on every run.
to_check = novel_maps.sample(n=10, seed=seed)
# Save the sample with all of its columns before narrowing it.
to_check.write_csv("to_check.tsv", separator='\t')
# Keep only the identifier, name and score columns.
to_check = to_check.select(
    'source identifier',
    'target identifier',
    'source name',
    'target name',
    'confidence',
    'edit_similarity',
)

## case 1

- Mondo 
    ```
    id: MONDO:0019622
    name: non-specific interstitial pneumonia
    def: "Idiopathic interstitial pneumonia characterized by chronic inflammation and fibrosis in the interstitial lung tissue. It includes cases that cannot be classified into one of the other types of idiopathic interstitial pneumonia." [NCIT:C35717]
    subset: gard_rare {source="GARD:19167", source="MONDO:GARD"}
    subset: nord_rare {source="MONDO:NORD"}
    subset: ordo_disorder {source="Orphanet:91364"}
    subset: orphanet_rare {source="Orphanet:91364"}
    subset: rare
    synonym: "non-specific idiopathic interstitial pneumonia" EXACT [Orphanet:91364]
    synonym: "nonspecific interstitial pneumonia" EXACT [DOID:2801, MONDO:0002431, NCIT:C35717]
    synonym: "NSIP" EXACT ABBREVIATION [DOID:2801, NCIT:C35717, Orphanet:91364]
    ```
- ICD11
    ```
    id: icd11:1319771917
    name: Idiopathic non-specific interstitial pneumonia
    def: "Idiopathic nonspecific interstitial pneumonia is one type of idiopathic interstitial pneumonias \(IIPs\). Idiopathic nonspecific interstitial pneumonia has been associated with many medical conditions\, although a causal link has not been identified. In addition\, it may be idiopathic. Nonspecific Interstitial Pneumonia\(NSIP\) originated as a histopathologic categorization reserved for surgical lung biopsies not demonstrating a clearly identifiable pattern. The histopathologic pattern of NSIP is found in a wide variety of diseases of known cause \(e.g.\, hypersensitivity pneumonitis\, drug-related\, acquired immunodeficiency syndrome \[AIDS\]–related\, and collagen-vascular diseases\). Moreover\, many patients diagnosed with idiopathic NSIP meet the case definition of undifferentiated connective tissue disease\, suggesting that idiopathic NSIP might actually be an autoimmune disease. In addition\, it is estimated that up to 15% to 20% of patients who present with a chronic ILD either have an occult connective tissue disease or subsequently develop a clinically overt connective tissue disease. In this particular group of patients\, the initial clinical presentation may be essentially indistinguishable from that of several IIPs \(especially NSIP and UIP\)."
    synonym: "NSIP - [Nonspecific Interstitial Pneumonia]" []
    is_a: icd11:1297512293
    ```
- Mondo has a synonym that is the same as the name used in ICD11 so this seems correct

## case 2 

- DOID:
    ```
    id: DOID:11387
    name: epidural abscess
    def: "A central nervous system disease that is characterized by a collection of pus (infected material) between the outer covering of the brain and spinal cord and the bones of the skull or spine and is caused by infection in the area between the bones of the skull or spine, and the membranes covering the brain and spinal cord (meninges)." [url:http\://www.nlm.nih.gov/medlineplus/ency/article/001416.htm]
    synonym: "abscess epidural" EXACT []
    xref: MESH:D020802
    xref: SNOMEDCT_US_2023_03_01:310671007
    xref: UMLS_CUI:C0270629
    is_a: DOID:331 ! central nervous system disease
    ```
- ICD11
    ```
    id: icd11:1299705786
    name: Intraspinal epidural abscess
    def: "A condition of the epidural space\, caused by an infection with a bacterial\, viral\, fungal\, or parasitic source. This condition is characterised by a focal accumulation of purulent material within the epidural space. This condition presents with symptoms depending on the location of the abscess. Transmission is through haematogenous spread of the infectious agent commonly from a cutaneous or mucosal source."
    synonym: "epidural abscess" []
    synonym: "epidural abscess of spinal cord" []
    synonym: "epidural embolic abscess of spinal cord" []
    synonym: "epidural embolic abscess of spinal cord, any part" []
    synonym: "spinal epidural abscess" []
    property_value: skos:exactMatch icd11.code:1D03.4
    is_a: icd11:1483190070
    ```
- ICD11 has the name from DOID as a synonym, so I think this is correct

## case 3

- EFO
    ```
    id: efo:EFO_0010638
    name: atopic asthma
    def: "An asthma that is characterized by symptoms that are triggered by an allergic reaction caused by inhaled allergens such as dust mite allergen, pet dander, pollen and mold." []
    synonym: "allergic asthma" EXACT []
    ```
- icd11
    ```
    id: icd11:1779929269
    name: Allergic asthma\, uncomplicated
    synonym: "allergic asthma NOS" []
    property_value: skos:exactMatch icd11.code:CA23.02
    is_a: icd11:1870104478
    ```

- EFO has the same term as a synonym so this seems to be correct

## case 4

- ICD11
    ```
    id: icd11:1802011368
    name: Autosomal dominant osteopetrosis type 1
    def: "Autosomal dominant osteopetrosis type I \(ADO I\) is a sclerosing bone disorder characterised by skeletal densification that predominantly involves the cranial vault. Clinical signs include chronic bone pain and disorders of the cranial nerves \(trigeminal neuralgia\, facial palsy\, hearing loss\)."
    is_a: icd11:1498426606
    ```
- Gard
    ```
    id: gard:4151
    name: Autosomal dominant osteopetrosis 1
    synonym: "LRP5 osteopetrosis (disease)" []
    synonym: "OPTA1" []
    synonym: "autosomal dominant osteopetrosis type 1" []
    synonym: "osteopetrosis (disease) caused by mutation in LRP5" []
    synonym: "osteopetrosis, autosomal dominant type 1" []
    is_a: uncategorized

    ```
- Gard lists the term from ICD as a synonym so I think this is correct

## case 5

- NCIT
    ```
    id: NCIT:C98993
    name: Monosomy 13q Syndrome
    def: "A rare syndrome that is characterized by the partial deletion of the long arm of chromosome 13. Signs and symptoms include low birth weight, craniofacial malformations, hands and feet malformations, and mental and psychomotor retardation." [] {NCIT:P378="NCI"}
    subset: NCIT:C90259
    subset: NCIT:C99147
    synonym: "13q Deletion Syndrome" EXACT [] {NCIT:P383="SY", NCIT:P384="NCI"}
    synonym: "13q Syndrome" EXACT [] {NCIT:P383="SY", NCIT:P384="caDSR"}
    synonym: "Monosomy 13q Syndrome" EXACT [] {NCIT:P383="PT", NCIT:P384="NICHD"}
    synonym: "Monosomy 13q Syndrome" EXACT [] {NCIT:P383="PT", NCIT:P384="NCI"}
    is_a: NCIT:C28193 ! Syndrome
    ```
- orphanet.ordo
    ```
    id: ORDO:Orphanet_1587
    name: Monosomy 13q14 syndrome
    xref: ICD-10:Q93.5 {ECO:0000218="- NTBT (ORPHAcode is narrower than the targeted code used to represent it).\n- Attributed code (ICD-10/ICD-11: The targeted code is assigned by Orphanet)."}
    xref: ICD-11:LD44.D {ECO:0000218="- NTBT (ORPHAcode is narrower than the targeted code used to represent it).\n- Index term (ICD-10: Orphanet entity listed in the ICD-10 Index. ICD-11: Orphanet entity listed in the ICD-11 Foundation)."}
    xref: OMIM:613884 {ECO:0000218="E (Exact mapping: the two concepts are equivalent)"}
    xref: UMLS:C4749304 {ECO:0000218="E (Exact mapping: the two concepts are equivalent)"}
    is_a: ORDO:Orphanet_377789 ! Malformation syndrome
    is_a: ORDO:Orphanet_557493 ! disorder
    relationship: BFO:0000050 ORDO:Orphanet_262101 ! part_of Partial deletion of the long arm of chromosome 13 syndrome
    relationship: BFO:0000050 ORDO:Orphanet_98574 ! part_of Syndromic epicanthus
    relationship: BFO:0000050 ORDO:Orphanet_98642 ! part_of Chromosomal anomaly with cataract
    property_value: efo:alternative_term "Del(13)(q14)" xsd:string
    property_value: efo:alternative_term "Deletion 13q14" xsd:string
    property_value: efo:definition "Monosomy 13q14 is a rare chromosomal anomaly syndrome, resulting from a partial deletion of the long arm of chromosome 13, characterized by developmental delay, variable degrees of intellectual disability, retinoblastoma and craniofacial dysmorphism (incl. micro/dolichocephaly, high and broad forehead, prominent eyebrows, thick, anteverted ear lobes, short nose with a broad nasal bridge and bulbous tip, prominent philtrum, large mouth with thin upper lip and thick, everted lower lip). Other features reported include high birth weight, macrocephaly, pinealoma, hepatomegaly, inguinal hernia and cryptorchidism." xsd:string
    property_value: efo:definition_citation "Orphanet" xsd:string
    ```
- seems to be a narrow match 

## case 6

- MONDO
    ```
    id: MONDO:0010078
    name: spondyloperipheral dysplasia
    def: "A condition caused by by truncating mutations in the C-propeptide of COL2A1. Like other type II collagen disorders it is characterized by short stature, platyspondyly and epiphyseal dysplasia. A distinguishing feature is the presence of brachydactyly with a prominent first toe." [https://orcid.org/0000-0001-8612-1062]
    subset: clingen {source="MONDO:CLINGEN"}
    subset: gard_rare {source="GARD:4994", source="MONDO:GARD"}
    subset: ordo_disorder {source="Orphanet:1856"}
    subset: orphanet_rare {source="Orphanet:1856"}
    subset: otar {source="MONDO:OTAR"}
    subset: rare
    synonym: "spondyloperipheral dysplasia" EXACT [DOID:0112195, NCIT:C135088, OMIM:271700]
    synonym: "spondyloperipheral dysplasia with short ulna" RELATED []
    synonym: "spondyloperipheral dysplasia-short ulna syndrome" EXACT [DOID:0112195, https://orcid.org/0000-0001-8612-1062, Orphanet:1856]
    ```
- icd11
    ```
    id: icd11:67659062
    name: Spondyloperipheral dysplasia - short ulna
    is_a: icd11:1977414063
    ```
- this seems to be a narrow match 

## let's check case 7

- ICD11
    ```
    id: icd11:1981283512
    name: Localised porokeratosis
    def: "Porokeratosis confined to a localised area of the skin surface. It presents as single or multiple papules or plaques which may coalesce into giant verrucous forms."
    synonym: "Porokeratosis of Mibelli" []
    is_a: icd11:29524620
    ```

- MONDO
    ```
    id: MONDO:0008290
    name: porokeratosis 1, Mibelli type
    subset: gard_rare {source="GARD:15108", source="MONDO:GARD"}
    subset: rare
    synonym: "POROK1" EXACT ABBREVIATION [MONDO:Lexical, OMIM:175800]
    synonym: "porokeratosis 1, MIBELLI type" RELATED [MONDO:Lexical]
    synonym: "porokeratosis 1, multiple types" RELATED []
    synonym: "porokeratosis of Mibelli" RELATED []
    xref: GARD:15108 {source="MONDO:GARD"}
    xref: OMIM:175800 {source="MONDO:equivalentTo"}
    is_a: MONDO:0019141 {source="Orphanet:735/btnt"} ! porokeratosis of Mibelli
    ```
- They both have `Porokeratosis of Mibelli` as a synonym, but the MONDO class also appears to be a child of the class named by that synonym (`MONDO:0019141`). I think the match is correct, but it is unclear.

## let's check case 8

- ICD11
    ```
    id: icd11:1313183227
    name: Lobular carcinoma in situ of breast
    synonym: "lobular carcinoma in situ unspecified site" []
    synonym: "noninfiltrating lobular carcinoma of breast" []
    synonym: "noninfiltrating lobular carcinoma unspecified site" []
    property_value: skos:exactMatch icd11.code:2E65.0
    is_a: icd11:1264360394
    ```
- NCIT
    ```
    name: Breast Lobular Carcinoma In Situ
    def: "A non-invasive adenocarcinoma of the breast characterized by a proliferation of monomorphic cells completely filling the lumina.  The overall lobular architecture is preserved.  It is frequently multifocal (90% in some series) and bilateral.  It seldom becomes invasive; however there is an increased risk of infiltrating ductal adenocarcinoma." [] {NCIT:P378="NCI"}
    synonym: "Lobular Carcinoma in situ of Breast" EXACT [] {NCIT:P383="SY", NCIT:P384="NCI"}
    synonym: "Lobular carcinoma in situ of breast" EXACT [] {NCIT:P383="PT", NCIT:P385="D05.0", NCIT:P384="mCode", NCIT:P386="ICD-10 CM"}
    ```
- this seems to be correct since they share synonyms

## check case 9

- ICD11
    ```
    id: icd11:544479555
    name: Pulmonary eosinophilia
    def: "Pulmonary eosinophilia are a heterogeneous group of disorders that share the feature of abnormally increased numbers of eosinophils."
    synonym: "Weingarten's syndrome" []
    synonym: "eosinophilic lung infiltrate" []
    property_value: skos:exactMatch icd11.code:CB02
    is_a: icd11:111658096
    ```
- EFO
    ```
    id: efo:EFO_0007257
    name: eosinophilic pneumonia
    def: "A pneumonia in which certain type of white blood cell called an eosinophil accumulates in the lung. These cells cause disruption of the normal air spaces (alveoli) where oxygen is extracted from the atmosphere. It is caused by certain medications or environmental triggers, parasitic infections, and cancer. The most common symptoms include cough, fever, difficulty breathing, and sweating at night." []
    def: "An inflammatory lung disorder characterized by an increased number of eosinophils in the lungs. The majority of cases are idiopathic, without identifiable cause. In a minority of cases, medications, fungal infections, and environmental triggers have been implicated. It manifests as acute or chronic. Acute eosinophilic pneumonia is a severe and rapidly progressing pneumonia that may lead to respiratory failure requiring mechanical ventilation. Chronic eosinophilic pneumonia follows a slower course and manifests as fever, dyspnea, cough, and weight loss." [NCIT:C35150]
    subset: nord_rare {source="MONDO:NORD"}
    subset: otar {source="MONDO:OTAR"}
    subset: rare
    synonym: "eosinophilic pneumonia" EXACT [] {comment="preferred label from MONDO"}
    synonym: "eosinophilic pneumonia" EXACT [DOID:5870, NCIT:C35150]
    synonym: "Pneumonia, eosinophilic" EXACT []
    synonym: "pneumonia, eosinophilic" EXACT [DOID:5870]
    synonym: "Pulmonary Eosinophilia" EXACT []
    ```
- EFO has the exact name as the one used in ICD11 as a synonym so I think this is correct

## check case 10

- ICD11
    ```
    id: icd11:1589625540
    name: Chondromalacia patellae
    def: "A disease of the knee joint\, caused by damage to the cartilage under the patella. This disease is characterised by pain in the front of the knee that worsens when walking up or down stairs. This disease may be associated with injury or overuse."
    synonym: "patellofemoral chondromalacia" []
    synonym: "patellofemoral pain syndrome" []
    property_value: skos:exactMatch icd11.code:FB82.00
    is_a: icd11:29494765
    is_a: icd11:547071520
    ```
- DOID
    ```
    [Term]
    id: DOID:14284
    name: patellofemoral pain syndrome
    xref: MESH:D046788
    xref: UMLS_CUI:C0877149
    is_a: DOID:381 ! arthropathy
    ```
- ICD11 has the exact name as the one used in DOID as a synonym so I think this is correct

## let's make sure that there are no overlaps between direct matches and Semra
- here is an example direct match from logmap


In [17]:
# Look up a single direct match by its source and target identifiers;
# predicates passed as separate arguments are combined with AND.
direct_matches.filter(
    pl.col('source identifier') == "orphanet.ordo:85278",
    pl.col('target identifier') == "gard:10572",
)

source prefix,target prefix,source identifier,target identifier,source name,target name,confidence,edit_similarity
str,str,str,str,str,str,f64,f32
"""orphanet.ordo""","""gard""","""orphanet.ordo:85278""","""gard:10572""","""Christianson syndrome""","""Christianson syndrome""",0.5,1.0


- it is not present in the Semra dataset

In [18]:
semra_df = pl.read_csv(
    'resources/semra_disease_landscape_mappings.tsv',
    separator='\t',
)
# Keep rows that mention both identifiers, whichever one is the subject
# and whichever one is the object.
semra_df.filter(
    (pl.col("subject_id") == 'orphanet.ordo:85278') | (pl.col("object_id") == 'orphanet.ordo:85278'),
    (pl.col("subject_id") == 'gard:10572') | (pl.col("object_id") == 'gard:10572'),
)

subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_set,mapping_set_version,mapping_set_license,mapping_set_confidence,author_id,comment
str,str,str,str,str,str,str,str,str,f64,str,str


- It is worth noting, however, that mappings between `orphanet.ordo:85278` and `doid:0060825`, as well as between `gard:10572` and `doid:0060825`, are in the dataset, so this mapping is consistent with those that can be inferred from Semra.

In [19]:
# Rows linking the ORDO class and the DOID class, in either direction.
semra_df.filter(
    (pl.col("subject_id") == 'orphanet.ordo:85278') | (pl.col("object_id") == 'orphanet.ordo:85278'),
    (pl.col("subject_id") == 'doid:0060825') | (pl.col("object_id") == 'doid:0060825'),
)

subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_set,mapping_set_version,mapping_set_license,mapping_set_confidence,author_id,comment
str,str,str,str,str,str,str,str,str,f64,str,str
"""doid:0060825""","""Christianson syndrome""","""oboinowl:hasDbXref""","""orphanet.ordo:85278""","""Christianson syndrome""","""semapv:UnspecifiedMatching""","""doid""","""http://purl.obolibrary.org/obo…","""CC0-1.0""",0.99,,
"""orphanet.ordo:85278""","""Christianson syndrome""","""oboinowl:hasDbXref""","""doid:0060825""","""Christianson syndrome""","""semapv:MappingInversion""","""doid""",,,1.0,,"""doid:0060825 orphanet.ordo:852…"
"""doid:0060825""","""Christianson syndrome""","""skos:exactMatch""","""orphanet.ordo:85278""","""Christianson syndrome""","""semapv:BackgroundKnowledgeBase…","""doid""",,,0.9995,,"""doid:0060825 orphanet.ordo:852…"
"""orphanet.ordo:85278""","""Christianson syndrome""","""skos:exactMatch""","""doid:0060825""","""Christianson syndrome""","""semapv:MappingInversion""","""doid""",,,1.0,,"""doid:0060825 orphanet.ordo:852…"


In [20]:
# Rows linking the GARD class and the DOID class, in either direction.
semra_df.filter(
    (pl.col("subject_id") == 'gard:10572') | (pl.col("object_id") == 'gard:10572'),
    (pl.col("subject_id") == 'doid:0060825') | (pl.col("object_id") == 'doid:0060825'),
)

subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_set,mapping_set_version,mapping_set_license,mapping_set_confidence,author_id,comment
str,str,str,str,str,str,str,str,str,f64,str,str
"""doid:0060825""","""Christianson syndrome""","""oboinowl:hasDbXref""","""gard:10572""","""Christianson syndrome""","""semapv:UnspecifiedMatching""","""doid""","""http://purl.obolibrary.org/obo…","""CC0-1.0""",0.99,,
"""gard:10572""","""Christianson syndrome""","""oboinowl:hasDbXref""","""doid:0060825""","""Christianson syndrome""","""semapv:MappingInversion""","""doid""",,,1.0,,"""doid:0060825 gard:10572"""
