### AUTHORS: All explorations by Selina Pi

# Rare disease analyses

In [2]:
!pip install openpyxl



In [3]:
import numpy as np
import pandas as pd
import os 
import sys
sys.path.append('..')

In [4]:
from SIMP_LLM.DRKG_loading   import  read_tsv, print_head

In [5]:
DATA_DIR           = os.path.join("../data")
verbose            =  True 

## Rare diseases

Prevalence of rare diseases from Orphanet 2022: https://www.orpha.net/orphacom/cahiers/docs/GB/Prevalence_of_rare_diseases_by_alphabetical_list.pdf

Downloaded datasets:

*  SNOMED CT-Orphanet nomenclature map from https://www.orphadata.com/alignments/ (direct link: http://www.orphadata.com/data/nomenclature/ORPHA-SNOMEDCT_Mapping_File_production.xlsx), accessed 5/22/23

* Rare diseases and alignment with ICD-10, ICD-11, OMIM, UMLS, MeSH, MedDRA and GARD from https://www.orphadata.com/alignments/ (direct link: https://www.orphadata.com/data/xml/en_product1.xml, renamed en_product1-Orphadata.xml), accessed 5/22/23


In [5]:
relation_file = 'ORPHA-SNOMEDCT_Mapping_File_production.xlsx'
df = pd.read_excel(os.path.join(DATA_DIR,relation_file), skiprows=2)
df

Unnamed: 0,ORPHAcode,ORPHA Main term,SNOMED concept ID
0,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,726021008
1,6,3-methylcrotonyl-CoA carboxylase deficiency,13144005
2,7,3C syndrome,718556007
3,8,"47,XYY syndrome",50749006
4,9,Tetrasomy X,10567003
...,...,...,...
6431,589905,PHIP-related behavioral problems-intellectual ...,1208987006
6432,590539,Isolated melanotic schwannoma,404024000
6433,592570,TRAF7-associated heart defect-digital anomalie...,1208998007
6434,595356,Localized dystrophic epidermolysis bullosa,254186008


### Rare diseases and alignment with ICD-10 etc.

#### Test run with modular code

In [6]:
from SIMP_LLM.raredisease_loading import get_orphan_data

In [9]:
# Test with wrong file path
wrongpath = os.path.join(DATA_DIR, 'en_product1-Orphadat.xml')
print(os.path.isfile(wrongpath))
orphan_names, orphan_codes = get_orphan_data(wrongpath, verbose=verbose)

False
Orphanet file not found in this directory. May need to download from Google Drive data folder.


FileNotFoundError: [Errno 2] No such file or directory: '../data/en_product1-Orphadat.xml'

In [10]:
# Test with correct file path
orphan_names, orphan_codes = get_orphan_data(os.path.join(DATA_DIR, 'en_product1-Orphadata.xml'), verbose=verbose)


 Long-form orphan disease data (before processing):

+----+-----------+------------------------------------------------------------------------+
|    | cols      | data                                                                   |
|----+-----------+------------------------------------------------------------------------|
|  0 | Orphacode | 166024                                                                 |
|  1 | Name      | Multiple epiphyseal dysplasia, Al-Gazali type                          |
|  2 | Synonym   | Multiple epiphyseal dysplasia-macrocephaly-distinctive facies syndrome |
|  3 | Source    | ICD-10                                                                 |
|  4 | Reference | Q77.3                                                                  |
+----+-----------+------------------------------------------------------------------------+

 Long-form orphan disease data (after processing):

+----+-------------------------+---------------------------------

In [7]:
orphan_codes

Unnamed: 0,Orphacode,Name,code_source,code,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",ICD-10,Q77.3,NTBT (ORPHA code's Narrower Term maps to a Bro...,Attributed (The ICD code is attributed by Orph...,Validated
1,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",OMIM,607131,E (Exact mapping: the two concepts are equival...,,Validated
2,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",UMLS,C1846722,E (Exact mapping: the two concepts are equival...,,Validated
3,58,Alexander disease,OMIM,203450,E (Exact mapping: the two concepts are equival...,,Validated
4,58,Alexander disease,MeSH,D038261,E (Exact mapping: the two concepts are equival...,,Validated
...,...,...,...,...,...,...,...
30794,620368,EGF-related primary hypomagnesemia with intell...,UMLS,C5681825,E (Exact mapping: the two concepts are equival...,,Validated
30795,617910,Conjunctival malignant melanoma,UMLS,C0346360,E (Exact mapping: the two concepts are equival...,,Validated
30796,619948,Early-onset autoimmunity-autoinflammation-immu...,UMLS,C5680416,E (Exact mapping: the two concepts are equival...,,Validated
30797,619360,NON RARE IN EUROPE: Isolated hereditary persis...,ICD-10,D56.4,E (Exact mapping: the two concepts are equival...,Specific code (The ORPHA code has its own code...,Validated


#### Original messy code below

In [None]:
import xml.etree.ElementTree as ET

relation_file = 'en_product1-Orphadata.xml'

tree = ET.parse(os.path.join(DATA_DIR,relation_file))
root = tree.getroot()[1]

In [31]:

data = []
cols = []

ignoreElems = ['DisorderFlagList', 'DisorderType', 'DisorderGroup','DisorderDisorderAssociationList']
passElems = ['Disorder', 'Expertlink', 'Synonymlist', 'Externalreferencelist', 'Externalreference']
attribElems = []
appendElems = ['Synonym']
endElems = ['Externalreferencelist']

def printRecur(root):
    """Recursively adds elements to list from XML file."""
    for i, child in enumerate(root):
        if child.tag in ignoreElems:            # Fully ignore some elements and their children
            continue
        if child.tag.title() not in passElems:  # Look at child elements and add to list unless specified (doesn't work yet)
            if child.tag.title() in appendElems and i>0:
                data[-1] = data[-1] + '|' + child.attrib.get('name', child.text)
                # print(data[-1]) # Just to see how synonym string gets built
            else:
                cols.append(child.tag.title())
                if child.tag in attribElems:
                    data.append(list(child.attrib.values())[0])
                else:
                    data.append(child.attrib.get('name', child.text))
        printRecur(child)                       # Look at children of child element
    if root.tag.title() in endElems:            # Mark end of specified sections for later use
            cols.append('END_' + root.tag.title())
            data.append('\n')

printRecur(root)

long_df = pd.DataFrame([])
long_df['cols'] = cols
long_df['data'] = data
long_df

Unnamed: 0,cols,data
0,Orphacode,166024
1,Name,"Multiple epiphyseal dysplasia, Al-Gazali type"
2,Synonym,Multiple epiphyseal dysplasia-macrocephaly-dis...
3,Source,ICD-10
4,Reference,Q77.3
...,...,...
327818,Disordermappingicdrelation,
327819,Disordermappingvalidationstatus,\n
327820,Name,Validated
327821,END_Externalreferencelist,\n


In [64]:
# Clean long form orphan disease data
long_df_processed = long_df.copy()
long_df_processed = long_df_processed.dropna()
disease_id = 'Orphacode'

# Flag disease ID
long_df_processed['disease_id'] = np.where(long_df_processed['cols'] == disease_id, long_df_processed['data'], None)
long_df_processed['disease_id'] = long_df_processed['disease_id'].ffill()

# Add code source
long_df_processed['code_source'] = np.where(long_df_processed['cols'] == 'Source', long_df_processed['data'], None)
long_df_processed['code_source'] = np.where(long_df_processed['cols'] == 'END_Externalreferencelist', 'SKIP', long_df_processed['code_source'])
long_df_processed['code_source'] = long_df_processed['code_source'].ffill()

# Add code
long_df_processed['code'] = np.where(long_df_processed['cols'] == 'Reference', long_df_processed['data'], None)
long_df_processed['code'] = np.where(long_df_processed['cols'] == 'END_Externalreferencelist', 'SKIP', long_df_processed['code'])
long_df_processed['code'] = long_df_processed['code'].ffill()

# Rename 'Name' rows with true name 1 row up
long_df_processed['cols'] = np.where((long_df_processed['cols'] == 'Name') & (long_df_processed['data'].shift(1).str.startswith('\n')), long_df_processed['cols'].shift(1), long_df_processed['cols'])

# Remove \n rows
long_df_processed = long_df_processed[~long_df_processed['data'].str.startswith('\n')]

# Rename cols associated with specific source and remove source columns
# long_df_processed['cols'] = np.where(long_df_processed['code_source'].isin([None, 'SKIP']), long_df_processed['cols'], long_df_processed['code_source'] + '_' + long_df_processed['cols'])
long_df_processed = long_df_processed[~long_df_processed['cols'].str.contains('Source')]

# Manually consolidate 'definition' entries
if long_df_processed[long_df_processed['cols']=='Textsectiontype'].drop_duplicates(subset='data').shape[0] == 1:
    print('long_df_processed')
    long_df_processed['cols'] = np.where(long_df_processed['cols'] == 'Contents', 'Definition', long_df_processed['cols'])
    long_df_processed = long_df_processed[long_df_processed['cols'] != 'Textsectiontype']

long_df_processed[:20]
# # long_df_processed[long_df_processed.isnull().any(axis=1)].drop_duplicates()
# # long_df_processed[480:500]
# long_df_processed[long_df_processed['cols']=='Externalreferencelist']
# long_df_processed[20:50]

test


Unnamed: 0,cols,data,disease_id,code_source,code
0,Orphacode,166024,166024,,
1,Name,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,,
2,Synonym,Multiple epiphyseal dysplasia-macrocephaly-dis...,166024,,
4,Reference,Q77.3,166024,ICD-10,Q77.3
6,Disordermappingrelation,NTBT (ORPHA code's Narrower Term maps to a Bro...,166024,ICD-10,Q77.3
8,Disordermappingicdrelation,Attributed (The ICD code is attributed by Orph...,166024,ICD-10,Q77.3
10,Disordermappingvalidationstatus,Validated,166024,ICD-10,Q77.3
12,Reference,607131,166024,OMIM,607131
14,Disordermappingrelation,E (Exact mapping: the two concepts are equival...,166024,OMIM,607131
17,Disordermappingvalidationstatus,Validated,166024,OMIM,607131


In [65]:
# Check for duplicates
check_dupe = long_df_processed.groupby(by=['disease_id', 'code_source', 'cols']).agg(
    ct = ('data', 'count')
).reset_index()

check_dupe = check_dupe[(check_dupe['ct']>1)]

check_dupe.drop_duplicates(subset=['code_source', 'cols'])
# check_dupe[check_dupe['code_source'].isin(['SKIP', None])]

Unnamed: 0,disease_id,code_source,cols,ct
36,100,OMIM,Disordermappingrelation,2
37,100,OMIM,Disordermappingvalidationstatus,2
38,100,OMIM,Reference,2
99,100006,ICD-10,Disordermappingicdrelation,2
100,100006,ICD-10,Disordermappingrelation,2
101,100006,ICD-10,Disordermappingvalidationstatus,2
102,100006,ICD-10,Reference,2
103,100006,MeSH,Disordermappingrelation,2
104,100006,MeSH,Disordermappingvalidationstatus,2
105,100006,MeSH,Reference,2


In [66]:
# Get orphan names
orphan_names = long_df_processed[long_df_processed['code_source'].isin([None, 'SKIP'])]
colnames = orphan_names['cols'].drop_duplicates().to_list()
orphan_names = pd.pivot(orphan_names,  index='disease_id', columns='cols', values='data').reindex(colnames, axis=1)
# orphan_names[orphan_names['Info'].isna()==False]
orphan_names


cols,Orphacode,Name,Synonym,Definition,Info
disease_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,10,"48,XXYY syndrome",,A rare sex chromosome number anomaly disorder ...,
100,100,Ataxia-telangiectasia,Louis-Bar syndrome,A rare disorder characterized by the associati...,
1000,1000,Ocular albinism with late-onset sensorineural ...,Ocular albinism with late-onset sensorineural ...,Ocular albinism with late-onset sensorineural ...,
100000,100000,Reticular perineurioma,,,
100001,100001,Sclerosing perineurioma,,,
...,...,...,...,...,...
99989,99989,Intermediate DEND syndrome,Developmental delay-epilepsy-neonatal diabetes...,"A rare, genetic, neonatal diabetes mellitus sy...",
99990,99990,Brill-Zinsser disease,Brill disease|Recrudescent typhus,,
99991,99991,Relapsing epidemic typhus,,,
99994,99994,Complex regional pain syndrome type 2,Causalgia,"Complex regional pain syndrome type 2 (CRPS2),...",


In [77]:
# Get orphan codes
_orphan_codes = long_df_processed[~long_df_processed['code_source'].isin([None, 'SKIP'])].merge(orphan_names[['Orphacode', 'Name']], how='left', left_on='disease_id', right_on='Orphacode')
_orphan_codes['id_code'] = _orphan_codes['disease_id'] + _orphan_codes['code_source'] + _orphan_codes['code']
colnames = _orphan_codes['cols'].drop_duplicates().to_list()
orphan_codes = pd.pivot(_orphan_codes,  index='id_code', columns='cols', values='data').reindex(colnames, axis=1).reset_index()
col_list = ['Orphacode', 'Name', 'code_source', 'id_code']
orphan_codes = _orphan_codes[col_list].drop_duplicates().merge(orphan_codes, how='left', on='id_code')
orphan_codes = orphan_codes.drop(columns=['id_code']).rename(columns={'Reference':'code'})
orphan_codes

Unnamed: 0,Orphacode,Name,code_source,code,Disordermappingrelation,Disordermappingicdrelation,Disordermappingvalidationstatus
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",ICD-10,Q77.3,NTBT (ORPHA code's Narrower Term maps to a Bro...,Attributed (The ICD code is attributed by Orph...,Validated
1,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",OMIM,607131,E (Exact mapping: the two concepts are equival...,,Validated
2,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",UMLS,C1846722,E (Exact mapping: the two concepts are equival...,,Validated
3,58,Alexander disease,OMIM,203450,E (Exact mapping: the two concepts are equival...,,Validated
4,58,Alexander disease,MeSH,D038261,E (Exact mapping: the two concepts are equival...,,Validated
...,...,...,...,...,...,...,...
30794,620368,EGF-related primary hypomagnesemia with intell...,UMLS,C5681825,E (Exact mapping: the two concepts are equival...,,Validated
30795,617910,Conjunctival malignant melanoma,UMLS,C0346360,E (Exact mapping: the two concepts are equival...,,Validated
30796,619948,Early-onset autoimmunity-autoinflammation-immu...,UMLS,C5680416,E (Exact mapping: the two concepts are equival...,,Validated
30797,619360,NON RARE IN EUROPE: Isolated hereditary persis...,ICD-10,D56.4,E (Exact mapping: the two concepts are equival...,Specific code (The ORPHA code has its own code...,Validated


In [301]:
# check for repeated?
check = 'Synonym'


# Rename 'Name' rows with true name 1 row up
long_df_processed[(long_df_processed['cols']==check) & (long_df_processed['cols'].shift(1) == check)]

long_df.iloc[357:361]

Unnamed: 0,cols,data
357,Name,Multiple sulfatase deficiency
358,Synonym,"Juvenile sulfatidosis, Austin type|MSD|Mucosul..."
359,Source,ICD-10
360,Reference,E75.2


In [298]:
interestElems = ['Synonymlist']
appendElems = ['Synonym']

data=[]
cols=[]
def testRecur(root):
    """Recursively adds elements to list."""
    for i, child in enumerate(root):
        if child.tag.title() in appendElems:  # Look at child elements and add to list unless specified (doesn't work yet)
            # if root.tag.title() in interestElems:
            print(i)
            print(child.attrib.get('name', child.text))
            # if root.tag.title() in interestElems and i>0:
            #     data[-1] = data[-1] + '|' + child.attrib.get('name', child.text)
            #     print('test')
            # else:
            #     cols.append(child.tag.title())
            #     if child.tag in attribElems:
            #         data.append(list(child.attrib.values())[0])
            #     else:
            #         data.append(child.attrib.get('name', child.text))
        testRecur(child)                       # Look at children of child element

testRecur(root[:15])


0
Multiple epiphyseal dysplasia-macrocephaly-distinctive facies syndrome
0
AxD
0
Lysosomal alpha-D-mannosidase deficiency
0
Aspartylglucosaminidase deficiency
0
Juvenile sulfatidosis, Austin type
1
MSD
2
Mucosulfatidosis
0
Beta-mannosidase deficiency
0
Fetal-onset olivopontocerebellar hypoplasia
1
PCH5
0
ACY2 deficiency
1
Aminoacylase 2 deficiency
2
Aspartoacylase deficiency
3
Spongy degeneration of the brain
0
Fatal infantile encephalopathy with olivopontocerebellar hypoplasia
1
Olivopontocerebellar hypoplasia
2
PCH4


In [276]:
# check for repeated?
check = 'Orphacode'

# Rename 'Name' rows with true name 1 row up
long_df_processed[(long_df_processed['cols']==check) & (long_df_processed['cols'].shift(1) == check)]


Unnamed: 0,cols,data,code_source


In [308]:
long_df_processed[long_df_processed['cols']=='Info']

Unnamed: 0,cols,data,code_source
506,Info,This entity has been excluded from the Orphane...,SKIP
2610,Info,This term does not characterize a disease but ...,SKIP
2677,Info,This disease is not rare in Europe. It does no...,SKIP
3023,Info,This term does not characterize a disease but ...,SKIP
3281,Info,This disease is not rare in Europe. It does no...,SKIP
...,...,...,...
363030,Info,This entity has been obsoleted from the Orphan...,SKIP
363048,Info,This entity has been obsoleted from the Orphan...,SKIP
363198,Info,This entity has been obsoleted from the Orphan...,SKIP
363216,Info,This entity has been obsoleted from the Orphan...,SKIP


In [307]:
long_df_processed.groupby('cols').agg(
    ct=('data', 'count')
)



Unnamed: 0_level_0,ct
cols,Unnamed: 1_level_1
Definition,6686
Disorder,10705
Disorderdisorderassociationtype,2157
GARD_Disordermappingrelation,3871
GARD_Disordermappingvalidationstatus,3871
GARD_Reference,3871
ICD-10_Disordermappingicdrelation,8500
ICD-10_Disordermappingrelation,8500
ICD-10_Disordermappingvalidationstatus,8500
ICD-10_Reference,8500


In [68]:
root[0][0].text

'166024'

## Repurposed drugs

Data from Clue: The Drug Repurposing Hub (https://clue.io/repurposing#download-data)

Latest version: 3/24/2020 <br>
Access date: 5/22/2023

In [7]:
from SIMP_LLM.raredisease_loading import read_and_process_rep_drugs

In [5]:
!wget https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20200324.txt
# Since this didn't work, downloaded manually to Data folder

zsh:1: command not found: wget


In [5]:
# def read_and_process_rep_drugs(relation_file, verbose=False):
#   """  
#   Process OMIM lookup table in the following ways:
#   - Remove extra variables
#   - Clean disease name
#   - Add "Disease::OMIM:" in front of OMIM ID to match DRKG format
#   """
#   df = pd.read_csv(os.path.join(DATA_DIR,relation_file), sep="\t", comment='!')

#   if verbose:
#     print(f"\n {relation_file}  Dataframe (Before processing):\n")
#     print_head(df)

#     print(f"\n {relation_file}  Dataframe (After processing):\n")
#     print_head(df)
#   return df




In [11]:
relation_file = 'repurposing_drugs_20200324.txt'
rep_drugs_df       =  read_and_process_rep_drugs(os.path.join(DATA_DIR, relation_file), verbose=verbose)    # Process entity names for clarity (e.g., F8 -> Gene F8) 
rep_drugs_df


 ../data/repurposing_drugs_20200324.txt  Dataframe (Before processing):

+----+------------------------------+------------------+---------------------------------+----------------------------------------------------------------------------------------------+----------------------+---------------------+
|    | pert_iname                   | clinical_phase   | moa                             | target                                                                                       | disease_area         | indication          |
|----+------------------------------+------------------+---------------------------------+----------------------------------------------------------------------------------------------+----------------------+---------------------|
|  0 | (R)-(-)-apomorphine          | Launched         | dopamine receptor agonist       | ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|DRD5|HTR1A|HTR1B|HTR1D|HTR2A|HTR2B|HTR2C|HTR5A | neurology/psychiatry | Parkinson's Disease |
| 

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,
2,(R)-baclofen,Phase 3,benzodiazepine receptor agonist,GABBR1|GABBR2,,
3,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,
4,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
...,...,...,...,...,...,...
6793,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,
6794,80841-78-7,Preclinical,,,,
6795,9-aminoacridine,Preclinical,,,,
6796,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,


In [12]:
rep_drugs_df[~rep_drugs_df['indication'].isna()]

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
32,abacavir,Launched,nucleoside reverse transcriptase inhibitor,,infectious disease,human immunodeficiency virus (HIV-1)
34,abamectin,Launched,benzodiazepine receptor agonist,GABBR1|GABBR2,infectious disease,gastrointestinal parasites
39,abemaciclib,Launched,CDK inhibitor,CDK4|CDK6,oncology,breast cancer
40,abiraterone,Launched,androgen biosynthesis inhibitor,CYP11B1|CYP17A1,oncology,prostate cancer
...,...,...,...,...,...,...
6697,3-(4-methylbenzylidene)camphor,Launched,endocrine disruptor,,dermatology,sunscreen lotion
6725,4-aminohippuric-acid,Launched,,SLC22A6,nephrology,renal diagnostic agent
6755,5-aminolevulinic-acid,Launched,oxidizing agent,ALAD,oncology|dermatology,glioma|actinic keratosis (AK)
6760,5-fluorouracil,Launched,thymidylate synthase inhibitor,DPYD|TYMS,oncology,colorectal cancer|breast cancer|pancreatic can...
