# Rare disease analyses

In [1]:
!pip install openpyxl



In [2]:
import numpy as np
import pandas as pd
import os 
import sys
sys.path.append('..')

In [3]:
from SIMP_LLM.DRKG_loading   import  read_tsv, print_head

In [4]:
DATA_DIR           = os.path.join("../data")
verbose            =  True 

## Rare diseases

Prevalence of rare diseases from Orphanet 2022: https://www.orpha.net/orphacom/cahiers/docs/GB/Prevalence_of_rare_diseases_by_alphabetical_list.pdf

Downloaded datasets:

*  SNOMED CT-Orphanet nomenclature map from https://www.orphadata.com/alignments/ (direct link: http://www.orphadata.com/data/nomenclature/ORPHA-SNOMEDCT_Mapping_File_production.xlsx), accessed 5/22/23

* Rare diseases and alignment with ICD-10, ICD-11, OMIM, UMLS, MeSH, MedDRA and GARD from https://www.orphadata.com/alignments/ (direct link: https://www.orphadata.com/data/xml/en_product1.xml, renamed en_product1-Orphadata.xml), accessed 5/22/23


In [5]:
relation_file = 'ORPHA-SNOMEDCT_Mapping_File_production.xlsx'
df = pd.read_excel(os.path.join(DATA_DIR,relation_file), skiprows=2)
df

Unnamed: 0,ORPHAcode,ORPHA Main term,SNOMED concept ID
0,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,726021008
1,6,3-methylcrotonyl-CoA carboxylase deficiency,13144005
2,7,3C syndrome,718556007
3,8,"47,XYY syndrome",50749006
4,9,Tetrasomy X,10567003
...,...,...,...
6431,589905,PHIP-related behavioral problems-intellectual ...,1208987006
6432,590539,Isolated melanotic schwannoma,404024000
6433,592570,TRAF7-associated heart defect-digital anomalie...,1208998007
6434,595356,Localized dystrophic epidermolysis bullosa,254186008


In [6]:
import xml.etree.ElementTree as ET

relation_file = 'en_product1-Orphadata.xml'

tree = ET.parse(os.path.join(DATA_DIR,relation_file))
root = tree.getroot()[1]

In [31]:

data = []
cols = []

ignoreElems = ['DisorderFlagList', 'DisorderType', 'DisorderGroup','DisorderDisorderAssociationList']
passElems = ['Disorder', 'Expertlink', 'Synonymlist', 'Externalreferencelist', 'Externalreference']
attribElems = []
appendElems = ['Synonym']
endElems = ['Externalreferencelist']

def printRecur(root):
    """Recursively adds elements to list."""
    for i, child in enumerate(root):
        if child.tag in ignoreElems:            # Fully ignore some elements and their children
            continue
        if child.tag.title() not in passElems:  # Look at child elements and add to list unless specified (doesn't work yet)
            if child.tag.title() in appendElems and i>0:
                data[-1] = data[-1] + '|' + child.attrib.get('name', child.text)
                # print(data[-1]) # Just to see how synonym string gets built
            else:
                cols.append(child.tag.title())
                if child.tag in attribElems:
                    data.append(list(child.attrib.values())[0])
                else:
                    data.append(child.attrib.get('name', child.text))
        printRecur(child)                       # Look at children of child element
    if root.tag.title() in endElems:            # Mark end of specified sections for later use
            cols.append('END_' + root.tag.title())
            data.append('\n')

printRecur(root)

long_df = pd.DataFrame([])
long_df['cols'] = cols
long_df['data'] = data
long_df

Unnamed: 0,cols,data
0,Orphacode,166024
1,Name,"Multiple epiphyseal dysplasia, Al-Gazali type"
2,Synonym,Multiple epiphyseal dysplasia-macrocephaly-dis...
3,Source,ICD-10
4,Reference,Q77.3
...,...,...
327818,Disordermappingicdrelation,
327819,Disordermappingvalidationstatus,\n
327820,Name,Validated
327821,END_Externalreferencelist,\n


In [35]:
# Clean long form orphan disease data
test = long_df.copy()
test = test.dropna()
disease_id = 'Orphacode'

# Flag disease ID
test['disease_id'] = np.where(test['cols'] == disease_id, test['data'], None)
test['disease_id'] = test['disease_id'].ffill()

# Flag code source
test['code_source'] = np.where(test['cols'] == 'Source', test['data'], None)
test['code_source'] = np.where(test['cols'] == 'END_Externalreferencelist', 'SKIP', test['code_source'])
test['code_source'] = test['code_source'].ffill()

# Rename 'Name' rows with true name 1 row up
test['cols'] = np.where((test['cols'] == 'Name') & (test['data'].shift(1).str.startswith('\n')), test['cols'].shift(1), test['cols'])

# Remove \n rows
test = test[~test['data'].str.startswith('\n')]

# Rename cols associated with specific source and remove source columns
# test['cols'] = np.where(test['code_source'].isin([None, 'SKIP']), test['cols'], test['code_source'] + '_' + test['cols'])
test = test[~test['cols'].str.contains('Source')]

# Manually consolidate 'definition' entries
if test[test['cols']=='Textsectiontype'].drop_duplicates().shape[0] == 1:
    test['cols'] = np.where(test['cols'] == 'Contents', 'Definition', test['cols'])
    test = test[test['cols'] != 'Textsectiontype']

test[:20]
# # test[test.isnull().any(axis=1)].drop_duplicates()
# # test[480:500]
# test[test['cols']=='Externalreferencelist']
# test[20:50]

Unnamed: 0,cols,data,disease_id,code_source
0,Orphacode,166024,166024,
1,Name,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,
2,Synonym,Multiple epiphyseal dysplasia-macrocephaly-dis...,166024,
4,Reference,Q77.3,166024,ICD-10
6,Disordermappingrelation,NTBT (ORPHA code's Narrower Term maps to a Bro...,166024,ICD-10
8,Disordermappingicdrelation,Attributed (The ICD code is attributed by Orph...,166024,ICD-10
10,Disordermappingvalidationstatus,Validated,166024,ICD-10
12,Reference,607131,166024,OMIM
14,Disordermappingrelation,E (Exact mapping: the two concepts are equival...,166024,OMIM
17,Disordermappingvalidationstatus,Validated,166024,OMIM


In [39]:
# Check for duplicates
check_dupe = test.groupby(by=['disease_id', 'code_source', 'cols']).agg(
    ct = ('data', 'count')
).reset_index()

check_dupe = check_dupe[(check_dupe['ct']>1)]

# check_dupe['cols'].drop_duplicates()
check_dupe[check_dupe['code_source'].isin(['SKIP', None])]

Unnamed: 0,disease_id,code_source,cols,ct


In [25]:
test[test['disease_id']=='10']

Unnamed: 0,cols,data,disease_id,code_source
647,Disorder,10,10,SKIP
648,Orphacode,206,10,SKIP
649,Name,NON RARE IN EUROPE: Crohn disease,10,SKIP
651,ICD-10_Reference,K50.1,10,ICD-10
653,ICD-10_Disordermappingrelation,BTNT (ORPHA code's Broader Term maps to a Narr...,10,ICD-10
655,ICD-10_Disordermappingicdrelation,Specific code (The ORPHA code has its own code...,10,ICD-10
657,ICD-10_Disordermappingvalidationstatus,Validated,10,ICD-10
659,ICD-10_Reference,K50.0,10,ICD-10
661,ICD-10_Disordermappingrelation,BTNT (ORPHA code's Broader Term maps to a Narr...,10,ICD-10
663,ICD-10_Disordermappingicdrelation,Specific code (The ORPHA code has its own code...,10,ICD-10


In [21]:
test2 = pd.pivot(test,  index='disease_id', columns='cols', values='data')

ValueError: Index contains duplicate entries, cannot reshape

In [310]:
test2

cols,Definition,Disorder,Disorderdisorderassociationtype,GARD_Disordermappingrelation,GARD_Disordermappingvalidationstatus,GARD_Reference,ICD-10_Disordermappingicdrelation,ICD-10_Disordermappingrelation,ICD-10_Disordermappingvalidationstatus,ICD-10_Reference,...,MedDRA_Reference,Name,OMIM_Disordermappingrelation,OMIM_Disordermappingvalidationstatus,OMIM_Reference,Orphacode,Synonym,UMLS_Disordermappingrelation,UMLS_Disordermappingvalidationstatus,UMLS_Reference
0,,17601,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,166024,,,,
2,,,,,,,,,,,...,,"Multiple epiphyseal dysplasia, Al-Gazali type",,,,,,,,
3,,,,,,,,,,,...,,,,,,,Multiple epiphyseal dysplasia-macrocephaly-dis...,,,
5,,,,,,,,,,Q77.3,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364320,,,,,,,,,,,...,,Hereditary persistence of fetal hemoglobin-int...,,,,,,,,
364321,,,,,,,,,,,...,,,,,,,Dias-Logan syndrome,,,
364323,,,,,,,,,,,...,,,,,617101,,,,,
364325,,,,,,,,,,,...,,,E (Exact mapping: the two concepts are equival...,,,,,,,


In [301]:
# check for repeated?
check = 'Synonym'


# Rename 'Name' rows with true name 1 row up
test[(test['cols']==check) & (test['cols'].shift(1) == check)]

long_df.iloc[357:361]

Unnamed: 0,cols,data
357,Name,Multiple sulfatase deficiency
358,Synonym,"Juvenile sulfatidosis, Austin type|MSD|Mucosul..."
359,Source,ICD-10
360,Reference,E75.2


In [298]:
interestElems = ['Synonymlist']
appendElems = ['Synonym']

data=[]
cols=[]
def testRecur(root):
    """Recursively adds elements to list."""
    for i, child in enumerate(root):
        if child.tag.title() in appendElems:  # Look at child elements and add to list unless specified (doesn't work yet)
            # if root.tag.title() in interestElems:
            print(i)
            print(child.attrib.get('name', child.text))
            # if root.tag.title() in interestElems and i>0:
            #     data[-1] = data[-1] + '|' + child.attrib.get('name', child.text)
            #     print('test')
            # else:
            #     cols.append(child.tag.title())
            #     if child.tag in attribElems:
            #         data.append(list(child.attrib.values())[0])
            #     else:
            #         data.append(child.attrib.get('name', child.text))
        testRecur(child)                       # Look at children of child element

testRecur(root[:15])


0
Multiple epiphyseal dysplasia-macrocephaly-distinctive facies syndrome
0
AxD
0
Lysosomal alpha-D-mannosidase deficiency
0
Aspartylglucosaminidase deficiency
0
Juvenile sulfatidosis, Austin type
1
MSD
2
Mucosulfatidosis
0
Beta-mannosidase deficiency
0
Fetal-onset olivopontocerebellar hypoplasia
1
PCH5
0
ACY2 deficiency
1
Aminoacylase 2 deficiency
2
Aspartoacylase deficiency
3
Spongy degeneration of the brain
0
Fatal infantile encephalopathy with olivopontocerebellar hypoplasia
1
Olivopontocerebellar hypoplasia
2
PCH4


In [276]:
# check for repeated?
check = 'Orphacode'

# Rename 'Name' rows with true name 1 row up
test[(test['cols']==check) & (test['cols'].shift(1) == check)]


Unnamed: 0,cols,data,code_source


In [308]:
test[test['cols']=='Info']

Unnamed: 0,cols,data,code_source
506,Info,This entity has been excluded from the Orphane...,SKIP
2610,Info,This term does not characterize a disease but ...,SKIP
2677,Info,This disease is not rare in Europe. It does no...,SKIP
3023,Info,This term does not characterize a disease but ...,SKIP
3281,Info,This disease is not rare in Europe. It does no...,SKIP
...,...,...,...
363030,Info,This entity has been obsoleted from the Orphan...,SKIP
363048,Info,This entity has been obsoleted from the Orphan...,SKIP
363198,Info,This entity has been obsoleted from the Orphan...,SKIP
363216,Info,This entity has been obsoleted from the Orphan...,SKIP


In [307]:
test.groupby('cols').agg(
    ct=('data', 'count')
)



Unnamed: 0_level_0,ct
cols,Unnamed: 1_level_1
Definition,6686
Disorder,10705
Disorderdisorderassociationtype,2157
GARD_Disordermappingrelation,3871
GARD_Disordermappingvalidationstatus,3871
GARD_Reference,3871
ICD-10_Disordermappingicdrelation,8500
ICD-10_Disordermappingrelation,8500
ICD-10_Disordermappingvalidationstatus,8500
ICD-10_Reference,8500


In [68]:
root[0][0].text

'166024'

## Repurposed drugs

Data from Clue: The Drug Repurposing Hub (https://clue.io/repurposing#download-data)

Latest version: 3/24/2020 <br>
Access date: 5/22/2023

In [5]:
!wget https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20200324.txt
# Since this didn't work, downloaded manually to Data folder

zsh:1: command not found: wget


In [19]:
def read_and_process_rep_drugs(relation_file, verbose=False):
  """  
  Process OMIM lookup table in the following ways:
  - Remove extra variables
  - Clean disease name
  - Add "Disease::OMIM:" in front of OMIM ID to match DRKG format
  """
  df = pd.read_csv(os.path.join(DATA_DIR,relation_file), sep="\t", comment='!')

  if verbose:
    print(f"\n {relation_file}  Dataframe (Before processing):\n")
    print_head(df)

    print(f"\n {relation_file}  Dataframe (After processing):\n")
    print_head(df)
  return df




In [21]:
relation_file = 'repurposing_drugs_20200324.txt'
rep_drugs_df       =  read_and_process_rep_drugs(relation_file, verbose=verbose)    # Process entity names for clarity (e.g., F8 -> Gene F8) 
rep_drugs_df


 repurposing_drugs_20200324.txt  Dataframe (Before processing):

+----+------------------------------+------------------+---------------------------------+----------------------------------------------------------------------------------------------+----------------------+---------------------+
|    | pert_iname                   | clinical_phase   | moa                             | target                                                                                       | disease_area         | indication          |
|----+------------------------------+------------------+---------------------------------+----------------------------------------------------------------------------------------------+----------------------+---------------------|
|  0 | (R)-(-)-apomorphine          | Launched         | dopamine receptor agonist       | ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|DRD5|HTR1A|HTR1B|HTR1D|HTR2A|HTR2B|HTR2C|HTR5A | neurology/psychiatry | Parkinson's Disease |
|  1 | (R)

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,
2,(R)-baclofen,Phase 3,benzodiazepine receptor agonist,GABBR1|GABBR2,,
3,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,
4,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
...,...,...,...,...,...,...
6793,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,
6794,80841-78-7,Preclinical,,,,
6795,9-aminoacridine,Preclinical,,,,
6796,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,
