# Rare disease analyses

In [23]:
!pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


In [6]:
import numpy as np
import pandas as pd
import os 
import sys
sys.path.append('..')

In [9]:
from SIMP_LLM.DRKG_loading   import  read_tsv, print_head

In [8]:
DATA_DIR           = os.path.join("../data")
verbose            =  True 

## Rare diseases

Prevalence of rare diseases from Orphanet 2022: https://www.orpha.net/orphacom/cahiers/docs/GB/Prevalence_of_rare_diseases_by_alphabetical_list.pdf

Downloaded datasets:

*  SNOMED CT-Orphanet nomenclature map from https://www.orphadata.com/alignments/ (direct link: http://www.orphadata.com/data/nomenclature/ORPHA-SNOMEDCT_Mapping_File_production.xlsx), accessed 5/22/23

* Rare diseases and alignment with ICD-10, ICD-11, OMIM, UMLS, MeSH, MedDRA and GARD from https://www.orphadata.com/alignments/ (direct link: https://www.orphadata.com/data/xml/en_product1.xml, renamed en_product1-Orphadata.xml), accessed 5/22/23


In [25]:
relation_file = 'ORPHA-SNOMEDCT_Mapping_File_production.xlsx'
df = pd.read_excel(os.path.join(DATA_DIR,relation_file), skiprows=2)
df

Unnamed: 0,ORPHAcode,ORPHA Main term,SNOMED concept ID
0,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,726021008
1,6,3-methylcrotonyl-CoA carboxylase deficiency,13144005
2,7,3C syndrome,718556007
3,8,"47,XYY syndrome",50749006
4,9,Tetrasomy X,10567003
...,...,...,...
6431,589905,PHIP-related behavioral problems-intellectual ...,1208987006
6432,590539,Isolated melanotic schwannoma,404024000
6433,592570,TRAF7-associated heart defect-digital anomalie...,1208998007
6434,595356,Localized dystrophic epidermolysis bullosa,254186008


In [None]:
import xml.etree.ElementTree as ET

relation_file = 'en_product1-Orphadata.xml'

tree = ET.parse(os.path.join(DATA_DIR,relation_file))
root = tree.getroot()[1]

In [248]:

data = []
cols = []

ignoreElems = ['DisorderFlagList', 'DisorderType', 'DisorderGroup']
passElems = ['Disorder', 'Expertlink', 'Synonymlist', 'Externalreferencelist', 'Externalreference']
endElems = ['Externalreferencelist']

def printRecur(root):
    """Recursively adds elements to list."""
    for i, child in enumerate(root):
        if child.tag in ignoreElems:            # Fully ignore some elements and their children
            continue
        if child.tag.title() not in passElems:  # Look at child elements and add to list unless specified
            cols.append(child.tag.title())
            data.append(child.attrib.get('name', child.text))
        printRecur(child)                       # Look at children of child element
    if root.tag.title() in endElems:            # Mark end of specified sections for later use
            cols.append('END_' + root.tag.title())
            data.append('\n')

printRecur(root)

long_df = pd.DataFrame([])
long_df['cols'] = cols
long_df['data'] = data
long_df

Unnamed: 0,cols,data
0,Orphacode,166024
1,Name,"Multiple epiphyseal dysplasia, Al-Gazali type"
2,Synonym,Multiple epiphyseal dysplasia-macrocephaly-dis...
3,Source,ICD-10
4,Reference,Q77.3
...,...,...
360798,Disordermappingvalidationstatus,\n
360799,Name,Validated
360800,END_Externalreferencelist,\n
360801,Disorderdisorderassociationlist,\n


In [249]:
# Clean long form orphan disease data
test = long_df.copy()
test = test.dropna()

# Flag code source
test['code_source'] = np.where(test['cols'] == 'Source', test['data'], None)
test['code_source'] = np.where(test['cols'] == 'END_Externalreferencelist', 'SKIP', test['code_source'])
test['code_source'] = test['code_source'].ffill()

# Rename 'Name' rows with true name 1 row up
test['cols'] = np.where((test['cols'] == 'Name') & (test['data'].shift(1).str.startswith('\n')), test['cols'].shift(1), test['cols'])

# Remove \n rows
test = test[~test['data'].str.startswith('\n')]

# Rename cols associated with specific source and remove source columns
test['cols'] = np.where(test['code_source'].isin([None, 'SKIP']), test['cols'], test['code_source'] + '_' + test['cols'])
test = test[~test['cols'].str.contains('_Source')]

test[:20]
# # test[test.isnull().any(axis=1)].drop_duplicates()
# # test[480:500]
# test[test['cols']=='Externalreferencelist']
# test[20:50]

Unnamed: 0,cols,data,code_source
0,Orphacode,166024,
1,Name,"Multiple epiphyseal dysplasia, Al-Gazali type",
2,Synonym,Multiple epiphyseal dysplasia-macrocephaly-dis...,
4,ICD-10_Reference,Q77.3,ICD-10
6,ICD-10_Disordermappingrelation,NTBT (ORPHA code's Narrower Term maps to a Bro...,ICD-10
8,ICD-10_Disordermappingicdrelation,Attributed (The ICD code is attributed by Orph...,ICD-10
10,ICD-10_Disordermappingvalidationstatus,Validated,ICD-10
12,OMIM_Reference,607131,OMIM
14,OMIM_Disordermappingrelation,E (Exact mapping: the two concepts are equival...,OMIM
17,OMIM_Disordermappingvalidationstatus,Validated,OMIM


In [250]:
test.groupby('cols').agg(
    ct=('data', 'count')
)

Unnamed: 0_level_0,ct
cols,Unnamed: 1_level_1
Contents,6686
Disorderdisorderassociationtype,2157
GARD_Disordermappingrelation,3871
GARD_Disordermappingvalidationstatus,3871
GARD_Reference,3871
ICD-10_Disordermappingicdrelation,8500
ICD-10_Disordermappingrelation,8500
ICD-10_Disordermappingvalidationstatus,8500
ICD-10_Reference,8500
ICD-11_Disordermappingicdrelation,1992


In [68]:
root[0][0].text

'166024'

## Repurposed drugs

Data from Clue: The Drug Repurposing Hub (https://clue.io/repurposing#download-data)

Latest version: 3/24/2020 <br>
Access date: 5/22/2023

In [5]:
!wget https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20200324.txt
# Since this didn't work, downloaded manually to Data folder

zsh:1: command not found: wget


In [19]:
def read_and_process_rep_drugs(relation_file, verbose=False):
  """  
  Process OMIM lookup table in the following ways:
  - Remove extra variables
  - Clean disease name
  - Add "Disease::OMIM:" in front of OMIM ID to match DRKG format
  """
  df = pd.read_csv(os.path.join(DATA_DIR,relation_file), sep="\t", comment='!')

  if verbose:
    print(f"\n {relation_file}  Dataframe (Before processing):\n")
    print_head(df)

    print(f"\n {relation_file}  Dataframe (After processing):\n")
    print_head(df)
  return df




In [21]:
relation_file = 'repurposing_drugs_20200324.txt'
rep_drugs_df       =  read_and_process_rep_drugs(relation_file, verbose=verbose)    # Process entity names for clarity (e.g., F8 -> Gene F8) 
rep_drugs_df


 repurposing_drugs_20200324.txt  Dataframe (Before processing):

+----+------------------------------+------------------+---------------------------------+----------------------------------------------------------------------------------------------+----------------------+---------------------+
|    | pert_iname                   | clinical_phase   | moa                             | target                                                                                       | disease_area         | indication          |
|----+------------------------------+------------------+---------------------------------+----------------------------------------------------------------------------------------------+----------------------+---------------------|
|  0 | (R)-(-)-apomorphine          | Launched         | dopamine receptor agonist       | ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|DRD5|HTR1A|HTR1B|HTR1D|HTR2A|HTR2B|HTR2C|HTR5A | neurology/psychiatry | Parkinson's Disease |
|  1 | (R)

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,
2,(R)-baclofen,Phase 3,benzodiazepine receptor agonist,GABBR1|GABBR2,,
3,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,
4,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
...,...,...,...,...,...,...
6793,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,
6794,80841-78-7,Preclinical,,,,
6795,9-aminoacridine,Preclinical,,,,
6796,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,
