# BioMedGraphica Disease

## 1. Data Access

### Direct Download Links  
**ICD-10**: Can be downloaded directly via the link without the need for registration. [Link](https://icdcdn.who.int/static/releasefiles/2024-01/mapping.zip)  
**ICD-11**: Can be downloaded directly via the link without the need for registration. [Link](https://icdcdn.who.int/static/releasefiles/2024-01/SimpleTabulation-ICD-11-MMS-en.zip)  
**Disease Ontology**: Can be downloaded directly via the link without the need for registration. [Link](https://github.com/DiseaseOntology/HumanDiseaseOntology/blob/main/DOreports/allXREFinDO.tsv)  
**MeSH**: Can be downloaded directly via the link without the need for registration. [Link](https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.xml)  
**UMLS**: Need Registration. [Link](https://download.nlm.nih.gov/umls/kss/2024AB/umls-2024AB-metathesaurus-full.zip?_gl=1*1etpc4l*_ga*MTA5NTI1Nzc2My4xNzEwOTU5NjM5*_ga_7147EPK006*MTczODAwMzk4Ny4xMTEuMS4xNzM4MDA0MTA4LjAuMC4w*_ga_P1FPTH9PL4*MTczODAwMzk4Ny4xMTAuMS4xNzM4MDA0MTA4LjAuMC4w)  
**Snomed CT**: Need Registration. [Link](https://download.nlm.nih.gov/umls/kss/IHTSDO2025/IHTSDO20250101/SnomedCT_InternationalRF2_PRODUCTION_20250101T120000Z.zip?_gl=1*ejsjb5*_ga*MTA5NTI1Nzc2My4xNzEwOTU5NjM5*_ga_7147EPK006*MTczODAwMzk4Ny4xMTEuMS4xNzM4MDA0MjQ4LjAuMC4w*_ga_P1FPTH9PL4*MTczODAwMzk4Ny4xMTAuMS4xNzM4MDA0MjQ4LjAuMC4w)  
**Mondo**: Can be downloaded directly via the link without the need for registration. [Link1](https://github.com/monarch-initiative/mondo/blob/master/reports/xrefs.tsv); [Link2](https://github.com/monarch-initiative/mondo/releases/latest/download/mondo.obo)  

## 2. Load Data

### 2.1 ICD-11

In [38]:
import pandas as pd

df_icd11 = pd.read_csv('SimpleTabulation-ICD-11-MMS-en.txt', sep='\t')
df_icd11_id = df_icd11[df_icd11['ClassKind'] == 'category']
df_icd11_id = df_icd11_id[['Code', 'Title']]
df_icd11_id = df_icd11_id.rename(columns={'Code': 'ICD11_ID', 'Title': 'ICD11_Title'})
df_icd11_id['ICD11_Title'] = df_icd11_id['ICD11_Title'].str.replace('-', '', regex=False)
df_icd11_id['ICD11_Title'] = df_icd11_id['ICD11_Title'].str.lstrip()
df_icd11_id.drop_duplicates(inplace=True)
df_icd11_id.reset_index(drop=True, inplace=True)
df_icd11_id

  df_icd11 = pd.read_csv('SimpleTabulation-ICD-11-MMS-en.txt', sep='\t')


Unnamed: 0,ICD11_ID,ICD11_Title
0,1A00,Cholera
1,1A01,Intestinal infection due to other Vibrio
2,1A02,Intestinal infections due to Shigella
3,1A03,Intestinal infections due to Escherichia coli
4,1A03.0,Enteropathogenic Escherichia coli infection
...,...,...
34658,XD36Q1,"Infusion Pumps, Syringe"
34659,XD1N14,"Infusion Pumps, Syringe, Nuclear Magnetic Reso..."
34660,XD80Z7,Medical/medicinal gas systems and relative acc...
34661,XD4U38,General purpose electrocardiographs


### 2.2 ICD-10

In [39]:
# Download link: https://icdcdn.who.int/static/releasefiles/2024-01/mapping.zip 
df_icd10 = pd.read_csv('ICD10_ICD11_mapping/10To11MapToOneCategory.txt', sep='\t')
df_icd10 = df_icd10[df_icd10['10DepthInKind'] == 2]

df_icd10_11 = df_icd10[['icd10Code','icd11Code']]
df_icd10_11['icd11Code'] = df_icd10_11['icd11Code'].str.split('&')
df_icd10_11 = df_icd10_11.explode('icd11Code')
df_icd10_11['icd11Code'] = df_icd10_11['icd11Code'].str.strip()
df_icd10_11 = df_icd10_11.rename(columns={'icd10Code': 'ICD10_ID', 'icd11Code': 'ICD11_ID'})
df_icd10_11.replace('nan', pd.NA, inplace=True)
df_icd10_11.dropna(subset=['ICD11_ID'], inplace=True)
df_icd10_11.reset_index(drop=True, inplace=True)

df_icd10_11['ICD11_ID'] = df_icd10_11['ICD11_ID'].astype(str)
df_icd10_11_merge = df_icd10_11.groupby('ICD10_ID').agg({'ICD11_ID': lambda x: ';'.join(x.dropna())}).reset_index()
df_icd10_11_merge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_icd10_11['icd11Code'] = df_icd10_11['icd11Code'].str.split('&')


Unnamed: 0,ICD10_ID,ICD11_ID
0,A00.0,1A00;XN8P1
1,A00.1,1A00;XN62R
2,A00.9,1A00
3,A01.0,1A07.Z
4,A01.1,1A08;XN1K5
...,...,...
10072,Z99.2,QB42
10073,Z99.3,QB44
10074,Z99.4,QB4Z
10075,Z99.8,QB4Z


### 2.3 Disease Ontology

In [40]:
# https://github.com/DiseaseOntology/HumanDiseaseOntology/blob/main/DOreports/allXREFinDO.tsv
df_do_xref = pd.read_csv('allXREFinDO.tsv', sep='\t')

df_do_xref[['database', 'database_id']] = df_do_xref['xref'].str.split(':', expand=True)
df_do_xref_pivot = df_do_xref.groupby(['id', 'label', 'database'])['database_id'].agg(lambda x: ';'.join(x)).reset_index()

# Pivot the table to show each 'database' as a column, with values joined by ';' where applicable
df_do_xref_pivot = df_do_xref_pivot.pivot_table(index=['id', 'label'], columns='database', values='database_id', aggfunc=lambda x: ';'.join(x)).reset_index()
df_do_xref_filter = df_do_xref_pivot[['id', 'label', 'UMLS_CUI', 'MESH', 'ICD10CM', 'MIM']]
df_do_xref_filter

database,id,label,UMLS_CUI,MESH,ICD10CM,MIM
0,DOID:0001816,angiosarcoma,C0018923;C0854893,D006394,,
1,DOID:0002116,pterygium,C0033999,,,
2,DOID:0014667,disease of metabolism,C0025517,D008659,E88.9,
3,DOID:0040002,aspirin allergy,C0004058,,,
4,DOID:0040003,benzylpenicillin allergy,C0571411,,,
...,...,...,...,...,...,...
11057,DOID:9987,orbit sarcoma,C1335131,,,
11058,DOID:9988,tertiary neurosyphilis,C0027927,D009494,A52.3,
11059,DOID:999,hypereosinophilic syndrome,C0014457,D004802,D72.1,
11060,DOID:9993,hypoglycemia,C0020615,D007003,E16.2,


### 2.4 MeSH

In [5]:
# Download link: https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2024.xml 

import xml.etree.ElementTree as ET
import csv

def extract_disease_descriptors(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    disease_descriptors = []
    
    for descriptor_record in root.findall('.//DescriptorRecord'):
        tree_numbers = [tree_number.text for tree_number in descriptor_record.findall('.//TreeNumberList/TreeNumber')]
        
        # Disease's tree number starts with 'C'
        if any(tree_number.startswith('C') for tree_number in tree_numbers):
            descriptor_ui = descriptor_record.find('.//DescriptorUI').text
            descriptor_name = descriptor_record.find('.//DescriptorName/String').text
            disease_descriptors.append((descriptor_ui, descriptor_name))
    
    return disease_descriptors

def save_to_csv(disease_descriptors, output_file):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["DescriptorUI", "DescriptorName"])
        for descriptor in disease_descriptors:
            writer.writerow(descriptor)

input_file_path = 'desc2025.xml'

disease_list = extract_disease_descriptors(input_file_path)
df_MeSH = pd.DataFrame(disease_list, columns=['DescriptorUI', 'DescriptorName'])
df_MeSH

Unnamed: 0,DescriptorUI,DescriptorName
0,D000006,"Abdomen, Acute"
1,D000007,Abdominal Injuries
2,D000008,Abdominal Neoplasms
3,D000012,Abetalipoproteinemia
4,D000013,Congenital Abnormalities
...,...,...
5051,D000099067,Blastic Plasmacytoid Dendritic Cell Neoplasm
5052,D000099070,Foveomacular Retinitis
5053,D000099072,Oral Allergy Syndrome
5054,D000099074,Diabesity


### 2.5 UMLS

Determine the disease id of UMLS

In [6]:
df_umls = pd.read_csv('2024AB/META/MRSTY.RRF', sep='|', header=None)
df_umls.drop(columns=[6], inplace=True)
df_umls.columns = ['UMLS ID', 'Unique identifier of Semantic Type', 'Semantic Type tree number', 'Semantic Type','Unique identifier for attribute','Content View Flag']

df_umls_disease = df_umls[df_umls['Semantic Type'] == 'Disease or Syndrome']
df_umls_disease.drop_duplicates(inplace=True)
df_umls_disease.reset_index(drop=True, inplace=True)
df_umls_disease

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_umls_disease.drop_duplicates(inplace=True)


Unnamed: 0,UMLS ID,Unique identifier of Semantic Type,Semantic Type tree number,Semantic Type,Unique identifier for attribute,Content View Flag
0,C0000744,T047,B2.2.1.2.1,Disease or Syndrome,AT89446043,256.0
1,C0000774,T047,B2.2.1.2.1,Disease or Syndrome,AT08560908,256.0
2,C0000809,T047,B2.2.1.2.1,Disease or Syndrome,AT17681802,256.0
3,C0000814,T047,B2.2.1.2.1,Disease or Syndrome,AT89564783,2304.0
4,C0000823,T047,B2.2.1.2.1,Disease or Syndrome,AT17683829,256.0
...,...,...,...,...,...,...
123411,C5940805,T047,B2.2.1.2.1,Disease or Syndrome,AT301463902,
123412,C5940806,T047,B2.2.1.2.1,Disease or Syndrome,AT301583283,
123413,C5942143,T047,B2.2.1.2.1,Disease or Syndrome,AT301483756,8192.0
123414,C5942144,T047,B2.2.1.2.1,Disease or Syndrome,AT301565372,8192.0


UMLS Xref

In [7]:
df_umls_name = pd.read_csv('2024AB/META/MRCONSO.RRF', sep='|', header=None)
df_umls_name.columns = ['UMLS ID', 'Language', 'Term status', 'Unique identifier for term', 'String type', 
                        'Unique identifier for string', 'Atom status', 'AUI', 'SAUI', 'SCUI', 'SDUI', 'SAB', 'TTY', 'CODE', 'String', 'SRL', 'SUPPRESS', 'CVF','NA']
df_umls_name.drop(columns=['NA'], inplace=True)
df_umls_name

  df_umls_name = pd.read_csv('2024AB/META/MRCONSO.RRF', sep='|', header=None)


Unnamed: 0,UMLS ID,Language,Term status,Unique identifier for term,String type,Unique identifier for string,Atom status,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,String,SRL,SUPPRESS,CVF
0,C0000005,ENG,P,L0000005,PF,S0007492,Y,A26634265,,M0019694,D012711,MSH,PEP,D012711,(131)I-Macroaggregated Albumin,0,N,256.0
1,C0000005,ENG,S,L0270109,PF,S0007491,Y,A26634266,,M0019694,D012711,MSH,ET,D012711,(131)I-MAA,0,N,256.0
2,C0000005,FRE,P,L6220710,PF,S7133957,Y,A13433185,,M0019694,D012711,MSHFRE,PEP,D012711,Macroagrégats d'albumine marquée à l'iode 131,3,N,
3,C0000005,FRE,S,L6215648,PF,S7133916,Y,A27488794,,M0019694,D012711,MSHFRE,ET,D012711,MAA-I 131,3,N,
4,C0000005,FRE,S,L6215656,PF,S7133956,Y,A27614225,,M0019694,D012711,MSHFRE,ET,D012711,Macroagrégats d'albumine humaine marquée à l'i...,3,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16704674,C5942200,ENG,P,L19626559,PF,S23411143,Y,A36854647,,447562003,,SNOMEDCT_US,XM,447562003,SNOMEDCT_US_2024_09_01 to ICD10_2016 Mappings,9,N,
16704675,C5942201,ENG,P,L19626699,PF,S23411405,Y,A36855669,,,,SRC,VPT,V-MED-RT_2024_09_03,"Medication Reference Terminology, 2024_09_03",0,N,
16704676,C5942201,ENG,S,L19626698,PF,S23411404,Y,A36855671,,,,SRC,VAB,V-MED-RT_2024_09_03,MED-RT_2024_09_03,0,N,
16704677,C5942202,ENG,P,L19626707,PF,S23411431,Y,A36855812,,,,SRC,VPT,V-MVX2024_09_03,"Manufacturers of Vaccines, 2024_09_03",0,N,


Load the xref

In [8]:
# https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html 
df_umls_name_filter = df_umls_name[df_umls_name['Language'] == 'ENG'] # English language
df_umls_name_filter = df_umls_name_filter[df_umls_name['Term status'] == 'P'] # Preferred LUI of the CUI
df_umls_name_filter = df_umls_name_filter[df_umls_name['String type'] == 'PF'] # Preferred form of term

umls_name_filter = df_umls_name_filter[['UMLS ID', 'SAB', 'CODE','String']]
umls_name_filter.drop_duplicates(inplace=True)
umls_name_filter.reset_index(drop=True, inplace=True)

values_to_keep = ['MSH', 'SNOMEDCT_US', 'String', 'ICD10CM', 'OMIM']
df_umls_filter = umls_name_filter[umls_name_filter['SAB'].isin(values_to_keep)]
df_umls_filter

  df_umls_name_filter = df_umls_name_filter[df_umls_name['Term status'] == 'P'] # Preferred LUI of the CUI
  df_umls_name_filter = df_umls_name_filter[df_umls_name['String type'] == 'PF'] # Preferred form of term
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  umls_name_filter.drop_duplicates(inplace=True)


Unnamed: 0,UMLS ID,SAB,CODE,String
0,C0000005,MSH,D012711,(131)I-Macroaggregated Albumin
3,C0000052,MSH,D015061,"1,4-alpha-Glucan Branching Enzyme"
5,C0000074,MSH,D010742,1-Alkyl-2-Acylphosphatidates
6,C0000084,MSH,D015055,1-Carboxyglutamic Acid
8,C0000096,MSH,D015056,1-Methyl-3-isobutylxanthine
...,...,...,...,...
4252982,C5942196,SNOMEDCT_US,851321000124104,Provision of payment for utility arrears
4252983,C5942197,SNOMEDCT_US,851331000124101,Provision of payment for housing mold remediat...
4252984,C5942198,SNOMEDCT_US,861171000124107,Inadequate digital access due to inadequate in...
4252985,C5942199,SNOMEDCT_US,6011000124106,SNOMEDCT_US_2024_09_01 to ICD10CM_2024 Mappings


Filtering for disease ids in Xref

In [41]:
df_umls_disease = df_umls_filter[df_umls_filter['UMLS ID'].isin(df_umls_disease['UMLS ID'])]
df_umls_disease['String'] = df_umls_disease.groupby('UMLS ID')['String'].transform(lambda x: ';'.join(set(x)))
pivot_umls = df_umls_disease.pivot_table(index='UMLS ID', columns='SAB', values='CODE', aggfunc=list)
pivot_umls = pivot_umls.applymap(lambda x: ';'.join(x) if isinstance(x, list) else x)
string_df = df_umls_disease[['UMLS ID', 'String']].drop_duplicates().set_index('UMLS ID')
pivot_umls = pivot_umls.join(string_df)
pivot_umls.reset_index(inplace=True)
pivot_umls

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_umls_disease['String'] = df_umls_disease.groupby('UMLS ID')['String'].transform(lambda x: ';'.join(set(x)))
  pivot_umls = pivot_umls.applymap(lambda x: ';'.join(x) if isinstance(x, list) else x)


Unnamed: 0,UMLS ID,ICD10CM,MSH,OMIM,SNOMEDCT_US,String
0,C0000744,E78.6,D000012,MTHU014871,83123000;190787008,Abetalipoproteinemia
1,C0000774,,,,47344007,Abnormality of secretion of gastrin
2,C0000809,,D000026,,,"Abortion, Habitual"
3,C0000814,O02.1,,,156087000;198616002;16607004;267187007,Missed abortion
4,C0000823,,D000034,,,"Abortion, Veterinary"
...,...,...,...,...,...,...
69356,C5940805,,D011559,,,Fulminant Intracranial Hypertension
69357,C5940806,,D011559,,,Secondary Intracranial Hypertension
69358,C5942143,,,,16262221000119100,Mild obstructive sleep apnea of adult
69359,C5942144,,,,16262271000119104,Moderate obstructive sleep apnea of adult


### 2.6 Mondo

mondo xref

In [42]:
df_mondo = pd.read_csv('xrefs.tsv', sep='\t')
df_mondo['database'] = df_mondo['xref'].apply(lambda x: x.split(':')[0])
df_mondo['reference'] = df_mondo['xref'].apply(lambda x: x.split(':')[1])

pivot_df_mondo = df_mondo.groupby(['cls', 'database'])['reference'].apply(lambda x: ';'.join(x)).unstack()
pivot_df_mondo.reset_index(inplace=True)

df_mondo_xref = pivot_df_mondo[['cls','UMLS','MESH', 'OMIM']]
df_mondo_xref = df_mondo_xref.rename(columns={'cls': 'MONDO_ID', 'UMLS': 'UMLS_CUI', 'MESH': 'MESH_ID', 'OMIM': 'OMIM_ID'})
df_mondo_xref

database,MONDO_ID,UMLS_CUI,MESH_ID,OMIM_ID
0,MONDO:0000001,C0012634,D004194,
1,MONDO:0000004,C0405580,D000309,
2,MONDO:0000005,,,
3,MONDO:0000009,C0005818,,
4,MONDO:0000015,C1285186,,
...,...,...,...,...
26891,MONDO:8000030,,,
26892,MONDO:8000031,,,
26893,MONDO:8000032,,,
26894,MONDO:8000033,,,


mondo info

In [43]:
def parse_obo(file_path):
    terms = []
    with open(file_path, 'r') as file:
        term = {}
        for line in file:
            line = line.strip()
            if line == "[Term]":
                if term:
                    terms.append(term)
                    term = {}
            elif line.startswith("id: "):
                term['id'] = line[4:]
            elif line.startswith("name: "):
                term['name'] = line[6:]
        if term:  # Add the last term
            terms.append(term)
    return terms

# Parse the OBO file
file_path = 'mondo.obo'
mondo_terms = parse_obo(file_path)

# Filter terms to keep only those with "MONDO" in the id
filtered_terms = [term for term in mondo_terms if "MONDO" in term['id']]

# Create a dataframe from the filtered terms
df_mondo_obo = pd.DataFrame(filtered_terms)
df_mondo_obo = df_mondo_obo.rename(columns={'id': 'MONDO_ID', 'name': 'MONDO_Name'})

df_mondo_merge = df_mondo_obo.merge(df_mondo_xref, on='MONDO_ID', how='outer')

In [45]:
df_mondo_merge_umls = df_mondo_merge[df_mondo_merge['UMLS_CUI'].notnull()]
df_mondo_merge_umls.drop(columns=['MESH_ID'], inplace=True)

df_mondo_merge_rest = df_mondo_merge[df_mondo_merge['UMLS_CUI'].isnull()]
df_mondo_merge_rest.drop(columns=['UMLS_CUI'], inplace=True)
df_mondo_merge_rest = df_mondo_merge_rest.groupby('MESH_ID').agg({
    'MONDO_ID': lambda x: ';'.join(x.dropna().astype(str)), 
    'MONDO_Name': lambda x: ' | '.join(x.dropna().astype(str)), 
    'OMIM_ID': lambda x: ';'.join(x.dropna().astype(str))
}).reset_index()

df_mondo_merge_rest.replace('', pd.NA, inplace=True)
df_mondo_merge_rest

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mondo_merge_umls.drop(columns=['MESH_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mondo_merge_rest.drop(columns=['UMLS_CUI'], inplace=True)


Unnamed: 0,MESH_ID,MONDO_ID,MONDO_Name,OMIM_ID
0,C049262,MONDO:0007578,obsolete esterase B,133260
1,C531667,MONDO:0016312,5-fluorouracil poisoning,
2,C535346,MONDO:0022109,obsolete catatrichy,116850
3,C535530,MONDO:0020530,Mendelian susceptibility to mycobacterial dise...,
4,C535565,MONDO:0007238,amastia,
...,...,...,...,...
366,D058249,MONDO:0800445,Birt-Hogg-Dube syndrome 1,135150
367,D058968,MONDO:0025510,"pythiosis, non-human animal",
368,D060048,MONDO:0018382,epiphysiolysis of the hip,182260
369,D062625,MONDO:0003464,cystadenofibroma,


### 2.7 SNOMED CT

In [46]:
# download link: https://download.nlm.nih.gov/mlb/utsauth/USExt/SnomedCT_ManagedServiceUS_PRODUCTION_US1000124_20250301T120000Z.zip?_gl=1*18c046q*_ga*MTA5NTI1Nzc2My4xNzEwOTU5NjM5*_ga_7147EPK006*MTc0MzAzMzM5Mi4xMjcuMS4xNzQzMDMzNTM5LjAuMC4w*_ga_P1FPTH9PL4*MTc0MzAzMzM5Mi4xMjYuMS4xNzQzMDMzNTM5LjAuMC4w
df_snomed_name = pd.read_csv('SnomedCT_ManagedServiceUS_PRODUCTION_US1000124_20250301T120000Z\Snapshot\Terminology\sct2_Description_Snapshot-en_US1000124_20250301.txt', delimiter='\t')
df_snomed_name = df_snomed_name[df_snomed_name['active'] == 1]
df_snomed_name = df_snomed_name[['conceptId', 'term']]
df_snomed_name.drop_duplicates(inplace=True)
df_snomed_name.reset_index(drop=True, inplace=True)
df_snomed_name.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1374405 entries, 0 to 1374404
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   conceptId  1374405 non-null  int64 
 1   term       1374403 non-null  object
dtypes: int64(1), object(1)
memory usage: 21.0+ MB


In [47]:
df_snomed_name['conceptId'] = df_snomed_name['conceptId'].astype(str)
df_snomed_name = df_snomed_name.dropna(subset=['term'])
df_snomed_name = df_snomed_name.groupby('conceptId')['term'].apply(lambda x: ' | '.join(x)).reset_index()
df_snomed_name

Unnamed: 0,conceptId,term
0,100000000,BITTER-3 | BITTER-3 (product)
1,10000006,Radiating chest pain | Radiating chest pain (f...
2,1000001000004108,DNA mismatch repair protein PMS2 | Mismatch re...
3,1000004,Sprain | Joint injury | Sprain (morphologic ab...
4,100001001,BIZOLIN 200 | BIZOLIN 200 (product)
...,...,...
527306,99995009,BIOVAX | BIOVAX (product)
527307,99996005,BIOVAX +YERSIVAX | BIOVAX +YERSIVAX (product)
527308,99997001,BISMO-KOTE | BISMO-KOTE (product)
527309,99998006,BISMU-KOTE | BISMU-KOTE (product)


## 3. Merge Data

In [48]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

def merge_string_columns(df, columns, merge_name, separator, split_values=True):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                if split_values:
                    combined.update(row[column].split(separator))
                else:
                    combined.add(row[column])
        return separator.join(combined)

    df[merge_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

### 3.1 UMLS and MeSH

In [49]:
df_umls_mesh = pd.merge(pivot_umls, df_MeSH, left_on='MSH', right_on='DescriptorUI', how='outer')

df_umls_mesh_v1 = df_umls_mesh.copy()
df_umls_mesh_v1 = merge_column(df_umls_mesh_v1, 'DescriptorUI', 'MSH', 'MeSH')
df_umls_mesh_v1.rename(columns={'DescriptorName': 'MeSH_Name', 'String': 'UMLS_Name'}, inplace=True)
df_umls_mesh_v1.drop_duplicates(inplace=True)
df_umls_mesh_v1.reset_index(drop=True, inplace=True)
df_umls_mesh_v1

Unnamed: 0,UMLS ID,ICD10CM,OMIM,SNOMEDCT_US,UMLS_Name,MeSH_Name,MeSH
0,C3495589,,,707608003,Jalili syndrome,,C000596385
1,C1867235,,,,"Retinoschisis, Autosomal Dominant",,C000598640
2,C3281200,,,,Leukoencephalopathy Brain Calcifications and C...,,C000598644
3,C1866785,,,765092004,Spheroid body myopathy,,C000598645
4,C3888318,,,,"Myopathy, familial idiopathic inflammatory",,C000598744
...,...,...,...,...,...,...,...
71453,C5935680,,620493,,MACULAR DYSTROPHY WITHOUT CONE DYSFUNCTION,,
71454,C5935682,,271110,,SPINAL MUSCULAR ATROPHY WITH MICROCEPHALY AND ...,,
71455,C5942143,,,16262221000119100,Mild obstructive sleep apnea of adult,,
71456,C5942144,,,16262271000119104,Moderate obstructive sleep apnea of adult,,


### 3.2 Add ICD10

In [50]:
df_umls_mesh_v3_exploded = df_umls_mesh_v1.assign(ICD10CM=df_umls_mesh_v1['ICD10CM'].str.split(';')).explode('ICD10CM')

df_umls_mesh_icd10 = pd.merge(df_umls_mesh_v3_exploded, df_icd10_11_merge, left_on='ICD10CM', right_on='ICD10_ID', how='outer')
df_umls_mesh_icd10_v1 = merge_column(df_umls_mesh_icd10, 'ICD10_ID', 'ICD10CM', 'ICD10')
df_umls_mesh_icd10_v1

Unnamed: 0,UMLS ID,OMIM,SNOMEDCT_US,UMLS_Name,MeSH_Name,MeSH,ICD11_ID,ICD10
0,C0008354,,186087007;154269008;63650001,Cholera,Cholera,D002771,,A00
1,C0694449,,,Certain infectious and parasitic diseases (A00...,,,,A00-B99
2,C2880083,,,diseases generally recognized as communicable ...,,,,A00-B99
3,C0494021,,,"Cholera due to Vibrio cholerae 01, biovar chol...",,,1A00;XN8P1,A00.0
4,,,,,,,1A00;XN62R,A00.1
...,...,...,...,...,...,...,...,...
79577,C5935680,620493,,MACULAR DYSTROPHY WITHOUT CONE DYSFUNCTION,,,,
79578,C5935682,271110,,SPINAL MUSCULAR ATROPHY WITH MICROCEPHALY AND ...,,,,
79579,C5942143,,16262221000119100,Mild obstructive sleep apnea of adult,,,,
79580,C5942144,,16262271000119104,Moderate obstructive sleep apnea of adult,,,,


### 3.3 Add ICD11

In [51]:
df_umls_mesh_icd10_v1['ICD11_ID'] = df_umls_mesh_icd10_v1['ICD11_ID'].str.split(';')
df_umls_mesh_icd10_v1 = df_umls_mesh_icd10_v1.explode('ICD11_ID')
df_umls_mesh_icd10_v1_exploded = df_umls_mesh_icd10_v1.assign(ICD11_ID=df_umls_mesh_icd10_v1['ICD11_ID'].str.split(';')).explode('ICD11_ID')
df_umls_mesh_icd10_icd11 = pd.merge(df_umls_mesh_icd10_v1_exploded, df_icd11_id, left_on='ICD11_ID', right_on='ICD11_ID', how='outer')
df_umls_mesh_icd10_icd11

Unnamed: 0,UMLS ID,OMIM,SNOMEDCT_US,UMLS_Name,MeSH_Name,MeSH,ICD11_ID,ICD10,ICD11_Title
0,,,,,,,02,D37.0,
1,,,,,,,02,D37.1,
2,,,,,,,02,D37.2,
3,,,,,,,02,D37.3,
4,,,,,,,02,D37.4,
...,...,...,...,...,...,...,...,...,...
109623,C5935680,620493,,MACULAR DYSTROPHY WITHOUT CONE DYSFUNCTION,,,,,
109624,C5935682,271110,,SPINAL MUSCULAR ATROPHY WITH MICROCEPHALY AND ...,,,,,
109625,C5942143,,16262221000119100,Mild obstructive sleep apnea of adult,,,,,
109626,C5942144,,16262271000119104,Moderate obstructive sleep apnea of adult,,,,,


### 3.4 Add Disease Ontology

DO and UMLS

In [53]:
df_do_xref_umls = df_do_xref_filter[['id', 'UMLS_CUI','label']].dropna(subset=['UMLS_CUI'], axis=0)
df_do_xref_umls_unique = df_do_xref_umls.groupby('UMLS_CUI').agg({
    'id': lambda x: ';'.join(x.dropna()), 
    'label': lambda x: ' | '.join(x.dropna())}).reset_index()

df_umls_mesh_icd10_icd11_do = pd.merge(df_umls_mesh_icd10_icd11, df_do_xref_umls_unique, left_on='UMLS ID', right_on='UMLS_CUI', how='outer')
df_umls_mesh_icd10_icd11_do_v1 = merge_column(df_umls_mesh_icd10_icd11_do, 'UMLS_CUI', 'UMLS ID', 'UMLS')
df_umls_mesh_icd10_icd11_do_v1

Unnamed: 0,OMIM,SNOMEDCT_US,UMLS_Name,MeSH_Name,MeSH,ICD11_ID,ICD10,ICD11_Title,id,label,UMLS
0,MTHU014871,83123000;190787008,Abetalipoproteinemia,Abetalipoproteinemia,D000012,5C81.Z,E78.6,"Hypolipoproteinaemia, unspecified",DOID:1386,abetalipoproteinemia,C0000744
1,,47344007,Abnormality of secretion of gastrin,,,,,,DOID:13656,gastrin secretion abnormality,C0000774
2,,,"Abortion, Habitual","Abortion, Habitual",D000026,,,,,,C0000809
3,,156087000;198616002;16607004;267187007,Missed abortion,,,JA03,O02.1,Missed abortion,,,C0000814
4,,,,,,,,,DOID:9667,placental abruption,C0000821
...,...,...,...,...,...,...,...,...,...,...,...
112894,,,,Cardiotoxicity,D066126,,,,,,
112895,,,,Pectus Carinatum,D066166,,,,,,
112896,,,,Allesthesia,D066190,,,,,,
112897,,,,Vascular Remodeling,D066253,,,,,,


DO and MESH

In [55]:
unmatched_do = df_do_xref_filter[~df_do_xref_filter['id'].isin(df_do_xref_umls['id'])]

df_do_xref_mesh = unmatched_do[['id', 'MESH','label']].dropna(subset=['MESH'], axis=0)
df_do_xref_mesh_unique = df_do_xref_mesh.groupby('MESH').agg({
    'id': lambda x: ';'.join(x.unique()),
    'label': lambda x: ' | '.join(x.dropna())
}).reset_index()

df_umls_mesh_icd10_icd11_do_v2 = pd.merge(df_umls_mesh_icd10_icd11_do_v1, df_do_xref_mesh_unique, left_on='MeSH', right_on='MESH', how='outer')
df_umls_mesh_icd10_icd11_do_v2 = merge_column(df_umls_mesh_icd10_icd11_do_v2, 'MESH', 'MeSH', 'MeSH_ID')
df_umls_mesh_icd10_icd11_do_v2 = merge_string_columns(df_umls_mesh_icd10_icd11_do_v2, ['id_x', 'id_y'], 'DO_ID', separator=';', split_values=True)
df_umls_mesh_icd10_icd11_do_v2 = merge_string_columns(df_umls_mesh_icd10_icd11_do_v2, ['label_x', 'label_y'], 'DO_Name', separator=' | ', split_values=True)
df_umls_mesh_icd10_icd11_do_v2.replace('', pd.NA, inplace=True)
df_umls_mesh_icd10_icd11_do_v2

Unnamed: 0,OMIM,SNOMEDCT_US,UMLS_Name,MeSH_Name,ICD11_ID,ICD10,ICD11_Title,UMLS,MeSH_ID,DO_ID,DO_Name
0,,707608003,Jalili syndrome,,,,,C3495589,C000596385,DOID:0111404,Jalili syndrome
1,,,"Retinoschisis, Autosomal Dominant",,,,,C1867235,C000598640,,
2,,,Leukoencephalopathy Brain Calcifications and C...,,,,,C3281200,C000598644,,
3,,765092004,Spheroid body myopathy,,,,,C1866785,C000598645,,
4,,,,,,,,,C000598645;C535906,DOID:0080094,myofibrillar myopathy 3
...,...,...,...,...,...,...,...,...,...,...,...
113061,,,,,XY9P,,Postpartum condition or complication,,,,
113062,,,,,XY9Q,,Diagnosis confirmed by histology,,,,
113063,,,,,XY9R,,Diagnosis confirmed by imaging,,,,
113064,,,,,XY9S,,"Unspecified as to episode of care, or not appl...",,,,


DO and ICD10

In [56]:
unmatched_do1 = unmatched_do[~unmatched_do['id'].isin(df_do_xref_mesh['id'])]

df_do_xref_icd10 = unmatched_do1[['id', 'ICD10CM','label']].dropna(subset=['ICD10CM'], axis=0)
df_do_xref_icd10_unique = df_do_xref_icd10.groupby('ICD10CM').agg({
    'id': lambda x: ';'.join(x.unique()),
    'label': lambda x: ' | '.join(x.dropna())
}).reset_index()

df_umls_mesh_icd10_icd11_do_v3 = df_umls_mesh_icd10_icd11_do_v2.copy()
df_umls_mesh_icd10_icd11_do_v3_expanded = df_umls_mesh_icd10_icd11_do_v3.assign(ICD10=df_umls_mesh_icd10_icd11_do_v3['ICD10'].str.split(';')).explode('ICD10')
df_umls_mesh_icd10_icd11_do_v4 = pd.merge(df_umls_mesh_icd10_icd11_do_v3_expanded, df_do_xref_icd10_unique, left_on='ICD10', right_on='ICD10CM', how='outer')
df_umls_mesh_icd10_icd11_do_v4 = merge_column(df_umls_mesh_icd10_icd11_do_v4, 'ICD10', 'ICD10CM', 'ICD10_ID')
df_umls_mesh_icd10_icd11_do_v4 = merge_string_columns(df_umls_mesh_icd10_icd11_do_v4, ['DO_ID', 'id'], 'DO ID', separator=';', split_values=True)
df_umls_mesh_icd10_icd11_do_v4 = merge_string_columns(df_umls_mesh_icd10_icd11_do_v4, ['DO_Name', 'label'], 'DO Name', separator=' | ', split_values=True)
df_umls_mesh_icd10_icd11_do_v4.replace('', pd.NA, inplace=True)
df_umls_mesh_icd10_icd11_do_v4

Unnamed: 0,OMIM,SNOMEDCT_US,UMLS_Name,MeSH_Name,ICD11_ID,ICD11_Title,UMLS,MeSH_ID,ICD10_ID,DO ID,DO Name
0,,186087007;154269008;63650001,Cholera,Cholera,,,C0008354,D002771,A00,DOID:1498,cholera
1,,,Certain infectious and parasitic diseases (A00...,,,,C0694449,,A00-B99,,
2,,,diseases generally recognized as communicable ...,,,,C2880083,,A00-B99,,
3,,,"Cholera due to Vibrio cholerae 01, biovar chol...",,1A00,Cholera,C0494021,,A00.0,,
4,,,"Cholera due to Vibrio cholerae 01, biovar chol...",,XN8P1,"Vibrio cholerae O1, biovar cholerae",C0494021,,A00.0,,
...,...,...,...,...,...,...,...,...,...,...,...
113071,,,,,XY9P,Postpartum condition or complication,,,,,
113072,,,,,XY9Q,Diagnosis confirmed by histology,,,,,
113073,,,,,XY9R,Diagnosis confirmed by imaging,,,,,
113074,,,,,XY9S,"Unspecified as to episode of care, or not appl...",,,,,


DO and OMIM

In [57]:
unmatched_do2 = unmatched_do1[~unmatched_do1['id'].isin(df_do_xref_icd10['id'])]

df_do_xref_omim = unmatched_do2[['id', 'MIM','label']].dropna(subset=['MIM'], axis=0)
df_do_xref_omim_unique = df_do_xref_omim.groupby('MIM').agg({
    'id': lambda x: ';'.join(x.unique()),
    'label': lambda x: ' | '.join(x.dropna())
}).reset_index()

df_umls_mesh_icd10_icd11_do_v7 = df_umls_mesh_icd10_icd11_do_v4.copy()
df_umls_mesh_icd10_icd11_do_v7_expanded = df_umls_mesh_icd10_icd11_do_v7.assign(OMIM=df_umls_mesh_icd10_icd11_do_v7['OMIM'].str.split(';')).explode('OMIM')

df_umls_mesh_icd10_icd11_do_v8 = pd.merge(df_umls_mesh_icd10_icd11_do_v7_expanded, df_do_xref_omim_unique, left_on='OMIM', right_on='MIM', how='outer')
df_umls_mesh_icd10_icd11_do_v9 = merge_column(df_umls_mesh_icd10_icd11_do_v8, 'MIM', 'OMIM', 'OMIM_ID')
df_umls_mesh_icd10_icd11_do_v9 = merge_string_columns(df_umls_mesh_icd10_icd11_do_v9, ['DO ID', 'id'], 'DO_ID', separator=';', split_values=True)
df_umls_mesh_icd10_icd11_do_v9 = merge_string_columns(df_umls_mesh_icd10_icd11_do_v9, ['DO Name', 'label'], 'DO_Name', separator=' | ', split_values=True)
df_umls_mesh_icd10_icd11_do_v9.replace('', pd.NA, inplace=True)
df_umls_mesh_icd10_icd11_do_v9

Unnamed: 0,SNOMEDCT_US,UMLS_Name,MeSH_Name,ICD11_ID,ICD11_Title,UMLS,MeSH_ID,ICD10_ID,OMIM_ID,DO_ID,DO_Name
0,,"AARSKOG SYNDROME, AUTOSOMAL DOMINANT",,,,C3149220,,,100050,DOID:0111825,autosomal dominant Aarskog syndrome
1,,"MULTIPLE PTERYGIUM SYNDROME, LETHAL TYPE",,,,C1854678,,,100690,,
2,,"MYASTHENIC SYNDROME, CONGENITAL, 1A, SLOW-CHANNEL",,,,C4084823,,,100690,,
3,,"MYASTHENIC SYNDROME, CONGENITAL, 1B, FAST-CHANNEL",,,,C4225405,,,100690,,
4,,"MYASTHENIC SYNDROME, CONGENITAL, 2C, ASSOCIATE...",,,,C4225373,,,100710,,
...,...,...,...,...,...,...,...,...,...,...,...
117475,,,,XY9P,Postpartum condition or complication,,,,,,
117476,,,,XY9Q,Diagnosis confirmed by histology,,,,,,
117477,,,,XY9R,Diagnosis confirmed by imaging,,,,,,
117478,,,,XY9S,"Unspecified as to episode of care, or not appl...",,,,,,


### 3.5 Add Mondo

In [58]:
df_umls_mesh_icd10_icd11_do_full_exploded = df_umls_mesh_icd10_icd11_do_v9.assign(UMLS=df_umls_mesh_icd10_icd11_do_v9['UMLS'].str.split(';')).explode('UMLS')
df_umls_mesh_icd10_icd11_do_mondo = pd.merge(df_umls_mesh_icd10_icd11_do_full_exploded, df_mondo_merge_umls, left_on='UMLS', right_on='UMLS_CUI', how='outer')
df_umls_mesh_icd10_icd11_do_mondo = merge_column(df_umls_mesh_icd10_icd11_do_mondo, 'UMLS_CUI', 'UMLS', 'UMLS_ID')
df_umls_mesh_icd10_icd11_do_mondo = merge_column(df_umls_mesh_icd10_icd11_do_mondo, 'OMIM_ID_x', 'OMIM_ID_y', 'OMIM_ID')
df_umls_mesh_icd10_icd11_do_mondo

Unnamed: 0,SNOMEDCT_US,UMLS_Name,MeSH_Name,ICD11_ID,ICD11_Title,MeSH_ID,ICD10_ID,DO_ID,DO_Name,MONDO_ID,MONDO_Name,UMLS_ID,OMIM_ID
0,83123000;190787008,Abetalipoproteinemia,Abetalipoproteinemia,5C81.Z,"Hypolipoproteinaemia, unspecified",D000012,E78.6,DOID:1386,abetalipoproteinemia,MONDO:0008692,abetalipoproteinemia,C0000744,MTHU014871
0,83123000;190787008,Abetalipoproteinemia,Abetalipoproteinemia,5C81.Z,"Hypolipoproteinaemia, unspecified",D000012,E78.6,DOID:1386,abetalipoproteinemia,MONDO:0008692,abetalipoproteinemia,C0000744,200100
1,47344007,Abnormality of secretion of gastrin,,,,,,DOID:13656,gastrin secretion abnormality,MONDO:0001770,gastrin secretion abnormality,C0000774,
2,,"Abortion, Habitual","Abortion, Habitual",,,D000026,,,,MONDO:0006774,habitual spontaneous abortion,C0000809,
3,156087000;198616002;16607004;267187007,Missed abortion,,JA03,Missed abortion,,O02.1,,,,,C0000814,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122726,,,,XY9P,Postpartum condition or complication,,,,,,,,
122727,,,,XY9Q,Diagnosis confirmed by histology,,,,,,,,
122728,,,,XY9R,Diagnosis confirmed by imaging,,,,,,,,
122729,,,,XY9S,"Unspecified as to episode of care, or not appl...",,,,,,,,


In [59]:
df_umls_mesh_icd10_icd11_do_mondo_v1 = df_umls_mesh_icd10_icd11_do_mondo.copy()
df_umls_mesh_icd10_icd11_do_mondo_v1_exploded = df_umls_mesh_icd10_icd11_do_mondo_v1.assign(MeSH_ID=df_umls_mesh_icd10_icd11_do_mondo_v1['MeSH_ID'].str.split(';')).explode('MeSH_ID')
df_umls_mesh_icd10_icd11_do_mondo_v1 = pd.merge(df_umls_mesh_icd10_icd11_do_mondo_v1_exploded, df_mondo_merge_rest, left_on='MeSH_ID', right_on='MESH_ID', how='outer')
df_umls_mesh_icd10_icd11_do_mondo_v1 = merge_column(df_umls_mesh_icd10_icd11_do_mondo_v1, 'MESH_ID', 'MeSH_ID', 'MeSH')
df_umls_mesh_icd10_icd11_do_mondo_v1 = merge_column(df_umls_mesh_icd10_icd11_do_mondo_v1, 'OMIM_ID_x', 'OMIM_ID_y', 'OMIM_ID')
df_umls_mesh_icd10_icd11_do_mondo_v1 = merge_column(df_umls_mesh_icd10_icd11_do_mondo_v1, 'MONDO_ID_x', 'MONDO_ID_y', 'MONDO_ID')
df_umls_mesh_icd10_icd11_do_mondo_v1 = merge_string_columns(df_umls_mesh_icd10_icd11_do_mondo_v1, {'MONDO_Name_x', 'MONDO_Name_y'}, 'MONDO_Name', separator=' | ', split_values=True)
df_umls_mesh_icd10_icd11_do_mondo_v1

Unnamed: 0,SNOMEDCT_US,UMLS_Name,MeSH_Name,ICD11_ID,ICD11_Title,ICD10_ID,DO_ID,DO_Name,UMLS_ID,MeSH,OMIM_ID,MONDO_ID,MONDO_Name
0,707608003,Jalili syndrome,,,,,DOID:0111404,Jalili syndrome,C3495589,C000596385,217080,MONDO:0009007,Jalili syndrome
1,,"Retinoschisis, Autosomal Dominant",,,,,,,C1867235,C000598640,180270,MONDO:0008382,"retinoschisis, autosomal dominant"
2,,Leukoencephalopathy Brain Calcifications and C...,,,,,,,C3281200,C000598644,614561,MONDO:0013803,leukoencephalopathy with calcifications and cysts
3,765092004,Spheroid body myopathy,,,,,,,C1866785,C000598645,,,
4,,,,,,,DOID:0080094,myofibrillar myopathy 3,,C000598645,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
124250,,,,XY9P,Postpartum condition or complication,,,,,,,,
124251,,,,XY9Q,Diagnosis confirmed by histology,,,,,,,,
124252,,,,XY9R,Diagnosis confirmed by imaging,,,,,,,,
124253,,,,XY9S,"Unspecified as to episode of care, or not appl...",,,,,,,,


### 3.6 Add SNOMED CT Name

In [60]:
# use df_snomed_name to add SNOMED CT name
df_umls_mesh_icd10_icd11_do_mondo_v1['SNOMEDCT_US'] = df_umls_mesh_icd10_icd11_do_mondo_v1['SNOMEDCT_US'].str.split(';')
df_umls_mesh_icd10_icd11_do_mondo_v1 = df_umls_mesh_icd10_icd11_do_mondo_v1.explode('SNOMEDCT_US')

df_umls_mesh_icd10_icd11_do_mondo_v1 = pd.merge(df_umls_mesh_icd10_icd11_do_mondo_v1, df_snomed_name, left_on='SNOMEDCT_US', right_on='conceptId', how='left')
df_umls_mesh_icd10_icd11_do_mondo_v1.drop(columns=['conceptId'], inplace=True)
df_umls_mesh_icd10_icd11_do_mondo_v1.rename(columns={'term': 'SNOMEDCT_Name'}, inplace=True)
df_umls_mesh_icd10_icd11_do_mondo_v1

Unnamed: 0,SNOMEDCT_US,UMLS_Name,MeSH_Name,ICD11_ID,ICD11_Title,ICD10_ID,DO_ID,DO_Name,UMLS_ID,MeSH,OMIM_ID,MONDO_ID,MONDO_Name,SNOMEDCT_Name
0,707608003,Jalili syndrome,,,,,DOID:0111404,Jalili syndrome,C3495589,C000596385,217080,MONDO:0009007,Jalili syndrome,Jalili syndrome | Amelogenesis imperfecta co-o...
1,,"Retinoschisis, Autosomal Dominant",,,,,,,C1867235,C000598640,180270,MONDO:0008382,"retinoschisis, autosomal dominant",
2,,Leukoencephalopathy Brain Calcifications and C...,,,,,,,C3281200,C000598644,614561,MONDO:0013803,leukoencephalopathy with calcifications and cysts,
3,765092004,Spheroid body myopathy,,,,,,,C1866785,C000598645,,,,Spheroid body myopathy | Spheroid body myopath...
4,,,,,,,DOID:0080094,myofibrillar myopathy 3,,C000598645,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129613,,,,XY9P,Postpartum condition or complication,,,,,,,,,
129614,,,,XY9Q,Diagnosis confirmed by histology,,,,,,,,,
129615,,,,XY9R,Diagnosis confirmed by imaging,,,,,,,,,
129616,,,,XY9S,"Unspecified as to episode of care, or not appl...",,,,,,,,,


### 3.7 Make UMLS unique

In [85]:
def data_cleaning(df, column):
    df_no_duplicates = df[~(df[column].duplicated(keep=False) & df[column].notna()) | df[column].isna()]
    df_duplicates = df[df[column].notna() & df.duplicated(column, keep=False)].sort_values(column)

    def choose_separator(colname):
        if 'name' in colname.lower():
            return ' | '
        elif 'title' in colname.lower():
            return ' | '
        else:
            return ';'

    def merge_entries(series):
        filtered = series.dropna().astype(str)
        colname = series.name
        sep = choose_separator(colname)
        return sep.join(filtered.unique())

    df_duplicates_merged = df_duplicates.groupby(column).agg(merge_entries).reset_index()

    return df_no_duplicates, df_duplicates_merged


In [86]:
unique_umls = df_umls_mesh_icd10_icd11_do_mondo_v1.copy()
unique_umls['UMLS_ID'] = unique_umls['UMLS_ID'].str.split(';')
unique_umls = unique_umls.explode('UMLS_ID')
unique_umls.drop_duplicates(inplace=True)
unique_umls.reset_index(drop=True, inplace=True)
unique_umls

Unnamed: 0,SNOMEDCT_US,UMLS_Name,MeSH_Name,ICD11_ID,ICD11_Title,ICD10_ID,DO_ID,DO_Name,UMLS_ID,MeSH,OMIM_ID,MONDO_ID,MONDO_Name,SNOMEDCT_Name
0,707608003,Jalili syndrome,,,,,DOID:0111404,Jalili syndrome,C3495589,C000596385,217080,MONDO:0009007,Jalili syndrome,Jalili syndrome | Amelogenesis imperfecta co-o...
1,,"Retinoschisis, Autosomal Dominant",,,,,,,C1867235,C000598640,180270,MONDO:0008382,"retinoschisis, autosomal dominant",
2,,Leukoencephalopathy Brain Calcifications and C...,,,,,,,C3281200,C000598644,614561,MONDO:0013803,leukoencephalopathy with calcifications and cysts,
3,765092004,Spheroid body myopathy,,,,,,,C1866785,C000598645,,,,Spheroid body myopathy | Spheroid body myopath...
4,,,,,,,DOID:0080094,myofibrillar myopathy 3,,C000598645,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129614,,,,XY9P,Postpartum condition or complication,,,,,,,,,
129615,,,,XY9Q,Diagnosis confirmed by histology,,,,,,,,,
129616,,,,XY9R,Diagnosis confirmed by imaging,,,,,,,,,
129617,,,,XY9S,"Unspecified as to episode of care, or not appl...",,,,,,,,,


In [87]:
no_duplicated_umls, duplicated_umls = data_cleaning(unique_umls, 'UMLS_ID')
final_disease = pd.concat([no_duplicated_umls, duplicated_umls], ignore_index=True)
final_disease.replace('', pd.NA, inplace=True)
final_disease

Unnamed: 0,SNOMEDCT_US,UMLS_Name,MeSH_Name,ICD11_ID,ICD11_Title,ICD10_ID,DO_ID,DO_Name,UMLS_ID,MeSH,OMIM_ID,MONDO_ID,MONDO_Name,SNOMEDCT_Name
0,707608003,Jalili syndrome,,,,,DOID:0111404,Jalili syndrome,C3495589,C000596385,217080,MONDO:0009007,Jalili syndrome,Jalili syndrome | Amelogenesis imperfecta co-o...
1,,"Retinoschisis, Autosomal Dominant",,,,,,,C1867235,C000598640,180270,MONDO:0008382,"retinoschisis, autosomal dominant",
2,,Leukoencephalopathy Brain Calcifications and C...,,,,,,,C3281200,C000598644,614561,MONDO:0013803,leukoencephalopathy with calcifications and cysts,
3,765092004,Spheroid body myopathy,,,,,,,C1866785,C000598645,,,,Spheroid body myopathy | Spheroid body myopath...
4,,,,,,,DOID:0080094,myofibrillar myopathy 3,,C000598645,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118809,,IMMUNODEFICIENCY 123 WITH HPV-RELATED VERRUCOSIS,,,,,,,C5935639,,186760;620901,MONDO:0971177,immunodeficiency 123 with HPV-related verrucosis,
118810,,POLYCYSTIC KIDNEY DISEASE 8,,,,,,,C5935640,,609799;620903,MONDO:0971178,polycystic kidney disease 8,
118811,,ARTERIAL TORTUOSITY-BONE FRAGILITY SYNDROME,,,,,,,C5935641,,130660;620908,MONDO:0971179,arterial tortuosity-bone fragility syndrome,
118812,,OTOFACIAL NEURODEVELOPMENTAL SYNDROME,,,,,,,C5935642,,618365;620910,MONDO:0975705,otofacial neurodevelopmental syndrome,


In [88]:
final_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118814 entries, 0 to 118813
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   SNOMEDCT_US    44458 non-null  object
 1   UMLS_Name      69361 non-null  object
 2   MeSH_Name      7156 non-null   object
 3   ICD11_ID       40690 non-null  object
 4   ICD11_Title    40586 non-null  object
 5   ICD10_ID       21931 non-null  object
 6   DO_ID          10689 non-null  object
 7   DO_Name        10689 non-null  object
 8   UMLS_ID        77484 non-null  object
 9   MeSH           12485 non-null  object
 10  OMIM_ID        12902 non-null  object
 11  MONDO_ID       21598 non-null  object
 12  MONDO_Name     21598 non-null  object
 13  SNOMEDCT_Name  44458 non-null  object
dtypes: object(14)
memory usage: 12.7+ MB


## 4. BioMedgraphica ID

In [89]:
biomedgraphica_disease = final_disease.copy()
biomedgraphica_disease = biomedgraphica_disease.sort_values(by=['UMLS_ID'], na_position='last')
biomedgraphica_disease.reset_index(drop=True, inplace=True)

max_length = len(str(len(biomedgraphica_disease)))
biomedgraphica_disease['BioMedGraphica_ID'] = ['BMG_DS' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_disease) + 1)]
biomedgraphica_disease.rename(columns={'SNOMEDCT_US':'SNOMEDCT_ID', 'MeSH': 'MeSH_ID'}, inplace=True)

columns = ['BioMedGraphica_ID'] + [col for col in biomedgraphica_disease.columns if col != 'BioMedGraphica_ID']  # re-order columns
biomedgraphica_disease = biomedgraphica_disease[columns]
biomedgraphica_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118814 entries, 0 to 118813
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   BioMedGraphica_ID  118814 non-null  object
 1   SNOMEDCT_ID        44458 non-null   object
 2   UMLS_Name          69361 non-null   object
 3   MeSH_Name          7156 non-null    object
 4   ICD11_ID           40690 non-null   object
 5   ICD11_Title        40586 non-null   object
 6   ICD10_ID           21931 non-null   object
 7   DO_ID              10689 non-null   object
 8   DO_Name            10689 non-null   object
 9   UMLS_ID            77484 non-null   object
 10  MeSH_ID            12485 non-null   object
 11  OMIM_ID            12902 non-null   object
 12  MONDO_ID           21598 non-null   object
 13  MONDO_Name         21598 non-null   object
 14  SNOMEDCT_Name      44458 non-null   object
dtypes: object(15)
memory usage: 13.6+ MB


In [90]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Disease.csv'
biomedgraphica_disease.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Disease\BioMedGraphica_Disease.csv


## 5. File Generation

In [79]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease' / 'BioMedGraphica_Disease.csv'
biomedgraphica_disease = pd.read_csv(target_dir, dtype=str)

GUI Name

In [91]:
def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(sorted(combined))

    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

gui_name = biomedgraphica_disease.copy()
gui_name = merge_string_columns(
    gui_name, 
    ['UMLS_Name', 'MeSH_Name', 'ICD11_Title', 'DO_Name', 'MONDO_Name','SNOMEDCT_Name'], 
    'Disease_Name_List',
    separator=' | '
)

gui_name = gui_name[['BioMedGraphica_ID', 'Disease_Name_List']]
gui_name

Unnamed: 0,BioMedGraphica_ID,Disease_Name_List
0,BMG_DS000001,ABL - Abetalipoproteinaemia | ABL - Abetalipop...
1,BMG_DS000002,Abnormality of secretion of gastrin | Abnormal...
2,BMG_DS000003,"Abortion, Habitual | habitual spontaneous abor..."
3,BMG_DS000004,MA - Missed abortion | Missed abortion | Misse...
4,BMG_DS000005,placental abruption
...,...,...
118809,BMG_DS118810,Postpartum condition or complication
118810,BMG_DS118811,Diagnosis confirmed by histology
118811,BMG_DS118812,Diagnosis confirmed by imaging
118812,BMG_DS118813,"Unspecified as to episode of care, or not appl..."


In [92]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Disease_GUI_Name.csv'
gui_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Disease\BioMedGraphica_Disease_GUI_Name.csv


LLM Name and ID

In [93]:
llm_name_id = biomedgraphica_disease.copy()

# Handle cases where we have multiple IDs separated by semicolons
llm_name_id['SNOMEDCT_ID'] = llm_name_id['SNOMEDCT_ID'].apply(
    lambda x: ' | '.join(f"SNOMEDCT ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)
llm_name_id['ICD11_ID'] = llm_name_id['ICD11_ID'].apply(
    lambda x: ' | '.join(f"ICD11 ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)
llm_name_id['MeSH_ID'] = llm_name_id['MeSH_ID'].apply(
    lambda x: ' | '.join(f"MeSH ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)
llm_name_id['OMIM_ID'] = llm_name_id['OMIM_ID'].apply(
    lambda x: ' | '.join(f"OMIM ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)
llm_name_id['ICD10_ID'] = llm_name_id['ICD10_ID'].apply(
    lambda x: ' | '.join(f"ICD10 ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)
llm_name_id['UMLS_ID'] = llm_name_id['UMLS_ID'].apply(
    lambda x: ' | '.join(f"UMLS ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

column_order = ['BioMedGraphica_ID', 'UMLS_Name', 'UMLS_ID', 'SNOMEDCT_Name', 'SNOMEDCT_ID', 'ICD11_Title', 'ICD11_ID', 
                'MONDO_Name', 'MONDO_ID', 'MeSH_Name', 'MeSH_ID', 'DO_Name', 'DO_ID',
                'ICD10_ID', 'OMIM_ID']

llm_name_id = llm_name_id[column_order]
llm_name_id

Unnamed: 0,BioMedGraphica_ID,UMLS_Name,UMLS_ID,SNOMEDCT_Name,SNOMEDCT_ID,ICD11_Title,ICD11_ID,MONDO_Name,MONDO_ID,MeSH_Name,MeSH_ID,DO_Name,DO_ID,ICD10_ID,OMIM_ID
0,BMG_DS000001,Abetalipoproteinemia,UMLS ID:C0000744,Abetalipoproteinemia | Abetalipoproteinaemia |...,SNOMEDCT ID:83123000 | SNOMEDCT ID:190787008,"Hypolipoproteinaemia, unspecified",ICD11 ID:5C81.Z,abetalipoproteinemia,MONDO:0008692,Abetalipoproteinemia,MeSH ID:D000012,abetalipoproteinemia,DOID:1386,ICD10 ID:E78.6,OMIM ID:MTHU014871 | OMIM ID:200100
1,BMG_DS000002,Abnormality of secretion of gastrin,UMLS ID:C0000774,Abnormality of secretion of gastrin | Abnormal...,SNOMEDCT ID:47344007,,,gastrin secretion abnormality,MONDO:0001770,,,gastrin secretion abnormality,DOID:13656,,
2,BMG_DS000003,"Abortion, Habitual",UMLS ID:C0000809,,,,,habitual spontaneous abortion,MONDO:0006774,"Abortion, Habitual",MeSH ID:D000026,,,,
3,BMG_DS000004,Missed abortion,UMLS ID:C0000814,Missed abortion | Missed abortion (disorder) |...,SNOMEDCT ID:267187007 | SNOMEDCT ID:198616002 ...,Missed abortion,ICD11 ID:JA03,,,,,,,ICD10 ID:O02.1,
4,BMG_DS000005,,UMLS ID:C0000821,,,,,,,,,placental abruption,DOID:9667,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118809,BMG_DS118810,,,,,Postpartum condition or complication,ICD11 ID:XY9P,,,,,,,,
118810,BMG_DS118811,,,,,Diagnosis confirmed by histology,ICD11 ID:XY9Q,,,,,,,,
118811,BMG_DS118812,,,,,Diagnosis confirmed by imaging,ICD11 ID:XY9R,,,,,,,,
118812,BMG_DS118813,,,,,"Unspecified as to episode of care, or not appl...",ICD11 ID:XY9S,,,,,,,,


In [94]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Disease_LLM_Name_ID.csv'
llm_name_id.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Disease\BioMedGraphica_Disease_LLM_Name_ID.csv


LLM Name and ID Combined

In [95]:
llm_combined = llm_name_id.copy()

def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

llm_combined = merge_string_columns(llm_combined, ['UMLS_Name', 'UMLS_ID', 'SNOMEDCT_ID', 'SNOMEDCT_Name', 'ICD11_Title', 'ICD11_ID', 
                'MONDO_Name', 'MONDO_ID', 'MeSH_Name', 'MeSH_ID', 'DO_Name', 'DO_ID',
                'ICD10_ID', 'OMIM_ID'], 'Names_and_IDs')
llm_combined

Unnamed: 0,BioMedGraphica_ID,Names_and_IDs
0,BMG_DS000001,Apolipoprotein B deficiency | SNOMEDCT ID:1907...
1,BMG_DS000002,gastrin secretion abnormality | Abnormality of...
2,BMG_DS000003,UMLS ID:C0000809 | MeSH ID:D000026 | habitual ...
3,BMG_DS000004,ICD11 ID:JA03 | SNOMEDCT ID:156087000 | Silent...
4,BMG_DS000005,UMLS ID:C0000821 | DOID:9667 | placental abrup...
...,...,...
118809,BMG_DS118810,ICD11 ID:XY9P | Postpartum condition or compli...
118810,BMG_DS118811,ICD11 ID:XY9Q | Diagnosis confirmed by histology
118811,BMG_DS118812,ICD11 ID:XY9R | Diagnosis confirmed by imaging
118812,BMG_DS118813,"Unspecified as to episode of care, or not appl..."


In [96]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Disease_LLM_Name_ID_Combined.csv'
llm_combined.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Disease\BioMedGraphica_Disease_LLM_Name_ID_Combined.csv


Display Name

In [97]:
display_name = biomedgraphica_disease.copy()

display_name['BMG_Disease_Name'] = display_name['UMLS_Name'].fillna(display_name['SNOMEDCT_Name']).fillna(display_name['MONDO_Name']).fillna(display_name['DO_Name']).fillna(display_name['MeSH_Name']).fillna(display_name['ICD11_Title'])
display_name = display_name[['BioMedGraphica_ID', 'BMG_Disease_Name']]
display_name

Unnamed: 0,BioMedGraphica_ID,BMG_Disease_Name
0,BMG_DS000001,Abetalipoproteinemia
1,BMG_DS000002,Abnormality of secretion of gastrin
2,BMG_DS000003,"Abortion, Habitual"
3,BMG_DS000004,Missed abortion
4,BMG_DS000005,placental abruption
...,...,...
118809,BMG_DS118810,Postpartum condition or complication
118810,BMG_DS118811,Diagnosis confirmed by histology
118811,BMG_DS118812,Diagnosis confirmed by imaging
118812,BMG_DS118813,"Unspecified as to episode of care, or not appl..."


In [98]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Disease_Display_Name.csv'
display_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Disease\BioMedGraphica_Disease_Display_Name.csv


## 6. Description

In [99]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease' / 'BioMedGraphica_Disease.csv'
biomedgraphica_disease = pd.read_csv(target_dir, dtype=str)

### 6.1 From MeSH

In [100]:
# Download link: https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.xml 

import xml.etree.ElementTree as ET
import pandas as pd

def extract_disease_descriptors_with_scope(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    disease_descriptors = []

    for descriptor_record in root.findall('.//DescriptorRecord'):
        tree_numbers = [tree_number.text for tree_number in descriptor_record.findall('.//TreeNumberList/TreeNumber')]

        # Disease's tree number starts with 'C'
        if any(tree_number.startswith('C') for tree_number in tree_numbers):
            descriptor_ui = descriptor_record.find('.//DescriptorUI').text
            scope_note_element = descriptor_record.find('.//ScopeNote')
            scope_note = scope_note_element.text if scope_note_element is not None else ""

            disease_descriptors.append((descriptor_ui, scope_note))

    return disease_descriptors

def save_to_csv(disease_descriptors, output_file):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["DescriptorUI", "ScopeNote"])
        for descriptor in disease_descriptors:
            writer.writerow(descriptor)

input_file_path = 'desc2025.xml'
disease_list = extract_disease_descriptors_with_scope(input_file_path)
df_MeSH = pd.DataFrame(disease_list, columns=['DescriptorUI', 'ScopeNote'])
print(df_MeSH.head())

  DescriptorUI                                          ScopeNote
0      D000006  A clinical syndrome with acute abdominal pain ...
1      D000007  General or unspecified injuries involving orga...
2      D000008  New abnormal growth of tissue in the ABDOMEN.\...
3      D000012  An autosomal recessive disorder of lipid metab...
4      D000013  Malformations of organs or body parts during d...


In [106]:
df_MeSH

Unnamed: 0,DescriptorUI,ScopeNote
0,D000006,A clinical syndrome with acute abdominal pain ...
1,D000007,General or unspecified injuries involving orga...
2,D000008,New abnormal growth of tissue in the ABDOMEN.\...
3,D000012,An autosomal recessive disorder of lipid metab...
4,D000013,Malformations of organs or body parts during d...
...,...,...
5051,D000099067,A rare and aggressive hematologic malignancy d...
5052,D000099070,"A photochemical injury to retina tissues, usua..."
5053,D000099072,"Allergic reaction to certain foods, including ..."
5054,D000099074,Co-occurrence of DIABETES MELLITUS and OBESITY...


In [107]:
bmg_disease = biomedgraphica_disease[['BioMedGraphica_ID', 'MeSH_ID']]
bmg_disease['MeSH_ID'] = bmg_disease['MeSH_ID'].str.split(';')
bmg_disease = bmg_disease.explode('MeSH_ID')

mesh_description = pd.merge(bmg_disease, df_MeSH, left_on='MeSH_ID', right_on='DescriptorUI', how='left')
mesh_description = mesh_description.drop(columns=['MeSH_ID', 'DescriptorUI'])
mesh_description = mesh_description.groupby('BioMedGraphica_ID').agg({'ScopeNote': lambda x: ' | '.join(x.dropna().astype(str))}).reset_index()
mesh_description.replace('', pd.NA, inplace=True)
mesh_description.rename(columns={'ScopeNote': 'MeSH_Description'}, inplace=True)
mesh_description['MeSH_Description'] = mesh_description['MeSH_Description'].str.strip()
mesh_description

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmg_disease['MeSH_ID'] = bmg_disease['MeSH_ID'].str.split(';')


Unnamed: 0,BioMedGraphica_ID,MeSH_Description
0,BMG_DS000001,An autosomal recessive disorder of lipid metab...
1,BMG_DS000002,
2,BMG_DS000003,Three or more consecutive spontaneous abortions.
3,BMG_DS000004,
4,BMG_DS000005,
...,...,...
118809,BMG_DS118810,
118810,BMG_DS118811,
118811,BMG_DS118812,
118812,BMG_DS118813,


### 6.2 From UMLS

In [102]:
umls = pd.read_csv('2024AB/META/MRDEF.RRF', sep='|', header=None)
umls.columns = ['UMLS ID', 'drop1', 'drop2', 'drop3','database','def', 'drop4', 'drop5', 'drop6']
umls.drop(columns=['drop1', 'drop2', 'drop3', 'drop4', 'drop5', 'drop6'], inplace=True)
umls

Unnamed: 0,UMLS ID,database,def
0,C0000039,MSH,Synthetic phospholipid used in liposomes and l...
1,C0000039,MSHSWE,Syntetisk fosfolipid som används i liposomer o...
2,C0000039,MSHCZE,Syntetický fosfolipid používaný v liposomech a...
3,C0000039,MSHPOR,Fosfolipídeo sintético utilizado em lipossomos...
4,C0000039,MSHSPA,Fosfolípido sintético que se utiliza en liposo...
...,...,...,...
453747,C5942180,SNOMEDCT_US,Three or more spontaneous bleeds into a joint ...
453748,C5942184,SNOMEDCT_US,Institutional insurance claims are electronic ...
453749,C5942185,SNOMEDCT_US,Pharmacy insurance claims are electronic or pa...
453750,C5942186,SNOMEDCT_US,Professional insurance claims are electronic o...


In [109]:
bmg_disease = biomedgraphica_disease[['BioMedGraphica_ID', 'UMLS_ID']]
bmg_disease['UMLS_ID'] = bmg_disease['UMLS_ID'].str.split(';')
bmg_disease = bmg_disease.explode('UMLS_ID')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmg_disease['UMLS_ID'] = bmg_disease['UMLS_ID'].str.split(';')


In [110]:
umls_filter = umls[umls['UMLS ID'].isin(bmg_disease['UMLS_ID'])]
umls_filter = umls_filter.drop_duplicates(subset=['UMLS ID'])
umls_filter = umls_filter[(umls_filter['database'] == 'NCI') | (umls_filter['database'] == 'MSH') | (umls_filter['database'] == 'SNOMEDCT_US') | 
                          (umls_filter['database'] == 'HPO') | (umls_filter['database'] == 'ORPHANET')]
umls_pivot = umls_filter.pivot_table(index='UMLS ID', columns='database', values='def', aggfunc=lambda x: ' | '.join(x))
umls_pivot.reset_index(inplace=True)
umls_pivot

database,UMLS ID,HPO,MSH,NCI,ORPHANET,SNOMEDCT_US
0,C0000744,,An autosomal recessive disorder of lipid metab...,,,
1,C0000809,,Three or more consecutive spontaneous abortions.,,,
2,C0000814,,The retention in the UTERUS of a dead FETUS tw...,,,
3,C0000821,,UTERINE BLEEDING from a GESTATION of less than...,,,
4,C0000823,,Premature expulsion of the FETUS in animals.,,,
...,...,...,...,...,...,...
15527,C5924853,,,,,A rare hyper-IgE syndrome with characteristics...
15528,C5940598,,Co-occurrence of DIABETES MELLITUS and OBESITY...,,,
15529,C5942143,,,,,Patients classified as having mild obstructive...
15530,C5942144,,,,,Patients classified as having moderate obstruc...


In [111]:
umls_description = pd.merge(bmg_disease, umls_pivot, left_on='UMLS_ID', right_on='UMLS ID', how='left')
umls_description.drop(columns=['UMLS_ID', 'UMLS ID'], inplace=True)
umls_description = umls_description.groupby('BioMedGraphica_ID').agg(
    {'MSH': lambda x: ' | '.join(x.dropna().unique()), 
     'NCI': lambda x: ' | '.join(x.dropna().unique()), 
     'SNOMEDCT_US': lambda x: ' | '.join(x.dropna().unique()), 
     'HPO': lambda x: ' | '.join(x.dropna().unique()), 
     'ORPHANET': lambda x: ' | '.join(x.dropna().unique())
}).reset_index()
umls_description.replace('', pd.NA, inplace=True)
umls_description

Unnamed: 0,BioMedGraphica_ID,MSH,NCI,SNOMEDCT_US,HPO,ORPHANET
0,BMG_DS000001,An autosomal recessive disorder of lipid metab...,,,,
1,BMG_DS000002,,,,,
2,BMG_DS000003,Three or more consecutive spontaneous abortions.,,,,
3,BMG_DS000004,The retention in the UTERUS of a dead FETUS tw...,,,,
4,BMG_DS000005,UTERINE BLEEDING from a GESTATION of less than...,,,,
...,...,...,...,...,...,...
118809,BMG_DS118810,,,,,
118810,BMG_DS118811,,,,,
118811,BMG_DS118812,,,,,
118812,BMG_DS118813,,,,,


### 6.3 From MONDO

In [112]:
import re
import pandas as pd

def parse_obo_terms(file_path):
    terms = []
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        term_blocks = content.split("[Term]")[1:] 

        for block in term_blocks:
            term_id = re.search(r"id: (\S+)", block)
            term_def = re.search(r'def: "(.*?)"', block, re.DOTALL)

            if term_id and term_def:
                terms.append({
                    "id": term_id.group(1),
                    "def": term_def.group(1).strip()
                })

    return terms

obo_file = "mondo.obo"

terms_list = parse_obo_terms(obo_file)
mondo_description = [term for term in terms_list if "MONDO" in term['id']]
mondo_description = pd.DataFrame(mondo_description)
mondo_description

Unnamed: 0,id,def
0,MONDO:0000001,A disease is a disposition to undergo patholog...
1,MONDO:0000004,An endocrine or hormonal disorder that occurs ...
2,MONDO:0000015,A genetic deficiency of any early component of...
3,MONDO:0000022,Urination during sleep.
4,MONDO:0000044,Hypophosphatemic rickets is a group of genetic...
...,...,...
18348,MONDO:8000014,Autosomal dominant form of antiphospholipid sy...
18349,MONDO:8000015,"Any 46,XY complete gonadal dysgenesis in which..."
18350,MONDO:8000018,Idiopathic recurrent vertigo associated with p...
18351,MONDO:8000023,"A rare, primary immunodeficiency. It is caused..."


In [113]:
bmg_disease = biomedgraphica_disease[['BioMedGraphica_ID', 'MONDO_ID']]
bmg_disease['MONDO_ID'] = bmg_disease['MONDO_ID'].str.split(';')
bmg_disease = bmg_disease.explode('MONDO_ID')

mondo_description = pd.merge(bmg_disease, mondo_description, left_on='MONDO_ID', right_on='id', how='left')
mondo_description.drop(columns=['MONDO_ID', 'id'], inplace=True)
mondo_description = mondo_description.groupby('BioMedGraphica_ID').agg({'def': lambda x: ' | '.join(x.dropna().unique())}).reset_index()
mondo_description.replace('', pd.NA, inplace=True)
mondo_description.rename(columns={'def': 'MONDO'}, inplace=True)
mondo_description

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmg_disease['MONDO_ID'] = bmg_disease['MONDO_ID'].str.split(';')


Unnamed: 0,BioMedGraphica_ID,MONDO
0,BMG_DS000001,Abetalipoproteinemia/ homozygous familial hypo...
1,BMG_DS000002,
2,BMG_DS000003,Three or more consecutive spontaneous abortions.
3,BMG_DS000004,
4,BMG_DS000005,
...,...,...
118809,BMG_DS118810,
118810,BMG_DS118811,
118811,BMG_DS118812,
118812,BMG_DS118813,


### 6.4 Final Description

In [114]:
def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

disease_description = pd.merge(mesh_description, umls_description, on='BioMedGraphica_ID', how='outer')
disease_description = pd.merge(disease_description, mondo_description, on='BioMedGraphica_ID', how='outer')
disease_description = merge_string_columns(disease_description, ['MeSH_Description', 'MSH'], 'MeSH')
disease_description.replace('', pd.NA, inplace=True)
disease_description

Unnamed: 0,BioMedGraphica_ID,NCI,SNOMEDCT_US,HPO,ORPHANET,MONDO,MeSH
0,BMG_DS000001,,,,,Abetalipoproteinemia/ homozygous familial hypo...,An autosomal recessive disorder of lipid metab...
1,BMG_DS000002,,,,,,
2,BMG_DS000003,,,,,Three or more consecutive spontaneous abortions.,Three or more consecutive spontaneous abortions.
3,BMG_DS000004,,,,,,The retention in the UTERUS of a dead FETUS tw...
4,BMG_DS000005,,,,,,UTERINE BLEEDING from a GESTATION of less than...
...,...,...,...,...,...,...,...
118809,BMG_DS118810,,,,,,
118810,BMG_DS118811,,,,,,
118811,BMG_DS118812,,,,,,
118812,BMG_DS118813,,,,,,


In [115]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Disease_Description.csv'
disease_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Disease\BioMedGraphica_Disease_Description.csv


### 6.5 Combined Description

In [116]:
comb_description = disease_description.copy()

column_names = comb_description.columns.tolist()
column_names = [col for col in column_names if col != 'BioMedGraphica_ID']

for col in column_names:
    comb_description[col] = comb_description[col].apply(lambda x: ' | '.join([f"{col}: {i}" for i in x.split(' | ')]) if pd.notna(x) else x)

# now we can merge the columns into one
comb_description['Description'] = comb_description[column_names].apply(lambda x: ' | '.join(x.dropna()), axis=1)
comb_description = comb_description[['BioMedGraphica_ID', 'Description']]
comb_description


Unnamed: 0,BioMedGraphica_ID,Description
0,BMG_DS000001,MONDO: Abetalipoproteinemia/ homozygous famili...
1,BMG_DS000002,
2,BMG_DS000003,MONDO: Three or more consecutive spontaneous a...
3,BMG_DS000004,MeSH: The retention in the UTERUS of a dead FE...
4,BMG_DS000005,MeSH: UTERINE BLEEDING from a GESTATION of les...
...,...,...
118809,BMG_DS118810,
118810,BMG_DS118811,
118811,BMG_DS118812,
118812,BMG_DS118813,


In [117]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Disease_Description_Combined.csv'
comb_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Disease\BioMedGraphica_Disease_Description_Combined.csv
