### Disease Ontology

In [3]:
# Download Link: https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/refs/heads/main/src/ontology/HumanDO.obo
# Download Date: 2025-03-21
# Download Version: 2025-03-03
import pandas as pd
import numpy as np

file_path = 'HumanDO.obo'
ids = []
names = []
is_a_list = []

with open(file_path, 'r') as file:
    current_id = None
    current_name = None
    current_is_a = []
    inside_term = False

    for line in file:
        line = line.strip()
        if line == "[Term]":
            inside_term = True
            if current_id and current_name:
                ids.append(current_id)
                names.append(current_name)
                is_a_list.append(';'.join(current_is_a) if current_is_a else np.nan)

            current_id = None
            current_name = None
            current_is_a = []
        elif inside_term:
            if line.startswith("id:"):
                current_id = line.split("id: ")[1]
            elif line.startswith("name:"):
                current_name = line.split("name: ")[1]
            elif line.startswith("is_a:"):
                current_is_a.append(line.split("is_a: ")[1].split(' !')[0])
            elif line == "":
                inside_term = False

    if current_id and current_name:
        ids.append(current_id)
        names.append(current_name)
        is_a_list.append(';'.join(current_is_a) if current_is_a else np.nan)



do = pd.DataFrame({
    'ID': ids,
    'Name': names,
    'is_a': is_a_list
})
do.dropna(subset=['is_a'], inplace=True)
do.drop_duplicates(inplace=True)
do.reset_index(drop=True, inplace=True)
do

Unnamed: 0,ID,Name,is_a
0,DOID:0001816,angiosarcoma,DOID:175
1,DOID:0002116,pterygium,DOID:10124
2,DOID:0014667,disease of metabolism,DOID:4
3,DOID:0040001,shrimp allergy,DOID:0060524
4,DOID:0040002,aspirin allergy,DOID:0060500
...,...,...,...
11831,DOID:9987,orbit sarcoma,DOID:4143
11832,DOID:9988,tertiary neurosyphilis,DOID:8200
11833,DOID:999,hypereosinophilic syndrome,DOID:9500
11834,DOID:9993,hypoglycemia,DOID:4194


### BioMedgraphica ID

In [1]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_disease = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease' / 'BioMedGraphica_Disease.csv'
biomedgraphica_disease = pd.read_csv(target_dir_disease, dtype=str)

### DO Mapping

In [4]:
do_id = biomedgraphica_disease[['DO_ID', 'BioMedGraphica_ID']]
do_id.dropna(subset=['DO_ID'], inplace=True)
do_id = do_id.assign(DO_ID=do_id['DO_ID'].str.split(';')).explode('DO_ID')
do_to_individualID = do_id.groupby('DO_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

do['From_ID'] = do['ID'].map(do_to_individualID)
do = do.assign(is_a=do['is_a'].str.split(';')).explode('is_a')
do['To_ID'] = do['is_a'].map(do_to_individualID)
do

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  do_id.dropna(subset=['DO_ID'], inplace=True)


Unnamed: 0,ID,Name,is_a,From_ID,To_ID
0,DOID:0001816,angiosarcoma,DOID:175,BMG_DS001351;BMG_DS029260,BMG_DS014402;BMG_DS018124;BMG_DS031778;BMG_DS0...
1,DOID:0002116,pterygium,DOID:10124,BMG_DS002722,BMG_DS000704
2,DOID:0014667,disease of metabolism,DOID:4,BMG_DS002005,BMG_DS000858
3,DOID:0040001,shrimp allergy,DOID:0060524,,
4,DOID:0040002,aspirin allergy,DOID:0060500,BMG_DS000275,
...,...,...,...,...,...
11831,DOID:9987,orbit sarcoma,DOID:4143,BMG_DS034368,BMG_DS002243;BMG_DS004485
11832,DOID:9988,tertiary neurosyphilis,DOID:8200,BMG_DS002177,BMG_DS004255
11833,DOID:999,hypereosinophilic syndrome,DOID:9500,BMG_DS001013,BMG_DS001798
11834,DOID:9993,hypoglycemia,DOID:4194,BMG_DS001548,BMG_DS029870


In [5]:
disease_disease = do[['From_ID', 'To_ID']]
disease_disease.dropna(subset=['From_ID'], inplace=True)
disease_disease.dropna(subset=['To_ID'], inplace=True)
disease_disease

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_disease.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_disease.dropna(subset=['To_ID'], inplace=True)


Unnamed: 0,From_ID,To_ID
0,BMG_DS001351;BMG_DS029260,BMG_DS014402;BMG_DS018124;BMG_DS031778;BMG_DS0...
1,BMG_DS002722,BMG_DS000704
2,BMG_DS002005,BMG_DS000858
7,BMG_DS026073,BMG_DS026054
22,BMG_DS026067,BMG_DS026054
...,...,...
11831,BMG_DS034368,BMG_DS002243;BMG_DS004485
11832,BMG_DS002177,BMG_DS004255
11833,BMG_DS001013,BMG_DS001798
11834,BMG_DS001548,BMG_DS029870


In [6]:
disease_disease['From_ID'] = disease_disease['From_ID'].str.split(';')
disease_disease = disease_disease.explode('From_ID')
disease_disease['To_ID'] = disease_disease['To_ID'].str.split(';')
disease_disease = disease_disease.explode('To_ID')
disease_disease.drop_duplicates(inplace=True)
disease_disease.reset_index(drop=True, inplace=True)
disease_disease

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disease_disease['From_ID'] = disease_disease['From_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_DS001351,BMG_DS014402
1,BMG_DS001351,BMG_DS018124
2,BMG_DS001351,BMG_DS031778
3,BMG_DS001351,BMG_DS034251
4,BMG_DS001351,BMG_DS034252
...,...,...
12001,BMG_DS034368,BMG_DS004485
12002,BMG_DS002177,BMG_DS004255
12003,BMG_DS001013,BMG_DS001798
12004,BMG_DS001548,BMG_DS029870


### Disease-Disease Relation

In [7]:
disease_disease['Source'] = 'Disease Ontology'
disease_disease['Type'] = 'Disease-Disease'

max_length = len(str(len(disease_disease)))
disease_disease['BioMedGraphica_ID'] = ['BMG_ED_DSDS' + str(i).zfill(max_length) for i in range(1, len(disease_disease) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in disease_disease.columns if col != 'BioMedGraphica_ID']  # re-order columns
disease_disease = disease_disease[columns]
disease_disease

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
0,BMG_ED_DSDS00001,BMG_DS001351,BMG_DS014402,Disease Ontology,Disease-Disease
1,BMG_ED_DSDS00002,BMG_DS001351,BMG_DS018124,Disease Ontology,Disease-Disease
2,BMG_ED_DSDS00003,BMG_DS001351,BMG_DS031778,Disease Ontology,Disease-Disease
3,BMG_ED_DSDS00004,BMG_DS001351,BMG_DS034251,Disease Ontology,Disease-Disease
4,BMG_ED_DSDS00005,BMG_DS001351,BMG_DS034252,Disease Ontology,Disease-Disease
...,...,...,...,...,...
12001,BMG_ED_DSDS12002,BMG_DS034368,BMG_DS004485,Disease Ontology,Disease-Disease
12002,BMG_ED_DSDS12003,BMG_DS002177,BMG_DS004255,Disease Ontology,Disease-Disease
12003,BMG_ED_DSDS12004,BMG_DS001013,BMG_DS001798,Disease Ontology,Disease-Disease
12004,BMG_ED_DSDS12005,BMG_DS001548,BMG_DS029870,Disease Ontology,Disease-Disease


In [8]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Disease-Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Disease_Disease.csv'
disease_disease.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Disease-Disease\BioMedGraphica_Disease_Disease.csv
