### HPO

In [1]:
# Download Link: https://github.com/obophenotype/human-phenotype-ontology/releases/download/2025-03-03/phenotype.hpoa
# Download Date: 2025-03-21
# Download Version: 2025-03-03
import pandas as pd

# Load the HPO annotation table (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
# NOTE(review): comment='#' also discards the remainder of any data line that
# contains '#'; confirm no annotation field uses that character.
df_hpo = pd.read_csv('phenotype.hpoa', delimiter='\t', comment='#', low_memory=False)

# Keep only OMIM-sourced annotations. na=False treats a missing database_id as
# "not OMIM" so the boolean mask never contains NaN.
is_omim = df_hpo['database_id'].str.startswith('OMIM', na=False)

# Build the (database_id, hpo_id) pair table as a fresh frame rather than
# assigning into a filtered slice of df_hpo.
df_hpo_omim = (
    df_hpo.loc[is_omim, ['database_id', 'hpo_id']]
    # Strip the literal 'OMIM:' prefix so only the bare identifier remains.
    .assign(database_id=lambda frame: frame['database_id'].str.replace('OMIM:', '', regex=False))
    .drop_duplicates()
)
df_hpo_omim

  df_hpo= pd.read_csv('phenotype.hpoa', delimiter='\t', comment='#')


Unnamed: 0,database_id,hpo_id
0,619340,HP:0011097
1,619340,HP:0002187
2,619340,HP:0001518
3,619340,HP:0032792
4,619340,HP:0011451
...,...,...
156811,611705,HP:0000508
156812,611705,HP:0000007
156813,611705,HP:0001635
156814,611705,HP:0001678


### BioMedGraphica ID

In [2]:
import pandas as pd
import os
from pathlib import Path

# The entity tables live under the directory three levels above the current
# working directory.
current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
entity_root = grandparent_dir / 'BioMedGraphica' / 'Entity'
target_dir_phenotype = entity_root / 'Phenotype' / 'BioMedGraphica_Phenotype.csv'
target_dir_disease = entity_root / 'Disease' / 'BioMedGraphica_Disease.csv'

# dtype=str reads every column as text, so identifier columns are not
# converted to numeric types.
biomedgraphica_phenotype, biomedgraphica_disease = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (target_dir_phenotype, target_dir_disease)
)

### HPO Mapping

In [3]:
# One row per (single HPO id, BioMedGraphica phenotype id) pair: rows without
# an HPO_ID are dropped, ';'-separated HPO_ID cells are split and exploded,
# and repeated pairs are removed.
hpo_individual = (
    biomedgraphica_phenotype[['HPO_ID', 'BioMedGraphica_ID']]
    .dropna(subset=['HPO_ID'])
    .assign(HPO_ID=lambda frame: frame['HPO_ID'].str.split(';'))
    .explode('HPO_ID')
    .drop_duplicates()
)

# Lookup: HPO id -> ';'-joined string of the distinct BioMedGraphica ids that carry it.
hpo_groups = hpo_individual.groupby('HPO_ID')['BioMedGraphica_ID']
hpo_to_individual = hpo_groups.apply(lambda ids: ';'.join(ids.dropna().unique())).to_dict()

# Annotate each HPO/OMIM pair with its phenotype id(s); unmapped HPO ids become NaN.
df_hpo_omim['From_ID'] = df_hpo_omim['hpo_id'].map(hpo_to_individual)
df_hpo_omim

Unnamed: 0,database_id,hpo_id,From_ID
0,619340,HP:0011097,BMG_PH07758
1,619340,HP:0002187,BMG_PH01650
2,619340,HP:0001518,BMG_PH01149
3,619340,HP:0032792,BMG_PH13238
4,619340,HP:0011451,BMG_PH08108
...,...,...,...
156811,611705,HP:0000508,BMG_PH00393
156812,611705,HP:0000007,BMG_PH00005
156813,611705,HP:0001635,BMG_PH01221
156814,611705,HP:0001678,BMG_PH01257


In [4]:
# One row per (single OMIM id, BioMedGraphica disease id) pair: rows without
# an OMIM_ID are dropped, ';'-separated OMIM_ID cells are split and exploded,
# and repeated pairs are removed.
omim_individual = (
    biomedgraphica_disease[['OMIM_ID', 'BioMedGraphica_ID']]
    .dropna(subset=['OMIM_ID'])
    .assign(OMIM_ID=lambda frame: frame['OMIM_ID'].str.split(';'))
    .explode('OMIM_ID')
    .drop_duplicates()
)

# Lookup: OMIM id -> ';'-joined string of the distinct BioMedGraphica ids that carry it.
omim_groups = omim_individual.groupby('OMIM_ID')['BioMedGraphica_ID']
omim_to_individual = omim_groups.apply(lambda ids: ';'.join(ids.dropna().unique())).to_dict()

# Annotate each HPO/OMIM pair with its disease id(s); unmapped OMIM ids become NaN.
df_hpo_omim['To_ID'] = df_hpo_omim['database_id'].map(omim_to_individual)
df_hpo_omim

Unnamed: 0,database_id,hpo_id,From_ID,To_ID
0,619340,HP:0011097,BMG_PH07758,BMG_DS070605
1,619340,HP:0002187,BMG_PH01650,BMG_DS070605
2,619340,HP:0001518,BMG_PH01149,BMG_DS070605
3,619340,HP:0032792,BMG_PH13238,BMG_DS070605
4,619340,HP:0011451,BMG_PH08108,BMG_DS070605
...,...,...,...,...
156811,611705,HP:0000508,BMG_PH00393,BMG_DS043409;BMG_DS080448
156812,611705,HP:0000007,BMG_PH00005,BMG_DS043409;BMG_DS080448
156813,611705,HP:0001635,BMG_PH01221,BMG_DS043409;BMG_DS080448
156814,611705,HP:0001678,BMG_PH01257,BMG_DS043409;BMG_DS080448


In [5]:
# Keep only the pairs where both the phenotype id (From_ID) and the disease id
# (To_ID) were mapped; a row missing either one is dropped.
phen_disease = df_hpo_omim[['From_ID', 'To_ID']].dropna(subset=['From_ID', 'To_ID'])
phen_disease

Unnamed: 0,From_ID,To_ID
0,BMG_PH07758,BMG_DS070605
1,BMG_PH01650,BMG_DS070605
2,BMG_PH01149,BMG_DS070605
3,BMG_PH13238,BMG_DS070605
4,BMG_PH08108,BMG_DS070605
...,...,...
156811,BMG_PH00393,BMG_DS043409;BMG_DS080448
156812,BMG_PH00005,BMG_DS043409;BMG_DS080448
156813,BMG_PH01221,BMG_DS043409;BMG_DS080448
156814,BMG_PH01257,BMG_DS043409;BMG_DS080448


In [6]:
# Expand multi-valued mappings into one edge per (phenotype id, disease id):
# each ';'-joined cell is split into a list and exploded, so a row with m
# From_ID values and n To_ID values yields m * n rows.
phen_disease = (
    phen_disease
    .assign(
        From_ID=phen_disease['From_ID'].str.split(';'),
        To_ID=phen_disease['To_ID'].str.split(';'),
    )
    .explode('From_ID')
    .explode('To_ID')
    # drop_duplicates returns a new frame; the result must be kept, otherwise
    # repeated edges survive and later receive their own edge ids.
    .drop_duplicates()
    .reset_index(drop=True)
)
phen_disease

Unnamed: 0,From_ID,To_ID
0,BMG_PH07758,BMG_DS070605
1,BMG_PH01650,BMG_DS070605
2,BMG_PH01149,BMG_DS070605
3,BMG_PH13238,BMG_DS070605
4,BMG_PH08108,BMG_DS070605
...,...,...
181187,BMG_PH01221,BMG_DS080448
181188,BMG_PH01257,BMG_DS043409
181189,BMG_PH01257,BMG_DS080448
181190,BMG_PH02624,BMG_DS043409


### Phenotype-Disease Relation

In [7]:
# Edge ids are 'BMG_ED_PHDS' followed by a 1-based counter, zero-padded to the
# number of digits in the total edge count.
edge_count = len(phen_disease)
max_length = len(str(edge_count))
edge_ids = [f'BMG_ED_PHDS{number:0{max_length}d}' for number in range(1, edge_count + 1)]

# Attach the constant provenance/type labels together with the generated ids.
phen_disease = phen_disease.assign(
    Source='HPO',
    Type='Phenotype-Disease',
    BioMedGraphica_ID=edge_ids,
)

# Move the id column to the front; the remaining columns keep their order.
columns = ['BioMedGraphica_ID', *phen_disease.columns.drop('BioMedGraphica_ID')]
phen_disease = phen_disease[columns]
phen_disease

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
0,BMG_ED_PHDS000001,BMG_PH07758,BMG_DS070605,HPO,Phenotype-Disease
1,BMG_ED_PHDS000002,BMG_PH01650,BMG_DS070605,HPO,Phenotype-Disease
2,BMG_ED_PHDS000003,BMG_PH01149,BMG_DS070605,HPO,Phenotype-Disease
3,BMG_ED_PHDS000004,BMG_PH13238,BMG_DS070605,HPO,Phenotype-Disease
4,BMG_ED_PHDS000005,BMG_PH08108,BMG_DS070605,HPO,Phenotype-Disease
...,...,...,...,...,...
181187,BMG_ED_PHDS181188,BMG_PH01221,BMG_DS080448,HPO,Phenotype-Disease
181188,BMG_ED_PHDS181189,BMG_PH01257,BMG_DS043409,HPO,Phenotype-Disease
181189,BMG_ED_PHDS181190,BMG_PH01257,BMG_DS080448,HPO,Phenotype-Disease
181190,BMG_ED_PHDS181191,BMG_PH02624,BMG_DS043409,HPO,Phenotype-Disease


In [8]:
import os
from pathlib import Path

# Anchor the output location three directory levels above the current working directory.
current_working_dir = Path.cwd().resolve()
grandparent_dir = current_working_dir.parent.parent.parent

# Create the relation folder (and any missing parents) only when it is absent,
# and report the creation.
target_folder = grandparent_dir.joinpath('BioMedGraphica', 'Relation', 'Phenotype-Disease')
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

# Write the edge table without the positional index column.
output_file_path = target_folder / 'BioMedGraphica_Phenotype_Disease.csv'
phen_disease.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Phenotype-Disease\BioMedGraphica_Phenotype_Disease.csv
