### HPO

In [1]:
# Download Link: https://github.com/obophenotype/human-phenotype-ontology/releases/download/2025-03-03/hp.obo
# Download Date: 2025-03-21
# Download Version: 2025-03-03
import pandas as pd

def parse_obo(file_content):
    """Parse the ``[Term]`` stanzas of an OBO document into dictionaries.

    Args:
        file_content: Iterable of text lines (for example the list returned by
            ``file.readlines()``). Each line is stripped before inspection.

    Returns:
        A list with one dict per non-empty ``[Term]`` stanza, in file order.
        Every ``tag: value`` line is stored under ``tag``: a tag seen once maps
        to a string, a repeated tag maps to a list of its values in order of
        appearance. Non-blank lines inside a term that lack the ``": "``
        separator are collected in a list under the ``'unknown'`` key.

    Lines outside ``[Term]`` stanzas (the file header, ``[Typedef]`` and any
    other stanza type such as ``[Instance]``) are ignored.
    """
    terms = []
    term = {}
    in_term = False

    for line in file_content:
        line = line.strip()
        if not line:
            continue

        # Any bracketed line opens a new stanza and therefore closes the
        # current term. Only "[Term]" stanzas are collected; without this,
        # the tags of e.g. an "[Instance]" stanza would be merged into the
        # preceding term.
        if line.startswith("[") and line.endswith("]"):
            if term:
                terms.append(term)
            term = {}
            in_term = line == "[Term]"
            continue

        if not in_term:
            continue

        if ": " in line:
            key, value = line.split(": ", 1)
            if key not in term:
                term[key] = value
            elif isinstance(term[key], list):
                term[key].append(value)
            else:
                # Second occurrence of a tag: promote the scalar to a list.
                term[key] = [term[key], value]
        else:
            term.setdefault('unknown', []).append(line)

    # Flush the last stanza when the file ends inside a term.
    if term:
        terms.append(term)

    return terms

# Local copy of the HPO release referenced in the download notes at the top of this cell.
file_path = 'hp.obo'
# Decode explicitly as UTF-8 so the parse does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows).
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.readlines()

parsed_terms = parse_obo(content)

# One row per [Term] stanza; tags missing from a term become NaN.
df_hpo = pd.DataFrame(parsed_terms)
# The parser stores repeated xref tags as a list; flatten those to a ';'-separated string.
df_hpo['xref'] = df_hpo['xref'].apply(lambda x: ';'.join(x) if isinstance(x, list) else x)
df_hpo

Unnamed: 0,id,name,comment,xref,def,synonym,is_a,property_value,creation_date,alt_id,subset,is_obsolete,replaced_by,consider
0,HP:0000001,All,Root of all terms in the Human Phenotype Ontol...,UMLS:C0444868,,,,,,,,,,
1,HP:0000002,Abnormality of body height,,UMLS:C4025901,"""Deviation from the norm of height with respec...","""Abnormality of body height"" EXACT layperson []",HP:0001507 ! Growth abnormality,terms:creator https://orcid.org/0000-0002-0736...,2008-02-27T02:20:00Z,,,,,
2,HP:0000003,Multicystic kidney dysplasia,Multicystic kidney dysplasia is the result of ...,SNOMEDCT_US:204962002;SNOMEDCT_US:82525005;UML...,"""Multicystic dysplasia of the kidney is charac...","[""Multicystic dysplastic kidney"" EXACT [], ""Mu...",HP:0000107 ! Renal cyst,,,HP:0004715,,,,
3,HP:0000005,Mode of inheritance,While there is a close conceptual relationship...,UMLS:C1708511,"""The pattern in which a particular genetic tra...","""Inheritance"" EXACT []",HP:0000001 ! All,,,"[HP:0001425, HP:0001453, HP:0001461, HP:0010985]",,,,
4,HP:0000006,Autosomal dominant inheritance,,SNOMEDCT_US:263681008;UMLS:C0443147,"""A mode of inheritance that is observed for tr...","[""Autosomal dominant"" EXACT [], ""Autosomal dom...",HP:0034345 ! Mendelian inheritance,,,"[HP:0001415, HP:0001447, HP:0001448, HP:000145...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19528,HP:6001198,Scapholunate interval widening,,,"""Widening of the space between the scaphoid an...","""Terry-Thomas sign"" EXACT []",HP:0001191 ! Abnormal carpal morphology,[IAO:0000233 https://github.com/obophenotype/h...,,,,,,
19529,HP:6001199,FInger pulp localization,,,"""Applies to an abnormality whose distribution ...","""Symptoms localized to pulp of the finger"" EXA...",HP:0012836 ! Spatial pattern,[IAO:0000233 https://github.com/obophenotype/h...,,,,,,
19530,HP:6001200,Ulnar wrist pain,The ulnar side of the wrist is the side of the...,,"""An unpleasant sensation characterized by phys...",,HP:0030836 ! Wrist pain,[IAO:0000233 https://github.com/obophenotype/h...,,,,,,
19531,HP:6001201,Lunotriquetral interval widening,This finding may be observed with lunotriquetr...,,"""Radiographic widening of the space between th...",,HP:0001191 ! Abnormal carpal morphology,[IAO:0000233 https://github.com/obophenotype/h...,,,,,,


In [2]:
def _extract_parent_ids(is_a):
    """Return the parent term IDs held in one ``is_a`` cell (a string or a list of strings)."""
    entries = is_a if isinstance(is_a, list) else [is_a]
    # Each entry has the form "<parent id> ! <parent name>"; keep only the ID.
    # Working on the values directly (instead of on the list's string repr)
    # keeps commas, apostrophes or brackets in parent names from corrupting IDs.
    return [str(entry).split(' ! ', 1)[0].strip() for entry in entries]


# Keep only terms that have both an ID and at least one is_a parent.
df_hpo_relation = df_hpo[['id', 'is_a']].dropna(subset=['id', 'is_a'])
df_hpo_relation = df_hpo_relation.assign(is_a=df_hpo_relation['is_a'].apply(_extract_parent_ids))
# One row per (child, parent) pair.
df_hpo_relation = df_hpo_relation.explode('is_a')
df_hpo_relation = df_hpo_relation.rename(columns={'id': 'child_id', 'is_a': 'parent_id'})
df_hpo_relation

Unnamed: 0,child_id,parent_id
1,HP:0000002,HP:0001507
2,HP:0000003,HP:0000107
3,HP:0000005,HP:0000001
4,HP:0000006,HP:0034345
5,HP:0000007,HP:0034345
...,...,...
19528,HP:6001198,HP:0001191
19529,HP:6001199,HP:0012836
19530,HP:6001200,HP:0030836
19531,HP:6001201,HP:0001191


### BioMedGraphica ID

In [3]:
import pandas as pd
import os
from pathlib import Path

# Resolve the directory three levels above the notebook's working directory.
current_working_dir = Path.cwd().resolve()
grandparent_dir = current_working_dir.parent.parent.parent

# Phenotype entity table; every column is read as a string.
target_dir_phenotype = grandparent_dir.joinpath(
    'BioMedGraphica', 'Entity', 'Phenotype', 'BioMedGraphica_Phenotype.csv'
)
biomedgraphica_phenotype = pd.read_csv(target_dir_phenotype, dtype=str)

### HPO Mapping

In [4]:
# One row per (HPO_ID, BioMedGraphica_ID) pair: rows without an HPO_ID are
# dropped and ';'-separated HPO_ID cells are split into separate rows.
hpo_individual = (
    biomedgraphica_phenotype[['HPO_ID', 'BioMedGraphica_ID']]
    .dropna(subset=['HPO_ID'])
    .assign(HPO_ID=lambda frame: frame['HPO_ID'].str.split(';'))
    .explode('HPO_ID')
    .drop_duplicates()
)

# Lookup: HPO_ID -> ';'-joined unique BioMedGraphica_IDs that carry it.
hpo_to_individual = {
    hpo_id: ';'.join(bmg_ids.dropna().unique())
    for hpo_id, bmg_ids in hpo_individual.groupby('HPO_ID')['BioMedGraphica_ID']
}

In [5]:
# Translate HPO IDs to BioMedGraphica IDs: the child term becomes To_ID and
# the parent term becomes From_ID. HPO IDs absent from the lookup map to NaN.
df_hpo_relation = df_hpo_relation.assign(
    To_ID=df_hpo_relation['child_id'].map(hpo_to_individual),
    From_ID=df_hpo_relation['parent_id'].map(hpo_to_individual),
)
df_hpo_relation

Unnamed: 0,child_id,parent_id,To_ID,From_ID
1,HP:0000002,HP:0001507,BMG_PH00001,BMG_PH01144
2,HP:0000003,HP:0000107,BMG_PH00002,BMG_PH00094
3,HP:0000005,HP:0000001,BMG_PH00003,
4,HP:0000006,HP:0034345,BMG_PH00004,BMG_PH14790
5,HP:0000007,HP:0034345,BMG_PH00005,BMG_PH14790
...,...,...,...,...
19528,HP:6001198,HP:0001191,BMG_PH19528,BMG_PH00940
19529,HP:6001199,HP:0012836,BMG_PH19529,BMG_PH09487
19530,HP:6001200,HP:0030836,BMG_PH19530,BMG_PH11355
19531,HP:6001201,HP:0001191,BMG_PH19531,BMG_PH00940


In [6]:
# Keep only edges whose two endpoints both resolved to a BioMedGraphica ID.
phen_phen = df_hpo_relation.loc[:, ['From_ID', 'To_ID']].dropna(
    subset=['From_ID', 'To_ID'], how='any'
)
phen_phen

Unnamed: 0,From_ID,To_ID
1,BMG_PH01144,BMG_PH00001
2,BMG_PH00094,BMG_PH00002
4,BMG_PH14790,BMG_PH00004
5,BMG_PH14790,BMG_PH00005
6,BMG_PH00640,BMG_PH00006
...,...,...
19528,BMG_PH00940,BMG_PH19528
19529,BMG_PH09487,BMG_PH19529
19530,BMG_PH11355,BMG_PH19530
19531,BMG_PH00940,BMG_PH19531


In [7]:
# An endpoint may hold several ';'-joined BioMedGraphica IDs; expand each side
# so every row is a single (From_ID, To_ID) edge.
phen_phen = (
    phen_phen
    .assign(To_ID=lambda edges: edges['To_ID'].str.split(';'))
    .explode('To_ID')
    .assign(From_ID=lambda edges: edges['From_ID'].str.split(';'))
    .explode('From_ID')
)

# drop_duplicates returns a new frame; the result must be assigned, otherwise
# duplicate edges survive and later receive distinct edge IDs.
phen_phen = phen_phen.drop_duplicates()
phen_phen

Unnamed: 0,From_ID,To_ID
1,BMG_PH01144,BMG_PH00001
2,BMG_PH00094,BMG_PH00002
4,BMG_PH14790,BMG_PH00004
5,BMG_PH14790,BMG_PH00005
6,BMG_PH00640,BMG_PH00006
...,...,...
19528,BMG_PH00940,BMG_PH19528
19529,BMG_PH09487,BMG_PH19529
19530,BMG_PH11355,BMG_PH19530
19531,BMG_PH00940,BMG_PH19531


### Phenotype-Phenotype Relation

In [8]:
# Constant metadata columns for every edge.
phen_phen = phen_phen.assign(Source='HPO', Type='Phenotype-Phenotype')

# Sequential edge IDs, zero-padded to the number of digits in the edge count.
edge_count = len(phen_phen)
max_length = len(str(edge_count))
phen_phen['BioMedGraphica_ID'] = [
    'BMG_ED_PHPH' + format(i, f'0{max_length}d') for i in range(1, edge_count + 1)
]

# Move the ID column to the front, keeping the remaining columns in their current order.
columns = ['BioMedGraphica_ID', *(col for col in phen_phen.columns if col != 'BioMedGraphica_ID')]
phen_phen = phen_phen.loc[:, columns]
phen_phen

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
1,BMG_ED_PHPH00001,BMG_PH01144,BMG_PH00001,HPO,Phenotype-Phenotype
2,BMG_ED_PHPH00002,BMG_PH00094,BMG_PH00002,HPO,Phenotype-Phenotype
4,BMG_ED_PHPH00003,BMG_PH14790,BMG_PH00004,HPO,Phenotype-Phenotype
5,BMG_ED_PHPH00004,BMG_PH14790,BMG_PH00005,HPO,Phenotype-Phenotype
6,BMG_ED_PHPH00005,BMG_PH00640,BMG_PH00006,HPO,Phenotype-Phenotype
...,...,...,...,...,...
19528,BMG_ED_PHPH23423,BMG_PH00940,BMG_PH19528,HPO,Phenotype-Phenotype
19529,BMG_ED_PHPH23424,BMG_PH09487,BMG_PH19529,HPO,Phenotype-Phenotype
19530,BMG_ED_PHPH23425,BMG_PH11355,BMG_PH19530,HPO,Phenotype-Phenotype
19531,BMG_ED_PHPH23426,BMG_PH00940,BMG_PH19531,HPO,Phenotype-Phenotype


In [9]:
import os
from pathlib import Path

# Output lives under the directory three levels above the working directory.
current_working_dir = Path.cwd().resolve()
grandparent_dir = current_working_dir.parent.parent.parent

# Create the relation folder (including missing parents) on first use.
target_folder = grandparent_dir.joinpath('BioMedGraphica', 'Relation', 'Phenotype-Phenotype')
folder_missing = not target_folder.exists()
if folder_missing:
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

# Write the edge table without the DataFrame index.
output_file_path = target_folder.joinpath('BioMedGraphica_Phenotype_Phenotype.csv')
phen_phen.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Phenotype-Phenotype\BioMedGraphica_Phenotype_Phenotype.csv
