# BioMedGraphica Phenotype

## 1. Data Access  
### Direct Download Links  
**HPO**: Can be downloaded directly via the link without the need for registration. [Link](https://hpo.jax.org/data/ontology).  Extract relevant rows containing valid phenotype labels by filtering out unwanted entries that contain raw HPO identifiers.  
**UMLS**: Can be downloaded directly via the link without the need for registration. [Link](https://download.nlm.nih.gov/umls/kss/2024AA/umls-2024AA-full.zip?_gl=1*14ig82q*_ga*MTA5NTI1Nzc2My4xNzEwOTU5NjM5*_ga_7147EPK006*MTcyMzU3NDM0NC41My4xLjE3MjM1NzUyNzYuMC4wLjA.*_ga_P1FPTH9PL4*MTcyMzU3NDM0NC41My4xLjE3MjM1NzUyNzYuMC4wLjA)  

## 2. Load Data

### 2.1 HPO

In [1]:
import pandas as pd

def parse_obo(file_content):
    terms = []
    term = {}
    in_term = False

    for line in file_content:
        line = line.strip()
        if line == "[Term]":
            if term:
                terms.append(term)
            term = {}
            in_term = True
        elif line == "[Typedef]":
            in_term = False
            if term:
                terms.append(term)
            term = {}
        elif in_term:
            if not line:
                continue
            if ": " in line:
                key, value = line.split(": ", 1)
                if key in term:
                    if isinstance(term[key], list):
                        term[key].append(value)
                    else:
                        term[key] = [term[key], value]
                else:
                    term[key] = value
            else:
                if 'unknown' in term:
                    term['unknown'].append(line)
                else:
                    term['unknown'] = [line]
    if term:
        terms.append(term)
    
    return terms

file_path = 'hp.obo'
with open(file_path, 'r') as file:
    content = file.readlines()

parsed_terms = parse_obo(content)

df_hpo = pd.DataFrame(parsed_terms)
df_hpo['xref'] = df_hpo['xref'].apply(lambda x: '; '.join(x) if isinstance(x, list) else x)
df_hpo

Unnamed: 0,id,name,comment,xref,def,synonym,is_a,property_value,creation_date,alt_id,subset,is_obsolete,replaced_by,consider
0,HP:0000001,All,Root of all terms in the Human Phenotype Ontol...,UMLS:C0444868,,,,,,,,,,
1,HP:0000002,Abnormality of body height,,UMLS:C4025901,"""Deviation from the norm of height with respec...","""Abnormality of body height"" EXACT layperson []",HP:0001507 ! Growth abnormality,terms:creator https://orcid.org/0000-0002-0736...,2008-02-27T02:20:00Z,,,,,
2,HP:0000003,Multicystic kidney dysplasia,Multicystic kidney dysplasia is the result of ...,SNOMEDCT_US:204962002; SNOMEDCT_US:82525005; U...,"""Multicystic dysplasia of the kidney is charac...","[""Multicystic dysplastic kidney"" EXACT [], ""Mu...",HP:0000107 ! Renal cyst,,,HP:0004715,,,,
3,HP:0000005,Mode of inheritance,While there is a close conceptual relationship...,UMLS:C1708511,"""The pattern in which a particular genetic tra...","""Inheritance"" EXACT []",HP:0000001 ! All,,,"[HP:0001425, HP:0001453, HP:0001461, HP:0010985]",,,,
4,HP:0000006,Autosomal dominant inheritance,,SNOMEDCT_US:263681008; UMLS:C0443147,"""A mode of inheritance that is observed for tr...","[""Autosomal dominant"" EXACT [], ""Autosomal dom...",HP:0034345 ! Mendelian inheritance,,,"[HP:0001415, HP:0001447, HP:0001448, HP:000145...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19528,HP:6001198,Scapholunate interval widening,,,"""Widening of the space between the scaphoid an...","""Terry-Thomas sign"" EXACT []",HP:0001191 ! Abnormal carpal morphology,[IAO:0000233 https://github.com/obophenotype/h...,,,,,,
19529,HP:6001199,FInger pulp localization,,,"""Applies to an abnormality whose distribution ...","""Symptoms localized to pulp of the finger"" EXA...",HP:0012836 ! Spatial pattern,[IAO:0000233 https://github.com/obophenotype/h...,,,,,,
19530,HP:6001200,Ulnar wrist pain,The ulnar side of the wrist is the side of the...,,"""An unpleasant sensation characterized by phys...",,HP:0030836 ! Wrist pain,[IAO:0000233 https://github.com/obophenotype/h...,,,,,,
19531,HP:6001201,Lunotriquetral interval widening,This finding may be observed with lunotriquetr...,,"""Radiographic widening of the space between th...",,HP:0001191 ! Abnormal carpal morphology,[IAO:0000233 https://github.com/obophenotype/h...,,,,,,


In [2]:
df_hpo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19533 entries, 0 to 19532
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              19533 non-null  object
 1   name            19533 non-null  object
 2   comment         4275 non-null   object
 3   xref            11562 non-null  object
 4   def             16509 non-null  object
 5   synonym         10859 non-null  object
 6   is_a            19076 non-null  object
 7   property_value  12627 non-null  object
 8   creation_date   5011 non-null   object
 9   alt_id          2040 non-null   object
 10  subset          845 non-null    object
 11  is_obsolete     456 non-null    object
 12  replaced_by     362 non-null    object
 13  consider        80 non-null     object
dtypes: object(14)
memory usage: 2.1+ MB


In [3]:
from collections import defaultdict

df_hpo_filter = df_hpo[['id', 'name', 'xref']]
# Split the 'xref' column by the semicolon delimiter and expand into multiple columns
split_xref = df_hpo_filter['xref'].str.split(';', expand=True)

# Add prefix to the new columns for clarity
split_xref.columns = [f'xref_{i+1}' for i in range(split_xref.shape[1])]

# Concatenate the new columns with the original DataFrame
expanded_data = pd.concat([df_hpo_filter.drop(columns=['xref']), split_xref], axis=1)

num_rows = len(expanded_data)

# Initialize a defaultdict to hold lists of IDs for each database
database_dict = defaultdict(lambda: [''] * num_rows)

# Iterate over each row to process the xref columns
for index, row in expanded_data.iterrows():
    row_dict = defaultdict(list)
    for col in split_xref.columns:
        if pd.notna(row[col]):
            db_name, db_id = row[col].split(':', 1)
            row_dict[db_name].append(db_id)
    
    # Update the main database_dict with the row_dict data
    for db_name, db_ids in row_dict.items():
        database_dict[db_name][index] = ';'.join(db_ids)

# Create a new DataFrame from the database_dict
database_df = pd.DataFrame(database_dict)

# Combine the new columns with the original DataFrame (excluding the old xref columns)
final_data = pd.concat([df_hpo_filter.drop(columns=['xref']), database_df], axis=1)
final_data = final_data[['id', 'name', 'UMLS', ' UMLS']]
final_data[' UMLS'] = final_data[' UMLS'].replace('', pd.NA)
final_data['UMLS'] = final_data['UMLS'].replace('', pd.NA)
final_data

Unnamed: 0,id,name,UMLS,UMLS.1
0,HP:0000001,All,C0444868,
1,HP:0000002,Abnormality of body height,C4025901,
2,HP:0000003,Multicystic kidney dysplasia,,C3714581
3,HP:0000005,Mode of inheritance,C1708511,
4,HP:0000006,Autosomal dominant inheritance,,C0443147
...,...,...,...,...
19528,HP:6001198,Scapholunate interval widening,,
19529,HP:6001199,FInger pulp localization,,
19530,HP:6001200,Ulnar wrist pain,,
19531,HP:6001201,Lunotriquetral interval widening,,


In [4]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

final_data = merge_column(final_data, 'UMLS', ' UMLS', 'umls')
final_data = final_data.drop(index = 0).reset_index(drop=True)
final_data = final_data.groupby('id').agg(lambda x: ';'.join(sorted(set(x.dropna().astype(str)))))
final_data = final_data.reset_index()
final_data

Unnamed: 0,id,name,umls
0,HP:0000002,Abnormality of body height,C4025901
1,HP:0000003,Multicystic kidney dysplasia,C3714581
2,HP:0000005,Mode of inheritance,C1708511
3,HP:0000006,Autosomal dominant inheritance,C0443147
4,HP:0000007,Autosomal recessive inheritance,C0441748;C4020899
...,...,...,...
19527,HP:6001198,Scapholunate interval widening,
19528,HP:6001199,FInger pulp localization,
19529,HP:6001200,Ulnar wrist pain,
19530,HP:6001201,Lunotriquetral interval widening,


### 2.2 UMLS

In [8]:
df_umls_name = pd.read_csv('MRCONSO.RRF', sep='|', header=None)
df_umls_name.columns = ['UMLS ID', 'Language', 'Term status', 'Unique identifier for term', 'String type', 
                        'Unique identifier for string', 'Atom status', 'AUI', 'SAUI', 'SCUI', 'SDUI', 'SAB', 'TTY', 'CODE', 'String', 'SRL', 'SUPPRESS', 'CVF','NA']
df_umls_name.drop(columns=['NA'], inplace=True)

df_umls_name_filter = df_umls_name[df_umls_name['Language'] == 'ENG'] # English language
df_umls_name_filter = df_umls_name_filter[df_umls_name['Term status'] == 'P'] # Preferred LUI of the CUI
df_umls_name_filter = df_umls_name_filter[df_umls_name['String type'] == 'PF'] # Preferred form of term
umls_name_filter = df_umls_name_filter[['UMLS ID', 'SAB', 'CODE']]
umls_name_filter.drop_duplicates(inplace=True)
umls_name_filter.reset_index(drop=True, inplace=True)
umls_name_filter

  df_umls_name = pd.read_csv('MRCONSO.RRF', sep='|', header=None)
  df_umls_name_filter = df_umls_name_filter[df_umls_name['Term status'] == 'P'] # Preferred LUI of the CUI
  df_umls_name_filter = df_umls_name_filter[df_umls_name['String type'] == 'PF'] # Preferred form of term
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  umls_name_filter.drop_duplicates(inplace=True)


Unnamed: 0,UMLS ID,SAB,CODE
0,C0000005,MSH,D012711
1,C0000039,RXNORM,1926948
2,C0000039,MTH,NOCODE
3,C0000052,MSH,D015061
4,C0000052,MTH,NOCODE
...,...,...,...
3920226,C5886725,SRC,V-LNC-PT-BR_277
3920227,C5886726,SRC,V-LNC-RU-RU_277
3920228,C5886727,SRC,V-LNC-TR-TR_277
3920229,C5886728,SRC,V-LNC-UK-UA_277


In [19]:
values_to_keep = ['HPO']
df_umls_filter = umls_name_filter[umls_name_filter['SAB'].isin(values_to_keep)]
df_umls_filter = df_umls_filter.drop(columns=['SAB'])
df_umls_filter = df_umls_filter.rename(columns={'CODE': 'HPO_ID'})
umls_hpo = df_umls_filter.groupby('HPO_ID')['UMLS ID'].apply(lambda x: ';'.join(x.unique())).reset_index()
umls_hpo

Unnamed: 0,HPO_ID,UMLS ID
0,HP:0000001,C0444868
1,HP:0000002,C4025901
2,HP:0000005,C1708511
3,HP:0000006,C0443147
4,HP:0000007,C0441748;C4020899
...,...,...
14909,HP:5201010,C5827008
14910,HP:5201011,C0158653
14911,HP:5201012,C0158654
14912,HP:5201013,C5827009


## 3. Merge Data

In [None]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df
    

### 3.1 HPO + UMLS

In [21]:
pheno_umls_hpo = pd.merge(umls_hpo, final_data, left_on='HPO_ID', right_on='id', how='outer')
pheno_umls_hpo = merge_column(pheno_umls_hpo, 'UMLS ID', 'umls', 'UMLS_ID')
pheno_umls_hpo = merge_column(pheno_umls_hpo, 'HPO_ID', 'id', 'HPO ID')
pheno_umls_hpo = pheno_umls_hpo.drop(index=0).reset_index(drop=True)
pheno_umls_hpo = pheno_umls_hpo.groupby('HPO ID').agg(lambda x: ';'.join(sorted(set(x.dropna().astype(str)))))
pheno_umls_hpo = pheno_umls_hpo.reset_index()
pheno_umls_hpo

Unnamed: 0,HPO ID,name,UMLS_ID
0,HP:0000002,Abnormality of body height,C4025901
1,HP:0000003,Multicystic kidney dysplasia,C3714581
2,HP:0000005,Mode of inheritance,C1708511
3,HP:0000006,Autosomal dominant inheritance,C0443147
4,HP:0000007,Autosomal recessive inheritance,C0441748;C4020899
...,...,...,...
19527,HP:6001198,Scapholunate interval widening,
19528,HP:6001199,FInger pulp localization,
19529,HP:6001200,Ulnar wrist pain,
19530,HP:6001201,Lunotriquetral interval widening,


## 4. BioMedGraphica ID

In [22]:
biomedgraphica_phenotype = pheno_umls_hpo.copy()

biomedgraphica_phenotype = biomedgraphica_phenotype.sort_values(by='HPO ID').reset_index(drop=True)
max_length = len(str(len(biomedgraphica_phenotype)))

biomedgraphica_phenotype['BioMedGraphica_ID'] = ['BMG_PH' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_phenotype) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in biomedgraphica_phenotype.columns if col != 'BioMedGraphica_ID']  # re-order columns
biomedgraphica_phenotype = biomedgraphica_phenotype[columns]

biomedgraphica_phenotype = biomedgraphica_phenotype.rename(columns={'HPO ID': 'HPO_ID', 'name': 'HPO_Name'})
biomedgraphica_phenotype

Unnamed: 0,BioMedGraphica_ID,HPO_ID,HPO_Name,UMLS_ID
0,BMG_PH00001,HP:0000002,Abnormality of body height,C4025901
1,BMG_PH00002,HP:0000003,Multicystic kidney dysplasia,C3714581
2,BMG_PH00003,HP:0000005,Mode of inheritance,C1708511
3,BMG_PH00004,HP:0000006,Autosomal dominant inheritance,C0443147
4,BMG_PH00005,HP:0000007,Autosomal recessive inheritance,C0441748;C4020899
...,...,...,...,...
19527,BMG_PH19528,HP:6001198,Scapholunate interval widening,
19528,BMG_PH19529,HP:6001199,FInger pulp localization,
19529,BMG_PH19530,HP:6001200,Ulnar wrist pain,
19530,BMG_PH19531,HP:6001201,Lunotriquetral interval widening,


In [23]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Phenotype.csv'
biomedgraphica_phenotype.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Phenotype\BioMedGraphica_Phenotype.csv


## 5. Description

In [1]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype' / 'BioMedGraphica_Phenotype.csv'
biomedgraphica_phenotype = pd.read_csv(target_dir, dtype=str)

### 5.1 From HPO

In [3]:
import csv

def parse_obo_to_csv(obo_file_path, csv_file_path):
    with open(obo_file_path, 'r') as obo_file:
        lines = obo_file.readlines()

    terms = []
    current_term = {}
    is_in_term_block = False

    for line in lines:
        line = line.strip()

        if line == "[Term]":
            # Save the previous term if it exists
            if current_term:
                terms.append(current_term)
            # Start a new term
            current_term = {}
            is_in_term_block = True
        elif is_in_term_block and line == "":
            # End of the current term block
            if current_term:
                terms.append(current_term)
            current_term = {}
            is_in_term_block = False
        elif is_in_term_block:
            # Parse lines within a term block
            if line.startswith("id: "):
                current_term['id'] = line.split("id: ")[1]
            elif line.startswith("def: "):
                current_term['def'] = line.split("def: ")[1]

    # Add the last term if it exists
    if current_term:
        terms.append(current_term)

    # Write to CSV
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["id", "def"])
        writer.writeheader()
        for term in terms:
            writer.writerow(term)

obo_file = "hp.obo"
csv_file = "hpo_def.csv"

parse_obo_to_csv(obo_file, csv_file)

print(f"Finished: {csv_file}")

Finished: hpo_def.csv


In [2]:
import pandas as pd

hpo_def = pd.read_csv('hpo_def.csv')
hpo_def["def"] = hpo_def["def"].str.replace('"', '', regex=False).str.replace(r'\[.*?\]', '', regex=True).str.strip()
hpo_def

Unnamed: 0,id,def
0,HP:0000001,
1,HP:0000002,Deviation from the norm of height with respect...
2,HP:0000003,Multicystic dysplasia of the kidney is charact...
3,HP:0000005,The pattern in which a particular genetic trai...
4,HP:0000006,A mode of inheritance that is observed for tra...
...,...,...
19528,HP:6001198,Widening of the space between the scaphoid and...
19529,HP:6001199,Applies to an abnormality whose distribution i...
19530,HP:6001200,An unpleasant sensation characterized by physi...
19531,HP:6001201,Radiographic widening of the space between the...


In [3]:
bmg_hpo = biomedgraphica_phenotype[['BioMedGraphica_ID', 'HPO_ID']]
bmg_hpo['HPO_ID'] = bmg_hpo['HPO_ID'].str.split(';')
bmg_hpo = bmg_hpo.explode('HPO_ID')

bmg_hpo_def = bmg_hpo.merge(hpo_def, left_on='HPO_ID', right_on='id', how='left')
bmg_hpo_def = bmg_hpo_def.drop(columns=['id', 'HPO_ID'])
bmg_hpo_def = bmg_hpo_def.rename(columns={'def': 'HPO'})
bmg_hpo_def = bmg_hpo_def.groupby('BioMedGraphica_ID')['HPO'].apply(lambda x: ' | '.join(x.dropna().astype(str).unique())).reset_index()
bmg_hpo_def.replace('', pd.NA, inplace=True)
bmg_hpo_def

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmg_hpo['HPO_ID'] = bmg_hpo['HPO_ID'].str.split(';')


Unnamed: 0,BioMedGraphica_ID,HPO
0,BMG_PH00001,Deviation from the norm of height with respect...
1,BMG_PH00002,Multicystic dysplasia of the kidney is charact...
2,BMG_PH00003,The pattern in which a particular genetic trai...
3,BMG_PH00004,A mode of inheritance that is observed for tra...
4,BMG_PH00005,A mode of inheritance that is observed for tra...
...,...,...
19527,BMG_PH19528,Widening of the space between the scaphoid and...
19528,BMG_PH19529,Applies to an abnormality whose distribution i...
19529,BMG_PH19530,An unpleasant sensation characterized by physi...
19530,BMG_PH19531,Radiographic widening of the space between the...


### 5.2 From UMLS

In [4]:
umls = pd.read_csv('MRDEF.RRF', sep='|', header=None)
umls.columns = ['UMLS ID', 'drop1', 'drop2', 'drop3','database','def', 'drop4', 'drop5', 'drop6']
umls.drop(columns=['drop1', 'drop2', 'drop3', 'drop4', 'drop5', 'drop6'], inplace=True)
umls

Unnamed: 0,UMLS ID,database,def
0,C0000039,MSH,Synthetic phospholipid used in liposomes and l...
1,C0000039,MSHSWE,Syntetisk fosfolipid som används i liposomer o...
2,C0000039,MSHCZE,Syntetický fosfolipid používaný v liposomech a...
3,C0000039,MSHPOR,Fosfolipídeo sintético utilizado em lipossomos...
4,C0000039,MSHSPA,Fosfolípido sintético que se utiliza en liposo...
...,...,...,...
441457,C5886678,MSHPOR,Intervenção com o objetivo de promover hábitos...
441458,C5886690,MSHPOR,Condição de quase morte experimentada por part...
441459,C5886690,MSHSPA,Condición cercana a la muerte experimentada po...
441460,C5886702,MSHPOR,Critérios para que qualquer pessoa adulta e pl...


In [5]:
databases_to_keep = ['NCI', 'GO', 'MSH', 'SNOMEDCT_US', 'ORPHANET']
filtered_umls = umls[umls['database'].isin(databases_to_keep)]
filtered_umls = filtered_umls.pivot_table(index='UMLS ID', columns='database', values='def', aggfunc='first').reset_index()
filtered_umls

database,UMLS ID,GO,MSH,NCI,ORPHANET,SNOMEDCT_US
0,C0000039,,Synthetic phospholipid used in liposomes and l...,,,
1,C0000052,,"In glycogen or amylopectin synthesis, the enzy...",,,
2,C0000084,,"Found in various tissues, particularly in four...",,,
3,C0000096,,A potent cyclic nucleotide phosphodiesterase i...,,,
4,C0000097,,A dopaminergic neurotoxic compound which produ...,,,
...,...,...,...,...,...,...
221515,C5885004,,,,,A mammography report that has been transcribed...
221516,C5885031,,,,,"Teaching about child support, the monthly amou..."
221517,C5885089,,,,,A posterior or backward torsion of the sacrum ...
221518,C5885090,,,,,A posterior or backward torsion of the sacrum ...


In [6]:
bmg_umls = biomedgraphica_phenotype[['BioMedGraphica_ID', 'UMLS_ID']]
bmg_umls['UMLS_ID'] = bmg_umls['UMLS_ID'].str.split(';')
bmg_umls = bmg_umls.explode('UMLS_ID')

bmg_umls_def = pd.merge(bmg_umls, filtered_umls, left_on='UMLS_ID', right_on='UMLS ID', how='left')
bmg_umls_def = bmg_umls_def.drop(columns=['UMLS_ID', 'UMLS ID'])
bmg_umls_def = bmg_umls_def.groupby('BioMedGraphica_ID').agg({
    'NCI': lambda x: ' | '.join(x.dropna().astype(str).unique()),
    'GO': lambda x: ' | '.join(x.dropna().astype(str).unique()),
    'MSH': lambda x: ' | '.join(x.dropna().astype(str).unique()),
    'SNOMEDCT_US': lambda x: ' | '.join(x.dropna().astype(str).unique()),
    'ORPHANET': lambda x: ' | '.join(x.dropna().astype(str).unique())
}).reset_index()
bmg_umls_def.replace('', pd.NA, inplace=True)
bmg_umls_def.rename(columns={'MSH': 'MeSH'}, inplace=True)
bmg_umls_def

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmg_umls['UMLS_ID'] = bmg_umls['UMLS_ID'].str.split(';')


Unnamed: 0,BioMedGraphica_ID,NCI,GO,MeSH,SNOMEDCT_US,ORPHANET
0,BMG_PH00001,,,,,
1,BMG_PH00002,Abnormal development of the kidney that is cha...,,A nongenetic defect due to malformation of the...,A congenital anomaly of the kidney and urinary...,A rare congenital anomaly of the kidney and ur...
2,BMG_PH00003,The manner in which a particular genetic trait...,,The different ways GENES and their ALLELES int...,,
3,BMG_PH00004,,,,,
4,BMG_PH00005,,,,,
...,...,...,...,...,...,...
19527,BMG_PH19528,,,,,
19528,BMG_PH19529,,,,,
19529,BMG_PH19530,,,,,
19530,BMG_PH19531,,,,,


### 5.3 Final Description

In [7]:
phenotype_description = bmg_hpo_def.merge(bmg_umls_def, on='BioMedGraphica_ID', how='outer')
phenotype_description

Unnamed: 0,BioMedGraphica_ID,HPO,NCI,GO,MeSH,SNOMEDCT_US,ORPHANET
0,BMG_PH00001,Deviation from the norm of height with respect...,,,,,
1,BMG_PH00002,Multicystic dysplasia of the kidney is charact...,Abnormal development of the kidney that is cha...,,A nongenetic defect due to malformation of the...,A congenital anomaly of the kidney and urinary...,A rare congenital anomaly of the kidney and ur...
2,BMG_PH00003,The pattern in which a particular genetic trai...,The manner in which a particular genetic trait...,,The different ways GENES and their ALLELES int...,,
3,BMG_PH00004,A mode of inheritance that is observed for tra...,,,,,
4,BMG_PH00005,A mode of inheritance that is observed for tra...,,,,,
...,...,...,...,...,...,...,...
19527,BMG_PH19528,Widening of the space between the scaphoid and...,,,,,
19528,BMG_PH19529,Applies to an abnormality whose distribution i...,,,,,
19529,BMG_PH19530,An unpleasant sensation characterized by physi...,,,,,
19530,BMG_PH19531,Radiographic widening of the space between the...,,,,,


In [8]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Phenotype_Description.csv'
phenotype_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Phenotype\BioMedGraphica_Phenotype_Description.csv


### 5.4 Combined Description

In [9]:
comb_description = phenotype_description.copy()

# add the column name at the beginning of the string
# first, we need to get the column names
column_names = comb_description.columns.tolist()
column_names = [col for col in column_names if col != 'BioMedGraphica_ID']
# then we can apply the function to each column
for col in column_names:
    comb_description[col] = comb_description[col].apply(lambda x: ' | '.join([f"{col}: {i}" for i in x.split(' | ')]) if pd.notna(x) else x)

# now we can merge the columns into one
comb_description['Description'] = comb_description[column_names].apply(lambda x: ' | '.join(x.dropna()), axis=1)
comb_description = comb_description[['BioMedGraphica_ID', 'Description']]
comb_description

Unnamed: 0,BioMedGraphica_ID,Description
0,BMG_PH00001,HPO: Deviation from the norm of height with re...
1,BMG_PH00002,HPO: Multicystic dysplasia of the kidney is ch...
2,BMG_PH00003,HPO: The pattern in which a particular genetic...
3,BMG_PH00004,HPO: A mode of inheritance that is observed fo...
4,BMG_PH00005,HPO: A mode of inheritance that is observed fo...
...,...,...
19527,BMG_PH19528,HPO: Widening of the space between the scaphoi...
19528,BMG_PH19529,HPO: Applies to an abnormality whose distribut...
19529,BMG_PH19530,HPO: An unpleasant sensation characterized by ...
19530,BMG_PH19531,HPO: Radiographic widening of the space betwee...


In [10]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Phenotype_Description_Combined.csv'
comb_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Phenotype\BioMedGraphica_Phenotype_Description_Combined.csv


## 6. File Generation

In [11]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype' / 'BioMedGraphica_Phenotype.csv'
biomedgraphica_phenotype = pd.read_csv(target_dir, dtype=str)

### 6.1 Name and ID

GUI Name

In [12]:
gui_name = biomedgraphica_phenotype.copy()
gui_name.rename(columns={'HPO_Name': 'Phenotype_Name_List'}, inplace=True)
gui_name = gui_name[['BioMedGraphica_ID', 'Phenotype_Name_List']]
gui_name

Unnamed: 0,BioMedGraphica_ID,Phenotype_Name_List
0,BMG_PH00001,Abnormality of body height
1,BMG_PH00002,Multicystic kidney dysplasia
2,BMG_PH00003,Mode of inheritance
3,BMG_PH00004,Autosomal dominant inheritance
4,BMG_PH00005,Autosomal recessive inheritance
...,...,...
19527,BMG_PH19528,Scapholunate interval widening
19528,BMG_PH19529,FInger pulp localization
19529,BMG_PH19530,Ulnar wrist pain
19530,BMG_PH19531,Lunotriquetral interval widening


In [13]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Phenotype_GUI_Name.csv'
gui_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Phenotype\BioMedGraphica_Phenotype_GUI_Name.csv


LLM Name and ID

In [14]:
llm_name_id = biomedgraphica_phenotype.copy()

llm_name_id['HPO_ID'] = llm_name_id['HPO_ID'].apply(
    lambda x: ' | '.join(f"HPO ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['UMLS_ID'] = llm_name_id['UMLS_ID'].apply(
    lambda x: ' | '.join(f"UMLS ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

column_order = ['BioMedGraphica_ID', 'HPO_Name', 'HPO_ID', 'UMLS_ID']
llm_name_id = llm_name_id[column_order]
llm_name_id

Unnamed: 0,BioMedGraphica_ID,HPO_Name,HPO_ID,UMLS_ID
0,BMG_PH00001,Abnormality of body height,HPO ID:HP:0000002,UMLS ID:C4025901
1,BMG_PH00002,Multicystic kidney dysplasia,HPO ID:HP:0000003,UMLS ID:C3714581
2,BMG_PH00003,Mode of inheritance,HPO ID:HP:0000005,UMLS ID:C1708511
3,BMG_PH00004,Autosomal dominant inheritance,HPO ID:HP:0000006,UMLS ID:C0443147
4,BMG_PH00005,Autosomal recessive inheritance,HPO ID:HP:0000007,UMLS ID:C0441748 | UMLS ID:C4020899
...,...,...,...,...
19527,BMG_PH19528,Scapholunate interval widening,HPO ID:HP:6001198,
19528,BMG_PH19529,FInger pulp localization,HPO ID:HP:6001199,
19529,BMG_PH19530,Ulnar wrist pain,HPO ID:HP:6001200,
19530,BMG_PH19531,Lunotriquetral interval widening,HPO ID:HP:6001201,


In [15]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Phenotype_LLM_Name_ID.csv'
llm_name_id.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Phenotype\BioMedGraphica_Phenotype_LLM_Name_ID.csv


LLM Name and ID Combined

In [16]:
llm_combined = llm_name_id.copy()

def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

llm_combined = merge_string_columns(llm_combined, llm_combined.columns[llm_combined.columns != 'BioMedGraphica_ID'], 'Names_and_IDs')
llm_combined

Unnamed: 0,BioMedGraphica_ID,Names_and_IDs
0,BMG_PH00001,Abnormality of body height | UMLS ID:C4025901 ...
1,BMG_PH00002,Multicystic kidney dysplasia | HPO ID:HP:00000...
2,BMG_PH00003,HPO ID:HP:0000005 | UMLS ID:C1708511 | Mode of...
3,BMG_PH00004,UMLS ID:C0443147 | Autosomal dominant inherita...
4,BMG_PH00005,HPO ID:HP:0000007 | UMLS ID:C4020899 | UMLS ID...
...,...,...
19527,BMG_PH19528,Scapholunate interval widening | HPO ID:HP:600...
19528,BMG_PH19529,FInger pulp localization | HPO ID:HP:6001199
19529,BMG_PH19530,HPO ID:HP:6001200 | Ulnar wrist pain
19530,BMG_PH19531,HPO ID:HP:6001201 | Lunotriquetral interval wi...


In [17]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Phenotype_LLM_Name_ID_Combined.csv'
llm_combined.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Phenotype\BioMedGraphica_Phenotype_LLM_Name_ID_Combined.csv


Display Name

In [18]:
display_name = biomedgraphica_phenotype.copy()

display_name['BMG_Phenotype_Name'] = display_name['HPO_Name']
display_name = display_name[['BioMedGraphica_ID', 'BMG_Phenotype_Name']]
display_name

Unnamed: 0,BioMedGraphica_ID,BMG_Phenotype_Name
0,BMG_PH00001,Abnormality of body height
1,BMG_PH00002,Multicystic kidney dysplasia
2,BMG_PH00003,Mode of inheritance
3,BMG_PH00004,Autosomal dominant inheritance
4,BMG_PH00005,Autosomal recessive inheritance
...,...,...
19527,BMG_PH19528,Scapholunate interval widening
19528,BMG_PH19529,FInger pulp localization
19529,BMG_PH19530,Ulnar wrist pain
19530,BMG_PH19531,Lunotriquetral interval widening


In [19]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Phenotype_Display_Name.csv'
display_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Phenotype\BioMedGraphica_Phenotype_Display_Name.csv
