### HPO

In [1]:
# Download Link: https://hpo.jax.org/data/annotations
# Download Date: 2025-03-21
# Download Version: unknown

import pandas as pd

df_protein_phen = pd.read_csv('genes_to_phenotype.txt', sep='\t')
df_protein_phen = df_protein_phen[['ncbi_gene_id', 'hpo_id']]
df_protein_phen['ncbi_gene_id'] = df_protein_phen['ncbi_gene_id'].astype(str)
df_protein_phen.drop_duplicates(inplace=True)
df_protein_phen

Unnamed: 0,ncbi_gene_id,hpo_id
0,10,HP:0000007
1,10,HP:0001939
2,16,HP:0002460
3,16,HP:0002451
4,16,HP:0008619
...,...,...
316713,120766137,HP:0002209
316714,120766137,HP:0000653
316715,120766137,HP:0045075
316716,120766137,HP:0001596


### BioMedgraphica ID

In [2]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_protein = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein' / 'BioMedGraphica_Protein.csv'
target_dir_phenotype = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Phenotype' / 'BioMedGraphica_Phenotype.csv'
biomedgraphica_protein = pd.read_csv(target_dir_protein, dtype=str)
biomedgraphica_phenotype = pd.read_csv(target_dir_phenotype, dtype=str)

### HPO Mapping

In [3]:
ncbi_individual = biomedgraphica_protein[['NCBI_Gene_ID', 'BioMedGraphica_ID']]
ncbi_individual.dropna(subset=['NCBI_Gene_ID'], inplace=True)
ncbi_individual = ncbi_individual.assign(NCBI_Gene_ID=ncbi_individual['NCBI_Gene_ID'].str.split(';')).explode('NCBI_Gene_ID')

ncbi_to_individual = ncbi_individual.groupby('NCBI_Gene_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_protein_phen['From_ID'] = df_protein_phen['ncbi_gene_id'].map(ncbi_to_individual)
df_protein_phen

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ncbi_individual.dropna(subset=['NCBI_Gene_ID'], inplace=True)


Unnamed: 0,ncbi_gene_id,hpo_id,From_ID
0,10,HP:0000007,BMG_PT038896
1,10,HP:0001939,BMG_PT038896
2,16,HP:0002460,BMG_PT041062;BMG_PT160991
3,16,HP:0002451,BMG_PT041062;BMG_PT160991
4,16,HP:0008619,BMG_PT041062;BMG_PT160991
...,...,...,...
316713,120766137,HP:0002209,BMG_PT038750
316714,120766137,HP:0000653,BMG_PT038750
316715,120766137,HP:0045075,BMG_PT038750
316716,120766137,HP:0001596,BMG_PT038750


In [4]:
hpo_individual = biomedgraphica_phenotype[['HPO_ID', 'BioMedGraphica_ID']]
hpo_individual.dropna(subset=['HPO_ID'], inplace=True)
hpo_individual = hpo_individual.assign(HPO_ID=hpo_individual['HPO_ID'].str.split(';')).explode('HPO_ID')

hpo_to_individual = hpo_individual.groupby('HPO_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_protein_phen['To_ID'] = df_protein_phen['hpo_id'].map(hpo_to_individual)
df_protein_phen

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_individual.dropna(subset=['HPO_ID'], inplace=True)


Unnamed: 0,ncbi_gene_id,hpo_id,From_ID,To_ID
0,10,HP:0000007,BMG_PT038896,BMG_PH00005
1,10,HP:0001939,BMG_PT038896,BMG_PH01448
2,16,HP:0002460,BMG_PT041062;BMG_PT160991,BMG_PH01835
3,16,HP:0002451,BMG_PT041062;BMG_PT160991,BMG_PH01830
4,16,HP:0008619,BMG_PT041062;BMG_PT160991,BMG_PH05649
...,...,...,...,...
316713,120766137,HP:0002209,BMG_PT038750,BMG_PH01670
316714,120766137,HP:0000653,BMG_PT038750,BMG_PH00517
316715,120766137,HP:0045075,BMG_PT038750,BMG_PH16020
316716,120766137,HP:0001596,BMG_PT038750,BMG_PH01194


In [5]:
protein_phen = df_protein_phen[['From_ID', 'To_ID']]
protein_phen.dropna(subset=['From_ID'], inplace=True)
protein_phen.dropna(subset=['To_ID'], inplace=True)
protein_phen

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_phen.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_phen.dropna(subset=['To_ID'], inplace=True)


Unnamed: 0,From_ID,To_ID
0,BMG_PT038896,BMG_PH00005
1,BMG_PT038896,BMG_PH01448
2,BMG_PT041062;BMG_PT160991,BMG_PH01835
3,BMG_PT041062;BMG_PT160991,BMG_PH01830
4,BMG_PT041062;BMG_PT160991,BMG_PH05649
...,...,...
316713,BMG_PT038750,BMG_PH01670
316714,BMG_PT038750,BMG_PH00517
316715,BMG_PT038750,BMG_PH16020
316716,BMG_PT038750,BMG_PH01194


In [6]:
protein_phen['From_ID'] = protein_phen['From_ID'].str.split(';')
protein_phen['To_ID'] = protein_phen['To_ID'].str.split(';')

protein_phen = protein_phen.explode('From_ID')
protein_phen = protein_phen.explode('To_ID')

protein_phen.drop_duplicates(inplace=True)
protein_phen

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_phen['From_ID'] = protein_phen['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_phen['To_ID'] = protein_phen['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PT038896,BMG_PH00005
1,BMG_PT038896,BMG_PH01448
2,BMG_PT041062,BMG_PH01835
2,BMG_PT160991,BMG_PH01835
3,BMG_PT041062,BMG_PH01830
...,...,...
316713,BMG_PT038750,BMG_PH01670
316714,BMG_PT038750,BMG_PH00517
316715,BMG_PT038750,BMG_PH16020
316716,BMG_PT038750,BMG_PH01194


### Protein-Phenotype Relation

In [7]:
protein_phen['Source'] = 'HPO'
protein_phen['Type'] = 'Protein-Phenotype'
max_length = len(str(len(protein_phen)))
protein_phen['BioMedGraphica_ID'] = ['BMG_ED_PTPH' + str(i).zfill(max_length) for i in range(1, len(protein_phen) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in protein_phen.columns if col != 'BioMedGraphica_ID']  # re-order columns
protein_phen = protein_phen[columns]
protein_phen

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
0,BMG_ED_PTPH000001,BMG_PT038896,BMG_PH00005,HPO,Protein-Phenotype
1,BMG_ED_PTPH000002,BMG_PT038896,BMG_PH01448,HPO,Protein-Phenotype
2,BMG_ED_PTPH000003,BMG_PT041062,BMG_PH01835,HPO,Protein-Phenotype
2,BMG_ED_PTPH000004,BMG_PT160991,BMG_PH01835,HPO,Protein-Phenotype
3,BMG_ED_PTPH000005,BMG_PT041062,BMG_PH01830,HPO,Protein-Phenotype
...,...,...,...,...,...
316713,BMG_ED_PTPH478275,BMG_PT038750,BMG_PH01670,HPO,Protein-Phenotype
316714,BMG_ED_PTPH478276,BMG_PT038750,BMG_PH00517,HPO,Protein-Phenotype
316715,BMG_ED_PTPH478277,BMG_PT038750,BMG_PH16020,HPO,Protein-Phenotype
316716,BMG_ED_PTPH478278,BMG_PT038750,BMG_PH01194,HPO,Protein-Phenotype


In [8]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Protein-Phenotype'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_Phenotype.csv'
protein_phen.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Protein-Phenotype\BioMedGraphica_Protein_Phenotype.csv
