### KEGG

In [1]:
# Download Link: R code
# Download Date: 2025-03-21
# Download Version: 2025-03-21

import pandas as pd

kegg = pd.read_csv('full_kegg_pathway_list_with_id.csv', index_col=0)
kegg['pathway_id'] = kegg['pathway_id'].str.replace(':', '')
kegg_pathway_protein = kegg[(kegg['target_type'] == 'SYMBOL')]
kegg_pathway_protein = kegg_pathway_protein[['pathway_id', 'target']]
kegg_pathway_protein.drop_duplicates(inplace=True)
kegg_pathway_protein.reset_index(drop=True, inplace=True)
kegg_pathway_protein

Unnamed: 0,pathway_id,target
0,hsa00010,LDHAL6A
1,hsa00010,LDHA
2,hsa00010,LDHB
3,hsa00010,LDHC
4,hsa00010,PDHA1
...,...,...
24470,hsa05033,GABRQ
24471,hsa05033,CHRNA4
24472,hsa05033,CHRNA7
24473,hsa05033,CHRNB2


### BioMedGraphica ID

In [2]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_protein = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein' / 'BioMedGraphica_Protein.csv'
target_dir_pathway = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway' / 'BioMedGraphica_Pathway.csv'
biomedgraphica_protein = pd.read_csv(target_dir_protein, dtype=str)
biomedgraphica_pathway = pd.read_csv(target_dir_pathway, dtype=str)

### KEGG Mapping

In [3]:
gene_name_individualid = biomedgraphica_protein[['HGNC_Symbol', 'BioMedGraphica_ID']]
gene_name_individualid.dropna(subset=['HGNC_Symbol'], inplace=True)
gene_name_to_individualid = gene_name_individualid.assign(HGNC_Symbol=gene_name_individualid['HGNC_Symbol'].str.split(';')).explode('HGNC_Symbol')
gene_name_to_individualid = gene_name_to_individualid.groupby('HGNC_Symbol')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_pathway_protein['To_ID'] = kegg_pathway_protein['target'].map(gene_name_to_individualid)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_name_individualid.dropna(subset=['HGNC_Symbol'], inplace=True)


In [4]:
pathway_individualid = biomedgraphica_pathway[['KEGG_ID', 'BioMedGraphica_ID']]
pathway_individualid.dropna(subset=['KEGG_ID'], inplace=True)
pathway_to_individualid = pathway_individualid.assign(KEGG_ID=pathway_individualid['KEGG_ID'].str.split(';')).explode('KEGG_ID')
pathway_to_individualid = pathway_to_individualid.groupby('KEGG_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_pathway_protein['From_ID'] = kegg_pathway_protein['pathway_id'].map(pathway_to_individualid)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_individualid.dropna(subset=['KEGG_ID'], inplace=True)


In [5]:
pathway_protein = kegg_pathway_protein[['From_ID', 'To_ID']]
pathway_protein.dropna(subset=['From_ID'], inplace=True)
pathway_protein.dropna(subset=['To_ID'], inplace=True)
pathway_protein.reset_index(drop=True, inplace=True)
pathway_protein

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_protein.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_protein.dropna(subset=['To_ID'], inplace=True)


Unnamed: 0,From_ID,To_ID
0,BMG_PW0025;BMG_PW0637;BMG_PW0638,BMG_PT065496
1,BMG_PW0025;BMG_PW0637;BMG_PW0638,BMG_PT037560;BMG_PT129271;BMG_PT129340;BMG_PT1...
2,BMG_PW0025;BMG_PW0637;BMG_PW0638,BMG_PT038125;BMG_PT104532;BMG_PT109638;BMG_PT1...
3,BMG_PW0025;BMG_PW0637;BMG_PW0638,BMG_PT038181;BMG_PT104549;BMG_PT128792;BMG_PT1...
4,BMG_PW0025;BMG_PW0637;BMG_PW0638,BMG_PT038246;BMG_PT103745;BMG_PT112060;BMG_PT1...
...,...,...
24363,BMG_PW1028,BMG_PT098278
24364,BMG_PW1028,BMG_PT040826;BMG_PT123498;BMG_PT150608;BMG_PT1...
24365,BMG_PW1028,BMG_PT040536;BMG_PT151440;BMG_PT152407;BMG_PT1...
24366,BMG_PW1028,BMG_PT039380;BMG_PT152441;BMG_PT152989


In [6]:
pathway_protein['From_ID'] = pathway_protein['From_ID'].str.split(';')
pathway_protein['To_ID'] = pathway_protein['To_ID'].str.split(';')

pathway_protein = pathway_protein.explode('From_ID')
pathway_protein = pathway_protein.explode('To_ID')

pathway_protein.drop_duplicates(inplace=True)
pathway_protein.reset_index(drop=True, inplace=True)
pathway_protein

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_protein['From_ID'] = pathway_protein['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_protein['To_ID'] = pathway_protein['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PW0025,BMG_PT065496
1,BMG_PW0637,BMG_PT065496
2,BMG_PW0638,BMG_PT065496
3,BMG_PW0025,BMG_PT037560
4,BMG_PW0025,BMG_PT129271
...,...,...
176128,BMG_PW1028,BMG_PT039380
176129,BMG_PW1028,BMG_PT152441
176130,BMG_PW1028,BMG_PT152989
176131,BMG_PW1028,BMG_PT046311


### Pathway-Protein Relation

In [7]:
pathway_protein['Type'] = 'Pathway-Protein'
pathway_protein['Source'] = 'KEGG'

max_length = len(str(len(pathway_protein)))
pathway_protein['BioMedGraphica_ID'] = ['BMG_ED_PWPT' + str(i).zfill(max_length) for i in range(1, len(pathway_protein) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in pathway_protein.columns if col != 'BioMedGraphica_ID']  # re-order columns
pathway_protein = pathway_protein[columns]
pathway_protein

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Type,Source
0,BMG_ED_PWPT000001,BMG_PW0025,BMG_PT065496,Pathway-Protein,KEGG
1,BMG_ED_PWPT000002,BMG_PW0637,BMG_PT065496,Pathway-Protein,KEGG
2,BMG_ED_PWPT000003,BMG_PW0638,BMG_PT065496,Pathway-Protein,KEGG
3,BMG_ED_PWPT000004,BMG_PW0025,BMG_PT037560,Pathway-Protein,KEGG
4,BMG_ED_PWPT000005,BMG_PW0025,BMG_PT129271,Pathway-Protein,KEGG
...,...,...,...,...,...
176128,BMG_ED_PWPT176129,BMG_PW1028,BMG_PT039380,Pathway-Protein,KEGG
176129,BMG_ED_PWPT176130,BMG_PW1028,BMG_PT152441,Pathway-Protein,KEGG
176130,BMG_ED_PWPT176131,BMG_PW1028,BMG_PT152989,Pathway-Protein,KEGG
176131,BMG_ED_PWPT176132,BMG_PW1028,BMG_PT046311,Pathway-Protein,KEGG


In [8]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Pathway-Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_Protein.csv'
pathway_protein.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Pathway-Protein\BioMedGraphica_Pathway_Protein.csv
