### KEGG

In [1]:
# Download Link: R code
# Download Date: 2025-03-21
# Download Version: 2025-03-21
import pandas as pd

kegg = pd.read_csv('full_kegg_pathway_list_with_id.csv')
kegg_protein_pathway = kegg[(kegg['source_type'] == 'SYMBOL')]
kegg_protein_pathway = kegg_protein_pathway[['source', 'pathway_id']]
kegg_protein_pathway['pathway_id'] = kegg_protein_pathway['pathway_id'].str.replace(':', '')
kegg_protein_pathway.drop_duplicates(inplace=True)
kegg_protein_pathway.reset_index(drop=True, inplace=True)
kegg_protein_pathway

Unnamed: 0,source,pathway_id
0,GALM,hsa00010
1,LDHAL6A,hsa00010
2,DLAT,hsa00010
3,ENO1,hsa00010
4,ENO2,hsa00010
...,...,...
21046,GCK,hsa00524
21047,HK1,hsa00524
21048,HK2,hsa00524
21049,HK3,hsa00524


### BioMedGraphica ID

In [2]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_protein = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein' / 'BioMedGraphica_Protein.csv'
target_dir_pathway = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway' / 'BioMedGraphica_Pathway.csv'
biomedgraphica_protein = pd.read_csv(target_dir_protein, dtype=str)
biomedgraphica_pathway = pd.read_csv(target_dir_pathway, dtype=str)

### KEGG Mapping

In [3]:
gene_name_individualid = biomedgraphica_protein[['HGNC_Symbol', 'BioMedGraphica_ID']]
gene_name_individualid.dropna(subset=['HGNC_Symbol'], inplace=True)
gene_name_to_individualid = gene_name_individualid.assign(HGNC_Symbol=gene_name_individualid['HGNC_Symbol'].str.split(';')).explode('HGNC_Symbol')
gene_name_to_individualid = gene_name_to_individualid.groupby('HGNC_Symbol')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_protein_pathway['From_ID'] = kegg_protein_pathway['source'].map(gene_name_to_individualid)
kegg_protein_pathway

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_name_individualid.dropna(subset=['HGNC_Symbol'], inplace=True)


Unnamed: 0,source,pathway_id,From_ID
0,GALM,hsa00010,BMG_PT080047;BMG_PT105461;BMG_PT110190;BMG_PT1...
1,LDHAL6A,hsa00010,BMG_PT065496
2,DLAT,hsa00010,BMG_PT038822;BMG_PT104231;BMG_PT126037;BMG_PT1...
3,ENO1,hsa00010,BMG_PT038099;BMG_PT141847;BMG_PT143177;BMG_PT1...
4,ENO2,hsa00010,BMG_PT038310;BMG_PT128759;BMG_PT130715;BMG_PT1...
...,...,...,...
21046,GCK,hsa00524,BMG_PT040472;BMG_PT101944;BMG_PT104502;BMG_PT1...
21047,HK1,hsa00524,BMG_PT039465;BMG_PT109739;BMG_PT113965;BMG_PT1...
21048,HK2,hsa00524,BMG_PT041365;BMG_PT105740
21049,HK3,hsa00524,BMG_PT041366;BMG_PT119590;BMG_PT120813;BMG_PT1...


In [4]:
pathway_individualid = biomedgraphica_pathway[['KEGG_ID', 'BioMedGraphica_ID']]
pathway_individualid.dropna(subset=['KEGG_ID'], inplace=True)
pathway_to_individualid = pathway_individualid.assign(KEGG_ID=pathway_individualid['KEGG_ID'].str.split(';')).explode('KEGG_ID')
pathway_to_individualid = pathway_to_individualid.groupby('KEGG_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_protein_pathway['To_ID'] = kegg_protein_pathway['pathway_id'].map(pathway_to_individualid)
kegg_protein_pathway

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_individualid.dropna(subset=['KEGG_ID'], inplace=True)


Unnamed: 0,source,pathway_id,From_ID,To_ID
0,GALM,hsa00010,BMG_PT080047;BMG_PT105461;BMG_PT110190;BMG_PT1...,BMG_PW0025;BMG_PW0637;BMG_PW0638
1,LDHAL6A,hsa00010,BMG_PT065496,BMG_PW0025;BMG_PW0637;BMG_PW0638
2,DLAT,hsa00010,BMG_PT038822;BMG_PT104231;BMG_PT126037;BMG_PT1...,BMG_PW0025;BMG_PW0637;BMG_PW0638
3,ENO1,hsa00010,BMG_PT038099;BMG_PT141847;BMG_PT143177;BMG_PT1...,BMG_PW0025;BMG_PW0637;BMG_PW0638
4,ENO2,hsa00010,BMG_PT038310;BMG_PT128759;BMG_PT130715;BMG_PT1...,BMG_PW0025;BMG_PW0637;BMG_PW0638
...,...,...,...,...
21046,GCK,hsa00524,BMG_PT040472;BMG_PT101944;BMG_PT104502;BMG_PT1...,BMG_PW6625
21047,HK1,hsa00524,BMG_PT039465;BMG_PT109739;BMG_PT113965;BMG_PT1...,BMG_PW6625
21048,HK2,hsa00524,BMG_PT041365;BMG_PT105740,BMG_PW6625
21049,HK3,hsa00524,BMG_PT041366;BMG_PT119590;BMG_PT120813;BMG_PT1...,BMG_PW6625


In [5]:
protein_pathway = kegg_protein_pathway[['From_ID', 'To_ID']]
protein_pathway.dropna(subset=['From_ID'], inplace=True)
protein_pathway.dropna(subset=['To_ID'], inplace=True)
protein_pathway.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20867 entries, 0 to 21050
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   From_ID  20867 non-null  object
 1   To_ID    20867 non-null  object
dtypes: object(2)
memory usage: 489.1+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_pathway.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_pathway.dropna(subset=['To_ID'], inplace=True)


In [6]:
protein_pathway['From_ID'] = protein_pathway['From_ID'].str.split(';')
protein_pathway['To_ID'] = protein_pathway['To_ID'].str.split(';')

protein_pathway = protein_pathway.explode('From_ID')
protein_pathway = protein_pathway.explode('To_ID')

protein_pathway.drop_duplicates(inplace=True)
protein_pathway.reset_index(drop=True, inplace=True)
protein_pathway

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_pathway['From_ID'] = protein_pathway['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_pathway['To_ID'] = protein_pathway['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PT080047,BMG_PW0025
1,BMG_PT080047,BMG_PW0637
2,BMG_PT080047,BMG_PW0638
3,BMG_PT105461,BMG_PW0025
4,BMG_PT105461,BMG_PW0637
...,...,...
152907,BMG_PT041366,BMG_PW6625
152908,BMG_PT119590,BMG_PW6625
152909,BMG_PT120813,BMG_PW6625
152910,BMG_PT121971,BMG_PW6625


### Protein-Pathway Relation

In [7]:
protein_pathway['Type'] = 'Protein-Pathway'
protein_pathway['Source'] = 'KEGG'

max_length = len(str(len(protein_pathway)))
protein_pathway['BioMedGraphica_ID'] = ['BMG_ED_PTPW' + str(i).zfill(max_length) for i in range(1, len(protein_pathway) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in protein_pathway.columns if col != 'BioMedGraphica_ID']  # re-order columns
protein_pathway = protein_pathway[columns]
protein_pathway

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Type,Source
0,BMG_ED_PTPW000001,BMG_PT080047,BMG_PW0025,Protein-Pathway,KEGG
1,BMG_ED_PTPW000002,BMG_PT080047,BMG_PW0637,Protein-Pathway,KEGG
2,BMG_ED_PTPW000003,BMG_PT080047,BMG_PW0638,Protein-Pathway,KEGG
3,BMG_ED_PTPW000004,BMG_PT105461,BMG_PW0025,Protein-Pathway,KEGG
4,BMG_ED_PTPW000005,BMG_PT105461,BMG_PW0637,Protein-Pathway,KEGG
...,...,...,...,...,...
152907,BMG_ED_PTPW152908,BMG_PT041366,BMG_PW6625,Protein-Pathway,KEGG
152908,BMG_ED_PTPW152909,BMG_PT119590,BMG_PW6625,Protein-Pathway,KEGG
152909,BMG_ED_PTPW152910,BMG_PT120813,BMG_PW6625,Protein-Pathway,KEGG
152910,BMG_ED_PTPW152911,BMG_PT121971,BMG_PW6625,Protein-Pathway,KEGG


In [8]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Protein-Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_Pathway.csv'
protein_pathway.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Protein-Pathway\BioMedGraphica_Protein_Pathway.csv
