### KEGG

In [1]:
# Download Link: R code
# Download Date: 2025-03-21
# Download Version: 2025-03-21
import pandas as pd

kegg = pd.read_csv('full_kegg_pathway_list_with_id.csv')
kegg['pathway_id'] = kegg['pathway_id'].str.replace(':', '')
kegg.head()

Unnamed: 0.1,Unnamed: 0,source_type,source,target_type,target,direction,edge_type,pathway_name,pathway_id
0,2,KEGGCOMP,C00022,SYMBOL,LDHAL6A,directed,Process,Glycolysis / Gluconeogenesis,hsa00010
1,3,KEGGCOMP,C00022,SYMBOL,LDHAL6A,undirected,Process,Glycolysis / Gluconeogenesis,hsa00010
2,4,KEGGCOMP,C00022,SYMBOL,LDHA,directed,Process,Glycolysis / Gluconeogenesis,hsa00010
3,5,KEGGCOMP,C00022,SYMBOL,LDHA,undirected,Process,Glycolysis / Gluconeogenesis,hsa00010
4,6,KEGGCOMP,C00022,SYMBOL,LDHB,directed,Process,Glycolysis / Gluconeogenesis,hsa00010


In [2]:
# Download Link: R code
# Download Date: 2025-03-21
# Download Version: 2025-03-21

kegg_compound = pd.read_csv('kegg_compound_data.csv')
kegg_compound['Name'] = kegg_compound['Name'].str.replace(';', '')

exploded_data = kegg_compound.assign(Other_DBS=kegg_compound['Other_DBS'].str.split(';')).explode('Other_DBS')
exploded_data['Other_DBS'] = exploded_data['Other_DBS'].str.strip()
exploded_data[['Database', 'ID']] = exploded_data['Other_DBS'].str.extract(r':(\w+):\s?(.*)')
cleaned_data = exploded_data.dropna(subset=['Database', 'ID'])
pivoted_data = cleaned_data.pivot(index=['KEGG_ID', 'Name'], columns='Database', values='ID').reset_index()

kegg_comp = pivoted_data[['KEGG_ID', 'CAS', 'PubChem']]
kegg_comp

Database,KEGG_ID,CAS,PubChem
0,C00001,7732-18-5,3303
1,C00002,56-65-5,3304
2,C00003,53-84-9,3305
3,C00004,58-68-4,3306
4,C00005,2646-71-1,3307
...,...,...,...
19287,C22958,,500141089
19288,C22959,,500141090
19289,C22960,,500141091
19290,C22961,,500141092


In [3]:
kegg_pathway_drug = kegg[(kegg['target_type'] == 'KEGGCOMP')]
kegg_pathway_drug = kegg_pathway_drug[['pathway_id', 'target']]
kegg_pathway_drug.drop_duplicates(inplace=True)
kegg_pathway_drug.reset_index(drop=True, inplace=True)

kegg_pathway_drug = kegg_pathway_drug.merge(kegg_comp, left_on='target', right_on='KEGG_ID', how='left')
kegg_pathway_drug = kegg_pathway_drug.drop(columns=['KEGG_ID'])
kegg_pathway_drug

Unnamed: 0,pathway_id,target,CAS,PubChem
0,hsa00010,C00221,492-61-5,3521
1,hsa00010,C00267,492-62-6,3565
2,hsa00010,C00022,127-17-3,3324
3,hsa00010,C15973,,47205286
4,hsa00010,C00631,,3904
...,...,...,...,...
2329,hsa05418,C00533,10102-43-9,3815
2330,hsa00290,C00109,600-18-0,3409
2331,hsa00290,C00183,72-18-4,3483
2332,hsa00290,C00407,73-32-5,3697


### BioMedGraphica ID

In [4]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_pathway = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway' / 'BioMedGraphica_Pathway.csv'
target_dir_drug = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Drug' / 'BioMedGraphica_Drug.csv'
biomedgraphica_pathway = pd.read_csv(target_dir_pathway, dtype=str)
biomedgraphica_drug = pd.read_csv(target_dir_drug, dtype=str)

### KEGG Mapping

In [5]:
kegg_pathway_drug = kegg[(kegg['target_type'] == 'KEGGCOMP')]
kegg_pathway_drug = kegg_pathway_drug[['pathway_id', 'target']]
kegg_pathway_drug.drop_duplicates(inplace=True)
kegg_pathway_drug.reset_index(drop=True, inplace=True)

kegg_pathway_drug = kegg_pathway_drug.merge(kegg_comp, left_on='target', right_on='KEGG_ID', how='left')
kegg_pathway_drug = kegg_pathway_drug.drop(columns=['KEGG_ID'])
kegg_pathway_drug

Unnamed: 0,pathway_id,target,CAS,PubChem
0,hsa00010,C00221,492-61-5,3521
1,hsa00010,C00267,492-62-6,3565
2,hsa00010,C00022,127-17-3,3324
3,hsa00010,C15973,,47205286
4,hsa00010,C00631,,3904
...,...,...,...,...
2329,hsa05418,C00533,10102-43-9,3815
2330,hsa00290,C00109,600-18-0,3409
2331,hsa00290,C00183,72-18-4,3483
2332,hsa00290,C00407,73-32-5,3697


In [6]:
kegg_pathway_drug = kegg_pathway_drug.assign(PubChem=kegg_pathway_drug['PubChem'].str.split(' ')).explode('PubChem')
kegg_pathway_drug = kegg_pathway_drug.assign(CAS=kegg_pathway_drug['CAS'].str.split(' ')).explode('CAS')
kegg_pathway_drug

Unnamed: 0,pathway_id,target,CAS,PubChem
0,hsa00010,C00221,492-61-5,3521
1,hsa00010,C00267,492-62-6,3565
2,hsa00010,C00022,127-17-3,3324
3,hsa00010,C15973,,47205286
4,hsa00010,C00631,,3904
...,...,...,...,...
2329,hsa05418,C00533,10102-43-9,3815
2330,hsa00290,C00109,600-18-0,3409
2331,hsa00290,C00183,72-18-4,3483
2332,hsa00290,C00407,73-32-5,3697


SID

In [7]:
SID_individualid = biomedgraphica_drug[['PubChem_SID', 'BioMedGraphica_ID']]
SID_individualid = SID_individualid.dropna(subset=['PubChem_SID'])
SID_individualid = SID_individualid.assign(PubChem_SID=SID_individualid['PubChem_SID'].str.split(';')).explode('PubChem_SID')
SID_individualid = SID_individualid.groupby('PubChem_SID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_pathway_drug['Drug_SID'] = kegg_pathway_drug['PubChem'].map(SID_individualid)

CAS RN

In [8]:
cas_individualid = biomedgraphica_drug[['CAS_RN', 'BioMedGraphica_ID']]
cas_individualid = cas_individualid.dropna(subset=['CAS_RN'])
cas_individualid = cas_individualid.assign(CAS_RN=cas_individualid['CAS_RN'].str.split(';')).explode('CAS_RN')
cas_individualid = cas_individualid.groupby('CAS_RN')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_pathway_drug['Drug_CAS'] = kegg_pathway_drug['CAS'].map(cas_individualid)

In [9]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

kegg_pathway_drug = merge_string_columns(kegg_pathway_drug, ['Drug_SID', 'Drug_CAS'], 'To_ID')
kegg_pathway_drug.replace('', pd.NA, inplace=True)
kegg_pathway_drug

Unnamed: 0,pathway_id,target,CAS,PubChem,To_ID
0,hsa00010,C00221,492-61-5,3521,BMG_DG176842
1,hsa00010,C00267,492-62-6,3565,BMG_DG199529
2,hsa00010,C00022,127-17-3,3324,BMG_DG006378
3,hsa00010,C15973,,47205286,
4,hsa00010,C00631,,3904,
...,...,...,...,...,...
2329,hsa05418,C00533,10102-43-9,3815,BMG_DG090250
2330,hsa00290,C00109,600-18-0,3409,BMG_DG169584
2331,hsa00290,C00183,72-18-4,3483,BMG_DG173075
2332,hsa00290,C00407,73-32-5,3697,BMG_DG173223


In [10]:
pathway_individualid = biomedgraphica_pathway[['KEGG_ID', 'BioMedGraphica_ID']]
pathway_individualid = pathway_individualid.dropna(subset=['KEGG_ID'])
pathway_individualid = pathway_individualid.assign(KEGG_ID=pathway_individualid['KEGG_ID'].str.split(';')).explode('KEGG_ID')
pathway_individualid = pathway_individualid.groupby('KEGG_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_pathway_drug['From_ID'] = kegg_pathway_drug['pathway_id'].map(pathway_individualid)

In [11]:
pathway_drug = kegg_pathway_drug[['From_ID', 'To_ID']]
pathway_drug.dropna(subset=['To_ID'], inplace=True)
pathway_drug.dropna(subset=['From_ID'], inplace=True)
pathway_drug.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1331 entries, 0 to 2333
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   From_ID  1331 non-null   object
 1   To_ID    1331 non-null   object
dtypes: object(2)
memory usage: 31.2+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_drug.dropna(subset=['To_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_drug.dropna(subset=['From_ID'], inplace=True)


In [12]:
pathway_drug['From_ID'] = pathway_drug['From_ID'].str.split(';') 
pathway_drug = pathway_drug.explode('From_ID')

pathway_drug['To_ID'] = pathway_drug['To_ID'].str.split(';')
pathway_drug = pathway_drug.explode('To_ID')

pathway_drug.drop_duplicates(inplace=True)
pathway_drug.reset_index(drop=True, inplace=True)
pathway_drug

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathway_drug['From_ID'] = pathway_drug['From_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PW0025,BMG_DG176842
1,BMG_PW0637,BMG_DG176842
2,BMG_PW0638,BMG_DG176842
3,BMG_PW0025,BMG_DG199529
4,BMG_PW0637,BMG_DG199529
...,...,...
1790,BMG_PW6793,BMG_DG090250
1791,BMG_PW6616,BMG_DG169584
1792,BMG_PW6616,BMG_DG173075
1793,BMG_PW6616,BMG_DG173223


### Pathway-Drug Relation

In [13]:
pathway_drug['Type'] = 'Pathway-Drug'
pathway_drug['Source'] = 'KEGG'

max_length = len(str(len(pathway_drug)))
pathway_drug['BioMedGraphica_ID'] = ['BMG_ED_PWDG' + str(i).zfill(max_length) for i in range(1, len(pathway_drug) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in pathway_drug.columns if col != 'BioMedGraphica_ID']  # re-order columns
pathway_drug = pathway_drug[columns]
pathway_drug

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Type,Source
0,BMG_ED_PWDG0001,BMG_PW0025,BMG_DG176842,Pathway-Drug,KEGG
1,BMG_ED_PWDG0002,BMG_PW0637,BMG_DG176842,Pathway-Drug,KEGG
2,BMG_ED_PWDG0003,BMG_PW0638,BMG_DG176842,Pathway-Drug,KEGG
3,BMG_ED_PWDG0004,BMG_PW0025,BMG_DG199529,Pathway-Drug,KEGG
4,BMG_ED_PWDG0005,BMG_PW0637,BMG_DG199529,Pathway-Drug,KEGG
...,...,...,...,...,...
1790,BMG_ED_PWDG1791,BMG_PW6793,BMG_DG090250,Pathway-Drug,KEGG
1791,BMG_ED_PWDG1792,BMG_PW6616,BMG_DG169584,Pathway-Drug,KEGG
1792,BMG_ED_PWDG1793,BMG_PW6616,BMG_DG173075,Pathway-Drug,KEGG
1793,BMG_ED_PWDG1794,BMG_PW6616,BMG_DG173223,Pathway-Drug,KEGG


In [14]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Pathway-Drug'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_Drug.csv'
pathway_drug.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Pathway-Drug\BioMedGraphica_Pathway_Drug.csv
