### KEGG

In [2]:
# Download Link: R code
# Download Date: 2025-03-21
# Download Version: 2025-03-21
import pandas as pd

kegg = pd.read_csv('full_kegg_pathway_list_with_id.csv')
kegg['pathway_id'] = kegg['pathway_id'].str.replace(':', '')
kegg.head()

Unnamed: 0.1,Unnamed: 0,source_type,source,target_type,target,direction,edge_type,pathway_name,pathway_id
0,2,KEGGCOMP,C00022,SYMBOL,LDHAL6A,directed,Process,Glycolysis / Gluconeogenesis,hsa00010
1,3,KEGGCOMP,C00022,SYMBOL,LDHAL6A,undirected,Process,Glycolysis / Gluconeogenesis,hsa00010
2,4,KEGGCOMP,C00022,SYMBOL,LDHA,directed,Process,Glycolysis / Gluconeogenesis,hsa00010
3,5,KEGGCOMP,C00022,SYMBOL,LDHA,undirected,Process,Glycolysis / Gluconeogenesis,hsa00010
4,6,KEGGCOMP,C00022,SYMBOL,LDHB,directed,Process,Glycolysis / Gluconeogenesis,hsa00010


In [3]:
# Download Link: R code
# Download Date: 2025-03-21
# Download Version: 2025-03-21

kegg_compound = pd.read_csv('kegg_compound_data.csv')
kegg_compound['Name'] = kegg_compound['Name'].str.replace(';', '')

exploded_data = kegg_compound.assign(Other_DBS=kegg_compound['Other_DBS'].str.split(';')).explode('Other_DBS')
exploded_data['Other_DBS'] = exploded_data['Other_DBS'].str.strip()
exploded_data[['Database', 'ID']] = exploded_data['Other_DBS'].str.extract(r':(\w+):\s?(.*)')
cleaned_data = exploded_data.dropna(subset=['Database', 'ID'])
pivoted_data = cleaned_data.pivot(index=['KEGG_ID', 'Name'], columns='Database', values='ID').reset_index()

kegg_comp = pivoted_data[['KEGG_ID', 'CAS', 'PubChem']]
kegg_comp

Database,KEGG_ID,CAS,PubChem
0,C00001,7732-18-5,3303
1,C00002,56-65-5,3304
2,C00003,53-84-9,3305
3,C00004,58-68-4,3306
4,C00005,2646-71-1,3307
...,...,...,...
19287,C22958,,500141089
19288,C22959,,500141090
19289,C22960,,500141091
19290,C22961,,500141092


### BioMedGraphica ID

In [4]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_pathway = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway' / 'BioMedGraphica_Pathway.csv'
target_dir_drug = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Drug' / 'BioMedGraphica_Drug.csv'
biomedgraphica_pathway = pd.read_csv(target_dir_pathway, dtype=str)
biomedgraphica_drug = pd.read_csv(target_dir_drug, dtype=str)

### KEGG Mapping

In [5]:
kegg_drug_pathway = kegg[(kegg['source_type'] == 'KEGGCOMP')]
kegg_drug_pathway = kegg_drug_pathway[['source', 'pathway_id']]
kegg_drug_pathway.drop_duplicates(inplace=True)
kegg_drug_pathway.reset_index(drop=True, inplace=True)

kegg_drug_pathway = kegg_drug_pathway.merge(kegg_comp, left_on='source', right_on='KEGG_ID', how = 'left')
kegg_drug_pathway = kegg_drug_pathway.drop(columns=['KEGG_ID'])
kegg_drug_pathway

Unnamed: 0,source,pathway_id,CAS,PubChem
0,C00022,hsa00010,127-17-3,3324
1,C00024,hsa00010,72-89-9,3326
2,C00033,hsa00010,64-19-7,3335
3,C00036,hsa00010,328-42-7,3338
4,C00068,hsa00010,,3368
...,...,...,...,...
3917,C01327,hsa04974,7647-01-0,4538
3918,C00025,hsa05033,56-86-0,3327
3919,C00334,hsa05033,56-12-2,3628
3920,C00745,hsa05033,54-11-5,4007


In [6]:
kegg_drug_pathway = kegg_drug_pathway.assign(CAS=kegg_drug_pathway['CAS'].str.split(' ')).explode('CAS')
kegg_drug_pathway = kegg_drug_pathway.assign(PubChem=kegg_drug_pathway['PubChem'].str.split(' ')).explode('PubChem')
kegg_drug_pathway

Unnamed: 0,source,pathway_id,CAS,PubChem
0,C00022,hsa00010,127-17-3,3324
1,C00024,hsa00010,72-89-9,3326
2,C00033,hsa00010,64-19-7,3335
3,C00036,hsa00010,328-42-7,3338
4,C00068,hsa00010,,3368
...,...,...,...,...
3917,C01327,hsa04974,7647-01-0,4538
3918,C00025,hsa05033,56-86-0,3327
3919,C00334,hsa05033,56-12-2,3628
3920,C00745,hsa05033,54-11-5,4007


SID Mapping

In [7]:
SID_individualid = biomedgraphica_drug[['PubChem_SID', 'BioMedGraphica_ID']]
SID_individualid = SID_individualid.dropna(subset=['PubChem_SID'])
SID_individualid = SID_individualid.assign(PubChem_SID=SID_individualid['PubChem_SID'].str.split(';')).explode('PubChem_SID')
SID_individualid = SID_individualid.groupby('PubChem_SID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_drug_pathway['Drug_SID'] = kegg_drug_pathway['PubChem'].map(SID_individualid)

CAS RN

In [8]:
cas_individualid = biomedgraphica_drug[['CAS_RN', 'BioMedGraphica_ID']]
cas_individualid = cas_individualid.dropna(subset=['CAS_RN'])
cas_individualid = cas_individualid.assign(CAS_RN=cas_individualid['CAS_RN'].str.split(';')).explode('CAS_RN')
cas_individualid = cas_individualid.groupby('CAS_RN')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_drug_pathway['Drug_CAS'] = kegg_drug_pathway['CAS'].map(cas_individualid)

In [9]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

kegg_drug_pathway = merge_string_columns(kegg_drug_pathway, ['Drug_SID', 'Drug_CAS'], 'From_ID')
kegg_drug_pathway.replace('', pd.NA, inplace=True)
kegg_drug_pathway

Unnamed: 0,source,pathway_id,CAS,PubChem,From_ID
0,C00022,hsa00010,127-17-3,3324,BMG_DG006378
1,C00024,hsa00010,72-89-9,3326,BMG_DG145662
2,C00033,hsa00010,64-19-7,3335,BMG_DG109862
3,C00036,hsa00010,328-42-7,3338,BMG_DG215725
4,C00068,hsa00010,,3368,
...,...,...,...,...,...
3917,C01327,hsa04974,7647-01-0,4538,BMG_DG136174
3918,C00025,hsa05033,56-86-0,3327,BMG_DG137202
3919,C00334,hsa05033,56-12-2,3628,BMG_DG015910
3920,C00745,hsa05033,54-11-5,4007,BMG_DG206831


In [10]:
pathway_individualid = biomedgraphica_pathway[['KEGG_ID', 'BioMedGraphica_ID']]
pathway_individualid = pathway_individualid.dropna(subset=['KEGG_ID'])
pathway_individualid = pathway_individualid.assign(KEGG_ID=pathway_individualid['KEGG_ID'].str.split(';')).explode('KEGG_ID')
pathway_individualid = pathway_individualid.groupby('KEGG_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

kegg_drug_pathway['To_ID'] = kegg_drug_pathway['pathway_id'].map(pathway_individualid)
kegg_drug_pathway

Unnamed: 0,source,pathway_id,CAS,PubChem,From_ID,To_ID
0,C00022,hsa00010,127-17-3,3324,BMG_DG006378,BMG_PW0025;BMG_PW0637;BMG_PW0638
1,C00024,hsa00010,72-89-9,3326,BMG_DG145662,BMG_PW0025;BMG_PW0637;BMG_PW0638
2,C00033,hsa00010,64-19-7,3335,BMG_DG109862,BMG_PW0025;BMG_PW0637;BMG_PW0638
3,C00036,hsa00010,328-42-7,3338,BMG_DG215725,BMG_PW0025;BMG_PW0637;BMG_PW0638
4,C00068,hsa00010,,3368,,BMG_PW0025;BMG_PW0637;BMG_PW0638
...,...,...,...,...,...,...
3917,C01327,hsa04974,7647-01-0,4538,BMG_DG136174,BMG_PW6739
3918,C00025,hsa05033,56-86-0,3327,BMG_DG137202,BMG_PW1028
3919,C00334,hsa05033,56-12-2,3628,BMG_DG015910,BMG_PW1028
3920,C00745,hsa05033,54-11-5,4007,BMG_DG206831,BMG_PW1028


In [11]:
drug_pathway = kegg_drug_pathway[['From_ID', 'To_ID']]
drug_pathway = drug_pathway.applymap(lambda x: x.strip() if isinstance(x, str) else x)
drug_pathway.dropna(subset=['To_ID'], inplace=True)
drug_pathway.dropna(subset=['From_ID'], inplace=True)
drug_pathway.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2380 entries, 0 to 3921
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   From_ID  2380 non-null   object
 1   To_ID    2380 non-null   object
dtypes: object(2)
memory usage: 55.8+ KB


  drug_pathway = drug_pathway.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [12]:
drug_pathway['From_ID'] = drug_pathway['From_ID'].str.split(';')
drug_pathway = drug_pathway.explode('From_ID')

drug_pathway['To_ID'] = drug_pathway['To_ID'].str.split(';')
drug_pathway = drug_pathway.explode('To_ID')

drug_pathway.drop_duplicates(inplace=True)
drug_pathway.reset_index(drop=True, inplace=True)
drug_pathway

Unnamed: 0,From_ID,To_ID
0,BMG_DG006378,BMG_PW0025
1,BMG_DG006378,BMG_PW0637
2,BMG_DG006378,BMG_PW0638
3,BMG_DG145662,BMG_PW0025
4,BMG_DG145662,BMG_PW0637
...,...,...
3060,BMG_DG136174,BMG_PW6739
3061,BMG_DG137202,BMG_PW1028
3062,BMG_DG015910,BMG_PW1028
3063,BMG_DG206831,BMG_PW1028


### Drug-Pathway Relation

In [13]:
drug_pathway['Type'] = 'Drug-Pathway'
drug_pathway['Source'] = 'KEGG'

max_length = len(str(len(drug_pathway)))
drug_pathway['BioMedGraphica_ID'] = ['BMG_ED_DGPW' + str(i).zfill(max_length) for i in range(1, len(drug_pathway) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in drug_pathway.columns if col != 'BioMedGraphica_ID']  # re-order columns
drug_pathway = drug_pathway[columns]
drug_pathway

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Type,Source
0,BMG_ED_DGPW0001,BMG_DG006378,BMG_PW0025,Drug-Pathway,KEGG
1,BMG_ED_DGPW0002,BMG_DG006378,BMG_PW0637,Drug-Pathway,KEGG
2,BMG_ED_DGPW0003,BMG_DG006378,BMG_PW0638,Drug-Pathway,KEGG
3,BMG_ED_DGPW0004,BMG_DG145662,BMG_PW0025,Drug-Pathway,KEGG
4,BMG_ED_DGPW0005,BMG_DG145662,BMG_PW0637,Drug-Pathway,KEGG
...,...,...,...,...,...
3060,BMG_ED_DGPW3061,BMG_DG136174,BMG_PW6739,Drug-Pathway,KEGG
3061,BMG_ED_DGPW3062,BMG_DG137202,BMG_PW1028,Drug-Pathway,KEGG
3062,BMG_ED_DGPW3063,BMG_DG015910,BMG_PW1028,Drug-Pathway,KEGG
3063,BMG_ED_DGPW3064,BMG_DG206831,BMG_PW1028,Drug-Pathway,KEGG


In [14]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Drug-Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Drug_Pathway.csv'
drug_pathway.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Drug-Pathway\BioMedGraphica_Drug_Pathway.csv
