### CTD

In [2]:
# Download Link: https://ctdbase.org/reports/CTD_chem_pathways_enriched.csv.gz
# Download Date: 2025-03-21
# Download Version: 2025-03

import pandas as pd
from io import StringIO
# Read the lines from the file
with open('CTD_chem_pathways_enriched.csv', 'r') as f:
    lines = f.readlines()

# Extract column names
columns = []
for line in lines:
    if line.startswith('# Fields'):
        # Get the following line which contains column names
        columns = lines[lines.index(line) + 1].strip()[2:].split(',')
        break

# Filter out comment lines
data_lines = [line for line in lines if not line.startswith('#')]

# Create a DataFrame from the filtered lines
data_str = '\n'.join(data_lines)
ctd = pd.read_csv(StringIO(data_str), names=columns)
ctd

Unnamed: 0,ChemicalName,ChemicalID,CasRN,PathwayName,PathwayID,PValue,CorrectedPValue,TargetMatchQty,TargetTotalQty,BackgroundMatchQty,BackgroundTotalQty
0,10074-G5,C534883,,Cyclin A:Cdk2-associated events at S phase entry,REACT:R-HSA-69656,1.440000e-05,1.150000e-03,2,4,71,45539
1,10074-G5,C534883,,Cyclin E associated events during G1/S transition,REACT:R-HSA-69202,1.480000e-05,1.180000e-03,2,4,72,45539
2,10074-G5,C534883,,G1/S Transition,REACT:R-HSA-69206,4.050000e-05,3.240000e-03,2,4,119,45539
3,10074-G5,C534883,,Mitotic G1-G1/S phases,REACT:R-HSA-453279,5.850000e-05,4.680000e-03,2,4,143,45539
4,10074-G5,C534883,,Pathways in cancer,KEGG:hsa05200,2.570000e-06,2.060000e-04,3,4,395,45539
...,...,...,...,...,...,...,...,...,...,...,...
1624465,Zymosan,D015054,9010-72-4,Viral carcinogenesis,KEGG:hsa05203,7.590000e-13,6.650000e-10,13,168,201,45539
1624466,Zymosan,D015054,9010-72-4,Viral myocarditis,KEGG:hsa05416,2.520000e-06,2.210000e-03,5,168,58,45539
1624467,Zymosan,D015054,9010-72-4,VLDLR internalisation and degradation,REACT:R-HSA-8866427,1.060000e-05,9.270000e-03,3,168,12,45539
1624468,Zymosan,D015054,9010-72-4,ZBP1(DAI) mediated induction of type I IFNs,REACT:R-HSA-1606322,3.850000e-10,3.370000e-07,6,168,25,45539


In [4]:
ctd = ctd[['ChemicalID', 'CasRN', 'PathwayID']]
ctd['Database'] = ctd['PathwayID'].str.split(':').str[0]
ctd['PathwayID'] = ctd['PathwayID'].str.split(':').str[1]
ctd = ctd.drop_duplicates()
ctd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctd['Database'] = ctd['PathwayID'].str.split(':').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctd['PathwayID'] = ctd['PathwayID'].str.split(':').str[1]


Unnamed: 0,ChemicalID,CasRN,PathwayID,Database
0,C534883,,R-HSA-69656,REACT
1,C534883,,R-HSA-69202,REACT
2,C534883,,R-HSA-69206,REACT
3,C534883,,R-HSA-453279,REACT
4,C534883,,hsa05200,KEGG
...,...,...,...,...
1624465,D015054,9010-72-4,hsa05203,KEGG
1624466,D015054,9010-72-4,hsa05416,KEGG
1624467,D015054,9010-72-4,R-HSA-8866427,REACT
1624468,D015054,9010-72-4,R-HSA-1606322,REACT


### BioMedGraphica ID

In [3]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_exposure = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Exposure' / 'BioMedGraphica_Exposure.csv'
target_dir_pathway = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway' / 'BioMedGraphica_Pathway.csv'
biomedgraphica_exposure = pd.read_csv(target_dir_exposure, dtype=str)
biomedgraphica_pathway = pd.read_csv(target_dir_pathway, dtype=str)

### CTD Mapping

In [5]:
mesh_individual = biomedgraphica_exposure[['MeSH_ID', 'BioMedGraphica_ID']]
mesh_individual.dropna(subset=['MeSH_ID'], inplace=True)
mesh_individual = mesh_individual.assign(MeSH_ID=mesh_individual['MeSH_ID'].str.split(';')).explode('MeSH_ID')

mesh_to_individual = mesh_individual.groupby('MeSH_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd['From_mesh'] = ctd['ChemicalID'].map(mesh_to_individual)
ctd

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mesh_individual.dropna(subset=['MeSH_ID'], inplace=True)


Unnamed: 0,ChemicalID,CasRN,PathwayID,Database,From_mesh
0,C534883,,R-HSA-69656,REACT,
1,C534883,,R-HSA-69202,REACT,
2,C534883,,R-HSA-69206,REACT,
3,C534883,,R-HSA-453279,REACT,
4,C534883,,hsa05200,KEGG,
...,...,...,...,...,...
1624465,D015054,9010-72-4,hsa05203,KEGG,
1624466,D015054,9010-72-4,hsa05416,KEGG,
1624467,D015054,9010-72-4,R-HSA-8866427,REACT,
1624468,D015054,9010-72-4,R-HSA-1606322,REACT,


In [6]:
cas_individual = biomedgraphica_exposure[['CAS_RN', 'BioMedGraphica_ID']]
cas_individual.dropna(subset=['CAS_RN'], inplace=True)
cas_individual = cas_individual.assign(CAS_RN=cas_individual['CAS_RN'].str.split(';')).explode('CAS_RN')

cas_to_individual = cas_individual.groupby('CAS_RN')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd['From_cas'] = ctd['CasRN'].map(cas_to_individual)
ctd

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cas_individual.dropna(subset=['CAS_RN'], inplace=True)


Unnamed: 0,ChemicalID,CasRN,PathwayID,Database,From_mesh,From_cas
0,C534883,,R-HSA-69656,REACT,,
1,C534883,,R-HSA-69202,REACT,,
2,C534883,,R-HSA-69206,REACT,,
3,C534883,,R-HSA-453279,REACT,,
4,C534883,,hsa05200,KEGG,,
...,...,...,...,...,...,...
1624465,D015054,9010-72-4,hsa05203,KEGG,,
1624466,D015054,9010-72-4,hsa05416,KEGG,,
1624467,D015054,9010-72-4,R-HSA-8866427,REACT,,
1624468,D015054,9010-72-4,R-HSA-1606322,REACT,,


In [7]:
kegg_individual = biomedgraphica_pathway[['KEGG_ID', 'BioMedGraphica_ID']]
kegg_individual.dropna(subset=['KEGG_ID'], inplace=True)
kegg_individual = kegg_individual.assign(KEGG_ID=kegg_individual['KEGG_ID'].str.split(';')).explode('KEGG_ID')

kegg_to_individual = kegg_individual.groupby('KEGG_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd['To_kegg'] = ctd['PathwayID'].map(kegg_to_individual)
ctd

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kegg_individual.dropna(subset=['KEGG_ID'], inplace=True)


Unnamed: 0,ChemicalID,CasRN,PathwayID,Database,From_mesh,From_cas,To_kegg
0,C534883,,R-HSA-69656,REACT,,,
1,C534883,,R-HSA-69202,REACT,,,
2,C534883,,R-HSA-69206,REACT,,,
3,C534883,,R-HSA-453279,REACT,,,
4,C534883,,hsa05200,KEGG,,,BMG_PW6766
...,...,...,...,...,...,...,...
1624465,D015054,9010-72-4,hsa05203,KEGG,,,BMG_PW6768
1624466,D015054,9010-72-4,hsa05416,KEGG,,,BMG_PW1032
1624467,D015054,9010-72-4,R-HSA-8866427,REACT,,,
1624468,D015054,9010-72-4,R-HSA-1606322,REACT,,,


In [8]:
reactome_individual = biomedgraphica_pathway[['Reactome_ID', 'BioMedGraphica_ID']]
reactome_individual.dropna(subset=['Reactome_ID'], inplace=True)
reactome_individual = reactome_individual.assign(Reactome_ID=reactome_individual['Reactome_ID'].str.split(';')).explode('Reactome_ID')

reactome_to_individual = reactome_individual.groupby('Reactome_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd['To_reactome'] = ctd['PathwayID'].map(reactome_to_individual)
ctd

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reactome_individual.dropna(subset=['Reactome_ID'], inplace=True)


Unnamed: 0,ChemicalID,CasRN,PathwayID,Database,From_mesh,From_cas,To_kegg,To_reactome
0,C534883,,R-HSA-69656,REACT,,,,BMG_PW4215
1,C534883,,R-HSA-69202,REACT,,,,BMG_PW4204
2,C534883,,R-HSA-69206,REACT,,,,BMG_PW0087
3,C534883,,R-HSA-453279,REACT,,,,BMG_PW3689
4,C534883,,hsa05200,KEGG,,,BMG_PW6766,
...,...,...,...,...,...,...,...,...
1624465,D015054,9010-72-4,hsa05203,KEGG,,,BMG_PW6768,
1624466,D015054,9010-72-4,hsa05416,KEGG,,,BMG_PW1032,
1624467,D015054,9010-72-4,R-HSA-8866427,REACT,,,,BMG_PW4348
1624468,D015054,9010-72-4,R-HSA-1606322,REACT,,,,BMG_PW2881


In [9]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

ctd = ctd[['From_mesh', 'From_cas', 'To_kegg', 'To_reactome']]
ctd = merge_string_columns(ctd, ['From_mesh', 'From_cas'], 'From_ID')
ctd = merge_string_columns(ctd, ['To_kegg', 'To_reactome'], 'To_ID')
ctd.replace('', pd.NA, inplace=True)
ctd.dropna(subset=['From_ID'], inplace=True)
ctd.dropna(subset=['To_ID'], inplace=True)
ctd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[combined_column_name] = df.apply(merge_strings, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[combined_column_name] = df.apply(merge_strings, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: 

Unnamed: 0,From_ID,To_ID
1306,BMG_EP0060,BMG_PW0840
1307,BMG_EP0060,BMG_PW1029
1308,BMG_EP0060,BMG_PW0016
1309,BMG_EP0060,BMG_PW1025
1310,BMG_EP0060,BMG_PW3644
...,...,...
1622879,BMG_EP1159,BMG_PW0816
1622880,BMG_EP1159,BMG_PW3583
1622881,BMG_EP1159,BMG_PW6768
1622882,BMG_EP1159,BMG_PW1032


In [10]:
ctd['From_ID'] = ctd['From_ID'].str.split(';')
ctd = ctd.explode('From_ID')
ctd['To_ID'] = ctd['To_ID'].str.split(';')
ctd = ctd.explode('To_ID')

ctd.drop_duplicates(inplace=True)
ctd.reset_index(drop=True, inplace=True)
ctd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctd['From_ID'] = ctd['From_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_EP0060,BMG_PW0840
1,BMG_EP0060,BMG_PW1029
2,BMG_EP0060,BMG_PW0016
3,BMG_EP0060,BMG_PW1025
4,BMG_EP0060,BMG_PW3644
...,...,...
301443,BMG_EP1159,BMG_PW1032
301444,BMG_EP1159,BMG_PW0008
301445,BMG_EP1159,BMG_PW0199
301446,BMG_EP1159,BMG_PW0197


### Exposure-Pathway Relation

In [11]:
exposure_pathway = ctd[['From_ID', 'To_ID']]
exposure_pathway['Type'] = 'Exposure-Pathway'
exposure_pathway['Source'] = 'CTD'

max_length = len(str(len(exposure_pathway)))
exposure_pathway['BioMedGraphica_ID'] = ['BMG_ED_EPPW' + str(i).zfill(max_length) for i in range(1, len(exposure_pathway) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in exposure_pathway.columns if col != 'BioMedGraphica_ID']  # re-order columns
exposure_pathway = exposure_pathway[columns]
exposure_pathway

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Type,Source
0,BMG_ED_EPPW000001,BMG_EP0060,BMG_PW0840,Exposure-Pathway,CTD
1,BMG_ED_EPPW000002,BMG_EP0060,BMG_PW1029,Exposure-Pathway,CTD
2,BMG_ED_EPPW000003,BMG_EP0060,BMG_PW0016,Exposure-Pathway,CTD
3,BMG_ED_EPPW000004,BMG_EP0060,BMG_PW1025,Exposure-Pathway,CTD
4,BMG_ED_EPPW000005,BMG_EP0060,BMG_PW3644,Exposure-Pathway,CTD
...,...,...,...,...,...
301443,BMG_ED_EPPW301444,BMG_EP1159,BMG_PW1032,Exposure-Pathway,CTD
301444,BMG_ED_EPPW301445,BMG_EP1159,BMG_PW0008,Exposure-Pathway,CTD
301445,BMG_ED_EPPW301446,BMG_EP1159,BMG_PW0199,Exposure-Pathway,CTD
301446,BMG_ED_EPPW301447,BMG_EP1159,BMG_PW0197,Exposure-Pathway,CTD


In [12]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Exposure-Pathway'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Exposure_Pathway.csv'
exposure_pathway.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Exposure-Pathway\BioMedGraphica_Exposure_Pathway.csv
