### CTD

In [1]:
# Download Link: https://ctdbase.org/reports/CTD_chem_pathways_enriched.csv.gz
# Download Date: 2025-03-21
# Download Version: 2025-02-28

import pandas as pd
from io import StringIO
# Read the lines from the file
with open('CTD_chem_pathways_enriched.csv', 'r') as f:
    lines = f.readlines()

# Extract column names
columns = []
for line in lines:
    if line.startswith('# Fields'):
        # Get the following line which contains column names
        columns = lines[lines.index(line) + 1].strip()[2:].split(',')
        break

# Filter out comment lines
data_lines = [line for line in lines if not line.startswith('#')]

# Create a DataFrame from the filtered lines
data_str = '\n'.join(data_lines)
ctd = pd.read_csv(StringIO(data_str), names=columns)
ctd.head()

Unnamed: 0,ChemicalName,ChemicalID,CasRN,PathwayName,PathwayID,PValue,CorrectedPValue,TargetMatchQty,TargetTotalQty,BackgroundMatchQty,BackgroundTotalQty
0,10074-G5,C534883,,Cyclin A:Cdk2-associated events at S phase entry,REACT:R-HSA-69656,1.4e-05,0.00115,2,4,71,45539
1,10074-G5,C534883,,Cyclin E associated events during G1/S transition,REACT:R-HSA-69202,1.5e-05,0.00118,2,4,72,45539
2,10074-G5,C534883,,G1/S Transition,REACT:R-HSA-69206,4.1e-05,0.00324,2,4,119,45539
3,10074-G5,C534883,,Mitotic G1-G1/S phases,REACT:R-HSA-453279,5.8e-05,0.00468,2,4,143,45539
4,10074-G5,C534883,,Pathways in cancer,KEGG:hsa05200,3e-06,0.000206,3,4,395,45539


In [2]:
ctd = ctd[['ChemicalID', 'CasRN', 'PathwayID']]
ctd['Database'] = ctd['PathwayID'].str.split(':').str[0]
ctd['PathwayID'] = ctd['PathwayID'].str.split(':').str[1]
ctd = ctd.drop_duplicates()
ctd

Unnamed: 0,ChemicalID,CasRN,PathwayID,Database
0,C534883,,R-HSA-69656,REACT
1,C534883,,R-HSA-69202,REACT
2,C534883,,R-HSA-69206,REACT
3,C534883,,R-HSA-453279,REACT
4,C534883,,hsa05200,KEGG
...,...,...,...,...
1624465,D015054,9010-72-4,hsa05203,KEGG
1624466,D015054,9010-72-4,hsa05416,KEGG
1624467,D015054,9010-72-4,R-HSA-8866427,REACT
1624468,D015054,9010-72-4,R-HSA-1606322,REACT


### BioMedGraphica ID

In [3]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_pathway = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Pathway' / 'BioMedGraphica_Pathway.csv'
target_dir_exposure = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Exposure' / 'BioMedGraphica_Exposure.csv'
biomedgraphica_pathway = pd.read_csv(target_dir_pathway, dtype=str)
biomedgraphica_exposure = pd.read_csv(target_dir_exposure, dtype=str)

### CTD Mapping

In [4]:
mesh_individual = biomedgraphica_exposure[['MeSH_ID', 'BioMedGraphica_ID']]
mesh_individual.dropna(subset=['MeSH_ID'], inplace=True)
mesh_individual = mesh_individual.assign(MeSH_ID=mesh_individual['MeSH_ID'].str.split(';')).explode('MeSH_ID')

mesh_to_individual = mesh_individual.groupby('MeSH_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd['To_mesh'] = ctd['ChemicalID'].map(mesh_to_individual)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mesh_individual.dropna(subset=['MeSH_ID'], inplace=True)


In [5]:
cas_individual = biomedgraphica_exposure[['CAS_RN', 'BioMedGraphica_ID']]
cas_individual.dropna(subset=['CAS_RN'], inplace=True)
cas_individual = cas_individual.assign(CAS_RN=cas_individual['CAS_RN'].str.split(';')).explode('CAS_RN')

cas_to_individual = cas_individual.groupby('CAS_RN')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd['To_cas'] = ctd['CasRN'].map(cas_to_individual)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cas_individual.dropna(subset=['CAS_RN'], inplace=True)


In [6]:
kegg_individual = biomedgraphica_pathway[['KEGG_ID', 'BioMedGraphica_ID']]
kegg_individual.dropna(subset=['KEGG_ID'], inplace=True)
kegg_individual = kegg_individual.assign(KEGG_ID=kegg_individual['KEGG_ID'].str.split(';')).explode('KEGG_ID')

kegg_to_individual = kegg_individual.groupby('KEGG_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd['From_kegg'] = ctd['PathwayID'].map(kegg_to_individual)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kegg_individual.dropna(subset=['KEGG_ID'], inplace=True)


In [7]:
reactome_individual = biomedgraphica_pathway[['Reactome_ID', 'BioMedGraphica_ID']]
reactome_individual.dropna(subset=['Reactome_ID'], inplace=True)
reactome_individual = reactome_individual.assign(Reactome_ID=reactome_individual['Reactome_ID'].str.split(';')).explode('Reactome_ID')

reactome_to_individual = reactome_individual.groupby('Reactome_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd['From_reactome'] = ctd['PathwayID'].map(reactome_to_individual)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reactome_individual.dropna(subset=['Reactome_ID'], inplace=True)


In [8]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

ctd = ctd[['To_mesh', 'To_cas', 'From_kegg', 'From_reactome']]
ctd = merge_string_columns(ctd, ['From_kegg', 'From_reactome'], 'From_ID')
ctd = merge_string_columns(ctd, ['To_mesh', 'To_cas'], 'To_ID')
ctd.replace('', pd.NA, inplace=True)
ctd.dropna(subset=['From_ID'], inplace=True)
ctd.dropna(subset=['To_ID'], inplace=True)
ctd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[combined_column_name] = df.apply(merge_strings, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[combined_column_name] = df.apply(merge_strings, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: 

Unnamed: 0,From_ID,To_ID
1306,BMG_PW0840,BMG_EP0060
1307,BMG_PW1029,BMG_EP0060
1308,BMG_PW0016,BMG_EP0060
1309,BMG_PW1025,BMG_EP0060
1310,BMG_PW3644,BMG_EP0060
...,...,...
1622879,BMG_PW0816,BMG_EP1159
1622880,BMG_PW3583,BMG_EP1159
1622881,BMG_PW6768,BMG_EP1159
1622882,BMG_PW1032,BMG_EP1159


In [9]:
ctd['From_ID'] = ctd['From_ID'].str.split(';')
ctd = ctd.explode('From_ID')
ctd['To_ID'] = ctd['To_ID'].str.split(';')
ctd = ctd.explode('To_ID')

ctd.drop_duplicates(inplace=True)
ctd.reset_index(drop=True, inplace=True)
ctd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctd['From_ID'] = ctd['From_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PW0840,BMG_EP0060
1,BMG_PW1029,BMG_EP0060
2,BMG_PW0016,BMG_EP0060
3,BMG_PW1025,BMG_EP0060
4,BMG_PW3644,BMG_EP0060
...,...,...
301443,BMG_PW1032,BMG_EP1159
301444,BMG_PW0199,BMG_EP1159
301445,BMG_PW0197,BMG_EP1159
301446,BMG_PW0198,BMG_EP1159


### Pathway-Exposure Relation

In [10]:
pathway_exposure = ctd[['To_ID', 'From_ID']]
pathway_exposure = pathway_exposure.rename(columns={'To_ID': 'From_ID', 'From_ID': 'To_ID'})
pathway_exposure['Type'] = 'Pathway-Exposure'
pathway_exposure['Source'] = 'CTD'

max_length = len(str(len(pathway_exposure)))
pathway_exposure['BioMedGraphica_ID'] = ['BMG_ED_PWEP' + str(i).zfill(max_length) for i in range(1, len(pathway_exposure) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in pathway_exposure.columns if col != 'BioMedGraphica_ID']  # re-order columns
pathway_exposure = pathway_exposure[columns]
pathway_exposure

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Type,Source
0,BMG_ED_PWEP000001,BMG_EP0060,BMG_PW0840,Pathway-Exposure,CTD
1,BMG_ED_PWEP000002,BMG_EP0060,BMG_PW1029,Pathway-Exposure,CTD
2,BMG_ED_PWEP000003,BMG_EP0060,BMG_PW0016,Pathway-Exposure,CTD
3,BMG_ED_PWEP000004,BMG_EP0060,BMG_PW1025,Pathway-Exposure,CTD
4,BMG_ED_PWEP000005,BMG_EP0060,BMG_PW3644,Pathway-Exposure,CTD
...,...,...,...,...,...
301443,BMG_ED_PWEP301444,BMG_EP1159,BMG_PW1032,Pathway-Exposure,CTD
301444,BMG_ED_PWEP301445,BMG_EP1159,BMG_PW0199,Pathway-Exposure,CTD
301445,BMG_ED_PWEP301446,BMG_EP1159,BMG_PW0197,Pathway-Exposure,CTD
301446,BMG_ED_PWEP301447,BMG_EP1159,BMG_PW0198,Pathway-Exposure,CTD


In [11]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Pathway-Exposure'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Pathway_Exposure.csv'
pathway_exposure.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Pathway-Exposure\BioMedGraphica_Pathway_Exposure.csv
