### CTD

In [1]:
import pandas as pd
from io import StringIO
# Read the lines from the file
with open('CTD_chem_gene_ixns.csv', 'r') as f:
    lines = f.readlines()

# Extract column names
columns = []
for line in lines:
    if line.startswith('# Fields'):
        # Get the following line which contains column names
        columns = lines[lines.index(line) + 1].strip()[2:].split(',')
        break

# Filter out comment lines
data_lines = [line for line in lines if not line.startswith('#')]

# Create a DataFrame from the filtered lines
data_str = '\n'.join(data_lines)
ctd = pd.read_csv(StringIO(data_str), names=columns)

ctd

Unnamed: 0,ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs
0,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606.0,10074-G5 affects the reaction [MYC protein res...,affects^reaction|increases^expression,32184358
1,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606.0,10074-G5 inhibits the reaction [EPHB2 protein ...,decreases^reaction|increases^expression,32184358
2,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606.0,10074-G5 results in decreased expression of AR...,decreases^expression,32184358
3,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606.0,10074-G5 results in decreased expression of AR...,decreases^expression,32184358
4,10074-G5,C534883,,EPHB2,2048,protein,Homo sapiens,9606.0,10074-G5 inhibits the reaction [EPHB2 protein ...,decreases^reaction|increases^expression,32184358
...,...,...,...,...,...,...,...,...,...,...,...
2892320,Zymosan,D015054,9010-72-4,VEGFA,7422,protein,Mus musculus,10090.0,TNFRSF1A gene mutant form inhibits the reactio...,decreases^reaction|increases^expression,17724436
2892321,Zymosan,D015054,9010-72-4,VEGFA,7422,protein,Mus musculus,10090.0,Zymosan results in increased expression of VEG...,increases^expression,17724436
2892322,Zymosan,D015054,9010-72-4,XIAP,331,mRNA,Homo sapiens,9606.0,Zymosan analog results in decreased expression...,decreases^expression,16803582
2892323,zymosterol,C015582,128-33-6,CYP27A1,1593,protein,Homo sapiens,9606.0,CYP27A1 protein results in increased metabolis...,increases^metabolic processing,14622972


In [2]:
ctd['GeneForms'].value_counts()

GeneForms
mRNA                        1910053
protein                      777470
gene                         112023
promoter                      35454
intron                        16995
exon                           7322
5' UTR                         4309
3' UTR                         3872
mutant form                    2779
polyA tail                      669
enhancer                        530
mRNA|promoter                   510
polymorphism                    234
mRNA|protein                    230
gene|mRNA                       173
gene|protein                    136
SNP                             105
intron|mRNA                      99
promoter|protein                 96
alternative form                 44
enhancer|mRNA                    33
modified form                    23
3' UTR|protein                   19
mutant form|protein              17
3' UTR|mRNA                      14
mRNA|mutant form                 11
5' UTR|protein                   10
mRNA|promoter|prot

In [3]:
ctd_human = ctd[(ctd['OrganismID'] == 9606) & (ctd['GeneForms'] == 'gene')]
ctd_human = ctd_human[['ChemicalID', 'GeneID']]
ctd_human = ctd_human.drop_duplicates()
ctd_human['GeneID'] = ctd_human['GeneID'].astype(str)
ctd_human

Unnamed: 0,ChemicalID,GeneID
21,C004822,2052
121,C013567,1565
127,C065719,1559
575,C037530,80045
576,C037530,4009
...,...,...
2886738,C017803,210
2886853,C017803,3077
2887176,C017803,7421
2887282,D019287,540


### BioMedGraphica ID

In [4]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_exposure = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Exposure' / 'BioMedGraphica_Exposure.csv'
target_dir_gene = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene' / 'BioMedGraphica_Gene.csv'
biomedgraphica_exposure = pd.read_csv(target_dir_exposure, dtype=str)
biomedgraphica_gene = pd.read_csv(target_dir_gene, dtype=str)

### CTD Mapping

In [5]:
ncbi_individual = biomedgraphica_gene[['NCBI_Gene_ID', 'BioMedGraphica_ID']]
ncbi_individual.dropna(subset=['NCBI_Gene_ID'], inplace=True)
ncbi_individual = ncbi_individual.assign(NCBI_Gene_ID=ncbi_individual['NCBI_Gene_ID'].str.split(';')).explode('NCBI_Gene_ID')

ncbi_to_individual = ncbi_individual.groupby('NCBI_Gene_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd_human['To_ID'] = ctd_human['GeneID'].map(ncbi_to_individual)
ctd_human

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ncbi_individual.dropna(subset=['NCBI_Gene_ID'], inplace=True)


Unnamed: 0,ChemicalID,GeneID,To_ID
21,C004822,2052,BMG_GN166788
121,C013567,1565,BMG_GN165671
127,C065719,1559,BMG_GN165667
575,C037530,80045,BMG_GN185940
576,C037530,4009,BMG_GN174830
...,...,...,...
2886738,C017803,210,BMG_GN166841
2886853,C017803,3077,BMG_GN171500
2887176,C017803,7421,BMG_GN184896
2887282,D019287,540,BMG_GN177975


In [6]:
mesh_individual = biomedgraphica_exposure[['MeSH_ID', 'BioMedGraphica_ID']]
mesh_individual.dropna(subset=['MeSH_ID'], inplace=True)
mesh_individual = mesh_individual.assign(MeSH_ID=mesh_individual['MeSH_ID'].str.split(';')).explode('MeSH_ID')

mesh_to_individual = mesh_individual.groupby('MeSH_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

ctd_human['From_ID'] = ctd_human['ChemicalID'].map(mesh_to_individual)
ctd_human

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mesh_individual.dropna(subset=['MeSH_ID'], inplace=True)


Unnamed: 0,ChemicalID,GeneID,To_ID,From_ID
21,C004822,2052,BMG_GN166788,
121,C013567,1565,BMG_GN165671,
127,C065719,1559,BMG_GN165667,
575,C037530,80045,BMG_GN185940,
576,C037530,4009,BMG_GN174830,
...,...,...,...,...
2886738,C017803,210,BMG_GN166841,BMG_EP1157
2886853,C017803,3077,BMG_GN171500,BMG_EP1157
2887176,C017803,7421,BMG_GN184896,BMG_EP1157
2887282,D019287,540,BMG_GN177975,


In [7]:
exposure_gene =  ctd_human[['From_ID', 'To_ID']]
exposure_gene.dropna(subset=['From_ID'], inplace=True)
exposure_gene.dropna(subset=['To_ID'], inplace=True)
exposure_gene

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exposure_gene.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exposure_gene.dropna(subset=['To_ID'], inplace=True)


Unnamed: 0,From_ID,To_ID
5175,BMG_EP0373,BMG_GN177806
22017,BMG_EP0406,BMG_GN166788
22122,BMG_EP0406,BMG_GN171246
22127,BMG_EP0406,BMG_GN171253
22156,BMG_EP0406,BMG_GN171766
...,...,...
2883152,BMG_EP0175,BMG_GN183862
2883343,BMG_EP0175,BMG_GN178353
2886738,BMG_EP1157,BMG_GN166841
2886853,BMG_EP1157,BMG_GN171500


In [8]:
exposure_gene['From_ID'] = exposure_gene['From_ID'].str.split(';')
exposure_gene['To_ID'] = exposure_gene['To_ID'].str.split(';')

exposure_gene = exposure_gene.explode('From_ID')
exposure_gene = exposure_gene.explode('To_ID')

exposure_gene.drop_duplicates(inplace=True)
exposure_gene

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exposure_gene['From_ID'] = exposure_gene['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exposure_gene['To_ID'] = exposure_gene['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
5175,BMG_EP0373,BMG_GN177806
22017,BMG_EP0406,BMG_GN166788
22122,BMG_EP0406,BMG_GN171246
22127,BMG_EP0406,BMG_GN171253
22156,BMG_EP0406,BMG_GN171766
...,...,...
2883152,BMG_EP0175,BMG_GN183862
2883343,BMG_EP0175,BMG_GN178353
2886738,BMG_EP1157,BMG_GN166841
2886853,BMG_EP1157,BMG_GN171500


### Exposure-Gene Relation

In [9]:
exposure_gene['Source'] = 'CTD'
exposure_gene['Type'] = 'Exposure-Gene'

max_length = len(str(len(exposure_gene)))
exposure_gene['BioMedGraphica_ID'] = ['BMG_ED_EPGN' + str(i).zfill(max_length) for i in range(1, len(exposure_gene) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in exposure_gene.columns if col != 'BioMedGraphica_ID']  # re-order columns
exposure_gene = exposure_gene[columns]
exposure_gene

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
5175,BMG_ED_EPGN00001,BMG_EP0373,BMG_GN177806,CTD,Exposure-Gene
22017,BMG_ED_EPGN00002,BMG_EP0406,BMG_GN166788,CTD,Exposure-Gene
22122,BMG_ED_EPGN00003,BMG_EP0406,BMG_GN171246,CTD,Exposure-Gene
22127,BMG_ED_EPGN00004,BMG_EP0406,BMG_GN171253,CTD,Exposure-Gene
22156,BMG_ED_EPGN00005,BMG_EP0406,BMG_GN171766,CTD,Exposure-Gene
...,...,...,...,...,...
2883152,BMG_ED_EPGN28978,BMG_EP0175,BMG_GN183862,CTD,Exposure-Gene
2883343,BMG_ED_EPGN28979,BMG_EP0175,BMG_GN178353,CTD,Exposure-Gene
2886738,BMG_ED_EPGN28980,BMG_EP1157,BMG_GN166841,CTD,Exposure-Gene
2886853,BMG_ED_EPGN28981,BMG_EP1157,BMG_GN171500,CTD,Exposure-Gene


In [10]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Exposure-Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Exposure_Gene.csv'
exposure_gene.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Exposure-Gene\BioMedGraphica_Exposure_Gene.csv
