In [5]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(123)

%cd /Users/cgu3/Documents/conditional-graph-variational-autoencoder
%pwd

/Users/cgu3/Documents/conditional-graph-variational-autoencoder


'/Users/cgu3/Documents/conditional-graph-variational-autoencoder'

In [213]:
cpi = pd.read_csv('data/interactions/cpi_use_accession.csv')
# remove row if  "chemical" contains "CIDs"
cpi = cpi[~cpi['chemical'].str.contains('CIDs')]
# remove "CIDm" part from chemical column
cpi['chemical'] = cpi['chemical'].str.replace('CIDm', '')
# remove leading zeros from "chemical" column
cpi['chemical'] = cpi['chemical'].str.replace('^0+', '')
# chemical column to int
cpi['chemical'] = cpi['chemical'].astype(int)
cpi.head()

Unnamed: 0,chemical,protein,experimental,prediction,database,textmining,combined_score
0,91758680,Q9BZR9,0,0,0,154,154
1,91758408,Q6NTF9,0,0,0,225,225
2,91758408,Q12774,0,0,0,178,178
3,91758408,Q9Y3P4,0,0,0,225,225
4,91758408,O95140,0,0,0,162,162


In [214]:
# get unique chemicals
cpi['chemical'].unique()

array([91758680, 91758408, 91758407, ...,        4,        3,        1])

In [215]:
HMDB = pd.read_csv('data/interactions/metabolites-detected.csv')
HMDB

Unnamed: 0,HMDB_ID,NAME,SMILES,INCHIKEY,CHEMICAL_FORMULA,AVERAGE_MASS,MONO_MASS
0,HMDB0000001,1-Methylhistidine,CN1C=NC(C[C@H](N)C(O)=O)=C1,BRMWTNUJHUMWMS-LURJTMIESA-N,C7H11N3O2,169.1811,169.085127
1,HMDB0000002,"1,3-Diaminopropane",NCCCN,XFNJVJPLKCPIBV-UHFFFAOYSA-N,C3H10N2,74.1249,74.084398
2,HMDB0000005,2-Ketobutyric acid,CCC(=O)C(O)=O,TYEYBOSBBBHJIV-UHFFFAOYSA-N,C4H6O3,102.0886,102.031694
3,HMDB0000008,2-Hydroxybutyric acid,[H]OC(=O)[C@@]([H])(O[H])C([H])([H])C([H])([H]...,AFENDNXGAFYKQO-VKHMYHEASA-N,C4H8O3,104.1050,104.047344
4,HMDB0000010,2-Methoxyestrone,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,WHEUWNKSCXYKBU-QPWUGHHJSA-N,C19H24O3,300.3921,300.172545
...,...,...,...,...,...,...,...
20619,HMDB0341441,Succinylserine,N[C@@H](COC(=O)CCC(O)=O)C(O)=O,ZAHSBRLHJRVFAU-BYPYZUCNSA-N,C7H11NO6,205.1660,205.058637
20620,HMDB0341442,4-Hydroxylidocaine,CCN(CC)CC(=O)NC1=C(C)C=C(C)C(O)=C1,ZCGIIZBUSSWEFR-UHFFFAOYSA-N,C14H22N2O2,250.3420,250.168128
20621,HMDB0341443,Tryptophan N-glucoside,NC(CC1=CN(C2OC(CO)C(O)C(O)C2O)C2=CC=CC=C12)C(O)=O,ZHBHZDMTVVJASV-UHFFFAOYSA-N,C17H22N2O7,366.3700,366.142701
20622,HMDB0341444,"6-Amino-5-formamido-1,3-dimethyluracil",CN1C(N)=C(N=CO)C(=O)N(C)C1=O,ZNDGAXCBZGSJGU-UHFFFAOYSA-N,C7H10N4O3,198.1820,198.075290


In [216]:
# HMDB['HMDB_ID'] to a text file with one HMDB_ID per line
HMDB['HMDB_ID'].to_csv('data/interactions/HMDB_IDs.txt', index=False, header=False)

In [217]:
# read the mapping file and pubChem column as int
mapping = pd.read_csv('data/Metabolites/name_map_detected.csv')
# remove row if pubChem is NaN
mapping = mapping.dropna(subset=['PubChem'])
# convert pubChem to int
mapping['PubChem'] = mapping['PubChem'].astype(int)
mapping.head()

Unnamed: 0,Query,Match,HMDB,PubChem,ChEBI,KEGG,METLIN,SMILES,Comment
0,HMDB0000001,1-Methylhistidine,HMDB0000001,92105,50599,C01152,3741.0,CN1C=NC(C[C@H](N)C(O)=O)=C1,1
1,HMDB0000002,"1,3-Diaminopropane",HMDB0000002,428,15725,C00986,5081.0,NCCCN,1
2,HMDB0000005,2-Ketobutyric acid,HMDB0000005,58,30831,C00109,,CCC(=O)C(O)=O,1
3,HMDB0000008,2-Hydroxybutyric acid,HMDB0000008,440864,50613,C05984,3783.0,CC[C@H](O)C(O)=O,1
4,HMDB0000010,2-Methoxyestrone,HMDB0000010,440624,1189,C05299,2578.0,[H][C@@]12CCC(=O)[C@@]1(C)CC[C@]1([H])C3=C(CC[...,1


In [218]:
# get the rows with same PubChem values
mapping[mapping['PubChem'].duplicated(keep=False)] # 74 can be mapped to multiple HMDB
# remove the rows with same PubChem values
mapping = mapping.drop_duplicates(subset=['PubChem'])

Unnamed: 0,Query,Match,HMDB,PubChem,ChEBI,KEGG,METLIN,SMILES,Comment
241,HMDB0000379,2-Methylcitric acid,HMDB0000379,515,30835,C02225,5368.0,CC(C(O)=O)C(O)(CC(O)=O)C(O)=O,1
242,HMDB0000380,2-Hydroxyestradiol-3-methyl ether,HMDB0000380,515,30835,,5368.0,[H][C@@]12CC[C@H](O)[C@@]1(C)CC[C@]1([H])C3=C(...,1
595,HMDB0000949,Tetrahydrocortisol,HMDB0000949,387316,16525,C05472,5897.0,C[C@]12C[C@H](O)C3C(CCC4C[C@H](O)CC[C@]34C)C1C...,1
597,HMDB0000951,Taurochenodesoxycholic acid,HMDB0000951,387316,16525,C05465,5897.0,[H][C@@]1(CC[C@@]2([H])[C@]3([H])[C@H](O)C[C@]...,1
1089,HMDB0002504,3-Sulfodeoxycholic acid,HMDB0002504,10255824,67045,,6704.0,[H][C@@]12CC[C@H]([C@H](C)CCC(O)=O)C1[C@@H](O)...,1
...,...,...,...,...,...,...,...,...,...
10353,HMDB0244196,"1,4-Benzodioxane",HMDB0244196,952,,,,C1COC2=CC=CC=C2O1,1
14677,HMDB0250961,Deisopropylatrazine,HMDB0250961,8785,27399,C06556,,CCNC1=NC(N)=NC(Cl)=N1,1
16052,HMDB0253154,Hexenal,HMDB0253154,5281168,19591,C08497,,CCCC=CC=O,1
16572,HMDB0254039,Leucyl-leucine,HMDB0254039,94244,6418,C11332,,CC(C)CC(N)C(O)=NC(CC(C)C)C(O)=O,1


In [219]:
# get the number of digits of the largest number in PubChem
# np.sort(mapping['PubChem'])[::-1]

In [225]:
# check how many "PubChem" values in mapping are in HMDB
# mapping['PubChem'][mapping['PubChem'].isin(mpi['chemical'])]
mpi = cpi[cpi['chemical'].isin(mapping['PubChem'])]
mpi['chemical'].unique().shape
mpi

(5956,)

Unnamed: 0,chemical,protein,experimental,prediction,database,textmining,combined_score
5488,91617630,Q9H6Z9,0,0,0,221,221
5489,91617630,Q15370,0,0,0,260,260
5490,91617630,O95876,0,0,0,259,259
5835,91582103,O75106,0,0,0,179,179
5836,91582103,O14804,0,0,0,296,296
...,...,...,...,...,...,...,...
5949655,1,Q00059,0,0,0,150,150
5949656,1,Q9NTX5,0,0,0,279,279
5949657,1,P40925,0,0,0,311,311
5949658,1,O14727,0,0,0,322,322


In [232]:
def get_values_from_dict(keys, dictionary, na_rm=True):
    # Ensure the dictionary is a DataFrame
    dictionary = pd.DataFrame(dictionary)

    # Check for duplicated keys in the dictionary
    if dictionary.iloc[:, 0].duplicated().any():
        raise ValueError("Non-unique keys in dictionary: the same key can be mapped to multiple values")

    # Convert keys to a DataFrame
    keys_df = pd.DataFrame({'key': keys})

    # Join keys with the dictionary to get the corresponding values
    merged = keys_df.merge(dictionary, left_on='key', right_on=dictionary.columns[0], how='left')

    # Select the values column
    values = merged[dictionary.columns[1]]

    # If na_rm is True, remove NA values
    if na_rm:
        values = values.dropna()

    return values.tolist()

In [245]:
# map PubChem to HMDB
mpi.loc[:, 'chemical'] = get_values_from_dict(mpi['chemical'], mapping[['PubChem', 'HMDB']])

ValueError: You are trying to merge on object and int64 columns for key 'key'. If you wish to proceed you should use pd.concat

In [246]:
mpi

Unnamed: 0,chemical,protein,experimental,prediction,database,textmining,combined_score
5488,HMDB0250863,Q9H6Z9,0,0,0,221,221
5489,HMDB0250863,Q15370,0,0,0,260,260
5490,HMDB0250863,O95876,0,0,0,259,259
5835,HMDB0245911,O75106,0,0,0,179,179
5836,HMDB0245911,O14804,0,0,0,296,296
...,...,...,...,...,...,...,...
5949655,HMDB0240773,Q00059,0,0,0,150,150
5949656,HMDB0240773,Q9NTX5,0,0,0,279,279
5949657,HMDB0240773,P40925,0,0,0,311,311
5949658,HMDB0240773,O14727,0,0,0,322,322


In [242]:
mpi_medium_evidence = mpi[mpi['combined_score'] > 400]

In [243]:
# acquire the unique chemicals  and protein in the  mpi
metabolites = mpi_medium_evidence['chemical'].unique()
proteins = mpi_medium_evidence['protein'].unique()
metabolites.shape
proteins.shape

(3516,)

(9476,)

In [257]:
mpi_medium_evidence.to_csv('data/metabolites/mpi_medium_evidence.csv', index=False)

# Parse Chemical-Chemical Interactions

In [256]:
cci = pd.read_csv('data/interactions/chemical_chemical.links.detailed.v5.0.tsv', sep='\t')
cci.head()

Unnamed: 0,chemical1,chemical2,similarity,experimental,database,textmining,combined_score
0,CIDm00024759,CIDs00024759,0,0,900,0,900
1,CIDs91758695,CIDs00107694,0,0,0,230,230
2,CIDs91758695,CIDs11013287,0,0,0,230,230
3,CIDs91758695,CIDs11980957,0,0,0,328,328
4,CIDs91758695,CIDs00013078,0,0,0,162,162


In [185]:
# remove row if  "chemical1" or "chemical2" contains "CIDs"
cci = cci[~cci['chemical1'].str.contains('CIDs')]
cci = cci[~cci['chemical2'].str.contains('CIDs')]
# remove "CIDm" part from chemical1 and chemical2 columns
cci['chemical1'] = cci['chemical1'].str.replace('CIDm', '')
cci['chemical2'] = cci['chemical2'].str.replace('CIDm', '')
# remove leading zeros from "chemical1" and "chemical2" columns
cci['chemical1'] = cci['chemical1'].str.replace('^0+', '')
cci['chemical2'] = cci['chemical2'].str.replace('^0+', '')
# chemical1 and chemical2 columns to int
cci['chemical1'] = cci['chemical1'].astype(int)
cci['chemical2'] = cci['chemical2'].astype(int)
cci.head()

Unnamed: 0,chemical1,chemical2,similarity,experimental,database,textmining,combined_score
9665846,91758680,66,0,0,0,230,230
9665847,91758680,124,750,0,0,274,274
9665848,91758680,136,0,0,0,170,170
9665849,91758680,1145,0,0,0,179,179
9665850,91758680,6914,0,0,0,215,215


In [150]:
# check the number of rows that both "chemical1" and "chemical2" are in HMDB
mmi = cci[(cci['chemical1'].isin(mapping['PubChem'])) & (cci['chemical2'].isin(mapping['PubChem']))]

In [247]:
# map PubChem to HMDB
mmi.loc[:, 'chemical1'] = get_values_from_dict(mmi['chemical1'], mapping[['PubChem', 'HMDB']])
mmi.loc[:, 'chemical2'] = get_values_from_dict(mmi['chemical2'], mapping[['PubChem', 'HMDB']])

In [249]:
# mmi with experimental evidence > 0
mmi_medium_evidence = mmi[mmi['combined_score'] > 400]

In [253]:
mmi_medium_evidence.to_csv('data/metabolites//mmi_medium_evidence.csv', index=False)