### HMDB

In [None]:
# Download Link: https://www.hmdb.ca/downloads#
# Download Date: 2025-03-21
# Download Version: 2021-11-17

import xml.etree.ElementTree as ET
import pandas as pd

context = ET.iterparse('hmdb_metabolites.xml', events=('start', 'end'))
context = iter(context)

event, root = next(context)

namespace = {'hmdb': 'http://www.hmdb.ca'}

metabolites = []

for event, elem in context:
    if event == 'end' and elem.tag == '{http://www.hmdb.ca}metabolite':
        accession = elem.find('hmdb:accession', namespace).text
        
        for protein in elem.findall('hmdb:protein_associations/hmdb:protein', namespace):
            uniprot_id = protein.find('hmdb:uniprot_id', namespace).text if protein.find('hmdb:uniprot_id', namespace) is not None else None
            
            metabolites.append({
                'accession': accession,
                'uniprot_id': uniprot_id
            })

        elem.clear()

hmdb_protein = pd.DataFrame(metabolites)

hmdb_protein.to_csv('hmdb_protein_uniprot.csv', index=False)

print("Data has been saved to hmdb_protein_uniprot.csv")

In [1]:
import pandas as pd

hmdb_protein = pd.read_csv('hmdb_protein_uniprot.csv')
hmdb_protein

Unnamed: 0,accession,uniprot_id
0,HMDB0000001,Q96KN2
1,HMDB0000001,O60678
2,HMDB0000002,P52788
3,HMDB0000002,P49366
4,HMDB0000002,P19801
...,...,...
863754,HMDB0259856,Q9Y5Z9
863755,HMDB0259856,E7FB98
863756,HMDB0259928,P53621
863757,HMDB0259928,P40765


### STITCH

In [2]:
# Download Link: http://stitch.embl.de/download/protein_chemical.links.v5.0/9606.protein_chemical.links.v5.0.tsv.gz
# Download Date: 2025-03-21
# Download Version: 2015

import pandas as pd
import re

stitch = pd.read_csv('9606.protein_chemical.links.v5.0.tsv', sep='\t')
stitch

Unnamed: 0,chemical,protein,combined_score
0,CIDm91758680,9606.ENSP00000257254,279
1,CIDm91758680,9606.ENSP00000302120,154
2,CIDm91758408,9606.ENSP00000006777,225
3,CIDm91758408,9606.ENSP00000056217,178
4,CIDm91758408,9606.ENSP00000216085,225
...,...,...,...
15473934,CIDs00000001,9606.ENSP00000420588,151
15473935,CIDs00000001,9606.ENSP00000436585,279
15473936,CIDs00000001,9606.ENSP00000438144,311
15473937,CIDs00000001,9606.ENSP00000448165,322


In [3]:
# Download Link: http://stitch.embl.de/download/chemicals.inchikeys.v5.0.tsv.gz
# Download Date: 2025-03-21
# Download Version: 2015

stitch_inchikey = pd.read_csv('chemicals.inchikeys.v5.0.tsv', sep='\t')
stitch_inchikey

Unnamed: 0,flat_chemical_id,stereo_chemical_id,source_cid,inchikey
0,CIDm00000001,CIDs00000001,1,RDHQFKQIGNGIED-UHFFFAOYSA-N
1,CIDm00000010,CIDs00000010,10,AUFGTPPARQZWDO-UHFFFAOYSA-N
2,CIDm00000100,CIDs00000100,100,UTIBHEBNILDQKX-UHFFFAOYSA-N
3,CIDm00001000,CIDs00001000,1000,ULSIYEODSMZIPX-UHFFFAOYSA-N
4,CIDm00010000,CIDs00010000,10000,ZPIFKCVYZBVZIV-UHFFFAOYSA-N
...,...,...,...,...
68373236,CIDm09999995,CIDs09999995,9999995,XPKJPIMTEFXWTR-QJSROADHSA-N
68373237,CIDm09999996,CIDs09999996,9999996,UIACKXKZPDPMBY-UHFFFAOYSA-N
68373238,CIDm09999996,CIDs09999996,9999997,PLLYNUUCBINGLY-UHFFFAOYSA-N
68373239,CIDm09999998,CIDs09999998,9999998,PFFVTWFOPAQBJU-UHFFFAOYSA-N


In [4]:
stitch_inchikey = stitch_inchikey[stitch_inchikey['flat_chemical_id'].isin(stitch['chemical'])]
stitch_inchikey = stitch_inchikey[['flat_chemical_id', 'inchikey']]
stitch_inchikey = stitch_inchikey.drop_duplicates()
stitch_inchikey = stitch_inchikey.groupby('flat_chemical_id')['inchikey'].agg(lambda x: ';'.join(x)).reset_index()
stitch_inchikey

Unnamed: 0,flat_chemical_id,inchikey
0,CIDm00000001,RDHQFKQIGNGIED-UHFFFAOYSA-N;ZIUSCUAKYGIGBA-DDW...
1,CIDm00000003,INCSWYKICIYAHB-UHFFFAOYSA-M;INCSWYKICIYAHB-PHD...
2,CIDm00000004,HXKKHQJGJAFBHI-AZXPZELESA-N;HXKKHQJGJAFBHI-UHF...
3,CIDm00000005,HIQNVODXENYOFK-UHFFFAOYSA-M;HIQNVODXENYOFK-UHF...
4,CIDm00000006,VYZAHLCBVHPDDF-UHFFFAOYSA-N;VYZAHLCBVHPDDF-CBY...
...,...,...
374275,CIDm91758404,JEPZJYPEDPLQFU-INIZCTEOSA-N;JEPZJYPEDPLQFU-MRX...
374276,CIDm91758406,HLTDAXDZGPBJTI-UHFFFAOYSA-N
374277,CIDm91758407,OYYNQAPQYQZOFQ-UHFFFAOYSA-N
374278,CIDm91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N


In [5]:
stitch_chem_inchikey = pd.merge(stitch, stitch_inchikey, left_on='chemical', right_on='flat_chemical_id', how='left')
stitch_chem_inchikey = stitch_chem_inchikey.drop(columns=['flat_chemical_id','combined_score'])
stitch_chem_inchikey

Unnamed: 0,chemical,protein,inchikey
0,CIDm91758680,9606.ENSP00000257254,DMDVEPIJCJGHPE-UHFFFAOYSA-K
1,CIDm91758680,9606.ENSP00000302120,DMDVEPIJCJGHPE-UHFFFAOYSA-K
2,CIDm91758408,9606.ENSP00000006777,AZJRSFUJNQNLRC-UHFFFAOYSA-N
3,CIDm91758408,9606.ENSP00000056217,AZJRSFUJNQNLRC-UHFFFAOYSA-N
4,CIDm91758408,9606.ENSP00000216085,AZJRSFUJNQNLRC-UHFFFAOYSA-N
...,...,...,...
15473934,CIDs00000001,9606.ENSP00000420588,
15473935,CIDs00000001,9606.ENSP00000436585,
15473936,CIDs00000001,9606.ENSP00000438144,
15473937,CIDs00000001,9606.ENSP00000448165,


In [6]:
stitch_chem_inchikey['CID'] = stitch_chem_inchikey['chemical'].apply(lambda x: int(re.search(r'\d+', x).group()))
stitch_chem_inchikey['Protein'] = stitch_chem_inchikey['protein'].apply(lambda x: x.split('.')[1])
stitch_chem_inchikey['inchikey'] = stitch_chem_inchikey['inchikey'].str.split(';')
stitch_chem_inchikey = stitch_chem_inchikey.explode('inchikey')

stitch_filter = stitch_chem_inchikey[['Protein', 'CID','inchikey']]
stitch_filter.drop_duplicates(inplace=True)
stitch_filter

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stitch_filter.drop_duplicates(inplace=True)


Unnamed: 0,Protein,CID,inchikey
0,ENSP00000257254,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K
1,ENSP00000302120,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K
2,ENSP00000006777,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N
3,ENSP00000056217,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N
4,ENSP00000216085,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N
...,...,...,...
15473934,ENSP00000420588,1,
15473935,ENSP00000436585,1,
15473936,ENSP00000438144,1,
15473937,ENSP00000448165,1,


### BioMedGraphica ID

In [7]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_protein = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein' / 'BioMedGraphica_Protein.csv'
target_dir_metabolite = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Metabolite' / 'BioMedGraphica_Metabolite.csv'
biomedgraphica_protein = pd.read_csv(target_dir_protein, dtype=str)
biomedgraphica_metabolite = pd.read_csv(target_dir_metabolite, dtype=str)

### HMDB Mapping

In [8]:
hmdb_biomedgraphica = biomedgraphica_metabolite[['BioMedGraphica_ID', 'HMDB_ID']]
hmdb_biomedgraphica.dropna(subset=['HMDB_ID'], inplace=True)
hmdb_biomedgraphica = hmdb_biomedgraphica.assign(HMDB_ID=hmdb_biomedgraphica['HMDB_ID'].str.split(';')).explode('HMDB_ID')
hmdb_to_individualID = hmdb_biomedgraphica.groupby('HMDB_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

hmdb_protein['From_ID'] = hmdb_protein['accession'].map(hmdb_to_individualID)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hmdb_biomedgraphica.dropna(subset=['HMDB_ID'], inplace=True)


In [9]:
uniprot_biomedgraphica = biomedgraphica_protein[['Uniprot_ID', 'BioMedGraphica_ID']]
uniprot_biomedgraphica.dropna(subset=['Uniprot_ID'], inplace=True)
uniprot_biomedgraphica = uniprot_biomedgraphica.assign(Uniprot_ID=uniprot_biomedgraphica['Uniprot_ID'].str.split(';')).explode('Uniprot_ID')
uniprot_to_individualID = uniprot_biomedgraphica.groupby('Uniprot_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

hmdb_protein['To_ID'] = hmdb_protein['uniprot_id'].map(uniprot_to_individualID)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_biomedgraphica.dropna(subset=['Uniprot_ID'], inplace=True)


In [10]:
metabolite_protein_hmdb = hmdb_protein[['From_ID', 'To_ID']]
metabolite_protein_hmdb['From_ID'].replace('', pd.NA, inplace=True)
metabolite_protein_hmdb['To_ID'].replace('', pd.NA, inplace=True)
metabolite_protein_hmdb.dropna(subset=['From_ID'], inplace=True)
metabolite_protein_hmdb.dropna(subset=['To_ID'], inplace=True)
metabolite_protein_hmdb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 849980 entries, 0 to 863756
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   From_ID  849980 non-null  object
 1   To_ID    849980 non-null  object
dtypes: object(2)
memory usage: 19.5+ MB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  metabolite_protein_hmdb['From_ID'].replace('', pd.NA, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_protein_hmdb['From_ID'].replace('', pd.NA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=

In [11]:
metabolite_protein_hmdb['From_ID'] = metabolite_protein_hmdb['From_ID'].str.split(';')
metabolite_protein_hmdb['To_ID'] = metabolite_protein_hmdb['To_ID'].str.split(';')
metabolite_protein_hmdb = metabolite_protein_hmdb.explode('From_ID')
metabolite_protein_hmdb = metabolite_protein_hmdb.explode('To_ID')
metabolite_protein_hmdb['From_ID'] = metabolite_protein_hmdb['From_ID'].str.strip()
metabolite_protein_hmdb['To_ID'] = metabolite_protein_hmdb['To_ID'].str.strip()
metabolite_protein_hmdb.drop_duplicates(inplace=True)
metabolite_protein_hmdb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_protein_hmdb['From_ID'] = metabolite_protein_hmdb['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_protein_hmdb['To_ID'] = metabolite_protein_hmdb['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_MT000001,BMG_PT082247
1,BMG_MT000001,BMG_PT035382
2,BMG_MT000002,BMG_PT041364
3,BMG_MT000002,BMG_PT041046
4,BMG_MT000002,BMG_PT039489
...,...,...
863750,BMG_MT172954,BMG_PT009001
863752,BMG_MT172981,BMG_PT042111
863753,BMG_MT172981,BMG_PT090834
863754,BMG_MT172981,BMG_PT100169


### STITCH Mapping

Ensembl ID

In [12]:
ensembl_biomedgraphica = biomedgraphica_protein[['Ensembl_Protein_ID', 'BioMedGraphica_ID']]
ensembl_biomedgraphica.dropna(subset=['Ensembl_Protein_ID'], inplace=True)
ensembl_biomedgraphica = ensembl_biomedgraphica.assign(Ensembl_Protein_ID=ensembl_biomedgraphica['Ensembl_Protein_ID'].str.split(';')).explode('Ensembl_Protein_ID')
ensembl_to_individualID = ensembl_biomedgraphica.groupby('Ensembl_Protein_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

stitch_filter['To_ID'] = stitch_filter['Protein'].map(ensembl_to_individualID)
stitch_filter

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ensembl_biomedgraphica.dropna(subset=['Ensembl_Protein_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stitch_filter['To_ID'] = stitch_filter['Protein'].map(ensembl_to_individualID)


Unnamed: 0,Protein,CID,inchikey,To_ID
0,ENSP00000257254,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K,BMG_PT040452
1,ENSP00000302120,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K,BMG_PT101482
2,ENSP00000006777,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT062814
3,ENSP00000056217,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT043776
4,ENSP00000216085,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT099520
...,...,...,...,...
15473934,ENSP00000420588,1,,BMG_PT042466
15473935,ENSP00000436585,1,,BMG_PT092678
15473936,ENSP00000438144,1,,BMG_PT128776
15473937,ENSP00000448165,1,,BMG_PT033598


CID

In [13]:
cid_biomedgraphica = biomedgraphica_metabolite[['BioMedGraphica_ID', 'PubChem_CID']]
cid_biomedgraphica.dropna(subset=['PubChem_CID'], inplace=True)
cid_biomedgraphica = cid_biomedgraphica.assign(PubChem_CID=cid_biomedgraphica['PubChem_CID'].str.split(';')).explode('PubChem_CID')
cid_to_individualID = cid_biomedgraphica.groupby('PubChem_CID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

stitch_filter['From_CID'] = stitch_filter['CID'].map(cid_to_individualID)
stitch_filter

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cid_biomedgraphica.dropna(subset=['PubChem_CID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stitch_filter['From_CID'] = stitch_filter['CID'].map(cid_to_individualID)


Unnamed: 0,Protein,CID,inchikey,To_ID,From_CID
0,ENSP00000257254,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K,BMG_PT040452,
1,ENSP00000302120,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K,BMG_PT101482,
2,ENSP00000006777,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT062814,
3,ENSP00000056217,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT043776,
4,ENSP00000216085,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT099520,
...,...,...,...,...,...
15473934,ENSP00000420588,1,,BMG_PT042466,
15473935,ENSP00000436585,1,,BMG_PT092678,
15473936,ENSP00000438144,1,,BMG_PT128776,
15473937,ENSP00000448165,1,,BMG_PT033598,


InchiKey

In [16]:
inchikey_biomedgraphica = biomedgraphica_metabolite[['BioMedGraphica_ID', 'InChIKey']]
inchikey_biomedgraphica.dropna(subset=['InChIKey'], inplace=True)
inchikey_biomedgraphica = inchikey_biomedgraphica.assign(InChIKey=inchikey_biomedgraphica['InChIKey'].str.split(';')).explode('InChIKey')
inchikey_to_individualID = inchikey_biomedgraphica.groupby('InChIKey')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

stitch_filter['From_InChIKey'] = stitch_filter['inchikey'].map(inchikey_to_individualID)
stitch_filter

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inchikey_biomedgraphica.dropna(subset=['InChIKey'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stitch_filter['From_InChIKey'] = stitch_filter['inchikey'].map(inchikey_to_individualID)


Unnamed: 0,Protein,CID,inchikey,To_ID,From_CID,From_InChIKey
0,ENSP00000257254,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K,BMG_PT040452,,
1,ENSP00000302120,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K,BMG_PT101482,,
2,ENSP00000006777,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT062814,,
3,ENSP00000056217,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT043776,,
4,ENSP00000216085,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT099520,,
...,...,...,...,...,...,...
15473934,ENSP00000420588,1,,BMG_PT042466,,
15473935,ENSP00000436585,1,,BMG_PT092678,,
15473936,ENSP00000438144,1,,BMG_PT128776,,
15473937,ENSP00000448165,1,,BMG_PT033598,,


In [17]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

stitch_filter = merge_string_columns(stitch_filter, ['From_CID', 'From_InChIKey'], 'From_ID')
stitch_filter

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[combined_column_name] = df.apply(merge_strings, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns, inplace=True)


Unnamed: 0,Protein,CID,inchikey,To_ID,From_ID
0,ENSP00000257254,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K,BMG_PT040452,
1,ENSP00000302120,91758680,DMDVEPIJCJGHPE-UHFFFAOYSA-K,BMG_PT101482,
2,ENSP00000006777,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT062814,
3,ENSP00000056217,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT043776,
4,ENSP00000216085,91758408,AZJRSFUJNQNLRC-UHFFFAOYSA-N,BMG_PT099520,
...,...,...,...,...,...
15473934,ENSP00000420588,1,,BMG_PT042466,
15473935,ENSP00000436585,1,,BMG_PT092678,
15473936,ENSP00000438144,1,,BMG_PT128776,
15473937,ENSP00000448165,1,,BMG_PT033598,


In [18]:
metabolite_protein_stitch = stitch_filter[['From_ID', 'To_ID']]
metabolite_protein_stitch.replace('', pd.NA, inplace=True)
metabolite_protein_stitch['From_ID'].replace('', pd.NA, inplace=True)
metabolite_protein_stitch['To_ID'].replace('', pd.NA, inplace=True)
metabolite_protein_stitch.dropna(subset=['From_ID'], inplace=True)
metabolite_protein_stitch.dropna(subset=['To_ID'], inplace=True)
metabolite_protein_stitch

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_protein_stitch.replace('', pd.NA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  metabolite_protein_stitch['From_ID'].replace('', pd.NA, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_protein_stitch['From_ID'].replace('', pd.NA, inplace=True)
T

Unnamed: 0,From_ID,To_ID
3856,BMG_MT172744,BMG_PT032920
3857,BMG_MT172744,BMG_PT042982
3858,BMG_MT172744,BMG_PT041127
3859,BMG_MT172744,BMG_PT041155
3860,BMG_MT172744,BMG_PT041247
...,...,...
7436551,BMG_MT157080,BMG_PT033598
7436551,BMG_MT000128,BMG_PT033598
7436552,BMG_MT157082,BMG_PT042538
7436552,BMG_MT157080,BMG_PT042538


In [19]:
metabolite_protein_stitch['From_ID'] = metabolite_protein_stitch['From_ID'].str.split(';')
metabolite_protein_stitch['To_ID'] = metabolite_protein_stitch['To_ID'].str.split(';')
metabolite_protein_stitch = metabolite_protein_stitch.explode('From_ID')
metabolite_protein_stitch = metabolite_protein_stitch.explode('To_ID')
metabolite_protein_stitch['From_ID'] = metabolite_protein_stitch['From_ID'].str.strip()
metabolite_protein_stitch['To_ID'] = metabolite_protein_stitch['To_ID'].str.strip()
metabolite_protein_stitch.drop_duplicates(inplace=True)
metabolite_protein_stitch

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_protein_stitch['From_ID'] = metabolite_protein_stitch['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metabolite_protein_stitch['To_ID'] = metabolite_protein_stitch['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
3856,BMG_MT172744,BMG_PT032920
3857,BMG_MT172744,BMG_PT042982
3858,BMG_MT172744,BMG_PT041127
3859,BMG_MT172744,BMG_PT041155
3860,BMG_MT172744,BMG_PT041247
...,...,...
7436551,BMG_MT157080,BMG_PT033598
7436551,BMG_MT000128,BMG_PT033598
7436552,BMG_MT157082,BMG_PT042538
7436552,BMG_MT157080,BMG_PT042538


### Metabolite-Protein Relation

In [20]:
metabolite_protein_hmdb['Source1'] = 'HMDB'
metabolite_protein_stitch['Source2'] = 'STITCH'

metabolite_protein = pd.merge(metabolite_protein_hmdb, metabolite_protein_stitch, on=['From_ID', 'To_ID'], how='outer')
metabolite_protein = merge_string_columns(metabolite_protein, ['Source1', 'Source2'], 'Source')
metabolite_protein

Unnamed: 0,From_ID,To_ID,Source
0,BMG_MT000001,BMG_PT010663,STITCH
1,BMG_MT000001,BMG_PT013590,STITCH
2,BMG_MT000001,BMG_PT016722,STITCH
3,BMG_MT000001,BMG_PT033059,STITCH
4,BMG_MT000001,BMG_PT034409,STITCH
...,...,...,...
2804425,BMG_MT217942,BMG_PT103805,STITCH
2804426,BMG_MT217942,BMG_PT104066,STITCH
2804427,BMG_MT217942,BMG_PT111217,STITCH
2804428,BMG_MT217942,BMG_PT113932,STITCH


In [21]:
metabolite_protein['Type'] = 'Metabolite-Protein'

max_length = len(str(len(metabolite_protein)))
metabolite_protein['BioMedGraphica_ID'] = ['BMG_ED_MTPT' + str(i).zfill(max_length) for i in range(1, len(metabolite_protein) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in metabolite_protein.columns if col != 'BioMedGraphica_ID']  # re-order columns
metabolome_protein = metabolite_protein[columns]
metabolome_protein

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
0,BMG_ED_MTPT0000001,BMG_MT000001,BMG_PT010663,STITCH,Metabolite-Protein
1,BMG_ED_MTPT0000002,BMG_MT000001,BMG_PT013590,STITCH,Metabolite-Protein
2,BMG_ED_MTPT0000003,BMG_MT000001,BMG_PT016722,STITCH,Metabolite-Protein
3,BMG_ED_MTPT0000004,BMG_MT000001,BMG_PT033059,STITCH,Metabolite-Protein
4,BMG_ED_MTPT0000005,BMG_MT000001,BMG_PT034409,STITCH,Metabolite-Protein
...,...,...,...,...,...
2804425,BMG_ED_MTPT2804426,BMG_MT217942,BMG_PT103805,STITCH,Metabolite-Protein
2804426,BMG_ED_MTPT2804427,BMG_MT217942,BMG_PT104066,STITCH,Metabolite-Protein
2804427,BMG_ED_MTPT2804428,BMG_MT217942,BMG_PT111217,STITCH,Metabolite-Protein
2804428,BMG_ED_MTPT2804429,BMG_MT217942,BMG_PT113932,STITCH,Metabolite-Protein


In [22]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Metabolite-Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Metabolite_Protein.csv'
metabolite_protein.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Metabolite-Protein\BioMedGraphica_Metabolite_Protein.csv
