### DrugCentral

In [2]:
# Download Link: SQL filter
# Download Date: 2024-10-7
# Download Version: 2024-10-7
import pandas as pd

df_dc_id = pd.read_csv('drugcentral_drug_id.csv')
df_dc_id = df_dc_id[['identifier', 'id_type', 'struct_id']]
df_dc_id.head()

Unnamed: 0,identifier,id_type,struct_id
0,D00002,KEGG_DRUG,4140
1,D00003,KEGG_DRUG,4235
2,D00004,KEGG_DRUG,4256
3,D00007,KEGG_DRUG,1310
4,D00008,KEGG_DRUG,3281


In [3]:
def list_to_string(lst):
    if isinstance(lst, list):
        return ','.join(map(str, lst))
    return lst

df_dc_id_pivot = df_dc_id.pivot_table(index='struct_id', columns='id_type', values='identifier', aggfunc=lambda x: list(x)).reset_index()
df_dc_id_pivot_filter = df_dc_id_pivot[['struct_id', 'CHEBI', 'DRUGBANK_ID', 'PUBCHEM_CID']]
df_dc_id_pivot_filter = df_dc_id_pivot_filter.applymap(list_to_string)
df_dc_id_pivot_filter.head()

  df_dc_id_pivot_filter = df_dc_id_pivot_filter.applymap(list_to_string)


id_type,struct_id,CHEBI,DRUGBANK_ID,PUBCHEM_CID
0,4,CHEBI:6149,DB01002,92253
1,5,CHEBI:180904,,6604415
2,6,CHEBI:135522,,157702
3,13,,,688441
4,21,CHEBI:22526,DB08878,169371


In [4]:
# Download Link: SQL filter
# Download Date: 2024-10-7
# Download Version: 2024-10-7

df_dc_drug_dis = pd.read_csv('drugcentral_drug_disease.csv')
df_dc_drug_dis = df_dc_drug_dis[['struct_id', 'umls_cui', 'snomed_conceptid']]
df_dc_drug_dis.head()

Unnamed: 0,struct_id,umls_cui,snomed_conceptid
0,564,C0153225,151004.0
1,559,C0153225,151004.0
2,818,C0018824,368009.0
3,1572,C0018824,368009.0
4,1968,C0018824,368009.0


In [5]:
df_dc = pd.merge(df_dc_id_pivot_filter, df_dc_drug_dis, on='struct_id', how='right')
df_dc.drop(columns=['struct_id'], inplace=True)
df_dc

Unnamed: 0,CHEBI,DRUGBANK_ID,PUBCHEM_CID,umls_cui,snomed_conceptid
0,CHEBI:29007,DB01212,5479530,C0153225,151004.0
1,CHEBI:3508,DB00438,5481173,C0153225,151004.0
2,CHEBI:4453,DB00304,40973,C0018824,368009.0
3,CHEBI:6443,"DB00367,DB09389",13109,C0018824,368009.0
4,CHEBI:50815,DB00957,6540478,C0018824,368009.0
...,...,...,...,...,...
42302,CHEBI:3231,DB11148,2482,,
42303,CHEBI:204734,DB01260,5311066,,
42304,CHEBI:6532,DB00836,3955,,
42305,CHEBI:49005,DB01609,214348,,


In [6]:
# Split rows where the CHEBI, DRUGBANK_ID, or PUBCHEM_CID columns contain commas
df_split_all = df_dc.copy()

for col in ['DRUGBANK_ID', 'PUBCHEM_CID']:
    df_split_all = df_split_all.assign(**{col: df_split_all[col].str.split(',')}).explode(col)
    df_split_all[col] = df_split_all[col].str.strip()

df_split_all = df_split_all.rename(columns={'CHEBI': 'drug_chebi', 'DRUGBANK_ID': 'drug_drugbank', 'PUBCHEM_CID': 'drug_pubchem', 'umls_cui': 'disease_umls', 'snomed_conceptid': 'disease_snomed'})
df_split_all

Unnamed: 0,drug_chebi,drug_drugbank,drug_pubchem,disease_umls,disease_snomed
0,CHEBI:29007,DB01212,5479530,C0153225,151004.0
1,CHEBI:3508,DB00438,5481173,C0153225,151004.0
2,CHEBI:4453,DB00304,40973,C0018824,368009.0
3,CHEBI:6443,DB00367,13109,C0018824,368009.0
3,CHEBI:6443,DB09389,13109,C0018824,368009.0
...,...,...,...,...,...
42302,CHEBI:3231,DB11148,2482,,
42303,CHEBI:204734,DB01260,5311066,,
42304,CHEBI:6532,DB00836,3955,,
42305,CHEBI:49005,DB01609,214348,,


### BioMedgraphica ID

In [7]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_drug = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Drug' / 'BioMedGraphica_Drug.csv'
target_dir_disease = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease' / 'BioMedGraphica_Disease.csv'
biomedgraphica_disease = pd.read_csv(target_dir_disease, dtype=str)
biomedgraphica_drug = pd.read_csv(target_dir_drug, dtype=str)

### DrugCentral Mapping

In [8]:
db_id = biomedgraphica_drug[['DrugBank_ID', 'BioMedGraphica_ID']]
db_id.dropna(subset=['DrugBank_ID'], inplace=True)
db_id = db_id.assign(DrugBank_ID=db_id['DrugBank_ID'].str.split(';')).explode('DrugBank_ID')
db_id['DrugBank_ID'] = db_id['DrugBank_ID'].str.strip()

drugbank_to_individualid = db_id.groupby('DrugBank_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()
df_split_all['Drug_drugbank'] = df_split_all['drug_drugbank'].astype(str).map(drugbank_to_individualid)
df_split_all

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db_id.dropna(subset=['DrugBank_ID'], inplace=True)


Unnamed: 0,drug_chebi,drug_drugbank,drug_pubchem,disease_umls,disease_snomed,Drug_drugbank
0,CHEBI:29007,DB01212,5479530,C0153225,151004.0,BMG_DG166076
1,CHEBI:3508,DB00438,5481173,C0153225,151004.0,BMG_DG166102
2,CHEBI:4453,DB00304,40973,C0018824,368009.0,BMG_DG141079
3,CHEBI:6443,DB00367,13109,C0018824,368009.0,BMG_DG025668
3,CHEBI:6443,DB09389,13109,C0018824,368009.0,BMG_DG148275
...,...,...,...,...,...,...
42302,CHEBI:3231,DB11148,2482,,,BMG_DG124868
42303,CHEBI:204734,DB01260,5311066,,,BMG_DG157342
42304,CHEBI:6532,DB00836,3955,,,BMG_DG140299
42305,CHEBI:49005,DB01609,214348,,,BMG_DG116501;BMG_DG166533


In [9]:
cid_id = biomedgraphica_drug[['PubChem_CID', 'BioMedGraphica_ID']]
cid_id.dropna(subset=['PubChem_CID'], inplace=True)
cid_id['PubChem_CID'] = cid_id['PubChem_CID'].astype(str)
cid_id = cid_id.assign(PubChem_CID=cid_id['PubChem_CID'].str.split(';')).explode('PubChem_CID')
cid_id['PubChem_CID'] = cid_id['PubChem_CID'].str.strip()

cid_to_individualid = cid_id.groupby('PubChem_CID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()
df_split_all['Drug_pubchem'] = df_split_all['drug_pubchem'].astype(str).map(cid_to_individualid)
df_split_all

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cid_id.dropna(subset=['PubChem_CID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cid_id['PubChem_CID'] = cid_id['PubChem_CID'].astype(str)


Unnamed: 0,drug_chebi,drug_drugbank,drug_pubchem,disease_umls,disease_snomed,Drug_drugbank,Drug_pubchem
0,CHEBI:29007,DB01212,5479530,C0153225,151004.0,BMG_DG166076,BMG_DG166076
1,CHEBI:3508,DB00438,5481173,C0153225,151004.0,BMG_DG166102,BMG_DG166102
2,CHEBI:4453,DB00304,40973,C0018824,368009.0,BMG_DG141079,BMG_DG141079
3,CHEBI:6443,DB00367,13109,C0018824,368009.0,BMG_DG025668,BMG_DG025668
3,CHEBI:6443,DB09389,13109,C0018824,368009.0,BMG_DG148275,BMG_DG025668
...,...,...,...,...,...,...,...
42302,CHEBI:3231,DB11148,2482,,,BMG_DG124868,BMG_DG124868
42303,CHEBI:204734,DB01260,5311066,,,BMG_DG157342,BMG_DG157342
42304,CHEBI:6532,DB00836,3955,,,BMG_DG140299,BMG_DG140299
42305,CHEBI:49005,DB01609,214348,,,BMG_DG116501;BMG_DG166533,BMG_DG116501


In [10]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

df_split_all = merge_string_columns(df_split_all, ['Drug_drugbank', 'Drug_pubchem'], 'From_ID')
df_split_all.drop(columns=['drug_drugbank', 'drug_pubchem'], inplace=True)
df_split_all

Unnamed: 0,drug_chebi,disease_umls,disease_snomed,From_ID
0,CHEBI:29007,C0153225,151004.0,BMG_DG166076
1,CHEBI:3508,C0153225,151004.0,BMG_DG166102
2,CHEBI:4453,C0018824,368009.0,BMG_DG141079
3,CHEBI:6443,C0018824,368009.0,BMG_DG025668
3,CHEBI:6443,C0018824,368009.0,BMG_DG148275;BMG_DG025668
...,...,...,...,...
42302,CHEBI:3231,,,BMG_DG124868
42303,CHEBI:204734,,,BMG_DG157342
42304,CHEBI:6532,,,BMG_DG140299
42305,CHEBI:49005,,,BMG_DG116501;BMG_DG166533


In [11]:
umls_id = biomedgraphica_disease[['UMLS_ID', 'BioMedGraphica_ID']]
umls_id.dropna(subset=['UMLS_ID'], inplace=True)
umls_id = umls_id.assign(UMLS_ID=umls_id['UMLS_ID'].str.split(';')).explode('UMLS_ID')
umls_id['UMLS_ID'] = umls_id['UMLS_ID'].str.strip()

umls_to_individualid = umls_id.groupby('UMLS_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()
df_split_all['Disease_umls'] = df_split_all['disease_umls'].astype(str).map(umls_to_individualid)
df_split_all

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  umls_id.dropna(subset=['UMLS_ID'], inplace=True)


Unnamed: 0,drug_chebi,disease_umls,disease_snomed,From_ID,Disease_umls
0,CHEBI:29007,C0153225,151004.0,BMG_DG166076,BMG_DS004279
1,CHEBI:3508,C0153225,151004.0,BMG_DG166102,BMG_DS004279
2,CHEBI:4453,C0018824,368009.0,BMG_DG141079,BMG_DS001340
3,CHEBI:6443,C0018824,368009.0,BMG_DG025668,BMG_DS001340
3,CHEBI:6443,C0018824,368009.0,BMG_DG148275;BMG_DG025668,BMG_DS001340
...,...,...,...,...,...
42302,CHEBI:3231,,,BMG_DG124868,
42303,CHEBI:204734,,,BMG_DG157342,
42304,CHEBI:6532,,,BMG_DG140299,
42305,CHEBI:49005,,,BMG_DG116501;BMG_DG166533,


In [12]:
biomedgraphica_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118814 entries, 0 to 118813
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   BioMedGraphica_ID  118814 non-null  object
 1   SNOMEDCT_ID        44458 non-null   object
 2   UMLS_Name          69361 non-null   object
 3   MeSH_Name          7156 non-null    object
 4   ICD11_ID           40690 non-null   object
 5   ICD11_Title        40585 non-null   object
 6   ICD10_ID           21931 non-null   object
 7   DO_ID              10689 non-null   object
 8   DO_Name            10689 non-null   object
 9   UMLS_ID            77484 non-null   object
 10  MeSH_ID            12485 non-null   object
 11  OMIM_ID            12902 non-null   object
 12  MONDO_ID           21598 non-null   object
 13  MONDO_Name         21598 non-null   object
 14  SNOMEDCT_Name      44458 non-null   object
dtypes: object(15)
memory usage: 13.6+ MB


In [13]:
snomed_id = biomedgraphica_disease[['SNOMEDCT_ID', 'BioMedGraphica_ID']]
snomed_id.dropna(subset=['SNOMEDCT_ID'], inplace=True)
snomed_id = snomed_id.assign(SNOMEDCT_ID=snomed_id['SNOMEDCT_ID'].str.split(';')).explode('SNOMEDCT_ID')
snomed_id['SNOMEDCT_ID'] = snomed_id['SNOMEDCT_ID'].str.strip()

snomed_to_individualid = snomed_id.groupby('SNOMEDCT_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

# Fill NaN values with a placeholder (e.g., 0) and convert the column
df_split_all['disease_snomed'] = df_split_all['disease_snomed'].fillna(0).astype(float).astype(int).astype(str)
df_split_all['Disease_snomed'] = df_split_all['disease_snomed'].map(snomed_to_individualid)
df_split_all = merge_string_columns(df_split_all, ['Disease_umls', 'Disease_snomed'], 'To_ID')
df_split_all

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snomed_id.dropna(subset=['SNOMEDCT_ID'], inplace=True)


Unnamed: 0,drug_chebi,disease_umls,disease_snomed,From_ID,To_ID
0,CHEBI:29007,C0153225,151004,BMG_DG166076,BMG_DS004279
1,CHEBI:3508,C0153225,151004,BMG_DG166102,BMG_DS004279
2,CHEBI:4453,C0018824,368009,BMG_DG141079,BMG_DS001340
3,CHEBI:6443,C0018824,368009,BMG_DG025668,BMG_DS001340
3,CHEBI:6443,C0018824,368009,BMG_DG148275;BMG_DG025668,BMG_DS001340
...,...,...,...,...,...
42302,CHEBI:3231,,0,BMG_DG124868,
42303,CHEBI:204734,,0,BMG_DG157342,
42304,CHEBI:6532,,0,BMG_DG140299,
42305,CHEBI:49005,,0,BMG_DG116501;BMG_DG166533,


In [14]:
drug_disease = df_split_all[['From_ID', 'To_ID']]
drug_disease['From_ID'].replace('', pd.NA, inplace=True)
drug_disease['To_ID'].replace('', pd.NA, inplace=True)
drug_disease.dropna(subset=['From_ID'], inplace=True)
drug_disease.dropna(subset=['To_ID'], inplace=True)
drug_disease

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  drug_disease['From_ID'].replace('', pd.NA, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_disease['From_ID'].replace('', pd.NA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df

Unnamed: 0,From_ID,To_ID
0,BMG_DG166076,BMG_DS004279
1,BMG_DG166102,BMG_DS004279
2,BMG_DG141079,BMG_DS001340
3,BMG_DG025668,BMG_DS001340
3,BMG_DG148275;BMG_DG025668,BMG_DS001340
...,...,...
39492,BMG_DG198599;BMG_DG264608,BMG_DS059590
39498,BMG_DG223724,BMG_DS029272
39515,BMG_DG143463,BMG_DS059590
39842,BMG_DG264771,BMG_DS029272


In [15]:
drug_disease['From_ID'] = drug_disease['From_ID'].str.split(';')
drug_disease = drug_disease.explode('From_ID')
drug_disease['From_ID'] = drug_disease['From_ID'].str.strip()

drug_disease['To_ID'] = drug_disease['To_ID'].str.split(';')
drug_disease = drug_disease.explode('To_ID')
drug_disease['To_ID'] = drug_disease['To_ID'].str.strip()
drug_disease.drop_duplicates(inplace=True)
drug_disease.reset_index(drop=True, inplace=True)
drug_disease

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_disease['From_ID'] = drug_disease['From_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_DG166076,BMG_DS004279
1,BMG_DG166102,BMG_DS004279
2,BMG_DG141079,BMG_DS001340
3,BMG_DG025668,BMG_DS001340
4,BMG_DG148275,BMG_DS001340
...,...,...
39972,BMG_DG264608,BMG_DS059590
39973,BMG_DG223724,BMG_DS029272
39974,BMG_DG143463,BMG_DS059590
39975,BMG_DG264771,BMG_DS029272


### Drug-Disease Relation

In [16]:
drug_disease['Source'] = 'DrugCentral'
drug_disease['Type'] = 'Drug-Disease'

max_length = len(str(len(drug_disease)))
drug_disease['BioMedGraphica_ID'] = ['BMG_ED_DGDS' + str(i).zfill(max_length) for i in range(1, len(drug_disease) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in drug_disease.columns if col != 'BioMedGraphica_ID']  # re-order columns
drug_disease = drug_disease[columns]
drug_disease

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
0,BMG_ED_DGDS00001,BMG_DG166076,BMG_DS004279,DrugCentral,Drug-Disease
1,BMG_ED_DGDS00002,BMG_DG166102,BMG_DS004279,DrugCentral,Drug-Disease
2,BMG_ED_DGDS00003,BMG_DG141079,BMG_DS001340,DrugCentral,Drug-Disease
3,BMG_ED_DGDS00004,BMG_DG025668,BMG_DS001340,DrugCentral,Drug-Disease
4,BMG_ED_DGDS00005,BMG_DG148275,BMG_DS001340,DrugCentral,Drug-Disease
...,...,...,...,...,...
39972,BMG_ED_DGDS39973,BMG_DG264608,BMG_DS059590,DrugCentral,Drug-Disease
39973,BMG_ED_DGDS39974,BMG_DG223724,BMG_DS029272,DrugCentral,Drug-Disease
39974,BMG_ED_DGDS39975,BMG_DG143463,BMG_DS059590,DrugCentral,Drug-Disease
39975,BMG_ED_DGDS39976,BMG_DG264771,BMG_DS029272,DrugCentral,Drug-Disease


In [17]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Drug-Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Drug_Disease.csv'
drug_disease.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Drug-Disease\BioMedGraphica_Drug_Disease.csv
