### SIDER

In [1]:
# Download Link: http://sideeffects.embl.de/media/download/meddra_all_se.tsv.gz
# Download Date: 2025-03-21
# Download Version: 2015-10-21

import pandas as pd

# Load the SIDER side-effect table. Column names are supplied explicitly via
# `names=`, so the file is read as having no header row.
sider_columns = ['STITCH_FLAT', 'STITCH_STEREO', 'UMLS_CONCEPT_ID_LABEL', 'MedDRA concept type', 'UMLS_CONCEPT_ID_MEDDRA', 'side effect name']
sider_raw = pd.read_csv('meddra_all_se.tsv', sep='\t', names=sider_columns)

# Build the (drug, side effect) pair table as one chain so that every step
# returns a fresh frame. Assigning a column onto a filtered/sliced frame is the
# pattern that triggers pandas' SettingWithCopyWarning.
df_sider = (
    sider_raw
    # keep rows whose MedDRA concept type is 'PT' (Preferred Term) and only the
    # two columns needed downstream
    .loc[sider_raw['MedDRA concept type'] == 'PT', ['STITCH_STEREO', 'UMLS_CONCEPT_ID_MEDDRA']]
    # remove the literal 'CID' prefix; the remaining digits are still zero-padded
    .assign(STITCH_STEREO=lambda pairs: pairs['STITCH_STEREO'].str.replace('CID', '', regex=False))
    .dropna()
    .drop_duplicates()
)
df_sider

Unnamed: 0,STITCH_STEREO,UMLS_CONCEPT_ID_MEDDRA
1,000010917,C0000737
3,000010917,C0687713
6,000010917,C0002418
8,000010917,C0002871
10,000010917,C0232462
...,...,...
309836,071306834,C2364111
309838,071306834,C2830004
309840,071306834,C2979982
309843,071306834,C3203358


### BioMedGraphica ID

In [4]:
import pandas as pd
import os
from pathlib import Path

# Locate the BioMedGraphica entity tables relative to the working directory.
current_working_dir = Path.cwd().resolve()
grandparent_dir = current_working_dir.parent.parent.parent  # three levels above the cwd
entity_dir = grandparent_dir / 'BioMedGraphica' / 'Entity'
target_dir_phenotype = entity_dir / 'Phenotype' / 'BioMedGraphica_Phenotype.csv'
target_dir_drug = entity_dir / 'Drug' / 'BioMedGraphica_Drug.csv'

# Read both tables with every column typed as string (phenotype first, then drug).
biomedgraphica_phenotype, biomedgraphica_drug = (
    pd.read_csv(entity_csv, dtype=str)
    for entity_csv in (target_dir_phenotype, target_dir_drug)
)

### SIDER Mapping

In [5]:
# One row per (PubChem CID, BioMedGraphica drug ID) pair.
# PubChem_CID may hold several ';'-separated CIDs, so it is split and exploded.
# The whole transformation is a single chain: `.assign` returns a new frame, so
# no column is ever written onto a slice of `biomedgraphica_drug`
# (avoids SettingWithCopyWarning).
cid_individual = (
    biomedgraphica_drug[['PubChem_CID', 'BioMedGraphica_ID']]
    .dropna(subset=['PubChem_CID'])  # drugs without any CID cannot be matched
    .assign(PubChem_CID=lambda drugs: drugs['PubChem_CID'].str.split(';'))
    .explode('PubChem_CID')
    .drop_duplicates()
)
cid_individual

Unnamed: 0,PubChem_CID,BioMedGraphica_ID
0,1,BMG_DG000001
1,1000,BMG_DG000002
2,10000,BMG_DG000003
3,100001,BMG_DG000004
4,10000220,BMG_DG000005
...,...,...
220850,9999342,BMG_DG220851
220851,9999516,BMG_DG220852
220852,9999932,BMG_DG220853
220853,9999996,BMG_DG220854


In [6]:
# Lookup table: PubChem CID -> BioMedGraphica drug ID(s).
# When several drug IDs share one CID they are joined with ';'.
cid_to_individual = (
    cid_individual
    .groupby('PubChem_CID')['BioMedGraphica_ID']
    .apply(lambda ids: ';'.join(ids.dropna().unique()))
    .to_dict()
)

# Strip the zero padding from the STITCH stereo ID so it can be looked up in the
# CID dictionary. The vectorised `.str` accessor replaces the per-row lambda;
# the column is expected to hold only strings at this point (NaN rows were
# dropped when df_sider was built).
stripped_cid = df_sider['STITCH_STEREO'].str.lstrip('0')

# Rebind df_sider to a new frame instead of writing columns into the existing
# one, which pandas may still track as a slice of an earlier frame.
df_sider = df_sider.assign(
    STITCH_STEREO=stripped_cid,
    From_ID=stripped_cid.map(cid_to_individual),  # NaN where the CID is unknown
)
df_sider

Unnamed: 0,STITCH_STEREO,UMLS_CONCEPT_ID_MEDDRA,From_ID
1,10917,C0000737,BMG_DG007978
3,10917,C0687713,BMG_DG007978
6,10917,C0002418,BMG_DG007978
8,10917,C0002871,BMG_DG007978
10,10917,C0232462,BMG_DG007978
...,...,...,...
309836,71306834,C2364111,BMG_DG186618
309838,71306834,C2830004,BMG_DG186618
309840,71306834,C2979982,BMG_DG186618
309843,71306834,C3203358,BMG_DG186618


In [7]:
# One row per (UMLS ID, BioMedGraphica phenotype ID) pair.
# UMLS_ID may hold several ';'-separated IDs, so it is split and exploded.
# Built as a chain so no column is assigned onto a slice of
# `biomedgraphica_phenotype` (avoids SettingWithCopyWarning).
umls_individual = (
    biomedgraphica_phenotype[['UMLS_ID', 'BioMedGraphica_ID']]
    .dropna(subset=['UMLS_ID'])  # phenotypes without a UMLS ID cannot be matched
    .assign(UMLS_ID=lambda phenotypes: phenotypes['UMLS_ID'].str.split(';'))
    .explode('UMLS_ID')
    .drop_duplicates()
)

# Lookup table: UMLS ID -> BioMedGraphica phenotype ID(s), joined with ';'
# when several phenotype IDs share one UMLS ID.
umls_to_individual = (
    umls_individual
    .groupby('UMLS_ID')['BioMedGraphica_ID']
    .apply(lambda ids: ';'.join(ids.dropna().unique()))
    .to_dict()
)

# Rebind df_sider to a new frame carrying the mapped phenotype ID; unmapped
# UMLS concepts become NaN in To_ID.
df_sider = df_sider.assign(To_ID=df_sider['UMLS_CONCEPT_ID_MEDDRA'].map(umls_to_individual))
df_sider

Unnamed: 0,STITCH_STEREO,UMLS_CONCEPT_ID_MEDDRA,From_ID,To_ID
1,10917,C0000737,BMG_DG007978,BMG_PH01524
3,10917,C0687713,BMG_DG007978,
6,10917,C0002418,BMG_DG007978,BMG_PH00510
8,10917,C0002871,BMG_DG007978,BMG_PH01423
10,10917,C0232462,BMG_DG007978,BMG_PH03188
...,...,...,...,...
309836,71306834,C2364111,BMG_DG186618,BMG_PH15825
309838,71306834,C2830004,BMG_DG186618,BMG_PH00980
309840,71306834,C2979982,BMG_DG186618,
309843,71306834,C3203358,BMG_DG186618,BMG_PH02077


In [8]:
# Keep only the pairs for which both the drug (From_ID) and the side effect
# (To_ID) were mapped; a row missing either ID is discarded.
sider_drug_phenotype = df_sider[['From_ID', 'To_ID']].dropna(subset=['From_ID', 'To_ID'])
sider_drug_phenotype.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91692 entries, 1 to 309843
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   From_ID  91692 non-null  object
 1   To_ID    91692 non-null  object
dtypes: object(2)
memory usage: 2.1+ MB


In [9]:
# From_ID / To_ID may each hold several ';'-joined BioMedGraphica IDs.
# Expand them so every row is exactly one drug -> phenotype edge, then remove
# the duplicate edges the expansion can create.
# `.assign` returns a new frame at each step, so nothing is written onto the
# slice-derived frame from the previous cell (avoids SettingWithCopyWarning).
sider_drug_phenotype = (
    sider_drug_phenotype
    .assign(From_ID=lambda edges: edges['From_ID'].str.split(';'))
    .explode('From_ID')
    .assign(To_ID=lambda edges: edges['To_ID'].str.split(';'))
    .explode('To_ID')
    .drop_duplicates()
)
sider_drug_phenotype

Unnamed: 0,From_ID,To_ID
1,BMG_DG007978,BMG_PH01524
6,BMG_DG007978,BMG_PH00510
8,BMG_DG007978,BMG_PH01423
10,BMG_DG007978,BMG_PH03188
12,BMG_DG007978,BMG_PH00585
...,...,...
309823,BMG_DG186618,BMG_PH01210
309832,BMG_DG186618,BMG_PH01256
309836,BMG_DG186618,BMG_PH15825
309838,BMG_DG186618,BMG_PH00980


### Drug-Phenotype Relation

In [10]:
# Sequential edge IDs: 'BMG_ED_DGPH' followed by a 1-based counter, zero-padded
# to the number of digits in the total edge count.
max_length = len(str(len(sider_drug_phenotype)))
edge_ids = ['BMG_ED_DGPH' + str(i).zfill(max_length) for i in range(1, len(sider_drug_phenotype) + 1)]

# Attach provenance columns and the IDs via `.assign`, which returns a new frame
# rather than writing into the result of the previous cell's drop_duplicates()
# (pandas may still track that frame as a slice and warn on assignment).
sider_drug_phenotype = sider_drug_phenotype.assign(
    Source='SIDER',
    Type='Drug-Phenotype',
    BioMedGraphica_ID=edge_ids,
)

# Re-order columns so the edge ID comes first.
columns = ['BioMedGraphica_ID'] + [col for col in sider_drug_phenotype.columns if col != 'BioMedGraphica_ID']
sider_drug_phenotype = sider_drug_phenotype[columns]
sider_drug_phenotype

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
1,BMG_ED_DGPH00001,BMG_DG007978,BMG_PH01524,SIDER,Drug-Phenotype
6,BMG_ED_DGPH00002,BMG_DG007978,BMG_PH00510,SIDER,Drug-Phenotype
8,BMG_ED_DGPH00003,BMG_DG007978,BMG_PH01423,SIDER,Drug-Phenotype
10,BMG_ED_DGPH00004,BMG_DG007978,BMG_PH03188,SIDER,Drug-Phenotype
12,BMG_ED_DGPH00005,BMG_DG007978,BMG_PH00585,SIDER,Drug-Phenotype
...,...,...,...,...,...
309823,BMG_ED_DGPH93822,BMG_DG186618,BMG_PH01210,SIDER,Drug-Phenotype
309832,BMG_ED_DGPH93823,BMG_DG186618,BMG_PH01256,SIDER,Drug-Phenotype
309836,BMG_ED_DGPH93824,BMG_DG186618,BMG_PH15825,SIDER,Drug-Phenotype
309838,BMG_ED_DGPH93825,BMG_DG186618,BMG_PH00980,SIDER,Drug-Phenotype


In [11]:
import os
from pathlib import Path

# Resolve the directory three levels above the current working directory.
current_working_dir = Path.cwd().resolve()
grandparent_dir = current_working_dir.parent.parent.parent

# Relation tables are written under BioMedGraphica/Relation/Drug-Phenotype;
# create the folder (and any missing parents) on first use.
target_folder = grandparent_dir.joinpath('BioMedGraphica', 'Relation', 'Drug-Phenotype')
folder_missing = not target_folder.exists()
if folder_missing:
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

# Write the edge table without the DataFrame index.
output_file_path = target_folder.joinpath('BioMedGraphica_Drug_Phenotype.csv')
sider_drug_phenotype.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Drug-Phenotype\BioMedGraphica_Drug_Phenotype.csv
