In [2]:
import pandas as pd
import requests
import warnings

We are using the datasets listed in the Supporting Information section for the PNAS publication link: https://www.pnas.org/doi/suppl/10.1073/pnas.1803294115

The two that we need for this project are Datasets S01 and S02, which are also under `data/supplement_excel`. S01 describes the different Drug-Drug Interaction (DDI) types and the sentence structure associated with each. S02 contains the base data which was used in the original paper, however it is missing some crucial information necessary for model training including the SMILES representation and the DDI type. This notebook will primarily process S02 to create a ready-to-use dataset for model training. 

First, dataset S02 is manually converted to CSV format and columns renamed 

'Sentences describing the reported drug-drug interactions' --> 'gt' 

'Data type used to optimize the DNN architecture' --> 'set'

The resulting file is in 'data/ds_s2.csv'

In [99]:
original_ds = pd.read_csv('data/ds_s2.csv')

original_ds.head()

Unnamed: 0,gt,set
0,DB04571 may increase the photosensitizing acti...,training
1,DB00855 may increase the photosensitizing acti...,training
2,DB09536 may increase the photosensitizing acti...,training
3,DB01600 may increase the photosensitizing acti...,validation
4,DB09000 may increase the photosensitizing acti...,testing


We can see from reading through Dataset S01 that there are three types of sentence structures for Dataset S02 regarding the order of the drugs mentioned: 

1. Drug A ... Drug B ... (AB)
2. Drug B ... Drug A ... (BA)
3. Drug B ... Drug B ... Drug A (BBA)

The DDI types were manually sorted into these three types and saved in `ddi_types_ab.csv`, `ddi_types_ba.csv`, and `ddi_types_bba.csv` where the column `type` is the DDI type listed in Dataset S01, and the column `structure` is the original sentence structure listed in Dataset S01. 

In [None]:
ddi_types_ab = pd.read_csv('data/ddi_types_ab.csv')
ddi_types_ba = pd.read_csv('data/ddi_types_ba.csv')
ddi_types_bba = pd.read_csv('data/ddi_types_bba.csv')

From these structures, we can create regular expression patterns to extract the Drug Bank ID from Dataset S02: 

In [23]:
def get_ddi_regex_df(ddi_types_df): 
    def replace_and_count(text):
        replacements = {'Drug a': '(\w+)', 'Drug b': '(\w+)'}
        num_replacements = 0
        for drug, regex in replacements.items():
            # Count occurrences
            num_replacements += text.count(drug)
            # Replace
            text = text.replace(drug, regex)
        return text, num_replacements

    # Apply the function to each row in the dataframe
    ddi_types_df['result'] = ddi_types_df['structure'].apply(replace_and_count)

    # Separate the tuple into two columns
    ddi_types_df[['regex', 'num']] = pd.DataFrame(ddi_types_df['result'].tolist(), index=ddi_types_df.index)

    # Drop the original structure and result columns
    data = ddi_types_df.drop(columns=['result', 'num'])
    
    return data

In [24]:
ab_regex_df = get_ddi_regex_df(ddi_types_ab)
ba_regex_df = get_ddi_regex_df(ddi_types_ba)
bba_regex_df = get_ddi_regex_df(ddi_types_bba)

ab_regex_df.head()

Next, we extract the Drug Bank IDs for Drug A and Drug B from the ground truth sentences in Dataset S02 using the regular expression structures generated earlier for each of the three sentence structure types. 

In [63]:
extracted_df = pd.DataFrame()

with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    
    for idx, row in ab_regex_df.iterrows():
        regex_pattern = row.regex
        condition = original_ds['gt'].str.contains(regex_pattern, regex=True)
        matches = original_ds[condition].copy()
        
        if len(matches) == 0: 
            continue
        
        extracted_data = matches['gt'].str.extract(regex_pattern)

        a, b = zip(*extracted_data.dropna().values)
        matches.loc[condition, 'drug_a'] = a
        matches.loc[condition, 'drug_b'] = b
        matches.loc[condition, 'interaction'] = row.type
        extracted_df = pd.concat([extracted_df, matches])

    for idx, row in ba_regex_df.iterrows():
        regex_pattern = row.regex
        condition = original_ds['gt'].str.contains(regex_pattern, regex=True)
        matches = original_ds[condition].copy()
        
        if len(matches) == 0: 
            continue
        
        extracted_data = matches['gt'].str.extract(regex_pattern)

        b, a = zip(*extracted_data.dropna().values)
        matches.loc[condition, 'drug_a'] = a
        matches.loc[condition, 'drug_b'] = b
        matches.loc[condition, 'interaction'] = row.type
        extracted_df = pd.concat([extracted_df, matches])
        
    for idx, row in bba_regex_df.iterrows():
        regex_pattern = row.regex
        condition = original_ds['gt'].str.contains(regex_pattern, regex=True)
        matches = original_ds[condition].copy()
        
        if len(matches) == 0: 
            continue
        
        extracted_data = matches['gt'].str.extract(regex_pattern)

        b, _, a = zip(*extracted_data.dropna().values)
        matches.loc[condition, 'drug_a'] = a
        matches.loc[condition, 'drug_b'] = b
        matches.loc[condition, 'interaction'] = row.type
        extracted_df = pd.concat([extracted_df, matches])

In [100]:
extracted_df.head()

Unnamed: 0,gt,set,drug_b,drug_a,interaction
143295,DB11126 can cause a decrease in the absorption...,training,DB00827,DB11126,1.0
143296,DB00375 can cause a decrease in the absorption...,validation,DB00240,DB00375,1.0
143297,DB00375 can cause a decrease in the absorption...,testing,DB01250,DB00375,1.0
143298,DB09275 can cause a decrease in the absorption...,validation,DB01615,DB09275,1.0
143299,DB00326 can cause a decrease in the absorption...,validation,DB00467,DB00326,1.0


Now we would like to get the SMILES representation for each drug, which we can get from go.drugbank.com using the extracted DrugBank ID

In [65]:
unique_drugs = pd.concat([extracted_df['drug_a'], extracted_df['drug_b']]).unique()

In [70]:
def fetch_smiles(db_drug_id):
    
    base_url = "https://go.drugbank.com/structures/small_molecule_drugs/{}.smiles"
    url = base_url.format(db_drug_id)
    
    result = ""
    
    try:
        response = requests.get(url)
        if response.status_code == 200:
            result = response.text.strip()
        else:
            result = None
    except Exception as e:
        result = "error"
    
    return result

def fetch_smiles_df(db_drug_id_lst): 
    
    results = {}
    
    for idx, db_drug_id in enumerate(db_drug_id_lst): 
        smiles = fetch_smiles(db_drug_id)
        
        print(f'{idx} / {len(db_drug_id_lst)}: {smiles}')

        results[db_drug_id] = smiles
            
    results_df = pd.DataFrame(list(results.items()), columns=['db_drug_id', 'smiles'])
    
    return results_df

In [71]:
db_drug_smiles = fetch_smiles_df(unique_drugs)
db_drug_smiles.to_csv("data/drug_id_to_smiles.csv", index=None)

0 / 1709: [Ca++].[H][C@@](O)(CO)[C@@]([H])(O)[C@]([H])(O)[C@@]([H])(O)C([O-])=O.[H][C@@](O)(CO)[C@@]([H])(O)[C@]([H])(O)[C@@]([H])(O)C([O-])=O
1 / 1709: None
2 / 1709: [K+].[K+].[K+].[K+].[K+].[Bi+3].[O-]C(=O)CC([O-])(CC([O-])=O)C([O-])=O.[O-]C(=O)CC([O-])(CC([O-])=O)C([O-])=O
3 / 1709: [Ca++].OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(O)C([O-])=O.OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(O)C([O-])=O
4 / 1709: NC(CO)(CO)CO
5 / 1709: [Cl-].[Cl-].[Ca++]
6 / 1709: O.O[Al](O)O.O[Al](O)OS(=O)(=O)OC[C@H]1O[C@@](COS(=O)(=O)O[Al](O)O)(O[C@H]2O[C@H](COS(=O)(=O)O[Al](O)O)[C@@H](OS(=O)(=O)O[Al](O)O)[C@H](OS(=O)(=O)O[Al](O)O)[C@H]2OS(=O)(=O)O[Al](O)O)[C@@H](OS(=O)(=O)O[Al](O)O)[C@@H]1OS(=O)(=O)O[Al](O)O
7 / 1709: O.[Mg++].[Al+3].[Al+3].[O-][Si]([O-])([O-])[O-].[O-][Si]([O-])([O-])[O-]
8 / 1709: O.[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[Mg++].[Mg++].[Mg++].[Mg+

In [83]:
db_drug_smiles = pd.read_csv("data/drug_id_to_smiles.csv", index_col=None)

In [92]:
dataset = extracted_df

Now we can merge the extracted Drug Bank IDs with the retrieved SMILES representations to create the full dataset. 

In [93]:
# Merging for drug_a
dataset = dataset.merge(db_drug_smiles, left_on='drug_a', right_on='db_drug_id', how='left')
dataset.rename(columns={'smiles': 'drug_a_smiles'}, inplace=True)
dataset.drop(columns=['db_drug_id'], inplace=True)

# Merging for drug_b
dataset = dataset.merge(db_drug_smiles, left_on='drug_b', right_on='db_drug_id', how='left')
dataset.rename(columns={'smiles': 'drug_b_smiles'}, inplace=True)
dataset.drop(columns=['db_drug_id'], inplace=True)

# Remove rows for which we did not find a SMILES representation
dataset = dataset[~(dataset.drug_a_smiles.isna() | dataset.drug_b_smiles.isna())]

In [97]:
dataset.to_csv("data/full_ds.csv", index=None)