In [36]:
import pandas as pd
import numpy as np
from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import AllChem
import requests
from itertools import product

`Dataset_S02` from PNAS publication link: https://www.pnas.org/doi/suppl/10.1073/pnas.1803294115
The original file is an XLSX file, so it is converted to CSV format and columns renamed 

'Sentences describing the reported drug-drug interactions' --> 'gt' 

'Data type used to optimize the DNN architecture' --> 'set'

The resulting file is in 'data/ds_s2.csv'

In [21]:
full_ds = pd.read_csv('data/ds_s2.csv')

For the initial testing, we will only train a model for DDI type 6 "The metabolism of Drug b can be decreased when combined with Drug a."

In [25]:
metabolism_ds = full_ds[full_ds['gt'].str.contains("metabolism")].reset_index(drop=True)
metabolism_ds = metabolism_ds[metabolism_ds['gt'].str.contains("decreased")].reset_index(drop=True)
# metabolism_ds.to_csv('data/metabolism_ds_raw.csv')

In [None]:
metabolism_ds = pd.read_csv('data/metabolism_ds_raw.csv')

Next, we extract Drug A and Drug B from the ground truth sentences from the data assuming the structure "The metabolism of \<Drug b> can be decreased when combined with \<Drug a>."


In [28]:
regex_pattern = r"The metabolism of (\w+) can be decreased when combined with (\w+)\."

# Extract drug identifiers
metabolism_ds['drug_b'], metabolism_ds['drug_a'] = zip(*metabolism_ds['gt'].str.extract(regex_pattern).values)

Now we would like to get the SMILES representation for each, which we can get from go.drugbank.com using the provided drugbank ID

In [30]:
unique_drugs = pd.concat([metabolism_ds['drug_a'], metabolism_ds['drug_b']]).unique()

In [23]:
def fetch_smiles(db_drug_id):
    
    base_url = "https://go.drugbank.com/structures/small_molecule_drugs/{}.smiles"
    url = base_url.format(db_drug_id)
    
    result = ""
    
    try:
        response = requests.get(url)
        if response.status_code == 200:
            result = response.text.strip()
        else:
            result = None
    except Exception as e:
        result = "error"
    
    return result

def fetch_smiles_df(db_drug_id_lst): 
    
    results = {}
    
    for idx, db_drug_id in enumerate(db_drug_id_lst): 
        smiles = fetch_smiles(db_drug_id)
        if smiles != None or smiles != "error": 
            results[db_drug_id] = smiles
            
    results_df = pd.DataFrame(list(results.items()), columns=['db_drug_id', 'smiles'])
    
    return results_df

In [None]:
db_drug_smiles = fetch_smiles_df(unique_drugs)

In [32]:
# db_drug_smiles.to_csv("data/metabolism_db_drug_id_to_smiles.csv", index=None)
db_drug_smiles = pd.read_csv("data/metabolism_db_drug_id_to_smiles.csv")

In [33]:
# Merging for drug_a
metabolism_ds = metabolism_ds.merge(db_drug_smiles, left_on='drug_a', right_on='db_drug_id', how='left')
metabolism_ds.rename(columns={'smiles': 'drug_a_smiles'}, inplace=True)
metabolism_ds.drop(columns=['db_drug_id'], inplace=True)

# Merging for drug_b
metabolism_ds = metabolism_ds.merge(db_drug_smiles, left_on='drug_b', right_on='db_drug_id', how='left')
metabolism_ds.rename(columns={'smiles': 'drug_b_smiles'}, inplace=True)
metabolism_ds.drop(columns=['db_drug_id'], inplace=True)

In [34]:
# metabolism_ds.to_csv("data/true_metabolism_clean_ds.csv", index=None)

This dataset only contains positive cases of the indicated interaction, so we also create negative casees of the indicated interaction the model can learn from. 

In [37]:
# Step 1: Create a set of all unique drugs
all_drugs = set(metabolism_ds['drug_a']).union(set(metabolism_ds['drug_b']))

# Step 2: Create all possible pairs
all_possible_pairs = pd.DataFrame(product(all_drugs, all_drugs), columns=['drug_a', 'drug_b'])

# Remove self-pairs (where a drug pairs with itself)
all_possible_pairs = all_possible_pairs[all_possible_pairs['drug_a'] != all_possible_pairs['drug_b']]

# Create a set of tuples for existing interactions considering both orderings
interaction_set = set()
for idx, row in metabolism_ds.iterrows():
    interaction_set.add((row['drug_a'], row['drug_b']))
    interaction_set.add((row['drug_b'], row['drug_a']))

# Filter the dataframe to exclude any pairs found in the interaction set
non_interacting_pairs = all_possible_pairs[
    ~all_possible_pairs.apply(lambda x: (x['drug_a'], x['drug_b']) in interaction_set, axis=1)
]

# Step 3: Add SMILES Information
# Map to get SMILES for each drug
smiles_map = pd.concat([
    metabolism_ds[['drug_a', 'drug_a_smiles']].rename(columns={'drug_a': 'drug', 'drug_a_smiles': 'smiles'}),
    metabolism_ds[['drug_b', 'drug_b_smiles']].rename(columns={'drug_b': 'drug', 'drug_b_smiles': 'smiles'})
]).drop_duplicates().set_index('drug')['smiles']

# Add SMILES information to the non-interacting pairs
non_interacting_pairs['drug_a_smiles'] = non_interacting_pairs['drug_a'].map(smiles_map)
non_interacting_pairs['drug_b_smiles'] = non_interacting_pairs['drug_b'].map(smiles_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_interacting_pairs['drug_a_smiles'] = non_interacting_pairs['drug_a'].map(smiles_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_interacting_pairs['drug_b_smiles'] = non_interacting_pairs['drug_b'].map(smiles_map)


In [38]:
# non_interacting_pairs.to_csv('data/non_interacting_drug_pairs.csv', index=None)

We can now combine the provided data and the negative examples we created to make our full dataset for model training

In [39]:
interacting_pairs = pd.read_csv('data/true_metabolism_clean_ds.csv')
interacting_pairs['interaction'] = 1
interacting_pairs = interacting_pairs[~(interacting_pairs['drug_a_smiles'].isna() | interacting_pairs['drug_b_smiles'].isna())]

# Load the non-interacting pairs dataset
non_interacting_pairs = pd.read_csv('data/non_interacting_drug_pairs.csv')
non_interacting_pairs['interaction'] = 0
non_interacting_pairs = non_interacting_pairs[~(non_interacting_pairs['drug_a_smiles'].isna() | non_interacting_pairs['drug_b_smiles'].isna())]


# Ensure the columns are aligned and in the same order
columns = ['drug_a', 'drug_b', 'drug_a_smiles', 'drug_b_smiles', 'interaction']
interacting_pairs = interacting_pairs[columns]
non_interacting_pairs = non_interacting_pairs[columns]

# Determine the smaller size
min_size = min(len(interacting_pairs), len(non_interacting_pairs))

# Randomly sample from the larger dataset
non_interacting_sample = non_interacting_pairs.sample(n=min_size, random_state=42)  # Using a seed for reproducibility
interacting_sample = interacting_pairs.sample(n=min_size, random_state=42)  # This step is usually not necessary unless interacting_pairs is also too large

# Concatenate the balanced datasets
balanced_dataset = pd.concat([interacting_sample, non_interacting_sample])

# Shuffle the dataset
balanced_dataset = balanced_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [40]:
balanced_dataset.to_csv('data/full_clean_metabolism_ds.csv', index=None)