In [2]:
import pandas as pd
import numpy as np

In [3]:
full_ds = pd.read_csv('ds_s2.csv')

In [4]:
# metabolism_ds = full_ds[full_ds['gt'].str.contains("metabolism")].reset_index(drop=True)
metabolism_ds = pd.read_csv('metabolism_ds_raw.csv')

In [5]:
metabolism_ds['set'].value_counts(normalize=True)

set
training      0.600010
validation    0.199995
testing       0.199995
Name: proportion, dtype: float64

In [6]:
# Regular expression to extract drug identifiers
# Assuming the structure "The metabolism of <Drug b> can be ... combined with <Drug a>."
regex_pattern = r"The metabolism of (\w+) can be .+ combined with (\w+)\."

# Extract drug identifiers
metabolism_ds['drug_b'], metabolism_ds['drug_a'] = zip(*metabolism_ds['gt'].str.extract(regex_pattern).values)


In [7]:
unique_drugs = pd.concat([metabolism_ds['drug_a'], metabolism_ds['drug_b']]).unique()

In [8]:
len(unique_drugs)

829

In [23]:
import requests

# Function to fetch SMILES strings
def fetch_smiles(db_drug_id):
    
    base_url = "https://go.drugbank.com/structures/small_molecule_drugs/{}.smiles"
    url = base_url.format(db_drug_id)
    
    result = ""
    
    try:
        response = requests.get(url)
        if response.status_code == 200:
            result = response.text.strip()
        else:
            result = None
    except Exception as e:
        result = "error"
    
    return result

def fetch_smiles_df(db_drug_id_lst): 
    
    results = {}
    
    for idx, db_drug_id in enumerate(db_drug_id_lst): 
        print(f"Fetching SMILES string {idx} / {len(db_drug_id_lst)}")
        smiles = fetch_smiles(db_drug_id)
        print(f"Recieved SMILES: {smiles}")
        if smiles != None or smiles != "error": 
            results[db_drug_id] = smiles
            
    results_df = pd.DataFrame(list(results.items()), columns=['db_drug_id', 'smiles'])
    
    return results_df

In [24]:
db_drug_smiles = fetch_smiles_df(unique_drugs)

Fetching SMILES string 0 / 829
Recieved SMILES: CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(C(O)=C3C)C(=O)C(NC(=O)\C(C)=C/C=C/[C@H](C)[C@H](O)[C@@H](C)[C@@H](O)[C@@H](C)[C@H](OC(C)=O)[C@@H]1C)=C1NC3(CCN(CC3)CC(C)C)N=C21
Fetching SMILES string 1 / 829
Recieved SMILES: O=C1NC(=O)C(N1)(C1=CC=CC=C1)C1=CC=CC=C1
Fetching SMILES string 2 / 829
Recieved SMILES: CCCC(C)C1(CC=C)C(=O)NC(=S)NC1=O
Fetching SMILES string 3 / 829
Recieved SMILES: CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(O)C(\C=N\N4CCN(C)CC4)=C(NC(=O)\C(C)=C/C=C/[C@H](C)[C@H](O)[C@@H](C)[C@@H](O)[C@@H](C)[C@H](OC(C)=O)[C@@H]1C)C(O)=C2C(O)=C3C
Fetching SMILES string 4 / 829
Recieved SMILES: OP(O)(=O)OCN1C(=O)NC(C1=O)(C1=CC=CC=C1)C1=CC=CC=C1
Fetching SMILES string 5 / 829
Recieved SMILES: NC(=O)N1C2=CC=CC=C2C=CC2=CC=CC=C12
Fetching SMILES string 6 / 829
Recieved SMILES: [H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)C1=C(OCC)C=CC2=CC=CC=C12)C(O)=O
Fetching SMILES string 7 / 829
Recieved SMILES: CCC1(C(=O)NCNC1=O)C1=CC=CC=C1
Fetching SMILES string 

In [31]:
db_drug_smiles[db_drug_smiles['smiles'].isna()]

Unnamed: 0,db_drug_id,smiles
268,DB00104,
431,DB09396,


In [32]:
db_drug_smiles.to_csv("metabolism_db_drug_id_to_smiles.csv", index=None)

In [33]:
# Merging for drug_a
metabolism_ds = metabolism_ds.merge(db_drug_smiles, left_on='drug_a', right_on='db_drug_id', how='left')
metabolism_ds.rename(columns={'smiles': 'drug_a_smiles'}, inplace=True)
metabolism_ds.drop(columns=['db_drug_id'], inplace=True)

# Merging for drug_b
metabolism_ds = metabolism_ds.merge(db_drug_smiles, left_on='drug_b', right_on='db_drug_id', how='left')
metabolism_ds.rename(columns={'smiles': 'drug_b_smiles'}, inplace=True)
metabolism_ds.drop(columns=['db_drug_id'], inplace=True)

In [96]:
metabolism_ds.head()

Unnamed: 0,gt,set,drug_b,drug_a,drug_a_smiles,drug_b_smiles
0,The metabolism of DB01435 can be increased whe...,training,DB01435,DB00615,CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(C(O)=C3...,CN1N(C(=O)C=C1C)C1=CC=CC=C1
1,The metabolism of DB00199 can be increased whe...,training,DB00199,DB00252,O=C1NC(=O)C(N1)(C1=CC=CC=C1)C1=CC=CC=C1,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(...
2,The metabolism of DB00869 can be increased whe...,training,DB00869,DB00615,CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(C(O)=C3...,CCN[C@H]1C[C@H](C)S(=O)(=O)C2=C1C=C(S2)S(N)(=O)=O
3,The metabolism of DB09227 can be increased whe...,training,DB09227,DB01154,CCCC(C)C1(CC=C)C(=O)NC(=S)NC1=O,COC(=O)C1=C(C)NC(C)=C([C@H]1C1=CC(=CC=C1)[N+](...
4,The metabolism of DB06736 can be increased whe...,training,DB06736,DB01045,CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(O)C(\C=...,OC(=O)COC(=O)CC1=CC=CC=C1NC1=C(Cl)C=CC=C1Cl


In [97]:
metabolism_ds.to_csv("true_metabolism_clean_ds.csv", index=None)

In [35]:
from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import AllChem

In [None]:
ms = [Chem.MolFromSmiles('CCOC'), Chem.MolFromSmiles('CCO'),
Chem.MolFromSmiles('COC')]

fpgen = AllChem.GetRDKitFPGenerator()

fps = [fpgen.GetFingerprint(x) for x in ms]

In [84]:
def get_fingerprint_df_from_smiles_df(smiles_df):
    fpgen = AllChem.GetRDKitFPGenerator()
    results = {}
    
    for idx, row in smiles_df.iterrows():
        if row.smiles is not None:
            ms = Chem.MolFromSmiles(row.smiles)
            if ms is not None:  # Check if the molecule was successfully created
                fp = fpgen.GetFingerprint(ms)
                array = np.zeros((2048,), dtype=np.int8)  # Correct size for the array
                DataStructs.ConvertToNumpyArray(fp, array)
                results[row.db_drug_id] = array.tolist()  # Store as a list
            else:
                results[row.db_drug_id] = None
        else:
            results[row.db_drug_id] = None
    
    results_df = pd.DataFrame(list(results.items()), columns=['db_drug_id', 'fingerprint'])
    return results_df

In [85]:
fingerprint_df = get_fingerprint_df_from_smiles_df(db_drug_smiles)

In [86]:
fingerprint_df['fingerprint'].head()

0    [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
1    [0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, ...
2    [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, ...
4    [0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, ...
Name: fingerprint, dtype: object

In [69]:
fingerprint_df.to_csv("metabolism_db_drug_id_to_fingerprint.csv", index=None)

In [70]:
sample_fingerprint_df = pd.read_csv('/Users/aho/uiuc/dlh_project/metabolism_db_drug_id_to_fingerprint.csv')

In [95]:
sample_fingerprint_df['fingerprint'][0][0]

'['

In [92]:
all_fps_as_arrays = np.array(sample_fingerprint_df['fingerprint'].tolist())

In [104]:
from itertools import product

# Load the dataset

# Step 1: Create a set of all unique drugs
all_drugs = set(metabolism_ds['drug_a']).union(set(metabolism_ds['drug_b']))

# Step 2: Create all possible pairs
all_possible_pairs = pd.DataFrame(product(all_drugs, all_drugs), columns=['drug_a', 'drug_b'])

# Remove self-pairs (where a drug pairs with itself)
all_possible_pairs = all_possible_pairs[all_possible_pairs['drug_a'] != all_possible_pairs['drug_b']]

# Create a set of tuples for existing interactions considering both orderings
interaction_set = set()
for idx, row in metabolism_ds.iterrows():
    interaction_set.add((row['drug_a'], row['drug_b']))
    interaction_set.add((row['drug_b'], row['drug_a']))

# Filter the dataframe to exclude any pairs found in the interaction set
non_interacting_pairs = all_possible_pairs[
    ~all_possible_pairs.apply(lambda x: (x['drug_a'], x['drug_b']) in interaction_set, axis=1)
]

# Step 3: Add SMILES Information
# Map to get SMILES for each drug
smiles_map = pd.concat([
    metabolism_ds[['drug_a', 'drug_a_smiles']].rename(columns={'drug_a': 'drug', 'drug_a_smiles': 'smiles'}),
    metabolism_ds[['drug_b', 'drug_b_smiles']].rename(columns={'drug_b': 'drug', 'drug_b_smiles': 'smiles'})
]).drop_duplicates().set_index('drug')['smiles']

# Add SMILES information to the non-interacting pairs
non_interacting_pairs['drug_a_smiles'] = non_interacting_pairs['drug_a'].map(smiles_map)
non_interacting_pairs['drug_b_smiles'] = non_interacting_pairs['drug_b'].map(smiles_map)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_interacting_pairs['drug_a_smiles'] = non_interacting_pairs['drug_a'].map(smiles_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_interacting_pairs['drug_b_smiles'] = non_interacting_pairs['drug_b'].map(smiles_map)


In [105]:
non_interacting_pairs.to_csv('non_interacting_drug_pairs.csv', index=None)

In [13]:
interacting_pairs = pd.read_csv('true_metabolism_clean_ds.csv')
interacting_pairs['interaction'] = 1
interacting_pairs = interacting_pairs[~(interacting_pairs['drug_a_smiles'].isna() | interacting_pairs['drug_b_smiles'].isna())]

# Load the non-interacting pairs dataset
non_interacting_pairs = pd.read_csv('non_interacting_drug_pairs.csv')
non_interacting_pairs['interaction'] = 0
non_interacting_pairs = non_interacting_pairs[~(non_interacting_pairs['drug_a_smiles'].isna() | non_interacting_pairs['drug_b_smiles'].isna())]


# Ensure the columns are aligned and in the same order
columns = ['drug_a', 'drug_b', 'drug_a_smiles', 'drug_b_smiles', 'interaction']
interacting_pairs = interacting_pairs[columns]
non_interacting_pairs = non_interacting_pairs[columns]

# Determine the smaller size
min_size = min(len(interacting_pairs), len(non_interacting_pairs))

# Randomly sample from the larger dataset
non_interacting_sample = non_interacting_pairs.sample(n=min_size, random_state=42)  # Using a seed for reproducibility
interacting_sample = interacting_pairs.sample(n=min_size, random_state=42)  # This step is usually not necessary unless interacting_pairs is also too large

# Concatenate the balanced datasets
balanced_dataset = pd.concat([interacting_sample, non_interacting_sample])

# Shuffle the dataset
balanced_dataset = balanced_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
balanced_dataset.to_csv('full_clean_metabolism_ds.csv', index=None)