### Feature calculation

In [1]:
import os
import time

import pandas as pd
from pandarallel import pandarallel
from rdkit.Chem import PandasTools

In [2]:
import sys

# Parent directory of the current working directory (presumably the project
# root when the notebook is launched from its own folder - depends on the cwd,
# not on the notebook file's location).
BASE_PATH = os.path.dirname(os.getcwd())
# Must run before the import below so that the local `src` package can be found.
sys.path.append(BASE_PATH)

from src.utils import calculate_selected_descriptors

In [3]:
def get_descriptors(filepath, nb_workers=24, parallel_threshold=1000):
    """Compute the selected descriptors for every molecule listed in a CSV file.

    The file is read with pandas, an RDKit molecule column ("ROMol") is built
    from the "taut_smiles" column when present and from "smiles" otherwise,
    and `calculate_selected_descriptors` is applied to each molecule.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read. Its first column is carried over to the result
        unchanged (callers use it as the compound identifier).
    nb_workers : int, default 24
        Number of pandarallel workers used for large inputs.
    parallel_threshold : int, default 1000
        Inputs with more rows than this are processed with pandarallel;
        smaller inputs use a plain `Series.apply`.

    Returns
    -------
    pandas.DataFrame
        The first column of the input file followed by the expanded
        descriptor columns.

    Raises
    ------
    KeyError
        If the file has neither a "taut_smiles" nor a "smiles" column.
    """
    df = pd.read_csv(filepath)

    # Pick the SMILES column explicitly instead of catching KeyError around the
    # RDKit call, which could also hide unrelated KeyErrors raised inside it.
    smiles_col = next(
        (col for col in ("taut_smiles", "smiles") if col in df.columns), None
    )
    if smiles_col is None:
        raise KeyError(
            f"{filepath}: expected a 'taut_smiles' or 'smiles' column"
        )
    PandasTools.AddMoleculeColumnToFrame(df, smilesCol=smiles_col)

    if len(df) > parallel_threshold:
        pandarallel.initialize(nb_workers=nb_workers, progress_bar=True)
        raw = df["ROMol"].parallel_apply(calculate_selected_descriptors)
    else:
        raw = df["ROMol"].apply(calculate_selected_descriptors)

    # Expand each per-molecule result into one column per descriptor
    # (assumes the helper returns a dict-like or sequence - TODO confirm).
    descriptors = raw.apply(pd.Series)

    result = pd.concat((df[df.columns[0]].copy(), descriptors), axis=1)
    return result


In [4]:
# Every cleaned dataset in data/processed, in the order os.listdir reports them.
files = list(
    filter(
        lambda entry: entry.endswith("cleaned.csv"),
        os.listdir(os.path.join(BASE_PATH, "data", "processed")),
    )
)
files

['chembl_35_NP_cleaned.csv',
 'pseudo_NPs_cleaned.csv',
 'enamine_advanced_50k_subset_cleaned.csv',
 'drugbank_5_1_13_cleaned.csv']

In [5]:
# Compute descriptors for every cleaned dataset and time the whole run.
start = time.time()

descriptors = []
for file in files:
    # Dataset label = part of the file name before the first underscore.
    name = file.partition("_")[0]
    path = os.path.join(BASE_PATH, "data", "processed", file)

    df = get_descriptors(path)
    df.insert(1, "dataset", [name] * df.shape[0])
    # The first column of each file holds the compound identifier.
    df = df.rename(columns={df.columns[0]: "ID"})
    descriptors.append(df)
descriptors = pd.concat(descriptors, axis=0)

end = time.time()
print(f"Total execution time: {end - start :.2f} s")

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3257), Label(value='0 / 3257'))), …

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2083), Label(value='0 / 2083'))), …

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=472), Label(value='0 / 472'))), HB…

Total execution time: 79.51 s


In [6]:
# Preview the first rows of the combined descriptor table.
descriptors.head()

Unnamed: 0,ID,dataset,HeavyAtoms,ExactMolWt,RingCount,NumAromaticRings,NumAliphaticRings,NumHDonors,NumHAcceptors,MolLogP,TPSA,NumRotatableBonds,fr_halogen,NumBridgeheadAtoms,FractionCSP3,NumOxygen,NumNitrogen,LipinskiViolations,VeberViolations
0,5a,compounds,31.0,420.204907,5.0,3.0,2.0,1.0,4.0,4.4525,63.79,4.0,0.0,0.0,0.4,4.0,2.0,0.0,0.0
1,5b,compounds,32.0,438.195486,5.0,3.0,2.0,1.0,4.0,4.5916,63.79,4.0,1.0,0.0,0.4,4.0,2.0,0.0,0.0
2,5c,compounds,32.0,454.165935,5.0,3.0,2.0,1.0,4.0,5.1059,63.79,4.0,1.0,0.0,0.4,4.0,2.0,1.0,0.0
3,5d,compounds,32.0,498.115419,5.0,3.0,2.0,1.0,4.0,5.215,63.79,4.0,1.0,0.0,0.4,4.0,2.0,1.0,0.0
4,5e,compounds,34.0,474.176642,5.0,3.0,2.0,1.0,4.0,5.3425,63.79,4.0,3.0,0.0,0.4,4.0,2.0,1.0,0.0


In [7]:
# Persist the combined table; the row index is not written to the file.
descriptors.to_csv(
    path_or_buf=os.path.join(BASE_PATH, "reports", "descriptors.csv"),
    index=False,
)

In [8]:
# (rows, columns) of the combined descriptor table.
descriptors.shape

(139466, 19)

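Rerun the calculations for the synthesized compounds (pseudo-NPs) and the Sceletium references, then replace the outdated `compounds` rows in `reports/descriptors.csv` with the new results.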

In [5]:
# Dataset labels, paired position-by-position with the files below.
names = ["pseudoNPs", "Sceletium"]
files = [
    os.path.join(BASE_PATH, "data", *parts)
    for parts in (
        ("processed", "pseudo_NPs_cleaned.csv"),
        ("interim", "Sceletium_compounds.csv"),
    )
]

descriptors = []
for file, name in zip(files, names):
    df = get_descriptors(file)
    df.insert(1, "dataset", [name] * df.shape[0])
    # The first column of each file holds the compound identifier.
    df = df.rename(columns={df.columns[0]: "ID"})
    descriptors.append(df)
descriptors = pd.concat(descriptors, axis=0)

In [7]:
# Load the descriptor table written by the earlier run so it can be updated.
old_desc_file = os.path.join(BASE_PATH, "reports", "descriptors.csv")
old_desc = pd.read_csv(old_desc_file)
old_desc.head(3)

Unnamed: 0,ID,dataset,HeavyAtoms,ExactMolWt,RingCount,NumAromaticRings,NumAliphaticRings,NumHDonors,NumHAcceptors,MolLogP,TPSA,NumRotatableBonds,fr_halogen,NumBridgeheadAtoms,FractionCSP3,NumOxygen,NumNitrogen,LipinskiViolations,VeberViolations
0,5a,compounds,31.0,420.204907,5.0,3.0,2.0,1.0,4.0,4.4525,63.79,4.0,0.0,0.0,0.4,4.0,2.0,0.0,0.0
1,5b,compounds,32.0,438.195486,5.0,3.0,2.0,1.0,4.0,4.5916,63.79,4.0,1.0,0.0,0.4,4.0,2.0,0.0,0.0
2,5c,compounds,32.0,454.165935,5.0,3.0,2.0,1.0,4.0,5.1059,63.79,4.0,1.0,0.0,0.4,4.0,2.0,1.0,0.0


In [8]:
# Remove the rows labelled 'compounds' and report the row count before/after.
print(len(old_desc.index))
remove = old_desc.index[old_desc["dataset"] == "compounds"]
old_desc.drop(index=remove, inplace=True)
print(len(old_desc.index))

139466
139444


In [9]:
# Merge the freshly computed rows with the stored table. Stored rows whose
# dataset label was just recomputed are left out, so the new values replace
# the old ones and re-running this cell (or this whole section after the CSV
# has been overwritten) cannot duplicate rows.
recomputed = descriptors["dataset"].unique()
old_desc_kept = old_desc[~old_desc["dataset"].isin(recomputed)]
descriptors = pd.concat((descriptors, old_desc_kept), axis=0)
print(descriptors.shape[0])

139486


In [10]:
# Overwrite the stored table with the updated one; the row index is not written.
descriptors.to_csv(
    path_or_buf=os.path.join(BASE_PATH, "reports", "descriptors.csv"),
    index=False,
)

Rerun the calculations for the Hasubanan references and add the results to `reports/descriptors.csv`.

In [4]:
# Dataset labels, paired position-by-position with the files below.
names = ["Hasubanan"]
files = [os.path.join(BASE_PATH, "data", "processed", "Hasubanan_cleaned.csv")]

descriptors = []
for file, name in zip(files, names):
    df = get_descriptors(file)
    df.insert(1, "dataset", [name] * df.shape[0])
    # The first column of each file holds the compound identifier.
    df = df.rename(columns={df.columns[0]: "ID"})
    descriptors.append(df)
descriptors = pd.concat(descriptors, axis=0)

In [6]:
# Load the descriptor table written by the previous section so it can be extended.
old_desc_file = os.path.join(BASE_PATH, "reports", "descriptors.csv")
old_desc = pd.read_csv(old_desc_file)
old_desc.head(3)

Unnamed: 0,ID,dataset,HeavyAtoms,ExactMolWt,RingCount,NumAromaticRings,NumAliphaticRings,NumHDonors,NumHAcceptors,MolLogP,TPSA,NumRotatableBonds,fr_halogen,NumBridgeheadAtoms,FractionCSP3,NumOxygen,NumNitrogen,LipinskiViolations,VeberViolations
0,1,pseudoNPs,31.0,420.204907,5.0,3.0,2.0,1.0,4.0,4.4525,63.79,4.0,0.0,0.0,0.4,4.0,2.0,0.0,0.0
1,2,pseudoNPs,32.0,498.115419,5.0,3.0,2.0,1.0,4.0,5.215,63.79,4.0,1.0,0.0,0.4,4.0,2.0,1.0,0.0
2,3,pseudoNPs,32.0,438.195486,5.0,3.0,2.0,1.0,4.0,4.5916,63.79,4.0,1.0,0.0,0.4,4.0,2.0,0.0,0.0


In [7]:
# Merge the freshly computed rows with the stored table. Stored rows whose
# dataset label was just recomputed are left out, so the new values replace
# the old ones and re-running this cell (or this whole section after the CSV
# has been overwritten) cannot duplicate rows.
recomputed = descriptors["dataset"].unique()
old_desc_kept = old_desc[~old_desc["dataset"].isin(recomputed)]
descriptors = pd.concat((descriptors, old_desc_kept), axis=0)
print(descriptors.shape[0])

139604


In [8]:
# Overwrite the stored table with the updated one; the row index is not written.
descriptors.to_csv(
    path_or_buf=os.path.join(BASE_PATH, "reports", "descriptors.csv"),
    index=False,
)