In [2]:
%pip install deepchem
%pip install rdkit-pypi

import pandas as pd
import numpy as np
import deepchem as dc

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

import warnings

warnings.filterwarnings('ignore')


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Skipped loading some Pytorch utilities, missing a dependency. No module named 'torch'


This module requires PyTorch to be installed.


Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading some PyTorch models, missing a dependency. No module named 'torch'
No module named 'torch'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'torch'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


In [3]:
df = pd.read_csv("../Data/half-life-dataset.csv")

# Drop the unwanted column
df = df.drop('Unnamed: 4', axis=1)  

# Save the DataFrame to a new CSV file
df.to_csv("../Data/half-life-dataset-updated.csv", index=False)

Modify Inchi for row 561 and 679. Original Inchi causing issue when converting to Mol.

In [4]:
new_inchi_561 = 'InChI=1S/C22H12F4N4O/c23-18-4-3-15(8-17(18)22(24,25)26)30-20(31)6-2-13-9-27-19-5-1-12(7-16(19)21(13)30)14-10-28-29-11-14/h1-11H,(H,28,29)'
new_inchi_679 = 'InChI=1S/C25H30N4O2/c30-25(27-31)14-11-21-9-12-23-22(19-21)26-24(13-10-20-7-3-1-4-8-20)29(23)18-17-28-15-5-2-6-16-28/h1,3-4,7-9,11-12,14,19,31H,2,5-6,10,13,15-18H2,(H,27,30)/b14-11+'

df.at[561, 'PUBCHEM_IUPAC_INCHI'] = new_inchi_561
df.at[679, 'PUBCHEM_IUPAC_INCHI'] = new_inchi_679

df.to_csv('modified_half-life.csv', index=False)

print("Modified value:", df.at[561, 'PUBCHEM_IUPAC_INCHI'])
print("Modified value:", df.at[679, 'PUBCHEM_IUPAC_INCHI'])

print("CSV file copied and modified successfully.")

Modified value: InChI=1S/C22H12F4N4O/c23-18-4-3-15(8-17(18)22(24,25)26)30-20(31)6-2-13-9-27-19-5-1-12(7-16(19)21(13)30)14-10-28-29-11-14/h1-11H,(H,28,29)
Modified value: InChI=1S/C25H30N4O2/c30-25(27-31)14-11-21-9-12-23-22(19-21)26-24(13-10-20-7-3-1-4-8-20)29(23)18-17-28-15-5-2-6-16-28/h1,3-4,7-9,11-12,14,19,31H,2,5-6,10,13,15-18H2,(H,27,30)/b14-11+
CSV file copied and modified successfully.


Convert each molecule to mol from Inchi and then from mol to SMILES

In [5]:
df['mol'] = df['PUBCHEM_IUPAC_INCHI'].apply(Chem.MolFromInchi)
df['SMILES'] = df['mol'].apply(lambda x: Chem.MolToSmiles(x) if x is not None else None)

Split dataset into 80% trainning 20% testing

In [6]:
from sklearn.model_selection import train_test_split


train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Binary Classification'])

# Save the training set to a CSV file
train_df.to_csv("../data/half-life-train.csv", index=False)

# Save the testing set to a CSV file
test_df.to_csv("../data/half-life-test.csv", index=False)

print("Training and testing sets created and saved as separate CSV files.")

Training and testing sets created and saved as separate CSV files.


In [7]:
DATA_DIR = "../Data"

TRN_PATH = f"{DATA_DIR}/half-life-train.csv"
TST_PATH = f"{DATA_DIR}/half-life-test.csv"


In [8]:
df_trn = pd.read_csv(TRN_PATH)
df_tst = pd.read_csv(TST_PATH)

Simplify dataframes so they only include SMILES and Binary Classification columns

In [10]:
df_trn= df_trn[['SMILES', 'Binary Classification']]
df_tst= df_tst[['SMILES', 'Binary Classification']]


Using deepchem to generate features: RDKit descriptors and circular fingerprints

In [11]:
featurizers = {
    'circular': dc.feat.CircularFingerprint(size=2048, radius=4), 
    'rdkit': dc.feat.RDKitDescriptors()
}

def featurize(df, key, featurizer):
    feats = featurizer.featurize(df['SMILES'])
    pd_feats = pd.DataFrame(feats, columns=[key + '_' + str(i+1) for i in range(feats.shape[1])])
    return pd.concat([df, pd_feats], axis=1)

for k, f in featurizers.items():
    print(f"Generating {k} fingerprints...")
    df_trn = featurize(df_trn, k, f)
    df_tst = featurize(df_tst, k, f)
print("Done.")

print(f"Shape of trn set = {df_trn.shape}")
print(f"Shape of tst set = {df_tst.shape}")

Generating circular fingerprints...
Generating rdkit fingerprints...
Done.
Shape of trn set = (715, 2258)
Shape of tst set = (179, 2258)


In [12]:
# For the training set
X_trn = df_trn.drop(['SMILES', 'Binary Classification'], axis=1)
y_trn = df_trn['Binary Classification']

# For the testing set
X_tst = df_tst.drop(['SMILES', 'Binary Classification'], axis=1)
y_tst = df_tst['Binary Classification']

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score, accuracy_score

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#Initialize RF Classifier
rf2 = RandomForestClassifier(n_estimators=1000, max_depth=14, max_samples=80, random_state=42)

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',
    'average_precision': make_scorer(average_precision_score)
}

# Perform cross-validation
# Perform cross-validation
cv_results = cross_validate(rf2, X_trn, y_trn, cv=skf, scoring=scoring)


# Check results
print("Accuracy scores:", cv_results['test_accuracy'])
print("Average accuracy:", cv_results['test_accuracy'].mean())

print("ROC AUC scores:", cv_results['test_roc_auc'])
print("Average ROC AUC:", cv_results['test_roc_auc'].mean())

print("Average Precision scores:", cv_results['test_average_precision'])
print("Average Precision:", cv_results['test_average_precision'].mean())

Accuracy scores: [0.73426573 0.76223776 0.77622378 0.73426573 0.78321678]
Average accuracy: 0.758041958041958
ROC AUC scores: [0.78382838 0.7932579  0.86940123 0.8019802  0.83451202]
Average ROC AUC: 0.8165959453088165
Average Precision scores: [0.37683613 0.42218892 0.4542853  0.36474636 0.4764036 ]
Average Precision: 0.4188920616420616


Determining classification threshold to maximize F1-score

In [14]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve, f1_score, auc

# Perform cross-validation with probability prediction
y_val_probs = cross_val_predict(rf2, X_trn, y_trn, cv=skf, method='predict_proba')[:, 1]

# Calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_trn, y_val_probs)

# Calculate F1 score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the index of the maximum F1 score
best_index = np.argmax(f1_scores)
best_threshold = thresholds[best_index]
best_f1_score = f1_scores[best_index]

print("Best Threshold:", best_threshold)
print("Best F1 Score:", best_f1_score)


Best Threshold: 0.24702
Best F1 Score: 0.622673434856176
