In [None]:
!pip install numpy rdkit scikit-learn pandas

In [2]:
import os
import sys
import numpy as np
from rdkit import Chem
from rdkit.Chem import RDKFingerprint, rdMolDescriptors, rdFingerprintGenerator
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix, accuracy_score
import pandas as pd
import random
from sklearn import model_selection, svm
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the CSV into `df`; its first 12 column labels are kept in `targets`.
file_path = '../data/tox21.csv'
df = pd.read_csv(file_path)

# Show the candidate target columns so an index can be picked in the next cell.
targets = df.columns[:12]
print(targets)


In [None]:
# Choose the column to model: entry 7 of the `targets` index built when loading the CSV.
target_name = targets[7]
print(target_name)

# Keep only the label and SMILES columns and drop rows where either is missing.
# This step used to be disabled inside a bare string literal, which let NaN labels
# flow into `target_array` and from there into the classifiers.
# Re-running this cell is safe: both columns are still present after the first run.
df = df[[target_name, 'smiles']].dropna()
pos_num = df[df[target_name] == 1].shape[0]
neg_num = df[df[target_name] == 0].shape[0]
print(pos_num, neg_num)

# Optional down-sampling of the negative class (for testing only).
# Off by default: the classifiers below use class_weight='balanced' where available.
BALANCE_DATASET = False
if BALANCE_DATASET:
    neg_rows = df[df[target_name] == 0]
    # max(..., 0) avoids a negative sample size when positives outnumber negatives.
    drop_rows = neg_rows.sample(n=max(neg_num - pos_num, 0), random_state=0).index
    df = df.drop(drop_rows)
print(len(df))

In [None]:
# Build the feature matrix (Morgan fingerprints) and the label vector from `df`.
num_bits = 1024
num_rows = len(df)

# Pre-allocate for the case where every row is usable; trimmed after the loop.
fp_array = np.zeros((num_rows, num_bits))
target_array = np.zeros(num_rows)
morgan_fp_gen = GetMorganGenerator(radius=2, fpSize=num_bits)

n_valid = 0  # number of rows actually written into the arrays
for smiles, label in zip(df['smiles'], df[target_name]):
    # A row is unusable without a label or without a SMILES string.
    if pd.isna(label) or not isinstance(smiles, str):
        continue

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit could not parse this SMILES; skip the row.
        continue

    fp_array[n_valid] = np.array(morgan_fp_gen.GetFingerprint(mol))
    target_array[n_valid] = label
    n_valid += 1

# Drop the unused tail. Without this, every skipped row would remain as an
# all-zero fingerprint labelled 0 and be trained/tested on as a real sample.
fp_array = fp_array[:n_valid]
target_array = target_array[:n_valid]
print(fp_array.shape)
print(target_array.shape)

In [None]:
def _positive_class_scores(model, features):
    """Return one continuous positive-class score per row of ``features``.

    Uses ``decision_function`` when the fitted estimator exposes it, otherwise
    the ``predict_proba`` column that corresponds to label 1.
    """
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    positive_col = list(model.classes_).index(1)
    return model.predict_proba(features)[:, positive_col]


def _fit_and_score(model, train_x, train_y, test_x, test_y):
    """Fit ``model`` on the training split and score it on the test split.

    Returns a list in the order used by the ``*_metrics`` containers:
    [roc_auc, accuracy, f1, precision, sensitivity (recall), specificity].
    """
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    tn, fp, fn, tp = confusion_matrix(test_y, predicted).ravel()
    return [
        # ROC-AUC needs a ranking score; feeding it hard 0/1 predictions
        # reduces the curve to a single operating point.
        roc_auc_score(test_y, _positive_class_scores(model, test_x)),
        accuracy_score(test_y, predicted),
        f1_score(test_y, predicted),
        precision_score(test_y, predicted),
        recall_score(test_y, predicted),
        tn / (tn + fp),
    ]


#Metrics ---- roc  acc f1  prs sns sps
svm_metrics = [[] for _ in range(6)]
rf_metrics  = [[] for _ in range(6)]
knn_metrics = [[] for _ in range(6)]
gp_metrics  = [[] for _ in range(6)]
# The Gaussian-process results used to be stored under this name; keep it as an
# alias (same list object) so code that reads `xgb_metrics` keeps working.
xgb_metrics = gp_metrics

# 30 repetitions, each with its own seed (1..30) for the split and the models.
for seed in range(1, 31):
    print("ITERATION:", seed)
    random.seed(seed)

    Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
        fp_array, target_array, test_size=0.3, shuffle=True, random_state=seed
    )

    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', class_weight='balanced', random_state=seed)
    RF = RandomForestClassifier(max_depth=10, n_estimators=100, class_weight='balanced', random_state=seed)
    GP = GaussianProcessClassifier(kernel = 2.0 * RBF(1.0), max_iter_predict = 100, random_state=seed)
    KNN = KNeighborsClassifier(n_neighbors=3)

    # Same fit/predict/score procedure for every model, in the original order.
    for store, model in ((svm_metrics, SVM), (rf_metrics, RF), (gp_metrics, GP), (knn_metrics, KNN)):
        scores = _fit_and_score(model, Train_X, Train_Y, Test_X, Test_Y)
        for bucket, value in zip(store, scores):
            bucket.append(value)


In [None]:
# Summarise the per-seed metrics and write them to a text file.
#
# The names this cell used to read (acc_SVM, roc_SVM, ...) are never defined in the
# notebook; the values live in the `*_metrics` lists filled by the evaluation loop.
# Each of those lists holds six sub-lists in the order: roc, acc, f1, prs, sns, sps.

# (row label incl. alignment padding, index into a *_metrics list), in report order.
_REPORT_ROWS = (
    ("Accuracy Score    -> ", 1),
    ("ROC-AUC Score     -> ", 0),
    ("F1-Score          -> ", 2),
    ("Precision Score   -> ", 3),
    ("Sensitivity Score -> ", 4),
    ("Specificity Score -> ", 5),
)

# The evaluation loop accumulates the Gaussian-process results in `xgb_metrics`.
_REPORT_MODELS = (
    ("SVM", svm_metrics),
    ("RF", rf_metrics),
    ("GP", xgb_metrics),
    ("KNN", knn_metrics),
)


def _summary_line(model_label, row_label, values):
    """Format one 'Mean <model> <metric> -> mean+-std' line, rounded to 3 decimals."""
    mean_txt = str(np.round(np.mean(values), 3))
    std_txt = str(np.round(np.std(values), 3))
    return "Mean " + model_label + " " + row_label + mean_txt + "+-" + std_txt


# Quick look at the mean accuracy of each model (SVM, RF, GP, KNN).
print(*(np.mean(metrics[1]) for _, metrics in _REPORT_MODELS))
filename = "ml_baselines_tox21_morgan.txt"

# One section per model, lines separated by "\n", sections by a blank line,
# no trailing newline.
sections = [
    "\n".join(_summary_line(label, row_label, metrics[idx]) for row_label, idx in _REPORT_ROWS)
    for label, metrics in _REPORT_MODELS
]

with open(filename, "w") as file:
    file.write("\n\n".join(sections))