In [1]:
import pandas as pd
import numpy as np
import random
from rdkit import Chem
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import matplotlib.pyplot as plt
from utils import load_dataset_df, fp_generator
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn import model_selection, svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#### Load DataFrame

In [2]:
files = ['tox21.csv','sider.csv', 'BBBP.csv']
dt_file = files[2]

df, targets = load_dataset_df(filename=dt_file)
print(targets)


target_name = targets[0]
df = df[[target_name, 'smiles']].dropna()
print(df[target_name].size,"1 :", np.round(df[target_name].size/df[target_name].sum(), 1))


['p_np']
2050 1 : 1.3


In [3]:
print(target_name)
print(df[target_name].sum())
print(df[target_name].size)

p_np
1567
2050


#### SMILE to Fingerprint

In [4]:
fp_types = [['morgan', 1024], ['maccs', 167], ['RDKit', 1024]]
fp_type, num_bits = fp_types[2]
print(fp_type, '-', num_bits)
num_rows = len(df)
fp_array = np.zeros((num_rows, num_bits))
target_array = np.zeros((num_rows, 1))
i = 0

img = None
# Smile to Fingerprint of size {num_bits}
fp_gen = fp_generator(fp_type)
for idx, row in df.iterrows():
    mol = Chem.MolFromSmiles(row['smiles'])
    #TODO: sanitize molecules to remove the warnings (?)
    
    if mol is not None:
        fingerprint = fp_gen(mol)

        fp_array[i] = np.array(fingerprint)
        target_array[i] = row[target_name]
        i += 1
target_array = target_array.ravel()

RDKit - 1024


[22:14:58] Explicit valence for atom # 1 N, 4, is greater than permitted
[22:14:58] Explicit valence for atom # 6 N, 4, is greater than permitted
[22:14:59] Explicit valence for atom # 6 N, 4, is greater than permitted
[22:15:00] Explicit valence for atom # 11 N, 4, is greater than permitted
[22:15:00] Explicit valence for atom # 12 N, 4, is greater than permitted
[22:15:00] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:15:00] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:15:00] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:15:00] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:15:00] Explicit valence for atom # 5 N, 4, is greater than permitted
[22:15:00] Explicit valence for atom # 5 N, 4, is greater than permitted


In [5]:
#Metrics ---- roc  acc f1  prs sns sps
svm_metrics = [[], [], [], [], [], []]
rf_metrics  = [[], [], [], [], [], []]
knn_metrics = [[], [], [], [], [], []]
xgb_metrics = [[], [], [], [], [], []]
mlp_metrics = [[], [], [], [], [], []]

knn_param_dist = {
        'n_neighbors': range(1, 20),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }
knn_best_params = []

print("Iterations:")
for i in range(2, 32):
    print(str(i - 1) + "/30")
    seed = i - 1
    random.seed(seed)
    
    Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(fp_array,target_array, test_size=0.3, shuffle=True, random_state=seed)

    class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(Train_Y), y=Train_Y)
    #class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
    class_weights_dict = "balanced"


    #################### SVM ####################

    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', class_weight=class_weights_dict, random_state=seed)

    SVM.fit(Train_X,Train_Y)

    predictions_SVM = SVM.predict(Test_X)

    svm_metrics[0].append(roc_auc_score(Test_Y, predictions_SVM))
    svm_metrics[1].append(accuracy_score(Test_Y, predictions_SVM))
    svm_metrics[2].append(f1_score(Test_Y, predictions_SVM))
    svm_metrics[3].append(precision_score(Test_Y, predictions_SVM))
    svm_metrics[4].append(recall_score(Test_Y, predictions_SVM))
    tn, fp, fn, tp = confusion_matrix(Test_Y, predictions_SVM).ravel()
    sp = tn/(tn+fp)
    svm_metrics[5].append(sp)

    #################### RF ####################

    RF = RandomForestClassifier(max_depth=10, n_estimators=100, class_weight=class_weights_dict, random_state=seed)

    RF.fit(Train_X,Train_Y)

    predictions_RF = RF.predict(Test_X)

    rf_metrics[0].append(roc_auc_score(Test_Y, predictions_RF))
    rf_metrics[1].append(accuracy_score(Test_Y, predictions_RF))
    rf_metrics[2].append(f1_score(Test_Y, predictions_RF))
    rf_metrics[3].append(precision_score(Test_Y, predictions_RF))
    rf_metrics[4].append(recall_score(Test_Y, predictions_RF))
    tn, fp, fn, tp = confusion_matrix(Test_Y, predictions_RF).ravel()
    sp = tn/(tn+fp)
    rf_metrics[5].append(sp)

    #################### XGB ####################
    #pos_weight = sum(Train_Y == 0) / sum(Train_Y == 1)
    pos_weight=1
    XGB = XGBClassifier(objective="binary:logistic",learning_rate=0.1,max_depth=6,n_estimators=100,scale_pos_weight=pos_weight)

    XGB.fit(Train_X,Train_Y)

    predictions_XGB = XGB.predict(Test_X)

    xgb_metrics[0].append(roc_auc_score(Test_Y, predictions_XGB))
    xgb_metrics[1].append(accuracy_score(Test_Y, predictions_XGB))
    xgb_metrics[2].append(f1_score(Test_Y, predictions_XGB))
    xgb_metrics[3].append(precision_score(Test_Y, predictions_XGB))
    xgb_metrics[4].append(recall_score(Test_Y, predictions_XGB))
    tn, fp, fn, tp = confusion_matrix(Test_Y, predictions_XGB).ravel()
    sp = tn/(tn+fp)
    xgb_metrics[5].append(sp)

    #################### KNN ####################

    # Randomized search for knn
    
    if i == 2:
        KNN = KNeighborsClassifier()

        #Randomized Search
        random_search = RandomizedSearchCV(KNN, knn_param_dist, n_iter=20, cv=5, scoring='roc_auc', random_state=42)
        random_search.fit(Train_X, Train_Y)

        print("KNN Best Parameters:", random_search.best_params_)
        print("KNN Best Score:", random_search.best_score_)

        knn_best_params = random_search.best_params_

    else:
        KNN = KNeighborsClassifier(
            n_neighbors=knn_best_params['n_neighbors'],
            weights=knn_best_params['weights'],
            metric=knn_best_params['metric'])


    KNN.fit(Train_X,Train_Y)

    predictions_KNN = KNN.predict(Test_X)

    knn_metrics[0].append(roc_auc_score(Test_Y, predictions_KNN))
    knn_metrics[1].append(accuracy_score(Test_Y, predictions_KNN))
    knn_metrics[2].append(f1_score(Test_Y, predictions_KNN))
    knn_metrics[3].append(precision_score(Test_Y, predictions_KNN))
    knn_metrics[4].append(recall_score(Test_Y, predictions_KNN))
    tn, fp, fn, tp = confusion_matrix(Test_Y, predictions_KNN).ravel()
    sp = tn/(tn+fp)
    knn_metrics[5].append(sp)

    #################### MLP ####################

    #sample_weight = np.array([class_weights[cls] for cls in Train_Y])
    #sample_weight = None
    MLP =  MLPClassifier(hidden_layer_sizes=(num_bits), activation='relu', solver='adam', max_iter=200)
    MLP.fit(Train_X, Train_Y)
    predictions_MLP = MLP.predict(Test_X)

    mlp_metrics[0].append(roc_auc_score(Test_Y, predictions_MLP))
    mlp_metrics[1].append(accuracy_score(Test_Y, predictions_MLP))
    mlp_metrics[2].append(f1_score(Test_Y, predictions_MLP))
    mlp_metrics[3].append(precision_score(Test_Y, predictions_MLP))
    mlp_metrics[4].append(recall_score(Test_Y, predictions_MLP))
    tn, fp, fn, tp = confusion_matrix(Test_Y, predictions_MLP).ravel()
    sp = tn/(tn+fp)
    mlp_metrics[5].append(sp)


Iterations:
1/30


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



KNN Best Parameters: {'weights': 'distance', 'n_neighbors': 8, 'metric': 'manhattan'}
KNN Best Score: 0.8710977407229674
2/30
3/30
4/30
5/30
6/30
7/30
8/30
9/30
10/30
11/30
12/30
13/30
14/30
15/30
16/30
17/30
18/30
19/30
20/30
21/30
22/30
23/30
24/30
25/30
26/30
27/30
28/30
29/30
30/30


In [6]:

metrics = [svm_metrics, rf_metrics, xgb_metrics, knn_metrics, mlp_metrics]
metrics_np = np.zeros((len(metrics), 12))

for i, clf in enumerate(metrics):
    metrics_np[i, 0::2] = np.round([np.mean(metric) for metric in clf], 3)
    metrics_np[i, 1::2] = np.round([np.std(metric) for metric in clf], 3)    


metric_names = ['AUC', 'Accuracy', 'F1 Score', 'Precision', 'Sensitivity', 'Specificity']

columns = []
clfs = ["SVM", "RF","XGB", "KNN", "MLP"]
for name in metric_names:
    columns.extend([f'Mean {name}', f'Std {name}'])

df_clfs = pd.DataFrame(clfs, columns=["Classifier"])
df_metrics = pd.DataFrame(metrics_np, columns=columns)
df = pd.concat([df_clfs, df_metrics], axis=1)

filename = f"results\\updated\\weighted\\ml_baselines_{dt_file.strip('.csv')}_{fp_type}_{target_name}.csv"
df.to_csv(filename, index=False)
print(filename)


results\updated\weighted\ml_baselines_BBBP_RDKit_p_np.csv
