In [1]:
import pandas as pd
import numpy as np
import random
from rdkit import Chem
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import matplotlib.pyplot as plt
from utils import load_dataset_df, smile_to_fp, data_splitter
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn import model_selection, svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (c:\Users\knsve\Desktop\MEI\Tese\torch\pt_venv2\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (c:\Users\knsve\Desktop\MEI\Tese\torch\pt_venv2\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


#### Load DataFrame

In [2]:
files = ['tox21.csv','sider.csv', 'BBBP.csv']
dt_file = files[0]
dirname = dt_file.removesuffix('.csv')

df, targets = load_dataset_df(filename=dt_file)
print(targets)



Index(['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
       'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')


In [3]:
if dirname == 'tox21':
    # SR-ARE
    target_name = targets[7]
elif dirname == 'sider':
    target_name = targets[0]
else:
    target_name = targets[0]
    
df = df[[target_name, 'smiles']].dropna()

print(target_name)
print(df[target_name].sum())
print(df[target_name].size)

SR-ARE
942.0
5832


#### SMILE to Fingerprint

In [4]:
fp_types = [['morgan', 1024], ['maccs', 167], ['RDKit', 1024], ['pubchem', 881]]
fp_type, num_bits = fp_types[2]
#num_bits = 512
fp_config = {"fp_type": fp_type,
             "num_bits": num_bits}

print(fp_type, '-', num_bits)

maccs - 167


In [5]:
dtype = torch.float32
split = "scaffold"
dataset = None
if dirname != 'BBBP':
    split = "random"
    fp_array, target_array = smile_to_fp(df, fp_config=fp_config, target_name=target_name)
    # Create Torch Dataset
    fp_tensor = torch.tensor(fp_array, dtype=dtype)
    target_tensor = torch.tensor(target_array, dtype=dtype).long()

    dataset = TensorDataset(fp_tensor, target_tensor)

In [6]:
#Metrics ---- roc  acc f1  prs sns sps
svm_metrics = [[], [], [], [], [], []]
rf_metrics  = [[], [], [], [], [], []]
knn_metrics = [[], [], [], [], [], []]
xgb_metrics = [[], [], [], [], [], []]
mlp_metrics = [[], [], [], [], [], []]
metrics = [svm_metrics, rf_metrics, xgb_metrics, knn_metrics, mlp_metrics]

grid_parameters = {
    "SVM": {
        "C": list(range(1, 100)),
        "kernel": ["linear", "rbf", "poly"],
        "gamma": ["scale", "auto"],
        "degree": [2, 3, 4],
    },
    "RF": {
        "max_depth": [5] + list(range(10, 100, 10)),
        "n_estimators": list(range(50, 400, 50)),
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "XGB": {
        "learning_rate": [0.005, 0.01, 0.1, 0.2],
        "max_depth": range(2, 20, 2),
        "n_estimators": range(50, 400, 50),
    },
    "KNN": {
        "n_neighbors": list(range(1, 20)),
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan", "minkowski"],
    }
}

knn_best_params = []
svm_best_params = []
rf_best_params = []
xgb_best_params = []
 

In [7]:
def calculate_metrics(metrics_list, y_pred, y_true):
    accuracy = accuracy_score(y_true, y_pred)
    auc_roc = roc_auc_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp/(tp + fn)
    specificity = tn/(tn + fp)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)

    metrics_list[0].append(accuracy)
    metrics_list[1].append(auc_roc)
    metrics_list[2].append(sensitivity)
    metrics_list[3].append(specificity)
    metrics_list[4].append(f1)
    metrics_list[5].append(precision)

In [8]:
def train_test_model(model, train_X, train_Y, test_X, test_Y, metrics_list):
    model.fit(train_X,train_Y)

    predictions = model.predict(test_X)
    
    calculate_metrics(metrics_list=metrics_list, y_true=test_Y, y_pred=predictions)

In [9]:
def random_param_search(model, grid_param, train_X, train_Y):
    search = RandomizedSearchCV(model, grid_param, n_iter=40, scoring='roc_auc', random_state=42)
    search.fit(train_X, train_Y)
    return search.best_params_

In [10]:
#pos_weight = (sum(train_Y == 1) / sum(train_Y == 0))

In [11]:
iterations = 30
print("Iterations:")
for iter in range(iterations):
    print(str(iter) + "/30")
    seed = iter+1
    random.seed(seed)
    train, val, test = data_splitter(df, target_name, split=split, dataset=dataset, fp_config=fp_config, seed=iter+1, dtype=dtype)
    train_X, train_Y = train[:]
    val_X, val_Y = val[:]
    test_X, test_Y = test[:]
    
    class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.array(train_Y))
    #class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
    class_weights_dict = "balanced"

    pos_weight = (sum(train_Y == 1).numpy() / sum(train_Y == 0).numpy())


    if iter == 0:
        print("KNN parameter search...")
        KNN = KNeighborsClassifier()
        knn_best_params = random_param_search(KNN, grid_parameters['KNN'], train_X, train_Y)
        print("SVM parameter search...")
        SVM = svm.SVC(class_weight=class_weights_dict, random_state=seed)
        svm_best_params = random_param_search(SVM, grid_parameters['SVM'], train_X, train_Y)
        print("Random Forest parameter search...")
        RF = RandomForestClassifier(class_weight=class_weights_dict, random_state=seed)
        rf_best_params = random_param_search(RF, grid_parameters['RF'], train_X, train_Y)
        print("XGBoost parameter search...")
        XGB = XGBClassifier(objective="binary:logistic", scale_pos_weight=pos_weight)
        xgb_best_params = random_param_search(XGB, grid_parameters['XGB'], train_X, train_Y)

        print(knn_best_params, svm_best_params, rf_best_params, xgb_best_params)

    SVM = svm.SVC(**svm_best_params, class_weight=class_weights_dict, random_state=seed)
    RF = RandomForestClassifier(**rf_best_params, class_weight=class_weights_dict, random_state=seed)
    XGB = XGBClassifier(**xgb_best_params, objective="binary:logistic", scale_pos_weight=pos_weight, random_state=seed)
    KNN = KNeighborsClassifier(**knn_best_params)
    MLP =  MLPClassifier(hidden_layer_sizes=(num_bits), activation='relu', solver='adam', max_iter=1000)

    models = [SVM, RF, XGB, KNN, MLP]
    for i, model in enumerate(models):
        train_test_model(model, train_X, train_Y, test_X, test_Y, metrics[i])
        

Iterations:
0/30
KNN parameter search...
SVM parameter search...
Random Forest parameter search...
XGBoost parameter search...
{'weights': 'distance', 'n_neighbors': 16, 'metric': 'manhattan'} {'kernel': 'rbf', 'gamma': 'auto', 'degree': 3, 'C': 20} {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 40} {'n_estimators': 100, 'max_depth': 18, 'learning_rate': 0.1}
1/30
2/30
3/30
4/30
5/30
6/30
7/30
8/30
9/30
10/30
11/30
12/30
13/30
14/30
15/30
16/30
17/30
18/30
19/30
20/30
21/30
22/30
23/30
24/30
25/30
26/30
27/30
28/30
29/30


In [12]:
import sklearn
import xgboost

print(f"scikit-learn version: {sklearn.__version__}")
print(f"xgboost version: {xgboost.__version__}")

scikit-learn version: 1.5.2
xgboost version: 2.1.3


-------------------------------------------------------------------------------------------------------------------

In [13]:
metric_names = ['Acc', 'AUC', 'Sn', 'Sp', 'F1', 'Precision']
metrics = [svm_metrics, rf_metrics, xgb_metrics, knn_metrics, mlp_metrics]
metrics_np = np.zeros((len(metrics), 12))

for i, clf in enumerate(metrics):
    metrics_np[i, 0::2] = np.round([np.mean(metric) for metric in clf], 3)
    metrics_np[i, 1::2] = np.round([np.std(metric) for metric in clf], 3)  

columns = []
for name in metric_names:
    columns.extend([f'Mean {name}', f'Std {name}'])

print(metrics_np)
clfs = ["SVM", "RF","XGB", "KNN", "MLP"]
df_clfs = pd.DataFrame(clfs, columns=["Classifier"])
df_metrics = pd.DataFrame(metrics_np, columns=columns)
df = pd.concat([df_clfs, df_metrics], axis=1)

if fp_type in ['maccs', 'pubchem']:
    filename = f"results\\{dirname}\\ml_{fp_type}_{target_name}.csv"

else:
    filename = f"results\\{dirname}\\ml_{fp_type}_{num_bits}_{target_name}.csv"

df.to_csv(filename, index=False)

print(filename)

[[0.778 0.015 0.728 0.026 0.656 0.049 0.801 0.014 0.48  0.036 0.379 0.034]
 [0.86  0.013 0.655 0.027 0.356 0.057 0.954 0.009 0.441 0.052 0.587 0.05 ]
 [0.861 0.012 0.577 0.018 0.163 0.037 0.991 0.004 0.268 0.05  0.783 0.063]
 [0.863 0.013 0.611 0.025 0.244 0.049 0.978 0.005 0.355 0.06  0.666 0.079]
 [0.854 0.014 0.673 0.03  0.409 0.067 0.936 0.013 0.465 0.051 0.546 0.043]]
results\tox21\ml_maccs_SR-ARE.csv
