In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, random_split
from xgboost import XGBClassifier

from utils import load_dataset_df, smile_to_fp, calc_metrics

#### Load DataFrame

In [None]:
# Candidate dataset files; the index used on the next line selects the one
# processed by this run.
files = ['tox21.csv','sider.csv']
dt_file = files[0]
# Dataset name without the '.csv' extension. It drives the target selection
# and the results path in later cells. `str.removesuffix` needs Python 3.9+.
dirname = dt_file.removesuffix('.csv')

# `load_dataset_df` comes from the project's `utils` module; it returns the
# data frame plus a sequence of target names that later cells index into.
df, targets = load_dataset_df(filename=dt_file)

In [None]:
# Pick the single endpoint to model for the selected dataset.
if dirname == 'tox21':
    # SR-ARE
    target_name = targets[7]
elif dirname == 'sider':
    # Hepatobiliary disorders
    target_name = targets[0]
else:
    # Fail loudly for an unconfigured dataset. Without this branch the cell
    # would either leave `target_name` undefined or, in a long-lived kernel,
    # silently keep the value from a previous run with another dataset.
    raise ValueError(f"No target configured for dataset {dirname!r}")

# Keep only the label column and the SMILES strings, dropping every row that
# has a missing value in either of them.
df = df[[target_name, 'smiles']].dropna()

#### SMILES to Fingerprint

In [None]:
# Supported fingerprints as [name, bit length] pairs.
fp_types = [['morgan', 1024], ['maccs', 167], ['RDKit', 1024]]
mix = True
fp_type, num_bits = fp_types[1]

# When mixing, the primary fingerprint shares a 1024-bit budget with a
# secondary one (see `num_bits_2` below).
if mix:
    if fp_type == 'RDKit':
        num_bits = 512
    elif fp_type == 'morgan':  # keep morgan as 2nd MF
        mix = False

fp_config = dict(
    fp_type=fp_type,
    num_bits=num_bits,
    radius=2,
    fp_type_2=fp_types[0][0],
    num_bits_2=1024 - num_bits,
    mix=mix,
)

# Report the chosen configuration.
print(fp_type, '-', num_bits)
if mix:
    print(fp_config['fp_type_2'], '-', fp_config['num_bits_2'])

In [None]:
# Element type used when the arrays are first converted to tensors.
dtype = torch.float32
# NOTE(review): `split` is not read by any cell visible in this notebook;
# presumably a leftover switch for the splitting strategy - confirm.
split = "random"
# Reset so a failure below cannot leave a stale dataset from an earlier run.
dataset = None

# `smile_to_fp` (project `utils`) returns the fingerprint features and the
# values of `target_name` for the rows of `df`, in that order.
fp_array, target_array = smile_to_fp(df, fp_config=fp_config, target_name=target_name)
# Create Torch Dataset: features stay float32, while the targets are built as
# float32 and then cast to int64 with `.long()` to serve as class labels.
fp_tensor = torch.tensor(fp_array, dtype=dtype)
target_tensor = torch.tensor(target_array, dtype=dtype).long()
dataset = TensorDataset(fp_tensor, target_tensor)

In [None]:
#Metrics ---- roc  acc f1  prs sns sps
# One list per metric and six metrics per classifier; `calc_metrics` receives
# one of these containers for every evaluated model.
# NOTE(review): the results cell labels the same six slots as
# Acc, AUC, Sn, Sp, F1, Precision - confirm the real order in utils.calc_metrics.
svm_metrics = [[] for _ in range(6)]
rf_metrics = [[] for _ in range(6)]
knn_metrics = [[] for _ in range(6)]
xgb_metrics = [[] for _ in range(6)]
mlp_metrics = [[] for _ in range(6)]
# Order matters: it is zipped positionally with the list of fitted models.
metrics = [svm_metrics, rf_metrics, xgb_metrics, knn_metrics, mlp_metrics]

# Search spaces sampled by the randomized hyper-parameter search, keyed by
# classifier.
grid_parameters = {
    "SVM": {
        "C": [*range(1, 100)],
        "kernel": ["linear", "rbf", "poly"],
        "gamma": ["scale", "auto"],
        "degree": [2, 3, 4],
    },
    "RF": {
        "max_depth": [5, *range(10, 100, 10)],
        "n_estimators": [*range(50, 400, 50)],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "XGB": {
        "learning_rate": [0.005, 0.01, 0.1, 0.2],
        "max_depth": range(2, 20, 2),
        "n_estimators": range(50, 400, 50),
    },
    "KNN": {
        "n_neighbors": [*range(1, 20)],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan", "minkowski"],
    },
}

# Placeholders, overwritten with the tuned settings on the first iteration.
knn_best_params, svm_best_params, rf_best_params, xgb_best_params = [], [], [], []
 

In [None]:
def train_test_model(model, train_X, train_Y, test_X, test_Y, metrics_list):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_X, train_Y: Training features and labels.
        test_X, test_Y: Held-out features and labels.
        metrics_list: Container handed to ``calc_metrics``; presumably the
            metric values are recorded into it in place (confirm in ``utils``).

    Returns:
        None.
    """
    model.fit(train_X, train_Y)

    # Hard class predictions (not probabilities) are what gets scored.
    calc_metrics(
        metrics_list=metrics_list,
        all_targets=test_Y,
        all_preds=model.predict(test_X),
    )

In [None]:
def random_param_search(model, grid_param, train_X, train_Y):
    """Tune ``model`` with a randomized search and return the best settings.

    Forty candidates are sampled from ``grid_param`` and ranked by ROC AUC.
    The sampling uses a fixed ``random_state`` of 42, and the cross-validation
    scheme is left at the scikit-learn default.

    Args:
        model: Unfitted estimator to tune.
        grid_param: Mapping of parameter name to candidate values.
        train_X, train_Y: Data the search is fitted on.

    Returns:
        The ``best_params_`` dictionary of the fitted search.
    """
    search_settings = {"n_iter": 40, "scoring": 'roc_auc', "random_state": 42}
    tuner = RandomizedSearchCV(model, grid_param, **search_settings)
    tuner.fit(train_X, train_Y)
    return tuner.best_params_

In [None]:
# Repeated random-split evaluation: every iteration draws a fresh 80/10/10
# split from its own seed, rebuilds all five classifiers and scores them on
# the test part. Hyper-parameters are tuned once, on the first split, and then
# reused for every following split.
iterations = 30
print("Iterations:")
for run_idx in range(iterations):
    # The denominator follows `iterations` so the counter stays correct when
    # the number of repetitions is changed.
    print(f"{run_idx}/{iterations}")
    seed = run_idx + 1
    random.seed(seed)
    generator = torch.Generator().manual_seed(int(seed))
    train, val, test = random_split(dataset, [0.8, 0.1, 0.1], generator=generator)
    train_X, train_Y = train[:]
    # The validation part is split off but not consumed by the models below.
    val_X, val_Y = val[:]
    test_X, test_Y = test[:]

    # Explicit balanced weights are kept for inspection; the estimators below
    # receive the "balanced" keyword instead.
    class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.array(train_Y))
    class_weights_dict = "balanced"
    # XGBoost's recommended `scale_pos_weight` is negatives / positives, which
    # up-weights the positive class when it is the minority.
    n_pos = int((train_Y == 1).sum())
    n_neg = int((train_Y == 0).sum())
    # Fall back to a neutral weight if the split contains no positive sample.
    pos_weight = n_neg / n_pos if n_pos else 1.0

    if run_idx == 0:
        print("XGBoost parameter search...")
        XGB = XGBClassifier(objective="binary:logistic", scale_pos_weight=pos_weight)
        xgb_best_params = random_param_search(XGB, grid_parameters['XGB'], train_X, train_Y)
        print("KNN parameter search...")
        KNN = KNeighborsClassifier()
        knn_best_params = random_param_search(KNN, grid_parameters['KNN'], train_X, train_Y)
        print("SVM parameter search...")
        SVM = svm.SVC(class_weight=class_weights_dict, random_state=seed)
        svm_best_params = random_param_search(SVM, grid_parameters['SVM'], train_X, train_Y)
        print("Random Forest parameter search...")
        RF = RandomForestClassifier(class_weight=class_weights_dict, random_state=seed)
        rf_best_params = random_param_search(RF, grid_parameters['RF'], train_X, train_Y)

        print(knn_best_params, svm_best_params, rf_best_params, xgb_best_params)

    SVM = svm.SVC(**svm_best_params, class_weight=class_weights_dict, random_state=seed)
    RF = RandomForestClassifier(**rf_best_params, class_weight=class_weights_dict, random_state=seed)
    XGB = XGBClassifier(**xgb_best_params, objective="binary:logistic", scale_pos_weight=pos_weight, random_state=seed)
    KNN = KNeighborsClassifier(**knn_best_params)
    # One hidden layer as wide as the primary fingerprint, written as a real
    # tuple; seeded like the other models so each run is reproducible.
    MLP = MLPClassifier(hidden_layer_sizes=(num_bits,), activation='relu', solver='adam', max_iter=1000,
                        random_state=seed)

    # Fit and score inside the loop so every split contributes one value per
    # metric; outside the loop only the final split would be evaluated.
    # The order must match the order of the `metrics` containers.
    models = [SVM, RF, XGB, KNN, MLP]
    for i, model in enumerate(models):
        train_test_model(model, train_X, train_Y, test_X, test_Y, metrics[i])

-------------------------------------------------------------------------------------------------------------------

In [None]:
# Labels for the six metric slots of every classifier.
# NOTE(review): the comment where the metric containers are created lists the
# slots as roc, acc, f1, prs, sns, sps - a different order from the labels
# below. Confirm against utils.calc_metrics before trusting the column headers.
metric_names = ['Acc', 'AUC', 'Sn', 'Sp', 'F1', 'Precision']
metrics = [svm_metrics, rf_metrics, xgb_metrics, knn_metrics, mlp_metrics]
# Two columns per metric (mean, std), interleaved; the width is derived from
# `metric_names` so the table and its headers cannot drift apart.
metrics_np = np.zeros((len(metrics), 2 * len(metric_names)))

for i, clf in enumerate(metrics):
    metrics_np[i, 0::2] = np.round([np.mean(metric) for metric in clf], 3)
    metrics_np[i, 1::2] = np.round([np.std(metric) for metric in clf], 3)

columns = [label for name in metric_names for label in (f'Mean {name}', f'Std {name}')]

print(metrics_np)
# Row labels, in the same order as `metrics`.
clfs = ["SVM", "RF","XGB", "KNN", "MLP"]
df_clfs = pd.DataFrame(clfs, columns=["Classifier"])
df_metrics = pd.DataFrame(metrics_np, columns=columns)
# Note: this rebinds `df`, which until here held the dataset frame.
df = pd.concat([df_clfs, df_metrics], axis=1)

# The file stem encodes the fingerprint setup: both fingerprint names when
# mixing, no bit count for the fixed-size MACCS keys, name and bit count otherwise.
if fp_config['mix']:
    stem = f"ml_{fp_type}_{fp_config['fp_type_2']}_{target_name}"
elif fp_type == 'maccs':
    stem = f"ml_{fp_type}_{target_name}"
else:
    stem = f"ml_{fp_type}_{num_bits}_{target_name}"

# Use the platform's separator; a literal backslash is only a directory
# separator on Windows and would end up inside the file name elsewhere.
filename = os.path.join("results", dirname, f"{stem}.csv")

#df.to_csv(filename, index=False)

print(filename)