In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from rdkit import Chem
from sklearn import model_selection, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from torch.utils.data import TensorDataset, DataLoader, random_split
from xgboost import XGBClassifier

from utils import load_dataset_df, smile_to_fp,smiles_to_descriptor,smiles_to_onehot, smiles_to_onehot_selfies, data_splitter

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (c:\Users\knsve\Desktop\MEI\Tese\torch\snn_venv\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-geometri

In [2]:
import sys
from datetime import datetime

# Redirect everything printed by the following cells into a timestamped log file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_path = f"./results/logs/output_{timestamp}.txt"

# The log directory is not guaranteed to exist (e.g. on a fresh checkout);
# without this, open() fails with FileNotFoundError.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Re-running this cell would otherwise leak the log file opened by the previous run.
_previous_log = globals().get("log_file")
if _previous_log is not None and not _previous_log.closed:
    _previous_log.close()

# buffering=1 -> line buffered, so progress lines reach the disk as they are printed
# and are not lost if the kernel dies before the file is closed.
log_file = open(log_path, "w", buffering=1, encoding="utf-8")
sys.stdout = log_file

#### Load DataFrame

In [3]:
# Candidate dataset files; the index on the next line selects the one used in this run.
files = [
    'tox21.csv',
    'sider.csv',
    'BBBP.csv',
]
dt_file = files[1]

# Dataset name = file name without its '.csv' extension (used below to branch per dataset
# and to build the results path).
dirname = dt_file[:-len('.csv')] if dt_file.endswith('.csv') else dt_file

df, targets = load_dataset_df(filename=dt_file)
print(targets)



In [4]:
# tox21 uses the task at index 7 (SR-ARE); every other dataset uses its first task.
target_name = targets[7] if dirname == 'tox21' else targets[0]

# Keep only the label and SMILES columns and drop rows with a missing value in either.
df = df[[target_name, 'smiles']].dropna()

# Task name, sum of the label column and number of rows, one per line.
print(target_name, df[target_name].sum(), df[target_name].size, sep="\n")

#### Molecular Representation

#### SMILES to Fingerprint / Descriptor / One-hot

In [5]:
# Supported molecular representations; change the index below to switch.
representations = [
    "fp",
    "descriptor",
    "SELFIES-1hot",
    "SMILES-1hot",
    # "graph-list",
]

repr_type = representations[1]

In [6]:
# Build ``data_config`` (the options handed to the featurizers) for the chosen
# representation.
#
# ``dim_2`` flags a representation that is reshaped/kept as a 2-D array per molecule.
# It gets a value for every representation so that no later cell reads an undefined
# name or a stale value left over from a previous run with another ``repr_type``.
dim_2 = False

if repr_type == "fp":
    # [fingerprint name, number of bits]
    fp_types = [['morgan', 1024], ['maccs', 167], ['RDKit', 1024], ['count_morgan', 1024], ['pubchem', 881]]
    mix = False
    fp_type, num_bits = fp_types[2]
    if mix and fp_type == 'RDKit':
        # When mixing two fingerprints, RDKit only gets half of the 1024 bits.
        num_bits = 512
    data_config = {"fp_type": fp_type,
                "num_bits": num_bits,
                "radius": 2,
                "fp_type_2": fp_types[0][0],
                # Second fingerprint fills whatever is left of the 1024 bits.
                "num_bits_2": 1024 - num_bits,
                "mix": mix,}
    dim_2 = False
    print(fp_type, '-', num_bits)
    if mix:
        print(data_config['fp_type_2'], '-', data_config['num_bits_2'])
    if dim_2:
        print("2D FP")

elif repr_type == "descriptor":
    desc_type = ["RDKit", "TODO"]
    data_config = {"desc": desc_type[0],
                   "num_bits": 0,
                }

elif repr_type in ("SELFIES-1hot", "SMILES-1hot"):
    # Both one-hot encodings share the same (empty) configuration.
    dim_2 = True
    data_config = {}

else:
    # Previously an unknown value fell through and failed later with a NameError
    # on ``data_config``; fail here with an explicit message instead.
    raise ValueError(f"Unknown representation type: {repr_type!r}")

data_config["repr_type"] = repr_type
print(repr_type)

In [7]:
# Featurize the molecules and wrap features + labels in a TensorDataset.
# For BBBP nothing is built here: ``dataset`` stays None and ``split`` stays "scaffold".
dtype = torch.float32
split = "scaffold"
dataset = None


def _to_tensors(feature_array, label_array):
    """Return (feature tensor of ``dtype``, label tensor cast to long) for featurizer output."""
    features = torch.tensor(feature_array, dtype=dtype)
    labels = torch.tensor(label_array, dtype=dtype).long()
    return features, labels


if dirname != 'BBBP':
    split = "random"
    if repr_type == "fp":
        fp_array, target_array = smile_to_fp(df, data_config=data_config, target_name=target_name)
        fp_tensor, target_tensor = _to_tensors(fp_array, target_array)
        print(fp_tensor.size())
        if dim_2:
            # Reshape each fingerprint into a square image. The side is derived from
            # the fingerprint length (32 for 1024 bits) instead of being hard-coded,
            # so a non-1024-bit fingerprint can no longer silently mix samples.
            n_bits = fp_tensor.shape[-1]
            side = int(round(n_bits ** 0.5))
            if side * side != n_bits:
                raise ValueError(
                    f"Cannot reshape a fingerprint of {n_bits} bits into a square 2D array"
                )
            fp_tensor = fp_tensor.view(-1, side, side)
            print(fp_tensor.size())
        dataset = TensorDataset(fp_tensor, target_tensor)
    elif repr_type == "descriptor":
        desc_array, target_array = smiles_to_descriptor(df, data_config=data_config, target_name=target_name, missing_val=0)
        desc_tensor, target_tensor = _to_tensors(desc_array, target_array)
        dataset = TensorDataset(desc_tensor, target_tensor)
        print(desc_tensor.size())
    elif repr_type == "SELFIES-1hot":
        selfies_array, target_array = smiles_to_onehot_selfies(df, data_config=data_config, target_name=target_name, missing_val=0)
        selfies_tensor, target_tensor = _to_tensors(selfies_array, target_array)
        dataset = TensorDataset(selfies_tensor, target_tensor)
        print(selfies_tensor.size())
    elif repr_type == "SMILES-1hot":
        smiles_array, target_array = smiles_to_onehot(df, data_config=data_config, target_name=target_name, missing_val=0)
        smiles_tensor, target_tensor = _to_tensors(smiles_array, target_array)
        dataset = TensorDataset(smiles_tensor, target_tensor)
        print(smiles_tensor.size())



In [8]:
# Record the feature width in the config. Only the "fp" and "descriptor"
# representations set it; the one-hot representations leave it untouched.
if repr_type == "descriptor":
    # Second dimension of the descriptor tensor = number of descriptor columns.
    data_config["input_size"] = desc_tensor.size(1)
elif repr_type == "fp":
    # A mixed fingerprint is always 1024 bits wide; otherwise use the single
    # fingerprint's own bit count.
    data_config["input_size"] = num_bits if not data_config['mix'] else 1024


In [9]:
# Per-classifier metric stores. Each holds six lists, one per metric, in the order:
#   accuracy, ROC-AUC, sensitivity, specificity, F1, precision
svm_metrics = [[] for _ in range(6)]
rf_metrics = [[] for _ in range(6)]
knn_metrics = [[] for _ in range(6)]
xgb_metrics = [[] for _ in range(6)]
mlp_metrics = [[] for _ in range(6)]
metrics = [
    svm_metrics,
    rf_metrics,
    xgb_metrics,
    knn_metrics,
    mlp_metrics,
]

# Hyper-parameter search spaces, keyed by classifier.
grid_parameters = {
    "SVM": {
        "C": [*range(1, 100)],
        "kernel": ["linear", "rbf", "poly"],
        "gamma": ["scale", "auto"],
        "degree": [2, 3, 4],
    },
    "RF": {
        "max_depth": [5, *range(10, 100, 10)],
        "n_estimators": [*range(50, 400, 50)],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "XGB": {
        "learning_rate": [0.005, 0.01, 0.1, 0.2],
        "max_depth": range(2, 20, 2),
        "n_estimators": range(50, 400, 50),
    },
    "KNN": {
        "n_neighbors": [*range(1, 20)],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan", "minkowski"],
    },
    "MLP": {
        "hidden_layer_sizes": [(512,), (1024,), (512, 256), (1024, 512), (1024, 256)],
        "activation": ["relu"],
        "solver": ["adam"],
        "alpha": [0, 0.0001, 0.001],  # L2 regularization strength
        "learning_rate": ["constant", "adaptive"],
        "early_stopping": [True],
        "max_iter": [1000],
    },
}

# Best hyper-parameters per classifier; filled in by the parameter search.
knn_best_params = []
svm_best_params = []
rf_best_params = []
xgb_best_params = []
mlp_best_params = []
 

In [10]:
def calculate_metrics(metrics_list, y_pred, y_true, y_score=None):
    """Compute binary-classification metrics and append one value to each metric list.

    Args:
        metrics_list: Sequence of six lists. One value is appended to each, in the
            order accuracy, ROC-AUC, sensitivity, specificity, F1, precision.
        y_pred: Predicted hard labels (0/1).
        y_true: Ground-truth labels (0/1).
        y_score: Optional continuous scores (probabilities or decision values) for
            the positive class. When given, ROC-AUC is computed from them; when
            omitted, ROC-AUC is computed from ``y_pred`` exactly as before.

    Returns:
        None. ``metrics_list`` is modified in place.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # NOTE: with hard labels the ROC curve has a single operating point; pass
    # ``y_score`` to get a threshold-independent AUC.
    auc_roc = roc_auc_score(y_true, y_pred if y_score is None else y_score)

    # Pin the label order so the matrix is always 2x2 and the unpacking below is
    # (tn, fp, fn, tp) regardless of which labels happen to be present.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # An empty class makes the ratio undefined; report NaN explicitly instead of
    # relying on a 0/0 division (and its runtime warning).
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float("nan")
    specificity = tn / (tn + fp) if (tn + fp) > 0 else float("nan")

    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)

    values = (accuracy, auc_roc, sensitivity, specificity, f1, precision)
    for store, value in zip(metrics_list, values):
        store.append(value)

In [11]:
def minmax_norm(train_subset, val_subset, test_subset):
    """Min-max scale the features of three dataset splits using training statistics.

    Each argument must support ``subset[:]`` and return a ``(features, labels)``
    pair. Per-column minimum and range are taken from the training features only
    and applied to all three splits; results are clipped to ``[0, 1]``.

    Returns:
        Tuple ``(train_norm, val_norm, test_norm)`` of scaled feature tensors.
        Labels are not returned.
    """

    def _features(subset):
        # Materialise the whole split and keep only the feature tensor.
        feats, _ = subset[:]
        return feats

    train_feats = _features(train_subset)
    val_feats = _features(val_subset)
    test_feats = _features(test_subset)

    lower = train_feats.min(dim=0).values
    upper = train_feats.max(dim=0).values
    # Constant columns have zero range; clamp it to avoid dividing by zero.
    span = (upper - lower).clamp(min=1e-6)

    def _scale(feats):
        # Values outside the training range are clipped to the [0, 1] interval.
        return ((feats - lower) / span).clamp(0.0, 1.0)

    return _scale(train_feats), _scale(val_feats), _scale(test_feats)

In [12]:
def train_test_model(model, train_X, train_Y, test_X, test_Y, metrics_list):
    """Fit ``model`` on the training split and record its test-split metrics.

    An ``MLPClassifier`` is fitted with 'balanced' per-sample weights computed
    from ``train_Y``; every other model is fitted without extra arguments.
    The test predictions are passed to ``calculate_metrics``, which appends the
    results to ``metrics_list``.
    """
    fit_kwargs = {}
    if isinstance(model, MLPClassifier):
        fit_kwargs["sample_weight"] = compute_sample_weight(class_weight='balanced', y=train_Y)

    model.fit(train_X, train_Y, **fit_kwargs)

    calculate_metrics(
        metrics_list=metrics_list,
        y_true=test_Y,
        y_pred=model.predict(test_X),
    )

In [13]:
def random_param_search(model, grid_param, train_X, train_Y):
    """Run a randomized hyper-parameter search and return the best parameter set.

    Args:
        model: Estimator to tune.
        grid_param: Mapping of parameter names to candidate values.
        train_X: Training features.
        train_Y: Training labels.

    Returns:
        The ``best_params_`` dict of the fitted ``RandomizedSearchCV`` (40 sampled
        candidates, scored by ROC-AUC, fixed ``random_state=42``).
    """
    fit_kwargs = {}
    if isinstance(model, MLPClassifier):
        # The MLP is fitted with 'balanced' per-sample weights.
        fit_kwargs["sample_weight"] = compute_sample_weight(class_weight='balanced', y=train_Y)

    searcher = RandomizedSearchCV(
        model,
        grid_param,
        n_iter=40,
        scoring='roc_auc',
        random_state=42,
        verbose=1,
    )
    searcher.fit(train_X, train_Y, **fit_kwargs)
    return searcher.best_params_

In [14]:
#pos_weight = (sum(train_Y == 1) / sum(train_Y == 0))

In [15]:
# Repeated train/test evaluation of all classifiers.
# Run ``run_idx`` uses seed ``run_idx + 1``; hyper-parameters are searched once, on
# the training split of the first run, and reused for every run.
iterations = 30

# Start from empty metric stores so that re-running this cell does not append a
# second batch of results on top of those from an earlier run.
for clf_metrics in metrics:
    for metric_values in clf_metrics:
        metric_values.clear()

print("Iterations:")
for run_idx in range(iterations):  # not named ``iter``: that shadows the builtin
    print(f"{run_idx}/{iterations}", flush=True)
    seed = run_idx + 1
    # Seed every RNG in play, not only Python's, so unseeded estimators and any
    # numpy/torch-based shuffling are reproducible as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    train, val, test = data_splitter(df, target_name, split=split, dataset=dataset, data_config=data_config, seed=seed, dtype=dtype)
    train_X, train_Y = train[:]
    val_X, val_Y = val[:]
    test_X, test_Y = test[:]
    if repr_type == "descriptor":
        # Descriptors are rescaled with statistics from the training split.
        train_X, val_X, test_X = minmax_norm(train, val, test)

    # Explicit class weights; currently unused because the estimators below are
    # given the "balanced" shortcut instead.
    class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=np.array(train_Y))
    #class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
    class_weights_dict = "balanced"

    # XGBoost's scale_pos_weight scales the positive class, so balancing needs
    # n_negative / n_positive. The previous n_positive / n_negative ratio
    # down-weighted a minority positive class even further.
    n_pos = int((train_Y == 1).sum())
    n_neg = int((train_Y == 0).sum())
    pos_weight = n_neg / n_pos if n_pos > 0 else 1.0

    if run_idx == 0:
        print("MLP parameter search...", flush=True)
        MLP = MLPClassifier(random_state=seed)
        mlp_best_params = random_param_search(MLP, grid_parameters['MLP'], train_X, train_Y)
        print("XGBoost parameter search...", flush=True)
        XGB = XGBClassifier(objective="binary:logistic", scale_pos_weight=pos_weight, random_state=seed)
        xgb_best_params = random_param_search(XGB, grid_parameters['XGB'], train_X, train_Y)
        print("KNN parameter search...", flush=True)
        KNN = KNeighborsClassifier()
        knn_best_params = random_param_search(KNN, grid_parameters['KNN'], train_X, train_Y)
        print("SVM parameter search...", flush=True)
        SVM = svm.SVC(class_weight=class_weights_dict, random_state=seed, probability=False)
        svm_best_params = random_param_search(SVM, grid_parameters['SVM'], train_X, train_Y)
        print("Random Forest parameter search...", flush=True)
        RF = RandomForestClassifier(class_weight=class_weights_dict, random_state=seed)
        rf_best_params = random_param_search(RF, grid_parameters['RF'], train_X, train_Y)

        # Log the selected parameters of every model (the MLP's were missing before).
        print(knn_best_params, svm_best_params, rf_best_params, xgb_best_params, mlp_best_params)

    SVM = svm.SVC(**svm_best_params, class_weight=class_weights_dict, random_state=seed, probability=False)
    RF = RandomForestClassifier(**rf_best_params, class_weight=class_weights_dict, random_state=seed)
    XGB = XGBClassifier(**xgb_best_params, objective="binary:logistic", scale_pos_weight=pos_weight, random_state=seed)
    KNN = KNeighborsClassifier(**knn_best_params)
    MLP = MLPClassifier(**mlp_best_params, random_state=seed)

    # Order must match ``metrics``: SVM, RF, XGB, KNN, MLP.
    models = [SVM, RF, XGB, KNN, MLP]
    for i, model in enumerate(models):
        train_test_model(model, train_X, train_Y, test_X, test_Y, metrics[i])
    for m in metrics:
        # Index 1 is the ROC-AUC list; print the value just recorded for this run.
        print(m[1][-1], flush=True)
        

[WinError 2] The system cannot find the file specified
  File "c:\Users\knsve\Desktop\MEI\Tese\torch\snn_venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\knsve\.pyenv\pyenv-win\versions\3.10.0\lib\subprocess.py", line 501, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\knsve\.pyenv\pyenv-win\versions\3.10.0\lib\subprocess.py", line 966, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\knsve\.pyenv\pyenv-win\versions\3.10.0\lib\subprocess.py", line 1435, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


-------------------------------------------------------------------------------------------------------------------

In [16]:
# Summarise the per-run metrics (mean / std per classifier) and export both the
# summary and the raw per-seed values to a single CSV file.
metric_names = ['Acc', 'AUC', 'Sn', 'Sp', 'F1', 'Precision']
metrics = [svm_metrics, rf_metrics, xgb_metrics, knn_metrics, mlp_metrics]
clfs = ["SVM", "RF", "XGB", "KNN", "MLP"]  # same order as ``metrics``

# Two columns per metric: "Mean <name>" followed by "Std <name>".
columns = []
for name in metric_names:
    columns.extend([f'Mean {name}', f'Std {name}'])

# Width follows the metric list instead of a hard-coded 12.
metrics_np = np.zeros((len(metrics), len(columns)))
for i, clf in enumerate(metrics):
    metrics_np[i, 0::2] = np.round([np.mean(metric) for metric in clf], 3)
    metrics_np[i, 1::2] = np.round([np.std(metric) for metric in clf], 3)

print(metrics_np)
df_clfs = pd.DataFrame(clfs, columns=["Classifier"])
df_metrics = pd.DataFrame(metrics_np, columns=columns)
# Use a dedicated name: assigning the summary to ``df`` overwrote the molecule
# DataFrame and broke any re-run of the earlier cells.
df_summary = pd.concat([df_clfs, df_metrics], axis=1)
# Three empty rows used as a visual separator between sections of the CSV.
blank = pd.DataFrame([[""] * len(columns)] * 3, columns=columns)

if repr_type == "fp":
    if data_config['mix']:
        file_stem = f"ml_{fp_type}_{data_config['fp_type_2']}_{target_name}"
    elif fp_type in ['maccs', 'pubchem']:
        file_stem = f"ml_{fp_type}_{target_name}"
    else:
        file_stem = f"ml_{fp_type}_{num_bits}_{target_name}"
elif repr_type == "descriptor":
    # NOTE(review): 217 is hard-coded in the file name; the actual descriptor count
    # is stored in data_config["input_size"] - confirm they agree.
    file_stem = f"ml_desc_217_{target_name}"
else:
    # The one-hot representations had no branch, leaving ``filename`` undefined
    # (or stale from an earlier run, silently overwriting that run's file).
    file_stem = f"ml_{repr_type}_{target_name}"

# os.path.join gives the same "results\<dataset>\<file>" path on Windows and a
# valid path on other platforms.
filename = os.path.join("results", dirname, f"{file_stem}.csv").replace(" ", "_")

# to_csv does not create missing directories.
out_dir = os.path.dirname(filename)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

df_summary.to_csv(filename, index=False)
for clf_name, clf_metrics in zip(clfs, metrics):
    df_raw = pd.DataFrame({name: clf_metrics[j] for j, name in enumerate(metric_names)})
    # Seeds are 1..number of runs; derive the count from the data so a change of
    # ``iterations`` does not raise a length mismatch here.
    df_raw["Seed"] = list(range(1, len(df_raw) + 1))
    df_raw = df_raw[["Seed"] + metric_names]  # reorder columns
    blank.to_csv(filename, mode='a', index=False, header=False)
    pd.DataFrame([["Classifier: " + clf_name]], columns=[df_raw.columns[0]]).to_csv(filename, mode='a', index=False, header=False)
    df_raw.to_csv(filename, mode='a', index=False)

print(filename, flush=True)