# Evaluate Hyperparameter Results

In [1]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from led3_score.fingerprints.fingerprint_generator import FingerprintGenerator
import numpy
import xgboost as xgb
import sklearn.metrics
import pandas

In [2]:

def train_model(training_dataset, learning_rate, max_depth, n_estimators, min_split_loss, y_column):
    """Fit an XGBoost binary classifier and score it on its own training data.

    Args:
        training_dataset: DataFrame holding the fingerprint columns and the
            label column. The features are whatever
            ``FingerprintGenerator.get_fingerprint_columns(..., "fp_")``
            returns (presumably the ``fp_``-prefixed columns; confirm against
            the generator).
        learning_rate: Forwarded to ``XGBClassifier``.
        max_depth: Forwarded to ``XGBClassifier``.
        n_estimators: Forwarded to ``XGBClassifier``.
        min_split_loss: Forwarded to ``XGBClassifier``.
        y_column: Name of the label column in ``training_dataset``.

    Returns:
        ``(model, results)``: the fitted classifier and a dict with the keys
        ``train_accuracy``, ``train_precision``, ``train_recall``,
        ``train_f1`` and ``train_mcc``.
    """
    fingerprint_frame = FingerprintGenerator.get_fingerprint_columns(training_dataset, "fp_")
    features = fingerprint_frame.to_numpy()
    labels = numpy.vstack(training_dataset[y_column].to_numpy()).ravel()

    # Fixed seed and a single thread are part of the original configuration.
    model = xgb.XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",
        random_state=42,
        n_jobs=1,
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        min_split_loss=min_split_loss,
    )
    model.fit(features, labels)
    predictions = model.predict(features)

    # (result key, scoring function) pairs, in the order they are reported.
    scorers = (
        ("train_accuracy", sklearn.metrics.accuracy_score),
        ("train_precision", sklearn.metrics.precision_score),
        ("train_recall", sklearn.metrics.recall_score),
        ("train_f1", sklearn.metrics.f1_score),
        ("train_mcc", sklearn.metrics.matthews_corrcoef),
    )
    results = {}
    for key, scorer in scorers:
        results[key] = scorer(labels, predictions)

    return model, results

def evaluate_model(model, test_dataset, y_column):
    """Score an already fitted classifier on a labelled dataset.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_dataset: DataFrame holding the fingerprint columns and the label
            column. The features are whatever
            ``FingerprintGenerator.get_fingerprint_columns(..., "fp_")``
            returns (presumably the ``fp_``-prefixed columns; confirm against
            the generator).
        y_column: Name of the label column in ``test_dataset``.

    Returns:
        Dict with the keys ``test_accuracy``, ``test_precision``,
        ``test_recall``, ``test_f1`` and ``test_mcc``.
    """
    fingerprint_frame = FingerprintGenerator.get_fingerprint_columns(test_dataset, "fp_")
    features = fingerprint_frame.to_numpy()
    labels = numpy.vstack(test_dataset[y_column].to_numpy()).ravel()

    predictions = model.predict(features)

    # (result key, scoring function) pairs, in the order they are reported.
    scorers = (
        ("test_accuracy", sklearn.metrics.accuracy_score),
        ("test_precision", sklearn.metrics.precision_score),
        ("test_recall", sklearn.metrics.recall_score),
        ("test_f1", sklearn.metrics.f1_score),
        ("test_mcc", sklearn.metrics.matthews_corrcoef),
    )
    results = {}
    for key, scorer in scorers:
        results[key] = scorer(labels, predictions)

    return results

def extract_hyperparameters(hyperparameter_path):
    """Load a hyperparameter-search CSV and return its best run as a one-row frame.

    The best run is the one with the highest ``f1``. When several runs tie,
    the first one in file order is kept so the result always describes exactly
    one configuration. The index is reset, so the row is reachable both by
    position and by label ``0``.

    Args:
        hyperparameter_path: Path to the sweep export CSV. It must contain an
            ``f1`` column and every bookkeeping column dropped below.

    Returns:
        A single-row ``pandas.DataFrame`` without the sweep bookkeeping
        columns, and with the search metrics, run name and dataset path
        renamed to ``hyperparameter_*`` columns.

    Raises:
        ValueError: If no run has a valid (non-NaN) ``f1`` value.
        KeyError: If ``f1`` or one of the dropped columns is missing.
    """
    sweep_runs = pandas.read_csv(hyperparameter_path)

    # Boolean filtering keeps the original row label; take the first best run
    # and reset the index so downstream lookups do not depend on where the
    # best run happened to sit in the CSV.
    best_f1 = sweep_runs["f1"].max()
    best_run = sweep_runs[sweep_runs["f1"] == best_f1].head(1).reset_index(drop=True)
    if best_run.empty:
        raise ValueError(f"no run with a valid f1 score found in {hyperparameter_path}")

    # Drop run bookkeeping and the sweep's search-range definitions; only the
    # chosen parameter values and the achieved metrics are kept.
    best_run = best_run.drop(columns=[
        'Created', 'Runtime', 'End Time', 'Hostname', 'Notes',
        'Updated', 'Tags', 'State', 'User', 'Group', 'Job Type', 'Sweep',
        'Description', 'Commit', 'GitHub', 'GPU Count', 'GPU Type', 'entity', 'method', 'metric.goal',
        'metric.name', 'parameters.learning_rate.max', 'parameters.learning_rate.min',
        'parameters.max_depth.max', 'parameters.max_depth.min',
        'parameters.min_split_loss.max', 'parameters.min_split_loss.min',
        'parameters.n_estimators.max', 'parameters.n_estimators.min', 'program', 'project',
    ])

    # Prefix the search-time values so they cannot collide with the train/test
    # metrics that are added to the same frame later.
    best_run = best_run.rename(columns={
        "f1": "hyperparameter_search_f1",
        "mcc": "hyperparameter_search_mcc",
        "accuracy": "hyperparameter_search_accuracy",
        "precision": "hyperparameter_search_precision",
        "recall": "hyperparameter_search_recall",
        "Name": "hyperparameter_sweep_name",
        "dataset_path": "hyperparameter_training_dataset_path",
    })

    return best_run

def evaluate_hyperparameters(dataset):
    """Refit a model with the given hyperparameters and record its metrics.

    Only the first row of ``dataset`` is used as the configuration. Values are
    read by position (``.iloc[0]``) rather than by label, so the function also
    works on frames whose index does not contain the label ``0`` (for example
    a row picked out of a larger frame by boolean filtering).

    Args:
        dataset: DataFrame with the columns ``train_dataset_path``,
            ``test_dataset_path``, ``chembl_test_dataset_path``, ``y_column``,
            ``learning_rate``, ``max_depth``, ``n_estimators`` and
            ``min_split_loss``. It is modified in place: ``train_*``,
            ``test_*`` and ``chembl_test_*`` metric columns are added.

    Returns:
        ``(dataset, model)``: the same frame with the metric columns added,
        and the fitted classifier.
    """
    metric_names = ("accuracy", "precision", "recall", "f1", "mcc")

    def first(column):
        # Positional access keeps the column's own dtype and ignores index labels.
        return dataset[column].iloc[0]

    y_column = first("y_column")

    # Fit on the training split and record the training metrics.
    training_csv = pandas.read_csv(first("train_dataset_path"))
    model, train_results = train_model(
        training_dataset=training_csv,
        learning_rate=first("learning_rate"),
        max_depth=first("max_depth"),
        n_estimators=first("n_estimators"),
        min_split_loss=first("min_split_loss"),
        y_column=y_column,
    )
    for metric in metric_names:
        dataset[f"train_{metric}"] = train_results[f"train_{metric}"]

    # Score on the test split that belongs to the training data.
    test_csv = pandas.read_csv(first("test_dataset_path"))
    test_results = evaluate_model(model, test_dataset=test_csv, y_column=y_column)
    for metric in metric_names:
        dataset[f"test_{metric}"] = test_results[f"test_{metric}"]

    # Score on the ChEMBL test set; evaluate_model reports its metrics under
    # "test_*" keys, which are stored here with a "chembl_test_" prefix.
    chembl_test_csv = pandas.read_csv(first("chembl_test_dataset_path"))
    chembl_test_results = evaluate_model(model, test_dataset=chembl_test_csv, y_column=y_column)
    for metric in metric_names:
        dataset[f"chembl_test_{metric}"] = chembl_test_results[f"test_{metric}"]

    return dataset, model


In [3]:
import os

def save_results(results: pandas.DataFrame, model: "xgb.XGBClassifier", path: str):
    """Write the results table to ``path`` and the model next to it.

    The model file uses the same location and base name as ``path`` with the
    extension replaced by ``.model``. Only the final extension is replaced, so
    a ``.csv`` occurring elsewhere in the path (e.g. in a directory name) is
    left alone, and a ``path`` without a ``.csv`` extension no longer causes
    the model to overwrite the results file.

    Args:
        results: Table written as CSV without the index.
        model: Fitted model exposing ``save_model(path)``.
        path: Destination of the CSV file; missing parent folders are created.
    """
    # os.makedirs("") raises, so only create a folder when the path names one.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    results.to_csv(path, index=False)

    model_path = os.path.splitext(path)[0] + ".model"
    model.save_model(model_path)

# evaluate models

## caspyrus1k

In [None]:
# led3 labels, caspyrus1k: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/led3_caspyrus1k_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus1k/caspyrus_1k_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus1k/caspyrus_1k_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_led3"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/led3_caspyrus1k_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

In [None]:
# zinc labels, caspyrus1k: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/zinc_caspyrus1k_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus1k/caspyrus_1k_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus1k/caspyrus_1k_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_zinc"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/zinc_caspyrus1k_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

## caspyrus10k

In [None]:
# led3 labels, caspyrus10k: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/led3_caspyrus10k_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus10k/caspyrus_10k_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus10k/caspyrus_10k_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_led3"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/led3_caspyrus10k_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

In [None]:
# zinc labels, caspyrus10k: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/zinc_caspyrus10k_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus10k/caspyrus_10k_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus10k/caspyrus_10k_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_zinc"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/zinc_caspyrus10k_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

## caspyrus20k

In [None]:
# led3 labels, caspyrus20k: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/led3_caspyrus20k_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus20k/caspyrus_20k_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus20k/caspyrus_20k_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_led3"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/led3_caspyrus20k_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

In [None]:
# zinc labels, caspyrus20k: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/zinc_caspyrus20k_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus20k/caspyrus_20k_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus20k/caspyrus_20k_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_zinc"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/zinc_caspyrus20k_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

## caspyrus50k

In [None]:
# led3 labels, caspyrus50k: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/led3_caspyrus50k_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus50k/caspyrus_50k_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus50k/caspyrus_50k_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_led3"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/led3_caspyrus50k_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

In [None]:
# zinc labels, caspyrus50k: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/zinc_caspyrus50k_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus50k/caspyrus_50k_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/caspyrus50k/caspyrus_50k_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_zinc"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/zinc_caspyrus50k_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

## chembl200k

In [None]:
# led3 labels, chembl200k_random: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/led3_chembl200k_random_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/chembl200k_random/chembl_200k_random_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/chembl200k_random/chembl_200k_random_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_led3"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/led3_chembl200k_random_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])

In [None]:
# zinc labels, chembl200k_random: refit with the best hyperparameters from the search and save the results.
hyperparameter_path = "<PATH>/led3_score/data/results/2_synthesis_score/hyperparameter_search/zinc_chembl200k_random_hist_xgboost.csv"
dataset = extract_hyperparameters(hyperparameter_path)

# Run configuration, stored as extra columns next to the hyperparameters.
dataset["train_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/chembl200k_random/chembl_200k_random_train.csv"
dataset["test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/train_data/chembl200k_random/chembl_200k_random_test.csv"
dataset["chembl_test_dataset_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/test_data/test_chembl200k_random/test_chembl200k_results.csv"
dataset["y_column"] = "Y_zinc"
dataset["save_path"] = "<PATH>/led3_score/data/results/2_synthesis_score/models/zinc_chembl200k_random_hist_xgboost.csv"

dataset_results, model = evaluate_hyperparameters(dataset)

# Read the path by position so this works whatever index labels the frame carries.
save_results(dataset_results, model, dataset_results["save_path"].iloc[0])