In [6]:
import pandas as pd

train = pd.read_csv("train.csv")
valid = pd.read_csv("val.csv")

train_valid_merged = pd.concat((train, valid), ignore_index=True)
train_valid_merged.to_csv("train_valid.csv")

In [29]:
# Start with the imports.
import sklearn
import optuna
from optunaz.three_step_opt_build_merge import (
    optimize,
    buildconfig_best,
    build_best,
    build_merged,
)
from optunaz.config import ModelMode, OptimizationDirection
from optunaz.config.optconfig import (
    OptimizationConfig,
)
from optunaz.datareader import Dataset
from optunaz.descriptors import  *
from optunaz.config.optconfig import *

# Prepare hyperparameter optimization configuration.
config = OptimizationConfig(
    data=Dataset(
        input_column="smiles",
        response_column="true_labels_R",
        training_dataset_file="train_valid.csv",
    ),
    descriptors=[
        Avalon.new(),
        ECFP.new(),
        ECFP_counts.new(),
        PathFP.new(),
        AmorProtDescriptors.new(),
        MACCS_keys.new(),
        UnscaledMAPC.new(),
        UnscaledPhyschemDescriptors.new(),
        UnscaledMAPC.new(),
        UnscaledJazzyDescriptors.new(),
        UnscaledZScalesDescriptors.new(),
        SmilesFromFile.new()

    ],
    algorithms=[
        ChemPropClassifier.new(epochs=4),
        RandomForestClassifier.new(),
        PRFClassifier.new(),
        AdaBoostClassifier.new(),
        KNeighborsClassifier.new(),
        SVC.new(),
        ChemPropHyperoptClassifier.new(),
        # Mapie.new()<
    ],
    settings=OptimizationConfig.Settings(
        mode=ModelMode.CLASSIFICATION,
        cross_validation=1,
        scoring="f1",
        n_trials=20,
        random_seed=42,
        direction=OptimizationDirection.MAXIMIZATION,
    ),
)

In [None]:
study = optimize(config, study_name="my_study")

In [3]:
train.true_labels_R.value_counts()

true_labels_R
0    61940
1     3176
Name: count, dtype: int64

In [5]:
import pandas as pd

train = pd.read_csv("val.csv")
train.true_labels_R.value_counts()

true_labels_R
0    20646
1     1059
Name: count, dtype: int64

In [6]:
import pickle

from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, balanced_accuracy_score, average_precision_score, matthews_corrcoef

import pandas as pd


with open("deepmol_predictions.pkl", "rb") as f:
    deepmol_predictions = pickle.load(f)

deepmol_predictions = (deepmol_predictions > 0.5).astype(int)

y_true = pd.read_csv("test.csv").loc[:, "true_labels_R"]
print(f1_score(y_true, deepmol_predictions))
print(precision_score(y_true, deepmol_predictions))
print(recall_score(y_true, deepmol_predictions))
print(average_precision_score(y_true, deepmol_predictions))
print(balanced_accuracy_score(y_true, deepmol_predictions))
print(matthews_corrcoef(y_true, deepmol_predictions))

0.16358839050131926
0.1854066985645933
0.14636449480642116
0.0687825098951754
0.5566915461246363
0.1269288967429531


In [7]:
predictions_qsartuna = pd.read_csv("prediction_qsartuna.csv").loc[:, "Prediction"]
predictions_qsartuna = (predictions_qsartuna > 0.5).astype(int)
predictions_qsartuna
print(f1_score(y_true, predictions_qsartuna))
print(precision_score(y_true, predictions_qsartuna))
print(recall_score(y_true, predictions_qsartuna))
print(average_precision_score(y_true, predictions_qsartuna))
print(balanced_accuracy_score(y_true, predictions_qsartuna))
print(matthews_corrcoef(y_true, predictions_qsartuna))

0.1340909090909091
0.08829691709069142
0.2785646836638338
0.05979242249500215
0.5655221713553574
0.0782275492831921
