In [13]:
# global settings
import random
import numpy as np
import os

N_CPU = 12 # number of CPUs for parallel operations
random_state = 42
#random.seed(random_state)
np.random.seed(random_state)

os.environ["TF_DETERMINISTIC_OPS"] = str(random_state)

# logging settings
from qsprpred.logs.utils import enable_file_logger, export_conda_environment
logSettings = enable_file_logger(
    log_folder = 'qspr/models',
    filename = 'Modelling.log',
    debug = False,
    disable_existing_loggers = False
)

# save the environment to a yaml file
export_conda_environment("qspr/models/environment.yml")


Environment exported to qspr/models/environment.yml successfully!


In [14]:
# Already run this tutorial before? You can reload your data/models by running this cell and uncommenting the models/data you need.
from qsprpred.data.data import QSPRDataset
from qsprpred.models.interfaces import QSPRModel

# For the regression part of the tutorial
#dataset = QSPRDataset.fromFile('./qspr/data/tutorial_data_df.pkl')
# model = QSPRModel.fromFile('./qspr/models/PLS_REG/PLS_REG_meta.json')

# For the classification part of the tutorial
# dataset = QSPRDataset.fromFile('./qspr/data/A2A_LIGANDS_df.pkl')
# fitted_models = [QSPRModel.fromFile('./qspr/models/ExtraTreesClassifier/ExtraTreesClassifier_meta.json'),
#                  QSPRModel.fromFile('./qspr/models/RandomForestClassifier/RandomForestClassifier_meta.json')]

## Data Sets

QSPRPred package defines the `QSPRDataset` class, which is used to manage data and supply it to the models of interest (see [data_preparation](./data_preparation.ipynb) and [data_preparation_advanced](./data_preparation_advanced.ipynb)). We already assume you are familiar with these data structures, and we will use example data sets that are loaded automatically via the `datasets.py` module defined in the current folder. Feel free to examine this code to see how the data is loaded and preprocessed.

## Building Models - Regression

Here, we will show how to train a simple single task regression model with QSPRPred.

### Preparing the Data

We will load the `Parkinsons` data set from `datasets.py`:

In [15]:
from datasets import Parkinsons

dataset = Parkinsons(random_state=random_state)
dataset.getDF()

Number of samples per target:
GABAAalpha    6280
NMDA          4073
P41594        2730
Q14416        1342
Q13255         975
Q14833         856
Q14832         172
O00222         153
O15303         104
Q14831          89
Q14643          12
Name: accession, dtype: int64


  super().__init__(


accession,SMILES,GABAAalpha,NMDA,O00222,O15303,P41594,Q13255,Q14416,Q14643,Q14831,...,QSPRID,O00222_imputed,O15303_imputed,P41594_imputed,Q13255_imputed,Q14416_imputed,Q14831_imputed,Q14832_imputed,Q14833_imputed,Split_IsTrain
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tutorial_data_98,C(#Cc1ncn2c1COc1ccccc1-2)c1ccccc1,5.92,,,,6.975,,,,,...,tutorial_data_98,5.070667,4.955192,6.975,6.286032,6.824711,4.752146,6.044017,5.833706,False
tutorial_data_99,C(#Cc1ncn2c1COc1ccccc1-2)c1cccnc1,6.35,,,,6.5,,,,,...,tutorial_data_99,5.070667,4.955192,6.5,6.286032,6.824711,4.752146,6.044017,5.833706,False
tutorial_data_2318,CN1Cc2c(C#Cc3ccccc3)ncn2-c2cccc(Cl)c2C1=O,9.0,,,,5.95,,,,,...,tutorial_data_2318,5.070667,4.955192,5.95,6.286032,6.824711,4.752146,6.044017,5.833706,True
tutorial_data_2319,CN1Cc2c(C#Cc3ccccc3)ncn2-c2ccc(F)cc2C1=O,8.4,,,,6.48,,,,,...,tutorial_data_2319,5.070667,4.955192,6.48,6.286032,6.824711,4.752146,6.044017,5.833706,True
tutorial_data_4131,Cc1cccc(C#Cc2ncn3c2COc2ccccc2-3)c1,6.47,,,,7.36,,,,,...,tutorial_data_4131,5.070667,4.955192,7.36,6.286032,6.824711,4.752146,6.044017,5.833706,True
tutorial_data_4132,Cc1cc(C#Cc2ncn3c2COc2ccccc2-3)ccn1,6.47,,,,7.9,,,,,...,tutorial_data_4132,5.070667,4.955192,7.9,6.286032,6.824711,4.752146,6.044017,5.833706,True
tutorial_data_4141,Cc1cc(C#Cc2cn(-c3cccc(S(C)(=O)=O)c3)c(C)n2)ccn1,5.5,,,,6.8,,,,,...,tutorial_data_4141,5.070667,4.955192,6.8,6.286032,6.824711,4.752146,6.044017,5.833706,True
tutorial_data_4142,Cc1cc(C#Cc2cn(-c3ccc(C#N)cc3)c(C)n2)ccn1,5.5,,,,7.12,,,,,...,tutorial_data_4142,5.070667,4.955192,7.12,6.286032,6.824711,4.752146,6.044017,5.833706,True
tutorial_data_4143,Cc1cc(C#Cc2cn(-c3ccc(F)cc3F)c(C)n2)ccn1,5.5,,,,7.255,,,,,...,tutorial_data_4143,5.070667,4.955192,7.255,6.286032,6.824711,4.752146,6.044017,5.833706,False
tutorial_data_4144,Cc1cc(C#Cc2cn(-c3ccc(S(C)(=O)=O)cc3)c(C)n2)ccn1,5.5,,,,6.55,,,,,...,tutorial_data_4144,5.070667,4.955192,6.55,6.286032,6.824711,4.752146,6.044017,5.833706,True


We will build a regression model for the GABA(A) receptor subunit alpha (GABAAalpha).
Therefore, we initialize our QSPRdataset as regression for "GABAAalpha".
After this we need to do some processing of the data.
We need to calculate compound features, split our dataset into a train and test set.

In [16]:
from qsprpred.data.utils.descriptorsets import FingerprintSet
from qsprpred.data.utils.descriptorcalculator import MoleculeDescriptorsCalculator
from sklearn.preprocessing import StandardScaler as Scaler
from qsprpred.data.utils.datasplitters import RandomSplit

# Calculate MorganFP and physicochemical properties
feature_calculator = MoleculeDescriptorsCalculator(desc_sets = [FingerprintSet(fingerprint_type="MorganFP", radius=3, nBits=2048)])

# Do a random split for creating the train (85%) and test set (15%)
rand_split = RandomSplit(test_fraction=0.2, dataset=dataset)

# calculate compound features and split dataset into train and test
dataset.prepareDataset(
    split=rand_split,
    feature_calculators=[feature_calculator],
    feature_standardizer=Scaler()
)

print(f"Number of samples train set: {len(dataset.y)}")
print(f"Number of samples test set: {len(dataset.y_ind)}")

Molecular descriptors already exist in tutorial_data. Use `recalculate=True` to overwrite them.
Missing values filled with nan


Number of samples train set: 17
Number of samples test set: 5


The `prepareDataset` function is shorthand method that can be used to perform multiple steps at once, but these steps can also be performed individually. For example, we can calculate the features and split the dataset separately by several calls to various methods of the `QSPRDataset` class. The following code should be equivalent to the previous one:

```python

# Calculate MorganFP and physicochemical properties
feature_calculator = MoleculeDescriptorsCalculator(desc_sets = [FingerprintSet(fingerprint_type="MorganFP", radius=3, nBits=2048)])
dataset.addDescriptors(feature_calculator, featurize=False)
dataset.fillMissingValues()
dataset.splitDataset(rand_split)
dataset.setFeatureStandardizer(Scaler())
self.featurizeSplits()
```

In [17]:
# Let's save the dataset for later
dataset.save()

### Training the Model

After preparing our dataset, we will train a QSPR regression model.
In this tutorial we will train a XGBoost model as it generally performs well, but there are other model types
available. Most machine learning models have tunable hyperparameters (for example depth & learning rate), as some hyperparameter combinations will lead to better performance on a task we perform hyperparameter optimization (in which different combinations are tested and evaluated on a subset of the training data). After finding good hyperparameters a model will be trained on the training data & evaluated on the test data. In addition to this the final model will be trained on all data

In [22]:
from qsprpred.models.models import QSPRsklearn
from sklearn.cross_decomposition import PLSRegression
from qsprpred.models.hyperparam_optimization import OptunaOptimization
from qsprpred.models.assessment_methods import CrossValAssessor, TestSetAssessor
from qsprpred.models.metrics import SklearnMetric

# This is an SKlearn model, so we will initialize it with the QSPRsklearn class
model = QSPRsklearn(base_dir = 'qspr/models/', data=dataset, alg = PLSRegression, name='PLS_REG')

# We will first optimize the hyperparameters (n_components and scale) through bayes optimization
# the best hyperparameter combination will be saved in PLS_REG_GABAAalpha_params.json
score_func = SklearnMetric.getDefaultMetric(model.task)
search_space_bs = {"n_components": ["int", 1, 30], "scale": ["categorical", [True, False]]}
bayesoptimizer = OptunaOptimization(scoring = score_func, param_grid=search_space_bs,
                                    n_trials=5, n_jobs=4)
best_params = bayesoptimizer.optimize(model)

#Then we will evaluate the performance of the best model using the independent test set
CrossValAssessor()(model)
TestSetAssessor()(model)

# Finally, we need to fit the model on the complete dataset if we want to use it further
# model is saved under qsprmodels/PLS_REG_GABAAalpha.pkg
model.fitAttached()

At the moment n_jobs>1 not available for bayes optimization, n_jobs set to 1.
[I 2023-08-10 19:16:38,328] A new study created in memory with name: no-name-c175329c-21f6-4c8c-b5ee-2c61f6537660
[I 2023-08-10 19:16:38,413] Trial 0 finished with value: 0.6658313393644304 and parameters: {'n_components': 23, 'scale': True}. Best is trial 0 with value: 0.6658313393644304.
[I 2023-08-10 19:16:38,475] Trial 1 finished with value: 0.6654643047122633 and parameters: {'n_components': 5, 'scale': True}. Best is trial 0 with value: 0.6658313393644304.
[I 2023-08-10 19:16:38,539] Trial 2 finished with value: 0.6640508933548628 and parameters: {'n_components': 4, 'scale': True}. Best is trial 0 with value: 0.6658313393644304.
[I 2023-08-10 19:16:38,611] Trial 3 finished with value: 0.6658313359530796 and parameters: {'n_components': 18, 'scale': False}. Best is trial 0 with value: 0.6658313393644304.
[I 2023-08-10 19:16:38,672] Trial 4 finished with value: 0.6640508933548628 and parameters: {'n_compo

'qspr/models/PLS_REG/PLS_REG_meta.json'

## Evaluating the results

In [None]:
# we need to sve the model first
model.save()

Plotting the results of our model on the test set we can see that it is performing reasonably well.

In [None]:
from qsprpred.plotting.regression import CorrelationPlot
from qsprpred.models.interfaces import QSPRModel

# give path to saved metadata of the model and load it
#metadata_path = './qspr/models/PLS_GABAAalpha_REGRESSION/PLS_GABAAalpha_REGRESSION_meta.json'
#model = QSPRModel.fromFile(metadata_path)
model
plt = CorrelationPlot([model])
axes, summary = plt.make(save=False, property_name='GABAAalpha')
axes[0]

As you can see, the plot also generates a summary with the displayed metrics:

In [None]:
summary

## Building Models - Classification

In this part of the tutorial, we show how to train a simple single task classification model with QSPRPred.

### Preparing the Data

We will repeat the same steps as with the regression model, but this time with classification data loaded from `datasets.py`:

In [None]:
from qsprpred.data.utils.datasplitters import ScaffoldSplit
from qsprpred.data.utils.descriptorcalculator import MoleculeDescriptorsCalculator
from qsprpred.data.utils.descriptorsets import FingerprintSet
from qsprpred.data.utils.featurefilters import LowVarianceFilter, HighCorrelationFilter
from qsprpred.data.utils.scaffolds import Murcko
from datasets import A2AR

# intialize the dataset
dataset = A2AR(random_state=random_state)

# Calculate MorganFP and physicochemical properties
feature_calculator = MoleculeDescriptorsCalculator(desc_sets = [FingerprintSet(fingerprint_type="MorganFP", radius=3, nBits=2048)])

# Split on scaffolds
split = ScaffoldSplit(dataset=dataset, scaffold=Murcko(), test_fraction=0.2)

# Remove features that have a low variance (<0.05) in the trainingset
lv = LowVarianceFilter(0.05)

# Remove features that have a high correlation (>0.9) in the trainingset
hc = HighCorrelationFilter(0.8)

dataset.prepareDataset(
    split=split,
    feature_calculators=[feature_calculator],
    feature_filters=[lv, hc]
)

# save the data set if you do not want to recalculate descriptors
dataset.save()

print(f"Number of samples train set: {len(dataset.y)}")
print(f"Number of samples test set: {len(dataset.y_ind)}, {len(dataset.y_ind) / len(dataset.df) * 100}%")

Let's save the dataset again so that we do not have to recalculate anything later:

In [None]:
dataset.save()

As you can see, the training part basically works the same way as with regression, but to mix things up we build two models at once here:

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from qsprpred.models.models import QSPRsklearn
from qsprpred.models.hyperparam_optimization import GridSearchOptimization
from qsprpred.models.assessment_methods import CrossValAssessor, TestSetAssessor
from qsprpred.models.metrics import SklearnMetric

params = {
    'n_estimators' : [50, 200],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    "n_jobs": [1]
}


fitted_models = []
for model in [ExtraTreesClassifier, RandomForestClassifier]:
    model = QSPRsklearn(
        base_dir='qspr/models/',
        data=dataset, 
        alg = model,
        name=model.__name__,
        random_state=random_state
    )

    score_func = SklearnMetric.getDefaultMetric(model.task)
    gridsearcher = GridSearchOptimization(scoring = score_func, param_grid=params)
    best_params = gridsearcher.optimize(model)

    CrossValAssessor()(model)
    TestSetAssessor()(model)
    model.fitAttached()
    
    fitted_models.append(model)

fitted_models

In [None]:
from qsprpred.plotting.classification import ROCPlot

plot = ROCPlot(fitted_models)
plot.make(save=True, show=True, property_name="pchembl_value_Median_class", validation="cv")

In [None]:
plot.make(save = True, show=True, property_name="pchembl_value_Median_class", validation="ind")

In [None]:
from qsprpred.plotting.classification import MetricsPlot

plot = MetricsPlot(fitted_models)
figs, summary = plot.make(save=True, show=True, property_name="pchembl_value_Median_class", out_dir="qspr/models/")

Again, the summary contains the data frame with the metrics used to create the plots:

In [None]:
summary

## Multitask Regression Model
Until now, the examples have show models that predict one property (single task). In addition to this we can also make multitask models that are trained with multiple different properties simultaneously. This can, for example, be a model that predicts the bioactivity on two or more proteins. Below, we will build a multitask model for 8 different human mGLU receptors.

Data on the mGLU receptors can also be found in the `Parkinsons` dataset. Here we specify that singletask is False, so that all the targets are set as properties to predict (see `datasets.py` for the exact code, note the now necessary imputation step):

In [None]:
from datasets import Parkinsons

dataset = Parkinsons(singletask=False, random_state=random_state)
dataset.targetProperties

In [None]:
dataset.getDF()

Feature calculation and dataset preparation is the same as for single task models. During splitting compounds are assigned to the training or test set (not individual datapoints).

In [None]:
from qsprpred.data.utils.descriptorsets import FingerprintSet
from qsprpred.data.utils.descriptorcalculator import MoleculeDescriptorsCalculator
from sklearn.preprocessing import StandardScaler as Scaler
from qsprpred.data.utils.datasplitters import RandomSplit

# Calculate MorganFP and physicochemical properties
feature_calculator = MoleculeDescriptorsCalculator(desc_sets = [FingerprintSet(fingerprint_type="MorganFP", radius=3, nBits=2048)])

# Do a random split for creating the train (85%) and test set (15%)
rand_split = RandomSplit(test_fraction=0.2, dataset=dataset)

# calculate compound features and split dataset into train and test
dataset.prepareDataset(
    split=rand_split,
    feature_calculators=[feature_calculator],
    feature_standardizer=Scaler()
)

print(f"Number of samples train set: {len(dataset.y)}")
print(f"Number of samples test set: {len(dataset.y_ind)}")

Save the dataset again to avoid recalculation later if needed:

In [19]:
dataset.save()

### Training the model
We use a KNN model because it is relatively fast and the previously shown PLS does not work with multiple-tasks. Because hyperparameter optimization works the same as for single task models, we skip this step here.

In [20]:
from qsprpred.models.models import QSPRsklearn
from sklearn.neighbors import KNeighborsRegressor
from qsprpred.models.assessment_methods import CrossValAssessor, TestSetAssessor

# This is an SKlearn model, so we will initialize it with the QSPRsklearn class
model = QSPRsklearn(base_dir = 'qspr/models/', data=dataset, alg = KNeighborsRegressor, name='KNN_REG_MT')

CrossValAssessor()(model)
TestSetAssessor()(model)

# Finally, we need to fit the model on the complete dataset if we want to use it further
model.fitAttached()

# and save the model
model.save()

'qspr/models/KNN_REG_MT/KNN_REG_MT_meta.json'

### Model evaluation
Here we show how to calculate the metrics that are compatible with multitask model results using an `SklearnMetric` object. Currently, imputed values are included in this analysis, the option to remove these from the analysis will be added soon.

In [21]:
from qsprpred.models.metrics import SklearnMetric

import pandas as pd

# get independent test set
df = pd.read_table("qspr/models/KNN_REG_MT/KNN_REG_MT.ind.tsv")

# column names containing original labels or predictions for the tasks
label_names = [i for i in list(df.columns.values) if "imputed_Label" in i] 
pred_names = [i for i in list(df.columns.values) if "imputed_Prediction" in i]
 
# turn into np array
ylabel = df[label_names].to_numpy()
ypred = df[pred_names].to_numpy()

# get metrics
summary = {}
for metric in SklearnMetric.multiTaskRegressionMetrics:
    scorer = SklearnMetric.getMetric(metric)
    score = scorer(ylabel, ypred)
    summary[metric]= score

summary["ModelName"] = model.name

summary

ValueError: Found array with 0 feature(s) (shape=(5, 0)) while a minimum of 1 is required.

Lastly we plot the predicted pchembl values against the experimental results, to visualize model performance. This is not yet integrate in the QSPRPred plotting functions so the code is more extensive than for the single task models:

In [None]:
import math
from matplotlib import pyplot as plt
from sklearn import metrics
import numpy as np

property_name = "pChEMBL"

my_cmap = ["#12517B", "#88002A"]

plt.figure(figsize=(5, 7))
cate = ["qspr/models/KNN_REG_MT/KNN_REG_MT.cv.tsv", "qspr/models/KNN_REG_MT/KNN_REG_MT.ind.tsv"]
cate_names = ["cv", "ind"]
ret_axes = []
summary = {"ModelName": [], "R2": [], "RMSE": [], "Set": []}


for m, mymodel in enumerate([model]):
    min_val = 0
    max_val = 10
    for j, cate_name in enumerate(['Cross Validation', 'Independent Test']):
        ax = plt.subplot(2, len([model]), m + j + 1)
        ret_axes.append(ax)
        #todo create subplot show can show both plots
        df = pd.read_table(cate[j])
        # column names containing original labels or predictions for the tasks
        label_names = [i for i in list(df.columns.values) if "imputed_Label" in i]  #df[f"{property_name}_imputed_Label"]
        labels = [i.replace("_imputed_Label", '') for i in label_names]
        pred_names = [i for i in list(df.columns.values) if "imputed_Prediction" in i] #df[f"{property_name}_imputed_Prediction"]

        # # create mask with True if original value, False where no value in original  dataset
        # #TODO currently does not work because do not have df with cv / ind, nee
        # array = df_that_does_not_exist_yet[labels].to_numpy()
        # mask = ~np.isnan(array)
        
        # turn ylabel and ypred into np array
        ylabel = df[label_names].to_numpy()
        c = np.zeros(ylabel.shape)
        for k in range(c.shape[1]):
            c[:, k] = k
        c.flatten()
        
        ylabel = ylabel.flatten()
        ypred = df[pred_names].to_numpy().flatten()
        # mask = mask.flatten()

        # # no markers for inputed values
        # area = np.full(mask.shape, 5)
        # area1 = np.ma.masked_where(mask, area)
        # scatter = plt.scatter(
        #     ylabel,
        #     ypred,
        #     s=area1,
        #     c=c)

        scatter = plt.scatter(
            ylabel,
            ypred,
            s=5,
            c=c)
        coef = metrics.r2_score(ylabel, ypred)
        rmse = metrics.mean_squared_error(
            ylabel, ypred,
            squared=False)
        summary["R2"].append(coef)
        summary["RMSE"].append(rmse)
        summary["Set"].append(cate_names[j])
        summary["ModelName"].append(model.name)

        plt.title(f'{model} {cate_name}')
        plt.xlabel(f"Experimental {property_name}")
        plt.ylabel(f"Predicted {property_name}")
        min_val_now = math.floor(
            min(np.concatenate((ylabel, ypred))))
        max_val_now = math.ceil(
            max(np.concatenate((ylabel, ypred))))
        if min_val_now < min_val:
            min_val = min_val_now
        if max_val_now > max_val:
            max_val = max_val_now
        pad = (max_val - min_val) * 0.1
        plt.plot(
            [min_val - pad, max_val + pad],
            [min_val - pad, max_val + pad],
            lw=2, linestyle='--', color='black')

        handles, _ = scatter.legend_elements()

        plt.legend(handles, labels, title="Tasks", loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.subplots_adjust(wspace=0.3, hspace=0.5)

In [None]:
summary