# Install QSPRpred

In [None]:
# Install QSPRpred straight from the CDDLeiden GitHub repository at the BOO-2023 git ref.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install git+https://github.com/CDDLeiden/QSPRPred.git@BOO-2023

# Create your dataset

## Fetching data from Papyrus

In [None]:
### SKIP THIS CELL ###
# Fetches the data for the listed targets from Papyrus into a molecule table.

from qsprpred.data.sources.papyrus import Papyrus

# Protein accession key(s) of the target(s) of interest
acc_keys = [
    "P49840",
    "P36897",
    "Q13464",
    "P22455",
    "P21802",
]
# Name of the file to be generated
dataset_name = "papyrus_data"
# Minimum quality, one of {"high", "medium", "low"}
quality = "low"
# Papyrus database version
papyrus_version = "05.6"
data_dir = "data"

papyrus_options = {
    "data_dir": data_dir,
    "version": papyrus_version,
    "stereo": False,
    # False so that lower quality data is included as well
    "plus_only": False,
}
papyrus = Papyrus(**papyrus_options)

fetch_options = {
    "name": dataset_name,
    "use_existing": True,
    "activity_types": ["Ki", "IC50", "Kd", "EC50"],
}
mt = papyrus.getData(acc_keys, quality, **fetch_options)
mt

In [None]:
import pandas as pd
from qsprpred.data.data import MoleculeTable

# Accession of the target to model
MY_TARGET = 'P49840' # REPLACE WITH YOUR TARGET ACCESSION

# Load the complete tab-separated data file
df = pd.read_csv('data/papyrus_data.tsv', sep='\t')

# Keep only the high quality rows that belong to the target of interest
is_my_target = df['accession'] == MY_TARGET
is_high_quality = df['Quality'] == 'High'
df = df.loc[is_my_target & is_high_quality]

# Wrap the filtered frame in a molecule table for visualization
mt = MoleculeTable(df=df, name=MY_TARGET, store_dir='data')

mt.getDF()

## Preparing data for modelling

In [None]:
from qsprpred.models.tasks import TargetTasks
from qsprpred.data.data import QSPRDataset

# The single modelled property: the "pchembl_value_Median" column, treated as a
# regression task (the other task types are SINGLECLASS and MULTICLASS).
pchembl_target = {
    "name": "pchembl_value_Median",
    "task": TargetTasks.REGRESSION,
}
target_props = [pchembl_target]

# Build the QSPRDataset used for training and evaluation of QSPR models
# from the molecule table created earlier in the notebook.
dataset = QSPRDataset.fromMolTable(mt, target_props=target_props)
dataset.targetProperties

![descriptors](figures/descriptors.png)

In [None]:
from qsprpred.data.utils.descriptorsets import FingerprintSet
from qsprpred.data.utils.descriptorcalculator import MoleculeDescriptorsCalculator
from sklearn.preprocessing import StandardScaler as Scaler
from qsprpred.data.utils.datasplitters import randomsplit

# Compound features: Morgan fingerprints (radius 3, 2048 bits) are the only descriptor set
morgan_fp = FingerprintSet(fingerprint_type="MorganFP", radius=3, nBits=2048)
feature_calculator = MoleculeDescriptorsCalculator(descsets=[morgan_fp])

# Random split into a train set (80%) and a test set (20%)
rand_split = randomsplit(0.2)

# Calculate the features, standardize them and split the dataset in one call
prepare_options = {
    "split": rand_split,
    "feature_calculators": [feature_calculator],
    "feature_standardizer": Scaler(),
}
dataset.prepareDataset(**prepare_options)

print(f"Number of samples train set: {len(dataset.y)}")
print(f"Number of samples test set: {len(dataset.y_ind)}")

# Persist the prepared dataset for later use
dataset.save()

# Data Visualization

In [None]:
import seaborn as sns

# Distribution of the pchembl values in the dataset as a 20-bin histogram
pchembl_values = dataset.getDF()['pchembl_value_Median']
sns.histplot(pchembl_values, bins=20)

In [None]:
# Draw the compounds with the highest pchembl values in the dataset
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import MolFromSmiles

# How many compounds to draw
NUM_COMPOUNDS = 30

# Order the dataset from highest to lowest pchembl value
dataset_sorted = dataset.getDF().sort_values(by='pchembl_value_Median', ascending=False)

# Take the first NUM_COMPOUNDS rows and label each structure with its pchembl value
top_compounds = dataset_sorted[:NUM_COMPOUNDS]
top_mols = [MolFromSmiles(smiles) for smiles in top_compounds['SMILES']]
top_legends = [
    f"{dataset_sorted.loc[idx, 'pchembl_value_Median']:.2f}"
    for idx in top_compounds.index
]

Draw.MolsToGridImage(
    top_mols,
    molsPerRow=5,
    subImgSize=(200,200),
    legends=top_legends,
)

In [None]:
from scaffviz.clustering.manifold import TSNE
from qsprpred.data.utils.scaffolds import Murcko, BemisMurcko
from scaffviz.depiction.plot import Plot

# Show the top NUM_SCAFFOLDS scaffolds that occur in at least MIN_COMPOUNDS compounds
NUM_SCAFFOLDS = 20
MIN_COMPOUNDS = 5

# Adds the 'Scaffold_Murcko' column that is grouped on below
dataset.addScaffolds([Murcko()])

# One row per scaffold: average pchembl value and number of compounds,
# ordered from the highest to the lowest average.
scaffolds = (
    dataset.getDF()
    .groupby('Scaffold_Murcko')['pchembl_value_Median']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Average pchembl value', 'count': 'Count'})
    .sort_values(by='Average pchembl value', ascending=False)
)

# Drop scaffolds with fewer than MIN_COMPOUNDS compounds.
# `>=` (not `>`) so that scaffolds with exactly MIN_COMPOUNDS compounds are kept,
# which is what "at least" means.
scaffolds = scaffolds[scaffolds['Count'] >= MIN_COMPOUNDS]

# Draw the top scaffolds, each labelled "<average pchembl value> (<compound count>)".
# Relies on Draw and MolFromSmiles imported in an earlier cell.
top_scaffolds = scaffolds.head(NUM_SCAFFOLDS)
Draw.MolsToGridImage(
    [MolFromSmiles(scaffold) for scaffold in top_scaffolds.index],
    molsPerRow=5,
    subImgSize=(200,200),
    legends=[
        f"{average:.2f} ({count})"
        for average, count in zip(top_scaffolds['Average pchembl value'], top_scaffolds['Count'])
    ],
)

In [None]:
# Show all compounds that share the scaffold at position SCAFFOLD_INDEX of `scaffolds`.

SCAFFOLD_INDEX = 0

scaffold = scaffolds.index[SCAFFOLD_INDEX]

# Select the compounds with this scaffold and order them by pchembl value, highest first
compounds = dataset.getDF()
scaffold_df = compounds[compounds['Scaffold_Murcko'] == scaffold].sort_values(
    by='pchembl_value_Median', ascending=False
)

# Draw the compounds, each labelled with its pchembl value
scaffold_mols = [MolFromSmiles(smiles) for smiles in scaffold_df['SMILES']]
scaffold_legends = [
    f"{scaffold_df.loc[idx, 'pchembl_value_Median']:.2f}"
    for idx in scaffold_df.index
]
Draw.MolsToGridImage(
    scaffold_mols,
    molsPerRow=5,
    subImgSize=(200,200),
    legends=scaffold_legends,
)

# Training an ML model

In [None]:
# NOTE(review): no cell visible in this notebook reads N_CPU — confirm whether it
# should be passed to the parallel steps or can be removed.
N_CPU = 4 # number of CPUs for parallel operations

In [None]:
from qsprpred.models.models import QSPRsklearn
from sklearn.cross_decomposition import PLSRegression
from qsprpred.models.hyperparam_optimization import OptunaOptimization

# PLSRegression is a scikit-learn estimator, so it is wrapped in the QSPRsklearn class
model = QSPRsklearn(
    base_dir='.',
    data=dataset,
    alg=PLSRegression,
    name='PLS_REG',
)

# Step 1: optimize the hyperparameters n_components and scale with 5 trials of
# bayes optimization (the best combination is expected to be stored by the
# library as PLS_REG_params.json — location not verified here).
search_space_bs = {
    "n_components": ["int", 1, 30],
    "scale": ["categorical", [True, False]],
}
bayesoptimizer = OptunaOptimization(
    scoring=model.score_func,
    param_grid=search_space_bs,
    n_trials=5,
)
best_params = bayesoptimizer.optimize(model)

# Step 2: evaluate the best model on the independent test set; the return value is discarded
_ = model.evaluate()

# Step 3: fit on the complete dataset so the model can be used further
# (the library is expected to save it as PLS_REG.json — location not verified here).
model.fit()

In [None]:
from qsprpred.plotting.regression import CorrelationPlot

# Named `corr_plot` rather than `plt` so that the conventional
# `matplotlib.pyplot as plt` alias is not shadowed for the rest of the notebook.
corr_plot = CorrelationPlot([model])

# save=False is passed so the plot is not written to disk; `axes` holds the
# returned plot axes and `summary` the accompanying summary.
# The former bare `axes[0]` statement was dropped: only the last expression of a
# cell is displayed, so it had no effect in the middle of the cell.
axes, summary = corr_plot.make(save=False, property_name='pchembl_value_Median')

print(summary)

# Make predictions for your own compounds

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw

# REPLACE WITH YOUR OWN COMPOUNDS
list_of_smiles = [
    'OCCc1ccn2cnccc12',
    'C1CC1Oc1cc2ccncn2c1',
    'CNC(=O)c1nccc2cccn12',
]

# Predict with the trained model
predictions = model.predictMols(list_of_smiles)

# Draw the molecules with rdkit, each labelled with the first value of its prediction
mols = list(map(Chem.MolFromSmiles, list_of_smiles))
prediction_labels = [f'{pred[0]:.3f}' for pred in predictions]
Draw.MolsToGridImage(
    mols,
    molsPerRow=4,
    subImgSize=(200, 200),
    legends=prediction_labels,
)