In [1]:
from qsprpred.data.descriptors.sets import FingerprintSet, SmilesDesc
from qsprpred.data.descriptors.calculators import MoleculeDescriptorsCalculator
from qsprpred.data import QSPRDataset, RandomSplit
import os

# Single location for everything QSPRpred persists about this dataset.
DATA_DIR = "tutorial_output/data"
os.makedirs(DATA_DIR, exist_ok=True)

# Load the ligand table and register the mean pChEMBL value as a regression
# target. The random state is fixed so repeated runs behave the same way.
regression_target = {"name": "pchembl_value_Mean", "task": "REGRESSION"}
dataset = QSPRDataset.fromTableFile(
    filename="tutorial_data/A2A_LIGANDS.tsv",
    store_dir=DATA_DIR,
    name="ChempropTutorialDataset",
    target_props=[regression_target],
    random_state=42,
)

# The only "descriptor" is the SMILES string itself (SmilesDesc), which is the
# input the Chemprop model is given. Note that no split object is passed to
# prepareDataset here; features are always recomputed.
smiles_descriptor = SmilesDesc()
feature_calculator = MoleculeDescriptorsCalculator(desc_sets=[smiles_descriptor])
dataset.prepareDataset(
    feature_calculators=[feature_calculator],
    recalculate_features=True,
)

# Preview the first rows of the prepared data frame.
dataset.getDF().head()



Unnamed: 0_level_0,SMILES,pchembl_value_Mean,Year,QSPRID,pchembl_value_Mean_original
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ChempropTutorialDataset_0000,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,8.68,2008.0,ChempropTutorialDataset_0000,8.68
ChempropTutorialDataset_0001,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2nc3c(cc12...,4.82,2010.0,ChempropTutorialDataset_0001,4.82
ChempropTutorialDataset_0002,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,5.65,2009.0,ChempropTutorialDataset_0002,5.65
ChempropTutorialDataset_0003,CNC(=O)C12CC1C(n1cnc3c(NCc4cccc(Cl)c4)nc(C#CCC...,5.45,2009.0,ChempropTutorialDataset_0003,5.45
ChempropTutorialDataset_0004,CCCn1c(=O)c2c(nc3cc(OC)ccn32)n(CCCNC(=O)c2ccc(...,5.2,2019.0,ChempropTutorialDataset_0004,5.2


In [2]:
# Display the dataset's feature matrix. Per the output below it contains a
# single `Descriptor_SmilesDesc_SMILES` column holding the SMILES strings.
dataset.X

Unnamed: 0_level_0,Descriptor_SmilesDesc_SMILES
QSPRID,Unnamed: 1_level_1
ChempropTutorialDataset_0599,CCCn1c(-c2ccccc2)nc2c(NC3CCOC3)ncnc21
ChempropTutorialDataset_0752,CCCn1c(=O)c2nc(-c3cn[nH]c3)[nH]c2n(CCC)c1=O
ChempropTutorialDataset_1954,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccc(F)cn4)CC3C)nc21
ChempropTutorialDataset_2928,COc1cccc(CCCC(=O)Nc2nc3ccccc3c(=O)s2)c1
ChempropTutorialDataset_2512,COc1ccc(N(CCO)C(C)=O)c2sc(NC(=O)c3ccc(F)cc3)nc12
...,...
ChempropTutorialDataset_1130,CCNC(=O)C1OC(n2cnc3c(NCC)nc(C#CC4(O)CCCC4)nc32...
ChempropTutorialDataset_1294,CNC(=O)C1SC(n2cnc3c(NCc4cccc(I)c4)nc(Cl)nc32)C...
ChempropTutorialDataset_0860,CCNC(=O)C1OC(n2cnc3c(N)nc(N4CCN(c5ccc(OCC(=O)O...
ChempropTutorialDataset_3507,CNC(=O)C1[Se]C(n2cnc3c(NC4CCC4)ncnc32)C(O)C1O


In [9]:
# Create model
from qsprpred.extra.gpu.models.chemprop import ChempropModel

# Store model files under the same working-directory-relative
# `tutorial_output` tree that holds the dataset (`tutorial_output/data`).
# The previous `../../tutorial_output/models` path wrote two levels above the
# notebook, away from the rest of the tutorial output.
MODELS_DIR = "tutorial_output/models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Chemprop model on the prepared dataset, limited to 5 training epochs.
# NOTE(review): `quiet_logger=False` presumably keeps Chemprop's log output
# visible in the notebook - confirm against the ChempropModel documentation.
model = ChempropModel(
    base_dir=MODELS_DIR,
    data=dataset,
    name="ChempropTutorialModel",
    parameters={"epochs": 5},
    quiet_logger=False,
)



In [12]:
from qsprpred.models import CrossValAssessor
from sklearn.model_selection import ShuffleSplit

# Outer evaluation scheme: three random shuffles, each holding out 10% of the
# data, seeded with the dataset's own random state.
cv_split = ShuffleSplit(
    n_splits=3,
    test_size=0.1,
    random_state=dataset.randomState,
)

# Assessor scoring each fold with negative root mean squared error.
assessor = CrossValAssessor("neg_root_mean_squared_error", split=cv_split)

# Run the assessment. The extra `split` keyword is handed to the assessor call
# as-is.
# NOTE(review): presumably it is forwarded to the model's fit step as the
# internal train/validation split - confirm in the QSPRpred docs.
assessor(
    model,
    split=RandomSplit(test_fraction=0.11, dataset=dataset),
)

  0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# Inspect the `gpu` entry of the argument object attached to the model's
# underlying estimator (no value is rendered in the saved output).
model.estimator.args.gpu

In [6]:
# Dump every argument stored on the estimator so the effective training
# configuration can be checked; the printed output is long.
print(model.estimator.args)

{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache_cutoff': 10000,
 'checkpoint_dir': None,
 'checkpoint_frzn': None,
 'checkpoint_path': None,
 'checkpoint_paths': None,
 'class_balance': False,
 'config_path': None,
 'constraints_path': None,
 'crossval_index_dir': None,
 'crossval_index_file': None,
 'crossval_index_sets': None,
 'cuda': True,
 'data_path': '',
 'data_weights_path': None,
 'dataset_type': 'regression',
 'depth': 3,
 'depth_sol

In [7]:
# Build the table Chemprop expects (a `smiles` column plus the target) as a
# NEW frame. `rename`/`drop` without `inplace=True` return copies, so the frame
# handed out by `dataset.getDF()` is never modified. The cell can also be
# re-run safely: the in-place version raised a KeyError on the second run
# because the dropped columns were already gone.
df = (
    dataset.getDF()
    # Chemprop's default SMILES column name is lowercase.
    .rename(columns={"SMILES": "smiles"})
    # Drop columns that are not needed for the tutorial.
    .drop(columns=["QSPRID", "Year", "pchembl_value_Mean_original"])
)

display(df.head())

# The index is not written, so the CSV holds only `smiles` and the target.
df.to_csv("tutorial_output/data/ChempropTutorialDataset.csv", index=False)

Unnamed: 0_level_0,smiles,pchembl_value_Mean
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1
ChempropTutorialDataset_0000,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,8.68
ChempropTutorialDataset_0001,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2nc3c(cc12...,4.82
ChempropTutorialDataset_0002,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,5.65
ChempropTutorialDataset_0003,CNC(=O)C12CC1C(n1cnc3c(NCc4cccc(Cl)c4)nc(C#CCC...,5.45
ChempropTutorialDataset_0004,CCCn1c(=O)c2c(nc3cc(OC)ccn32)n(CCCNC(=O)c2ccc(...,5.2


In [8]:
import chemprop
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.decomposition import PCA

# Command-line style options for a plain Chemprop training run on the CSV
# exported from the dataset. Flags that take a value are listed as pairs.
train_options = {
    "--data_path": "tutorial_output/data/ChempropTutorialDataset.csv",
    "--dataset_type": "regression",
    "--save_dir": "test_checkpoints_reg",
    "--epochs": "5",
}

# Flatten to the argv-like list Chemprop parses: flag, value, flag, value, ...
arguments = [token for flag, value in train_options.items() for token in (flag, value)]
# Value-less switch, appended last as in a normal command line.
arguments.append("--save_smiles_splits")

# Parse the options and run Chemprop's own cross-validation driver; the two
# returned values are kept as `mean_score` and `std_score`.
args = chemprop.args.TrainArgs().parse_args(arguments)
mean_score, std_score = chemprop.train.cross_validate(
    args=args,
    train_func=chemprop.train.run_training,
)

Command line
python /home/helle/.conda/envs/qsprpred/lib/python3.11/site-packages/ipykernel_launcher.py --f=/home/helle/.local/share/jupyter/runtime/kernel-v2-14544546y07PTREE6A1.json
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache_cutoff': 10000,
 'checkpoint_dir': None,
 'checkpoint_frzn': None,
 'checkpoint_path': None,
 'checkpoint_paths': None,
 'class_balance': False,
 'config_path': None,
 'constraints_path': None,
 'crossval_index