In [1]:
from qsprpred.data.descriptors.sets import FingerprintSet, SmilesDesc
from qsprpred.data.descriptors.calculators import MoleculeDescriptorsCalculator
from qsprpred.data import QSPRDataset, RandomSplit
import os

# Make sure the folder that will hold the serialized dataset exists.
os.makedirs("tutorial_output/data", exist_ok=True)

# Load the ligand table and declare "pchembl_value_Mean" as a regression target.
dataset = QSPRDataset.fromTableFile(
    filename="tutorial_data/A2A_LIGANDS.tsv",
    store_dir="tutorial_output/data",
    name="ChempropTutorialDataset",
    target_props=[{"name": "pchembl_value_Mean", "task": "REGRESSION"}],
    random_state=42,
)

# The only "descriptor" is the SMILES string itself (SmilesDesc).
# No split is passed to prepareDataset here; the splitting happens later,
# when the model is assessed.
feature_calculator = MoleculeDescriptorsCalculator(desc_sets=[SmilesDesc()])
dataset.prepareDataset(
    recalculate_features=True,
    feature_calculators=[feature_calculator],
)

# Preview the first rows of the prepared table.
dataset.getDF().head()



Unnamed: 0_level_0,SMILES,pchembl_value_Mean,Year,QSPRID,pchembl_value_Mean_original
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ChempropTutorialDataset_0000,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,8.68,2008.0,ChempropTutorialDataset_0000,8.68
ChempropTutorialDataset_0001,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2nc3c(cc12...,4.82,2010.0,ChempropTutorialDataset_0001,4.82
ChempropTutorialDataset_0002,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,5.65,2009.0,ChempropTutorialDataset_0002,5.65
ChempropTutorialDataset_0003,CNC(=O)C12CC1C(n1cnc3c(NCc4cccc(Cl)c4)nc(C#CCC...,5.45,2009.0,ChempropTutorialDataset_0003,5.45
ChempropTutorialDataset_0004,CCCn1c(=O)c2c(nc3cc(OC)ccn32)n(CCCNC(=O)c2ccc(...,5.2,2019.0,ChempropTutorialDataset_0004,5.2


In [2]:
# Create model
from qsprpred.extra.gpu.models.chemprop import ChempropModel

# Keep the model output under the same working-directory-relative root as the
# dataset output ("tutorial_output/data") instead of two directories above it.
os.makedirs("tutorial_output/models", exist_ok=True)

# Chemprop model bound to the prepared dataset; only the number of epochs is
# overridden, and quiet_logger=False leaves the training log visible.
model = ChempropModel(
    base_dir="tutorial_output/models",
    data=dataset,
    name="ChempropTutorialModel",
    parameters={"epochs": 5},
    quiet_logger=False,
)



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from qsprpred.models import CrossValAssessor
# import sklearn shuffle split
from sklearn.model_selection import ShuffleSplit
from sklearn import metrics
from qsprpred.models.metrics import SklearnMetrics

# RMSE scorer: squared=False turns the mean squared error into its root, and
# greater_is_better=False makes the scorer report it with a negative sign.
rmse = metrics.make_scorer(
    metrics.mean_squared_error,
    greater_is_better=False,
    squared=False,
)

# A single shuffled hold-out of 10% of the data, seeded from the dataset.
assessor = CrossValAssessor(
    scoring=SklearnMetrics(rmse),
    split=ShuffleSplit(
        n_splits=1,
        test_size=0.1,
        random_state=dataset.randomState,
    ),
)

# NOTE(review): the extra `split` keyword presumably reaches the model's fit as
# the validation split; 0.111111111 (~1/9) of the remaining 90% is roughly 10%
# of all data -- confirm against the QSPRpred API.
assessor(
    model,
    split=RandomSplit(test_fraction=0.111111111, dataset=dataset),
)

train size = 3,264 | val size = 409
Total size = 3,673
Fitting scaler
Number of parameters = 355,201
Moving model to cuda
  0%|          | 0/5 [00:00<?, ?it/s]Epoch 0
Loss = 1.0126e+00, PNorm = 33.9635, GNorm = 3.5847, lr_0 = 1.7615e-04
Loss = 1.0086e+00, PNorm = 33.9655, GNorm = 5.2273, lr_0 = 2.4538e-04
Loss = 9.7799e-01, PNorm = 33.9689, GNorm = 0.9776, lr_0 = 3.1462e-04
Loss = 8.5465e-01, PNorm = 33.9756, GNorm = 3.5943, lr_0 = 3.8385e-04
Loss = 9.5209e-01, PNorm = 33.9851, GNorm = 4.9440, lr_0 = 4.5308e-04
Loss = 8.9767e-01, PNorm = 33.9982, GNorm = 9.7313, lr_0 = 5.2231e-04
Validation rmse = 1.077383
Model best validation rmse = 1.077383 on epoch                     0
 20%|██        | 1/5 [02:52<11:29, 172.33s/it]Epoch 1
Loss = 8.1033e-01, PNorm = 34.0155, GNorm = 3.0533, lr_0 = 5.9846e-04
Loss = 8.4785e-01, PNorm = 34.0346, GNorm = 0.7051, lr_0 = 6.6769e-04
Loss = 9.1530e-01, PNorm = 34.0580, GNorm = 4.2852, lr_0 = 7.3692e-04
Loss = 8.7839e-01, PNorm = 34.0899, GNorm = 0.9194, l

array([-0.92888709])

In [22]:
# Inspect the per-fold data stored on the assessor's monitor
# (a dict keyed by fold index; the full structure is dumped as output).
assessor.monitor.foldData

{0: {'X_train':                                                    Descriptor_SmilesDesc_SMILES
  QSPRID                                                                         
  ChempropTutorialDataset_1620  Nc1nc(C(=O)NCc2cccc3cccnc23)c2cccc(-c3cncnc3)c2n1
  ChempropTutorialDataset_1916        Cn1c(=O)c2c(nc3n2CCCCN3Cc2ccccc2Br)n(C)c1=O
  ChempropTutorialDataset_3874             CC(=O)Nc1nc(-c2ccccc2)c(-c2nc(C)no2)s1
  ChempropTutorialDataset_0826  NCCNCCNC(=O)COc1ccc(CCCn2ncc3c2nc(N)n2nc(-c4cc...
  ChempropTutorialDataset_0297   O=C(Nc1nc(-c2ccccc2)nc2sc(-c3ccco3)nc12)c1ccccc1
  ...                                                                         ...
  ChempropTutorialDataset_3719  CC(C)n1cnc(CCNc2nc(NCC(c3ccccc3)c3ccccc3)c3ncn...
  ChempropTutorialDataset_3878  O=C(NC1CCC1)C1SC(n2cnc3c(NCc4cccc(I)c4)nc(Cl)n...
  ChempropTutorialDataset_2348                     COCCNC(=O)c1cc2cccc(OC)c2oc1=N
  ChempropTutorialDataset_1810               Nc1nc(-c2ccco2)c2ncn(CCc3ccccc3)c2n1
  

In [6]:
import pandas as pd
# Look up the monitor records once instead of repeating the attribute chain.
# fits[0]["fitData"] holds the train/validation arrays of the first fit and
# foldData[0] holds the test data of the first fold.
fit_data = assessor.monitor.fits[0]["fitData"]
fold_data = assessor.monitor.foldData[0]

# Training part of the fit -> CSV with the "smiles" column name used by chemprop.
df_train = pd.DataFrame(fit_data["X_train"], columns=["smiles"])
df_train["pchembl_value_Mean"] = fit_data["y_train"]
df_train.to_csv("tutorial_output/data/ChempropTutorialDataset_train.csv", index=False)

# Validation part of the same fit.
df_val = pd.DataFrame(fit_data["X_val"], columns=["smiles"])
df_val["pchembl_value_Mean"] = fit_data["y_val"]
df_val.to_csv("tutorial_output/data/ChempropTutorialDataset_val.csv", index=False)

# Test fold: rename the descriptor column to "smiles". A non-inplace rename
# returns a new frame, so the data stored on the monitor is not modified
# through a shared reference.
df_test = pd.DataFrame(fold_data["X_test"]).rename(
    columns={"Descriptor_SmilesDesc_SMILES": "smiles"}
)
df_test["pchembl_value_Mean"] = fold_data["y_test"]
df_test.to_csv("tutorial_output/data/ChempropTutorialDataset_test.csv", index=False)

df_test.head()



Unnamed: 0_level_0,smiles,pchembl_value_Mean
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1
ChempropTutorialDataset_0733,Cc1ccc(Nc2nc3ccccc3c3[nH]c(C4CCCC4)nc23)cc1,5.28
ChempropTutorialDataset_3347,CC(=O)Nc1ccc(Cn2nnc3c(-c4ccco4)nc(N)nc32)cc1,6.94
ChempropTutorialDataset_1170,CCNC(=O)C1OC(n2cnc3c(NCC)nc(C#CC(O)C4CCCCC4)nc...,7.23
ChempropTutorialDataset_1063,OCC1OC(n2cnc3c(NC4CCSC4)ncnc32)C(O)C1O,5.01
ChempropTutorialDataset_2627,Cc1cc(C)cc(-c2nc(Nc3ccc(Cl)cc3F)c3ncn(C4OC(Cn5...,5.37


In [4]:
# Build the export table without touching the frame returned by
# dataset.getDF(): rename/drop return new frames, so the dataset's own table
# keeps its columns and this cell can be re-run without a KeyError.
df = (
    dataset.getDF()
    # chemprop input files use a lower-case "smiles" column
    .rename(columns={"SMILES": "smiles"})
    # columns that are not needed for the tutorial
    .drop(columns=["QSPRID", "Year", "pchembl_value_Mean_original"])
)

display(df.head())

df.to_csv("tutorial_output/data/ChempropTutorialDataset.csv", index=False)

Unnamed: 0_level_0,smiles,pchembl_value_Mean
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1
ChempropTutorialDataset_0000,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,8.68
ChempropTutorialDataset_0001,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2nc3c(cc12...,4.82
ChempropTutorialDataset_0002,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,5.65
ChempropTutorialDataset_0003,CNC(=O)C12CC1C(n1cnc3c(NCc4cccc(Cl)c4)nc(C#CCC...,5.45
ChempropTutorialDataset_0004,CCCn1c(=O)c2c(nc3cc(OC)ccn32)n(CCCNC(=O)c2ccc(...,5.2


In [10]:
# Show the dataset's random state; the standalone chemprop run uses the same
# value (42) for its --seed / --pytorch_seed options.
dataset.randomState

42

In [13]:
import chemprop
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.decomposition import PCA

# Options for the standalone chemprop run, written as (flag, value) groups and
# flattened, in order, into the flat token list that parse_args receives.
arguments = [
    token
    for option in (
        ("--data_path", "tutorial_output/data/ChempropTutorialDataset_train.csv"),
        ("--separate_val_path", "tutorial_output/data/ChempropTutorialDataset_val.csv"),
        ("--separate_test_path", "tutorial_output/data/ChempropTutorialDataset_test.csv"),
        ("--dataset_type", "regression"),
        ("--save_dir", "test_checkpoints_reg"),
        ("--epochs", "5"),
        ("--save_smiles_splits",),  # bare flag, takes no value
        ("--seed", "42"),
        ("--pytorch_seed", "42"),
    )
    for token in option
]

# Parse the options and train on the exported train/val/test CSV files.
args = chemprop.args.TrainArgs().parse_args(arguments)
mean_score, std_score = chemprop.train.cross_validate(
    args=args,
    train_func=chemprop.train.run_training,
)
print(mean_score)

Command line
python /home/helle/.conda/envs/qsprpred/lib/python3.11/site-packages/ipykernel_launcher.py --f=/home/helle/.local/share/jupyter/runtime/kernel-v2-2663457eMriRCVoYpTM.json
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache_cutoff': 10000,
 'checkpoint_dir': None,
 'checkpoint_frzn': None,
 'checkpoint_path': None,
 'checkpoint_paths': None,
 'class_balance': False,
 'config_path': None,
 'constraints_path': None,
 'crossval_index

In [14]:
# Print the mean score returned by chemprop.train.cross_validate so it can be
# compared with the value reported by the QSPRpred assessor.
print(mean_score)

0.9288870871112483
