# RDKit features

RDKit and AMPL provide a variety of descriptor options. For demonstration purposes, this tutorial uses RDKit descriptors. RDKit is an open-source toolkit for cheminformatics: a collection of cheminformatics and machine-learning software written in C++ and Python. Let us see how to calculate descriptors using RDKit.

In [1]:
import pandas as pd

# Notebook configuration: directory passed to AMPL as the result directory,
# and the path of the dataset CSV that the following cells read.
odir = 'dataset'
dataset_file = 'dataset/curated_kcna5_ic50.csv'

In [2]:
# Load the dataset CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_file)

In [3]:
#Calculate descriptors using RDkit

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

def RDkit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for each SMILES.

    Args:
        smiles: Iterable of SMILES strings (for example a pandas Series).

    Returns:
        A ``(Mol_descriptors, desc_names)`` pair. ``Mol_descriptors`` is a list
        holding one tuple of descriptor values per input entry, in input order;
        ``desc_names`` holds the matching descriptor names as reported by the
        calculator. Entries that are not strings, or that RDKit cannot parse,
        produce a row of NaN values so the result stays aligned with the input.

    Warns:
        UserWarning: If one or more entries could not be parsed.
    """
    import warnings

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [entry[0] for entry in Descriptors._descList]
    )
    desc_names = calc.GetDescriptorNames()
    # Placeholder row used when no molecule could be built from an entry
    missing_row = (float("nan"),) * len(desc_names)

    Mol_descriptors = []
    unparsed_positions = []
    for position, smi in enumerate(smiles):
        # MolFromSmiles signals a parse failure by returning None; non-string
        # entries (e.g. missing values) are treated the same way.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            unparsed_positions.append(position)
            Mol_descriptors.append(missing_row)
            continue
        # Add explicit hydrogens before calculating the descriptors
        mol = Chem.AddHs(mol)
        Mol_descriptors.append(calc.CalcDescriptors(mol))

    if unparsed_positions:
        warnings.warn(
            "Could not parse %d SMILES entries (first positions: %s); "
            "their descriptor rows are filled with NaN."
            % (len(unparsed_positions), unparsed_positions[:10])
        )
    return Mol_descriptors, desc_names

# Compute the descriptors for the SMILES column of the dataset
smiles_series = df['base_rdkit_smiles']
Mol_descriptors, desc_names = RDkit_descriptors(smiles_series)

In [4]:
# Assemble the descriptor rows into a DataFrame (one column per descriptor name) and display it
df_with_descriptors = pd.DataFrame(data=Mol_descriptors, columns=desc_names)
df_with_descriptors

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,14.101871,-6.232636,14.101871,0.935669,0.770404,403.504,378.304,403.156577,150,0,...,0,1,0,0,0,0,0,0,0,1
1,14.336747,-6.469670,14.336747,0.909616,0.802325,403.891,385.747,403.075740,140,0,...,0,1,0,0,0,0,0,0,0,0
2,9.304449,-5.128857,9.304449,0.958213,0.820421,339.479,310.247,339.219829,134,0,...,0,0,0,0,0,0,0,0,0,0
3,15.212871,-7.160522,15.212871,2.962826,0.445226,456.633,413.289,456.321515,184,0,...,0,0,0,0,0,0,0,0,0,0
4,15.051705,-4.380136,15.051705,0.079079,0.568548,486.378,464.202,485.118544,170,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
797,16.231143,-5.759972,16.231143,0.815520,0.368934,542.724,500.388,542.325691,212,0,...,0,0,0,0,0,0,0,0,0,0
798,13.632325,-4.014188,13.632325,0.381029,0.601473,379.508,350.276,379.237211,148,0,...,0,0,0,0,0,0,0,0,0,1
799,14.376682,-6.507586,14.376682,0.793149,0.412245,479.602,450.370,479.187877,178,0,...,0,1,0,0,0,0,0,0,0,1
800,9.390684,-4.591510,9.390684,0.868797,0.925392,299.374,278.206,299.163377,116,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# List the contents of every model archive under dataset/. tar is invoked once per
# archive because it treats any extra path arguments as member names, not as archives.
!ls dataset/*.tar.gz | xargs -n 1 tar -tf

./best_model/
./best_model/model.joblib
./model_metadata.json
./model_metrics.json
./transformers.pkl


In [8]:
# Extract the pre-trained model archive into /tmp. The archive is named explicitly
# because the training cell later writes another *.tar.gz into dataset/, and tar would
# treat additional glob matches as member names to extract rather than as archives.
!tar xzf dataset/curated_kcna5_ic50_model_b0421e6c-c5e2-4d05-9392-8cb5b1516bd3.tar.gz -C /tmp

In [9]:
# Peek at the first lines of the extracted model metadata
!head /tmp/model_metadata.json

{
    "descriptor_specific": {
        "descriptor_bucket": "public",
        "descriptor_key": null,
        "descriptor_type": "rdkit_raw"
    },
    "model_parameters": {
        "ampl_version": "1.6.0",
        "class_number": 2,
        "featurizer": "computed_descriptors",


In [10]:
import joblib
# Deserialize the extracted model file and show its repr
# NOTE(review): joblib.load unpickles the file - only load archives from a trusted source
loaded_model = joblib.load(filename="/tmp/best_model/model.joblib")
loaded_model

RandomForestRegressor(max_features=32, n_estimators=500, n_jobs=-1)

In [11]:
import json
import pandas as pd
# Parse the extracted model metadata; the context manager closes the file handle
# as soon as the JSON has been read.
with open('/tmp/model_metadata.json', encoding='utf-8') as metadata_file:
    data = json.load(metadata_file)
# Show only the model-parameter section of the metadata
data['model_parameters']

{'ampl_version': '1.6.0',
 'class_number': 2,
 'featurizer': 'computed_descriptors',
 'hyperparam_uuid': None,
 'model_bucket': 'public',
 'model_choice_score_type': 'r2',
 'model_type': 'RF',
 'num_model_tasks': 1,
 'prediction_type': 'regression',
 'save_results': False,
 'system': 'LC',
 'time_generated': 1701067884.515601,
 'transformer_bucket': '',
 'transformer_key': 'dataset/curated_kcna5_ic50/RF_computed_descriptors_scaffold_regression/b0421e6c-c5e2-4d05-9392-8cb5b1516bd3/transformers.pkl',
 'transformer_oid': '',
 'transformers': True,
 'uncertainty': True}

In [12]:
# Column names in the dataset that AMPL needs to know about
response_col = "avg_pIC50"
compound_id = "compound_id"
smiles_col = "base_rdkit_smiles"

# Training configuration handed to the AMPL parameter parser.
# Each key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry, which hides configuration mistakes.
params = {
        "verbose": "True",
        "system": "LC",
        "datastore": "False",
        "save_results": "False",
        "prediction_type": "regression",
        "dataset_key": dataset_file,
        "id_col": compound_id,
        "smiles_col": smiles_col,
        "response_cols": response_col,
        "previously_split": "False",
        "split_only": "False",
        "splitter": "scaffold",
        "split_valid_frac": "0.15",
        "split_test_frac": "0.15",
        "featurizer": "computed_descriptors",
        "descriptor_type" : "rdkit_raw",
        "model_type": "RF",
        "transformers": "True",
        'max_epochs': '70',
        "rerun": "False",
        "result_dir": odir
    }

# Import the AMPL modules this cell relies on so that it also runs on a fresh kernel
from atomsci.ddm.pipeline import model_pipeline as mp
from atomsci.ddm.pipeline import parameter_parser as parse

# Parse the parameter dict, build the model pipeline from it, and train the model
ampl_param = parse.wrapper(params)
pl = mp.ModelPipeline(ampl_param)
pl.train_model()

INFO:ATOM:Created a dataset hash 'd73e30e5b0ddf05e34665d76e5c62d27' from dataset_key '/gpfs/gsfs12/users/lup2/AMPL/AMPL_setup_tutorials/atomsci/ddm/examples/tutorials2023/dataset/curated_kcna5_ic50.csv'
DEBUG:ATOM:Attempting to load featurized dataset
DEBUG:ATOM:Got dataset, attempting to extract data
DEBUG:ATOM:Creating deepchem dataset
INFO:ATOM:Using prefeaturized data; number of features = 200
INFO:ATOM:Wrote transformers to dataset/curated_kcna5_ic50/RF_computed_descriptors_scaffold_regression/7cd56c1d-de44-445c-aa2b-a120b901a4cd/transformers.pkl
INFO:ATOM:Transforming response data
INFO:ATOM:Transforming feature data
  X = np.nan_to_num((X - self.X_means) * X_weight / self.X_stds)
INFO:ATOM:Transforming response data
INFO:ATOM:Transforming feature data
INFO:ATOM:Transforming response data
INFO:ATOM:Transforming feature data
INFO:ATOM:Fitting random forest model
INFO:ATOM:Fold 0: training r2_score = 0.940, validation r2_score = 0.188, test r2_score = 0.397
INFO:ATOM:Wrote model ta

In [13]:
# Model Performance
from atomsci.ddm.pipeline import compare_models as cm
# Gather the performance results of the regression models saved under `odir`
pred_df = cm.get_filesystem_perf_results(
    odir,
    pred_type='regression',
)

DEBUG:ATOM:Model tracker client not supported in your environment; can look at models in filesystem only.


Found data for 2 models under dataset


The pred_df dataframe has details about the model_uuid, model_path, ampl_version, model_type, features, splitter and the results for popular metrics that help evaluate the performance. Let us view the contents of the pred_df dataframe.

In [14]:
# View the pred_df dataframe: the bare expression makes the DataFrame this cell's
# rich output (the captured output shows one row per model found).
pred_df

Unnamed: 0,model_uuid,model_path,ampl_version,model_type,dataset_key,features,splitter,model_score_type,feature_transform_type,model_choice_score,...,rf_max_depth,max_epochs,best_epoch,learning_rate,layer_sizes,dropouts,xgb_gamma,xgb_learning_rate,model_parameters_dict,feat_parameters_dict
0,b0421e6c-c5e2-4d05-9392-8cb5b1516bd3,dataset/curated_kcna5_ic50_model_b0421e6c-c5e2...,1.6.0,RF,/gpfs/gsfs12/users/lup2/AMPL/AMPL_setup_tutori...,rdkit_raw,scaffold,r2,normalization,0.249681,...,,,,,,,,,"{""rf_estimators"": 500, ""rf_max_depth"": null, ""...",{}
1,7cd56c1d-de44-445c-aa2b-a120b901a4cd,dataset/curated_kcna5_ic50_model_7cd56c1d-de44...,1.6.0,RF,/gpfs/gsfs12/users/lup2/AMPL/AMPL_setup_tutori...,rdkit_raw,scaffold,r2,normalization,0.188429,...,,,,,,,,,"{""rf_estimators"": 500, ""rf_max_depth"": null, ""...",{}


In [15]:
# Rank the models by validation R^2 (highest first) and keep the best one as a Series
ranked_models = pred_df.sort_values(by="best_valid_r2_score", ascending=False)
top_model = ranked_models.iloc[0]
top_model

model_uuid                               b0421e6c-c5e2-4d05-9392-8cb5b1516bd3
model_path                  dataset/curated_kcna5_ic50_model_b0421e6c-c5e2...
ampl_version                                                            1.6.0
model_type                                                                 RF
dataset_key                 /gpfs/gsfs12/users/lup2/AMPL/AMPL_setup_tutori...
features                                                            rdkit_raw
splitter                                                             scaffold
model_score_type                                                           r2
feature_transform_type                                          normalization
model_choice_score                                                   0.249681
best_train_r2_score                                                  0.941705
best_train_rms_score                                                 0.195702
best_train_mae_score                                            

In [16]:
# Path of the saved archive for the top performing model
top_model["model_path"]

'dataset/curated_kcna5_ic50_model_b0421e6c-c5e2-4d05-9392-8cb5b1516bd3.tar.gz'