### QSAR Models for Blood Brain Barrier (BBB)

In [310]:
import rdkit

In [311]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

In [312]:
# Run the following three lines of code if you want to check and explore how some of the molecules in the dataset actually look like


# from rdkit.Chem.Draw import IPythonConsole
# from rdkit.Chem import Draw
# IPythonConsole.ipython_useSVG=True

# set the last line of code to False if you want PNGs instead of SVGs

In [313]:
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef

In [314]:
#import joblib for compressing and saving some of the models and datasets
import joblib

#### Reading molecules and activity from SDF

In [315]:
# we read the SDF file and make a list of molecules in mols and a list of their activity (logBB) in y

mols = []
y = []
for mol in Chem.SDMolSupplier("logBB.sdf"):
    if mol is not None:
        mols.append(mol)
        y.append(mol.GetIntProp("logBB_class"))

#### Calculate descriptors (fingerprints) and convert them into numpy array

In [316]:
# generate binary Morgan fingerprint with radius 2 for each molecule
fp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]

In [317]:
fp

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427800>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e4270d0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427c60>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427ad0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427670>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427490>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427a80>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e4278f0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427710>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427760>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427120>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e4275d0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e4276c0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2724e427530>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect

In [318]:
#Convert the morgan fingerprints from the rdkit format to a readable numpy array.

def rdkit_numpy_convert(fp):
    output = []
    for f in fp:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [319]:
x = rdkit_numpy_convert(fp)

In [320]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [321]:
x.shape

(321, 2048)

In [322]:
# check wether the data set is balanced
sum(y) / len(y)

0.5545171339563862

#### Set random seed to make all further calculations reproducible

In [323]:
seed = 42

#### Split the whole set on training and test sets

In [324]:
# randomly select 20% of compounds as test set
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.20, random_state=seed)

In [325]:
#checking the distribution of test data
print(y_ts)

[0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1]


#### Create folds for cross-validation

In [326]:
cv = StratifiedKFold(n_splits=5, random_state=None)

In [327]:
# print out the ids of folds
for i, (train_index, test_index) in enumerate(cv.split(x_tr, y_tr)):
    print("\nFold_" + str(i+1))
    print("TRAIN:", train_index)
    print("TEST:", test_index)


Fold_1
TRAIN: [ 44  48  49  51  52  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87
  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105
 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
 250 251 252 253 254 255]
TEST: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 45 46 47 50
 53 

#### Scale X

This step may be crucial for certain modeling approaches lke SVM.
In the case of binary fingerprints it may be less useful.

In [328]:
# Instantiate a scale object which can be further applied to scale any data to fit the training set
scale = StandardScaler().fit(x_tr)
x_tr = scale.transform(x_tr)

In [329]:
# it is a good idea to save it for future use with joblib
joblib.dump(scale, "logBB_scale.pkl", compress=3)

['logBB_scale.pkl']

#### Search for optimal tuning parameters and build the Radom Forest model

In [330]:
# create grid search dictionary
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [331]:
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [332]:
# run model building
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(), n_jobs=2,
             param_grid={'max_features': [204, 292, 409, 682],
                         'n_estimators': [100, 250, 500]},
             verbose=1)

In [333]:
#check which parameters gives us the best result
m.best_params_

{'max_features': 292, 'n_estimators': 100}

In [334]:
#check the best score opbtained after running the model on multiple folds and candidates
m.best_score_

0.7655354449472096

In [335]:
#check the results of different folds and candidates to get a better idea of our dataset and the performance of our model
m.cv_results_

{'mean_fit_time': array([0.50453482, 1.16934047, 2.10327806, 0.43078108, 1.04966002,
        1.94051886, 0.45112343, 1.02720408, 2.01914606, 0.46382065,
        1.16148992, 2.24114404]),
 'std_fit_time': array([0.01262813, 0.08170524, 0.10029136, 0.01850265, 0.12483956,
        0.08378965, 0.02620548, 0.04987559, 0.06467989, 0.02071756,
        0.06346218, 0.06078578]),
 'mean_score_time': array([0.01565156, 0.04020329, 0.06788735, 0.01340017, 0.0331955 ,
        0.05152097, 0.01112661, 0.02510343, 0.04869056, 0.01601925,
        0.02251916, 0.04065094]),
 'std_score_time': array([0.00193342, 0.0157918 , 0.00376278, 0.00149715, 0.0046486 ,
        0.00592093, 0.00578574, 0.00623024, 0.00294776, 0.00412548,
        0.00830344, 0.00766564]),
 'param_max_features': masked_array(data=[204, 204, 204, 292, 292, 292, 409, 409, 409, 682, 682,
                    682],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
  

In [336]:
#explore the data to understand the difference in performance between different candidates
m.cv_results_['mean_test_score']

array([0.74600302, 0.74600302, 0.74984917, 0.76553544, 0.74992459,
       0.74984917, 0.74215686, 0.73815988, 0.73808446, 0.74607843,
       0.74600302, 0.74600302])

In [337]:
m.cv_results_['params']

[{'max_features': 204, 'n_estimators': 100},
 {'max_features': 204, 'n_estimators': 250},
 {'max_features': 204, 'n_estimators': 500},
 {'max_features': 292, 'n_estimators': 100},
 {'max_features': 292, 'n_estimators': 250},
 {'max_features': 292, 'n_estimators': 500},
 {'max_features': 409, 'n_estimators': 100},
 {'max_features': 409, 'n_estimators': 250},
 {'max_features': 409, 'n_estimators': 500},
 {'max_features': 682, 'n_estimators': 100},
 {'max_features': 682, 'n_estimators': 250},
 {'max_features': 682, 'n_estimators': 500}]

#### Save model

In [338]:
#save the model for future use with joblib
joblib.dump(m, "logBB_rf_morgan.pkl", compress=3)

['logBB_rf_morgan.pkl']

#### Predict test set compounds

In [339]:
# load scale if necessary
scale = joblib.load("logBB_scale.pkl")

In [340]:
# scale descriptors of the test set compounds
x_ts = scale.transform(x_ts)

In [341]:
# predict logBB class
pred_rf = m.predict(x_ts)

In [342]:
pred_rf

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1])

#### calculate statistics for test set preditions

In [343]:
accuracy_score(y_ts, pred_rf)

0.7692307692307693

In [344]:
matthews_corrcoef(y_ts, pred_rf)

0.5225580604829463

In [345]:
cohen_kappa_score(y_ts, pred_rf)

0.5222929936305732

#### applicability domain estimates

In [346]:
# if the model includes several ones like RF models or consensus models (or for probabilistic models)
# we can calculate consistency of predictions amongs those models and use it for estimation of applicability domain
pred_prob = m.predict_proba(x_ts)

In [347]:
# probablity
pred_prob

array([[0.94, 0.06],
       [0.17, 0.83],
       [0.3 , 0.7 ],
       [0.23, 0.77],
       [0.39, 0.61],
       [0.24, 0.76],
       [0.03, 0.97],
       [0.01, 0.99],
       [0.92, 0.08],
       [0.88, 0.12],
       [0.21, 0.79],
       [0.86, 0.14],
       [0.48, 0.52],
       [0.93, 0.07],
       [0.59, 0.41],
       [0.49, 0.51],
       [0.1 , 0.9 ],
       [0.06, 0.94],
       [0.01, 0.99],
       [0.1 , 0.9 ],
       [0.49, 0.51],
       [0.01, 0.99],
       [0.17, 0.83],
       [0.43, 0.57],
       [0.21, 0.79],
       [0.43, 0.57],
       [0.57, 0.43],
       [0.38, 0.62],
       [0.81, 0.19],
       [0.68, 0.32],
       [0.02, 0.98],
       [0.99, 0.01],
       [0.12, 0.88],
       [0.77, 0.23],
       [0.23, 0.77],
       [0.53, 0.47],
       [0.87, 0.13],
       [0.76, 0.24],
       [0.59, 0.41],
       [0.  , 1.  ],
       [0.91, 0.09],
       [0.  , 1.  ],
       [0.3 , 0.7 ],
       [0.51, 0.49],
       [0.23, 0.77],
       [0.76, 0.24],
       [0.23, 0.77],
       [0.79,

In [348]:
# setup threshold
threshold = 0.8

In [349]:
# calc maximum predicted probability for each row (compound) and compare to the threshold
da = np.amax(pred_prob, axis=1) > threshold

In [350]:
da

array([ True,  True, False, False, False, False,  True,  True,  True,
        True, False,  True, False,  True, False, False,  True,  True,
        True,  True, False,  True,  True, False, False, False, False,
       False,  True, False,  True,  True,  True, False, False, False,
        True, False, False,  True,  True,  True, False, False, False,
       False, False, False,  True, False, False,  True, False, False,
       False,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True])

In [351]:
# calculate statistics
accuracy_score(np.asarray(y_ts)[da], pred_rf[da])

0.90625

In [352]:
matthews_corrcoef(np.asarray(y_ts)[da], pred_rf[da])

0.8050112948805689

In [353]:
cohen_kappa_score(np.asarray(y_ts)[da], pred_rf[da])

0.8032786885245902

In [354]:
# calculate coverage
sum(da) / len(da)

0.49230769230769234

#### Build SVM model

In [355]:
# create grid search dictionary
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [356]:
# setup model building
svm = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid, n_jobs=2, cv=cv, verbose=1)

In [357]:
# run model building
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=SVC(probability=True), n_jobs=2,
             param_grid={'C': [1, 10, 100, 1000, 10000],
                         'gamma': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]},
             verbose=1)

In [358]:
#check the best score opbtained after running the model on multiple folds and candidates
svm.best_score_

0.753921568627451

In [359]:
#check which parameters gives us the best result
svm.best_params_

{'C': 100, 'gamma': 0.0001}

In [360]:
# save the model for future use with joblib
joblib.dump(svm, "logBB_svm_morgan.pkl", compress=3)

['logBB_svm_morgan.pkl']

In [361]:
# predict logBB class for the test set compounds
pred_svm = svm.predict(x_ts)

In [362]:
pred_svm

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1])

In [363]:
# calculate the statistics
print("Accuracy = ", accuracy_score(y_ts, pred_svm))
print("MCC = ", matthews_corrcoef(y_ts, pred_svm))
print("Kappa = ", cohen_kappa_score(y_ts, pred_svm))

Accuracy =  0.7384615384615385
MCC =  0.4527053324612084
Kappa =  0.4466700050075113


In [364]:
# estimate applicability domain and calculate statistics again
pred_prob = svm.predict_proba(x_ts)

In [365]:
da = np.amax(pred_prob, axis=1) > threshold

In [366]:
print("Accuracy = ", accuracy_score(np.asarray(y_ts)[da], pred_svm[da]))
print("MCC = ", matthews_corrcoef(np.asarray(y_ts)[da], pred_svm[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(y_ts)[da], pred_svm[da]))
print("Coverage = ", sum(da) / len(da))

Accuracy =  0.8378378378378378
MCC =  0.6606958729393098
Kappa =  0.6563467492260062
Coverage =  0.5692307692307692


### build the third model (GBM) and compute consensus predictions from RF, and SVM models

In [367]:
# setup model building
param_grid = {"n_estimators": [100, 200, 300, 400, 500]}
gbm = GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5), 
                   param_grid, n_jobs=2, cv=cv, verbose=1)

In [368]:
# run model building
gbm.fit(x_tr, y_tr)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=GradientBoostingClassifier(max_features=0.5,
                                                  subsample=0.5),
             n_jobs=2, param_grid={'n_estimators': [100, 200, 300, 400, 500]},
             verbose=1)

In [369]:
#check the best score opbtained after running the model on multiple folds and candidates
gbm.best_score_

0.7654600301659126

In [370]:
#check which parameters gives us the best result
gbm.best_params_

{'n_estimators': 100}

In [371]:
pred_gbm = gbm.predict(x_ts)

In [372]:
# calculate statistics
print("Accuracy = ", accuracy_score(y_ts, pred_gbm))
print("MCC = ", matthews_corrcoef(y_ts, pred_gbm))
print("Kappa = ", cohen_kappa_score(y_ts, pred_gbm))

Accuracy =  0.8153846153846154
MCC =  0.6170300809990545
Kappa =  0.6157635467980296


#### consensus model

In [373]:
#the consensus model is used to average the results between deifferent models and produce a set of results with lesser disparities
#Here, we take the average of results and declare it as permeable (1) if the average is greater than or equal to 0.5
#or non permeable (0) if the average is smaller than 0.5
pred_c = 1 * (((pred_rf + pred_svm + pred_gbm) / 3) >= 0.5)

In [374]:
pred_c

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1])

In [375]:
# calc statistics
print("Accuracy = ", accuracy_score(y_ts, pred_c))
print("MCC = ", matthews_corrcoef(y_ts, pred_c))
print("Kappa = ", cohen_kappa_score(y_ts, pred_c))

Accuracy =  0.7846153846153846
MCC =  0.5528589525751528
Kappa =  0.5517241379310345


### Add some other descriptors with morgan fingerprinnts and look at the model performance

In [376]:
#now we give our model some more details about our chemical compound other than it's morgan fingerprint,
#to study the change in the performance of our model+*

In [377]:
# calculate some more relevant descriptors
descr = []
for m in mols:
    descr.append([Descriptors.MolLogP(m),
                  Descriptors.TPSA(m),
                  Descriptors.NHOHCount(m),
                  Descriptors.NOCount(m),
                  Descriptors.NumHAcceptors(m),
                  Descriptors.NumHDonors(m),
                  Descriptors.NumRotatableBonds(m),
                  Descriptors.NumHeteroatoms(m),
                  Descriptors.FractionCSP3(m)])
descr = np.asarray(descr)

In [378]:
descr.shape

(321, 9)

In [379]:
# add these descriptors to morgan fingerprints
x = np.concatenate((x, descr), axis=1)

In [380]:
x.shape

(321, 2057)

In [381]:
# randomly select 20% of compounds as test set
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.20, random_state=seed)

In [382]:
#scale the new training data
scale = StandardScaler().fit(x_tr)
x_tr = scale.transform(x_tr)

In [383]:
# create grid search dictionary
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [384]:
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [385]:
# run model building
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(), n_jobs=2,
             param_grid={'max_features': [205, 293, 411, 685],
                         'n_estimators': [100, 250, 500]},
             verbose=1)

In [386]:
#check the best score opbtained after running the model on multiple folds and candidates
m.best_score_

0.7890648567119155

In [387]:
# scale descriptors of the test set compounds
x_ts = scale.transform(x_ts)

In [388]:
# predict logBB class for the test set compounds
pred = m.predict(x_ts)

In [389]:
pred

array([0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1])

In [390]:
# calc statistics
print("Accuracy = ", accuracy_score(y_ts, pred))
print("MCC = ", matthews_corrcoef(y_ts, pred))
print("Kappa = ", cohen_kappa_score(y_ts, pred))

Accuracy =  0.8461538461538461
MCC =  0.6812012094229563
Kappa =  0.6798029556650247


In [391]:
# estimate applicability domain and calc stat
pred_prob = m.predict_proba(x_ts)

In [392]:
da = np.amax(pred_prob, axis=1) > threshold

In [393]:
print("Accuracy = ", accuracy_score(np.asarray(y_ts)[da], pred[da]))
print("MCC = ", matthews_corrcoef(np.asarray(y_ts)[da], pred[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(y_ts)[da], pred[da]))
print("Coverage = ", sum(da) / len(da))

Accuracy =  0.9696969696969697
MCC =  0.9354143466934854
Kappa =  0.9333333333333333
Coverage =  0.5076923076923077


The model has a better accuracy. Added descritors improved the model predictivity.

#### Let's try to analyse which variables are the most important in the model

In [394]:
# rebuild RF model manually using best parameters to be able to extract additional information from the model
rf = RandomForestClassifier(n_estimators=m.best_params_["n_estimators"], 
                           max_features=m.best_params_["max_features"],
                           random_state=seed)
rf.fit(x_tr, y_tr)

RandomForestClassifier(max_features=293, n_estimators=250, random_state=42)

In [395]:
#to find out the importance of each feature sepeartely and give a numerical value to it
imp = rf.feature_importances_

In [396]:
imp

array([0.        , 0.00254217, 0.00036993, ..., 0.00855873, 0.03689803,
       0.02239462])

In [397]:
indices = np.argsort(imp)[::-1]

print("Feature ranking:")

# print top 10 features
for i in range(10):
    print("%d. feature %d (%f)" % (i + 1, indices[i], imp[indices[i]]))

Feature ranking:
1. feature 2049 (0.117948)
2. feature 2051 (0.059470)
3. feature 2048 (0.046727)
4. feature 2055 (0.036898)
5. feature 650 (0.034344)
6. feature 2052 (0.030104)
7. feature 2056 (0.022395)
8. feature 2050 (0.018988)
9. feature 2053 (0.015880)
10. feature 807 (0.013109)


2049 - MolLogP  
2050 - TPSA(m)  
2051 - NHOHCount  
2052 - NOCount 
2053 - NumHAcceptors  
2054 - NumHDonors  
2055 - NumRotatableBonds  
2056 - NumHeteroatoms  
2057 - FractionCSP3

features with numbers 1-2048 are different Morgan fingerprints  