### Simple QSAR example from a 2017 drug design workshop
Danny Barbaro


In [3]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

In [4]:
# from rdkit.Chem.Draw import IPythonConsole
# from rdkit.Chem import Draw
# IPythonConsole.ipython_useSVG=True

In [5]:
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
from sklearn.externals import joblib




#### Reading data (which includes molecules and activity) from an SDF file
An SDF file is a Standard Database Format which is kind of like a SQL database in a text format

In [6]:
# logBB is a database of molecules
file_name = "data/logBB.sdf"
molecules = []
y = []
# Get all the molecules out of the database and parse them with the Chem library
for molecule in Chem.SDMolSupplier(file_name):
    if molecule is not None:
        molecules.append(molecule)
        y.append(molecule.GetIntProp("logBB_class"))

#### Calculate descriptors (fingerprints) and convert them into numpy array

In [7]:
# generate binary Morgan fingerprint with radius 2

# Morgan fingerprint: go through each atom of the molecule and obtain all possible paths through this atom with a specific radius. Then each unique path is hashed into a number.
fingerprints = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in molecules]

In [8]:
def rdkit_numpy_convert(fingerprints):
    output = []
    for fingerprint in fingerprints:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
        output.append(arr)
    return np.asarray(output)

In [9]:
x = rdkit_numpy_convert(fingerprints)

In [10]:
x.shape

(321, 2048)

In [11]:
# check if the data set is approximately balanced
# a perfect balance is 0.5 and the closer we are to this, the more effecticve the compution becomes
sum(y) / len(y)

0.5545171339563862

#### Set random seed to make all further calculations reproducible

In [12]:
seed = 73

#### Split the whole set on training and test sets

In [13]:
# randomly select 20% of compounds to be part of the test set
x_training, x_testing, y_training, y_testing = train_test_split(x, y, test_size=0.20, random_state=seed)

In [14]:
print(y_testing)

[1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1]


#### Create stratified folds for cross-validation


In [15]:
cross_validations = StratifiedKFold(n_splits=5, random_state=seed)

In [16]:
# print out ids of folds
for i, (train_index, test_index) in enumerate(cross_validations.split(x_training, y_training)):
    print("\nFold_" + str(i))
    print("TRAIN:", train_index)
    print("TEST:", test_index)


Fold_0
TRAIN: [ 46  47  48  50  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87
  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105
 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
 250 251 252 253 254 255]
TEST: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 49 51
 52 

#### Scale X
This step may be crucial for certain modeling approaches such as support-vector machines (SVMs) in machine learning.
In the case of something more binary like whast is being doen here, it may prove to be less useful.

In [17]:
# create a new scaler which can later be used to apply the scale to any data to fit the given training set
scaler = StandardScaler().fit(x_training)
x_training = scaler.transform(x_training)

In [19]:
# for future convenience, we will dump the scaler object as a pickle files
#  pickle is just a serialization of a python object
joblib.dump(scaler, "data/logBB_scaler.pkl", compress=3)

['data/logBB_scaler.pkl']

## Building a random forest model

In [20]:
# create grid search dictionary
parameter_grid = {"max_features": [x_training.shape[1] // 10, x_training.shape[1] // 7, x_training.shape[1] // 5, x_training.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [21]:
# setup model using a random forest
model = GridSearchCV(RandomForestClassifier(), parameter_grid, n_jobs=2, cv=cross_validations, verbose=1)

In [22]:
# run model
model.fit(x_training, y_training)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   26.2s
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   37.9s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=73, shuffle=False),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
          

In [23]:
model.best_params_

{'max_features': 682, 'n_estimators': 100}

In [24]:
model.best_score_

0.80078125

In [25]:
model.cv_results_

{'mean_fit_time': array([0.43746314, 0.85785494, 1.53192692, 0.34266086, 0.83844147,
        1.67527976, 0.38311658, 0.95426869, 1.87873321, 0.4984786 ,
        1.15881963, 2.8201026 ]),
 'std_fit_time': array([0.18937149, 0.20664881, 0.03771478, 0.00383038, 0.0073081 ,
        0.00969643, 0.00795306, 0.01067708, 0.01977272, 0.04383401,
        0.0137222 , 0.4192366 ]),
 'mean_score_time': array([0.01271443, 0.02134485, 0.04073129, 0.01018934, 0.02102442,
        0.04052019, 0.00827141, 0.02067394, 0.05436492, 0.009868  ,
        0.01995416, 0.04068213]),
 'std_score_time': array([0.00455198, 0.00062809, 0.00216496, 0.00075373, 0.00163178,
        0.00157054, 0.00114138, 0.00080743, 0.02938982, 0.00164787,
        0.00097284, 0.00392852]),
 'param_max_features': masked_array(data=[204, 204, 204, 292, 292, 292, 409, 409, 409, 682, 682,
                    682],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
  

In [26]:
model.cv_results_['mean_test_score']

array([0.7734375 , 0.7890625 , 0.77734375, 0.796875  , 0.79296875,
       0.78125   , 0.796875  , 0.7734375 , 0.7890625 , 0.80078125,
       0.78515625, 0.7890625 ])

In [27]:
model.cv_results_['params']

[{'max_features': 204, 'n_estimators': 100},
 {'max_features': 204, 'n_estimators': 250},
 {'max_features': 204, 'n_estimators': 500},
 {'max_features': 292, 'n_estimators': 100},
 {'max_features': 292, 'n_estimators': 250},
 {'max_features': 292, 'n_estimators': 500},
 {'max_features': 409, 'n_estimators': 100},
 {'max_features': 409, 'n_estimators': 250},
 {'max_features': 409, 'n_estimators': 500},
 {'max_features': 682, 'n_estimators': 100},
 {'max_features': 682, 'n_estimators': 250},
 {'max_features': 682, 'n_estimators': 500}]

#### Dump the model

In [28]:
joblib.dump(model, "data/logBB_random_forest_morgan.pkl", compress=3)

['data/logBB_random_forest_morgan.pkl']

#### Predict the test set compounds with the model now

In [29]:
# load scaler from out pickle file if we didn't run it previously
scaler = joblib.load("data/logBB_scaler.pkl")

In [30]:
# apply the scaler to the test set
x_testing = scaler.transform(x_testing)

In [31]:
# predict logBB class
prediction_random_forest = model.predict(x_testing)

In [32]:
prediction_random_forest

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1])

#### Time to calculate some statistics for test set preditions

In [61]:
# simple ration between the two sets. 1 is perfect
print("Accuracy = ", accuracy_score(y_testing, prediction_random_forest))

# Matthews correlation coefficient
#   It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes
#   A coefficient of +1 represents a perfect prediction, 0 no better than random prediction and −1 indicates total disagreement between prediction and observation
print("MCC = ", matthews_corrcoef(y_testing, prediction_random_forest))

# Cohen's kappa
#   Thought to be a more robust measure than simple percent agreement calculation, as kappa takes into account the possibility of the agreement occurring by chance
#   A coefficient of +1 represents a perfect prediction, 0 is random, and negative is a inverse relation between model and results
print("Kappa = ", cohen_kappa_score(y_testing, prediction_random_forest))

Accuracy =  0.7230769230769231
MCC =  0.45743323311233
Kappa =  0.44967074317968014


#### Applicability Domain Estimates

In [36]:
# If a model includes several sub-models like a random forest model or consensus models, 
# we can calculate consistency of predictions across those models and use it for an estimation of applicability domain
prediction_probability = model.predict_proba(x_testing)

In [37]:
prediction_probability

array([[0.03, 0.97],
       [0.14, 0.86],
       [0.61, 0.39],
       [0.4 , 0.6 ],
       [0.54, 0.46],
       [0.05, 0.95],
       [0.01, 0.99],
       [0.29, 0.71],
       [0.57, 0.43],
       [0.12, 0.88],
       [0.35, 0.65],
       [0.27, 0.73],
       [0.08, 0.92],
       [0.27, 0.73],
       [0.31, 0.69],
       [0.96, 0.04],
       [0.37, 0.63],
       [0.56, 0.44],
       [0.33, 0.67],
       [0.93, 0.07],
       [0.  , 1.  ],
       [0.28, 0.72],
       [0.48, 0.52],
       [0.46, 0.54],
       [0.34, 0.66],
       [0.41, 0.59],
       [0.01, 0.99],
       [0.  , 1.  ],
       [0.96, 0.04],
       [0.79, 0.21],
       [0.84, 0.16],
       [0.3 , 0.7 ],
       [0.64, 0.36],
       [0.65, 0.35],
       [0.62, 0.38],
       [0.19, 0.81],
       [0.57, 0.43],
       [0.64, 0.36],
       [0.16, 0.84],
       [0.65, 0.35],
       [0.62, 0.38],
       [0.98, 0.02],
       [0.12, 0.88],
       [0.28, 0.72],
       [0.  , 1.  ],
       [0.95, 0.05],
       [0.94, 0.06],
       [0.37,

In [38]:
threshold = 0.8

In [39]:
# Calculate the maximum predicted probability for each compound and compare it to the desired threshold
da = np.amax(prediction_probability, axis=1) > threshold

In [40]:
da

array([ True,  True, False, False, False,  True,  True, False, False,
        True, False, False,  True, False, False,  True, False, False,
       False,  True,  True, False, False, False, False, False,  True,
        True,  True, False,  True, False, False, False, False,  True,
       False, False,  True, False, False,  True,  True, False,  True,
        True,  True, False, False, False, False, False, False,  True,
        True, False,  True, False,  True,  True, False, False,  True,
        True, False])

#### Time to calculate some statistics for the prediciton probabilities

In [60]:
print("Accuracy = ", accuracy_score(np.asarray(y_testing)[da], prediction_random_forest[da]))
print("MCC = ", matthews_corrcoef(np.asarray(y_testing)[da], prediction_random_forest[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(y_testing)[da], prediction_random_forest[da]))
print("Coverage = ", sum(da) / len(da))

Accuracy =  0.813953488372093
MCC =  0.6600586246040802
Kappa =  0.6348195329087049
Coverage =  0.6615384615384615


#### What Have We Just Accomplished?
1. Learned how to read in data from a common file format in the field (sdf)
2. Learned how to split our large ammount of data randomly yet consistently for training and testing
3. Learned how to train and run a model
4. Examined some common correclation measures
5. Learned how to save out model and other objects for reuse

### Moving onto building another common model

## Second Model: Support-Vector Machines (SVM)

In [45]:
# create grid search dictionary
parameter_grid = [{'kernel': ['rbf'], 'gamma': [10 ** i for i in range(-6, 0)],
                     'C': [10 ** i for i in range(0, 5)]},
                    {'kernel': ['linear'], 'C': [10 ** i for i in range(0, 5)]}]

In [46]:
# setup model
svm = GridSearchCV(SVC(probability=True), parameter_grid, n_jobs=2, cv=cross_validations, verbose=1)

In [47]:
# run model
svm.fit(x_training, y_training)

Fitting 5 folds for each of 35 candidates, totalling 175 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   14.9s
[Parallel(n_jobs=2)]: Done 175 out of 175 | elapsed:   56.6s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=73, shuffle=False),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=True, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=2,
             param_grid=[{'C': [1, 10, 100, 1000, 10000],
                          'gamma': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000, 10000],
                          'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [48]:
svm.best_score_

0.7734375

In [49]:
svm.best_params_

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}

In [50]:
# save model
joblib.dump(svm, "data/logBB_svm_morgan.pkl", compress=3)

['data/logBB_svm_morgan.pkl']

In [53]:
# predict logBB for the test set compounds
prediction_svm = svm.predict(x_testing)

In [54]:
prediction_svm

array([1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0])

In [55]:
# calculate statistics
print("Accuracy = ", accuracy_score(y_testing, prediction_svm))
print("MCC = ", matthews_corrcoef(y_testing, prediction_svm))
print("Kappa = ", cohen_kappa_score(y_testing, prediction_svm))

Accuracy =  0.676923076923077
MCC =  0.37500790630501557
Kappa =  0.360655737704918


In [57]:
threshold = 0.8

In [56]:
# estimate applicability domain and calc stat
prediciton_probability = svm.predict_proba(x_testing)

In [58]:
da = np.amax(prediciton_probability, axis=1) > threshold

In [59]:
print("Accuracy = ", accuracy_score(np.asarray(y_testing)[da], prediction_svm[da]))
print("MCC = ", matthews_corrcoef(np.asarray(y_testing)[da], prediction_svm[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(y_testing)[da], prediction_svm[da]))
print("Coverage = ", sum(da) / len(da))

Accuracy =  0.7441860465116279
MCC =  0.5087719298245614
Kappa =  0.49519743863393817
Coverage =  0.6615384615384615


### Third model: Gradient Boosting Classifier (GBM)
#### This will compute the consensus predictions from random forest and SVM models

In [64]:
# setup model
parameter_grid = {"n_estimators": [100, 200, 300, 400, 500]}
gbm = GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5), 
                   parameter_grid, n_jobs=2, cv=cross_validations, verbose=1)

In [66]:
# run model
gbm.fit(x_training, y_training)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  25 out of  25 | elapsed:   18.1s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=73, shuffle=False),
             error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=0.5,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c

In [67]:
gbm.best_score_

0.796875

In [68]:
gbm.best_params_

{'n_estimators': 100}

In [69]:
prediction_gbm = gbm.predict(x_testing)

In [70]:
# calc statistics
print("Accuracy = ", accuracy_score(y_testing, prediction_gbm))
print("MCC = ", matthews_corrcoef(y_testing, prediction_gbm))
print("Kappa = ", cohen_kappa_score(y_testing, prediction_gbm))

Accuracy =  0.7076923076923077
MCC =  0.44894106591270994
Kappa =  0.4231667445119103


#### consensus model

In [73]:
prediction_consensus = 1 * (((prediction_random_forest + prediction_svm + prediction_gbm) / 3) >= 0.5)

In [74]:
prediction_consensus

array([1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0])

In [75]:
# calculate statistics
print("Accuracy = ", accuracy_score(y_testing, prediction_consensus))
print("MCC = ", matthews_corrcoef(y_testing, prediction_consensus))
print("Kappa = ", cohen_kappa_score(y_testing, prediction_consensus))

Accuracy =  0.7076923076923077
MCC =  0.4298466409757644
Kappa =  0.41991545326444346


### Add to the fingerprints some other descriptors and look at the consensus model performance

In [102]:
# calculate some descriptors
descriptors = []
for molecule in molecules:
    descriptors.append([Descriptors.MolLogP(molecule),
                  Descriptors.TPSA(molecule),
                  Descriptors.NHOHCount(molecule),
                  Descriptors.NOCount(molecule),
                  Descriptors.NumHAcceptors(molecule),
                  Descriptors.NumHDonors(molecule),
                  Descriptors.NumRotatableBonds(molecule),
                  Descriptors.NumHeteroatoms(molecule),
                  Descriptors.FractionCSP3(molecule)])
descriptors = np.asarray(descriptors)

In [103]:
descriptors.shape

(321, 9)

In [104]:
# add them to morgan fingerprints
x = np.concatenate((x, descriptors), axis=1)

In [105]:
x.shape

(321, 2066)

In [106]:
# randomly select 20% of compounds as test set
x_training, x_testing, y_training, y_testing = train_test_split(x, y, test_size=0.20, random_state=seed)

In [107]:
scaler = StandardScaler().fit(x_training)
x_training = scaler.transform(x_training)

In [108]:
# create grid search dictionary
parameter_grid = {"max_features": [x_training.shape[1] // 10, x_training.shape[1] // 7, x_training.shape[1] // 5, x_training.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [109]:
# setup model 
model = GridSearchCV(RandomForestClassifier(), parameter_grid, n_jobs=2, cv=cross_validations, verbose=1)

In [110]:
# run model
model.fit(x_training, y_training)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   26.6s
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   36.3s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=73, shuffle=False),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
          

In [111]:
model.best_score_

0.86328125

In [112]:
x_testing = scaler.transform(x_testing)
prediction = model.predict(x_testing)

In [113]:
prediction

array([1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1])

In [114]:
# Calculate Statistics
print("Accuracy = ", accuracy_score(y_testing, prediction))
print("MCC = ", matthews_corrcoef(y_testing, prediction))
print("Kappa = ", cohen_kappa_score(y_testing, prediction))

Accuracy =  0.7538461538461538
MCC =  0.5281452275692496
Kappa =  0.5121951219512195


In [115]:
# estimate applicability domain and calc stat
prediction_probability = model.predict_proba(x_testing)

In [116]:
# threshold is defined above for other models
da = np.amax(prediction_probability, axis=1) > threshold

In [117]:
print("Accuracy = ", accuracy_score(np.asarray(y_testing)[da], prediction[da]))
print("MCC = ", matthews_corrcoef(np.asarray(y_testing)[da], prediction[da]))
print("Kappa = ", cohen_kappa_score(np.asarray(y_testing)[da], prediction[da]))
print("Coverage = ", sum(da) / len(da))

Accuracy =  0.8620689655172413
MCC =  0.7552286874651746
Kappa =  0.7264150943396226
Coverage =  0.4461538461538462


The model has a better accuracy. Added descritors improved the model predictivity.

#### Let's try to analyse which variables are the most important in the model

In [118]:
# rebuild the Random Forest model manually using best parameters to be able to extract additional information from the model
random_forest = RandomForestClassifier(n_estimators=model.best_params_["n_estimators"], 
                           max_features=model.best_params_["max_features"],
                           random_state=seed)
random_forest.fit(x_training, y_training)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=295, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=73, verbose=0,
                       warm_start=False)

In [119]:
importance = random_forest.feature_importances_

In [120]:
importance

array([0.        , 0.00149662, 0.00025045, ..., 0.00751906, 0.01958647,
       0.02424284])

In [121]:
indices = np.argsort(importance)[::-1]

print("Feature ranking:")

# print top 25 features
for i in range(25):
    print("%d.\tfeature_index = %d\t(%f)" % (i + 1, indices[i], importance[indices[i]]))

Feature ranking:
1.	feature_index = 2049	(0.094693)
2.	feature_index = 2058	(0.082216)
3.	feature_index = 2060	(0.043471)
4.	feature_index = 2057	(0.042505)
5.	feature_index = 2051	(0.037312)
6.	feature_index = 2048	(0.031235)
7.	feature_index = 2065	(0.024243)
8.	feature_index = 2055	(0.023781)
9.	feature_index = 2056	(0.022325)
10.	feature_index = 2052	(0.020145)
11.	feature_index = 2061	(0.020015)
12.	feature_index = 2064	(0.019586)
13.	feature_index = 650	(0.018186)
14.	feature_index = 2050	(0.012931)
15.	feature_index = 2059	(0.010798)
16.	feature_index = 2053	(0.008813)
17.	feature_index = 2062	(0.008812)
18.	feature_index = 2054	(0.008456)
19.	feature_index = 2063	(0.007519)
20.	feature_index = 794	(0.007504)
21.	feature_index = 378	(0.006732)
22.	feature_index = 892	(0.005309)
23.	feature_index = 327	(0.005174)
24.	feature_index = 795	(0.005014)
25.	feature_index = 807	(0.004759)


#### Note: Features with numbers 1-2048 are different auto-generated fingerprints
### In the order we defined them, the following feature have the associated indicies

2049 - MolLogP --------------- Rank: 1  
2050 - TPSA(m) --------------- Rank: 4  
2051 - NHOHCount ------------- Rank: 5  
2052 - NOCount --------------- Rank: 10  
2053 - NumHAcceptors --------- Rank: 16  
2054 - NumHDonors ------------ Rank: 18  
2055 - NumRotatableBonds ----- Rank: 8  
2056 - NumHeteroatoms -------- Rank: 9  
2057 - FractionCSP3 ---------- Rank: 4  
