# Train a Random Forest Regressor to predict on missing values and Use predicted values to improve classification accuracy

### Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

%matplotlib inline

### Read data

In [2]:
dataframe = pd.read_csv("./data/FDA_APPROVED.csv")

In [3]:
needed = ["MW Drug", "MW Sol", "CLogP", "HBA", "HBD", "PSDA", "Measured LogP"]
inputs = needed[:6]
output = needed[-1]

In [27]:
def label_class(row):
    if row["Formulation"] == "tablets":
        return 0
    elif row["Formulation"] == "capsules":
        return 1
    elif row["Formulation"] == "solution":
        return 2

In [4]:
dataframe["label"] = dataframe.apply(lambda row: label_class(row), axis=1)

NameError: name 'label_class' is not defined

In [5]:
data = dataframe[needed]

In [6]:
data.describe()

Unnamed: 0,MW Drug,MW Sol,CLogP,HBA,HBD,PSDA,Measured LogP
count,892.0,892.0,889.0,892.0,892.0,890.0,417.0
mean,391.620561,394.963913,2.009618,5.23991,2.473094,9.76964,1.959257
std,314.237643,316.635222,2.648748,5.278831,4.508672,28.333017,2.060949
min,46.07,46.07,-9.69,0.0,0.0,0.88,-8.83
25%,266.3325,266.39,0.47,3.0,1.0,2.9,0.6
50%,336.09,336.915,2.1,4.0,2.0,4.33,2.06
75%,424.4175,426.4925,3.68,6.0,3.0,6.5575,3.23
max,4491.98,4491.98,14.36,67.0,63.0,250.38,7.8


### Drop rows with `nan` values (Missing values)

In [None]:


data = data.dropna()

In [7]:
data = data.dropna()

### Split data into training and testing sets.
#### 20% data will be used as testing set

In [245]:
data

Unnamed: 0,MW Drug,MW Sol,CLogP,HBA,HBD,PSDA,Measured LogP
0,286.34,286.34,0.81,6,3,2.99,1.20
1,645.62,645.62,-6.66,19,14,1.84,-8.83
2,336.43,336.43,1.71,5,3,3.47,1.71
3,151.17,151.17,0.49,2,2,2.73,0.20
4,324.40,324.40,2.25,4,2,3.14,2.44
...,...,...,...,...,...,...,...
883,183.68,183.68,2.85,1,1,6.57,2.60
884,276.74,276.74,2.35,3,2,3.26,2.27
886,135.21,135.21,1.74,1,1,4.83,1.76
890,177.25,177.25,1.67,2,1,7.58,1.50


In [8]:
train, test = train_test_split(data, test_size=0.2, random_state=1010)

In [9]:
x_train = train[inputs]
y_train = train[output]
x_test = test[inputs]
y_test = test[output]

In [10]:
x_train

Unnamed: 0,MW Drug,MW Sol,CLogP,HBA,HBD,PSDA
497,418.45,418.45,4.00,6,1,3.44
565,258.24,258.24,0.53,4,1,2.91
77,314.86,314.86,5.92,2,0,179.92
220,479.54,479.54,5.23,6,1,4.21
807,444.45,480.90,-0.91,9,6,2.44
...,...,...,...,...,...,...
382,285.69,285.69,-0.91,7,3,2.53
38,307.44,307.44,2.32,4,2,5.59
395,527.53,527.53,0.84,11,5,2.63
389,1202.64,1202.64,14.36,12,5,4.15


In [11]:
y_train

497    3.05
565    0.33
77     5.19
220    3.82
807   -1.30
       ... 
382    0.02
38     2.81
395    1.83
389    2.95
69     5.41
Name: Measured LogP, Length: 333, dtype: float64

### Create a random forest regressor

In [12]:
classifier = RandomForestRegressor(n_estimators=100, max_depth=8)

In [13]:
classifier.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=8, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [14]:
predicted = classifier.predict(x_test)

### Evaluate model
#### Calculate coefficient of determination

In [253]:
y_test.values

array([ 2.38,  2.27, -1.78, -1.24, -1.85,  1.19,  0.88,  1.26,  5.25,
        4.02,  4.25,  3.79,  0.87,  4.85,  1.76,  4.7 ,  3.77,  5.2 ,
        6.  ,  3.37,  3.7 ,  2.42,  2.44,  3.23,  1.78,  1.17,  4.29,
        2.41,  3.42, -2.74, -1.85,  4.55,  5.03,  2.05,  2.24,  0.57,
        0.05,  6.3 ,  2.07,  1.92,  2.63,  4.01,  3.97,  5.12,  6.97,
        0.89,  0.98, -0.02, -0.93,  0.2 ,  0.34,  3.78,  1.89,  2.97,
        2.15,  0.39,  3.2 ,  3.81,  0.1 ,  1.95,  1.16,  1.43,  2.45,
        3.06, -0.31,  1.7 ,  4.27,  0.36,  4.16,  0.75,  2.18,  1.3 ,
        4.03,  5.  , -2.09,  3.41,  3.  ,  1.2 , -0.58,  2.48, -1.03,
       -0.24,  2.09,  7.8 ])

In [254]:
predicted

array([ 2.08844442,  2.40802287, -1.66996687, -1.28920937, -0.43741084,
        1.23987536,  1.65318634,  1.52678711,  3.74051803,  2.71802534,
        4.51447385,  3.76903087, -1.10550893,  4.56291136,  1.66961636,
        3.5476084 ,  4.05860173,  3.9920326 ,  5.25328069,  2.69986013,
        3.77002312,  2.44144135,  3.67658609,  2.88173705,  1.8990921 ,
        0.40825202,  4.02111497,  2.40732425,  2.71809819, -2.30764167,
       -2.32050476,  4.77674785,  4.72937669,  1.80685687,  2.5220473 ,
        1.21608442,  0.73855015,  5.68619924,  2.54537057,  0.63288974,
        2.06798406,  3.53705107,  2.9379564 ,  4.14832553,  5.87643258,
        1.04049323,  0.69297491, -0.44610924, -1.0080849 ,  0.56825378,
        0.91679911,  2.91068102,  1.46860112,  2.88471564,  2.06902227,
       -2.32125968,  2.92216215,  3.05908295,  0.37146261,  2.2570165 ,
        0.42674314,  1.80271425,  2.32816147,  1.9841447 , -0.48911763,
        1.44965424,  3.87423665,  0.27815144,  3.32747776,  0.85

In [15]:
classifier.score(x_test, y_test)

0.8808490109025905

#### Calculate mean squared error

In [16]:
se = 0
for pred, real in zip(predicted, y_test.values):
    se += (pred - real) ** 2

mse = se / len(y_test)

print(mse)

0.5505074570075267


# Something that might be useful:

### Label formulation:

In [17]:
def label_class(row):
    if row["Formulation"] == "tablets":
        return 0
    elif row["Formulation"] == "capsules":
        return 1
    elif row["Formulation"] == "solution":
        return 2

In [18]:
dataframe["label"] = dataframe.apply(lambda row: label_class(row), axis=1)

### Trying with Deep Neutral Network

In [19]:
from tensorflow import keras

In [20]:
model = keras.models.Sequential()

model.add(keras.layers.Dense(4,
          input_dim=len(inputs),
          kernel_initializer="glorot_uniform",
          bias_initializer="glorot_uniform",
          activation="relu"))

for i in range(4):
    model.add(keras.layers.Dense(32,
              kernel_initializer="glorot_uniform",
              bias_initializer="glorot_uniform",
              activation="relu"))

model.add(keras.layers.Dense(1,
          kernel_initializer="glorot_uniform",
          bias_initializer="glorot_uniform",
          activation="relu"))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [127]:
x_train

Unnamed: 0,label,MW Drug,MW Sol,CLogP,HBA,HBD,PSDA
429,0.0,404.51,404.51,6.34,2,0,172.13
439,0.0,323.42,323.42,1.09,4,2,3.62
543,0.0,418.58,418.58,4.48,3,1,5.46
454,1.0,300.44,300.44,6.74,2,1,7.36
123,0.0,345.42,345.42,2.57,5,1,4.86
...,...,...,...,...,...,...,...
387,0.0,326.83,326.83,3.71,4,1,12.75
38,0.0,307.44,307.44,2.32,4,2,5.59
401,0.0,328.15,328.15,3.09,2,0,5.87
399,0.0,336.30,336.30,3.66,4,2,3.33


In [128]:
y_train

429    5.78
439    1.36
543    4.68
454    6.30
123    2.23
       ... 
387    3.23
38     2.81
401    1.96
399    2.07
69     5.41
Name: Measured LogP, Length: 330, dtype: float64

In [21]:
optimizer = keras.optimizers.Adam(learning_rate=0.0002)
loss = keras.losses.MeanSquaredError()

In [22]:
model.compile(optimizer=optimizer, loss=loss, metrics=["mse"])

In [23]:
model.fit(x_train.values, y_train, batch_size=10, epochs=500, validation_split=0.1, verbose=2)

Train on 299 samples, validate on 34 samples
Epoch 1/500
299/299 - 0s - loss: 2100.7613 - mean_squared_error: 2100.7612 - val_loss: 1418.3415 - val_mean_squared_error: 1418.3414
Epoch 2/500
299/299 - 0s - loss: 620.2953 - mean_squared_error: 620.2952 - val_loss: 412.4555 - val_mean_squared_error: 412.4556
Epoch 3/500
299/299 - 0s - loss: 166.6615 - mean_squared_error: 166.6615 - val_loss: 96.7045 - val_mean_squared_error: 96.7045
Epoch 4/500
299/299 - 0s - loss: 32.2998 - mean_squared_error: 32.2998 - val_loss: 13.2408 - val_mean_squared_error: 13.2408
Epoch 5/500
299/299 - 0s - loss: 5.9154 - mean_squared_error: 5.9154 - val_loss: 3.2473 - val_mean_squared_error: 3.2473
Epoch 6/500
299/299 - 0s - loss: 3.7992 - mean_squared_error: 3.7992 - val_loss: 3.0046 - val_mean_squared_error: 3.0046
Epoch 7/500
299/299 - 0s - loss: 3.7381 - mean_squared_error: 3.7381 - val_loss: 3.0156 - val_mean_squared_error: 3.0156
Epoch 8/500
299/299 - 0s - loss: 3.7460 - mean_squared_error: 3.7460 - val_los

Epoch 68/500
299/299 - 0s - loss: 3.7328 - mean_squared_error: 3.7328 - val_loss: 2.9439 - val_mean_squared_error: 2.9439
Epoch 69/500
299/299 - 0s - loss: 3.6982 - mean_squared_error: 3.6982 - val_loss: 3.0545 - val_mean_squared_error: 3.0545
Epoch 70/500
299/299 - 0s - loss: 3.7326 - mean_squared_error: 3.7326 - val_loss: 2.9600 - val_mean_squared_error: 2.9600
Epoch 71/500
299/299 - 0s - loss: 3.7219 - mean_squared_error: 3.7219 - val_loss: 2.9523 - val_mean_squared_error: 2.9523
Epoch 72/500
299/299 - 0s - loss: 3.7189 - mean_squared_error: 3.7189 - val_loss: 2.9497 - val_mean_squared_error: 2.9497
Epoch 73/500
299/299 - 0s - loss: 3.7392 - mean_squared_error: 3.7392 - val_loss: 2.9814 - val_mean_squared_error: 2.9814
Epoch 74/500
299/299 - 0s - loss: 3.7461 - mean_squared_error: 3.7461 - val_loss: 3.0403 - val_mean_squared_error: 3.0403
Epoch 75/500
299/299 - 0s - loss: 3.7362 - mean_squared_error: 3.7362 - val_loss: 3.1535 - val_mean_squared_error: 3.1535
Epoch 76/500
299/299 - 0

Epoch 135/500
299/299 - 0s - loss: 3.6533 - mean_squared_error: 3.6533 - val_loss: 3.5254 - val_mean_squared_error: 3.5254
Epoch 136/500
299/299 - 0s - loss: 3.8700 - mean_squared_error: 3.8700 - val_loss: 3.0045 - val_mean_squared_error: 3.0045
Epoch 137/500
299/299 - 0s - loss: 3.7876 - mean_squared_error: 3.7876 - val_loss: 3.3319 - val_mean_squared_error: 3.3319
Epoch 138/500
299/299 - 0s - loss: 3.7716 - mean_squared_error: 3.7716 - val_loss: 3.0424 - val_mean_squared_error: 3.0424
Epoch 139/500
299/299 - 0s - loss: 3.6865 - mean_squared_error: 3.6865 - val_loss: 2.8565 - val_mean_squared_error: 2.8565
Epoch 140/500
299/299 - 0s - loss: 3.6741 - mean_squared_error: 3.6741 - val_loss: 2.8362 - val_mean_squared_error: 2.8362
Epoch 141/500
299/299 - 0s - loss: 3.6135 - mean_squared_error: 3.6135 - val_loss: 2.8631 - val_mean_squared_error: 2.8631
Epoch 142/500
299/299 - 0s - loss: 3.6374 - mean_squared_error: 3.6374 - val_loss: 3.0266 - val_mean_squared_error: 3.0266
Epoch 143/500
29

Epoch 202/500
299/299 - 0s - loss: 3.5823 - mean_squared_error: 3.5823 - val_loss: 2.8078 - val_mean_squared_error: 2.8078
Epoch 203/500
299/299 - 0s - loss: 3.4405 - mean_squared_error: 3.4405 - val_loss: 2.6505 - val_mean_squared_error: 2.6505
Epoch 204/500
299/299 - 0s - loss: 3.4721 - mean_squared_error: 3.4721 - val_loss: 3.0272 - val_mean_squared_error: 3.0272
Epoch 205/500
299/299 - 0s - loss: 3.5520 - mean_squared_error: 3.5520 - val_loss: 2.6348 - val_mean_squared_error: 2.6348
Epoch 206/500
299/299 - 0s - loss: 3.4661 - mean_squared_error: 3.4661 - val_loss: 2.6066 - val_mean_squared_error: 2.6066
Epoch 207/500
299/299 - 0s - loss: 3.5323 - mean_squared_error: 3.5323 - val_loss: 2.6419 - val_mean_squared_error: 2.6419
Epoch 208/500
299/299 - 0s - loss: 3.4563 - mean_squared_error: 3.4563 - val_loss: 2.6476 - val_mean_squared_error: 2.6476
Epoch 209/500
299/299 - 0s - loss: 3.5587 - mean_squared_error: 3.5587 - val_loss: 3.2407 - val_mean_squared_error: 3.2407
Epoch 210/500
29

Epoch 269/500
299/299 - 0s - loss: 2.8134 - mean_squared_error: 2.8134 - val_loss: 2.1268 - val_mean_squared_error: 2.1268
Epoch 270/500
299/299 - 0s - loss: 2.7987 - mean_squared_error: 2.7987 - val_loss: 2.2772 - val_mean_squared_error: 2.2772
Epoch 271/500
299/299 - 0s - loss: 2.8309 - mean_squared_error: 2.8309 - val_loss: 2.1131 - val_mean_squared_error: 2.1131
Epoch 272/500
299/299 - 0s - loss: 2.7230 - mean_squared_error: 2.7230 - val_loss: 2.0866 - val_mean_squared_error: 2.0866
Epoch 273/500
299/299 - 0s - loss: 2.5545 - mean_squared_error: 2.5545 - val_loss: 2.6871 - val_mean_squared_error: 2.6871
Epoch 274/500
299/299 - 0s - loss: 2.6993 - mean_squared_error: 2.6993 - val_loss: 2.0271 - val_mean_squared_error: 2.0271
Epoch 275/500
299/299 - 0s - loss: 2.6839 - mean_squared_error: 2.6839 - val_loss: 2.1438 - val_mean_squared_error: 2.1438
Epoch 276/500
299/299 - 0s - loss: 2.6217 - mean_squared_error: 2.6217 - val_loss: 2.1899 - val_mean_squared_error: 2.1899
Epoch 277/500
29

Epoch 336/500
299/299 - 0s - loss: 1.9417 - mean_squared_error: 1.9417 - val_loss: 2.1139 - val_mean_squared_error: 2.1139
Epoch 337/500
299/299 - 0s - loss: 1.9957 - mean_squared_error: 1.9957 - val_loss: 1.7098 - val_mean_squared_error: 1.7098
Epoch 338/500
299/299 - 0s - loss: 1.9387 - mean_squared_error: 1.9387 - val_loss: 2.0986 - val_mean_squared_error: 2.0986
Epoch 339/500
299/299 - 0s - loss: 2.0096 - mean_squared_error: 2.0096 - val_loss: 1.9163 - val_mean_squared_error: 1.9163
Epoch 340/500
299/299 - 0s - loss: 1.8546 - mean_squared_error: 1.8546 - val_loss: 1.9360 - val_mean_squared_error: 1.9360
Epoch 341/500
299/299 - 0s - loss: 1.8310 - mean_squared_error: 1.8310 - val_loss: 2.1172 - val_mean_squared_error: 2.1172
Epoch 342/500
299/299 - 0s - loss: 1.9514 - mean_squared_error: 1.9514 - val_loss: 1.7250 - val_mean_squared_error: 1.7250
Epoch 343/500
299/299 - 0s - loss: 1.9511 - mean_squared_error: 1.9511 - val_loss: 2.0470 - val_mean_squared_error: 2.0470
Epoch 344/500
29

Epoch 403/500
299/299 - 0s - loss: 1.6405 - mean_squared_error: 1.6405 - val_loss: 1.7668 - val_mean_squared_error: 1.7668
Epoch 404/500
299/299 - 0s - loss: 1.6184 - mean_squared_error: 1.6184 - val_loss: 1.8892 - val_mean_squared_error: 1.8892
Epoch 405/500
299/299 - 0s - loss: 1.5766 - mean_squared_error: 1.5766 - val_loss: 1.6797 - val_mean_squared_error: 1.6797
Epoch 406/500
299/299 - 0s - loss: 1.6168 - mean_squared_error: 1.6168 - val_loss: 2.1755 - val_mean_squared_error: 2.1755
Epoch 407/500
299/299 - 0s - loss: 1.6728 - mean_squared_error: 1.6728 - val_loss: 2.5928 - val_mean_squared_error: 2.5928
Epoch 408/500
299/299 - 0s - loss: 1.5990 - mean_squared_error: 1.5990 - val_loss: 1.8097 - val_mean_squared_error: 1.8097
Epoch 409/500
299/299 - 0s - loss: 1.6183 - mean_squared_error: 1.6183 - val_loss: 1.6318 - val_mean_squared_error: 1.6318
Epoch 410/500
299/299 - 0s - loss: 1.6309 - mean_squared_error: 1.6309 - val_loss: 2.4705 - val_mean_squared_error: 2.4705
Epoch 411/500
29

Epoch 470/500
299/299 - 0s - loss: 1.7109 - mean_squared_error: 1.7109 - val_loss: 2.2042 - val_mean_squared_error: 2.2042
Epoch 471/500
299/299 - 0s - loss: 1.6548 - mean_squared_error: 1.6548 - val_loss: 2.4000 - val_mean_squared_error: 2.4000
Epoch 472/500
299/299 - 0s - loss: 1.6771 - mean_squared_error: 1.6771 - val_loss: 2.4736 - val_mean_squared_error: 2.4736
Epoch 473/500
299/299 - 0s - loss: 1.7698 - mean_squared_error: 1.7698 - val_loss: 2.6811 - val_mean_squared_error: 2.6811
Epoch 474/500
299/299 - 0s - loss: 1.6545 - mean_squared_error: 1.6545 - val_loss: 2.5840 - val_mean_squared_error: 2.5840
Epoch 475/500
299/299 - 0s - loss: 1.6364 - mean_squared_error: 1.6364 - val_loss: 2.0993 - val_mean_squared_error: 2.0993
Epoch 476/500
299/299 - 0s - loss: 1.7678 - mean_squared_error: 1.7678 - val_loss: 2.1522 - val_mean_squared_error: 2.1522
Epoch 477/500
299/299 - 0s - loss: 1.6703 - mean_squared_error: 1.6703 - val_loss: 2.0850 - val_mean_squared_error: 2.0850
Epoch 478/500
29

<tensorflow.python.keras.callbacks.History at 0x7f3ca3df9750>

In [24]:
model.evaluate(x_test, y_test)



[1.8643450793765841, 1.864345]