# Train a Random Forest Regressor to predict on missing values and Use predicted values to improve classification accuracy

### Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
dataframe = pd.read_csv("./data/FDA_APPROVED.csv")

In [3]:
def label_class(row):
    if row["Formulation"] == "tablets":
        return 0
    elif row["Formulation"] == "capsules":
        return 1
    elif row["Formulation"] == "solution":
        return 2

In [4]:
dataframe["label"] = dataframe.apply(lambda row: label_class(row), axis=1)

In [5]:
needed = ["% Excreted Unchanged in Urine", "Maximum Strength Dose Value", "MW Drug", "MW Sol", "CLogP", "HBA", "HBD", "PSDA", "label"]
inputs = needed[:7]
output = needed[-1]

In [6]:
data = dataframe[needed]

In [7]:
data.describe()

Unnamed: 0,% Excreted Unchanged in Urine,Maximum Strength Dose Value,MW Drug,MW Sol,CLogP,HBA,HBD,PSDA,label
count,800.0,862.0,892.0,892.0,889.0,892.0,892.0,890.0,862.0
mean,23.648456,155.339907,391.620561,394.963913,2.009618,5.23991,2.473094,9.76964,0.522042
std,30.43448,243.924312,314.237643,316.635222,2.648748,5.278831,4.508672,28.333017,0.782415
min,0.0,0.0,46.07,46.07,-9.69,0.0,0.0,0.88,0.0
25%,0.5,10.0,266.3325,266.39,0.47,3.0,1.0,2.9,0.0
50%,6.25,50.0,336.09,336.915,2.1,4.0,2.0,4.33,0.0
75%,45.0,200.0,424.4175,426.4925,3.68,6.0,3.0,6.5575,1.0
max,100.0,3000.0,4491.98,4491.98,14.36,67.0,63.0,250.38,2.0


In [8]:
data = data.dropna()

In [9]:
data

Unnamed: 0,% Excreted Unchanged in Urine,Maximum Strength Dose Value,MW Drug,MW Sol,CLogP,HBA,HBD,PSDA,label
0,1.2,300.0,286.34,286.34,0.81,6,3,2.99,0.0
1,1.0,100.0,645.62,645.62,-6.66,19,14,1.84,0.0
2,10.0,400.0,336.43,336.43,1.71,5,3,3.47,1.0
3,3.0,1000.0,151.17,151.17,0.49,2,2,2.73,0.0
5,1.4,500.0,180.16,180.16,1.02,3,1,2.64,0.0
...,...,...,...,...,...,...,...,...,...
887,35.0,100.0,199.30,199.30,1.62,2,0,9.14,0.0
888,50.0,3.0,167.30,167.30,2.83,1,1,11.48,0.0
889,40.0,5.0,149.24,149.24,1.89,1,1,10.24,0.0
890,19.0,25.0,177.25,177.25,1.67,2,1,7.58,0.0


In [28]:
train, test = train_test_split(data, test_size=0.2, random_state=1010)

In [29]:
x_train = train[inputs]
y_train = train[output]
x_test = test[inputs]
y_test = test[output]

In [30]:
classifier = RandomForestClassifier(n_estimators=100, max_depth=8)

In [31]:
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
classifier.score(x_test, y_test)

0.7142857142857143

# Trying with Deep Neutral Network

In [16]:
from tensorflow import keras

In [17]:
model = keras.models.Sequential()

In [18]:
x_train

Unnamed: 0,% Excreted Unchanged in Urine,Maximum Strength Dose Value,MW Drug,MW Sol,CLogP,HBA,HBD
192,13.0,100.0,247.34,247.34,2.23,2,0
604,2.0,12.0,424.54,424.54,2.16,5,3
535,1.5,2.0,452.60,452.60,5.30,5,2
125,5.0,3.0,388.82,388.82,1.25,6,0
394,0.1,70.0,488.02,488.02,2.88,8,3
...,...,...,...,...,...,...,...
600,45.0,10.0,333.24,333.24,-1.98,8,3
21,10.0,33.0,393.47,393.47,4.69,5,2
242,2.0,40.0,329.37,329.37,4.24,4,1
829,75.0,800.0,225.21,225.21,-2.42,6,3


In [19]:
model.add(keras.layers.Dense(4,
          input_dim=len(inputs),
          kernel_initializer="glorot_uniform",
          bias_initializer="glorot_uniform",
          activation="relu"))

for i in range(2):
    model.add(keras.layers.Dense(8,
              kernel_initializer="glorot_uniform",
              bias_initializer="glorot_uniform",
              activation="relu"))

model.add(keras.layers.Dense(3,
          kernel_initializer="glorot_uniform",
          bias_initializer="glorot_uniform",
          activation="relu"))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [20]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])

In [21]:
model.fit(x_train.values, y_train, batch_size=20, epochs=100)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 613 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/1

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f13437f2990>

In [22]:
model.evaluate(x_test, y_test)



[1.1795792718986413, 0.5584416]