# Train a Random Forest Regressor to predict on missing values and Use predicted values to improve classification accuracy

### Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

%matplotlib inline

### Read data

In [2]:
dataframe = pd.read_csv("./data/FDA_APPROVED.csv")

In [3]:
needed = ["MW Drug", "MW Sol", "CLogP", "HBA", "HBD", "PSA", "PSDA", "Measured LogP"]
inputs = needed[:-1]
output = needed[-1]

In [4]:
data = dataframe[needed]

In [5]:
data.describe()

Unnamed: 0,MW Drug,MW Sol,CLogP,HBA,HBD,PSA,PSDA,Measured LogP
count,892.0,892.0,889.0,892.0,892.0,892.0,890.0,417.0
mean,391.620561,394.963913,2.009618,5.23991,2.473094,102.422836,9.76964,1.959257
std,314.237643,316.635222,2.648748,5.278831,4.508672,145.248942,28.333017,2.060949
min,46.07,46.07,-9.69,0.0,0.0,0.0,0.88,-8.83
25%,266.3325,266.39,0.47,3.0,1.0,47.41,2.9,0.6
50%,336.09,336.915,2.1,4.0,2.0,75.525,4.33,2.06
75%,424.4175,426.4925,3.68,6.0,3.0,115.455,6.5575,3.23
max,4491.98,4491.98,14.36,67.0,63.0,2095.89,250.38,7.8


### Drop rows with `nan` values (Missing values)

In [6]:
clean_data = data.dropna()

### Split data into training and testing sets.
#### 20% data will be used as testing set

In [7]:
train, test = train_test_split(clean_data, test_size=0.2, random_state=1010)

In [8]:
x_train = train[inputs]
y_train = train[output]
x_test = test[inputs]
y_test = test[output]

In [9]:
x_train

Unnamed: 0,MW Drug,MW Sol,CLogP,HBA,HBD,PSA,PSDA
497,418.45,418.45,4.00,6,1,121.64,3.44
565,258.24,258.24,0.53,4,1,88.83,2.91
77,314.86,314.86,5.92,2,0,1.75,179.92
220,479.54,479.54,5.23,6,1,114.03,4.21
807,444.45,480.90,-0.91,9,6,197.07,2.44
...,...,...,...,...,...,...,...
382,285.69,285.69,-0.91,7,3,112.88,2.53
38,307.44,307.44,2.32,4,2,55.03,5.59
395,527.53,527.53,0.84,11,5,200.34,2.63
389,1202.64,1202.64,14.36,12,5,290.07,4.15


In [10]:
y_train

497    3.05
565    0.33
77     5.19
220    3.82
807   -1.30
       ... 
382    0.02
38     2.81
395    1.83
389    2.95
69     5.41
Name: Measured LogP, Length: 333, dtype: float64

### Create a random forest regressor

In [11]:
regressor = RandomForestRegressor(n_estimators=100, max_depth=8)

In [12]:
regressor.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=8, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [13]:
predicted = regressor.predict(x_test)

### Evaluate model
#### Calculate coefficient of determination

In [14]:
y_test.to_numpy()

array([ 2.38,  2.27, -1.78, -1.24, -1.85,  1.19,  0.88,  1.26,  5.25,
        4.02,  4.25,  3.79,  0.87,  4.85,  1.76,  4.7 ,  3.77,  5.2 ,
        6.  ,  3.37,  3.7 ,  2.42,  2.44,  3.23,  1.78,  1.17,  4.29,
        2.41,  3.42, -2.74, -1.85,  4.55,  5.03,  2.05,  2.24,  0.57,
        0.05,  6.3 ,  2.07,  1.92,  2.63,  4.01,  3.97,  5.12,  6.97,
        0.89,  0.98, -0.02, -0.93,  0.2 ,  0.34,  3.78,  1.89,  2.97,
        2.15,  0.39,  3.2 ,  3.81,  0.1 ,  1.95,  1.16,  1.43,  2.45,
        3.06, -0.31,  1.7 ,  4.27,  0.36,  4.16,  0.75,  2.18,  1.3 ,
        4.03,  5.  , -2.09,  3.41,  3.  ,  1.2 , -0.58,  2.48, -1.03,
       -0.24,  2.09,  7.8 ])

In [15]:
predicted

array([ 2.04455457,  2.39309013, -1.51415887, -1.21736349, -0.46126221,
        1.22757276,  1.63961259,  1.43230511,  3.84037348,  2.70117847,
        4.44609559,  3.79932376, -1.08816812,  4.50314178,  1.68958659,
        3.47413851,  4.09971321,  4.02832147,  4.92971286,  2.73166887,
        3.6823512 ,  2.59980375,  3.77972539,  2.99833805,  1.88913777,
        0.3617189 ,  4.05954059,  2.38157259,  2.6896814 , -2.42614167,
       -2.3905525 ,  4.77747509,  4.7864904 ,  1.83460856,  2.4594724 ,
        1.24972213,  0.70391444,  5.41453238,  2.62294457,  0.67723721,
        2.06658183,  3.70964965,  2.9332547 ,  4.35364513,  5.45529738,
        1.05306213,  0.72563073, -0.2769038 , -1.02678487,  0.57463049,
        0.96339704,  2.87636475,  1.42932166,  2.89272272,  2.06979061,
       -2.37856258,  2.88844476,  3.07672765,  0.45179206,  2.2364587 ,
        0.64537841,  1.82530956,  2.35146982,  1.96344017, -0.60620004,
        1.41868687,  4.0531683 ,  0.45726743,  3.47197592,  0.85

In [16]:
regressor.score(x_test, y_test)

0.8762064893625343

#### Calculate mean squared error

In [17]:
squared_errors = 0
for pred, real in zip(predicted, y_test.to_numpy()):
    squared_errors += (pred - real) ** 2

mse = squared_errors / len(y_test)

print(mse)

0.5719570710348988


### Predict on missing value

In [18]:
inputs_clean_data = data.dropna(subset=inputs)
inputs_clean_data

Unnamed: 0,MW Drug,MW Sol,CLogP,HBA,HBD,PSA,PSDA,Measured LogP
0,286.34,286.34,0.81,6,3,95.80,2.99,1.20
1,645.62,645.62,-6.66,19,14,351.80,1.84,-8.83
2,336.43,336.43,1.71,5,3,97.05,3.47,1.71
3,151.17,151.17,0.49,2,2,55.41,2.73,0.20
4,324.40,324.40,2.25,4,2,103.22,3.14,2.44
...,...,...,...,...,...,...,...,...
887,199.30,199.30,1.62,2,0,21.80,9.14,
888,167.30,167.30,2.83,1,1,14.57,11.48,
889,149.24,149.24,1.89,1,1,14.57,10.24,
890,177.25,177.25,1.67,2,1,23.37,7.58,1.50


In [7]:
filled_data = pd.DataFrame(columns=list(inputs_clean_data))
for index, row in dataframe.iterrows():
    if pd.isna(row[output]):
        if True not in pd.isna(row[inputs].to_numpy()):
            row[output] = regressor.predict(row[inputs].to_numpy().reshape(1, -1))[0]
    else:
        pass
    
    filled_data = filled_data.append(row)
    
filled_data

NameError: name 'inputs_clean_data' is not defined

### Use predicted missing values to train a classifier

In [20]:
needed = ["CLogP", "HBA", "HBD", "PSA", "Measured LogP", "% Excreted Unchanged in Urine", "Formulation"]
inputs = needed[:-1]
output = needed[-1]

In [21]:
data = filled_data[needed]
data

Unnamed: 0,CLogP,HBA,HBD,PSA,Measured LogP,% Excreted Unchanged in Urine,Formulation
0,0.81,6,3,95.80,1.200000,1.2,tablets
1,-6.66,19,14,351.80,-8.830000,1.0,tablets
2,1.71,5,3,97.05,1.710000,10.0,capsules
3,0.49,2,2,55.41,0.200000,3.0,tablets
4,2.25,4,2,103.22,2.440000,,tablets
...,...,...,...,...,...,...,...
887,1.62,2,0,21.80,1.091354,35.0,tablets
888,2.83,1,1,14.57,2.968389,50.0,tablets
889,1.89,1,1,14.57,1.998022,40.0,tablets
890,1.67,2,1,23.37,1.500000,19.0,tablets


In [22]:
clean_data = data.dropna()
clean_data

Unnamed: 0,CLogP,HBA,HBD,PSA,Measured LogP,% Excreted Unchanged in Urine,Formulation
0,0.81,6,3,95.80,1.200000,1.2,tablets
1,-6.66,19,14,351.80,-8.830000,1.0,tablets
2,1.71,5,3,97.05,1.710000,10.0,capsules
3,0.49,2,2,55.41,0.200000,3.0,tablets
5,1.02,3,1,68.21,1.190000,1.4,tablets
...,...,...,...,...,...,...,...
887,1.62,2,0,21.80,1.091354,35.0,tablets
888,2.83,1,1,14.57,2.968389,50.0,tablets
889,1.89,1,1,14.57,1.998022,40.0,tablets
890,1.67,2,1,23.37,1.500000,19.0,tablets


In [23]:
train, test = train_test_split(clean_data, test_size=0.2, random_state=1010)

In [24]:
x_train = train[inputs]
y_train = train[output]
x_test = test[inputs]
y_test = test[output]

In [25]:
x_train

Unnamed: 0,CLogP,HBA,HBD,PSA,Measured LogP,% Excreted Unchanged in Urine
192,2.23,2,0,28.24,2.720000,13.0
604,2.16,5,3,97.72,2.317107,2.0
535,5.30,5,2,83.65,4.188949,1.5
125,1.25,6,0,79.29,1.409485,5.0
394,2.88,8,3,101.85,3.000759,0.1
...,...,...,...,...,...,...
600,-1.98,8,3,149.47,-0.885588,45.0
21,4.69,5,2,85.11,4.139123,10.0
242,4.24,4,1,41.89,3.850960,2.0
829,-2.42,6,3,112.32,-1.560000,75.0


In [26]:
y_train

192     tablets
604    capsules
535     tablets
125     tablets
394     tablets
         ...   
600     tablets
21     solution
242     tablets
829     tablets
649     tablets
Name: Formulation, Length: 613, dtype: object

In [27]:
classifier = RandomForestClassifier(max_depth=8)

In [28]:
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [29]:
classifier.score(x_test, y_test)

0.6623376623376623

↑↑↑ :(

In [30]:
from sklearn.metrics import classification_report

In [31]:
print(classification_report(y_test, classifier.predict(x_test)))

              precision    recall  f1-score   support

    capsules       0.00      0.00      0.00        21
    solution       0.57      0.37      0.45        35
     tablets       0.68      0.91      0.78        98

    accuracy                           0.66       154
   macro avg       0.41      0.43      0.41       154
weighted avg       0.56      0.66      0.60       154



  _warn_prf(average, modifier, msg_start, len(result))


# Something that might be useful:

### Label formulation:

In [32]:
def label_class(row):
    if row["Formulation"] == "tablets":
        return 0
    elif row["Formulation"] == "capsules":
        return 1
    elif row["Formulation"] == "solution":
        return 2

In [33]:
dataframe["label"] = dataframe.apply(lambda row: label_class(row), axis=1)

### Random

In [1]:
import pandas as pd
import numpy as np
import predict_missing_value

In [2]:
dataframe = pd.read_csv("./data/FDA_APPROVED.csv")

In [3]:
needed = ["MW Drug", "MW Sol", "CLogP", "HBA", "HBD", "PSA", "PSDA", "Measured LogP"]
inputs = needed[:-1]
output = needed[-1]

In [4]:
a = predict_missing_value.fill_missing_value(dataframe, needed, inputs, output)

Coefficient of determination on testing set: 0.85
Mean squared error on testing set: 0.52


In [5]:
a

Unnamed: 0,MW Drug,MW Sol,CLogP,HBA,HBD,PSA,PSDA,Measured LogP,% Excreted Unchanged in Urine,ALOGPS 2.1 solubility,...,Maximum Strength Dose Value,Measured LogD74,Measured LogS (molar),Measured Solubility (mg/mL),Ro5,Route,cDose Number (ALOGPS based),cDose Number (minVSLgS based),minVSLgS 3-7.5,pDose
0,286.34,286.34,0.81,6,3,95.80,2.99,1.200000,1.2,-2.37,...,300.0,1.20,-0.57,77.00,0.0,oral,1.000,4.100000e+00,-3.07,2.98
1,645.62,645.62,-6.66,19,14,351.80,1.84,-8.830000,1.0,-0.64,...,100.0,,,,3.0,oral,0.003,3.000000e-07,3.30,3.81
2,336.43,336.43,1.71,5,3,97.05,3.47,1.710000,10.0,-3.27,...,400.0,0.19,,,0.0,oral,8.900,1.000000e-01,-1.46,2.92
3,151.17,151.17,0.49,2,2,55.41,2.73,0.200000,3.0,-1.55,...,1000.0,0.40,-0.80,23.70,0.0,oral,0.900,1.200000e+00,-1.66,2.18
4,324.40,324.40,2.25,4,2,103.22,3.14,2.440000,,-3.85,...,500.0,-0.36,-1.98,3.43,0.0,oral,44.000,8.200000e+00,-3.13,2.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,199.30,199.30,1.62,2,0,21.80,9.14,1.489529,35.0,0.08,...,100.0,1.08,-0.50,63.70,0.0,oral,0.002,1.000000e-02,-0.69,3.30
888,167.30,167.30,2.83,1,1,14.57,11.48,2.910660,50.0,-3.12,...,3.0,,0.10,212.00,0.0,oral,0.080,1.000000e-06,1.77,4.83
889,149.24,149.24,1.89,1,1,14.57,10.24,2.437823,40.0,-1.80,...,5.0,,0.83,1000.00,0.0,oral,0.008,1.000000e-05,0.99,4.47
890,177.25,177.25,1.67,2,1,23.37,7.58,1.500000,19.0,-1.87,...,25.0,,-1.85,2.50,0.0,oral,0.040,9.000000e-04,-0.20,3.85


In [6]:
a[output].isna().to_numpy()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False,

In [7]:
True in a[output].isna().to_numpy()

True