In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',100)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn import neighbors
import xgboost as xg
from sklearn.ensemble import GradientBoostingRegressor

from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt

# Hyperparameter tuner and Cross Validation
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV
import math

# Figure resolution settings for on-screen rendering and saved files.
#sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
# NOTE(review): this call runs last and passes 'savefig.dpi': 800, so it presumably
# supersedes the 300 assigned just above -- confirm which save resolution is intended.
sns.set(rc={"figure.dpi":300, 'savefig.dpi':800})

In [10]:
# Load the dataset; the path is relative, so "df1.csv" must sit in the working directory.
df = pd.read_csv("df1.csv")

In [12]:
# Feature matrix: every column of df except drug_perm_per and drug_perm_amt.
X = df.drop(columns=['drug_perm_per', 'drug_perm_amt'])
# Target stays a one-column DataFrame because later cells index it as y['drug_perm_amt'].
y = df.loc[:, ["drug_perm_amt"]]

In [14]:
# Feature names as a plain list, in the same order as X's columns.
columns = X.columns.tolist()
columns

['drug_name',
 'drug_load',
 'drug_mw',
 'mn_length',
 'skin_type',
 'mn_type',
 'mn_surface_area',
 'permeation_time']

# Voting Regressor - Standard Scaling

In [24]:
# Repeated 10-fold cross-validation (3 repeats) of the voting ensemble, with a
# StandardScaler applied to the columns listed in `nmr`.
scaler = StandardScaler()
kf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Columns to standardise; every other column in `columns` is passed through unchanged.
# Defined once here because it does not vary between folds.
nmr = ['drug_load', 'drug_mw', 'mn_length', 'mn_surface_area', 'permeation_time']
scaled_cols = [c for c in columns if c in nmr]

# Per-fold metrics, averaged after the loop.
r2 = []
rmse = []
mae = []

for n, (trn_idx, test_idx) in enumerate(kf.split(df[columns], y['drug_perm_amt']), start=1):
    print(f"fold: {n}")
    # Explicit copies: the scaled values are written into these frames, and they
    # must not be views of df (avoids chained-assignment problems).
    X_tr, X_tst = df[columns].iloc[trn_idx].copy(), df[columns].iloc[test_idx].copy()
    y_tr, y_tst = y['drug_perm_amt'].iloc[trn_idx], y['drug_perm_amt'].iloc[test_idx]

    model_1 = xg.XGBRegressor(max_depth = 3, eta = 0.1, verbosity=0)
    model_2 = RandomForestRegressor(n_estimators=50, random_state=1)
    model_4 = GradientBoostingRegressor()
    model_5 = neighbors.KNeighborsRegressor(n_neighbors = 1)
    final_model = VotingRegressor(estimators=[('xgb', model_1), ('rf', model_2), ('gbr', model_4), ('knn',model_5)])

    # Fit the scaler on the training fold only and reuse those statistics on the
    # held-out fold. StandardScaler works per feature, so one call over all the
    # selected columns gives the same values as scaling them one at a time.
    if scaled_cols:
        X_tr[scaled_cols] = scaler.fit_transform(X_tr[scaled_cols])
        X_tst[scaled_cols] = scaler.transform(X_tst[scaled_cols])

    final_model.fit(X_tr, y_tr)
    # Predict once per fold and reuse the result for every metric.
    y_pred = final_model.predict(X_tst)

    r2_fold = r2_score(y_tst, y_pred)
    print("R2: ", r2_fold)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in recent
    # scikit-learn releases, while this form works on old and new versions alike.
    rmse1 = np.sqrt(mean_squared_error(y_tst, y_pred))
    print(": model's RMSE = {}".format(rmse1))
    rmse.append(rmse1)
    mae1 = mean_absolute_error(y_tst, y_pred)
    print(": model's MAE = {}".format(mae1))
    mae.append(mae1)
    r2.append(r2_fold)
    print("------------------------------------------------------------------------------------------------------")

print("R2: {}, RMSE: {}, MAE: {}".format(np.mean(r2), np.mean(rmse), np.mean(mae)))

fold: 1
R2:  0.9623490741210831
: model's RMSE = 431.18838586141646
: model's MAE = 215.99980980349665
------------------------------------------------------------------------------------------------------
fold: 2
R2:  0.9771974437735526
: model's RMSE = 1038.0315303513114
: model's MAE = 527.4616204016421
------------------------------------------------------------------------------------------------------
fold: 3
R2:  0.9089118346552614
: model's RMSE = 571.3191779689449
: model's MAE = 287.54671922728664
------------------------------------------------------------------------------------------------------
fold: 4
R2:  0.9833756801040711
: model's RMSE = 410.6884709688192
: model's MAE = 207.8838550348673
------------------------------------------------------------------------------------------------------
fold: 5
R2:  0.9901063604221471
: model's RMSE = 634.1694156990599
: model's MAE = 281.89430392128645
------------------------------------------------------------------------------

# Voting Regressor - MinMax Scaling

In [25]:
# Repeated 10-fold cross-validation (3 repeats) of the voting ensemble, with a
# MinMaxScaler applied to every column in `columns`.
# NOTE(review): every column is min-max scaled here, whereas the StandardScaler
# cell only scales its `nmr` subset -- confirm this difference is intentional.
scaler = MinMaxScaler()
kf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Per-fold metrics, averaged after the loop.
r2 = []
rmse = []
mae = []

for n, (trn_idx, test_idx) in enumerate(kf.split(df[columns], y['drug_perm_amt']), start=1):
    print(f"fold: {n}")
    # Explicit copies: the scaled values are written into these frames, and they
    # must not be views of df (avoids chained-assignment problems).
    X_tr, X_tst = df[columns].iloc[trn_idx].copy(), df[columns].iloc[test_idx].copy()
    y_tr, y_tst = y['drug_perm_amt'].iloc[trn_idx], y['drug_perm_amt'].iloc[test_idx]

    model_1 = xg.XGBRegressor(max_depth = 3, eta = 0.1, verbosity=0)
    model_2 = RandomForestRegressor(n_estimators=50, random_state=1)
    model_4 = GradientBoostingRegressor()
    model_5 = neighbors.KNeighborsRegressor(n_neighbors = 1)
    final_model = VotingRegressor(estimators=[('xgb', model_1), ('rf', model_2), ('gbr', model_4), ('knn',model_5)])

    # Fit the scaler on the training fold only and reuse its min/max on the
    # held-out fold. MinMaxScaler works per feature, so one call over all columns
    # gives the same values as scaling them one at a time.
    X_tr[columns] = scaler.fit_transform(X_tr[columns])
    X_tst[columns] = scaler.transform(X_tst[columns])

    final_model.fit(X_tr, y_tr)
    # Predict once per fold and reuse the result for every metric.
    y_pred = final_model.predict(X_tst)

    r2_fold = r2_score(y_tst, y_pred)
    print("R2: ", r2_fold)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in recent
    # scikit-learn releases, while this form works on old and new versions alike.
    rmse1 = np.sqrt(mean_squared_error(y_tst, y_pred))
    print(": model's RMSE = {}".format(rmse1))
    rmse.append(rmse1)
    mae1 = mean_absolute_error(y_tst, y_pred)
    print(": model's MAE = {}".format(mae1))
    mae.append(mae1)
    r2.append(r2_fold)
    print("------------------------------------------------------------------------------------------------------")

print("R2: {}, RMSE: {}, MAE: {}".format(np.mean(r2), np.mean(rmse), np.mean(mae)))

fold: 1
R2:  0.9606372378773327
: model's RMSE = 440.8816335336052
: model's MAE = 213.7692439932661
------------------------------------------------------------------------------------------------------
fold: 2
R2:  0.9778107260552238
: model's RMSE = 1023.9772874512579
: model's MAE = 511.867171806835
------------------------------------------------------------------------------------------------------
fold: 3
R2:  0.9055090322799388
: model's RMSE = 581.8927867625594
: model's MAE = 296.4991815470548
------------------------------------------------------------------------------------------------------
fold: 4
R2:  0.9834084677932576
: model's RMSE = 410.2832762901756
: model's MAE = 208.39149533990746
------------------------------------------------------------------------------------------------------
fold: 5
R2:  0.9891408717375064
: model's RMSE = 664.3925167037511
: model's MAE = 324.88641190291014
---------------------------------------------------------------------------------

# Voting Regressor - No Scaling

In [6]:
# Repeated 10-fold cross-validation (3 repeats) of the voting ensemble built from
# default-parameter base models, on unscaled features.
kf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
columns = ['drug_name', 'drug_load', 'drug_mw', 'mn_surface_area', 'mn_length', 'skin_type', 'mn_type', 'permeation_time']

# Per-fold metrics, averaged after the loop.
r2 = []
rmse = []
mae = []
mse = []

for n, (trn_idx, test_idx) in enumerate(kf.split(df[columns], y['drug_perm_amt']), start=1):
    print(f"fold: {n}")
    X_tr, X_tst = df[columns].iloc[trn_idx], df[columns].iloc[test_idx]
    y_tr, y_tst = y['drug_perm_amt'].iloc[trn_idx], y['drug_perm_amt'].iloc[test_idx]

    model_1 = xg.XGBRegressor()
    model_2 = RandomForestRegressor(random_state=1)
    model_4 = GradientBoostingRegressor()
    model_5 = neighbors.KNeighborsRegressor()
    final_model = VotingRegressor(estimators=[('xgb', model_1), ('rf', model_2), ('gbr', model_4), ('knn',model_5)])

    final_model.fit(X_tr, y_tr)
    # Predict once per fold and reuse the result for every metric.
    y_pred = final_model.predict(X_tst)

    r2_fold = r2_score(y_tst, y_pred)
    # MSE is computed once and RMSE derived from it; this avoids the `squared=`
    # keyword, which is deprecated in recent scikit-learn releases.
    mse1 = mean_squared_error(y_tst, y_pred)
    rmse1 = np.sqrt(mse1)
    mae1 = mean_absolute_error(y_tst, y_pred)

    print("R2: ", r2_fold)
    print(": model's RMSE = {}".format(rmse1))
    rmse.append(rmse1)
    print(": model's MAE = {}".format(mae1))
    mae.append(mae1)
    print(": model's MSE = {}".format(mse1))
    mse.append(mse1)
    r2.append(r2_fold)
    print("------------------------------------------------------------------------------------------------------")

print("R2: {}, RMSE: {}, MAE: {}, MSE: {}".format(np.mean(r2), np.mean(rmse), np.mean(mae), np.mean(mse)))

fold: 1
R2:  0.9671959804148532
: model's RMSE = 402.4785647002958
: model's MAE = 211.03567871111113
: model's MSE = 161988.9950432102
------------------------------------------------------------------------------------------------------
fold: 2
R2:  0.9823750396633313
: model's RMSE = 912.6050170721569
: model's MAE = 483.6049790834028
: model's MSE = 832847.9171852719
------------------------------------------------------------------------------------------------------
fold: 3
R2:  0.9377658277036057
: model's RMSE = 472.2394715061029
: model's MAE = 251.02035960146426
: model's MSE = 223010.1184483634
------------------------------------------------------------------------------------------------------
fold: 4
R2:  0.9745279401169088
: model's RMSE = 508.36143998124675
: model's MAE = 283.18678598545586
: model's MSE = 258431.35365980671
------------------------------------------------------------------------------------------------------
fold: 5
R2:  0.9853136195143272
: model's R

In [19]:
# Cross-validated RMSE of the voting ensemble built from default-parameter base models.
X = df.drop(['drug_perm_per','drug_perm_amt'], axis=1)
y = df[["drug_perm_amt"]]

model_1 = xg.XGBRegressor()
# Seeded so the reported score can be reproduced on re-run.
# NOTE(review): GradientBoostingRegressor is left unseeded, as in the original cell.
model_2 = RandomForestRegressor(random_state=1)
model_4 = GradientBoostingRegressor()
model_5 = neighbors.KNeighborsRegressor()
final_model1 = VotingRegressor(estimators=[('xgb', model_1), ('rf', model_2), ('gbr', model_4), ('knn',model_5)])

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Pass the target as a 1-D Series: a one-column DataFrame relies on implicit
# flattening inside the estimators and triggers shape-conversion warnings.
scores = cross_val_score(final_model1, X, y["drug_perm_amt"], scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# The scorer returns negated RMSE (higher is better), so flip the sign for reporting.
print("Root mean squared error (RMSE) score:", -np.mean(scores))

Root mean squared error (RMSE) score: 689.0813011517272


# Tuned model - Best one

In [20]:
# Cross-validated RMSE of the voting ensemble with the tuned base-model settings.
X = df.drop(['drug_perm_per','drug_perm_amt'], axis=1)
y = df[["drug_perm_amt"]]

model_1 = xg.XGBRegressor(max_depth = 3, eta = 0.1, verbosity = 0)
model_2 = RandomForestRegressor(n_estimators=50, random_state=1)
model_4 = GradientBoostingRegressor()
model_5 = neighbors.KNeighborsRegressor(n_neighbors = 1)
final_model1 = VotingRegressor(estimators=[('xgb', model_1), ('rf', model_2), ('gbr', model_4), ('knn',model_5)])

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Pass the target as a 1-D Series: a one-column DataFrame relies on implicit
# flattening inside the estimators and triggers shape-conversion warnings.
scores = cross_val_score(final_model1, X, y["drug_perm_amt"], scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
# The scorer returns negated RMSE (higher is better), so flip the sign for reporting.
print("Root mean squared error (RMSE) score:", -np.mean(scores))

Root mean squared error (RMSE) score: 669.6925638937176


In [16]:
# Cross-validated R2 of the voting ensemble with the tuned base-model settings.
X = df.drop(['drug_perm_per','drug_perm_amt'], axis=1)
y = df[["drug_perm_amt"]]

model_1 = xg.XGBRegressor(max_depth = 3, eta = 0.1, verbosity = 0)
model_2 = RandomForestRegressor(n_estimators=50, random_state=1)
model_4 = GradientBoostingRegressor()
model_5 = neighbors.KNeighborsRegressor(n_neighbors = 1)
final_model1 = VotingRegressor(estimators=[('xgb', model_1), ('rf', model_2), ('gbr', model_4), ('knn',model_5)])

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Pass the target as a 1-D Series: a one-column DataFrame relies on implicit
# flattening inside the estimators and triggers shape-conversion warnings.
scores = cross_val_score(final_model1, X, y["drug_perm_amt"], scoring='r2', cv=cv, n_jobs=-1, error_score='raise')
# Mean R2 over all 30 folds (10 splits x 3 repeats).
print("R2 score:", (np.mean(scores)))

R2 score: 0.9624817466573172
