In [17]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style

import warnings
# Blanket filter: every warning category is silenced for the whole session.
# This also hides deprecation and data-conversion warnings from the libraries
# used in the cells that follow.
warnings.filterwarnings(action="ignore")

# Show every column of a DataFrame, but cap displayed rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import neighbors
import xgboost as xg
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt

# Hyperparameter tuner and Cross Validation
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingRegressor

# Figure resolution: 300 dpi on screen, 800 dpi for saved files.
# These are the values that were actually in effect before: the rc dict passed
# to sns.set() is applied last, so it overrode the earlier
# plt.rcParams['savefig.dpi'] = 300 assignment. A single call removes the
# contradictory setting without changing the resulting configuration.
sns.set(rc={"figure.dpi": 300, "savefig.dpi": 800})

In [5]:
# Load the modelling table. The path is relative to the kernel's working
# directory. Later cells read the columns 'drug_perm_per' and 'drug_perm_amt'
# as targets and use every remaining column as a feature.
df = pd.read_csv(filepath_or_buffer="df1.csv")

# Amount

In [3]:
# Amount target (drug_perm_amt): RMSE of the stacking ensemble under
# repeated 10-fold cross-validation (3 repeats).

# Both target columns are dropped so neither can leak into the features.
X = df.drop(['drug_perm_per', 'drug_perm_amt'], axis=1)
# 1-D target: scikit-learn regressors expect shape (n_samples,). A one-column
# DataFrame is flattened internally with a DataConversionWarning, which the
# notebook-wide warning filter would hide.
y = df["drug_perm_amt"]

# Base learners. Seeds are fixed so the reported score is reproducible on a
# fresh kernel and consistent with the percentage-target cells.
model_1 = xg.XGBRegressor(random_state=1)
model_2 = RandomForestRegressor(random_state=1)
model_3 = DecisionTreeRegressor(random_state=1)
# NOTE(review): KNN is distance-based and the features are not scaled in this
# cell (StandardScaler is imported but unused in the visible cells) - confirm
# that this is intentional.
model_5 = neighbors.KNeighborsRegressor(n_neighbors=1)
# Meta-learner trained on the base learners' cross-validated predictions.
model_4 = GradientBoostingRegressor(random_state=1)
final_model = StackingRegressor(
    estimators=[('xgb', model_1), ('rf', model_2), ('dt', model_3), ('knn', model_5)],
    final_estimator=model_4,
    cv=5,
)

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    final_model, X, y,
    scoring='neg_root_mean_squared_error',
    cv=cv, n_jobs=-1, error_score='raise',
)
# The scorer returns negated RMSE (higher is better); flip the sign to report it.
print("RMSE score:", np.negative(np.mean(scores)))

RMSE score: 2649.3868293900537


In [3]:
# Amount target (drug_perm_amt): R^2 of the stacking ensemble under
# repeated 10-fold cross-validation (3 repeats).

# Both target columns are dropped so neither can leak into the features.
X = df.drop(['drug_perm_per', 'drug_perm_amt'], axis=1)
# 1-D target: scikit-learn regressors expect shape (n_samples,). A one-column
# DataFrame is flattened internally with a DataConversionWarning, which the
# notebook-wide warning filter would hide.
y = df["drug_perm_amt"]

# Base learners. Seeds are fixed so the reported score is reproducible on a
# fresh kernel and consistent with the percentage-target cells.
model_1 = xg.XGBRegressor(random_state=1)
model_2 = RandomForestRegressor(random_state=1)
model_3 = DecisionTreeRegressor(random_state=1)
model_5 = neighbors.KNeighborsRegressor(n_neighbors=1)
# Meta-learner trained on the base learners' cross-validated predictions.
model_4 = GradientBoostingRegressor(random_state=1)
final_model = StackingRegressor(
    estimators=[('xgb', model_1), ('rf', model_2), ('dt', model_3), ('knn', model_5)],
    final_estimator=model_4,
    cv=5,
)

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    final_model, X, y,
    scoring='r2',
    cv=cv, n_jobs=-1, error_score='raise',
)
# Mean R^2 across all 30 folds (10 splits x 3 repeats).
print("R2 score:", (np.mean(scores)))

R2 score: -0.014365328940422743


# Percentage

In [12]:
# Percentage target (drug_perm_per) - stacking regressor (SR):
# RMSE under repeated 10-fold cross-validation (3 repeats).

# Both target columns are dropped so neither can leak into the features.
X = df.drop(['drug_perm_per', 'drug_perm_amt'], axis=1)
# 1-D target: scikit-learn regressors expect shape (n_samples,). A one-column
# DataFrame is flattened internally with a DataConversionWarning, which the
# notebook-wide warning filter would hide.
y = df["drug_perm_per"]

# Base learners. Every stochastic estimator is seeded (previously only the
# forest and the tree were) so the score is reproducible on a fresh kernel.
model_1 = xg.XGBRegressor(random_state=1)
model_2 = RandomForestRegressor(random_state=1)
model_3 = DecisionTreeRegressor(random_state=1)
model_5 = neighbors.KNeighborsRegressor(n_neighbors=1)
# Meta-learner trained on the base learners' cross-validated predictions.
model_4 = GradientBoostingRegressor(random_state=1)
final_model = StackingRegressor(
    estimators=[('xgb', model_1), ('rf', model_2), ('dt', model_3), ('knn', model_5)],
    final_estimator=model_4,
    cv=5,
)

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    final_model, X, y,
    scoring='neg_root_mean_squared_error',
    cv=cv, n_jobs=-1, error_score='raise',
)
# The scorer returns negated RMSE (higher is better); flip the sign to report it.
print("RMSE score:", np.negative(np.mean(scores)))

RMSE score: 16.54164239451507


In [10]:
# Percentage target (drug_perm_per) - stacking regressor (SR):
# R^2 under repeated 10-fold cross-validation (3 repeats).

# Both target columns are dropped so neither can leak into the features.
X = df.drop(['drug_perm_per', 'drug_perm_amt'], axis=1)
# 1-D target: scikit-learn regressors expect shape (n_samples,). A one-column
# DataFrame is flattened internally with a DataConversionWarning, which the
# notebook-wide warning filter would hide.
y = df["drug_perm_per"]

# Base learners. Every stochastic estimator is seeded (previously only the
# forest and the tree were) so the score is reproducible on a fresh kernel.
model_1 = xg.XGBRegressor(random_state=1)
model_2 = RandomForestRegressor(random_state=1)
model_3 = DecisionTreeRegressor(random_state=1)
model_5 = neighbors.KNeighborsRegressor(n_neighbors=1)
# Meta-learner trained on the base learners' cross-validated predictions.
model_4 = GradientBoostingRegressor(random_state=1)
final_model = StackingRegressor(
    estimators=[('xgb', model_1), ('rf', model_2), ('dt', model_3), ('knn', model_5)],
    final_estimator=model_4,
    cv=5,
)

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    final_model, X, y,
    scoring='r2',
    cv=cv, n_jobs=-1, error_score='raise',
)
# Mean R^2 across all 30 folds (10 splits x 3 repeats).
print("R2 score:", (np.mean(scores)))

R2 score: 0.4007945112586814


# Actual vs Prediction

In [5]:
# Percentage target (drug_perm_per) - stacking regressor, single hold-out split.
# NOTE(review): this cell was previously labelled "Per - VR", but the model
# built here is a StackingRegressor - confirm the old label was a leftover.

# Both target columns are dropped so neither can leak into the features.
X = df.drop(['drug_perm_per', 'drug_perm_amt'], axis=1)
# Kept as a one-column DataFrame so y_test retains its column name and index
# for the cells that use it afterwards.
y = df[["drug_perm_per"]]

# 75 % train / 25 % test, shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=0
)

# Base learners. Every stochastic estimator is seeded (previously only the
# forest and the tree were) so the fitted model is reproducible.
model_1 = xg.XGBRegressor(random_state=1)
model_2 = RandomForestRegressor(random_state=1)
model_3 = DecisionTreeRegressor(random_state=1)
# NOTE(review): KNN is distance-based and the features are not scaled in this
# cell (StandardScaler is imported but unused in the visible cells) - confirm
# that this is intentional.
model_5 = neighbors.KNeighborsRegressor(n_neighbors=1)
# Meta-learner trained on the base learners' cross-validated predictions.
model_4 = GradientBoostingRegressor(random_state=1)
final_model = StackingRegressor(
    estimators=[('xgb', model_1), ('rf', model_2), ('dt', model_3), ('knn', model_5)],
    final_estimator=model_4,
    cv=5,
)

# Fit on a flattened (n_samples,) target, the shape scikit-learn regressors
# expect, instead of relying on an implicit conversion of the column vector.
final_model.fit(X_train, y_train.values.ravel())
y_pred = final_model.predict(X_test)

In [6]:
# Persist the hold-out actual values and predictions for the
# actual-vs-predicted comparison.
y_test.to_csv('SR_y_test.csv')
# Reuse y_test's index so every prediction row carries the same row label as
# its actual value. Without it the predictions are labelled 0..n-1 while
# y_test keeps the shuffled original labels, so the files cannot be joined on
# the index column. Row order is unchanged.
pd.DataFrame({'y_pred': list(y_pred)}, index=y_test.index).to_csv('SR_y_pred.csv')

In [9]:
from sklearn.metrics import mean_squared_error

# Hold-out RMSE. The root is taken explicitly because the `squared=False`
# argument of mean_squared_error is deprecated in scikit-learn 1.4 and removed
# in 1.6; this form works on both older and newer releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

3.6273386276967883

In [12]:
from sklearn.metrics import r2_score

# Coefficient of determination on the 25 % hold-out set.
# NOTE(review): the hold-out R^2 recorded under this cell (about 0.98) is far
# above the cross-validated R^2 recorded earlier for the same model and target
# (about 0.40). Verify both were produced from the same data and kernel state
# before reporting either number.
r2_score(y_true=y_test, y_pred=y_pred)

0.9778794541494622