In [1]:
# import libraries
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
# Load the cleaned dataset.
# The location is resolved from the current user's home directory instead of a
# machine-specific absolute path, so the notebook runs for anyone who keeps the
# project under ~/Desktop/oud_treatment_outcome.
DATA_PATH = Path.home() / 'Desktop' / 'oud_treatment_outcome' / 'data' / 'clean_data' / 'final.csv'
data = pd.read_csv(DATA_PATH)

In [4]:
# preview the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,patdeid,sru_alcohol,sru_cannabis,sru_cocaine,sru_amphetamine,sru_methamphetamine,sru_opiates,sru_benzodiazepines,sru_propoxyphene,sru_methadone,...,total_dose_245.0,total_dose_250.0,total_dose_255.0,total_dose_265.0,total_dose_270.0,total_dose_300.0,total_dose_360.0,total_dose_390.0,admin_location_1.0,admin_location_2.0
0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# isolate x and y variables
# X holds every predictor column; y is the `outcome` column that the model
# learns to predict (it is passed as `label=` to xgb.DMatrix further down).
# NOTE(review): `Unnamed: 0` (looks like a saved row index) and `patdeid`
# (presumably a patient id) remain in X — confirm they are meant to be features.
X = data.drop(columns='outcome')
y = data['outcome']

In [None]:
# split data into train and test sets
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts; 1502 is the seed used elsewhere in this
# notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.22, random_state=1502
)

In [None]:
# show target mean
# displayed for the training split first, then the test split
for target_split in (y_train, y_test):
    display(np.mean(target_split))


In [None]:
# create xgboost matrix
# one DMatrix per split, each pairing the features with their labels
train, test = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# show target mean
# NOTE(review): this cell repeats the preceding "show target mean" cell.
display(*(np.mean(labels) for labels in (y_train, y_test)))


In [None]:
# create xgboost matrix
# NOTE(review): this cell repeats the preceding "create xgboost matrix" cell;
# rebuilding the matrices is harmless but redundant.
train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# set the parameters for xgboost
# Baseline booster configuration for the first model.
parameters1 = dict(
    learning_rate=0.3,
    max_depth=2,
    colsample_bytree=1,
    subsample=1,
    min_child_weight=1,
    gamma=0,
    random_state=1502,
    eval_metric='auc',
    objective='binary:logistic',
)


In [None]:
# run XGBoost
# The test matrix is watched under the label 'yes'; its AUC is printed every
# 50 boosting rounds.
watchlist = [(test, 'yes')]
model = xgb.train(
    parameters1,
    train,
    num_boost_round=200,
    evals=watchlist,
    verbose_eval=50,
)
                  

In [None]:
# predictions
# scores above the 0.5 cut-off become class 1, everything else class 0
predicted_scores1 = model.predict(test)
predictions1 = (predicted_scores1 > 0.5).astype(int)

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
# show confusion matrix display
# rows are the true labels, columns the predicted labels
confusion_matrix1 = confusion_matrix(y_true=y_test, y_pred=predictions1)
disp = ConfusionMatrixDisplay(confusion_matrix1, display_labels=['No', 'Yes'])
disp.plot();

In [None]:
# text summary of the first model's predictions on the test split
report1 = classification_report(y_true=y_test, y_pred=predictions1)
print(report1)

In [None]:
# isolate categorical variables
# i.e. every column of `data` whose dtype is not numeric
cat = data.select_dtypes(exclude=["number"])

In [None]:
# transform categorical variables
# one-hot encode as integer columns, dropping the first level of each variable
dummies = pd.get_dummies(data=cat, drop_first=True, dtype=int)

In [None]:
# joining numerical and categorical variables
# Build the modelling table directly from `data` with the target placed LAST,
# because the following cells take the final column as y and everything before
# it as the feature matrix.
# assumes `outcome` is a numeric 0/1 column (the models use binary:logistic) — TODO confirm
target = data['outcome']
numeric_features = data.select_dtypes(include='number').drop(columns='outcome')
final_dataset = pd.concat([numeric_features, dummies, target], axis=1)

In [None]:
# feature names = every column of the final table except the last one
feature_columns = final_dataset.columns[:-1].tolist()

In [None]:
# isolate x and y variables part 2
# last column -> y, all preceding columns -> X (both as plain arrays)
target_column = final_dataset.iloc[:, -1]
feature_frame = final_dataset.iloc[:, :-1]
y = target_column.values
X = feature_frame.values

In [None]:
# split data into train and test sets
# 80/20 split with a fixed seed
split_options = {'test_size': 0.2, 'random_state': 1502}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# create xgboost matrix
# the arrays carry no column labels, so the feature names are attached explicitly
train, test = (
    xgb.DMatrix(features, label=labels, feature_names=feature_columns)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# set the parameters for xgboost part 2
# Booster configuration for the second model.
parameters2 = {
    # step size and tree shape
    "learning_rate": 0.3,
    "max_depth": 2,
    # column / row sampling
    "colsample_bytree": 1,
    "subsample": 1,
    # split constraints
    "min_child_weight": 1,
    "gamma": 0,
    # seed, metric and objective
    "random_state": 1502,
    "eval_metric": "auc",
    "objective": "binary:logistic",
}


In [None]:
# run XGBoost
# The test matrix is watched under the label 'yes'; its AUC is printed every
# 50 boosting rounds.
watchlist2 = [(test, 'yes')]
model2 = xgb.train(
    parameters2,
    train,
    num_boost_round=200,
    evals=watchlist2,
    verbose_eval=50,
)
                  

In [None]:
# predictions
# scores above the 0.5 cut-off become class 1, everything else class 0
predicted_scores2 = model2.predict(test)
predictions2 = (predicted_scores2 > 0.5).astype(int)

In [None]:
# show confusion matrix display
# rows are the true labels, columns the predicted labels
confusion_matrix2 = confusion_matrix(y_true=y_test, y_pred=predictions2)
disp = ConfusionMatrixDisplay(confusion_matrix2, display_labels=['No', 'Yes'])
disp.plot();

In [None]:
# text summary of the second model's predictions on the test split
report2 = classification_report(y_true=y_test, y_pred=predictions2)
print(report2)

In [None]:
# setting the cross validation parameters
from sklearn.model_selection import KFold

# Keep the splitter object itself rather than the generator returned by
# `.split(...)`. A generator can only be consumed once, so re-running the
# grid-search fit would find it exhausted. GridSearchCV calls `.split()` on
# the splitter with the data passed to `fit`, and the fixed seed yields the
# same five shuffled folds on every run.
tune_control = KFold(n_splits=5, shuffle=True, random_state=1502)

In [None]:
# set parameter tuning
# Candidate values per hyperparameter; single-element lists hold that setting
# fixed while the others are searched.
tune_grid = dict(
    learning_rate=[0.05, 0.3],
    max_depth=range(2, 9, 2),
    colsample_bytree=[0.5, 1],
    subsample=[1],
    min_child_weight=[1],
    gamma=[0],
    random_state=[1502],
    n_estimators=range(200, 2000, 200),
    booster=['gbtree'],
)

In [None]:
# state we are doing a classification problem
from xgboost import XGBClassifier

# base estimator handed to the grid search
classifier_options = {'objective': 'binary:logistic'}
classifier = XGBClassifier(**classifier_options)

In [None]:
# cross validation assembly
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `tune_grid`, scored by ROC AUC on the folds from
# `tune_control`, using 8 parallel workers and verbose progress output.
search_options = dict(
    scoring='roc_auc',
    n_jobs=8,
    cv=tune_control,
    verbose=5,
)
grid_search = GridSearchCV(estimator=classifier, param_grid=tune_grid, **search_options)

In [None]:
# setting evaluation parameters
# NOTE(review): the test split is used as the early-stopping evaluation set —
# confirm that is intended, since it lets test data influence model fitting.
evaluation_parameters = dict(
    early_stopping_rounds=100,
    eval_metric='auc',
    eval_set=[(X_test, y_test)],
)

In [None]:
# hyperparameter tuning and cross validation
# `early_stopping_rounds` and `eval_metric` are estimator-level settings in
# XGBoost >= 1.6; passing them to fit() is deprecated there and rejected by
# newer releases. Move them onto the estimator and forward only the remaining
# fit-time arguments (e.g. `eval_set`).
fit_kwargs = dict(evaluation_parameters)  # copy, so the original dict stays untouched
estimator_kwargs = {
    name: fit_kwargs.pop(name)
    for name in ('early_stopping_rounds', 'eval_metric')
    if name in fit_kwargs
}
# GridSearchCV clones its estimator for every candidate, so configuring it
# before fit() applies the settings to each fold's model.
grid_search.estimator.set_params(**estimator_kwargs)
tune_model = grid_search.fit(X=X_train, y=y_train, **fit_kwargs)

grid_search.best_params_, grid_search.best_score_