In this notebook, we try to tune hyperparameters for XGBoost for the claims prediction problem using Randomized and Grid Search and using cross validation.

In [15]:
import numpy as np
import pandas as pd

#Metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

##Classifiers
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

#Hyperparameters
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from pprint import pprint
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

from sklearn.model_selection import cross_val_score

First we import the data and preprocess it. As before, bin and drug are the important features.

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [4]:
import io
drugs_train = pd.read_csv(io.BytesIO(uploaded['train.csv']),index_col=1, parse_dates=True)

In [5]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv


In [6]:
import io
drugs_test = pd.read_csv(io.BytesIO(uploaded['test.csv']),index_col=1, parse_dates=True)

In [7]:
pa_columns = ['correct_diagnosis','tried_and_failed','contraindication','pa_approved','reject_code']
id_columns = ['dim_pa_id','dim_date_id','dim_claim_id','Unnamed: 0']
date_columns = ['calendar_year']
drugs_train = drugs_train.drop(columns=pa_columns+id_columns+date_columns)
drugs_train = drugs_train.dropna()
drugs_test = drugs_test.drop(columns=pa_columns+id_columns+date_columns)
drugs_test = drugs_test.dropna()

In [8]:
drugs_train=drugs_train.drop(columns='calendar_month')
drugs_train=drugs_train.drop(columns='calendar_day')
drugs_train=drugs_train.drop(columns='day_of_week')
drugs_train=drugs_train.drop(columns='is_weekday')
drugs_train=drugs_train.drop(columns='is_workday')
drugs_train=drugs_train.drop(columns='is_holiday')
drugs_test=drugs_test.drop(columns='calendar_month')
drugs_test=drugs_test.drop(columns='calendar_day')
drugs_test=drugs_test.drop(columns='day_of_week')
drugs_test=drugs_test.drop(columns='is_weekday')
drugs_test=drugs_test.drop(columns='is_workday')
drugs_test=drugs_test.drop(columns='is_holiday')

As usual, we next encode the categorical variables bin and drug.

In [9]:
one_hot_encoded_traindata = pd.get_dummies(drugs_train, columns = ['bin', 'drug'])
one_hot_encoded_testdata = pd.get_dummies(drugs_test, columns = ['bin', 'drug'])

In [10]:
X = one_hot_encoded_traindata.loc[:, one_hot_encoded_traindata.columns != 'pharmacy_claim_approved']
y = one_hot_encoded_traindata.loc[:, one_hot_encoded_traindata.columns == 'pharmacy_claim_approved']

In [11]:
X_test = one_hot_encoded_testdata.loc[:, one_hot_encoded_testdata.columns != 'pharmacy_claim_approved']
y_test = one_hot_encoded_testdata.loc[:, one_hot_encoded_testdata.columns == 'pharmacy_claim_approved']

We now write a function to evaluate the fit of a particular model. We split into 5 CV sets.

In [12]:
metrics = {'accuracy':accuracy_score,'precision':precision_score,'recall':recall_score,
          'f1':f1_score,'roc_auc':roc_auc_score}

def evaluate_model(model):    
    scores = {metric:[] for metric in metrics.keys()}
    SKF = StratifiedKFold(n_splits=5)

    k = 0
    for train_idx, test_idx in SKF.split(X.values, y.values):
        print('On data split # ',k+1)
        model.fit(X.values[train_idx],y.values[train_idx])
        ykpred = model.predict(X.values[test_idx])
        for i, (metric,score) in enumerate(metrics.items()):
            scores[metric].append(score(y.values[test_idx], ykpred))
        k= k + 1
    
    return pd.DataFrame(scores)

The following are the default parameters for the XGBoost model.

In [16]:
xgbc = XGBClassifier(random_state=485)
pprint(xgbc.get_params())

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 485,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}


We try to optimize the number of estimators and the learning rate. We try to find the optimal number of estimators between 1 and 100 and the optimal learning rate between 0 and 0.5.

We use Randomized Search first.

In [17]:
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 100, num = 20)]
learning_rate = [x for x in np.arange(0, 0.5, 0.02)]
search_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               }
pprint(search_grid)

{'learning_rate': [0.0,
                   0.02,
                   0.04,
                   0.06,
                   0.08,
                   0.1,
                   0.12,
                   0.14,
                   0.16,
                   0.18,
                   0.2,
                   0.22,
                   0.24,
                   0.26,
                   0.28,
                   0.3,
                   0.32,
                   0.34,
                   0.36,
                   0.38,
                   0.4,
                   0.42,
                   0.44,
                   0.46,
                   0.48],
 'n_estimators': [1,
                  6,
                  11,
                  16,
                  21,
                  27,
                  32,
                  37,
                  42,
                  47,
                  53,
                  58,
                  63,
                  68,
                  73,
                  79,
                  84,
       

In [19]:
xgbc_random = RandomizedSearchCV(estimator=xgbc, param_distributions=search_grid, scoring='precision', n_jobs=-1)

xgbc_random.fit(X, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(estimator=XGBClassifier(random_state=485), n_jobs=-1,
                   param_distributions={'learning_rate': [0.0, 0.02, 0.04, 0.06,
                                                          0.08, 0.1, 0.12, 0.14,
                                                          0.16, 0.18, 0.2, 0.22,
                                                          0.24, 0.26, 0.28, 0.3,
                                                          0.32, 0.34, 0.36,
                                                          0.38, 0.4, 0.42, 0.44,
                                                          0.46, 0.48],
                                        'n_estimators': [1, 6, 11, 16, 21, 27,
                                                         32, 37, 42, 47, 53, 58,
                                                         63, 68, 73, 79, 84, 89,
                                                         94, 100]},
                   scoring='precision')

In [20]:
xgbc_random.best_params_

{'learning_rate': 0.14, 'n_estimators': 100}

The optimal learning rate comes to 0.14 and the optimal number of estimators comes to 100. Compare this to what we gor without CV.

We now fit the model using these parameter values. The other parameter values are assumed at their default levels.

In [23]:
xgbc_random_tuning = evaluate_model(XGBClassifier(n_estimators = 100,learning_rate = 0.14))

On data split #  1


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


On data split #  2


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


On data split #  3


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


On data split #  4


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


On data split #  5


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


The metric values show no marked improvement though. We then repeat the process for Grid Search and again, 0.9 is the maximal precision we can achieve. It seems about 10% of the data points are always giving false positive cases.

In [24]:
random = xgbc_random_tuning.mean()
random

accuracy     0.935228
precision    0.900174
recall       1.000000
f1           0.947465
roc_auc      0.922135
dtype: float64

In [None]:
xgbc_grid = GridSearchCV(estimator=xgbc, param_grid=search_grid, n_jobs=-1, scoring='precision')
xgbc_grid.fit(X,y)