In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [None]:
import io
drugs_train = pd.read_csv(io.BytesIO(uploaded['train.csv']),index_col=1, parse_dates=True)

In [None]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv


In [None]:
import io
drugs_test = pd.read_csv(io.BytesIO(uploaded['test.csv']),index_col=1, parse_dates=True)

In [None]:
pa_columns = ['correct_diagnosis','tried_and_failed','contraindication','pa_approved','reject_code']
id_columns = ['dim_pa_id','dim_date_id','dim_claim_id','Unnamed: 0']
date_columns = ['calendar_year']
drugs_train = drugs_train.drop(columns=pa_columns+id_columns+date_columns)
drugs_train = drugs_train.dropna()

In [None]:
pa_columns = ['correct_diagnosis','tried_and_failed','contraindication','pa_approved','reject_code']
id_columns = ['dim_pa_id','dim_date_id','dim_claim_id','Unnamed: 0']
date_columns = ['calendar_year']
drugs_test = drugs_test.drop(columns=pa_columns+id_columns+date_columns)
drugs_test = drugs_test.dropna()

In [None]:
drugs_train=drugs_train.drop(columns='calendar_month')
drugs_train=drugs_train.drop(columns='calendar_day')
drugs_train=drugs_train.drop(columns='day_of_week')
drugs_train=drugs_train.drop(columns='is_weekday')
drugs_train=drugs_train.drop(columns='is_workday')
drugs_train=drugs_train.drop(columns='is_holiday')

In [None]:
drugs_test=drugs_test.drop(columns='calendar_month')
drugs_test=drugs_test.drop(columns='calendar_day')
drugs_test=drugs_test.drop(columns='day_of_week')
drugs_test=drugs_test.drop(columns='is_weekday')
drugs_test=drugs_test.drop(columns='is_workday')
drugs_test=drugs_test.drop(columns='is_holiday')

In [None]:
one_hot_encoded_traindata = pd.get_dummies(drugs_train, columns = ['bin', 'drug'])
one_hot_encoded_testdata = pd.get_dummies(drugs_test, columns = ['bin', 'drug'])

In [None]:
X_train = one_hot_encoded_traindata.loc[:, one_hot_encoded_traindata.columns != 'pharmacy_claim_approved']
y_train = one_hot_encoded_traindata.loc[:, one_hot_encoded_traindata.columns == 'pharmacy_claim_approved']

In [None]:
X_test = one_hot_encoded_testdata.loc[:, one_hot_encoded_testdata.columns != 'pharmacy_claim_approved']
y_test = one_hot_encoded_testdata.loc[:, one_hot_encoded_testdata.columns == 'pharmacy_claim_approved']

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

In [None]:
def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 10,
                        trials = trials)

  0%|          | 0/10 [00:00<?, ?it/s, best loss: ?]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 10%|█         | 1/10 [00:06<00:59,  6.60s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 20%|██        | 2/10 [00:13<00:52,  6.51s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 30%|███       | 3/10 [00:19<00:45,  6.46s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 40%|████      | 4/10 [00:25<00:38,  6.45s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 50%|█████     | 5/10 [00:32<00:32,  6.46s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 60%|██████    | 6/10 [00:38<00:25,  6.43s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 70%|███████   | 7/10 [00:45<00:19,  6.43s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 80%|████████  | 8/10 [00:51<00:12,  6.42s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
 90%|█████████ | 9/10 [00:57<00:06,  6.42s/it, best loss: -0.806637004499241]

  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)



SCORE:
0.806637004499241
100%|██████████| 10/10 [01:04<00:00,  6.44s/it, best loss: -0.806637004499241]


In [None]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.8640495937092425, 'gamma': 2.8534612594147593, 'max_depth': 13.0, 'min_child_weight': 5.0, 'reg_alpha': 54.0, 'reg_lambda': 0.7505245291782435}


In [None]:
from xgboost import XGBClassifier

In [None]:
xgbc = XGBClassifier(colsample_bytree=0.8640495937092425, gamma=2.8534612594147593, max_depth=13, min_child_weight=5, reg_alpha=54.0, reg_lambda=0.7505245291782435)

In [None]:
xgbc.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(colsample_bytree=0.8640495937092425, gamma=2.8534612594147593,
              max_depth=13, min_child_weight=5, reg_alpha=54.0,
              reg_lambda=0.7505245291782435)

In [None]:
ypred = xgbc.predict(X_test)

In [None]:
print(accuracy_score(y_test, ypred))
print(f1_score(y_test, ypred))
print(precision_score(y_test, ypred))
print(recall_score(y_test, ypred))
print(roc_auc_score(y_test, ypred))

0.9354859203023999
0.9475758871943125
0.900374550206896
1.0
0.9226353079459747


In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from pprint import pprint

In [None]:
pprint(xgbc.get_params())

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 0.8640495937092425,
 'gamma': 2.8534612594147593,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 13,
 'min_child_weight': 5,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 54.0,
 'reg_lambda': 0.7505245291782435,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}


In [None]:
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 50, num = 10)]
learning_rate = [x for x in np.arange(0.1, 2.2, 0.2)]

In [None]:
search_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               }
pprint(search_grid)

{'learning_rate': [0.1,
                   0.30000000000000004,
                   0.5000000000000001,
                   0.7000000000000001,
                   0.9000000000000001,
                   1.1000000000000003,
                   1.3000000000000003,
                   1.5000000000000004,
                   1.7000000000000004,
                   1.9000000000000004,
                   2.1000000000000005],
 'n_estimators': [1, 6, 11, 17, 22, 28, 33, 39, 44, 50]}


In [None]:
xgbcrandom = RandomizedSearchCV(estimator=xgbc, param_distributions=search_grid, scoring='precision', n_jobs=-1)

In [None]:
xgbcrandom.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(estimator=XGBClassifier(colsample_bytree=0.8640495937092425,
                                           gamma=2.8534612594147593,
                                           max_depth=13, min_child_weight=5,
                                           reg_alpha=54.0,
                                           reg_lambda=0.7505245291782435),
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.1,
                                                          0.30000000000000004,
                                                          0.5000000000000001,
                                                          0.7000000000000001,
                                                          0.9000000000000001,
                                                          1.1000000000000003,
                                                          1.3000000000000003,
                                                          1.5000000000000004,


In [None]:
xgbcrandom.best_params_

{'learning_rate': 0.9000000000000001, 'n_estimators': 17}

In [None]:
xgbcrandomtuned = XGBClassifier(learning_rate=0.9, n_estimators=17, colsample_bytree=0.8640495937092425, gamma=2.8534612594147593, max_depth=13, min_child_weight=5, reg_alpha=54.0, reg_lambda=0.7505245291782435)

In [None]:
xgbcrandomtuned.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(colsample_bytree=0.8640495937092425, gamma=2.8534612594147593,
              learning_rate=0.9, max_depth=13, min_child_weight=5,
              n_estimators=17, reg_alpha=54.0, reg_lambda=0.7505245291782435)

In [None]:
ypredrandomtuned = xgbcrandomtuned.predict(X_test)

In [None]:
print(accuracy_score(y_test, ypredrandomtuned))
print(f1_score(y_test, ypredrandomtuned))
print(precision_score(y_test, ypredrandomtuned))
print(recall_score(y_test, ypredrandomtuned))
print(roc_auc_score(y_test, ypredrandomtuned))

0.9354859203023999
0.9475758871943125
0.900374550206896
1.0
0.9226353079459747
