In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from pprint import pprint

### Hyperparameter Tuning the Random Forest ###
In this notebook I used Random Forest Classifier to predict whether a Claim will be accepted or rejected. 

- **Import data**

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,date_val,calendar_year,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday,dim_claim_id,bin,drug,reject_code,pharmacy_claim_approved,dim_pa_id,correct_diagnosis,tried_and_failed,contraindication,pa_approved,dim_date_id
0,106328,2017-04-07,2017,4,7,6,1,1,0,106329,999001,A,0,1,,,,,,
1,31702,2017-01-30,2017,1,30,2,1,1,0,31703,999001,C,0,1,,,,,,
2,1270970,2019-11-11,2019,11,11,2,1,1,0,1270971,417740,B,70,0,528977.0,1.0,1.0,0.0,0.0,1045.0
3,1093403,2019-06-28,2019,6,28,6,1,1,0,1093404,999001,A,76,0,454463.0,1.0,0.0,0.0,1.0,909.0
4,61846,2017-02-27,2017,2,27,2,1,1,0,61847,999001,A,0,1,,,,,,


- **Data manipulation**

In [4]:
df = train[['drug', 'bin', 'calendar_month', 
          'calendar_day', 'day_of_week']]
X = pd.get_dummies(df, columns=['drug', 'bin'], drop_first=False)
y = train['pharmacy_claim_approved']

In [5]:
X.head()

Unnamed: 0,calendar_month,calendar_day,day_of_week,drug_A,drug_B,drug_C,bin_417380,bin_417614,bin_417740,bin_999001
0,4,7,6,1,0,0,0,0,0,1
1,1,30,2,0,0,1,0,0,0,1
2,11,11,2,0,1,0,0,0,1,0
3,6,28,6,1,0,0,0,0,0,1
4,2,27,2,1,0,0,0,0,0,1


- **Random Search Cross Validation**

In [6]:
##Create object
rf = RandomForestClassifier(random_state=485)
##Print classifier parameters
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 485,
 'verbose': 0,
 'warm_start': False}


- **Random Search Cross Validation**
    - **Random Hyperparameter Grid**

In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


## Claims: 10 Features##

- **Random Search Cross Validation**
    - Random Search Training

In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(n_jobs = -1)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, verbose=2, random_state=42)
# Fit the random search model
rf_random.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  21.7s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  20.9s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  21.2s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  20.9s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  21.0s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time= 1.2min
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=60

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [10]:
#Print best parameters
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

- **Evaluate Random Search**

In [4]:
#Create Classifier object
rf_best = RandomForestClassifier(n_estimators=200, min_samples_split= 5, min_samples_leaf= 2,
                                 max_features= 'sqrt', max_depth=10, bootstrap= True)

#Create KFold object
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4124)

#Create performances matrices 
cv_accs_best = np.zeros((5,1))
cv_aucs_best = np.zeros((5,1))

i = 0
for train_index, test_index in kfold.split(X, y):
    print(i)
    # this gets the training and holdout sets
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_val = X.iloc[test_index]
    y_val = y.iloc[test_index]

    ## make model clones
    rf_clone = clone(rf_best)
    rf_clone.fit(X_train, y_train)
    rf_clone_preds = rf_clone.predict(X_val)

    ## record the performances
    cv_accs_best[i,0] = accuracy_score(y_val, rf_clone_preds)
    cv_aucs_best[i,0] = roc_auc_score(y_val, rf_clone_preds)

    i = i + 1

0
1
2
3
4


In [5]:
## Accuracy
100*np.mean(cv_accs_best, axis=1)
# array([93.57535188, 93.48594992, 93.52673957, 93.59267351, 93.43342627])

array([93.57535188, 93.48594992, 93.52673957, 93.59267351, 93.43342627])

In [6]:
## Roc Auc
100*np.mean(cv_aucs_best, axis=1) 

array([92.27669036, 92.16921692, 92.21825168, 92.29751333, 92.10607628])

## Claims: 7 Features ##

In [8]:
X_claims = X[['drug_A', 'drug_B', 'drug_C', 'bin_417380', 'bin_417614', 'bin_417740', 'bin_999001']]

In [9]:
X_claims.head()

Unnamed: 0,drug_A,drug_B,drug_C,bin_417380,bin_417614,bin_417740,bin_999001
0,1,0,0,0,0,0,1
1,0,0,1,0,0,0,1
2,0,1,0,0,0,1,0
3,1,0,0,0,0,0,1
4,1,0,0,0,0,0,1


- **Random Search Cross Validation**
    - Random Search Training

In [10]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf_claims = RandomForestClassifier(n_jobs = -1)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random_claims = RandomizedSearchCV(estimator = rf_claims, param_distributions = random_grid, verbose=2, random_state=142)
# Fit the random search model
rf_random_claims.fit(X_claims, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  52.9s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  52.3s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  52.1s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  52.4s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  52.4s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1400; total time=  45.8s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=140

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=142, verbose=2)

In [11]:
rf_random_claims.best_params_

{'n_estimators': 1600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

- **Evaluate Random Search**

In [12]:
#Create Classifier object
rf_best_claims = RandomForestClassifier(n_estimators=1600, min_samples_split= 2, min_samples_leaf= 1,
                                 max_features= 'sqrt', max_depth=20, bootstrap= True)

#Create KFold object
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4124)

#Create performances matrices 
cv_accs_best_claims = np.zeros((5,1))
cv_aucs_best_claims = np.zeros((5,1))

i = 0
for train_index, test_index in kfold.split(X_claims, y):
    print(i)
    # this gets the training and holdout sets
    X_train = X_claims.iloc[train_index]
    y_train = y.iloc[train_index]
    X_val = X_claims.iloc[test_index]
    y_val = y.iloc[test_index]

    ## make model clones
    rf_clone = clone(rf_best_claims)
    rf_clone.fit(X_train, y_train)
    rf_clone_preds = rf_clone.predict(X_val)

    ## record the performances
    cv_accs_best_claims[i,0] = accuracy_score(y_val, rf_clone_preds)
    cv_aucs_best_claims[i,0] = roc_auc_score(y_val, rf_clone_preds)

    i = i + 1

0
1
2
3
4


In [15]:
## Accuracy
100*np.mean(cv_accs_best_claims, axis=1)

array([93.57535188, 93.48594992, 93.52673957, 93.59267351, 93.43342627])

In [16]:
## Roc Auc
100*np.mean(cv_aucs_best_claims, axis=1) 

array([92.27669036, 92.16921692, 92.21825168, 92.29751333, 92.10607628])

# ePRIOR AUTHORIZATIONS 

## PAs, 15 Features ##

In [17]:
##Import data
X_pa = pd.read_csv('x_train.csv')
y_pa = pd.read_csv('y_train.csv').squeeze()

In [20]:
X_pa.drop(columns=['is_weekday', 'is_workday', 'is_holiday', 'correct_diagnosis'])

Unnamed: 0,calendar_month,calendar_day,day_of_week,tried_and_failed,contraindication,drug_A,drug_B,drug_C,bin_417380,bin_417614,bin_417740,bin_999001,reject_code_70.0,reject_code_75.0,reject_code_76.0
0,10,26,6,0,0,0,1,0,0,0,0,1,0,0,1
1,9,21,5,1,0,0,1,0,0,1,0,0,0,1,0
2,8,29,3,0,0,1,0,0,0,0,0,1,0,0,1
3,10,4,4,1,0,1,0,0,0,1,0,0,1,0,0
4,2,7,5,0,0,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444755,2,6,2,1,1,1,0,0,0,1,0,0,1,0,0
444756,4,7,6,0,0,1,0,0,0,0,0,1,0,0,1
444757,2,21,4,0,0,1,0,0,0,1,0,0,1,0,0
444758,11,14,4,1,0,0,0,1,0,0,1,0,0,1,0


- **Random Search Cross Validation**
    - Random Search Training

In [22]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf_pa = RandomForestClassifier(n_jobs = -1)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_pa_random = RandomizedSearchCV(estimator = rf_pa, param_distributions = random_grid, verbose=2, random_state=42)
# Fit the random search model
rf_pa_random.fit(X_pa, y_pa)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  14.5s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  13.6s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  13.4s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  13.6s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  13.5s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=  45.9s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=60

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [23]:
rf_pa_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

- **Evaluate Random Search**

In [24]:
#Create Classifier object
rf_pa_best = RandomForestClassifier(n_estimators=200, min_samples_split= 5, min_samples_leaf= 2,
                                 max_features= 'sqrt', max_depth=10, bootstrap= True)

#Create KFold object
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4124)

#Create performances matrices 
cv_accs_best_pa = np.zeros((5,1))
cv_aucs_best_pa = np.zeros((5,1))

i = 0
for train_index, test_index in kfold.split(X_pa, y_pa):
    print(i)
    # this gets the training and holdout sets
    X_train = X_pa.iloc[train_index]
    y_train = y_pa.iloc[train_index]
    X_val = X_pa.iloc[test_index]
    y_val = y_pa.iloc[test_index]

    ## make model clones
    rf_clone = clone(rf_pa_best)
    rf_clone.fit(X_train, y_train)
    rf_clone_preds = rf_clone.predict(X_val)

    ## record the performances
    cv_accs_best_pa[i,0] = accuracy_score(y_val, rf_clone_preds)
    cv_aucs_best_pa[i,0] = roc_auc_score(y_val, rf_clone_preds)

    i = i + 1

0
1
2
3
4


In [25]:
## Accuracy
100*np.mean(cv_accs_best_pa, axis=1)

array([81.60580987, 81.26517672, 81.33824984, 81.54735138, 81.3146416 ])

In [26]:
## Roc Auc
100*np.mean(cv_aucs_best_pa, axis=1) 

array([72.46055657, 72.13406064, 72.31895002, 72.56265907, 72.29612088])

## PAs, 7 Features

In [31]:
X_pa_new =X_pa[["reject_code_70.0", 'calendar_day', 'reject_code_75.0', 'contraindication', 'calendar_month', 'tried_and_failed', 'day_of_week']]

In [32]:
X_pa_new.head()

Unnamed: 0,reject_code_70.0,calendar_day,reject_code_75.0,contraindication,calendar_month,tried_and_failed,day_of_week
0,0,26,0,0,10,0,6
1,0,21,1,0,9,1,5
2,0,29,0,0,8,0,3
3,1,4,0,0,10,1,4
4,1,7,0,0,2,0,5


- **Random Search Cross Validation**
    - Random Search Training

In [33]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf_pa_new = RandomForestClassifier(n_jobs = -1)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_pa_random_new = RandomizedSearchCV(estimator = rf_pa_new, param_distributions = random_grid, verbose=2, random_state=42)
# Fit the random search model
rf_pa_random_new.fit(X_pa_new, y_pa)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  10.8s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   9.9s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   9.7s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   9.8s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   9.8s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=  30.5s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=60

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [34]:
rf_pa_random_new.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

- **Evaluate Grid Search**

In [35]:
#Create Classifier object
rf_pa_best_new = RandomForestClassifier(n_estimators=200, min_samples_split= 5, min_samples_leaf= 2,
                                 max_features= 'sqrt', max_depth=10, bootstrap= True)

#Create KFold object
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4124)

#Create performances matrices 
cv_accs_best_pa_new = np.zeros((5,1))
cv_aucs_best_pa_new = np.zeros((5,1))

i = 0
for train_index, test_index in kfold.split(X_pa_new, y_pa):
    print(i)
    # this gets the training and holdout sets
    X_train = X_pa_new.iloc[train_index]
    y_train = y_pa.iloc[train_index]
    X_val = X_pa_new.iloc[test_index]
    y_val = y_pa.iloc[test_index]

    ## make model clones
    rf_clone = clone(rf_pa_best_new)
    rf_clone.fit(X_train, y_train)
    rf_clone_preds = rf_clone.predict(X_val)

    ## record the performances
    cv_accs_best_pa_new[i,0] = accuracy_score(y_val, rf_clone_preds)
    cv_aucs_best_pa_new[i,0] = roc_auc_score(y_val, rf_clone_preds)

    i = i + 1

0
1
2
3
4


In [36]:
## Accuracy
100*np.mean(cv_accs_best_pa_new, axis=1)

array([79.69354258, 79.59011602, 79.73064124, 79.72389603, 79.65194712])

In [37]:
## Roc Auc
100*np.mean(cv_aucs_best_pa_new, axis=1) 

array([73.89671558, 73.83306203, 74.04900572, 73.96197667, 73.8116383 ])

## Accuracy comparison

In [38]:
print('PAs, 15 features:', 100*np.mean(cv_accs_best_pa, axis=1))
print()
print('PAs, 7 features:', 100*np.mean(cv_accs_best_pa_new, axis=1))

PAs, 15 features: [81.60580987 81.26517672 81.33824984 81.54735138 81.3146416 ]

PAs, 7 features: [79.69354258 79.59011602 79.73064124 79.72389603 79.65194712]


## Grid Search with Cross Validation

In [41]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 8, 10, 15],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [50, 100, 200, 300]
}

# Create a based model
rf1 = RandomForestClassifier(n_jobs=-1)
# Instantiate the Random search model
grid_search_rf1 = GridSearchCV(estimator = rf1, param_grid = param_grid, verbose = 2)
# Fit the random search model
grid_search_rf1.fit(X_pa, y_pa)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time=   2.9s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time=   1.9s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time=   1.9s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time=   1.9s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time=   1.9s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=100; total time=   3.4s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=3, n_estimators=100; total time=   3.

GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
             param_grid={'bootstrap': [True], 'max_depth': [5, 8, 10, 15],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [3, 5, 7],
                         'n_estimators': [50, 100, 200, 300]},
             verbose=2)

In [42]:
grid_search_rf1.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 3,
 'n_estimators': 300}

In [43]:
#Create Classifier object
rf_grid = RandomForestClassifier(n_estimators=300, min_samples_split= 3, min_samples_leaf= 4,
                                 max_features= 'sqrt', max_depth=10, bootstrap= True)

#Create KFold object
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4124)

#Create performances matrices 
cv_accs_grid = np.zeros((5,1))
cv_aucs_grid = np.zeros((5,1))

i = 0
for train_index, test_index in kfold.split(X_pa, y_pa):
    print(i)
    # this gets the training and holdout sets
    X_train = X_pa.iloc[train_index]
    y_train = y_pa.iloc[train_index]
    X_val = X_pa.iloc[test_index]
    y_val = y_pa.iloc[test_index]

    ## make model clones
    rf_clone = clone(rf_grid)
    rf_clone.fit(X_train, y_train)
    rf_clone_preds = rf_clone.predict(X_val)

    ## record the performances
    cv_accs_grid[i,0] = accuracy_score(y_val, rf_clone_preds)
    cv_aucs_grid[i,0] = roc_auc_score(y_val, rf_clone_preds)

    i = i + 1

0
1
2
3
4


In [44]:
## Accuracy
100*np.mean(cv_accs_grid, axis=1)

array([81.6193003 , 81.28203975, 81.34611926, 81.54622718, 81.35511287])

In [45]:
## Roc Auc
100*np.mean(cv_aucs_grid, axis=1) 

array([72.51433767, 72.14418922, 72.32565878, 72.52810806, 72.22366729])

## Accuracy

In [47]:
print('PAs, 15 features with Random Search:', 100*np.mean(cv_accs_best_pa, axis=1))
print()
print('PAs, 15 features with Grid Search', 100*np.mean(cv_accs_grid, axis=1))
print()
print('PAs, 7 features with Random Search:', 100*np.mean(cv_accs_best_pa_new, axis=1))
print()

PAs, 15 features with Random Search: [81.60580987 81.26517672 81.33824984 81.54735138 81.3146416 ]

PAs, 15 features with Grid Search [81.6193003  81.28203975 81.34611926 81.54622718 81.35511287]

PAs, 7 features with Random Search: [79.69354258 79.59011602 79.73064124 79.72389603 79.65194712]



## PAs, 10 Features

In [48]:
X_pa_1 =X_pa[["reject_code_70.0", 'calendar_day', 'reject_code_75.0', 'contraindication', 'calendar_month', 'tried_and_failed', 'day_of_week', 'drug_C', 'drug_A', 'bin_417614']]
X_pa_1.head()

Unnamed: 0,reject_code_70.0,calendar_day,reject_code_75.0,contraindication,calendar_month,tried_and_failed,day_of_week,drug_C,drug_A,bin_417614
0,0,26,0,0,10,0,6,0,0,0
1,0,21,1,0,9,1,5,0,0,1
2,0,29,0,0,8,0,3,0,1,0
3,1,4,0,0,10,1,4,0,1,1
4,1,7,0,0,2,0,5,0,0,0


In [49]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf_pa_1 = RandomForestClassifier(n_jobs = -1)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_pa_random_1 = RandomizedSearchCV(estimator = rf_pa_1, param_distributions = random_grid, verbose=2, random_state=42)
# Fit the random search model
rf_pa_random_1.fit(X_pa_1, y_pa)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  12.5s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  11.5s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  11.5s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  11.4s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  11.4s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=  36.8s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=60

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [53]:
rf_pa_random_1.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

- **Evaluate Random Search**

In [54]:
#Create Classifier object
rf_pa_1 = RandomForestClassifier(n_estimators=200, min_samples_split= 5, min_samples_leaf= 2,
                                 max_features= 'sqrt', max_depth=10, bootstrap= True)

#Create KFold object
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4124)

#Create performances matrices 
cv_accs_best_pa_1 = np.zeros((5,1))
cv_aucs_best_pa_1 = np.zeros((5,1))

i = 0
for train_index, test_index in kfold.split(X_pa_1, y_pa):
    print(i)
    # this gets the training and holdout sets
    X_train = X_pa_1.iloc[train_index]
    y_train = y_pa.iloc[train_index]
    X_val = X_pa_1.iloc[test_index]
    y_val = y_pa.iloc[test_index]

    ## make model clones
    rf_clone = clone(rf_pa_1)
    rf_clone.fit(X_train, y_train)
    rf_clone_preds = rf_clone.predict(X_val)

    ## record the performances
    cv_accs_best_pa_1[i,0] = accuracy_score(y_val, rf_clone_preds)
    cv_aucs_best_pa_1[i,0] = roc_auc_score(y_val, rf_clone_preds)

    i = i + 1

0
1
2
3
4


In [55]:
print('PAs, 15 features with Random Search:', 100*np.mean(cv_accs_best_pa, axis=1))
print()
print('PAs, 15 features with Grid Search', 100*np.mean(cv_accs_grid, axis=1))
print()
print('PAs, 10 features with Random Search', 100*np.mean(cv_accs_best_pa_1, axis=1))
print()
print('PAs, 7 features with Random Search:', 100*np.mean(cv_accs_best_pa_new, axis=1))
print()

PAs, 15 features with Random Search: [81.60580987 81.26517672 81.33824984 81.54735138 81.3146416 ]

PAs, 15 features with Grid Search [81.6193003  81.28203975 81.34611926 81.54622718 81.35511287]

PAs, 10 features with Random Search [81.54735138 81.24494109 81.3764727  81.48889289 81.28203975]

PAs, 7 features with Random Search: [79.69354258 79.59011602 79.73064124 79.72389603 79.65194712]

