# Tuning the XGBoost classification algorithm on TESS data using the pr_auc metric 

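This notebook tunes the hyperparameters of an XGBoost classifier to maximize pr_auc, the area under the precision-recall curve (computed below with scikit-learn's `average_precision_score`).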

In [21]:
# import modules
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

# Data for the 19h field (the set the model is later fitted on).
# The first CSV column is used as the row index.
df = pd.read_csv("TESSfield_19h_44d_combinedfeatures_try2.csv", index_col=0)

# Target vector.
y = df['CombinedY']

# Feature matrix: every column except the ones listed here.
X = df.drop(
    [
        'Ids', 'CatalogY', 'ManuleY', 'CombinedY',
        'Catalog_Period', 'Depth', 'Catalog_Epoch', 'SNR',
    ],
    axis=1)

def modelfit(alg, X, y, cv_folds=4):
    """Return the cross-validated precision-recall AUC of ``alg`` on ``(X, y)``.

    Out-of-fold class probabilities are obtained from ``cross_val_predict``
    with ``method='predict_proba'``; column 1 of that output is then scored
    against ``y`` with ``metrics.average_precision_score``.

    Parameters
    ----------
    alg : estimator handed unchanged to ``cross_val_predict``.
    X : feature matrix.
    y : labels.
    cv_folds : value forwarded as the ``cv`` argument (default 4).

    Returns
    -------
    The value returned by ``metrics.average_precision_score``.

    Notes
    -----
    Author's notes carried over from the original comments: StratifiedKFold
    is used automatically by cross_val_predict on binary classification, and
    the score does not use the trapezoid rule.
    """
    # Probabilities for both classes, predicted fold by fold.
    fold_probabilities = cross_val_predict(
        alg, X, y, cv=cv_folds, method='predict_proba')

    # Keep only column 1 of the probability output.
    positive_scores = fold_probabilities[:, 1]

    return metrics.average_precision_score(y, positive_scores)

# Baseline model: only the objective is set; no other hyperparameter is passed.
xgb1 = XGBClassifier(objective='binary:logistic')

First we will get a baseline score: 

In [22]:
# Cross-validated pr_auc of the untuned baseline model (displayed as the cell's result).
modelfit(xgb1, X, y)

0.75080326356931726

Now we will adjust the tree depth (`max_depth`), trying values of 1, 6, 12, 18, and 40.

In [23]:
# Sweep max_depth on its own; no other hyperparameter is passed yet.
depth_vals = [1, 6, 12, 18, 40]

for vals in depth_vals:
    # Single-argument print(...) emits the same text on Python 2 and Python 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print('testing depth value {0}'.format(vals))
    xgb = XGBClassifier(
        max_depth=vals,
        objective='binary:logistic')

    # Cross-validated pr_auc for this depth.
    print(modelfit(xgb, X, y))

testing depth value 1
0.764009207239
testing depth value 6
0.731479877352
testing depth value 12
0.756981717936
testing depth value 18
0.75714369004
testing depth value 40
0.750377616871


The best score is at 1. Next is colsample_bytree, with standard values between .5-1; we'll try lower values as well to be safe. 

In [24]:
# Sweep colsample_bytree with max_depth fixed at 1.
colsample_vals = [.2, .4, .6, .8, 1]

for vals in colsample_vals:
    # Single-argument print(...) emits the same text on Python 2 and Python 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print('testing colsample_bytree value {0}'.format(vals))
    xgb = XGBClassifier(
        max_depth=1,
        colsample_bytree=vals,
        objective='binary:logistic')
    # Cross-validated pr_auc for this colsample_bytree value.
    print(modelfit(xgb, X, y))

testing colsample_bytree value 0.2
0.760282454355
testing colsample_bytree value 0.4
0.764976955446
testing colsample_bytree value 0.6
0.764551806212
testing colsample_bytree value 0.8
0.7636037221
testing colsample_bytree value 1
0.764009207239


~ .7649 is the best with a colsample_bytree value of .4.

Next is subsample; we'll try the same values as above.

In [26]:
# Sweep subsample with max_depth=1 and colsample_bytree=.4 held fixed.
subsample_vals = [.2, .4, .6, .8, 1]

for vals in subsample_vals:
    # Single-argument print(...) emits the same text on Python 2 and Python 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print('testing subsample value {0}'.format(vals))
    xgb = XGBClassifier(
        max_depth=1,
        colsample_bytree=.4,
        subsample=vals,
        objective='binary:logistic')
    # Cross-validated pr_auc for this subsample value.
    print(modelfit(xgb, X, y))


testing subsample value 0.2
0.767834899619
testing subsample value 0.4
0.768505268033
testing subsample value 0.6
0.767990545297
testing subsample value 0.8
0.768153950397
testing subsample value 1
0.764976955446


So the best score is ~.7685 with a subsample val of .4

Next up is min_child_weight, which is typically in the range of 0.1 to 10; we also try one much larger value (175).

In [30]:
# Sweep min_child_weight with max_depth, colsample_bytree and subsample held fixed.
min_child_weight = [.1, .5, 1, 3, 5, 7, 10, 175]

for vals in min_child_weight:
    # Single-argument print(...) emits the same text on Python 2 and Python 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print('testing min_child value {0}'.format(vals))
    xgb = XGBClassifier(
        max_depth=1,
        colsample_bytree=.4,
        subsample=.4,
        min_child_weight=vals,
        objective='binary:logistic')
    # Cross-validated pr_auc for this min_child_weight value.
    print(modelfit(xgb, X, y))


testing min_child value 0.1
0.768170594879
testing min_child value 0.5
0.768403483305
testing min_child value 1
0.768505268033
testing min_child value 3
0.768557212882
testing min_child value 5
0.768665280547
testing min_child value 7
0.768988990562
testing min_child value 10
0.768939362254
testing min_child value 175
0.622267488795


Here 7 gives us the max score of ~0.7689. Finally, we'll check the learning rate against various numbers of estimators, as the two interact heavily.

In [31]:
# Full grid over learning_rate x n_estimators; the other hyperparameters stay
# at the values used in the preceding sweeps. Every combination triggers one
# modelfit call, so this cell performs len(learning_rate) * len(n_estimators)
# cross-validated fits.
learning_rate = [1e-3, 1e-2, .05, .07,
                 .09, .1]
n_estimators = [100, 500, 1000, 3000, 5000,
                7000, 9000, 11000, 13000]

for rates in learning_rate:
    for estimators in n_estimators:
        # Single-argument print(...) emits the same text on Python 2 and
        # Python 3, whereas the bare `print x` statement is a SyntaxError on
        # Python 3.
        print('testing rates value {0}, with n_estimators {1}'.format(
            rates, estimators))
        xgb = XGBClassifier(
            max_depth=1,
            colsample_bytree=.4,
            subsample=.4,
            min_child_weight=7,
            n_estimators=estimators,
            learning_rate=rates,
            objective='binary:logistic')

        # Cross-validated pr_auc for this (learning_rate, n_estimators) pair.
        print(modelfit(xgb, X, y))

testing rates value 0.001, with n_estimators 100
0.635061402304
testing rates value 0.001, with n_estimators 500
0.648475374884
testing rates value 0.001, with n_estimators 1000
0.65049502959
testing rates value 0.001, with n_estimators 3000
0.666099042473
testing rates value 0.001, with n_estimators 5000
0.735681842486
testing rates value 0.001, with n_estimators 7000
0.760974993808
testing rates value 0.001, with n_estimators 9000
0.76735242654
testing rates value 0.001, with n_estimators 11000
0.770437025791
testing rates value 0.001, with n_estimators 13000
0.772355568412
testing rates value 0.01, with n_estimators 100
0.648819177507
testing rates value 0.01, with n_estimators 500
0.73497494105
testing rates value 0.01, with n_estimators 1000
0.768712776119
testing rates value 0.01, with n_estimators 3000
0.776149067003
testing rates value 0.01, with n_estimators 5000
0.774461009558
testing rates value 0.01, with n_estimators 7000
0.771426764645
testing rates value 0.01, with n_est

Overall the best learning rate (eta) is 0.01 with n_estimators of 3000,
giving a score of 0.776149067003. Next we'll compute the cross-validated pr_auc and roc_auc of the tuned model, then see how it performs on the test data.

In [33]:
# Model built with the hyperparameter values used in the final grid cell.
xgb = XGBClassifier(
          max_depth=1,
          colsample_bytree=.4,
          subsample=.4,
          min_child_weight=7,
          n_estimators=3000,
          learning_rate=.01,
          objective='binary:logistic')

# Out-of-fold probabilities (column 1) from 4-fold cross-validation, scored
# with both average precision and ROC AUC.
y_pred = cross_val_predict(xgb, X, y, cv=4, method='predict_proba')[:, 1]
pr_auc = metrics.average_precision_score(y, y_pred)
roc_auc = metrics.roc_auc_score(y, y_pred)

# Single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print('The pr_auc score is: {0}'.format(pr_auc))
print('The roc_auc score is: {0}'.format(roc_auc))

The pr_auc score is: 0.776149067003
The roc_auc score is: 0.909755763766


Now we will use the model on the testing data. 

In [34]:
# Data for the 12h field, used as the held-out test set.
# The first CSV column is used as the row index.
df_test = pd.read_csv("TESSfield_12h_-20d_combinedfeatures_try2.csv", index_col=0)

# Target vector for the test set.
y_test = df_test['CombinedY']

# Test feature matrix: every column except the ones listed here.
X_test = df_test.drop(
    [
        'Ids', 'CatalogY', 'ManuleY', 'CombinedY',
        'Catalog_Period', 'Depth', 'Catalog_Epoch', 'SNR',
    ],
    axis=1)

In [37]:
xgb.fit(X, y) # fitting on the 19h set

y_pred_test = xgb.predict_proba(X_test)[:, 1] # testing on the 12h set

# NOTE: pr_auc and roc_auc are reassigned here and now hold the test-set scores.
pr_auc = metrics.average_precision_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_test)

# Single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print('The pr_auc score is: {0}'.format(pr_auc))
print('The roc_auc score is: {0}'.format(roc_auc))

The pr_auc score is: 0.739524668741
The roc_auc score is: 0.906274573981
