# Tuning the XGBoost classification algorithm on TESS data

In [4]:
# import modules
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

df = pd.read_csv("TESSfield_19h_44d_combinedfeatures_try2.csv", index_col=0)
X = df.drop(['Ids', 'CatalogY', 'ManuleY', 'CombinedY', 'Catalog_Period',
             'Depth', 'Catalog_Epoch', 'SNR'], axis=1)

y = df['CombinedY']

def modelfit(alg, X, y, cv_folds=4):
    # StratifiedKFold automatically used by cross_val_predict on binary classification
    # bear in mind that this does not use trapezfoid rule
    # y_pred calculates the probabilities that each value is 1 or 0 using stratified cross validation
    # pr_auc calculates the area under a precision recall curve
    y_pred = cross_val_predict(alg, X, y, cv=cv_folds)
    pr_auc = metrics.average_precision_score(y, y_pred)
    return pr_auc

xgb1 = XGBClassifier(objective='binary:logistic')



First we will get a baseline score: 

In [5]:
modelfit(xgb1, X, y)

0.78547873498859899

Now we will try adjusting depth, we will try 1, 6, 12, and 18. 

In [11]:
depth_vals = [1, 6, 12, 18, 40]

for vals in depth_vals:
    print 'testing depth value {0}'.format(vals)
    xgb = XGBClassifier(
        max_depth=vals,    
        objective='binary:logistic')
    print modelfit(xgb, X, y)

testing depth value 1
0.767579061431
testing depth value 6
0.783890308475
testing depth value 12
0.80763570566
testing depth value 18
0.810930893584
testing depth value 40
0.807118399815


Lets try somewhere between 18 and 40. 

 xgb = XGBClassifier(
        max_depth=30,    
        objective='binary:logistic')
modelfit(xgb, X, y)

What about one value up? 

In [19]:
 xgb = XGBClassifier(
        max_depth=19,    
        objective='binary:logistic')
modelfit(xgb, X, y)

0.80445089166267381

Seems like we've hit the best score at 18. Next we'll do colsample_bytree, trying a range of values. 

In [22]:
colsample_vals = [.2, .4 ,.6,.8,1]

for vals in colsample_vals:
    print 'testing colsample_bytree value {0}'.format(vals)
    xgb = XGBClassifier(
        max_depth=18,
        colsample_bytree=vals,
        objective='binary:logistic')
    print modelfit(xgb, X, y)

testing colsample_bytree value 0.2
0.788727091189
testing colsample_bytree value 0.4
0.808734659418
testing colsample_bytree value 0.6
0.812794558806
testing colsample_bytree value 0.8
0.810824905179
testing colsample_bytree value 1
0.810930893584


.6 gets the best score, we'll now try .5 and .7 to be safe

In [23]:
xgb = XGBClassifier(
        max_depth=18,
        colsample_bytree=.5,
        objective='binary:logistic')

modelfit(xgb, X, y)

0.81172461896230519

In [24]:
xgb = XGBClassifier(
        max_depth=18,
        colsample_bytree=.7,
        objective='binary:logistic')

modelfit(xgb, X, y)

0.81181606461583311

.812 is the best out of the three with a colsample_bytree value of .6

Next is subsample, we'll try the same values as above. 

In [28]:
subsample_vals = [.2, .4, .6, .8, 1]

for vals in sample_vals:
    print 'testing subsample value {0}'.format(vals)
    xgb = XGBClassifier(
        max_depth=18,
        colsample_bytree=vals,
        subsample=vals,
        objective='binary:logistic')
    print modelfit(xgb, X, y)


testing subsample value 0.2
0.791456280064
testing subsample value 0.4
0.800553718158
testing subsample value 0.6
0.806565170847
testing subsample value 0.8
0.812057861039
testing subsample value 1
0.810930893584


.8 seems to be the best, lets try .9 and .7 to be thorough

In [31]:
xgb = XGBClassifier(
        max_depth=18,
        colsample_bytree=.6,
        subsample=.9,
        objective='binary:logistic')
modelfit(xgb, X, y)

0.80972563161946054

In [32]:
xgb = XGBClassifier(
        max_depth=18,
        colsample_bytree=.6,
        subsample=.7,
        objective='binary:logistic')
modelfit(xgb, X, y)

0.80206649134487595

So the best score is ~.812 with a subsample val of .8

Next up is min child weight

In [34]:
min_child_weight = [.1, .5, 1, 3, 5, 7, 10, 175]

for vals in min_child_weight:
    print 'testing min_child value {0}'.format(vals)
    xgb = XGBClassifier(
        max_depth=18,
        colsample_bytree=.6,
        subsample=.8,
        min_child_weight=vals,
        objective='binary:logistic')
    print modelfit(xgb, X, y)


testing min_child value 0.1
0.814750800284
testing min_child value 0.5
0.808962807971
testing min_child value 1
0.808414873856
testing min_child value 3
0.801848718167
testing min_child value 5
0.800883928252
testing min_child value 7
0.804091074989
testing min_child value 10
0.801430963606
testing min_child value 175
0.742936356452


Here .1 gives us the max score of ~.814

In [39]:
learning_rate = [1e-3, 1e-2, .05, .07, 
                .09, .1]
n_estimators = [100, 500, 1000, 3000, 5000,
               7000, 9000, 11000, 13000]

for rates in learning_rate:
    for estimators in n_estimators:
        print 'testing rates value {0}, with n_estimators {1}'.format(rates, 
                                                                      estimators)
        xgb = XGBClassifier(
            max_depth=18,
            colsample_bytree=.6,
            subsample=.8,
            min_child_weight=.1,
            n_estimators=estimators, 
            learning_rate=rates,
            objective='binary:logistic')
            
        print modelfit(xgb, X, y)

testing rates value 0.001, with n_estimators 100
0.808393493593
testing rates value 0.001, with n_estimators 500
0.807281354588
testing rates value 0.001, with n_estimators 1000
0.809051303085
testing rates value 0.001, with n_estimators 3000
0.811904729499
testing rates value 0.001, with n_estimators 5000
0.81366790197
testing rates value 0.001, with n_estimators 7000
0.814347817636
testing rates value 0.001, with n_estimators 9000
0.814347817636
testing rates value 0.001, with n_estimators 11000
0.81366790197
testing rates value 0.001, with n_estimators 13000
0.814888406412
testing rates value 0.01, with n_estimators 100
0.81056038637
testing rates value 0.01, with n_estimators 500
0.817474871774
testing rates value 0.01, with n_estimators 1000
0.816937185069
testing rates value 0.01, with n_estimators 3000
0.817589610147
testing rates value 0.01, with n_estimators 5000
0.816242668622
testing rates value 0.01, with n_estimators 7000
0.816114671961
testing rates value 0.01, with n_est

Searching the list gives a max value of 0.817589610147 using an eta(learning rate) of 0.01, with n_estimators 3000.

Its also interesting to note that we get a very close score of 0.817474871774 with an eta of  0.01, and relatively few estimators of 500.