# In this notebook the ROC AUC (area under the ROC curve) score is used as the evaluation metric on the simulated TESS 19h_44d data.

In [4]:
# import modules
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

# Load the 19h_44d feature table; the first CSV column becomes the index.
df = pd.read_csv("TESSfield_19h_44d_combinedfeatures_try2.csv", index_col=0)

# Columns excluded from the feature matrix. 'CombinedY' is the target used
# below; the remaining names look like ids, alternative labels and catalogue
# values (TODO confirm against the feature-extraction step).
non_feature_cols = ['Ids', 'CatalogY', 'ManuleY', 'CombinedY',
                    'Catalog_Period', 'Depth', 'Catalog_Epoch', 'SNR']
X = df.drop(non_feature_cols, axis=1)

# Target vector.
y = df['CombinedY']

def modelfit(alg, X, y, cv_folds=4):
    """Return the cross-validated ROC AUC of ``alg`` on ``(X, y)``.

    Out-of-fold class probabilities are collected with ``cross_val_predict``
    (for an integer ``cv`` and a binary target scikit-learn uses stratified
    folds), and the probabilities in column 1 are scored against ``y`` with
    ``metrics.roc_auc_score``.

    Parameters
    ----------
    alg : estimator exposing ``predict_proba``
    X : feature matrix
    y : binary target
    cv_folds : int, number of cross-validation folds (default 4)

    Returns
    -------
    The ROC AUC computed over the pooled out-of-fold predictions.
    """
    fold_probabilities = cross_val_predict(alg, X, y, cv=cv_folds,
                                           method='predict_proba')
    # Column 1 holds the probability of the second class (label 1 for a 0/1 target).
    positive_scores = fold_probabilities[:, 1]
    return metrics.roc_auc_score(y, positive_scores)

# Baseline classifier: only the objective is set explicitly; every other
# hyper-parameter is left at the library default.
xgb1 = XGBClassifier(objective='binary:logistic')



First we get a baseline score. 

In [5]:
# Cross-validated ROC AUC of the untuned baseline model (shown as the cell output).
modelfit(xgb1, X, y)

0.90860189955248649

The tuning outline says to start with max_depth, citing typical values between 6 and 18; we also try values outside that range (1 and 40).

In [6]:
# Sweep max_depth while every other hyper-parameter stays at its default.
depth_vals = [1, 6, 12, 18, 40]

for depth in depth_vals:
    print('testing depth value {0}'.format(depth))
    xgb = XGBClassifier(objective='binary:logistic', max_depth=depth)
    # Cross-validated ROC AUC for this depth.
    print(modelfit(xgb, X, y))

testing depth value 1
0.909164086008
testing depth value 6
0.894389435916
testing depth value 12
0.900500625059
testing depth value 18
0.901362294336
testing depth value 40
0.899001036477


The highest value is ~.909 at max_depth = 1. 

Next is colsample_bytree, with standard values between 0.5 and 1; we'll also add values outside that range for the sake of curiosity. 

In [7]:
# Sweep colsample_bytree with max_depth fixed at 1, the best value from the
# previous sweep.
colsample_vals = [.2, .4, .6, .8, 1]

for column_fraction in colsample_vals:
    print('testing colsample_bytree value {0}'.format(column_fraction))
    xgb = XGBClassifier(objective='binary:logistic',
                        max_depth=1,
                        colsample_bytree=column_fraction)
    # Cross-validated ROC AUC for this column fraction.
    print(modelfit(xgb, X, y))

testing colsample_bytree value 0.2
0.911654283377
testing colsample_bytree value 0.4
0.90861720581
testing colsample_bytree value 0.6
0.909458591323
testing colsample_bytree value 0.8
0.909112782013
testing colsample_bytree value 1
0.909164086008


A colsample_bytree value of 0.2 has the best score, bringing it up to ~0.9117. 

Next to tune is subsample, with normal values in the range 0.5-1 as above; we'll again add values outside that range for the sake of curiosity. 

In [8]:
# Sweep subsample with max_depth=1 and colsample_bytree=.2 held fixed.
sample_vals = [.2, .4, .6, .8, 1]

for row_fraction in sample_vals:
    print('testing subsample value {0}'.format(row_fraction))
    xgb = XGBClassifier(objective='binary:logistic',
                        max_depth=1,
                        colsample_bytree=.2,
                        subsample=row_fraction)
    # Cross-validated ROC AUC for this row fraction.
    print(modelfit(xgb, X, y))


testing subsample value 0.2
0.91161908408
testing subsample value 0.4
0.910092297585
testing subsample value 0.6
0.910194888586
testing subsample value 0.8
0.910826012661
testing subsample value 1
0.911654283377


In [None]:
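We see that subsample = 1 has the highest score (~0.9117), unlike colsample_bytree, where the smallest value (0.2) was best. We keep subsample = 1 and tune min_child_weight next.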

In [9]:
# Coarse sweep of min_child_weight; max_depth, colsample_bytree and subsample
# stay at the values chosen in the earlier sweeps.
min_child_weight = [.1, .5, 1, 3, 5, 7, 10, 175]

for child_weight in min_child_weight:
    print('testing min_child value {0}'.format(child_weight))
    xgb = XGBClassifier(objective='binary:logistic',
                        max_depth=1,
                        colsample_bytree=.2,
                        subsample=1,
                        min_child_weight=child_weight)
    # Cross-validated ROC AUC for this min_child_weight.
    print(modelfit(xgb, X, y))


testing min_child value 0.1
0.91169119847
testing min_child value 0.5
0.91169119847
testing min_child value 1
0.911654283377
testing min_child value 3
0.911609485816
testing min_child value 5
0.911915814833
testing min_child value 7
0.911915814833
testing min_child value 10
0.911929931926
testing min_child value 175
0.915413201354


The max value is 0.915413201354 at a high min_child_weight of 175; let's see whether a nearby value (170) changes the score. 

In [16]:
# Single check at min_child_weight=170, just below the best coarse value (175).
params_170 = dict(objective='binary:logistic',
                  max_depth=1,
                  colsample_bytree=.2,
                  subsample=1,
                  min_child_weight=170)
xgb = XGBClassifier(**params_170)

print(modelfit(xgb, X, y))


0.915720192906


Decreasing it by 5 also caused an increase, so let's try scores in that range. Since the algorithm gives results quickly, we'll try a broad range. 

In [23]:
# Fine sweep of min_child_weight over 150..199. `min_child` and `scores` are
# parallel lists: scores[i] is the cross-validated ROC AUC for min_child[i].
min_child_weight = range(150, 200)
min_child = []
scores = []

for child_weight in min_child_weight:
    min_child.append(child_weight)
    xgb = XGBClassifier(objective='binary:logistic',
                        max_depth=1,
                        colsample_bytree=.2,
                        subsample=1,
                        min_child_weight=child_weight)
    scores.append(modelfit(xgb, X, y))

# list.index returns the first occurrence, so ties resolve to the smallest weight.
best_score = max(scores)
best_weight = min_child[scores.index(best_score)]
print("The best score was :{0}, for min_child: {1}".format(best_score, best_weight))


The best score was :0.91628353455, for min_child: 158


In [24]:
# Joint grid over learning_rate and n_estimators; the tree-shape and sampling
# parameters stay at the values chosen in the earlier sweeps.
learning_rate = [1e-3, 1e-2, .05, .07, .09, .1]
n_estimators = [100, 500, 1000, 3000, 5000, 7000, 9000, 11000, 13000]

for eta in learning_rate:
    for n_trees in n_estimators:
        print('testing rates value {0}, with n_estimators {1}'.format(eta, n_trees))

        xgb = XGBClassifier(objective='binary:logistic',
                            max_depth=1,
                            colsample_bytree=.2,
                            subsample=1,
                            min_child_weight=158,
                            n_estimators=n_trees,
                            learning_rate=eta)

        # Cross-validated ROC AUC for this (learning_rate, n_estimators) pair.
        print(modelfit(xgb, X, y))

testing rates value 0.001, with n_estimators 100
0.885354904387
testing rates value 0.001, with n_estimators 500
0.886195355555
testing rates value 0.001, with n_estimators 1000
0.888254871178
testing rates value 0.001, with n_estimators 3000
0.89012493576
testing rates value 0.001, with n_estimators 5000
0.904767231738
testing rates value 0.001, with n_estimators 7000
0.912527028881
testing rates value 0.001, with n_estimators 9000
0.915730827442
testing rates value 0.001, with n_estimators 11000
0.916850086942
testing rates value 0.001, with n_estimators 13000
0.91719431636
testing rates value 0.01, with n_estimators 100
0.888356969525
testing rates value 0.01, with n_estimators 500
0.903279976499
testing rates value 0.01, with n_estimators 1000
0.916295426205
testing rates value 0.01, with n_estimators 3000
0.917440677463
testing rates value 0.01, with n_estimators 5000
0.916808788925
testing rates value 0.01, with n_estimators 7000
0.91644261091
testing rates value 0.01, with n_est

So the final eta (learning_rate) value here is 0.07 with n_estimators 500
and a score of 0.918584094008. 


Now we'll do the test set. 

In [33]:
# Held-out evaluation: train on the 19h set (X, y) and score on the 12h set.
df_test = pd.read_csv("TESSfield_12h_-20d_combinedfeatures_try2.csv", index_col=0)

# Same columns are removed as for the training feature matrix.
test_non_feature_cols = ['Ids', 'CatalogY', 'ManuleY', 'CombinedY',
                         'Catalog_Period', 'Depth', 'Catalog_Epoch', 'SNR']
X_test = df_test.drop(test_non_feature_cols, axis=1)
y_test = df_test['CombinedY']

# Final hyper-parameters selected by the sweeps in the cells above.
final_params = dict(objective='binary:logistic',
                    max_depth=1,
                    colsample_bytree=.2,
                    subsample=1,
                    min_child_weight=158,
                    n_estimators=500,
                    learning_rate=.07)
xgb = XGBClassifier(**final_params)

xgb.fit(X, y)  # fitting on the 19h set

# Probability of the second class for every row of the 12h set.
y_pred_test = xgb.predict_proba(X_test)[:, 1]

# pr_auc is computed alongside roc_auc but only roc_auc is printed here.
pr_auc = metrics.average_precision_score(y_test, y_pred_test)
roc_auc = metrics.roc_auc_score(y_test, y_pred_test)

print('The roc_auc score is: {0}'.format(roc_auc))

The roc_auc score is: 0.905512732443



Finally we'll try ensembling the model tuned for PR AUC with the model tuned for ROC AUC to see the effect. 

In [36]:
from sklearn.ensemble import VotingClassifier

# First member: presumably the hyper-parameters found when tuning for PR AUC
# (they are not derived in this notebook -- TODO confirm their source).
xgb_pr_auc = XGBClassifier(objective='binary:logistic',
                           max_depth=1,
                           colsample_bytree=.4,
                           subsample=.4,
                           min_child_weight=7,
                           n_estimators=3000,
                           learning_rate=.01)

# Second member: the hyper-parameters selected for ROC AUC in this notebook.
xgb_roc_auc = XGBClassifier(objective='binary:logistic',
                            max_depth=1,
                            colsample_bytree=.2,
                            subsample=1,
                            min_child_weight=158,
                            n_estimators=500,
                            learning_rate=.07)

estimators = [('xgb1', xgb_pr_auc), ('xgb2', xgb_roc_auc)]

# voting='soft' combines the members' predicted probabilities, which is what
# predict_proba below relies on.
ensemble = VotingClassifier(estimators, n_jobs=-1, voting='soft')

# Train on the 19h set, score on the 12h set.
ensemble.fit(X, y)
y_pred = ensemble.predict_proba(X_test)[:, 1]

# pr_auc is computed but not printed; pr_roc holds the ROC AUC despite its name.
pr_auc = metrics.average_precision_score(y_test, y_pred)
pr_roc = metrics.roc_auc_score(y_test, y_pred)

print('pr_roc: {0}'.format(pr_roc))

pr_roc: 0.907411232318


We see that this simple ensemble caused a minor increase in score at the 3rd decimal place. However, future additions to the ensemble should include models that aren't GBMs in order to increase its diversity. 