# The following notebook takes the combined feature file and applies a couple different ways of testing the effects of each feature on the auc_pr score of xgboost on default parameters. 

The following order is taken: 

1)The features are deleted one at at time regardless of the change in scores from right to left and the scores are reported.

2) The features are deleted from left to right one at a time regardless of change in scores. 

3) The features are dropped if they create a positive difference in the score. 

4) The features are temporarily deleted one at a time from right to left depedning on how their removal effects the pr_auc score. In this portion dropping the feature must cause a score change above .005 in order to be removed. 

5) The new feature file from 3 is taken and features are dropped randomly if the score change is greater than the threshold of .005. 

6) The fulll feature file is used and features are dropped randomly if their score is above a threshold. 

In [1]:
# deleting the features one at a time indiscriminantly from the right
import pandas as pd
from sklearn import metrics
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_val_predict

# open Chelsea's combined feature file
# remove the last row to make the dimensions match with the raw LC file
data_combined_features = pd.read_csv("TESSfield_05h_01d_combinedfeatures.csv",
                                     header=0, index_col=0)
data_combined_features = data_combined_features.drop(data_combined_features.index[-1])

# drop the columns that aren't features and get targets 
X = data_combined_features.drop(['Ids', 'CatalogY', 'ManuleY', 'CombinedY',
                                 'Catalog_Period', 'Depth', 'Catalog_Epoch', 'SNR'],
                                axis=1)

target = 'CombinedY'
y = data_combined_features['CombinedY']
predictors = [x for x in X.columns if x not in [target]]


scores = list()
def modelfit(alg, predictors, df, cv_folds=4):

    # StratifiedKFold automatically used by cross_val_predict on binary classification 
    # bear in mind that this does not use trapezoid rule
    predictors = [x for x in X.columns if x not in [target]]
    while len(predictors) > 0:
        for x in predictors:
            y_pred = cross_val_predict(alg, df[predictors], df[target], cv=cv_folds)
            pr_auc = metrics.average_precision_score(df[target], y_pred)
            # Print model report:
            print "pr_auc model score: {0}".format(pr_auc)
            print "Dropping predictor {0}".format(predictors[-1])
            del predictors[-1]
            scores.append(pr_auc)

xgb1 = XGBClassifier(
   objective='binary:logistic')

modelfit(xgb1, predictors, data_combined_features)



pr_auc model score: 0.699410990974
Dropping predictor P19
pr_auc model score: 0.70257779817
Dropping predictor P18
pr_auc model score: 0.701695051255
Dropping predictor P17
pr_auc model score: 0.708411308736
Dropping predictor P16
pr_auc model score: 0.705255657316
Dropping predictor P15
pr_auc model score: 0.703905570515
Dropping predictor P14
pr_auc model score: 0.699666604825
Dropping predictor P13
pr_auc model score: 0.697122253662
Dropping predictor P12
pr_auc model score: 0.693576606645
Dropping predictor P11
pr_auc model score: 0.702092662611
Dropping predictor P10
pr_auc model score: 0.714701020128
Dropping predictor P9
pr_auc model score: 0.706627389138
Dropping predictor P8
pr_auc model score: 0.695744191453
Dropping predictor P7
pr_auc model score: 0.696236533057
Dropping predictor P6
pr_auc model score: 0.691200028507
Dropping predictor P5
pr_auc model score: 0.7007406319
Dropping predictor P4
pr_auc model score: 0.693643577149
Dropping predictor P3
pr_auc model score: 0.69

We can see that by deleting one feature at a time, we achieve a max pr_auc model score of 0.714701020128 after deleting the first 10 PCA components. What if we do the same, but starting from the BLA features? 

In [7]:
# deleting the features one at a time indiscriminantly from the left


# drop the columns that aren't features and get targets, we need to recall the features
# as they were altered the last experiment 
X = data_combined_features.drop(['Ids', 'CatalogY', 'ManuleY', 'CombinedY',
                                 'Catalog_Period', 'Depth', 'Catalog_Epoch', 'SNR'],
                                axis=1)

target = 'CombinedY'
y = data_combined_features['CombinedY']
predictors = [x for x in X.columns if x not in [target]]

scores = list()
def modelfit(alg, predictors, df, cv_folds=4):

    # StratifiedKFold automatically used by cross_val_predict on binary classification 
    # bear in mind that this does not use trapezoid rule
    predictors = [x for x in X.columns if x not in [target]]
    while len(predictors) > 0:
        for x in predictors:
            y_pred = cross_val_predict(alg, df[predictors], df[target], cv=cv_folds)
            pr_auc = metrics.average_precision_score(df[target], y_pred)
            # Print model report:
            print "pr_auc model score: {0}".format(pr_auc)
            print "Dropping predictor {0}".format(predictors[0])
            del predictors[0]
            scores.append(pr_auc)

xgb1 = XGBClassifier(
   objective='binary:logistic')

modelfit(xgb1, predictors, data_combined_features)

pr_auc model score: 0.699410990974
Dropping predictor BLS_Period_1_0
pr_auc model score: 0.7007406319
Dropping predictor BLS_Tc_1_0
pr_auc model score: 0.708888928806
Dropping predictor BLS_SN_1_0
pr_auc model score: 0.690747751006
Dropping predictor BLS_SR_1_0
pr_auc model score: 0.677529510817
Dropping predictor BLS_SDE_1_0
pr_auc model score: 0.69309749028
Dropping predictor BLS_Depth_1_0
pr_auc model score: 0.703466408039
Dropping predictor BLS_Qtran_1_0
pr_auc model score: 0.685181619696
Dropping predictor BLS_Qingress_1_0
pr_auc model score: 0.686898037606
Dropping predictor BLS_OOTmag_1_0
pr_auc model score: 0.688631705948
Dropping predictor BLS_i1_1_0
pr_auc model score: 0.686568538615
Dropping predictor BLS_i2_1_0
pr_auc model score: 0.684800265022
Dropping predictor BLS_deltaChi2_1_0
pr_auc model score: 0.644127599675
Dropping predictor BLS_fraconenight_1_0
pr_auc model score: 0.66559493405
Dropping predictor BLS_Npointsintransit_1_0
pr_auc model score: 0.660582119336
Droppin

We see that this method does not yeild any significant score increases. In the next cell the scores are compared to eachother before deleting them, a threshold of .005 is chosen for a feature to be dropped. 

In [16]:
# the feature is deleted if a positive change in score occurs 
def modelfit(alg, X, y, cv_folds=4):
    # StratifiedKFold automatically used by cross_val_predict on binary classification
    # bear in mind that this does not use trapezfoid rule
    # y_pred calculates the probabilities that each value is 1 or 0 using stratified cross validation
    # pr_auc calculates the area under a precision recall curve
    y_pred = cross_val_predict(alg, X, y, cv=cv_folds)
    pr_auc = metrics.average_precision_score(y, y_pred)
    return pr_auc

# feature testing function, in future will select
# multiple columns at a time 
def feature_testing(alg, X, y, threshold, num_features):
    print 'testing features linearly, {0} at a time'.format(num_features)
    X = X
    X_copy = X.copy()
    score_list = []
    for i, column in enumerate(X.columns):
        print 'Working on {0}'.format(column)
        df_temp = X_copy.drop(column, axis=1)  # temporarily drop features 
        score_temp = modelfit(alg, df_temp, y)  # test the model with the feature dropped
        score_list.append(score_temp)
        score_perm = modelfit(alg, X, y)  # test the model without the feature dropped
        print score_temp, score_perm 
        # if the difference between the scores is >= threshold features' dropped
        if score_temp > score_perm:  
            X.drop(column, axis=1, inplace=True)
            print 'dropped feature {0}'.format(column)
        else:
            df_temp = X

    print "The new feature file has {0} features, which are: {1}".format(len(X.columns), X.columns)
    print "The pr_auc score training on the new features is {0}".format(modelfit(xgb1, X, y))
    X.to_csv('reduced_combined_features.csv')
    return X


xgb1 = XGBClassifier(objective='binary:logistic')
feature_testing(xgb1, X, y, threshold=.005, num_features=1)


testing features linearly, 1 at a time
Working on BLS_Period_1_0
0.706977402686 0.705737072347
dropped feature BLS_Period_1_0
Working on BLS_Tc_1_0
0.716382551255 0.706977402686
dropped feature BLS_Tc_1_0
Working on BLS_SN_1_0
0.687181435764 0.730375516495
Working on BLS_SR_1_0
0.713356209433 0.730375516495
Working on BLS_SDE_1_0
0.708888928806 0.730375516495
Working on BLS_Qtran_1_0
0.704434002432 0.730375516495
Working on BLS_Qingress_1_0
0.713356209433 0.730375516495
Working on BLS_OOTmag_1_0
0.705737072347 0.730375516495
Working on BLS_i1_1_0
0.711559727021 0.730375516495
Working on BLS_i2_1_0
0.712597113111 0.730375516495
Working on BLS_deltaChi2_1_0
0.680849933295 0.730375516495
Working on BLS_Npointsintransit_1_0
0.70758745564 0.730375516495
Working on BLS_Ntransits_1_0
0.70706307156 0.730375516495
Working on BLS_Npointsbeforetransit_1_0
0.708411308736 0.730375516495
Working on BLS_Npointsaftertransit_1_0
0.710733512794 0.730375516495
Working on BLS_Rednoise_1_0
0.713872287098 0

Unnamed: 0,BLS_SN_1_0,BLS_SR_1_0,BLS_SDE_1_0,BLS_Qtran_1_0,BLS_Qingress_1_0,BLS_OOTmag_1_0,BLS_i1_1_0,BLS_i2_1_0,BLS_deltaChi2_1_0,BLS_Npointsintransit_1_0,...,P10,P11,P12,P13,P14,P15,P16,P17,P18,P19
0,3.15761,0.00057,3.17175,0.09412,0.13533,6.69205,0.80702,0.90114,-3.63099,44.0,...,0.243625,-0.034061,-0.015203,0.010847,0.045057,0.160046,-0.076939,-0.013390,0.286184,-0.095256
1,3.39857,0.00240,3.47764,0.00960,0.16927,8.50662,0.16537,0.17496,-15.08698,6.0,...,0.945552,0.014085,0.893782,-0.092023,-0.384054,1.157046,-0.354497,0.177996,-0.181550,1.537816
2,3.54161,0.00379,3.68089,0.08176,0.22902,8.99762,0.91567,0.99742,-27.33371,38.0,...,-0.055822,2.301863,0.725926,-0.876449,0.699427,-0.749042,-0.168139,-0.262213,-0.663934,1.133305
3,3.32998,0.00442,3.73826,0.07359,0.26568,9.09469,0.00000,0.07359,-22.01950,40.0,...,-0.165849,-1.011049,-1.355278,0.136422,-0.547113,0.685484,-0.732569,0.466468,-0.762763,-1.792199
4,3.89927,0.00140,4.35553,0.01057,0.20050,7.81212,0.05307,0.06364,-19.66739,5.0,...,0.372705,0.012813,-0.552150,0.413261,0.094028,-0.208469,-0.333133,0.167424,-0.485250,-0.266381
5,4.63677,0.00190,4.98802,0.01093,0.25147,7.98366,0.75233,0.76326,-14.71517,5.0,...,0.172163,0.441734,-0.072608,0.099632,-0.322196,0.400600,-0.126875,-0.015872,1.155116,-0.453791
6,3.78605,0.00180,4.07410,0.01203,0.25548,8.73057,0.48302,0.49505,-12.68264,6.0,...,0.021792,-0.088671,0.169079,1.094584,-0.942680,0.329634,-0.628445,0.753287,-0.099502,1.297456
7,3.32529,0.00077,3.49558,0.04872,0.20843,7.62108,0.27443,0.32314,-9.84866,24.0,...,0.254001,-0.143716,0.043782,0.243294,0.145515,0.173636,0.026701,0.125622,0.052118,0.116465
8,4.55340,0.00039,4.43720,0.11215,0.17988,5.95826,0.12718,0.23933,-3.24836,65.0,...,0.135569,-0.044124,0.086307,0.056456,0.034038,0.100405,0.010317,-0.089762,0.056381,-0.002881
9,4.09386,0.00043,4.02285,0.37399,0.45043,6.09602,0.01163,0.38562,-3.46383,202.0,...,0.171071,-0.073520,0.068015,0.055449,0.042292,0.163543,-0.020399,-0.054816,0.031632,-0.003205


In [2]:
# deleting the features one at a time based on a threshold value


# fitting function 
def modelfit(alg, X, y, cv_folds=4):
    # StratifiedKFold automatically used by cross_val_predict on binary classification
    # bear in mind that this does not use trapezfoid rule
    # y_pred calculates the probabilities that each value is 1 or 0 using stratified cross validation
    # pr_auc calculates the area under a precision recall curve
    y_pred = cross_val_predict(alg, X, y, cv=cv_folds)
    pr_auc = metrics.average_precision_score(y, y_pred)
    return pr_auc

# feature testing function, in future will select
# multiple columns at a time 
def feature_testing(alg, X, y, threshold, num_features):
    print 'testing features linearly, {0} at a time'.format(num_features)
    X = X
    X_copy = X.copy()
    score_list = []
    for i, column in enumerate(X.columns):
        print 'Working on {0}'.format(column)
        df_temp = X_copy.drop(column, axis=1)  # temporarily drop features 
        score_temp = modelfit(alg, df_temp, y)  # test the model with the feature dropped
        score_list.append(score_temp)
        score_perm = modelfit(alg, X, y)  # test the model without the feature dropped
        print score_temp, score_perm 
        # if the difference between the scores is >= threshold features' dropped
        if score_temp - score_perm >= threshold:  
            X.drop(column, axis=1, inplace=True)
            print 'dropped feature {0}'.format(column)
        else:
            df_temp = X

    print "The new feature file has {0} features, which are: {1}".format(len(X.columns), X.columns)
    print "The pr_auc score training on the new features is {0}".format(modelfit(xgb1, X, y))
    X.to_csv('reduced_combined_features.csv')
    return X


xgb1 = XGBClassifier(objective='binary:logistic')
feature_testing(xgb1, X, y, threshold=.005, num_features=1)

testing features linearly, 1 at a time
Working on BLS_Period_1_0
0.7007406319 0.699410990974
Working on BLS_Tc_1_0
0.696236533057 0.699410990974
Working on BLS_SN_1_0
0.689364968058 0.699410990974
Working on BLS_SR_1_0
0.699410990974 0.699410990974
Working on BLS_SDE_1_0
0.703466408039 0.699410990974
Working on BLS_Depth_1_0
0.721974298407 0.699410990974
dropped feature BLS_Depth_1_0
Working on BLS_Qtran_1_0
0.687548825758 0.721974298407
Working on BLS_Qingress_1_0
0.703905570515 0.721974298407
Working on BLS_OOTmag_1_0
0.699410990974 0.721974298407
Working on BLS_i1_1_0
0.696485691649 0.721974298407
Working on BLS_i2_1_0
0.692151844713 0.721974298407
Working on BLS_deltaChi2_1_0
0.675700204108 0.721974298407
Working on BLS_fraconenight_1_0
0.712033480509 0.721974298407
Working on BLS_Npointsintransit_1_0
0.703466408039 0.721974298407
Working on BLS_Ntransits_1_0
0.701273037606 0.721974298407
Working on BLS_Npointsbeforetransit_1_0
0.700298066908 0.721974298407
Working on BLS_Npointsaf

Unnamed: 0,BLS_Period_1_0,BLS_Tc_1_0,BLS_SN_1_0,BLS_SR_1_0,BLS_SDE_1_0,BLS_Qtran_1_0,BLS_Qingress_1_0,BLS_OOTmag_1_0,BLS_i1_1_0,BLS_i2_1_0,...,P10,P11,P12,P13,P14,P15,P16,P17,P18,P19
0,1.644440,2.457828e+06,3.15761,0.00057,3.17175,0.09412,0.13533,6.69205,0.80702,0.90114,...,0.243625,-0.034061,-0.015203,0.010847,0.045057,0.160046,-0.076939,-0.013390,0.286184,-0.095256
1,4.361578,2.457828e+06,3.39857,0.00240,3.47764,0.00960,0.16927,8.50662,0.16537,0.17496,...,0.945552,0.014085,0.893782,-0.092023,-0.384054,1.157046,-0.354497,0.177996,-0.181550,1.537816
2,1.383916,2.457828e+06,3.54161,0.00379,3.68089,0.08176,0.22902,8.99762,0.91567,0.99742,...,-0.055822,2.301863,0.725926,-0.876449,0.699427,-0.749042,-0.168139,-0.262213,-0.663934,1.133305
3,1.392855,2.457827e+06,3.32998,0.00442,3.73826,0.07359,0.26568,9.09469,0.00000,0.07359,...,-0.165849,-1.011049,-1.355278,0.136422,-0.547113,0.685484,-0.732569,0.466468,-0.762763,-1.792199
4,1.963971,2.457827e+06,3.89927,0.00140,4.35553,0.01057,0.20050,7.81212,0.05307,0.06364,...,0.372705,0.012813,-0.552150,0.413261,0.094028,-0.208469,-0.333133,0.167424,-0.485250,-0.266381
5,1.009763,2.457828e+06,4.63677,0.00190,4.98802,0.01093,0.25147,7.98366,0.75233,0.76326,...,0.172163,0.441734,-0.072608,0.099632,-0.322196,0.400600,-0.126875,-0.015872,1.155116,-0.453791
6,1.805342,2.457828e+06,3.78605,0.00180,4.07410,0.01203,0.25548,8.73057,0.48302,0.49505,...,0.021792,-0.088671,0.169079,1.094584,-0.942680,0.329634,-0.628445,0.753287,-0.099502,1.297456
7,1.263810,2.457827e+06,3.32529,0.00077,3.49558,0.04872,0.20843,7.62108,0.27443,0.32314,...,0.254001,-0.143716,0.043782,0.243294,0.145515,0.173636,0.026701,0.125622,0.052118,0.116465
8,4.539142,2.457828e+06,4.55340,0.00039,4.43720,0.11215,0.17988,5.95826,0.12718,0.23933,...,0.135569,-0.044124,0.086307,0.056456,0.034038,0.100405,0.010317,-0.089762,0.056381,-0.002881
9,4.536980,2.457828e+06,4.09386,0.00043,4.02285,0.37399,0.45043,6.09602,0.01163,0.38562,...,0.171071,-0.073520,0.068015,0.055449,0.042292,0.163543,-0.020399,-0.054816,0.031632,-0.003205


We can see that by comparing one feature to the next, with a threshold of .005 we can achieve a max score of 0.721974298407. The features that was dropped is BLS_Depth_1_0. Next we will take this new feature file with the new score, and try deleting features randomly to see if we can improve the score.  

In [3]:
import random

def feature_selection_random(X, y, alg, threshold, num_features=1):
    X = X
    X_copy = X.copy()
    print "The default score on the full dataset is: {0} ".format(modelfit(xgb1,
                                                                           X, y)), 
    print "Testing Features Randomly, {0} at a time".format(num_features)
    score_list = []
    rand_ints = random.sample(xrange(0, len(X.columns)), len(X.columns))
    for i, rand_int in enumerate(rand_ints):
        df_temp = X_copy.drop(X.columns[rand_int], axis=1)
        score_temp = modelfit(alg, df_temp, y)
        score_list.append(score_temp)
        score_perm = modelfit(alg, X, y)
        print score_temp, score_perm
        if score_temp - score_perm >= threshold:
            X.drop(X.columns[rand_int], axis=1, inplace=True)
            print 'dropped feature {0}'.format(X.columns[rand_int])
            filter(lambda a: a != len(X.columns), rand_ints)
        else:
            df_temp = X

feature_selection_random(X, y, xgb1, .005)

The default score on the full dataset is: 0.721974298407  Testing Features Randomly, 1 at a time
0.688955495642 0.721974298407
0.712033480509 0.721974298407
0.705255657316 0.721974298407
0.710213247724 0.721974298407
0.667301434733 0.721974298407
0.715170838132 0.721974298407
0.717003889458 0.721974298407
0.698104443179 0.721974298407
0.708888928806 0.721974298407
0.703154582109 0.721974298407
0.715729953241 0.721974298407
0.715170838132 0.721974298407
0.703466408039 0.721974298407
0.721974298407 0.721974298407
0.70758745564 0.721974298407
0.693938853933 0.721974298407
0.712597113111 0.721974298407
0.705737072347 0.721974298407
0.70257779817 0.721974298407
0.703905570515 0.721974298407
0.708888928806 0.721974298407
0.705737072347 0.721974298407
0.699992015513 0.721974298407
0.705255657316 0.721974298407
0.682954026389 0.721974298407
0.710733512794 0.721974298407
0.721424402787 0.721974298407
0.710733512794 0.721974298407
0.718855645389 0.721974298407
0.705737072347 0.721974298407
0.704

So we see that by randomly deleting one feature at a time we weren't able to surpass the previous score. What if we put the feature back and try deleting them randomly one at a time? 

In [15]:
# fitting function 
def modelfit(alg, X, y, cv_folds=4):
    # StratifiedKFold automatically used by cross_val_predict on binary classification
    # bear in mind that this does not use trapezfoid rule
    # y_pred calculates the probabilities that each value is 1 or 0 using stratified cross validation
    # pr_auc calculates the area under a precision recall curve
    y_pred = cross_val_predict(alg, X, y, cv=cv_folds)
    pr_auc = metrics.average_precision_score(y, y_pred)
    return pr_auc

# drop the columns that aren't features and get targets 
X = data_combined_features.drop(['Ids', 'CatalogY', 'ManuleY', 'CombinedY',
                                 'Catalog_Period', 'Depth', 'Catalog_Epoch', 'SNR'],
                                axis=1)

import random

def feature_selection_random(X, y, alg, threshold, num_features=1):
    X = X
    X_copy = X.copy()
    print "The default score on the full dataset is: {0} ".format(modelfit(xgb1,
                                                                           X, y)), 
    print "Testing Features Randomly, {0} at a time".format(num_features)
    score_list = []
    
    for i, column in enumerate(X.columns):
        rand_ints = random.sample(xrange(0, len(X.columns)), len(X.columns))
        rand_int = rand_ints[i]
        df_temp = X_copy.drop(X.columns[rand_int], axis=1)
        score_temp = modelfit(alg, df_temp, y)
        score_list.append(score_temp)
        score_perm = modelfit(alg, X, y)
        print score_temp, score_perm
        if score_temp - score_perm >= threshold:
            X.drop(X.columns[rand_int], axis=1, inplace=True)
            print 'dropped feature {0}'.format(X.columns[rand_int])
            filter(lambda x: x == len(X.columns), rand_ints)
        else:
            df_temp = X

feature_selection_random(X, y, xgb1, .005)

The default score on the full dataset is: 0.699410990974  Testing Features Randomly, 1 at a time
0.688955495642 0.699410990974
0.712033480509 0.699410990974
dropped feature BLS_Npointsintransit_1_0
0.721974298407 0.712033480509
dropped feature BLS_Qtran_1_0
0.705737072347 0.705737072347
0.689364968058 0.705737072347
0.702092662611 0.705737072347
0.70257779817 0.705737072347
0.7007406319 0.705737072347
0.6953404765 0.705737072347
0.706309542156 0.705737072347
0.703466408039 0.705737072347


KeyboardInterrupt: 

I can't seem to figure out why the score becomes lower after dropping the BLS_Qtrab_1_0 feature even though it passes the threshold test. 

## The highest score achieved was: 0.730375516495, using the method of dropping a feature  if any positive change in score occurs. The features that were dropped in this schema are:
## BLS_Period_1_0 and BLS_Tc_1_0. 