In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import auc
from sklearn.metrics import matthews_corrcoef, make_scorer


### Load Data

In [21]:
train_numeric_reduced=pd.read_csv('bosch_data/train_numeric_reduced.csv')

In [22]:
train_numeric_reduced.head()

Unnamed: 0.1,Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,388849,778109,0.062,-0.011,-0.179,-0.179,0.118,0.025,-0.015,-0.072,...,,,,,,,,,,0
1,809765,1619607,,,,,,,,,...,,,,,,,,,,0
2,548564,1097782,0.01,-0.011,-0.033,0.003,-0.056,0.07,0.0,0.008,...,,,,,,,,,,0
3,173194,346239,-0.199,-0.19,0.294,0.294,-0.056,-0.203,0.052,0.288,...,,,,,,,,,,0
4,209792,419392,-0.042,-0.034,0.348,0.33,0.074,0.025,0.0,-0.032,...,,,,,,,,,,0


In [23]:
id_feature=pd.read_csv('bosch_data/idfeature.csv')

In [24]:
id_feature.shape

(2367494, 9)

### Merge other tables

In [26]:
#train = pd.merge(train_numeric_reduced, id_feature, on='Id', how='left')

In [27]:
train = train_numeric_reduced

### Define score function

In [28]:
def mcc(tp, tn, fp, fn):
    sup = (tp * tn - fp * fn)*1.0
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0
    else:
        return sup / np.sqrt(inf)


def eval_mcc(y_true, y_prob, show=False):
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump  # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    mccs = np.zeros(n)
    for i in range(n):
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        new_mcc = mcc(tp, tn, fp, fn)
        mccs[i] = new_mcc
        if new_mcc >= best_mcc:
            best_mcc = new_mcc
            best_id = i
    
    #print(mccs)
    if show:
        best_proba = y_prob[idx[best_id]]
        y_pred = (y_prob > best_proba).astype(int)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc


def mcc_eval(y_prob, dtrain):
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', best_mcc




## Train graident boosting tree

In [None]:
#### Get feature and target label

In [42]:
import xgboost as xgb
listp=[x for x in train.columns if x not in ["Id","Unnamed: 0","Response"]]


X=train[listp]
Y=train['Response']

### Generate train table for xgb

In [43]:
dvisibletrain = xgb.DMatrix(X,Y,silent=True)

### Training xgb

In [44]:
param = {'max_depth':8, 'eta':.1, 'silent':1}
param['colsample_bytree'] = 0.5
param['subsample'] = 0.8
param['min_child_weight'] = 3
param['base_score'] = 0.005
param['objective'] = "binary:logistic"

watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')]
## how many trees 
num_round=50

clf=xgb.train(param, dvisibletrain, num_round, 
              evals=watchlist,feval=mcc_eval, early_stopping_rounds=50,
             maximize=True)

[0]	train-MCC:0.072475	val-MCC:0.072475
Multiple eval metrics have been passed: 'val-MCC' will be used for early stopping.

Will train until val-MCC hasn't improved in 50 rounds.
[1]	train-MCC:0.078619	val-MCC:0.078619
[2]	train-MCC:0.088287	val-MCC:0.088287
[3]	train-MCC:0.095305	val-MCC:0.095305
[4]	train-MCC:0.142567	val-MCC:0.142567
[5]	train-MCC:0.246954	val-MCC:0.246954
[6]	train-MCC:0.246954	val-MCC:0.246954
[7]	train-MCC:0.246954	val-MCC:0.246954
[8]	train-MCC:0.2424	val-MCC:0.2424
[9]	train-MCC:0.236345	val-MCC:0.236345
[10]	train-MCC:0.284129	val-MCC:0.284129
[11]	train-MCC:0.275941	val-MCC:0.275941
[12]	train-MCC:0.32317	val-MCC:0.32317
[13]	train-MCC:0.28517	val-MCC:0.28517
[14]	train-MCC:0.32317	val-MCC:0.32317
[15]	train-MCC:0.343494	val-MCC:0.343494
[16]	train-MCC:0.332374	val-MCC:0.332374
[17]	train-MCC:0.315666	val-MCC:0.315666
[18]	train-MCC:0.328713	val-MCC:0.328713
[19]	train-MCC:0.332374	val-MCC:0.332374
[20]	train-MCC:0.349291	val-MCC:0.349291
[21]	train-MCC:0.323

### Score on test data

In [34]:
test=pd.read_csv('bosch_data/test_numeric.csv')

In [45]:
dtest = xgb.DMatrix(test[listp],silent=True)

In [47]:
prediction = clf.predict(dtest)

In [52]:
y_pred = (prediction > .08).astype(int)

In [53]:
resultSubmission = pd.DataFrame({'Id':test['Id'],'Response':y_pred})

In [54]:
resultSubmission.to_csv('submitResponse.csv',index=False)

In [55]:
resultSubmission.shape

(1183748, 2)

In [56]:
resultSubmission.head()

Unnamed: 0,Id,Response
0,1,0
1,2,0
2,3,0
3,5,0
4,8,0
