In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import auc
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn import metrics

### Load Data

In [20]:
pos_data = train_numeric_reduced.loc[train_numeric_reduced['Response'] == 1] 
neg_data = train_numeric_reduced.loc[train_numeric_reduced['Response'] == 0]
pos_data = pos_data.apply(np.random.permutation)
neg_data = neg_data.apply(np.random.permutation)
np.size(pos_data,0), np.size(neg_data,0)

(6879, 1176868)

In [21]:
pos_num = np.size(pos_data,0)
neg_num = np.size(neg_data,0)
train_pos_num = int(pos_num*0.5+1)
train_neg_num = int(neg_num*0.5)
val_pos_num = int(pos_num*0.25+1)
val_neg_num = int(neg_num*0.25)
test_pos_num= int(pos_num*0.25)
test_neg_num = int(neg_num*0.25)
train_pos_num + val_pos_num + test_pos_num, train_neg_num + val_neg_num + test_neg_num

(6879, 1176868)

In [22]:
train_pos_id = pd.read_csv('data/NewDataSet/train_pos_id.csv')
train_neg_id = pd.read_csv('data/NewDataSet/train_neg_id.csv')
val_pos_id = pd.read_csv('data/NewDataSet/val_pos_id.csv')
val_neg_id = pd.read_csv('data/NewDataSet/val_neg_id.csv')
test_pos_id = pd.read_csv('data/NewDataSet/test_pos_id.csv')
test_neg_id = pd.read_csv('data/NewDataSet/test_neg_id.csv')
train_id = pd.read_csv('data/NewDataSet/train_id.csv')
val_id = pd.read_csv('data/NewDataSet/val_id.csv')
test_id = pd.read_csv('data/NewDataSet/test_id.csv')

In [23]:
# train_pos = pos_data.iloc[0:train_pos_num]
# np.size(train_pos,0)
train_pos = pos_data.loc[pos_data['Id'].isin(train_id['Id'])]
train_neg = neg_data.loc[neg_data['Id'].isin(train_id['Id'])]
np.size(train_pos, 0), np.size(train_neg, 0)

(3440, 588434)

In [24]:
# train_neg_num = int(neg_num*0.5)
# train_neg = neg_data.iloc[0:train_neg_num]
# np.size(train_neg, 0)
val_pos = pos_data.loc[pos_data['Id'].isin(val_id['Id'])]
val_neg = neg_data.loc[neg_data['Id'].isin(val_id['Id'])]
np.size(val_pos, 0), np.size(val_neg,0)

(1720, 294217)

In [25]:
test_pos = pos_data.loc[pos_data['Id'].isin(test_id['Id'])]
test_neg = neg_data.loc[neg_data['Id'].isin(test_id['Id'])]
np.size(test_pos, 0), np.size(test_neg,0)

(1719, 294217)

### Merge other tables

In [43]:
id_feature=pd.read_csv('data/idfeature.csv')
id_feature.shape

(2367494, 9)

In [44]:
id_feature.head()

Unnamed: 0.1,Unnamed: 0,index,Id,Response,StartTime,0_¯\_(ツ)_/¯_1,0_¯\_(ツ)_/¯_2,0_¯\_(ツ)_/¯_3,0_¯\_(ツ)_/¯_4
0,2083747,2083747,1800056,,-1.0,2,-1,9999999,-1
1,2083748,2083748,1800057,,-1.0,1,-2,1,-2
2,2083749,2083749,1800059,,-1.0,2,-6,2,-6
3,2083750,2083750,1800065,,-1.0,6,-1,6,-1
4,2083751,2083751,1800066,,-1.0,1,-5,1,-5


In [None]:
id_feature

In [172]:
pd.merge(train_numeric_reduced, id_feature, on='Id', how='left')
#train_reduced_data = pd.merge(train_numeric_reduced, id_feature, on='Id', how='left')
#train = train_numeric_reduced

In [None]:
id_feature

### Down Sampling

In [26]:
test = test_pos.append(test_neg)
test.shape

(295936, 67)

In [27]:
y_true = test['Response'].values

In [31]:
def mcc(tp, tn, fp, fn):
    sup = (tp * tn - fp * fn)*1.0
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0
    else:
        return sup / np.sqrt(inf)

def eval_mcc(y_true, y_prob, show=False):
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump  # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    mccs = np.zeros(n)
    for i in range(n):
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        new_mcc = mcc(tp, tn, fp, fn)
        mccs[i] = new_mcc
        if new_mcc >= best_mcc:
            best_mcc = new_mcc
            best_id = i
    
    #print(mccs)
    if show:
        best_proba = y_prob[idx[best_id]]
        y_pred = (y_prob > best_proba).astype(int)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc


def mcc_eval(y_prob, dtrain):
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', best_mcc

## Training xgb

In [37]:
ratio = [1, 3, 10, 171] #, 20, 30, 40, 50, 100, 150, 171]
listp=[x for x in pos_data.columns if x not in ["Id","Response"]]

In [38]:
res_columns = ['pn_ratio', 'train_num', 'val_num', 'test_num', 'fpr0','fpr1', 'tpr0','tpr1', 'auc']#, 'mcc']
res = pd.DataFrame(columns = res_columns)

In [39]:
param = {'max_depth':10, 'eta':.1, 'silent':1}
param['colsample_bytree'] = 0.8
param['subsample'] = 0.8
param['min_child_weight'] = 3
param['base_score'] = 0.005
param['objective'] = "binary:logistic"
param['n_estimators']=200
param['learning_rate'] = 0.05
param['eval_metric'] = "auc"

In [40]:
for i in ratio:
    print('** Pos-Neg Ratio = 1:{0}'.format(i))
    train_neg_num = train_pos_num * i
    val_neg_num = val_pos_num * i
    
    train = train_pos.append(train_neg.iloc[0:train_neg_num])
    val = val_pos.append(val_neg.iloc[0:val_neg_num])
    
    Xtrain=train[listp]
    Ytrain=train['Response']
    Xval = val[listp]
    Yval = val['Response']
    
    dvisibletrain = xgb.DMatrix(Xtrain,Ytrain,silent=True)
    dvisibleval = xgb.DMatrix(Xval,Yval, silent=True)
    watchlist = [(dvisibletrain, 'train'), (dvisibleval, 'val')]
    
    ## how many trees 
    num_round=100
    clf=xgb.train(param, dvisibletrain, num_round, 
                  evals=watchlist,
                  early_stopping_rounds=50,
                  #feval = mcc_eval,
                  maximize=True)
    dtest = xgb.DMatrix(test[listp],silent=True)
    prediction = clf.predict(dtest)
    y_pred = (prediction > .08).astype(int)
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    auc = metrics.auc(fpr, tpr)
    #mcc = eval_mcc(y_true, y_pred)
    res.loc[len(res)]= [i, np.size(train,0), np.size(val,0), np.size(test,0), fpr[0], fpr[1], tpr[0], tpr[1], auc]#, mcc]

** Pos-Neg Ratio = 1:1
[0]	train-auc:0.5	val-auc:0.5
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 50 rounds.
[1]	train-auc:0.711491	val-auc:0.510248
[2]	train-auc:0.780757	val-auc:0.522429
[3]	train-auc:0.832035	val-auc:0.524052
[4]	train-auc:0.858134	val-auc:0.516846
[5]	train-auc:0.884621	val-auc:0.510456
[6]	train-auc:0.901188	val-auc:0.512197
[7]	train-auc:0.91285	val-auc:0.506338
[8]	train-auc:0.928413	val-auc:0.512346
[9]	train-auc:0.935946	val-auc:0.508978
[10]	train-auc:0.942951	val-auc:0.510105
[11]	train-auc:0.953782	val-auc:0.508001
[12]	train-auc:0.960934	val-auc:0.506534
[13]	train-auc:0.968672	val-auc:0.508113
[14]	train-auc:0.972034	val-auc:0.506732
[15]	train-auc:0.97542	val-auc:0.50385
[16]	train-auc:0.978794	val-auc:0.506651
[17]	train-auc:0.980973	val-auc:0.505026
[18]	train-auc:0.983008	val-auc:0.503862
[19]	train-auc:0.985551	val-auc:0.505782
[20]	train-auc:0.986824	val-auc:0.507955


In [42]:
res.to_csv('results_f65_11242042.csv', index=False)

In [41]:
res

Unnamed: 0,pn_ratio,train_num,val_num,test_num,fpr0,fpr1,tpr0,tpr1,auc
0,1.0,6880.0,3440.0,295936.0,0.0,1.0,0.0,1.0,0.5
1,3.0,13760.0,6880.0,295936.0,0.0,0.999813,0.0,1.0,0.500093
2,10.0,37840.0,18920.0,295936.0,0.0,0.465021,0.0,0.442699,0.488839
3,171.0,591680.0,295840.0,295936.0,0.0,1.0,0.0,1.0,0.5


In [87]:
#prec, rec, thres = metrics.precision_recall_curve(y_true, y_pred)
prec, rec, thres

array([ 0.00580869,  0.234375  ,  1.        ])

In [None]:
### Test : <0.1 missing rate features 104


