In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [10]:
# Read the labelled training set from train.csv (path is relative to the
# notebook's working directory) and preview the first rows.
train_df = pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [7]:
# Read the test set from test.csv (path is relative to the notebook's working
# directory) and preview the first rows.
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,...,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,...,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,...,4,0,2,4,0,0,0,0,0,0
3,3,0,1,6,0,0,1,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,4,5,1,7,0,0,0,0,0,1,...,4,0,0,4,0,1,1,0,0,1


In [13]:
# Features: every training column except the row identifier and the label.
x_train = train_df.drop(columns=["id", "target"])
# Labels as a plain NumPy array.
y_train = train_df["target"].values

# The test frame only needs its identifier column removed.
x_test = test_df.drop(columns=["id"])

In [15]:
# Hold out 20% of the labelled rows for validation (fixed seed for a
# reproducible split).
#
# The inputs are rebuilt from train_df instead of being read from the existing
# x_train / y_train variables: this cell overwrites those two names, so feeding
# them back in would split an already-split training set again every time the
# cell is re-run. Deriving from train_df keeps the cell idempotent.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["id", "target"], axis=1),
    train_df["target"].values,
    test_size=0.2,
    random_state=1337,
)

In [32]:
# Booster configuration handed to xgb.train.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.02,
    'max_depth': 4,
}

# Optional GPU settings, left disabled:
#   params['gpu_id'] = 0
#   params['max_bin'] = 16
#   params['tree_method'] = 'gpu_hist'


In [33]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Compute the (unnormalized) Gini score of ``pred`` against ``actual``.

    Rows are ordered by prediction, highest first; ties are broken by the
    original position so the result is deterministic. The score is the
    normalized cumulative sum of ``actual`` in that order, minus the
    ``(n + 1) / 2`` baseline term, divided by ``n``.

    Parameters
    ----------
    actual : sequence of numbers
        Ground-truth values, same length as ``pred``.
    pred : sequence of numbers
        Predicted scores used only for ranking.
    cmpcol, sortcol : int
        Unused; kept so existing callers that pass them keep working.

    Returns
    -------
    float
        The Gini score. The division by ``sum(actual)`` is not guarded, so an
        all-zero ``actual`` does not yield a meaningful value.

    Raises
    ------
    AssertionError
        If ``actual`` and ``pred`` differ in length.
    """
    assert len(actual) == len(pred)
    n_rows = len(actual)

    # Columns: 0 = actual, 1 = prediction, 2 = original row position.
    # The builtin ``float`` is used because the ``np.float`` alias no longer
    # exists in current NumPy releases; it referred to the same type.
    table = np.asarray(np.c_[actual, pred, np.arange(n_rows)], dtype=float)

    # lexsort treats the LAST key as primary: descending prediction first,
    # then ascending original position for ties.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    table = table[order]

    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n_rows + 1) / 2.
    return gini_sum / n_rows

def gini_normalized(a, p):
    """Return the Gini score of predictions ``p`` relative to a perfect ranking.

    The score of ``p`` against the actual values ``a`` is divided by the score
    obtained when the actual values themselves are used as the predictions.
    """
    model_score = gini(a, p)
    perfect_score = gini(a, a)
    return model_score / perfect_score

def gini_xgb(preds, dtrain):
    """Custom XGBoost evaluation callback reporting the normalized Gini.

    ``dtrain`` must expose ``get_label()``; its labels are scored against
    ``preds``. Returns a one-element list holding a ``('gini', score)`` pair.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

In [34]:

# Wrap the training and validation splits in XGBoost's DMatrix container.
d_train = xgb.DMatrix(data=x_train, label=y_train)
d_valid = xgb.DMatrix(data=x_valid, label=y_valid)

# Both sets are evaluated every round and reported under these names.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train for up to 2000 rounds, logging every 10th round and stopping once the
# monitored metric has not improved for 50 rounds. gini_xgb is supplied as a
# custom metric, and maximize=True marks higher scores as better.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=2000,
    evals=watchlist,
    feval=gini_xgb,
    maximize=True,
    early_stopping_rounds=50,
    verbose_eval=10,
)

[0]	train-auc:0.594979	valid-auc:0.590716	train-gini:0.189201	valid-gini:0.184556
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 50 rounds.
[10]	train-auc:0.613184	valid-auc:0.609746	train-gini:0.226842	valid-gini:0.220043
[20]	train-auc:0.616377	valid-auc:0.611917	train-gini:0.233002	valid-gini:0.224294
[30]	train-auc:0.617968	valid-auc:0.612641	train-gini:0.236231	valid-gini:0.225726
[40]	train-auc:0.618841	valid-auc:0.613424	train-gini:0.23798	valid-gini:0.227284
[50]	train-auc:0.619847	valid-auc:0.61424	train-gini:0.239977	valid-gini:0.228911
[60]	train-auc:0.620693	valid-auc:0.615154	train-gini:0.24165	valid-gini:0.230758
[70]	train-auc:0.621468	valid-auc:0.615791	train-gini:0.243159	valid-gini:0.232049
[80]	train-auc:0.622319	valid-auc:0.616514	train-gini:0.244858	valid-gini:0.233491
[90]	train-auc:0.623051	valid-auc:0.617591	train-gini:0.24623	valid-gini:0.235498
[100]	train-auc:0.624101	valid

In [36]:
# Keep the identifiers so each prediction can be matched to its test row.
id_test = test_df['id'].values

# The test set has no ground truth, so the DMatrix is built from features only.
# (Attaching y_valid here would pair validation labels with unrelated test
# rows, and the two sets do not have the same number of rows.)
d_test = xgb.DMatrix(x_test)
xgb_pred = bst.predict(d_test)

# Write the submission file: one row per test id with its predicted score.
output = pd.DataFrame({'id': id_test, 'target': xgb_pred})
output.to_csv("simple_sub.csv", index=False)