In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
# train_test_split is imported from sklearn.model_selection: the legacy
# sklearn.cross_validation module was removed in scikit-learn 0.20, so the
# old import breaks a fresh "Restart & Run All".
from sklearn.model_selection import KFold, train_test_split

# Read in our input data
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# This prints out (rows, columns) in each dataframe
print('Train shape:', df_train.shape)
print('Test shape:', df_test.shape)

print('Columns:', df_train.columns)

# Labels and row identifiers are kept as plain numpy arrays
y_train = df_train['target'].values
id_train = df_train['id'].values
id_test = df_test['id'].values

# We drop these variables as we don't want to train on them
# The other 57 columns are all numerical and can be trained on without preprocessing
x_train = df_train.drop(['target', 'id'], axis=1).values
x_test = df_test.drop(['id'], axis=1).values




Train shape: (595212, 59)
Test shape: (892816, 58)
Columns: Index(['id', 'target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14',
       'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
       'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'p

In [2]:
# Preview the first rows of the training frame
df_train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [3]:
# Show the first 20 target labels
y_train[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [4]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Compute the (unnormalized) Gini coefficient of a ranking.

    Rows are ordered by ``pred`` descending, with ties broken by original
    position (earlier rows first). The score is derived from the cumulative
    sum of ``actual`` in that order.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 labels).
    pred : sequence of numbers
        Predicted scores used only for ranking; must have the same length
        as ``actual``.
    cmpcol, sortcol : int
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    float
        The Gini value. The result is NaN when ``actual`` sums to zero,
        because the cumulative share is then 0/0.

    Raises
    ------
    AssertionError
        If ``actual`` and ``pred`` differ in length.
    """
    assert len(actual) == len(pred)
    n_rows = len(actual)

    # Columns: actual, prediction, original row index.
    # The builtin ``float`` replaces the ``np.float`` alias, which was removed
    # in NumPy 1.24; both denote float64, so results are unchanged.
    table = np.asarray(np.c_[actual, pred, np.arange(n_rows)], dtype=float)

    # lexsort uses the LAST key as the primary one: sort by prediction
    # descending, then by original index ascending for ties.
    order = np.lexsort((table[:, 2], -table[:, 1]))
    ranked_actual = table[order, 0]

    total_losses = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total_losses

    gini_sum -= (n_rows + 1) / 2.
    return gini_sum / n_rows
 
def gini_normalized(a, p):
    """Return the Gini of predictions ``p`` against ``a``, scaled by the
    Gini obtained when ``a`` is ranked by itself."""
    achieved = gini(a, p)
    reference = gini(a, a)
    return achieved / reference

# Wrap the normalized Gini so it can be passed to XGBoost as a custom metric

def gini_xgb(preds, dtrain):
    """Evaluate normalized Gini for ``preds`` against the labels stored in
    ``dtrain`` and return it as a one-element list of ``(name, value)``."""
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

# Marker printed once this cell's definitions have been executed
print("def gini")

def gini


In [8]:
# DMatrix over the full training feature matrix (built without labels)
d_all_train = xgb.DMatrix(x_train)

def _predict_at_best_iteration(mdl, dmatrix):
    """Predict using only the boosting rounds up to the early-stopping optimum.

    A plain ``mdl.predict`` call is not guaranteed to stop at the best round,
    so the rounds trained after the best validation score would otherwise
    leak into the predictions.
    """
    best_iteration = getattr(mdl, 'best_iteration', None)
    if best_iteration is None:
        # No early-stopping information recorded; use the whole model.
        return mdl.predict(dmatrix)
    try:
        return mdl.predict(dmatrix, iteration_range=(0, best_iteration + 1))
    except TypeError:
        # Older xgboost releases do not accept ``iteration_range`` and expose
        # the same limit through ``ntree_limit`` instead.
        limit = getattr(mdl, 'best_ntree_limit', best_iteration + 1)
        return mdl.predict(dmatrix, ntree_limit=limit)


def cv_test(k,n_rounds):
    """Train one XGBoost model per K-fold split and predict the test set.

    Reads the module-level ``x_train``, ``y_train`` and ``x_test`` arrays.

    Parameters
    ----------
    k : int
        Number of folds passed to ``KFold``.
    n_rounds : int
        Maximum number of boosting rounds per fold; training stops earlier
        when valid-gini has not improved for 100 rounds.

    Returns
    -------
    res_list : numpy.ndarray
        One row per fold, each row holding that fold's predictions for
        ``x_test``.
    avg_score : float
        Mean over folds of the normalized Gini measured on the held-out
        validation rows. Scoring on the full training set would include rows
        the model was fitted on and overstate the score.
    """
    kf = KFold(n_splits = k, random_state = 3228, shuffle = True)

    # Both of these are identical for every fold, so build them once.
    xgb_params = {
        'eta': 0.02,
        'max_depth': 5,
        'subsample': 0.9,
        'silent': 1,
        'objective':'binary:logistic',
        'colsample_bytree': 0.9
    }
    d_test = xgb.DMatrix(x_test)

    res_list = []
    score_list = []
    for train_index, test_index in kf.split(x_train):
        train_X, valid_X = x_train[train_index], x_train[test_index]
        train_y, valid_y = y_train[train_index], y_train[test_index]

        d_train = xgb.DMatrix(train_X, train_y)
        d_valid = xgb.DMatrix(valid_X, valid_y)

        # This is the data xgboost will evaluate on after each boosting round
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        # Train the model! We pass in a max of n_rounds rounds (with early stopping after 100)
        # and the custom metric (maximize=True tells xgb that higher metric is better)
        mdl = xgb.train(xgb_params, d_train, n_rounds, watchlist, early_stopping_rounds=100,
                        feval=gini_xgb, maximize=True, verbose_eval=100)

        # Predict on our test data
        p_test = _predict_at_best_iteration(mdl, d_test)
        res_list.append(p_test)

        # Score on the held-out fold only (out-of-fold estimate)
        p_valid = _predict_at_best_iteration(mdl, d_valid)
        fold_score = gini_normalized(valid_y, p_valid)
        score_list.append(fold_score)
    res_list = np.array(res_list)
    avg_score = np.mean(score_list)
    return res_list, avg_score
# Marker printed once this cell's definitions have been executed
print("def cv done")

def cv done


In [9]:
def gen_final_res(k,n_rounds):
    """Run the k-fold training and write the fold-averaged submission file.

    Calls ``cv_test(k, n_rounds)``, averages the per-fold test predictions,
    and writes an ``id``/``target`` CSV under ``../results``. The file name
    embeds ``k`` and the score returned by ``cv_test``. Prints the head of
    the submission and the output path.

    Parameters
    ----------
    k : int
        Number of folds, forwarded to ``cv_test``.
    n_rounds : int
        Maximum boosting rounds per fold, forwarded to ``cv_test``.
    """
    import os  # local import keeps this cell self-contained

    # Create the output directory up front so a missing folder is reported
    # before the training run instead of at the final to_csv call.
    out_dir = '../results'
    os.makedirs(out_dir, exist_ok=True)

    res, score = cv_test(k,n_rounds)
    avg_res = np.mean(np.array(res),0)
    res_file = out_dir + '/cv_{}_{}_res.csv'.format(k,score)
    # Create a submission file
    sub = pd.DataFrame()
    sub['id'] = id_test
    sub['target'] = list(avg_res)
    sub.to_csv(res_file, index=False)
    print("------------------------")
    print(sub.head())
    print("------------------------")
    print(res_file)
# Marker printed once this cell's definitions have been executed
print("def gen res done")

def gen res done


In [10]:
# 5-fold run with at most 2000 boosting rounds per fold; writes the submission CSV
gen_final_res(5,2000)

[0]	train-gini:0.21194	valid-gini:0.203215
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.265344	valid-gini:0.241019
[200]	train-gini:0.298907	valid-gini:0.255815
[300]	train-gini:0.325051	valid-gini:0.264922
[400]	train-gini:0.348118	valid-gini:0.269815
[500]	train-gini:0.366965	valid-gini:0.271361
[600]	train-gini:0.383364	valid-gini:0.271998
[700]	train-gini:0.397833	valid-gini:0.271936
Stopping. Best iteration:
[615]	train-gini:0.385822	valid-gini:0.272448

[0]	train-gini:0.205911	valid-gini:0.214514
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.264807	valid-gini:0.256178
[200]	train-gini:0.297844	valid-gini:0.274997
[300]	train-gini:0.324114	valid-gini:0.283906
[400]	train-gini:0.347091	valid-gini:0.286004
[500]	train-gini:0.366331	valid-gini:0.286