In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from sklearn.model_selection import KFold

# Read in our input data
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# This prints out (rows, columns) in each dataframe
print('Train shape:', df_train.shape)
print('Test shape:', df_test.shape)
print('Columns:', df_train.columns)

y_train = df_train['target'].values
id_train = df_train['id'].values
id_test = df_test['id'].values



Train shape: (595212, 59)
Test shape: (892816, 58)
Columns: Index(['id', 'target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
       'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14',
       'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
       'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09',
       'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'p

In [2]:
# https://www.kaggle.com/armamut/ps-calc-15-bin-ps-calc-20-bin
df_train['feat_ps_calc']  = df_train['ps_calc_15_bin'] * 32 + df_train['ps_calc_16_bin'] * 16 + df_train['ps_calc_17_bin'] * 8 + df_train['ps_calc_18_bin'] * 4 + df_train['ps_calc_19_bin'] * 2 + df_train['ps_calc_20_bin'] * 1

df_test['feat_ps_calc']  = df_test['ps_calc_15_bin'] * 32 + df_test['ps_calc_16_bin'] * 16 + df_test['ps_calc_17_bin'] * 8 + df_test['ps_calc_18_bin'] * 4 + df_test['ps_calc_19_bin'] * 2 + df_test['ps_calc_20_bin'] * 1

In [3]:
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
    ('ps_car_13','ps_reg_03')
]

for a,b in combs:
    tmp_name = '{}_{}'.format(a,b)
    df_train[tmp_name] = df_train[a] * df_train[b]
    df_test[tmp_name] = df_test[a] * df_test[b]

col_to_drop = df_train.columns[df_train.columns.str.startswith('ps_calc_')]
df_train = df_train.drop(col_to_drop, axis=1)  
df_test = df_test.drop(col_to_drop, axis=1)  
# https://www.kaggle.com/kueipo/base-on-froza-pascal-single-xgb-lb-0-284

cat_features = [a for a in df_train.columns if a.endswith('cat')]

for column in cat_features:
    temp = pd.get_dummies(pd.Series(df_train[column]))
    df_train = pd.concat([df_train,temp],axis=1)
    df_train = df_train.drop([column],axis=1)
    
for column in cat_features:
    temp = pd.get_dummies(pd.Series(df_test[column]))
    df_test = pd.concat([df_test,temp],axis=1)
    df_test = df_test.drop([column],axis=1)

# many feature engineering
def recon(reg):
    integer = int(np.round((40*reg)**2)) 
    for a in range(32):
        if (integer - a) % 31 == 0:
            A = a
    M = (integer - A)//31
    return A, M
df_train['ps_reg_A'] = df_train['ps_reg_03'].apply(lambda x: recon(x)[0])
df_train['ps_reg_M'] = df_train['ps_reg_03'].apply(lambda x: recon(x)[1])
df_train['ps_reg_A'].replace(19,-1, inplace=True)
df_train['ps_reg_M'].replace(51,-1, inplace=True)
df_test['ps_reg_A'] = df_test['ps_reg_03'].apply(lambda x: recon(x)[0])
df_test['ps_reg_M'] = df_test['ps_reg_03'].apply(lambda x: recon(x)[1])
df_test['ps_reg_A'].replace(19,-1, inplace=True)
df_test['ps_reg_M'].replace(51,-1, inplace=True)



In [4]:
x_train = df_train.drop(['target', 'id'], axis=1).values
x_test = df_test.drop(['id'], axis=1).values
print(x_train.shape,x_test.shape)

(595212, 278) (892816, 278)


In [5]:
df_train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,...,5.3999999999999995,5.4,5.6,5.6000000000000005,6.3,6.4,7.2,8.1,ps_reg_A,ps_reg_M
0,7,0,2,5,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,26
1,9,0,1,7,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,30
2,13,0,5,9,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,51
3,16,0,0,2,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,17
4,17,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,36


In [6]:
y_train[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [7]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    allres = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    allres = allres[ np.lexsort((allres[:,2], -1*allres[:,1])) ]
    totalLosses = allres[:,0].sum()
    giniSum = allres[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]

print("def gini")

def gini


In [12]:
def cv_test(k,n_rounds):
    kf = KFold(n_splits = k, random_state = 3228, shuffle = True)
    res_list = []
    score_list = []
    for train_index, test_index in kf.split(x_train):
        train_X, valid_X = x_train[train_index], x_train[test_index]
        train_y, valid_y = y_train[train_index], y_train[test_index]

        xgb_params = {
            'eta': 0.01,
            'max_depth': 5,
            'subsample': 0.9,
            'gamma':0.02,
            'silent': 1,
            'objective':'binary:logistic',
            'colsample_bytree': 0.9,
            'scale_pos_weight':5
        }

        d_train = xgb.DMatrix(train_X, train_y)
        d_valid = xgb.DMatrix(valid_X, valid_y)
        d_test = xgb.DMatrix(x_test)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        mdl = xgb.train(xgb_params, d_train, n_rounds, watchlist, early_stopping_rounds=50, 
                        feval=gini_xgb, maximize=True, verbose_eval=200)
        
        # Predict on our test data
        p_test = mdl.predict(d_test)
        res_list.append(p_test)
        
        # predict on valid
        p_train = mdl.predict(d_valid)
        tmp_score = gini_normalized(valid_y, p_train)
        print('tmp_score',tmp_score)
        score_list.append(tmp_score)
    res_list = np.array(res_list)   
    return res_list, score_list
print("def cv done")

def cv done


In [13]:
def gen_final_res(k,n_rounds):
    res_list, score_list = cv_test(k,n_rounds)
    print(np.mean(score_list))
    
    
    # avg res
    # avg_res = np.mean(np.array(res_list),0)
    
    
    # try weight res
    f_res = 0
    for i in range(k):
        f_res += res_list[i] * score_list[i]
    f_res = f_res / sum(score_list)
    
    
    res_file = '../results/cv_{}_res_csv.gz'.format(k)
    sub = pd.DataFrame()
    sub['id'] = id_test
    sub['target'] = list(f_res)
    sub.to_csv(res_file, index=False, compression='gzip')
    print("------------------------")
    print(sub.head())
    print("------------------------")
    print(res_file)
print("def gen res done")

def gen res done


In [14]:
gen_final_res(5,2000)

[0]	train-error:0.037012	valid-error:0.036516	train-gini:0.213126	valid-gini:0.199471
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 50 rounds.
[200]	train-error:0.036556	valid-error:0.035928	train-gini:0.289901	valid-gini:0.254197
[400]	train-error:0.036577	valid-error:0.035979	train-gini:0.321491	valid-gini:0.266094
[600]	train-error:0.03664	valid-error:0.036113	train-gini:0.344874	valid-gini:0.270855
[800]	train-error:0.036657	valid-error:0.036239	train-gini:0.364188	valid-gini:0.271941
[1000]	train-error:0.036678	valid-error:0.036323	train-gini:0.380201	valid-gini:0.272945
Stopping. Best iteration:
[960]	train-error:0.03667	valid-error:0.036298	train-gini:0.377092	valid-gini:0.272986

tmp_score 0.272980468309
[0]	train-error:0.036399	valid-error:0.036978	train-gini:0.213995	valid-gini:0.212093
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until 