In [2]:
import numpy as np
import pandas as pd
from sklearn import *
from catboost import CatBoostClassifier
from multiprocessing import *

# Directory holding the input CSV files; the prediction files produced
# further down are written to the same place.
base_path = "input/"
train = pd.read_csv(base_path + 'train.csv')
test = pd.read_csv(base_path + 'test.csv')

# Candidate feature columns: everything except the row id and the label.
col = [c for c in train.columns if c not in ['id','target']]
print(len(col))
# Drop every column whose name starts with 'ps_calc_' from the feature list.
col = [c for c in col if not c.startswith('ps_calc_')]
print(len(col))

# Temporarily turn the -1 placeholder into NaN so that the per-column
# median/mean (which skip NaN by default) are computed without it, then put
# the -1 placeholder back.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
train = train.replace(-1, np.nan)
d_median = train.median(axis=0)
d_mean = train.mean(axis=0)
train = train.fillna(-1)

def transform_df(df, median=None, mean=None):
    """Add the engineered feature columns to a frame.

    New columns, in order:

    * ``ps_car_13_x_ps_reg_03`` -- product of ``ps_car_13`` and ``ps_reg_03``.
    * ``negative_one_vals`` -- per-row count of input columns equal to -1.
    * ``<c>_median_range`` / ``<c>_mean_range`` -- 0/1 flags telling whether
      column ``c`` is strictly greater than its reference median / mean.
      Produced for every input column except ``id``, ``target`` and columns
      whose name contains ``_bin``.

    Parameters
    ----------
    df : DataFrame (or anything ``pd.DataFrame`` accepts)
        Must contain ``ps_car_13`` and ``ps_reg_03``.
    median, mean : mapping of column name -> threshold, optional
        Reference statistics. When omitted, the module-level ``d_median`` /
        ``d_mean`` are used, which keeps one-argument calls working.

    Returns
    -------
    DataFrame
        The frame with the new columns appended.
        NOTE(review): whether the caller's object also receives the new
        columns depends on how ``pd.DataFrame(df)`` shares data in the
        installed pandas version -- always use the returned frame.
    """
    if median is None:
        median = d_median
    if mean is None:
        mean = d_mean

    df = pd.DataFrame(df)
    # Snapshot the input columns before any derived column is added, so the
    # derived columns are neither counted nor thresholded themselves.
    dcol = [c for c in df.columns if c not in ('id', 'target')]

    df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
    df['negative_one_vals'] = np.sum((df[dcol] == -1).values, axis=1)

    for c in dcol:
        if '_bin' in c:
            continue
        values = df[c].values
        # Builtin `int` replaces the `np.int` alias (removed in NumPy 1.24);
        # the resulting dtype is the same platform integer.
        df[c + '_median_range'] = (values > median[c]).astype(int)
        df[c + '_mean_range'] = (values > mean[c]).astype(int)

    return df

def multi_transform(df):
    """Run ``transform_df`` over ``df`` in parallel, one chunk per CPU core.

    The frame is cut into ``cpu_count()`` row chunks, each chunk is
    transformed in a worker process, and the results are concatenated with
    a fresh 0..n-1 index.

    Parameters
    ----------
    df : DataFrame
        Frame to transform; see ``transform_df`` for the required columns.

    Returns
    -------
    DataFrame
        The concatenated, transformed frame.

    NOTE(review): worker processes need ``transform_df`` and the statistics
    it reads to be importable/picklable; with the "spawn" start method
    (e.g. on Windows or inside a notebook) this may not hold -- confirm
    before relying on it.
    """
    print('Init Shape: ', df.shape)
    n_workers = cpu_count()

    # Split by row position rather than calling np.array_split on the frame
    # itself; the chunk boundaries are the same.
    row_groups = np.array_split(np.arange(len(df)), n_workers)
    chunks = [df.iloc[rows] for rows in row_groups]

    # The pool is created here: the original relied on an undefined global `p`.
    p = Pool(n_workers)
    try:
        parts = p.map(transform_df, chunks)
    finally:
        # Always release the worker processes, even if a chunk failed.
        p.close()
        p.join()

    df = pd.concat(parts, axis=0, ignore_index=True)
    print('After Shape: ', df.shape)
    return df

def gini(y, pred):
    """Return the Gini coefficient of ``pred`` against labels ``y``.

    Computed as ``2 * AUC - 1``, where AUC is the area under the ROC curve
    with label 1 taken as the positive class.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y, pred, pos_label=1)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    return 2 * area - 1

def gini_catboost(pred, y):
    """Gini coefficient with the arguments in (prediction, label) order.

    Thin adapter around ``gini``, which takes (label, prediction).
    """
    score = gini(y=y, pred=pred)
    return score

# Hold out 25% of the training rows as a validation set. Only the frames are
# split here; the labels are read back from their 'target' column below, so
# there is no need to split a separate target array.
x1, x2 = model_selection.train_test_split(train, test_size=0.25, random_state=99)

x1 = transform_df(x1)
x2 = transform_df(x2)
test = transform_df(test)
train = transform_df(train)

# Final feature list: every column of the transformed frame except the id,
# the label, and anything whose name starts with 'ps_calc_' (this also drops
# the indicator columns derived from those).
col = [c for c in x1.columns if c not in ['id','target']]
col = [c for c in col if not c.startswith('ps_calc_')]
# Read the shape from the frame directly; going through `.values` would copy
# the whole frame into an array first.
print(x1.shape, x2.shape)

y1 = x1['target']
y2 = x2['target']
x1 = x1[col]
x2 = x2[col]

# Fit with the held-out split as the evaluation set; `use_best_model=True`
# keeps the iteration that scored best on it.
model3 = CatBoostClassifier(iterations=1200, learning_rate=0.02, depth=7, loss_function='Logloss', eval_metric='AUC', random_seed=99, od_type='Iter', od_wait=100)
model3.fit(x1, y1, eval_set=(x2, y2), use_best_model=True, verbose=True)
print(gini_catboost(model3.predict_proba(x2)[:,1], y2))

# NOTE(review): the predicted probabilities are passed through exp(p) - 1 and
# clipped to [0, 1]. The transform is monotonic, so ranking is unchanged below
# the clip point, but the written values are no longer probabilities --
# confirm this is intended.
test['target'] = model3.predict_proba(test[col])[:,1]
test['target'] = (np.exp(test['target'].values) - 1.0).clip(0,1)
# NOTE(review): these predictions cover rows the model was fitted on, so
# they are in-sample for 75% of `train`.
train['target'] = model3.predict_proba(train[col])[:,1]
train['target'] = (np.exp(train['target'].values) - 1.0).clip(0,1)
test[['id','target']].to_csv(base_path + 'test_catboost_submission.csv', index=False, float_format='%.5f')
train[['id','target']].to_csv(base_path + 'train_catboost_submission.csv', index=False, float_format='%.5f')

#Extras
import matplotlib.pyplot as plt

# Horizontal bar chart of feature importances, sorted ascending by
# importance (ties broken by column name, descending).
df = pd.DataFrame({'imp': model3.feature_importances_, 'col':col})
df = df.sort_values(['imp','col'], ascending=[True, False])
_ = df.plot(kind='barh', x='col', y='imp', figsize=(7,12))
plt.savefig('catboost_feature_importance.png')

57
37
((446409L, 141L), (148803L, 141L))
Borders for float features generated
0:	learn 0.5855176686	test 0.5852459886	bestTest 0.5852459886		total: 1.27s	remaining: 25m 23s
1:	learn 0.5925086625	test 0.5956808016	bestTest 0.5956808016		total: 2.18s	remaining: 21m 44s
2:	learn 0.6034741053	test 0.6049093302	bestTest 0.6049093302		total: 3.19s	remaining: 21m 11s
3:	learn 0.6042592307	test 0.6033532867	bestTest 0.6049093302		total: 4.26s	remaining: 21m 13s
4:	learn 0.605622349	test 0.6054043831	bestTest 0.6054043831		total: 4.99s	remaining: 19m 53s
5:	learn 0.6036202501	test 0.604062613	bestTest 0.6054043831		total: 6.07s	remaining: 20m 7s
6:	learn 0.6149936764	test 0.6135789402	bestTest 0.6135789402		total: 7.37s	remaining: 20m 56s
7:	learn 0.6158551722	test 0.6145667878	bestTest 0.6145667878		total: 8.57s	remaining: 21m 17s
8:	learn 0.6170107949	test 0.6161280818	bestTest 0.6161280818		total: 9.87s	remaining: 21m 45s
9:	learn 0.6190228617	test 0.6175570879	bestTest 0.6175570879		total: 