In [14]:
import catboost
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from src.data import prepare_folds
from src.train import train_folds

In [15]:
data = pd.read_pickle('data/v3/train.pkl')

In [16]:
folds = prepare_folds(data)

In [18]:
config = {
    'iterations': 500,
    'eval_metric': 'AUC',
    'random_seed': 42,
    'use_best_model': True,
    'early_stopping_rounds': 100,
    'verbose': 50,
    'depth': 4,
    'one_hot_max_size': 12,
    'class_weights': [1, 9]
#     'depth': 6
}

In [19]:
models, results = train_folds(folds, config, model='catboost')

Training on (55999, 2713)
0:	test: 0.7546516	best: 0.7546516 (0)	total: 2.04s	remaining: 16m 58s
50:	test: 0.8129950	best: 0.8129950 (50)	total: 1m 40s	remaining: 14m 47s
100:	test: 0.8192071	best: 0.8192071 (100)	total: 2m 56s	remaining: 11m 38s
150:	test: 0.8217305	best: 0.8217492 (149)	total: 4m 16s	remaining: 9m 53s
200:	test: 0.8235583	best: 0.8235583 (200)	total: 5m 38s	remaining: 8m 23s
250:	test: 0.8241640	best: 0.8243005 (232)	total: 6m 54s	remaining: 6m 50s
300:	test: 0.8257619	best: 0.8257619 (300)	total: 8m 15s	remaining: 5m 27s
350:	test: 0.8264076	best: 0.8267676 (347)	total: 9m 34s	remaining: 4m 3s
400:	test: 0.8259435	best: 0.8267676 (347)	total: 10m 45s	remaining: 2m 39s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8267676378
bestIteration = 347

Shrink model to first 348 iterations.
Confusion matrix:
[[11049  1891]
 [  421   640]]
Report:
              precision    recall  f1-score   support

         0.0       0.96      0.85      0.91     1294

In [23]:
results['importances'].groupby(['feature']) \
    .agg({'importance': 'mean'}) \
    .sort_values(by="importance", ascending=False)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
id_diff_count_sms_source_4,3.572846
content_count_m1_diff_count_sms_source_4,2.849347
id_div_lt,1.422649
short_out_calls_part_m1_diff_lt,1.126114
id_diff_paym_last_days,1.085302
is_my_vf_os_category_block_flag,1.046451
income_brnd_cont_m1,1.043979
data_type_2_m1_sum_data_type_3_m1,1.031602
content_count_m1_div_balance_sum,0.977345
id_sum_paym_last_days,0.966838


In [14]:
def train_cat(train, val):
    train_x = train.drop(columns=['target', 'id'])
    train_y = train.target
    
    test_x = val.drop(columns=['target', 'id'])
    test_y = val.target
    
    cols = list(train_x.columns)
    categorical_features = [cols.index(col_name)for col_name in \
                            list(train_x.dtypes[(train_x.dtypes == object)].index)]
    
    model = catboost.CatBoostClassifier(**config, loss_function='Logloss')
    model.fit(train_x, train_y, eval_set=(test_x, test_y), cat_features=categorical_features, use_best_model=True)
    prediction = model.predict_proba(test_x)
    score = roc_auc_score(test_y.values, prediction[:, 1])
    return {'score': score, 'model': model}

In [187]:
def eval_test(models):
#     test_lb = pd.read_csv('data/test_music.csv')
    test_lb = pd.read_csv('data/test.pkl')
    predictions = []
    for model in models:
        predict = model.predict_proba(test_lb)
        predictions.append(predict[:, 1])
    return np.mean(predictions, axis=0)

In [188]:
def prepare_submission(predictions, score="Unknown"):
    sub = pd.read_csv('data/sample_submission_music.csv')
    sub.prediction = predictions
    sub.to_csv('submission-%s.csv' % score, index=False)

In [189]:
folds = prepare_folds(data)

In [190]:
models = []
scores = []

In [191]:
for n, fold in enumerate(folds):
    train_f, val_f = fold
    result = train_cat(train_f, val_f)
    models.append(result['model'])
    scores.append(result['score'])
    print("Fold %s: %.4f" % (n, result['score']))

0:	learn: 0.5227499	test: 0.5229379	best: 0.5229379 (0)	total: 171ms	remaining: 42.5s
1:	learn: 0.4298926	test: 0.4302763	best: 0.4302763 (1)	total: 311ms	remaining: 38.5s
2:	learn: 0.3777540	test: 0.3785001	best: 0.3785001 (2)	total: 549ms	remaining: 45.2s
3:	learn: 0.3420062	test: 0.3430525	best: 0.3430525 (3)	total: 693ms	remaining: 42.6s
4:	learn: 0.3225721	test: 0.3235724	best: 0.3235724 (4)	total: 954ms	remaining: 46.7s
5:	learn: 0.3113971	test: 0.3124682	best: 0.3124682 (5)	total: 1.15s	remaining: 46.7s
6:	learn: 0.3049706	test: 0.3063844	best: 0.3063844 (6)	total: 1.44s	remaining: 50.2s
7:	learn: 0.2996830	test: 0.3013239	best: 0.3013239 (7)	total: 1.59s	remaining: 48.1s
8:	learn: 0.2965771	test: 0.2983366	best: 0.2983366 (8)	total: 1.93s	remaining: 51.7s
9:	learn: 0.2929628	test: 0.2953144	best: 0.2953144 (9)	total: 2.24s	remaining: 53.8s
10:	learn: 0.2902371	test: 0.2929490	best: 0.2929490 (10)	total: 2.54s	remaining: 55.2s
11:	learn: 0.2883804	test: 0.2915046	best: 0.2915046

94:	learn: 0.2650599	test: 0.2809241	best: 0.2808781 (91)	total: 19.6s	remaining: 32s
95:	learn: 0.2649683	test: 0.2809010	best: 0.2808781 (91)	total: 19.7s	remaining: 31.6s
96:	learn: 0.2646933	test: 0.2807837	best: 0.2807837 (96)	total: 19.9s	remaining: 31.5s
97:	learn: 0.2646721	test: 0.2807683	best: 0.2807683 (97)	total: 20.3s	remaining: 31.5s
98:	learn: 0.2644190	test: 0.2807226	best: 0.2807226 (98)	total: 20.5s	remaining: 31.2s
99:	learn: 0.2641575	test: 0.2806034	best: 0.2806034 (99)	total: 20.6s	remaining: 30.9s
100:	learn: 0.2638416	test: 0.2805384	best: 0.2805384 (100)	total: 20.8s	remaining: 30.7s
101:	learn: 0.2637704	test: 0.2805465	best: 0.2805384 (100)	total: 20.9s	remaining: 30.4s
102:	learn: 0.2635683	test: 0.2805244	best: 0.2805244 (102)	total: 21.1s	remaining: 30.2s
103:	learn: 0.2635007	test: 0.2805126	best: 0.2805126 (103)	total: 21.3s	remaining: 29.9s
104:	learn: 0.2634494	test: 0.2805238	best: 0.2805126 (103)	total: 21.5s	remaining: 29.7s
105:	learn: 0.2633251	te

186:	learn: 0.2566805	test: 0.2809180	best: 0.2803695 (126)	total: 38.6s	remaining: 13s
187:	learn: 0.2566606	test: 0.2809262	best: 0.2803695 (126)	total: 38.8s	remaining: 12.8s
188:	learn: 0.2565919	test: 0.2809227	best: 0.2803695 (126)	total: 39s	remaining: 12.6s
189:	learn: 0.2565522	test: 0.2809340	best: 0.2803695 (126)	total: 39.1s	remaining: 12.3s
190:	learn: 0.2564668	test: 0.2809200	best: 0.2803695 (126)	total: 39.4s	remaining: 12.2s
191:	learn: 0.2564566	test: 0.2809306	best: 0.2803695 (126)	total: 39.5s	remaining: 11.9s
192:	learn: 0.2562109	test: 0.2809033	best: 0.2803695 (126)	total: 39.7s	remaining: 11.7s
193:	learn: 0.2562092	test: 0.2809024	best: 0.2803695 (126)	total: 39.8s	remaining: 11.5s
194:	learn: 0.2562083	test: 0.2809025	best: 0.2803695 (126)	total: 40s	remaining: 11.3s
195:	learn: 0.2562001	test: 0.2809059	best: 0.2803695 (126)	total: 40.1s	remaining: 11s
196:	learn: 0.2560799	test: 0.2808227	best: 0.2803695 (126)	total: 40.2s	remaining: 10.8s
197:	learn: 0.2559

28:	learn: 0.2787593	test: 0.2810287	best: 0.2810287 (28)	total: 5.68s	remaining: 43.3s
29:	learn: 0.2786907	test: 0.2810008	best: 0.2810008 (29)	total: 6.02s	remaining: 44.1s
30:	learn: 0.2784397	test: 0.2808372	best: 0.2808372 (30)	total: 6.4s	remaining: 45.2s
31:	learn: 0.2778950	test: 0.2804640	best: 0.2804640 (31)	total: 7s	remaining: 47.7s
32:	learn: 0.2777361	test: 0.2803827	best: 0.2803827 (32)	total: 7.4s	remaining: 48.7s
33:	learn: 0.2776344	test: 0.2803634	best: 0.2803634 (33)	total: 7.52s	remaining: 47.8s
34:	learn: 0.2773504	test: 0.2802471	best: 0.2802471 (34)	total: 7.92s	remaining: 48.6s
35:	learn: 0.2771499	test: 0.2802582	best: 0.2802471 (34)	total: 8.18s	remaining: 48.6s
36:	learn: 0.2768688	test: 0.2801739	best: 0.2801739 (36)	total: 8.34s	remaining: 48s
37:	learn: 0.2766748	test: 0.2800130	best: 0.2800130 (37)	total: 8.5s	remaining: 47.4s
38:	learn: 0.2763398	test: 0.2798712	best: 0.2798712 (38)	total: 8.68s	remaining: 46.9s
39:	learn: 0.2761849	test: 0.2797691	bes

KeyboardInterrupt: 

In [132]:
test_target = eval_test(models)

In [None]:
prepare_submission(test_target, np.mean(scores))

In [116]:
def feature_importance()

In [192]:
sorted(list(zip(models[0].feature_importances_, folds[0][0].drop(columns=['target']).columns)))

[(0.0, 'abon_cost_m2'),
 (0.0, 'content_cost_m2'),
 (0.0, 'is_obl_center'),
 (0.0, 'pay_in_P2P_cost_m3'),
 (0.0011581220672693233, 'service_1_count'),
 (0.0023272929604219684, 'sms_cost_m3'),
 (0.005399718921974757, 'content_cost_m1'),
 (0.006450366100659957, 'sms_cost_m1'),
 (0.037960800627595506, 'pay_out_P2P_cost_m3'),
 (0.039748322664764926, 'lt'),
 (0.05338051041931503, 'all_home_clc_m1'),
 (0.05367823731743929, 'sms_cost_m2'),
 (0.08014775903059629, 'conn_com_cost_m3'),
 (0.08038628073604527, 'abon_cost_m1'),
 (0.08632969135533232, 'paym_last_days'),
 (0.0900557461378153, 'abon_cost_m3'),
 (0.14364747404676967, 'block_all_dur_m2'),
 (0.15689937396181133, 'pay_out_P2P_cost_m1'),
 (0.16512831870766614, 'all_home_clc_m3'),
 (0.1678570788888345, 'conn_com_cost_m2'),
 (0.1760914516511828, 'os_category'),
 (0.214812387355699, 'pay_in_P2P_cost_m2'),
 (0.2152683976695754, 'all_home_clc_m2'),
 (0.2301634559837553, 'pay_out_P2P_cost_m2'),
 (0.23430957214003972, 'com_num_cost_m1'),
 (0.3220

In [71]:
a = sorted(list(zip(models[0].feature_importances_, folds[0][0].drop(columns=['target']).columns)))