In [1]:
import os
import numpy as np
import pandas as pd
import xgboost
from sklearn.model_selection import train_test_split
import utils
import scoring
import catboost
import lightgbm as lgb

In [2]:
DATA_PATH = "./data"

In [3]:
%%time
train = utils.load_train_hdf(DATA_PATH)

CPU times: user 2min 33s, sys: 34.4 s, total: 3min 7s
Wall time: 3min 20s


In [4]:
%%time
test = pd.read_hdf(DATA_PATH + '/test_public.hdf', axis=0, ignore_index=True)

CPU times: user 13.5 s, sys: 1.95 s, total: 15.5 s
Wall time: 31.2 s


In [5]:
%%time
test2 = pd.read_hdf(DATA_PATH + '/test_public.hdf', axis=0, ignore_index=True)

CPU times: user 13.2 s, sys: 1.84 s, total: 15 s
Wall time: 15 s


In [29]:
%%time
test_private = pd.read_hdf(DATA_PATH + '/test_private_v2_track_1.hdf', axis=0, ignore_index=True)

CPU times: user 26.5 s, sys: 6.26 s, total: 32.7 s
Wall time: 32.7 s


In [30]:
%%time
test_private2 = pd.read_hdf(DATA_PATH + '/test_private_v2_track_1.hdf', axis=0, ignore_index=True)

CPU times: user 25.4 s, sys: 5.26 s, total: 30.6 s
Wall time: 30.6 s


### После добавления информативных признаков, мы их сохраним и будем быстро добавлять из файла

In [8]:
%%time
closest_hits_features = pd.read_hdf('data_itog.hdf', axis=0, ignore_index=True)

CPU times: user 457 ms, sys: 531 ms, total: 987 ms
Wall time: 2min 25s


In [13]:
%%time
closest_hits_features2 = pd.read_hdf(DATA_PATH + '/data_itog2.hdf', axis=0, ignore_index=True)

CPU times: user 67.4 ms, sys: 731 ms, total: 798 ms
Wall time: 17.6 s


In [14]:
%%time
closest_hits_features_test = pd.read_hdf(DATA_PATH + '/data_itog_test.hdf', axis=0, ignore_index=True)

CPU times: user 7.4 ms, sys: 104 ms, total: 111 ms
Wall time: 2.45 s


In [15]:
%%time
closest_hits_features_test2 = pd.read_hdf(DATA_PATH + '/data_itog_test2.hdf', axis=0, ignore_index=True)

CPU times: user 0 ns, sys: 117 ms, total: 117 ms
Wall time: 2.24 s


In [None]:
%%time
closest_hits_features_test_private = pd.read_hdf(DATA_PATH + '/data_itog_test_private.hdf', axis=0, ignore_index=True)

In [None]:
%%time
closest_hits_features_test_private2 = pd.read_hdf(DATA_PATH + '/data_itog_test_private2.hdf', axis=0, ignore_index=True)

In [38]:
avg_dist_hits = pd.read_hdf('avg_dist_hits.hdf', axis=0, ignore_index=True)

In [39]:
avg_dist_hits_test = pd.read_hdf('avg_dist_hits_test.hdf', axis=0, ignore_index=True)

In [16]:
KILL_COLUMNS = ['sWeight',
 'particle_type',
 'label',
 'kinWeight',
 'weight']

In [11]:
%%time
train = pd.concat([train, closest_hits_features], axis=1)

CPU times: user 9.19 s, sys: 2.13 s, total: 11.3 s
Wall time: 11.3 s


In [17]:
%%time
train = pd.concat([train, closest_hits_features2], axis=1)

CPU times: user 41.9 s, sys: 7.62 s, total: 49.5 s
Wall time: 15.5 s


In [42]:
%%time
train = pd.concat([train, avg_dist_hits], axis=1)

CPU times: user 4.2 s, sys: 2.88 s, total: 7.09 s
Wall time: 7.08 s


In [18]:
%%time
test = pd.concat([test, closest_hits_features_test], axis=1)

CPU times: user 14.8 s, sys: 1.49 s, total: 16.2 s
Wall time: 1.71 s


In [19]:
%%time
test2 = pd.concat([test2, closest_hits_features_test2], axis=1)

CPU times: user 15.4 s, sys: 1.52 s, total: 16.9 s
Wall time: 1.65 s


In [37]:
%%time
test_private = pd.concat([test_private, closest_hits_features_test_private], axis=1)

CPU times: user 2.52 s, sys: 408 ms, total: 2.93 s
Wall time: 2.93 s


In [38]:
%%time
test_private2 = pd.concat([test_private2, closest_hits_features_test_private2], axis=1)

CPU times: user 2.23 s, sys: 476 ms, total: 2.7 s
Wall time: 2.7 s


In [32]:
%%time
test = pd.concat([test, avg_dist_hits_test], axis=1)

CPU times: user 540 ms, sys: 369 ms, total: 909 ms
Wall time: 908 ms


In [44]:
%%time
test2 = pd.concat([test2, avg_dist_hits_test], axis=1)

CPU times: user 574 ms, sys: 293 ms, total: 867 ms
Wall time: 865 ms


In [9]:
%%time
train_part, validation = train_test_split(train, test_size=0.2, stratify=train.label, shuffle=True)

CPU times: user 21.9 s, sys: 3.31 s, total: 25.2 s
Wall time: 25.1 s


# Добавление информативных признаков, сохранение итогового датасета

In [31]:
def find_closest_hit_per_station(row):
    result = [1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000]
    mind = [0, 0, 0, 0]
    hits = 0
    for station in row["FOI_hits_S"]:
        x_distances_2 = (row["Lextra_X[" + str(station) + "]"] - row["FOI_hits_X"][hits])**2
        y_distances_2 = (row["Lextra_Y[" + str(station) + "]"] - row["FOI_hits_Y"][hits])**2
        distances_2 = x_distances_2 + y_distances_2
        if mind[station] == 0:
            mind[station] = distances_2
            result[station*6 + 0] = x_distances_2
            result[station*6 + 1] = y_distances_2
            result[station*6 + 2] = row["FOI_hits_T"][hits]
            result[station*6 + 3] = row["FOI_hits_Z"][hits]
            result[station*6 + 4] = row["FOI_hits_DX"][hits]
            result[station*6 + 5] = row["FOI_hits_DY"][hits]
        else:
            if mind[station] > distances_2:
                mind[station] = distances_2
                result[station*6 + 0] = x_distances_2
                result[station*6 + 1] = y_distances_2
                result[station*6 + 2] = row["FOI_hits_T"][hits]
                result[station*6 + 3] = row["FOI_hits_Z"][hits]
                result[station*6 + 4] = row["FOI_hits_DX"][hits]
                result[station*6 + 5] = row["FOI_hits_DY"][hits]
        hits = hits + 1
    return result

In [32]:
def find_closest_hit_per_station_track2(row):
    result = [1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000]
    mind = [0, 0, 0, 0]
    hits = 0
    for station in row["FOI_hits_S"]:
        x_distances_2 = (row["Lextra_X[" + str(station) + "]"] - row["FOI_hits_X"][hits])**2
        y_distances_2 = (row["Lextra_Y[" + str(station) + "]"] - row["FOI_hits_Y"][hits])**2
        distances_2 = x_distances_2 + y_distances_2
        if mind[station] == 0:
            mind[station] = distances_2
            result[station] = x_distances_2
            result[4 + station] = y_distances_2
            result[8 + station] = row["FOI_hits_T"][hits]
            result[12 + station] = row["FOI_hits_Z"][hits]
            result[16 + station] = row["FOI_hits_DX"][hits]
            result[20 + station] = row["FOI_hits_DY"][hits]
        else:
            if mind[station] > distances_2:
                mind[station] = distances_2
                result[station] = x_distances_2
                result[4 + station] = y_distances_2
                result[8 + station] = row["FOI_hits_T"][hits]
                result[12 + station] = row["FOI_hits_Z"][hits]
                result[16 + station] = row["FOI_hits_DX"][hits]
                result[20 + station] = row["FOI_hits_DY"][hits]
        hits = hits + 1
    return result

In [16]:
PADsizeX = [31.6667, 63.0, 126.0, 252.0]
PADsizeY = [38.8585, 77.9582, 156.158, 312.557]

In [17]:
def find_avg_dist(row):
    result = [1000,1000]
    hits = 0
    distances_1 = 0
    distances_2 = 0
    for station in row["FOI_hits_S"]:
        x_distances_1 = (row["Lextra_X[" + str(station) + "]"] - row["FOI_hits_X"][hits])**2
        y_distances_1 = (row["Lextra_Y[" + str(station) + "]"] - row["FOI_hits_Y"][hits])**2
        
        x_distances_2 = x_distances_1/(PADsizeX[station]**2)
        y_distances_2 = y_distances_1/(PADsizeY[station]**2)
        
        distances_1 = distances_1 + x_distances_1 + y_distances_1
        distances_2 = distances_2 + x_distances_2 + y_distances_2
        hits = hits + 1
     
    result[0] = 1.0*distances_1/hits
    result[1] = 1.0*distances_2/hits
        
    return result

In [None]:
%%time
closest_hits_features2 = train.apply(find_closest_hit_per_station_track2, axis=1, result_type="expand")

CPU times: user 1h 47min 42s, sys: 15.2 s, total: 1h 47min 58s
Wall time: 1h 47min 57s


In [24]:
closest_hits_features.to_hdf('data_itog.hdf', mode='w', key='s')

In [17]:
closest_hits_features2.to_hdf('data_itog2.hdf', mode='w', key='s')

In [None]:
%%time
closest_hits_features_test = test.apply(find_closest_hit_per_station, axis=1, result_type="expand")

In [None]:
%%time
closest_hits_features_test2 = test.apply(find_closest_hit_per_station_track2, axis=1, result_type="expand")

In [33]:
%%time
closest_hits_features_test_private = test_private.apply(find_closest_hit_per_station, axis=1, result_type="expand")

CPU times: user 24min 33s, sys: 10.6 s, total: 24min 44s
Wall time: 24min 44s


In [34]:
%%time
closest_hits_features_test_private2 = test_private2.apply(find_closest_hit_per_station_track2, axis=1, result_type="expand")

CPU times: user 26min 5s, sys: 16.9 s, total: 26min 22s
Wall time: 26min 22s


In [None]:
closest_hits_features_test.to_hdf('data_itog_test.hdf', mode='w', key='s')

In [None]:
closest_hits_features_test2.to_hdf('data_itog_test2.hdf', mode='w', key='s')

In [35]:
closest_hits_features_test_private.to_hdf('data_itog_test_private.hdf', mode='w', key='s')

In [36]:
closest_hits_features_test_private2.to_hdf('data_itog_test_private2.hdf', mode='w', key='s')

In [None]:
avg_dist_hits = train.apply(find_avg_dist, axis=1, result_type="expand")

In [26]:
avg_dist_hits.to_hdf('avg_dist_hits.hdf', mode='w', key='s')

In [18]:
%%time
avg_dist_hits_test = test2.apply(find_avg_dist, axis=1, result_type="expand")

CPU times: user 8min 58s, sys: 419 ms, total: 8min 58s
Wall time: 8min 58s


In [19]:
avg_dist_hits_test.to_hdf('avg_dist_hits_test.hdf', mode='w', key='s')

# Base XGBoost-model

In [43]:
model = xgboost.XGBClassifier(n_jobs=-1)

In [None]:
%%time
model.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=train_part.weight.values)

[08:31:07] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
CPU times: user 2h 45min 38s, sys: 8.03 s, total: 2h 45min 46s
Wall time: 11min 46s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [None]:
%%time
validation_predictions = model.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]

CPU times: user 6.41 s, sys: 945 ms, total: 7.35 s
Wall time: 2.22 s


In [47]:
scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values)
#0.7524629661308765 (80%)

0.7556692594512752

# Catboost

In [14]:
%%time
for v_depth in range(3, 12, 1):
    model2 = catboost.CatBoostClassifier(iterations=140, depth=v_depth, l2_leaf_reg=5, learning_rate=0.1, 
                                      verbose=False, random_seed=27)
    model2.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=np.abs(train_part.weight.values), plot=False)
    validation_predictions2 = model2.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
    print(scoring.rejection90(validation.label.values, validation_predictions2, sample_weight=validation.weight.values))

#0.7210961960996494
#0.7301291892159026
#0.7422736150548254
#0.7489491364059263
#0.7513587393570458
#0.7519879050217635
#0.7513397901983102
#0.760425103597202
#0.7573003221726251
#0.7500328908000631

0.6991737755624489
0.7192388677010243
0.7222040875723421
0.7275903681694574
0.7338900780596765
0.7319827913154457
0.7377879676850043
0.7454101485322719
0.743777864020875
CPU times: user 4h 10min 51s, sys: 4min 41s, total: 4h 15min 32s
Wall time: 27min 44s


In [12]:
%%time
for v_l2_leaf_reg in range(3, 10, 1):
    model2 = catboost.CatBoostClassifier(iterations=140, depth=10, l2_leaf_reg=v_l2_leaf_reg, learning_rate=0.1, 
                                      verbose=False, random_seed=27)
    model2.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=np.abs(train_part.weight.values), plot=False)
    validation_predictions2 = model2.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
    print(scoring.rejection90(validation.label.values, validation_predictions2, sample_weight=validation.weight.values))

0.7433842562282784
0.7435192247856279
0.7454101485322719
0.742733703813112
0.740925366507245
0.7393674632659628
0.7392564553109914
CPU times: user 6h 37min 13s, sys: 2min 59s, total: 6h 40min 13s
Wall time: 35min 7s


In [15]:
%%time
#model2 = catboost.CatBoostClassifier(iterations=700, depth=10, l2_leaf_reg=6, learning_rate=0.06, rsm = 0.7,
#                                      verbose=False, random_seed=27)
for v_rsm in range(1, 11, 1):
    model2 = catboost.CatBoostClassifier(iterations=140, depth=10, l2_leaf_reg=5, learning_rate=0.1, rsm = v_rsm/10,
                                      verbose=False, random_seed=27)
    model2.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=np.abs(train_part.weight.values), plot=False)
    validation_predictions2 = model2.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
    print(scoring.rejection90(validation.label.values, validation_predictions2, sample_weight=validation.weight.values))

0.7337988326126255
0.7354050139745928
0.745507760697995
0.7412447904759015
0.7458740305064446
0.7443704048935286
0.7424584967062693
0.7417753832165102
0.7399204737898198
0.7454101485322719
CPU times: user 5h 52min 11s, sys: 4min 9s, total: 5h 56min 21s
Wall time: 37min 3s


In [16]:
%%time
for v_learning_rate in [0.025, 0.05, 0.1, 0.2, 0.3]:
    model2 = catboost.CatBoostClassifier(iterations=140, depth=10, l2_leaf_reg=5, learning_rate=v_learning_rate, 
                                      verbose=False, random_seed=27)
    model2.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=np.abs(train_part.weight.values), plot=False)
    validation_predictions2 = model2.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
    print(scoring.rejection90(validation.label.values, validation_predictions2, sample_weight=validation.weight.values))

0.7152579608624362
0.7275038516973219
0.7454101485322719
0.7411041030407106
0.7377629156163057
CPU times: user 4h 46min 27s, sys: 2min 5s, total: 4h 48min 33s
Wall time: 25min 16s


In [17]:
%%time
for v_learning_rate in [0.075, 0.125]:
    model2 = catboost.CatBoostClassifier(iterations=140, depth=10, l2_leaf_reg=5, learning_rate=v_learning_rate, 
                                      verbose=False, random_seed=27)
    model2.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=np.abs(train_part.weight.values), plot=False)
    validation_predictions2 = model2.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
    print(scoring.rejection90(validation.label.values, validation_predictions2, sample_weight=validation.weight.values))

0.7384598740396509
0.7440981186384867
CPU times: user 1h 53min 39s, sys: 51.1 s, total: 1h 54min 30s
Wall time: 10min 5s


In [None]:
%%time
for v_learning_rate in [0.09, 0.11]:
    model2 = catboost.CatBoostClassifier(iterations=140, depth=10, l2_leaf_reg=5, learning_rate=v_learning_rate, 
                                      verbose=False, random_seed=27)
    model2.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=np.abs(train_part.weight.values), plot=False)
    validation_predictions2 = model2.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
    print(scoring.rejection90(validation.label.values, validation_predictions2, sample_weight=validation.weight.values))

0.7414363990428148
0.7397084104669127
CPU times: user 1h 53min 53s, sys: 50.7 s, total: 1h 54min 44s
Wall time: 10min 4s


### Пробничек

In [21]:
model2 = catboost.CatBoostClassifier(iterations=140, depth=10, l2_leaf_reg=6, learning_rate=0.1, rsm = 0.75,
                                      verbose=False, random_seed=27)
model2.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=np.abs(train_part.weight.values), plot=False)
validation_predictions2 = model2.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
print(scoring.rejection90(validation.label.values, validation_predictions2, sample_weight=validation.weight.values))
#0.7449939644431877

0.7676367989852553


### создаем модель Catboost и делаем прогноз для Track1

In [None]:
%%time
model2 = catboost.CatBoostClassifier(iterations=5000, depth=10, l2_leaf_reg=6, learning_rate=0.05, rsm = 0.7,
                                      verbose=False, random_seed=27)
model2.fit(train.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train.label.values,
          sample_weight=np.abs(train.weight.values), plot=False)

model2.save_model('model_catboost_ver3')

CPU times: user 1d 8h 25min 54s, sys: 20min 42s, total: 1d 8h 46min 37s
Wall time: 2h 24min 47s


In [36]:
%%time
predictions = model2.predict_proba(test.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]        
pd.DataFrame(data={"prediction": predictions}, index=test.index).to_csv("sample_submission_c1.csv", index_label="id")

CPU times: user 38.5 s, sys: 893 ms, total: 39.4 s
Wall time: 12.7 s


In [None]:
%%time
model2 = catboost.CatBoostClassifier(iterations=8000, depth=10, l2_leaf_reg=6, learning_rate=0.04, rsm = 0.7,
                                      verbose=False, random_seed=27)
model2.fit(train.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train.label.values,
          sample_weight=np.abs(train.weight.values), plot=False)

model2.save_model('model_catboost_ver2')

CPU times: user 2d 2h 8min 11s, sys: 28min 6s, total: 2d 2h 36min 18s
Wall time: 3h 39min 48s


In [None]:
%%time
predictions = model2.predict_proba(test.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]        
pd.DataFrame(data={"prediction": predictions}, index=test.index).to_csv("sample_submission_c2.csv", index_label="id")

CPU times: user 1min 7s, sys: 920 ms, total: 1min 7s
Wall time: 14.3 s


### тестируем Catboost для Track2

In [12]:
%%time
#7508.48
model2 = catboost.CatBoostClassifier(iterations=772, depth=10, l2_leaf_reg=5, learning_rate=0.062, 
                                      verbose=False, random_seed=27)
model2.fit(train.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train.label.values,
          sample_weight=np.abs(train.weight.values), plot=False)

model2.save_model('model_catboost_track2_')

CPU times: user 6h 29min 27s, sys: 2min 46s, total: 6h 32min 14s
Wall time: 28min 16s


In [23]:
%%time
#7504.55
model2 = catboost.CatBoostClassifier(iterations=775, depth=10, l2_leaf_reg=5, learning_rate=0.062, 
                                      verbose=False, random_seed=27)
model2.fit(train.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train.label.values,
          sample_weight=np.abs(train.weight.values), plot=False)

model2.save_model('model_catboost_track2')

CPU times: user 6h 31min, sys: 2min 44s, total: 6h 33min 45s
Wall time: 28min 16s


In [24]:
%%time
#7515.35
model2 = catboost.CatBoostClassifier(iterations=771, depth=10, l2_leaf_reg=5, learning_rate=0.062, 
                                      verbose=False, random_seed=27)
model2.fit(train.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train.label.values,
          sample_weight=np.abs(train.weight.values), plot=False)

model2.save_model('model_catboost_track2')

CPU times: user 6h 29min 10s, sys: 2min 43s, total: 6h 31min 54s
Wall time: 28min 10s


# LightGBM

In [59]:
%%time
for v_learning_rate in [0.05, 0.1, 0.15, 0.2, 0.3, 0.4]:
    lg = lgb.LGBMClassifier(boosting_type='gbdt',
                objective = 'binary',
                learning_rate=v_learning_rate, 
                n_estimators=100, 
                max_depth = 11,         
                num_leaves=40, 
                random_state=27)

    lg.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values, 
               train_part.label.values, sample_weight=train_part.weight.values)

    validation_predictions = lg.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]

    print(str(v_learning_rate) + ": " + str(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values)))

#0.05: 0.7380773999181911
#0.1: 0.7452859696543559
#0.15: 0.7438050448998444
#0.2: 0.7418872001861442
#0.3: 0.7496258688049903
#0.4: 0.7417361506547251

0.05: 0.7438885700606036
0.1: 0.7539971555720008
0.15: 0.7599570552990959
0.2: 0.7559458305416447
0.3: 0.7498752615425813
0.4: 0.7534909275289093
CPU times: user 39min 8s, sys: 26.6 s, total: 39min 35s
Wall time: 3min 22s


In [53]:
%%time
for v_learning_rate in [0.3]:
    lg = lgb.LGBMClassifier(boosting_type='gbdt',
                objective = 'binary',
                learning_rate=v_learning_rate, 
                n_estimators=100, 
                max_depth = 11,         
                num_leaves=40, 
                random_state=27)

    lg.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values, 
               train_part.label.values, sample_weight=train_part.weight.values)

    validation_predictions = lg.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]

    print(str(v_learning_rate) + ": " + str(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values)))

0.3: 0.7518625423878622
CPU times: user 5min 49s, sys: 4.29 s, total: 5min 53s
Wall time: 30.9 s


In [60]:
%%time
            for v_colsample_bytree in [0.66,0.67,0.68]:
                lg = lgb.LGBMClassifier(boosting_type='gbdt',
                objective = 'binary',
                learning_rate = 0.3, 
                colsample_bytree = v_colsample_bytree, 
                subsample = 0.7,
                reg_alpha = 1,
                v_reg_lambda = 1,
                n_estimators = 100, 
                max_depth = 11,         
                num_leaves = 40, 
                random_state = 27)

                lg.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values, 
                           train_part.label.values, sample_weight=train_part.weight.values)

                validation_predictions = lg.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]

                print(str(v_colsample_bytree) + ": " + str(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values)))

0.66: 0.7523140372164502
0.67: 0.7523140372164502
0.68: 0.749994779410802
CPU times: user 13min 46s, sys: 13.3 s, total: 13min 59s
Wall time: 1min 18s


In [153]:
%%time
            for v_n_estimators in [100,200,300,400]:
                lg = lgb.LGBMClassifier(boosting_type='gbdt',
                objective = 'binary',
                learning_rate = 0.2, 
                colsample_bytree = 0.66, 
                subsample = 0.7,
                reg_alpha = 1,
                v_reg_lambda = 1,
                n_estimators = v_n_estimators, 
                max_depth = 11,         
                num_leaves = 40, 
                random_state = 27)

                lg.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values, 
                           train_part.label.values, sample_weight=train_part.weight.values)

                validation_predictions = lg.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]

                print(str(v_n_estimators) + ": " + str(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values)))
#0.1: 0.7489462215351488
#0.15: 0.7526157139780304
#0.2: 0.7571553959266342
#0.25: 0.7529741374159277

100: 0.7560070395023757
200: 0.7546111963984424
300: 0.7553296204679462
400: 0.7515648987383036
CPU times: user 20min 19s, sys: 17.8 s, total: 20min 37s
Wall time: 20min 37s


In [13]:
lg = lgb.LGBMClassifier(boosting_type='gbdt',
                objective = 'binary',
                learning_rate = 0.3, 
                colsample_bytree = 0.66, 
                subsample = 0.7,
                reg_alpha = 1,
                v_reg_lambda = 1,
                n_estimators = 100, 
                max_depth = 11,         
                num_leaves = 40, 
                random_state = 27)

lg.fit(train.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values, 
                           train.label.values, sample_weight=train.weight.values)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.66,
        importance_type='split', learning_rate=0.3, max_depth=11,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=27, reg_alpha=1, reg_lambda=0.0, silent=True,
        subsample=0.7, subsample_for_bin=200000, subsample_freq=0,
        v_reg_lambda=1)

In [24]:
lg2 = lgb.LGBMClassifier(boosting_type='gbdt',
                objective = 'binary',
                learning_rate = 0.3, 
                colsample_bytree = 0.66, 
                subsample = 0.7,
                reg_alpha = 1,
                v_reg_lambda = 1,
                n_estimators = 100, 
                max_depth = 11,         
                num_leaves = 40, 
                random_state = 27)

lg2.fit(train.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values, 
                           train.label.values, sample_weight=train.weight.values)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.66,
        importance_type='split', learning_rate=0.3, max_depth=11,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=40, objective='binary',
        random_state=27, reg_alpha=1, reg_lambda=0.0, silent=True,
        subsample=0.7, subsample_for_bin=200000, subsample_freq=0,
        v_reg_lambda=1)

In [74]:
%%time
predictions = lg.predict_proba(test.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]        
pd.DataFrame(data={"prediction": predictions}, index=test.index).to_csv("sample_submission_lg.csv", index_label="id")

CPU times: user 24.5 s, sys: 488 ms, total: 25 s
Wall time: 4.36 s


In [75]:
lg.booster_.save_model('lg_model3')

<lightgbm.basic.Booster at 0x7f93c81f3438>

# XGBoost: Tune max_depth and min_child_weight

In [None]:
%%time
for v_max_depth in range(3, 10, 2):
    for v_min_child_weight in range(1, 6, 2):
        model = xgboost.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=v_max_depth,
         min_child_weight=v_min_child_weight, gamma=0, subsample=0.8, colsample_bytree=0.8,
         objective= 'binary:logistic', nthread=16, scale_pos_weight=1, seed=27)
        
        model.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train_part.label.values,
          sample_weight=train_part.weight.values)
        
        validation_predictions = model.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
        
        print(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values))

[06:20:26] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7592807395815973
[06:26:05] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7592807395815973
[06:31:45] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7592807395815973
[06:37:25] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7792739743372552
[06:46:37] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7826360544662789
[06:55:49] Tree method is automatically selec

# XGBoost: Tune gamma

In [None]:
%%time
for i in range(0,5):
    model = xgboost.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
             min_child_weight=5, gamma=i/10.0, subsample=0.8, colsample_bytree=0.8,
             objective= 'binary:logistic', nthread=16, scale_pos_weight=1, seed=27)
        
    model.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
                  train_part.label.values,
                  sample_weight=train_part.weight.values)
        
    validation_predictions = model.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
        
    print(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values))

[08:36:56] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7902122356725646
[08:54:27] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7941769217644825
[09:11:54] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7951455626935715
[09:29:28] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7932563238216181
[09:46:55] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7917126045790891
CPU times: user 20h 14min 27s, sys: 40.1 s, t

# XGBoost: Tune subsample and colsample_bytree

In [None]:
%%time
for v_subsample in range(7,10):
    for v_colsample_bytree in range(7,10):
        model = xgboost.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
             min_child_weight=5, gamma=0.2, subsample=v_subsample/10.0, colsample_bytree=v_colsample_bytree/10.0,
             objective= 'binary:logistic', nthread=16, scale_pos_weight=1, seed=27)
        
        model.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
                  train_part.label.values,
                  sample_weight=train_part.weight.values)
        
        validation_predictions = model.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
        
        print(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values))

[10:04:49] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7938725119068626
[10:20:20] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7926633311619761
[10:37:46] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7964085040293832
[10:56:58] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7942975234712555
[11:12:50] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7951455626935715
[11:30:20] Tree method is automatically selec

# XGBoost: Tuning Regularization Parameters

In [None]:
%%time
for v_reg_alpha in [1e-5, 1e-2, 0.1, 1, 100]:
    model = xgboost.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
             min_child_weight=5, gamma=0.2, subsample=0.7, colsample_bytree=0.9, reg_alpha=v_reg_alpha,
             objective= 'binary:logistic', nthread=16, scale_pos_weight=1, seed=27)
        
    model.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
                  train_part.label.values,
                  sample_weight=train_part.weight.values)
        
    validation_predictions = model.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
        
    print(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values))

[13:29:28] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7963100280108867
[13:48:50] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7943658715598989
[14:08:08] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7917684628076389
[14:27:26] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7976379054256889
[14:46:41] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.8044142882183166
CPU times: user 22h 33min 42s, sys: 39.8 s, t

In [42]:
%%time
for v_reg_alpha in [50]:
    model = xgboost.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
             min_child_weight=5, gamma=0.2, subsample=0.7, colsample_bytree=0.9, reg_alpha=v_reg_alpha,
             objective= 'binary:logistic', nthread=16, scale_pos_weight=1, seed=27)
        
    model.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
                  train_part.label.values,
                  sample_weight=train_part.weight.values)
        
    validation_predictions = model.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
        
    print(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values))

[15:05:45] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.8028579103869556
CPU times: user 4h 29min 21s, sys: 7.75 s, total: 4h 29min 29s
Wall time: 19min 10s


In [43]:
%%time
for v_reg_alpha in [25,75]:
    model = xgboost.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=9,
             min_child_weight=5, gamma=0.2, subsample=0.7, colsample_bytree=0.9, reg_alpha=v_reg_alpha,
             objective= 'binary:logistic', nthread=16, scale_pos_weight=1, seed=27)
        
    model.fit(train_part.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
                  train_part.label.values,
                  sample_weight=train_part.weight.values)
        
    validation_predictions = model.predict_proba(validation.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values)[:, 1]
        
    print(scoring.rejection90(validation.label.values, validation_predictions, sample_weight=validation.weight.values))

[15:25:27] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.7921211354696326
[15:44:41] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
0.8030983338716395
CPU times: user 8h 59min 36s, sys: 15.8 s, total: 8h 59min 52s
Wall time: 38min 23s


# Final XGBoost-model

In [None]:
%%time
model = xgboost.XGBClassifier(learning_rate=0.01, n_estimators=5000, max_depth=9,
             min_child_weight=5, gamma=0.2, subsample=0.7, colsample_bytree=0.9, reg_alpha=100,
             objective= 'binary:logistic', nthread=16, scale_pos_weight=1, seed=27)
    
model.fit(train.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
          train.label.values,
          sample_weight=train.weight.values)

model.save_model('model_a2')

[14:59:54] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.


# Stacking models

In [20]:
%%time
#model = xgboost.Booster({'nthread': 1})
model = xgboost.XGBClassifier(n_jobs=-1)
model.load_model('model_a1')

CPU times: user 31.5 ms, sys: 70.4 ms, total: 102 ms
Wall time: 6.74 s


In [21]:
%%time
model2 = catboost.CatBoostClassifier()
model2.load_model('model_catboost')

CPU times: user 23.2 ms, sys: 149 ms, total: 172 ms
Wall time: 1.82 s


In [20]:
%%time
model3 = catboost.CatBoostClassifier()
model3.load_model('model_catboost_ver2')

CPU times: user 115 ms, sys: 221 ms, total: 335 ms
Wall time: 334 ms


In [135]:
#lg = lgb.Booster(model_file='lg_model')

In [22]:
%%time
predictions1 = model.predict_proba(test.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1] 

CPU times: user 11min 12s, sys: 6.24 s, total: 11min 19s
Wall time: 10min 52s


In [40]:
%%time
predictions1 = model.predict_proba(test_private.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1] 

CPU times: user 22min 58s, sys: 11.7 s, total: 23min 10s
Wall time: 23min 3s


In [23]:
%%time
predictions2 = model2.predict_proba(test.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]        

CPU times: user 49.1 s, sys: 9.5 s, total: 58.6 s
Wall time: 12 s


In [41]:
%%time
predictions2 = model2.predict_proba(test_private.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]        

CPU times: user 1min 37s, sys: 2.41 s, total: 1min 40s
Wall time: 23.3 s


In [18]:
%%time
predictions3 = lg.predict_proba(test.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]  

CPU times: user 5.05 s, sys: 493 ms, total: 5.55 s
Wall time: 5.55 s


In [25]:
%%time
predictions31 = lg2.predict_proba(test2.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]  

CPU times: user 5.14 s, sys: 925 ms, total: 6.06 s
Wall time: 6 s


In [42]:
%%time
predictions31 = lg2.predict_proba(test_private2.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]  

CPU times: user 9.78 s, sys: 1.85 s, total: 11.6 s
Wall time: 11.1 s


In [98]:
%%time
predictions4 = model3.predict_proba(test.drop(utils.FOI_COLUMNS, axis=1).values)[:, 1]     

CPU times: user 1min 4s, sys: 219 ms, total: 1min 4s
Wall time: 10.3 s


In [44]:
predictions = (predictions1 + predictions2 + predictions31)/3.0

In [46]:
%%time
pd.DataFrame(data={"prediction": predictions}, index=test_private.index).to_csv("sample_submission_s2.csv", index_label="id")

CPU times: user 4.34 s, sys: 92 ms, total: 4.43 s
Wall time: 4.42 s
