### This is the final notebook. All experimental and tuning code can be found in draft.ipynb.

In [1]:
import os
import numpy as np
import pandas as pd
import utils
import xgboost
import scoring
import catboost
import lightgbm as lgb

In [2]:
# Directory that holds the data files loaded by this notebook.
DATA_PATH = "./data"

In [3]:
%%time
# Load the training set through the project helper; the result is used as a
# pandas DataFrame in the rest of the notebook.
train = utils.load_train_hdf(DATA_PATH)

CPU times: user 2min 33s, sys: 34.4 s, total: 3min 7s
Wall time: 3min 20s


In [29]:
%%time
# Load the private test set for track 1.
# `axis` / `ignore_index` are arguments of pd.concat, not of pd.read_hdf, so
# they are not passed here; a single file needs no concatenation.
test_private = pd.read_hdf(os.path.join(DATA_PATH, 'test_private_v2_track_1.hdf'))

CPU times: user 26.5 s, sys: 6.26 s, total: 32.7 s
Wall time: 32.7 s


In [16]:
# Columns removed from the training frame (together with the FOI and
# TRAIN column groups from utils) before the feature matrix is built.
KILL_COLUMNS = [
    'sWeight',
    'particle_type',
    'label',
    'kinWeight',
    'weight',
]

# Adding new features

### Function for finding the closest hit per station

In [31]:
def find_closest_hit_per_station(row):
    """Return features of the closest hit in each of the 4 stations.

    For every hit listed in ``row["FOI_hits_S"]`` the squared x/y offsets
    between the hit and ``Lextra_X[station]`` / ``Lextra_Y[station]`` are
    computed, and for each station the hit with the smallest summed squared
    offset is kept (the first one wins on ties).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            ``FOI_hits_S/X/Y/T/Z/DX/DY`` sequences, indexed by hit, and the
            ``Lextra_X[s]`` / ``Lextra_Y[s]`` values for each station ``s``.

    Returns:
        A flat list of 24 values laid out station by station:
        ``[dx2, dy2, T, Z, DX, DY]`` for station 0, then station 1, and so
        on. Stations without any hit keep the placeholder value 1000.
    """
    n_stations = 4
    n_features = 6
    result = [1000] * (n_stations * n_features)
    # None means "no hit seen yet for this station". A numeric 0 must not be
    # used as that marker: a hit lying exactly on the extrapolated position
    # has distance 0 and would otherwise be overwritten by any later hit.
    best = [None] * n_stations
    for hit, station in enumerate(row["FOI_hits_S"]):
        dx2 = (row["Lextra_X[" + str(station) + "]"] - row["FOI_hits_X"][hit]) ** 2
        dy2 = (row["Lextra_Y[" + str(station) + "]"] - row["FOI_hits_Y"][hit]) ** 2
        dist2 = dx2 + dy2
        if best[station] is None or best[station] > dist2:
            best[station] = dist2
            base = station * n_features
            result[base:base + n_features] = [
                dx2,
                dy2,
                row["FOI_hits_T"][hit],
                row["FOI_hits_Z"][hit],
                row["FOI_hits_DX"][hit],
                row["FOI_hits_DY"][hit],
            ]
    return result

### The same function for finding the closest hit per station, but with the features in the same order as in "advanced_baseline". I decided to use two functions with different feature orders because LightGBM gave different results depending on the order of the features (it's strange, but useful).

In [32]:
def find_closest_hit_per_station2(row):
    """Return features of the closest hit per station, grouped by feature.

    For every hit listed in ``row["FOI_hits_S"]`` the squared x/y offsets
    between the hit and ``Lextra_X[station]`` / ``Lextra_Y[station]`` are
    computed, and for each station the hit with the smallest summed squared
    offset is kept (the first one wins on ties).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            ``FOI_hits_S/X/Y/T/Z/DX/DY`` sequences, indexed by hit, and the
            ``Lextra_X[s]`` / ``Lextra_Y[s]`` values for each station ``s``.

    Returns:
        A flat list of 24 values laid out feature by feature: the 4
        per-station ``dx2`` values, then the 4 ``dy2`` values, then ``T``,
        ``Z``, ``DX`` and ``DY``. Stations without any hit keep the
        placeholder value 1000.
    """
    n_stations = 4
    n_features = 6
    result = [1000] * (n_stations * n_features)
    # None means "no hit seen yet for this station". A numeric 0 must not be
    # used as that marker: a hit lying exactly on the extrapolated position
    # has distance 0 and would otherwise be overwritten by any later hit.
    best = [None] * n_stations
    for hit, station in enumerate(row["FOI_hits_S"]):
        dx2 = (row["Lextra_X[" + str(station) + "]"] - row["FOI_hits_X"][hit]) ** 2
        dy2 = (row["Lextra_Y[" + str(station) + "]"] - row["FOI_hits_Y"][hit]) ** 2
        dist2 = dx2 + dy2
        if best[station] is None or best[station] > dist2:
            best[station] = dist2
            values = (
                dx2,
                dy2,
                row["FOI_hits_T"][hit],
                row["FOI_hits_Z"][hit],
                row["FOI_hits_DX"][hit],
                row["FOI_hits_DY"][hit],
            )
            # Feature k of this station lives at index k * n_stations + station.
            for feature, value in enumerate(values):
                result[feature * n_stations + station] = value
    return result

In [None]:
%%time
# Row-wise feature extraction (station-major layout); each returned list is
# expanded into 24 separate columns.
closest_hits_features = train.apply(
    find_closest_hit_per_station,
    axis="columns",
    result_type="expand",
)

In [None]:
%%time
# Row-wise feature extraction (feature-major layout); each returned list is
# expanded into 24 separate columns.
closest_hits_features2 = train.apply(
    find_closest_hit_per_station2,
    axis="columns",
    result_type="expand",
)

CPU times: user 1h 47min 42s, sys: 15.2 s, total: 1h 47min 58s
Wall time: 1h 47min 57s


### I will use two training datasets with different orders of the new features

In [None]:
%%time
# Second training frame: original columns plus the feature-major closest-hit
# features. Built before `train` itself is extended in the next cell.
train2 = pd.concat(
    [train, closest_hits_features2],
    axis="columns",
)

In [None]:
%%time
# First training frame: original columns plus the station-major closest-hit
# features.
train = pd.concat(
    [train, closest_hits_features],
    axis="columns",
)

### The same operations for the test dataset

In [None]:
%%time
# Same station-major feature extraction, applied to the test rows.
closest_hits_features_test = test_private.apply(
    find_closest_hit_per_station,
    axis="columns",
    result_type="expand",
)

In [None]:
%%time
# Same feature-major feature extraction, applied to the test rows.
closest_hits_features_test2 = test_private.apply(
    find_closest_hit_per_station2,
    axis="columns",
    result_type="expand",
)

In [None]:
%%time
# Second test frame: original test columns plus the feature-major closest-hit
# features computed from the TEST rows. Attaching the features computed from
# `train` would align unrelated rows by index and corrupt the test set.
test_private2 = pd.concat([test_private, closest_hits_features_test2], axis=1)

In [None]:
%%time
# First test frame: original test columns plus the station-major closest-hit
# features computed from the TEST rows. Attaching the features computed from
# `train` would align unrelated rows by index and corrupt the test set.
test_private = pd.concat([test_private, closest_hits_features_test], axis=1)

# Catboost (with first variant of train and test-dataset) for track 1

In [None]:
%%time
# CatBoost model for track 1, trained on the station-major feature frame.
cb_params = dict(
    iterations=5000,
    depth=10,
    l2_leaf_reg=6,
    learning_rate=0.05,
    rsm=0.7,
    verbose=False,
    random_seed=27,
)
model_cb = catboost.CatBoostClassifier(**cb_params)

# Everything that is not a model feature is dropped before fitting.
cb_excluded = utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS
model_cb.fit(
    train.drop(cb_excluded, axis=1).values,
    train.label.values,
    # Absolute values are used for the sample weights.
    sample_weight=np.abs(train.weight.values),
    plot=False,
)

CPU times: user 1d 8h 25min 54s, sys: 20min 42s, total: 1d 8h 46min 37s
Wall time: 2h 24min 47s


In [36]:
%%time
# Second column of predict_proba for the test rows.
predictions_cb = model_cb.predict_proba(
    test_private.drop(utils.FOI_COLUMNS, axis=1).values
)[:, 1]

CPU times: user 38.5 s, sys: 893 ms, total: 39.4 s
Wall time: 12.7 s


# LightGBM for track 1

### As I wrote earlier, a different order of the new features gave me different LightGBM results, and therefore different results in the later ensembling of the models. The best result was obtained by using the second variant of the train and test datasets for LightGBM

In [None]:
%%time
# LightGBM model for track 1, trained on the feature-major frame (train2).
lg = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.3,
    colsample_bytree=0.66,
    # NOTE(review): subsample presumably only takes effect when
    # subsample_freq > 0 - confirm against the LightGBM docs.
    subsample=0.7,
    reg_alpha=1,
    # Was misspelled `v_reg_lambda`, which is not a LightGBM parameter, so the
    # L2 penalty was never applied. Fixing the name changes the trained model
    # compared with runs made using the typo.
    reg_lambda=1,
    n_estimators=100,
    max_depth=11,
    num_leaves=40,
    random_state=27,
)

# Everything that is not a model feature is dropped before fitting.
lg.fit(
    train2.drop(utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS, axis=1).values,
    train2.label.values,
    sample_weight=train2.weight.values,
)

In [74]:
%%time
# Second column of predict_proba for the feature-major test frame.
predictions_lgb = lg.predict_proba(
    test_private2.drop(utils.FOI_COLUMNS, axis=1).values
)[:, 1]

CPU times: user 24.5 s, sys: 488 ms, total: 25 s
Wall time: 4.36 s


# XGBoost (with first variant of train and test-dataset) for track 1

In [None]:
%%time
# XGBoost model for track 1, trained on the station-major feature frame.
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=9,
    min_child_weight=5,
    gamma=0.2,
    subsample=0.7,
    colsample_bytree=0.9,
    reg_alpha=100,
    objective='binary:logistic',
    nthread=16,
    scale_pos_weight=1,
    seed=27,
)
model_xgb = xgboost.XGBClassifier(**xgb_params)

# Everything that is not a model feature is dropped before fitting.
xgb_excluded = utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS
model_xgb.fit(
    train.drop(xgb_excluded, axis=1).values,
    train.label.values,
    sample_weight=train.weight.values,
)

[14:59:54] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.


In [None]:
# Second column of predict_proba for the test rows.
predictions_xgb = model_xgb.predict_proba(
    test_private.drop(utils.FOI_COLUMNS, axis=1).values
)[:, 1]

# Ensembling results for track1 and submitting

In [44]:
# Unweighted average of the three models' predictions.
predictions = (
    predictions_cb
    + predictions_lgb
    + predictions_xgb
) / 3.0

In [46]:
%%time
# Write the averaged predictions, keyed by the test frame's index, to the
# submission file.
submission = pd.DataFrame(data={"prediction": predictions}, index=test_private.index)
submission.to_csv("submission_private.csv", index_label="id")

CPU times: user 4.34 s, sys: 92 ms, total: 4.43 s
Wall time: 4.42 s


# Catboost for track2

In [None]:
%%time
# 7515.35 (presumably the score reached with these settings - confirm)
# CatBoost model for track 2, trained on the feature-major frame (train2).
cb2_params = dict(
    iterations=771,
    depth=10,
    l2_leaf_reg=5,
    learning_rate=0.062,
    verbose=False,
    random_seed=27,
)
model_cb2 = catboost.CatBoostClassifier(**cb2_params)

# Everything that is not a model feature is dropped before fitting.
cb2_excluded = utils.FOI_COLUMNS + utils.TRAIN_COLUMNS + KILL_COLUMNS
model_cb2.fit(
    train2.drop(cb2_excluded, axis=1).values,
    train2.label.values,
    # Absolute values are used for the sample weights.
    sample_weight=np.abs(train2.weight.values),
    plot=False,
)

# Persist the fitted model to disk.
model_cb2.save_model('model_catboost_track2')