In [4]:
import os
import pandas as pd
import numpy as np
import xgboost
import scoring
from sklearn.model_selection import train_test_split

%matplotlib inline
from matplotlib import pyplot as plt

from itertools import repeat

# Directory (relative to the notebook) that holds the input CSV archives.
DATA_PATH = "../data"

# Raw columns read from the input CSVs. Most quantities come as four columns
# indexed [0]..[3], so those names are generated rather than typed by hand.
# The resulting list is element-for-element what the hand-written literal was.
SIMPLE_FEATURE_COLUMNS = [
    f'{prefix}[{stage}]'
    for prefix in ('ncl', 'avg_cs')
    for stage in range(4)
] + ['ndof'] + [
    f'MatchedHit_{quantity}[{stage}]'
    for quantity in ('TYPE', 'X', 'Y', 'Z', 'DX', 'DY', 'DZ', 'T', 'DT')
    for stage in range(4)
] + [
    f'Lextra_{axis}[{stage}]'
    for axis in ('X', 'Y')
    for stage in range(4)
] + ['NShared'] + [
    f'Mextra_D{axis}2[{stage}]'
    for axis in ('X', 'Y')
    for stage in range(4)
] + ['FOI_hits_N', 'PT', 'P']

## Track1

In [None]:
# Columns used by the Track 1 model: a mix of raw input columns (e.g. 'PT',
# 'NShared') and columns derived by build_features() (e.g. 'Delta_D_norm[3]').
# The order is significant: `train[features].values` is handed to the
# classifier as a positional matrix, so the saved model expects this order.
# NOTE: the name `features` is rebound in the Track 2 section further down;
# re-run this cell before re-running any Track 1 cell.
features = [
    'PT', 'NShared', 'ncl[0]', 'Delta_D_norm[3]', 'Delta_D_norm[0]',
    'Delta_X_norm[1]', 'Delta_D_norm[1]', 'Delta_D_norm[2]', 'ncl[2]',
    'Delta_X_norm[0]', 'ncl[3]', 'Mextra_DX[3]', 'MatchedHit_R[1]', 'P', 'ncl[1]',
    'D_Delta_R[1]', 'FOI_hits_N', 'Lextra_Y[0]', 'Delta_R[3]', 'avg_cs[0]',
    'avg_cs[1]', 'MatchedHit_R[3]', 'MatchedHit_T[2]', 'Delta_X_norm[3]',
    'D_Delta_R[2]', 'avg_cs[3]', 'Delta_L_norm[1]', 'MatchedHit_Y[1]',
    'MatchedHit_Y[3]', 'MatchedHit_DT[0]', 'MatchedHit_R[2]', 'Delta_Y_norm[0]',
    'MatchedHit_TYPE[1]', 'MatchedHit_Z[3]', 'MatchedHit_T[3]', 'Delta_L_norm[0]',
    'MatchedHit_T[1]', 'Delta_L_norm[2]', 'MatchedHit_X[3]', 'MatchedHit_TYPE[0]',
    'D_Delta_X[3]', 'Delta_X_norm[2]', 'MatchedHit_DZ[3]', 'MatchedHit_Z[2]',
    'MatchedHit_Z[1]', 'MatchedHit_Y[0]', 'MatchedHit_Z[0]', 'Delta_R_norm[3]',
    'avg_cs[2]', 'MatchedHit_DZ[2]', 'D_Delta_X[1]', 'Delta_Y[2]'
]

### Feature engineering

In [2]:
%%time

def build_features(sample):
    """Add the derived geometric feature columns to ``sample`` in place.

    For every stage ``s`` in 0..3 the following columns are written:

    * ``MatchedHit_R[s]``, ``Lextra_R[s]`` -- ``sqrt(X**2 + Y**2)`` of the
      matched-hit and ``Lextra`` coordinates;
    * ``MatchedHit_DR[s]`` -- ``sqrt(MatchedHit_DX**2 + MatchedHit_DY**2)``;
    * ``Mextra_DR2[s]`` -- ``Mextra_DX2 + Mextra_DY2``;
    * ``Mextra_DX[s]``, ``Mextra_DY[s]``, ``Mextra_DR[s]`` -- square roots of
      the corresponding ``Mextra_D*2`` columns;
    * ``D_Delta_{X,Y,R}[s]`` -- ``sqrt(MatchedHit_D{ax}**2 + Mextra_D{ax}2)``,
      used as the normalisation factor below;
    * ``Delta_{X,Y,R}[s]`` -- ``MatchedHit_{ax} - Lextra_{ax}`` and
      ``Delta_{X,Y,R}_norm[s]`` -- ``|Delta| / D_Delta``;
    * ``Delta_D[s]`` -- ``sqrt(Delta_X**2 + Delta_Y**2)`` and
      ``Delta_D_norm[s]`` -- ``Delta_D / D_Delta_R``;
    * ``Delta_L[s]`` -- ``sqrt(max(Delta_D**2 - Delta_R**2, 0))`` and
      ``Delta_L_norm[s]`` -- ``Delta_L / D_Delta_R``.

    No guard is applied against a zero normalisation factor; such rows get
    whatever pandas/numpy division produces (inf or NaN).

    Parameters
    ----------
    sample : pandas.DataFrame
        Must contain ``MatchedHit_{X,Y,DX,DY}[s]``, ``Lextra_{X,Y}[s]`` and
        ``Mextra_{DX2,DY2}[s]`` for ``s`` in 0..3. It is modified in place.

    Returns
    -------
    None
    """
    stages = (0, 1, 2, 3)
    axes = ('X', 'Y', 'R')

    # ------ Radii and uncertainties ------

    # hit radii and coordinate uncertainties
    for stage in stages:
        sample[f'MatchedHit_R[{stage}]'] = np.sqrt(sample[f'MatchedHit_X[{stage}]']**2 + sample[f'MatchedHit_Y[{stage}]']**2)
        sample[f'Lextra_R[{stage}]'] = np.sqrt(sample[f'Lextra_X[{stage}]']**2 + sample[f'Lextra_Y[{stage}]']**2)
        sample[f'MatchedHit_DR[{stage}]'] = np.sqrt(sample[f'MatchedHit_DX[{stage}]']**2 + sample[f'MatchedHit_DY[{stage}]']**2)
        sample[f'Mextra_DR2[{stage}]'] = sample[f'Mextra_DX2[{stage}]'] + sample[f'Mextra_DY2[{stage}]']

    # extrapolated uncertainty of the coordinates and radius (take square roots)
    for stage in stages:
        for ax in axes:
            sample[f'Mextra_D{ax}[{stage}]'] = np.sqrt(sample[f'Mextra_D{ax}2[{stage}]'])

    # ------ Deviations between extrapolated and measured values ------

    # uncertainty of the measured-minus-extrapolated difference (normalisation factors)
    for stage in stages:
        for ax in axes:
            sample[f'D_Delta_{ax}[{stage}]'] = np.sqrt(sample[f'MatchedHit_D{ax}[{stage}]']**2 + sample[f'Mextra_D{ax}2[{stage}]'])

    # deviation of the coordinates and radius
    for stage in stages:
        for ax in axes:
            sample[f'Delta_{ax}[{stage}]'] = sample[f'MatchedHit_{ax}[{stage}]'] - sample[f'Lextra_{ax}[{stage}]']
            sample[f'Delta_{ax}_norm[{stage}]'] = sample[f'Delta_{ax}[{stage}]'].abs()/sample[f'D_Delta_{ax}[{stage}]']

    # distance
    for stage in stages:
        sample[f'Delta_D[{stage}]'] = np.sqrt(sample[f'Delta_X[{stage}]']**2 + sample[f'Delta_Y[{stage}]']**2)
        sample[f'Delta_D_norm[{stage}]'] = sample[f'Delta_D[{stage}]']/sample[f'D_Delta_R[{stage}]']

    # longitudinal deviation; clip(0) keeps rounding noise from producing
    # a negative argument to sqrt
    for stage in stages:
        sample[f'Delta_L[{stage}]'] = np.sqrt((sample[f'Delta_D[{stage}]']**2 - sample[f'Delta_R[{stage}]']**2).clip(0))
        sample[f'Delta_L_norm[{stage}]'] = sample[f'Delta_L[{stage}]']/sample[f'D_Delta_R[{stage}]']

# dtypes for the raw CSVs: float32 for every raw feature and the weight,
# int64 for the id and label columns.
types = dict.fromkeys(SIMPLE_FEATURE_COLUMNS + ['weight'], np.float32)
types.update({'id': np.int64, 'label': np.int64})

train_columns = SIMPLE_FEATURE_COLUMNS + ['id', 'label', 'weight']
test_columns = SIMPLE_FEATURE_COLUMNS + ['id']

# The training data comes in two files; stack them and renumber the rows
# from zero (ignore_index=True replaces the ids read from the files).
train_paths = [os.path.join(DATA_PATH, "train_part_%i.csv.gz" % part) for part in (1, 2)]
train = pd.concat(
    [pd.read_csv(path, usecols=train_columns, index_col='id', dtype=types)
     for path in train_paths],
    axis=0, ignore_index=True)

test_public = pd.read_csv(
    os.path.join(DATA_PATH, "test_public.csv.gz"),
    usecols=test_columns, index_col='id', dtype=types)

test_private = pd.read_csv(
    os.path.join(DATA_PATH, 'test_private_v3_track_1.csv.gz'),
    usecols=test_columns, index_col='id', dtype=types)

# build_features adds the derived columns to each frame in place.
build_features(train)
build_features(test_public)
build_features(test_private)

# Cache only the selected columns so the training cell can start from
# these smaller files.
train_output_columns = features + ['label', 'weight']
train.to_csv('track1_train.csv', columns=train_output_columns, index_label='id')
test_public.to_csv('track1_test_public.csv', columns=features, index_label='id')
test_private.to_csv('track1_test_private.csv', columns=features, index_label='id')

  mask |= (ar1 == a)


CPU times: user 8min 34s, sys: 18.4 s, total: 8min 52s
Wall time: 8min 45s


### Model training

In [6]:
%%time

# dtypes for the cached feature files: float32 everywhere except the
# integer id and label columns.
types = dict.fromkeys(features + ['id', 'label', 'weight'], np.float32)
types.update({'id': np.int64, 'label': np.int64})

labelled_columns = features + ['id', 'label', 'weight']
unlabelled_columns = features + ['id']

train = pd.read_csv(
    'track1_train.csv', usecols=labelled_columns, index_col='id', dtype=types)

test_public = pd.read_csv(
    'track1_test_public.csv', usecols=unlabelled_columns, index_col='id', dtype=types)

test_private = pd.read_csv(
    'track1_test_private.csv', usecols=unlabelled_columns, index_col='id', dtype=types)

xgb_params = {'n_estimators': 500}

# Fit on the whole training set (no hold-out here), weighting each row by
# its `weight` column, then persist the booster to disk.
model = xgboost.XGBClassifier(n_jobs=3, random_state=0, **xgb_params)
model.fit(
    train[features].values,
    train['label'].values,
    sample_weight=train['weight'].values,
)
model.save_model("track1.xgb")

  mask |= (ar1 == a)


CPU times: user 11h 9min 31s, sys: 1min 33s, total: 11h 11min 4s
Wall time: 3h 48min 58s


In [7]:
# Second column of predict_proba for every public-test row, written as a
# zip-compressed CSV keyed by id.
predictions_public = model.predict_proba(test_public[features].values)[:, 1]
submission_public = pd.DataFrame({"prediction": predictions_public}, index=test_public.index)
submission_public.to_csv("track1_public.csv", index_label='id', compression='zip')

In [8]:
# Second column of predict_proba for every private-test row, written as a
# zip-compressed CSV keyed by id.
predictions_private = model.predict_proba(test_private[features].values)[:, 1]
submission_private = pd.DataFrame({"prediction": predictions_private}, index=test_private.index)
submission_private.to_csv("track1_private.csv", index_label='id', compression='zip')

## Track2

In [None]:
# Reduced column set for the Track 2 model. Every derived column listed here
# (Delta_D_norm[0..3], Delta_X_norm[0], [1], [3]) is produced by
# build_features_fast(); the rest are raw input columns.
# NOTE: this rebinds the `features` name that the Track 1 cells also use, so
# cells executed after this one see the Track 2 list.
features = [
    'NShared', 'PT', 'Delta_D_norm[3]', 'Delta_X_norm[1]', 'Delta_D_norm[0]',
    'FOI_hits_N', 'Delta_X_norm[3]', 'Delta_D_norm[2]', 'MatchedHit_TYPE[1]',
    'Delta_D_norm[1]', 'ncl[2]', 'P', 'Delta_X_norm[0]'
]

### Feature engineering

In [7]:
%%time

def build_features_fast(sample):
    """Add the small Track 2 set of derived columns to ``sample`` in place.

    Columns written (``s`` is the stage index):

    * ``Delta_X[s]`` for s in 0..3 -- ``MatchedHit_X - Lextra_X``;
    * ``Delta_X_norm[s]`` for s in 0, 1, 3 only --
      ``|Delta_X| / sqrt(MatchedHit_DX**2 + Mextra_DX2)``;
    * ``Delta_D_norm[s]`` for s in 0..3 --
      ``sqrt(Delta_X**2 + (MatchedHit_Y - Lextra_Y)**2)`` divided by
      ``sqrt(MatchedHit_DX**2 + MatchedHit_DY**2 + Mextra_DX2 + Mextra_DY2)``.

    Returns None; ``sample`` must be a pandas DataFrame holding the
    referenced ``MatchedHit_*``, ``Lextra_*`` and ``Mextra_*`` columns.
    """
    every_stage = (0, 1, 2, 3)

    # coordinate deviation
    for s in every_stage:
        measured_x = sample[f'MatchedHit_X[{s}]']
        extrapolated_x = sample[f'Lextra_X[{s}]']
        sample[f'Delta_X[{s}]'] = measured_x - extrapolated_x

    # normalised coordinate deviation (stage 2 is not computed)
    for s in (0, 1, 3):
        sigma_x = np.sqrt(sample[f'MatchedHit_DX[{s}]']**2 + sample[f'Mextra_DX2[{s}]'])
        sample[f'Delta_X_norm[{s}]'] = sample[f'Delta_X[{s}]'].abs() / sigma_x

    # distance
    for s in every_stage:
        delta_y = sample[f'MatchedHit_Y[{s}]'] - sample[f'Lextra_Y[{s}]']
        distance = np.sqrt(sample[f'Delta_X[{s}]']**2 + delta_y**2)
        sigma = np.sqrt(
            sample[f'MatchedHit_DX[{s}]']**2
            + sample[f'MatchedHit_DY[{s}]']**2
            + sample[f'Mextra_DX2[{s}]']
            + sample[f'Mextra_DY2[{s}]']
        )
        sample[f'Delta_D_norm[{s}]'] = distance / sigma

        
# dtypes for the raw CSVs: float32 for every raw feature and the weight,
# int64 for the id and label columns.
types = dict.fromkeys(SIMPLE_FEATURE_COLUMNS + ['weight'], np.float32)
types.update({'id': np.int64, 'label': np.int64})

train_columns = SIMPLE_FEATURE_COLUMNS + ['id', 'label', 'weight']
test_columns = SIMPLE_FEATURE_COLUMNS + ['id']

# The training data comes in two files; stack them and renumber the rows
# from zero (ignore_index=True replaces the ids read from the files).
train_paths = [os.path.join(DATA_PATH, "train_part_%i.csv.gz" % part) for part in (1, 2)]
train = pd.concat(
    [pd.read_csv(path, usecols=train_columns, index_col='id', dtype=types)
     for path in train_paths],
    axis=0, ignore_index=True)

test_public = pd.read_csv(
    os.path.join(DATA_PATH, "test_public.csv.gz"),
    usecols=test_columns, index_col='id', dtype=types)

# build_features_fast adds the derived columns to each frame in place.
build_features_fast(train)
build_features_fast(test_public)

# Cache only the selected columns for the training cell.
train_output_columns = features + ['label', 'weight']
train.to_csv('track2_train.csv', columns=train_output_columns, index_label='id')
test_public.to_csv('track2_test_public.csv', columns=features, index_label='id')

# Processing of the private test set is switched off here; the statements
# are kept (commented out) so they can be re-enabled.
# test_private = pd.read_csv(os.path.join(DATA_PATH, 'test_private_v3_track_2.csv.gz'),
#                            usecols=test_columns, index_col='id', dtype=types)
# build_features_fast(test_private)
# test_private.to_csv('track2_test_private.csv', columns=features, index_label='id')

CPU times: user 1min 22s, sys: 2.78 s, total: 1min 25s
Wall time: 1min 24s


### Model training

In [11]:
%%time

# dtypes for the cached feature files: float32 everywhere except the
# integer id and label columns.
types = dict.fromkeys(features + ['id', 'label', 'weight'], np.float32)
types.update({'id': np.int64, 'label': np.int64})

labelled_columns = features + ['id', 'label', 'weight']
unlabelled_columns = features + ['id']

train = pd.read_csv(
    'track2_train.csv', usecols=labelled_columns, index_col='id', dtype=types)

test_public = pd.read_csv(
    'track2_test_public.csv', usecols=unlabelled_columns, index_col='id', dtype=types)

# Loading of the private test set is switched off (kept for reference).
# test_private = pd.read_csv('track2_test_private.csv',
#                            usecols=unlabelled_columns, index_col='id', dtype=types)

xgb_params = {'n_estimators': 20}

# Hold out a quarter of the rows (fixed seed) to measure the score.
train_part, validation = train_test_split(
    train, test_size=0.25, shuffle=True, random_state=0)

# The model is fitted on the 75% split only; that same model is used for the
# public predictions and is the one saved to disk below.
model = xgboost.XGBClassifier(n_jobs=-1, random_state=0, **xgb_params)
model.fit(
    train_part[features].values,
    train_part['label'].values,
    sample_weight=train_part['weight'].values,
)

validation_predictions = model.predict_proba(validation[features].values)[:, 1]
score = scoring.rejection90(
    validation['label'].values,
    validation_predictions,
    sample_weight=validation['weight'].values,
)
print(f'score={score}')

predictions_public = model.predict_proba(test_public[features].values)[:, 1]
submission_public = pd.DataFrame({"prediction": predictions_public}, index=test_public.index)
submission_public.to_csv("track2_public.csv", index_label='id')

model.save_model("track2.xgb")

  mask |= (ar1 == a)


score=0.731123685836792
CPU times: user 3min 36s, sys: 25.7 s, total: 4min 1s
Wall time: 1min 30s
