In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KDTree

import hdbscan
from sklearn.neighbors import NearestNeighbors
import pickle as pkl
import gc

from itertools import combinations
from tqdm import tqdm_notebook as tqdm

from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns

import mlcrate as mlc


from trackml.dataset import load_event
import pickle as pkl

pd.options.display.max_columns = 200

In [2]:
def score_event(truth, submission):
    """Compute the weighted double-majority score of a track assignment.

    A (track, particle) pair is *valid* when the hits they share make up
    more than half of the track's hits and more than half of the particle's
    hits. The score is the sum of ``weight`` over all hits belonging to a
    valid pair.

    Parameters
    ----------
    truth : pandas.DataFrame
        Must contain the columns ``hit_id``, ``particle_id`` and ``weight``.
    submission : pandas.DataFrame
        Must contain the columns ``hit_id`` and ``track_id``. Hits that are
        absent from ``submission`` are treated as unassigned: they contribute
        nothing to the score but still count toward their particle's size.

    Returns
    -------
    float
        Sum of the weights of all hits in valid (track, particle) pairs.
    """
    hits = truth[['hit_id', 'particle_id', 'weight']].merge(submission, how='left', on='hit_id')

    # Hits shared by every (track, particle) pair. groupby drops rows whose
    # track_id is NaN, i.e. hits that the submission did not assign.
    pairs = hits.groupby(['track_id', 'particle_id']).hit_id.count().to_frame('count_both').reset_index()

    # Particle size is taken from *all* truth hits (assigned or not), so a
    # submission cannot shrink a particle by omitting some of its hits.
    particle_sizes = hits.groupby('particle_id').hit_id.count().to_frame('count_particle').reset_index()
    pairs = pairs.merge(particle_sizes, how='left', on='particle_id')

    track_sizes = pairs.groupby('track_id').count_both.sum().to_frame('count_track').reset_index()
    pairs = pairs.merge(track_sizes, how='left', on='track_id')

    pairs['valid'] = pairs.count_both > 0.5 * np.maximum(pairs.count_particle, pairs.count_track)
    hits = hits.merge(pairs[['track_id', 'particle_id', 'valid']], how='left', on=['track_id', 'particle_id'])

    # Unassigned hits end up with valid == NaN after the left merge; compare
    # against True explicitly so the mask is strictly boolean.
    is_valid = hits['valid'].eq(True)
    score = hits.loc[is_valid, 'weight'].sum()
    return score

In [3]:
def score_data(data):
    """Score a frame that already carries both truth and track assignments.

    ``data`` must provide ``hit_id``, ``particle_id``, ``track_id`` and
    ``weight``. The frame is modified in place: the helper columns
    ``count_both``, ``count_particle``, ``count_track`` and ``valid`` are
    added (or overwritten).

    A hit is valid when its (track, particle) pair holds more than half of
    the particle's hits and more than half of the track's hits. The return
    value is the summed ``weight`` of the valid hits.
    """
    def hits_per(keys):
        # Per-row number of hits in the group identified by `keys`.
        return data.groupby(keys).hit_id.transform('count')

    data['count_both'] = hits_per(['track_id', 'particle_id'])
    data['count_particle'] = hits_per(['particle_id'])
    data['count_track'] = hits_per(['track_id'])

    majority_of_particle = data['count_both'] > 0.5 * data['count_particle']
    majority_of_track = data['count_both'] > 0.5 * data['count_track']
    data['valid'] = majority_of_particle & majority_of_track

    total_weight = data.loc[data['valid'], 'weight'].sum()

    # Done after scoring, so it only changes the stored column: rows with
    # track_id == 0 report a track size of 0 from here on.
    data.loc[data['track_id'] == 0, 'count_track'] = 0
    return total_weight

In [4]:
# Load events 1010-1099 from train_1, join hits with their truth and particle
# records, and keep only weighted hits of particles that start near the
# origin. The per-event frames are collected in `data_l`, concatenated into
# `data`, and shuffled reproducibly.
data_l = []
for i in range(10, 100):
    event = '../input/train_1/event0000010%d' % i
    print('event:', event)
    hits, cells, particles, truth = load_event(event)

    # The merges return new frames, so `hits` itself is never modified.
    data = (
        hits
        .merge(truth, how='left', on='hit_id')
        .merge(particles, how='left', on='particle_id')
    )

    # Transverse distance of (vx, vy) from the z axis; vx/vy/vz are presumably
    # the particle's production vertex per the TrackML schema -- TODO confirm.
    data['rv'] = np.sqrt(data.vx**2 + data.vy**2)

    # Apply every row filter in a single mask and copy exactly once, so that
    # the column assignment below writes to an independent frame rather than
    # to a slice of a slice (which triggers SettingWithCopyWarning).
    keep = (data.rv <= 1) & (data.vz <= 50) & (data.vz >= -50) & (data.weight > 0)
    data = data[keep].copy()
    data['event_id'] = i

    data_l.append(data)

data = pd.concat(data_l, axis=0)
# Shuffle all rows; the fixed seed keeps the order reproducible.
data = data.sample(frac=1, random_state=0)

event: ../input/train_1/event000001010
event: ../input/train_1/event000001011
event: ../input/train_1/event000001012
event: ../input/train_1/event000001013
event: ../input/train_1/event000001014
event: ../input/train_1/event000001015
event: ../input/train_1/event000001016
event: ../input/train_1/event000001017
event: ../input/train_1/event000001018
event: ../input/train_1/event000001019
event: ../input/train_1/event000001020
event: ../input/train_1/event000001021
event: ../input/train_1/event000001022
event: ../input/train_1/event000001023
event: ../input/train_1/event000001024
event: ../input/train_1/event000001025
event: ../input/train_1/event000001026
event: ../input/train_1/event000001027
event: ../input/train_1/event000001028
event: ../input/train_1/event000001029
event: ../input/train_1/event000001030
event: ../input/train_1/event000001031
event: ../input/train_1/event000001032
event: ../input/train_1/event000001033
event: ../input/train_1/event000001034
event: ../input/train_1/e

In [5]:
# Fold (volume_id, layer_id) into a single integer code per hit.
data['layer'] = data.volume_id * 100 + data.layer_id

# Order hits by particle and then by z, so that each particle's layer codes
# are listed in increasing-z order.
data = data.sort_values(by=['particle_id', 'z']).reset_index(drop=True)

# One row per (event, particle): its layer codes as a space-separated string.
df = (
    data.groupby(['event_id', 'particle_id'])
    .layer.apply(lambda codes: ' '.join(map(str, codes)))
    .to_frame('layers')
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,layers
event_id,particle_id,Unnamed: 2_level_1
10,4503805785800704,1704 1702 1308 1306 1304 1302 1302 808 806 804...
10,4503943224754176,802 802 804 806 1302 1402 1404 1406 1408 1810 ...
10,4504011944230912,802 804 806 902 1302 1402 1402 1404 1406 1408 ...
10,4504080663707648,802 804 806 1302 1302 1302 1304 1402 1404 1406...
10,4504149383184384,802 804 806 806 808 1302 1304 1304 1402 1404 1...
10,4504286822137856,802 802 802 804 804 902 904 906 908 910 1406
10,4504355541614592,802 802 804 902 904 904 906 906 908 908 910 14...
10,4504699138998272,802 802 804 902 902 904 904 906 906
10,4504974016905216,802 902 904 906 908 910 1406 1408 1410 1412
10,4505248894812160,802 804 806 1302 1402 1404 1406 1408 1810 1812


In [6]:
from collections import Counter

# Length of the sliding window of consecutive layer codes that is counted.
NGRAM_LEN = 4

# Maps each space-joined run of NGRAM_LEN consecutive layer codes to the
# number of times it occurs across all particles in `df`.
cnt = Counter()

# Iterating the column (instead of a bare generator) together with an
# explicit total lets the progress bar show completion and an ETA.
for layer_string in tqdm(df['layers'], total=len(df)):
    layers = layer_string.split()
    # Particles with fewer than NGRAM_LEN hits yield an empty range and
    # therefore contribute nothing.
    cnt.update(
        ' '.join(layers[start:start + NGRAM_LEN])
        for start in range(len(layers) - NGRAM_LEN + 1)
    )
    




In [7]:
# Serialize the `cnt` counter to disk with pickle.
with open('../data/layers_4_center_fix.pkl', mode='wb') as pickle_out:
    pkl.dump(cnt, pickle_out)

In [8]:
# Show the ten most frequent layer sequences together with their counts.
cnt.most_common(10)

[('802 804 806 808', 86119),
 ('808 806 804 802', 86049),
 ('804 806 808 1302', 79225),
 ('1302 808 806 804', 78704),
 ('806 808 1302 1304', 78216),
 ('1304 1302 808 806', 77929),
 ('1306 1304 1302 808', 65604),
 ('808 1302 1304 1306', 65527),
 ('902 902 904 904', 52282),
 ('712 712 714 714', 52111)]