In [None]:
!nvidia-smi

In [1]:
################################################################################
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import importlib
import numpy as np
import pandas as pd
import json
import sys
from IPython.display import display
from typing import Iterable, List, Sequence

In [2]:
################################################################################
def flatten(
        iterable: Iterable
        ) -> Iterable:
    """ Return a flattened iterable from a nested iterable.
        [[3, [4, 5]], 6, [[[7]]]] -> [3, 4, 5, 6, 7]
    """
    for item in iterable:
        if  isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
            yield from flatten(item)
        else:
            yield item


def parse_file(
        filename         : str,
        initial_event_id : int      = 0,
        ignored_columns  : Sequence = (),
        ) -> Iterable[Iterable]:
    """ Parses the lines in the file from 'filename' to a format
        appropriate for passing into a pandas DataFrame constructor.
    """
    event_id = initial_event_id
    with open(filename) as file:
        lines = filter(None, (line.strip() for line in file))
        for line in lines:
            if line.startswith("#"):
                event_id += 1
            else:
                j_list = json.loads("[{0}]".format(line))
                for column in ignored_columns:
                    del j_list[column]
                j_list.append(event_id)
                yield flatten(j_list)

In [3]:
################################################################################
clusters_columns = [
    "hit_nr", "barcode", "volume_id", "layer_id",
    "lx",     "ly",      "elx",       "ely",    
    "gx",     "gy",      "gz",        "phi",    
    "theta",  "ephi",    "etheta",    "event_id",
]
particles_columns = [
    "barcode",  "vertex_x", "vertex_y",
    "vertex_z", "momentum", "theta",
    "phi",      "charge",   "event_id",
]

# Extraction from a single file.

In [17]:
################################################################################
number = 1
base_directory = "/inputdata/ACTS/prod_mu200_pt500_2017_07_25"
clusters_filename  = base_directory + "/clusters_{0}.csv".format(number)
particles_filename = base_directory + "/particles_{0}.csv".format(number)

In [18]:
################################################################################
clusters_lines = parse_file(clusters_filename, ignored_columns=[7])
clusters_frame = pd.DataFrame(clusters_lines, columns=clusters_columns)
clusters_frame.head(3)

Unnamed: 0,hit_nr,barcode,volume_id,layer_id,lx,ly,elx,ely,gx,gy,gz,phi,theta,ephi,etheta,event_id
0,1,297238949795987456,7,2,3.025,19.4344,0.15,0.15,-157.488,-25.0229,-1498.0,-2.98402,3.03554,0.1,0.01,1
1,2,67563271539916800,7,2,3.575,10.0366,0.15,0.15,-141.123,-51.07,-1498.0,-2.81819,1.5708,0.1,0.01,1
2,3,562962116768694272,7,2,6.925,8.07187,0.15,0.15,-132.658,-66.1443,-1502.0,-2.67906,3.04322,0.1,0.01,1


In [19]:
################################################################################
particles_lines = parse_file(particles_filename)
particles_frame = pd.DataFrame(particles_lines, columns=particles_columns)
particles_frame.head(3)

Unnamed: 0,barcode,vertex_x,vertex_y,vertex_z,momentum,theta,phi,charge,event_id
0,206158430208,-0.003092,-0.004118,-56.6067,3.99061,0.158342,1.93647,-1,1
1,274877906944,-0.003092,-0.004118,-56.6067,7.70008,2.99673,-2.08866,1,1
2,343597383680,-0.003092,-0.004118,-56.6067,1.52538,2.67101,2.78452,-1,1


In [20]:
################################################################################
left_frame     = clusters_frame
right_frame    = particles_frame[["event_id", "barcode", "momentum", "charge"]]
combined_frame = left_frame.merge(right_frame, on=["event_id", "barcode"])
combined_frame.head(3)

Unnamed: 0,hit_nr,barcode,volume_id,layer_id,lx,ly,elx,ely,gx,gy,gz,phi,theta,ephi,etheta,event_id,momentum,charge
0,1,297238949795987456,7,2,3.025,19.4344,0.15,0.15,-157.488,-25.0229,-1498.0,-2.98402,3.03554,0.1,0.01,1,11.5048,1
1,62,297238949795987456,7,4,3.375,-24.5174,0.15,0.15,-113.909,-19.2973,-1098.0,-3.00299,1.5708,0.1,0.01,1,11.5048,1
2,65,297238949795987456,7,4,-7.325,-24.2719,0.15,0.15,-114.331,-19.3656,-1102.0,-2.9738,3.03675,0.1,0.01,1,11.5048,1


In [25]:
################################################################################
gx    = combined_frame["gx"]
gy    = combined_frame["gy"]
gz    = combined_frame["gz"]
phi   = np.arctan2(gy, gx)
r     = np.sqrt(gx**2 + gy**2)
frame = combined_frame.assign(phi=phi, r=r, z=gz)
frame.head(3)

Unnamed: 0,hit_nr,barcode,volume_id,layer_id,lx,ly,elx,ely,gx,gy,gz,phi,theta,ephi,etheta,event_id,momentum,charge,r,z
0,1,297238949795987456,7,2,3.025,19.4344,0.15,0.15,-157.488,-25.0229,-1498.0,-2.984022,3.03554,0.1,0.01,1,11.5048,1,159.463525,-1498.0
1,62,297238949795987456,7,4,3.375,-24.5174,0.15,0.15,-113.909,-19.2973,-1098.0,-2.973776,1.5708,0.1,0.01,1,11.5048,1,115.532013,-1098.0
2,65,297238949795987456,7,4,-7.325,-24.2719,0.15,0.15,-114.331,-19.3656,-1102.0,-2.973803,3.03675,0.1,0.01,1,11.5048,1,115.959493,-1102.0


In [26]:
################################################################################
# Eliminate duplicate hits that were caused by imperfections in the detector.
frame = frame.sort_values("r")
frame = frame.drop_duplicates(["event_id", "barcode", "layer_id"])
frame.head(3)

Unnamed: 0,hit_nr,barcode,volume_id,layer_id,lx,ly,elx,ely,gx,gy,gz,phi,theta,ephi,etheta,event_id,momentum,charge,r,z
1312251,4764,882706832634675200,8,2,4.43409,-23.8884,0.15,0.15,-20.9463,23.4453,151.112,2.299959,0.098447,0.1,0.01,42,1.94662,-1,31.4393,151.112
775929,5888,220687514296385536,8,2,4.41968,22.1344,0.15,0.15,10.3661,-29.6812,197.134,-1.234792,0.019321,0.1,0.01,25,8.99135,-1,31.439301,197.134
614775,4854,36030652444835840,8,2,4.45167,-25.1719,0.15,0.15,20.9594,-23.4336,9.82812,-0.841075,0.063407,0.1,0.01,20,2.51346,-1,31.439308,9.82812


In [None]:
# Checking which volumes to use.
for volume_id in np.sort(frame["volume_id"].unique()):
    volume = frame[frame["volume_id"] == volume_id]
    for layer_id in np.sort(frame["layer_id"].unique()):
        rs = volume[volume["layer_id"] == layer_id]["r"]
        print("Volume ID: {0}, Layer ID: {1}, Min: {2}, Max: {3}".format(volume_id, layer_id, rs.min(), rs.max()))
    print()

In [None]:
################################################################################
# Specify the volume to use. Each volume is a different detector configuration.
frame = frame[frame["volume_id"] == 8]

In [None]:
################################################################################
# Set radiuses to be the same for each layer.
for layer_id in frame["layer_id"].unique():
    ind = frame["layer_id"] == layer_id
    rs  = frame[ind]["r"]
    med = rs.median()
    frame.loc[ind, "r"] = med

In [None]:
################################################################################
# Put limits on the number of tracks per event. 
max_tracks = 50
min_tracks = 2
frames = [f for (_, f) in frame.groupby("event_id", sort=False)]
for i, f in enumerate(frames):
    barcodes = f["barcode"].unique()
    if len(barcodes) < min_tracks:
        frames[i] = pd.DataFrame()
    if len(barcodes) > max_tracks:
        length = np.random.randint(min_tracks, max_tracks + 1)
        barcodes = np.random.choice(barcodes, length, replace=False)
        f = f[f["barcode"].isin(barcodes)]
        frames[i] = f
frame = pd.concat(frames)

In [None]:
################################################################################
# Clean up the frame a bit.
frame = frame[["event_id", "barcode", "phi", "r", "z", "momentum", "charge"]]
frame = frame.sort_values(["event_id", "barcode", "r"])
print("Hits:", len(frame))
print("Events:", len(frame["event_id"].unique()))
frame.head(15)

# Extraction from multiple files.

In [4]:
################################################################################
def extract(
        clusters_filename  : str, 
        particles_filename : str,
        initial_event_id   : int = 0,
        ) -> pd.DataFrame:
    """ Everything in one function.
        Depending on the size of the file, this function could take a long
        time. Most of the time is spent parsing the csv files within the
        first 4 lines.
    """
    clusters_lines = parse_file(
        clusters_filename,
        ignored_columns=[7],
        initial_event_id=initial_event_id)
    clusters_frame = pd.DataFrame(clusters_lines, columns=clusters_columns)
    
    particles_lines = parse_file(
        particles_filename, 
        initial_event_id=initial_event_id)
    particles_frame = pd.DataFrame(particles_lines, columns=particles_columns)
    
    left  = clusters_frame
    right = particles_frame[["event_id", "barcode", "momentum", "charge"]]
    combined_frame = left.merge(right, on=["event_id", "barcode"])
    
    gx    = combined_frame["gx"]
    gy    = combined_frame["gy"]
    gz    = combined_frame["gz"]
    phi   = np.arctan2(gy, gx)
    r     = np.sqrt(gx**2 + gy**2)
    frame = combined_frame.assign(phi=phi, r=r, z=gz)
    
    frame = frame.sort_values("r")
    frame = frame.drop_duplicates(["event_id", "barcode", "layer_id"])
    
    frame = frame[frame["volume_id"] == 8]
    
    for layer_id in frame["layer_id"].unique():
        ind = frame["layer_id"] == layer_id
        rs  = frame[ind]["r"]
        med = rs.median()
        frame.loc[ind, "r"] = med
    
    cols = ["event_id", "barcode", "phi", "r", "z", "momentum", "charge"]
    frame = frame[cols]
    frame = frame.sort_values(["event_id", "barcode", "r"])
    
    return frame

In [6]:
%%time
################################################################################
frames = []
initial_event_id = 0
base_directory = "/inputdata/ACTS/prod_mu25_pt1000_2017_07_27"
for i in range(1, 1 + 100):
    print("Extracting from file {0}. Initial Event ID is {1}".format(i, initial_event_id))
    try:
        clusters_filename  = base_directory + "/clusters_{0}.csv".format(i)
        particles_filename = base_directory + "/particles_{0}.csv".format(i)
        frame = extract(
            clusters_filename=clusters_filename, 
            particles_filename=particles_filename, 
            initial_event_id=initial_event_id,)
        initial_event_id = frame["event_id"].max() + 1
        frames.append(frame)
    except FileNotFoundError as error:
        print(error)
frame = pd.concat(frames)
print("All Done!")

Extracting from file 1. Initial Event ID is 0
Extracting from file 2. Initial Event ID is 1001
Extracting from file 3. Initial Event ID is 2002
Extracting from file 4. Initial Event ID is 3003
Extracting from file 5. Initial Event ID is 4004
Extracting from file 6. Initial Event ID is 5005
Extracting from file 7. Initial Event ID is 6006
Extracting from file 8. Initial Event ID is 7007
Extracting from file 9. Initial Event ID is 8008
Extracting from file 10. Initial Event ID is 9009
Extracting from file 11. Initial Event ID is 10010
Extracting from file 12. Initial Event ID is 11011
Extracting from file 13. Initial Event ID is 12012
Extracting from file 14. Initial Event ID is 13013
Extracting from file 15. Initial Event ID is 14014
Extracting from file 16. Initial Event ID is 15015
Extracting from file 17. Initial Event ID is 16016
Extracting from file 18. Initial Event ID is 17017
Extracting from file 19. Initial Event ID is 18018
Extracting from file 20. Initial Event ID is 19019
Ex

In [7]:
print("Number of Hits: {}".format(len(frame)))
print("Number of Events: {}".format(len(frame["event_id"].unique())))
tracks  = [value for (_, value) in frame.groupby(["event_id"])]
lengths = [len(value["barcode"].unique()) for value in tracks]
print("Min Number of Tracks: {}".format(min(lengths)))
print("Max Number of Tracks: {}".format(max(lengths)))

Number of Hits: 13929190
Number of Events: 99615
Min Number of Tracks: 1
Max Number of Tracks: 246


In [8]:
filepath = "data/sets/ACTS-MU10-EV99615.gz"
frame.to_csv(filepath, compression="gzip")
print("{0} bytes".format(os.path.getsize(filepath)))

324461472 bytes


In [21]:
from tracker import extractor
import random

In [30]:
frame = pd.read_csv(filepath) 820 rows, 246
events = [event for (_, event) in frame.groupby("event_id")]
e = []
max_tracks = 50
for event in events:
    if len(event["barcode"].unique()) <= max_tracks:
        e.append(event)
    else:
        tracks = [track for (_, track) in event.groupby("barcode")]
        rng    = np.random.randint(1, max_tracks + 1)
        sample = random.sample(tracks, rng)
        e.append(pd.concat(sample))
events = pd.concat(e)

In [39]:
events = events.rename(columns = {'barcode':'cluster_id'})
events

Unnamed: 0.1,Unnamed: 0,event_id,cluster_id,phi,r,z,momentum,charge
0,57,1,9007611571601408,2.950621,31.957990,-47.81660,2.25170,-1
1,58,1,9007611571601408,2.956995,71.837640,-78.49790,2.25170,-1
2,60,1,9007611571601408,2.964698,115.817280,-113.72800,2.25170,-1
3,61,1,9007611571601408,2.974130,171.806223,-157.85200,2.25170,-1
4,12,1,9008298766368768,0.367411,31.957990,-121.95900,5.85017,-1
5,14,1,9008298766368768,0.374021,71.837640,-247.68100,5.85017,-1
6,15,1,9008298766368768,0.381647,115.817280,-391.18300,5.85017,-1
7,118,1,9009879314333696,1.383301,31.957990,113.52200,4.45530,-1
8,119,1,9009879314333696,1.394936,71.837640,284.62200,4.45530,-1
9,120,1,9009879314333696,1.407345,115.817280,464.87200,4.45530,-1


In [46]:
prepared = extractor.prepare_frame(events, n_rows=200, n_tracks=50, n_noise=0)

In [47]:
prepared = prepared.sort_values(["event_id", "cluster_id", "r"])
prepared.head(10)

Unnamed: 0,cluster_id,event_id,momentum,noise,padding,phi,r,z
29,0.0,0.0,3.52941,0.0,0.0,-2.455632,31.95799,48.4719
30,0.0,0.0,3.52941,0.0,0.0,-2.463615,71.83764,139.45
31,0.0,0.0,3.52941,0.0,0.0,-2.472854,115.81728,239.335
32,0.0,0.0,3.52941,0.0,0.0,-2.484726,171.806223,367.422
25,1.0,0.0,2.92286,0.0,0.0,-1.58155,31.95799,30.9612
26,1.0,0.0,2.92286,0.0,0.0,-1.589796,71.83764,99.15
27,1.0,0.0,2.92286,0.0,0.0,-1.598428,115.81728,173.105
28,1.0,0.0,2.92286,0.0,0.0,-1.609701,171.806223,268.239
21,2.0,0.0,2.41,0.0,0.0,-0.552337,31.95799,-60.2337
22,2.0,0.0,2.41,0.0,0.0,-0.559781,71.83764,-108.009


In [48]:
print(len(prepared["event_id"].unique()))

99615


In [49]:
prepared.to_csv("data/sets/ACTS-T50-EV99615.gz", compression="gzip")