In [None]:
# Shell out (IPython `!` syntax) to nvidia-smi to list the machine's NVIDIA GPUs.
!nvidia-smi

In [2]:
################################################################################
import os
# Expose only the GPU with index 6 to CUDA in this process.
# NOTE(review): presumably set ahead of the remaining imports so that it is in
# effect before any GPU library initialises — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
import importlib
import numpy as np
import pandas as pd
import json
import sys
from IPython.display import display
from typing import Iterable, List, Sequence

In [3]:
################################################################################
def flatten(
        iterable: Iterable
        ) -> Iterable:
    """
    Lazily walk a nested iterable depth-first and yield its leaves in order.
    [[3, [4, 5]], 6, [[[7]]]] -> 3, 4, 5, 6, 7

    Arguments:
        iterable
            An Iterable whose elements may themselves be Iterables, nested
            to any depth.

    Strings and bytes count as leaves: they are yielded whole and never
    split into characters.
    """
    # Stack of iterators that are only partly consumed; the innermost one
    # sits on top.
    pending = [iter(iterable)]
    while pending:
        for element in pending[-1]:
            is_leaf = (isinstance(element, (str, bytes))
                       or not isinstance(element, Iterable))
            if is_leaf:
                yield element
            else:
                # Descend into the nested iterable; the parent iterator
                # stays on the stack and is resumed afterwards.
                pending.append(iter(element))
                break
        else:
            # The innermost iterator is exhausted, go back to its parent.
            pending.pop()


def parse_file(
        filename         : str,
        initial_event_id : int      = 0,
        ignored_columns  : Sequence = (),
        ) -> Iterable[Iterable]:
    """ 
    Parses the lines in the file from 'filename' to a format
    appropriate for passing into a pandas DataFrame constructor.

    Blank lines are skipped. A line starting with "#" carries no data: it
    only advances the event ID by one. Every other line is read as the
    comma separated elements of a JSON array.
        
    Arguments:
        filename
            The name of the file to parse.
        initial_event_id
            The value the event ID counter starts from. The counter is
            incremented by 1 at every line starting with "#", so the rows
            that follow the first such line carry initial_event_id + 1.
        ignored_columns
            The indices of the top-level columns to delete from each line.
            Non-negative indices refer to positions in the line as written
            in the file, regardless of the order in which they are listed;
            a repeated index is deleted only once.
    
    For each data line, yields a generator that yields the flattened
    elements of the line followed by the event ID.
    """
    # Delete from the highest index down: removing a column then never
    # shifts the position of a column that still has to be removed.
    columns_to_drop = sorted(set(ignored_columns), reverse=True)
    event_id = initial_event_id
    with open(filename) as file:
        lines = filter(None, (line.strip() for line in file))
        for line in lines:
            if line.startswith("#"):
                event_id += 1
            else:
                # Wrapping the line in brackets turns it into one JSON array.
                j_list = json.loads("[{0}]".format(line))
                for column in columns_to_drop:
                    del j_list[column]
                j_list.append(event_id)
                yield flatten(j_list)

In [4]:
################################################################################
# Column names for one flattened clusters row, grouped the way the fields are
# bracketed in the file; the event ID added by the parser comes last.
clusters_columns = (
    ["hit_nr", "barcode", "volume_id", "layer_id"]
    + ["lx", "ly"]
    + ["elx", "ely"]
    + ["gx", "gy", "gz"]
    + ["phi", "theta"]
    + ["ephi", "etheta"]
    + ["event_id"]
)
# Column names for one particles row, again with the event ID last.
particles_columns = (
    ["barcode"]
    + ["vertex_x", "vertex_y", "vertex_z"]
    + ["momentum", "theta", "phi", "charge"]
    + ["event_id"]
)

# Extraction from a single file.

In [5]:
################################################################################
# Which clusters/particles file pair to look at, and where the files live.
number = 1
base_directory = "/inputdata/ACTS/prod_mu200_pt500_2017_07_25"
clusters_filename, particles_filename = (
    "{0}/{1}_{2}.csv".format(base_directory, kind, number)
    for kind in ("clusters", "particles")
)

In [28]:
################################################################################
# Peek at the raw text: print the first ten lines of the clusters file.
with open(clusters_filename, "r") as file:
    shown = 0
    while shown < 10:
        print(file.readline())
        shown += 1



### Format hit_nr, barcode, volume_id, layer_id , [lx, ly], [elx, ely], [ gx, gy, gz ],[[fch0,fch1,fchdata]], [phi,theta], [ephi,etheta]

1, 297238949795987456, 7, 2, [ 3.025, 19.4344], [ 0.15, 0.15],[ -157.488, -25.0229,-1498], [[ 228, 985, 0.268438]], [ -2.98402, 3.03554], [ 0.1, 0.01]

2, 67563271539916800, 7, 2, [ 3.575, 10.0366], [ 0.15, 0.15],[ -141.123, -51.07,-1498], [[ 239, 817, 0.0209375], [ 239, 818, 0.268438]], [ -2.81819, 1.5708], [ 0.1, 0.01]

3, 562962116768694272, 7, 2, [ 6.925, 8.07187], [ 0.15, 0.15],[ -132.658, -66.1443,-1502], [[ 306, 783, 0.268438]], [ -2.67906, 3.04322], [ 0.1, 0.01]

4, 4672924418048, 7, 2, [ -0.125, 8.63437], [ 0.15, 0.15],[ -129.911, -72.2169,-1498], [[ 165, 793, 0.268438]], [ -2.63423, 3.04269], [ 0.1, 0.01]

5, 562962116768694272, 7, 2, [ -6.725, 7.67812], [ 0.15, 0.15],[ -132.287, -65.9857,-1498], [[ 33, 776, 0.268438]], [ -2.6789, 3.04323], [ 0.1, 0.01]

6, 252213192724316160, 7, 2, [ 2.675, 16.1104], [ 0.15, 0.15],[ -127.279, -90.4306,-1

In [6]:
################################################################################
# Quick note: the top-level column at index 7 (the eighth field of each line)
# is dropped. It holds the list of [fch0, fch1, fchdata] entries.
clusters_lines = parse_file(filename=clusters_filename, ignored_columns=[7])
clusters_frame = pd.DataFrame(data=clusters_lines, columns=clusters_columns)
clusters_frame.head(n=3)

Unnamed: 0,hit_nr,barcode,volume_id,layer_id,lx,ly,elx,ely,gx,gy,gz,phi,theta,ephi,etheta,event_id
0,1,297238949795987456,7,2,3.025,19.4344,0.15,0.15,-157.488,-25.0229,-1498.0,-2.98402,3.03554,0.1,0.01,1
1,2,67563271539916800,7,2,3.575,10.0366,0.15,0.15,-141.123,-51.07,-1498.0,-2.81819,1.5708,0.1,0.01,1
2,3,562962116768694272,7,2,6.925,8.07187,0.15,0.15,-132.658,-66.1443,-1502.0,-2.67906,3.04322,0.1,0.01,1


In [15]:
# Count the distinct barcodes in every event of the clusters frame.
events = clusters_frame.groupby(["event_id"])["barcode"]
tracks_per_event = [barcodes.nunique(dropna=False) for _, barcodes in events]

print("Hits:", clusters_frame.shape[0])
print("Events:", clusters_frame["event_id"].nunique(dropna=False))
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

Hits: 3048491
Events: 100
Min Tracks: 1828
Max Tracks: 3463


In [7]:
################################################################################
# Parse the particles file; every column is kept.
particles_lines = parse_file(filename=particles_filename)
particles_frame = pd.DataFrame(data=particles_lines, columns=particles_columns)
particles_frame.head(n=3)

Unnamed: 0,barcode,vertex_x,vertex_y,vertex_z,momentum,theta,phi,charge,event_id
0,206158430208,-0.003092,-0.004118,-56.6067,3.99061,0.158342,1.93647,-1,1
1,274877906944,-0.003092,-0.004118,-56.6067,7.70008,2.99673,-2.08866,1,1
2,343597383680,-0.003092,-0.004118,-56.6067,1.52538,2.67101,2.78452,-1,1


In [17]:
################################################################################
# Count the distinct barcodes in every event of the particles frame.
events = particles_frame.groupby(["event_id"])["barcode"]
tracks_per_event = [barcodes.nunique(dropna=False) for _, barcodes in events]

print("Hits:", particles_frame.shape[0])
print("Events:", particles_frame["event_id"].nunique(dropna=False))
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

Hits: 249957
Events: 100
Min Tracks: 1828
Max Tracks: 3463


In [18]:
################################################################################
# Attach the momentum and charge of its particle to every hit: inner join of
# the clusters with the particles on (event_id, barcode).
left_frame     = clusters_frame
right_frame    = particles_frame.loc[:, ["event_id", "barcode", "momentum", "charge"]]
combined_frame = pd.merge(
    left_frame,
    right_frame,
    how="inner",
    on=["event_id", "barcode"],
)
combined_frame.head(3)

Unnamed: 0,hit_nr,barcode,volume_id,layer_id,lx,ly,elx,ely,gx,gy,gz,phi,theta,ephi,etheta,event_id,momentum,charge
0,1,297238949795987456,7,2,3.025,19.4344,0.15,0.15,-157.488,-25.0229,-1498.0,-2.98402,3.03554,0.1,0.01,1,11.5048,1
1,62,297238949795987456,7,4,3.375,-24.5174,0.15,0.15,-113.909,-19.2973,-1098.0,-3.00299,1.5708,0.1,0.01,1,11.5048,1
2,65,297238949795987456,7,4,-7.325,-24.2719,0.15,0.15,-114.331,-19.3656,-1102.0,-2.9738,3.03675,0.1,0.01,1,11.5048,1


In [21]:
################################################################################
# Derive cylindrical coordinates from the gx, gy, gz columns. The "phi" column
# read from the file is replaced by the angle computed here.
gx, gy, gz = (combined_frame[axis] for axis in ("gx", "gy", "gz"))
phi   = np.arctan2(gy, gx)
r     = np.sqrt(gx**2 + gy**2)
frame = combined_frame.assign(**{"phi": phi, "r": r, "z": gz})
frame.head(3)

Unnamed: 0,hit_nr,barcode,volume_id,layer_id,lx,ly,elx,ely,gx,gy,gz,phi,theta,ephi,etheta,event_id,momentum,charge,r,z
0,1,297238949795987456,7,2,3.025,19.4344,0.15,0.15,-157.488,-25.0229,-1498.0,-2.984022,3.03554,0.1,0.01,1,11.5048,1,159.463525,-1498.0
1,62,297238949795987456,7,4,3.375,-24.5174,0.15,0.15,-113.909,-19.2973,-1098.0,-2.973776,1.5708,0.1,0.01,1,11.5048,1,115.532013,-1098.0
2,65,297238949795987456,7,4,-7.325,-24.2719,0.15,0.15,-114.331,-19.3656,-1102.0,-2.973803,3.03675,0.1,0.01,1,11.5048,1,115.959493,-1102.0


In [22]:
################################################################################
# Eliminate duplicate hits that were caused by imperfections in the detector:
# for every particle keep only its smallest-radius hit on each layer.
# "volume_id" is part of the key because layer_id values are not guaranteed to
# be unique across volumes; without it a hit could be discarded as a
# "duplicate" of a hit that lies on a layer of a different volume.
frame = frame.sort_values("r")
frame = frame.drop_duplicates(["event_id", "barcode", "volume_id", "layer_id"])

In [23]:
################################################################################
# Keep only the hits of volume 8. Each volume is a different detector
# configuration.
frame = frame.loc[frame["volume_id"].eq(8)]

In [24]:
################################################################################
# Set radiuses to be the same for each layer: every hit gets the median radius
# of its layer. The medians are computed in a single groupby and a new frame is
# built with assign, instead of writing with .loc into the filtered frame
# inside a Python loop (which pandas may flag with SettingWithCopyWarning).
frame = frame.assign(r=frame.groupby("layer_id")["r"].transform("median"))

In [None]:
################################################################################
# # Put limits on the number of tracks per event. 
# max_tracks = 50
# min_tracks = 2
# frames = [f for (_, f) in frame.groupby("event_id", sort=False)]
# for i, f in enumerate(frames):
#     barcodes = f["barcode"].unique()
#     if len(barcodes) < min_tracks:
#         frames[i] = pd.DataFrame()
#     if len(barcodes) > max_tracks:
#         length = np.random.randint(min_tracks, max_tracks + 1)
#         barcodes = np.random.choice(barcodes, length, replace=False)
#         f = f[f["barcode"].isin(barcodes)]
#         frames[i] = f
# frame = pd.concat(frames)

In [25]:
################################################################################
# Clean up the frame a bit: reduce it to the seven columns listed below and
# order the rows by event, barcode and radius.
kept_columns = ["event_id", "barcode", "phi", "r", "z", "momentum", "charge"]
frame = frame.loc[:, kept_columns].sort_values(by=["event_id", "barcode", "r"])
frame.head(15)

Unnamed: 0,event_id,barcode,phi,r,z,momentum,charge
23892,1,206158430208,1.951472,31.959915,140.519,3.99061,-1
23894,1,206158430208,1.970991,71.842854,391.806,3.99061,-1
831,1,274877906944,-2.097345,31.959915,-272.719,7.70008,1
6708,1,343597383680,2.798524,31.959915,-118.753,1.52538,-1
6710,1,343597383680,2.816034,71.842854,-199.328,1.52538,-1
6711,1,343597383680,2.835049,115.820868,-285.384,1.52538,-1
6712,1,343597383680,2.859102,171.805097,-394.595,1.52538,-1
4711,1,1236950581248,2.740044,31.959915,-138.3,2.65858,-1
4712,1,1236950581248,2.752914,71.842854,-241.175,2.65858,-1
4714,1,1236950581248,2.766501,115.820868,-355.448,2.65858,-1


In [26]:
# Count the distinct barcodes in every event of the cleaned frame.
events = frame.groupby(["event_id"])["barcode"]
tracks_per_event = [barcodes.nunique(dropna=False) for _, barcodes in events]

print("Hits:", frame.shape[0])
print("Events:", frame["event_id"].nunique(dropna=False))
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

Hits: 800073
Events: 100
Min Tracks: 1828
Max Tracks: 3460


# Extraction from multiple files.

In [None]:
################################################################################
def extract(
        clusters_filename  : str, 
        particles_filename : str,
        initial_event_id   : int = 0,
        volume_id          : int = 8,
        ) -> pd.DataFrame:
    """ Everything in one function.

        Parses one clusters file and one particles file, joins every hit
        with the momentum and charge of its particle, keeps the hits of a
        single volume and returns them in cylindrical coordinates with at
        most one hit per particle and layer.

        Depending on the size of the file, this function could take a long
        time. Most of the time is spent parsing the csv files, i.e. while
        the first two data frames below are being built.

        Arguments:
            clusters_filename
                Path of the clusters csv file.
            particles_filename
                Path of the particles csv file.
            initial_event_id
                Starting value of the event ID counter; passed on to
                parse_file for both files.
            volume_id
                The volume whose hits are kept. Defaults to 8.

        Returns a DataFrame with the columns event_id, barcode, phi, r, z,
        momentum and charge, sorted by event_id, barcode and r.
    """
    clusters_lines = parse_file(
        clusters_filename,
        ignored_columns=[7],
        initial_event_id=initial_event_id)
    clusters_frame = pd.DataFrame(clusters_lines, columns=clusters_columns)
    
    particles_lines = parse_file(
        particles_filename, 
        initial_event_id=initial_event_id)
    particles_frame = pd.DataFrame(particles_lines, columns=particles_columns)
    
    # Inner join: a hit is kept only if its particle is listed for the event.
    left  = clusters_frame
    right = particles_frame[["event_id", "barcode", "momentum", "charge"]]
    combined_frame = left.merge(right, on=["event_id", "barcode"])
    
    # Select the volume BEFORE removing duplicates. layer_id values are not
    # guaranteed to be unique across volumes, so deduplicating first could
    # drop a wanted hit in favour of a hit on a layer of another volume.
    combined_frame = combined_frame[combined_frame["volume_id"] == volume_id]
    
    # Cylindrical coordinates; the computed angle replaces the "phi" column
    # that was read from the file.
    gx    = combined_frame["gx"]
    gy    = combined_frame["gy"]
    gz    = combined_frame["gz"]
    phi   = np.arctan2(gy, gx)
    r     = np.sqrt(gx**2 + gy**2)
    frame = combined_frame.assign(phi=phi, r=r, z=gz)
    
    # Keep the smallest-radius hit of each particle on each layer.
    frame = frame.sort_values("r")
    frame = frame.drop_duplicates(["event_id", "barcode", "layer_id"])
    
    # Give every hit the median radius of its layer. assign builds a new
    # frame, which avoids writing into a filtered frame through .loc.
    layer_radius = frame.groupby("layer_id")["r"].transform("median")
    frame = frame.assign(r=layer_radius)
    
    cols = ["event_id", "barcode", "phi", "r", "z", "momentum", "charge"]
    frame = frame[cols]
    frame = frame.sort_values(["event_id", "barcode", "r"])
    
    return frame

In [None]:
%%time
################################################################################
# Extract every available file pair. Each file continues the event numbering
# after the highest event ID extracted so far, so IDs never collide.
frames = []
initial_event_id = 0
base_directory = "/inputdata/ACTS/prod_mu200_pt500_2017_07_26"
for i in range(1, 1 + 100):
    print("Extracting from file {0}. Initial Event ID is {1}".format(i, initial_event_id))
    clusters_filename  = base_directory + "/clusters_{0}.csv".format(i)
    particles_filename = base_directory + "/particles_{0}.csv".format(i)
    try:
        frame = extract(
            clusters_filename=clusters_filename, 
            particles_filename=particles_filename,
            initial_event_id=initial_event_id,)
    except FileNotFoundError as error:
        # Best effort: report the missing file and go on with the next one.
        print(error)
        continue
    if frame.empty:
        # Without rows, max() is NaN and would turn every later event ID
        # into NaN; leave the counter untouched and skip the frame.
        continue
    initial_event_id = int(frame["event_id"].max()) + 1
    frames.append(frame)
if not frames:
    raise ValueError(
        "No hits were extracted from any file in {0}".format(base_directory))
frame = pd.concat(frames)

In [None]:
# Summarise the combined frame: row count, event count and the smallest and
# largest number of distinct barcodes found in one event.
print("Number of Hits: {}".format(frame.shape[0]))
print("Number of Events: {}".format(frame["event_id"].nunique(dropna=False)))
tracks  = [group for _, group in frame.groupby(["event_id"])]
lengths = [group["barcode"].nunique(dropna=False) for group in tracks]
print("Min Number of Tracks: {}".format(min(lengths)))
print("Max Number of Tracks: {}".format(max(lengths)))