In [None]:
!nvidia-smi

In [1]:
################################################################################
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
import importlib
import numpy as np
import pandas as pd
import json
import sys
from IPython.display import display
from typing import Iterable, List, Sequence

In [2]:
################################################################################
def flatten(
        iterable: Iterable
        ) -> Iterable:
    """ Return a flattened iterable from a nested iterable.
        [[3, [4, 5]], 6, [[[7]]]] -> [3, 4, 5, 6, 7]
    """
    for item in iterable:
        if  isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
            yield from flatten(item)
        else:
            yield item


def parse_file(
        filename         : str,
        initial_event_id : int      = 0,
        ignored_columns  : Sequence = (),
        ) -> Iterable[Iterable]:
    """ Parses the lines in the file from 'filename' to a format
        appropriate for passing into a pandas DataFrame constructor.
    """
    event_id = initial_event_id
    with open(filename) as file:
        lines = filter(None, (line.strip() for line in file))
        for line in lines:
            if line.startswith("#"):
                event_id += 1
            else:
                j_list = json.loads("[{0}]".format(line))
                for column in ignored_columns:
                    del j_list[column]
                j_list.append(event_id)
                yield flatten(j_list)

In [3]:
################################################################################
clusters_columns = [
    "hit_nr", "barcode", "volume_id", "layer_id",
    "lx",     "ly",      "elx",       "ely",    
    "gx",     "gy",      "gz",        "phi",    
    "theta",  "ephi",    "etheta",    "event_id",
]
particles_columns = [
    "barcode",  "vertex_x", "vertex_y",
    "vertex_z", "momentum", "theta",
    "phi",      "charge",   "event_id",
]

# Extraction from a single file.

In [None]:
################################################################################
number = 1
base_directory     = "/inputdata/ACTS/prod_mu10_pt1000_2017_07_29"
clusters_filename  = base_directory + "/clusters_{0}.csv".format(number)
particles_filename = base_directory + "/particles_{0}.csv".format(number)

In [None]:
################################################################################
clusters_lines = parse_file(clusters_filename, ignored_columns=[7])
clusters_frame = pd.DataFrame(clusters_lines, columns=clusters_columns)
clusters_frame.head(3)

In [None]:
################################################################################
particles_lines = parse_file(particles_filename)
particles_frame = pd.DataFrame(particles_lines, columns=particles_columns)
particles_frame.head(3)

In [None]:
################################################################################
left_frame     = clusters_frame
right_frame    = particles_frame[["event_id", "barcode", "momentum", "charge"]]
combined_frame = left_frame.merge(right_frame, on=["event_id", "barcode"])
combined_frame.head(3)

In [None]:
################################################################################
gx    = combined_frame["gx"]
gy    = combined_frame["gy"]
gz    = combined_frame["gz"]
phi   = np.arctan2(gy, gx)
r     = np.sqrt(gx**2 + gy**2)
frame = combined_frame.assign(phi=phi, r=r, z=gz)

In [None]:
################################################################################
# Eliminate duplicate hits that were caused by imperfections in the detector.
frame = frame.sort_values("r")
frame = frame.drop_duplicates(["event_id", "barcode", "layer_id"])

In [None]:
################################################################################
# Specify the volume to use. Each volume is a different detector configuration.
frame = frame[frame["volume_id"] == 8]

In [None]:
################################################################################
# Set radiuses to be the same for each layer.
for layer_id in frame["layer_id"].unique():
    ind = frame["layer_id"] == layer_id
    rs  = frame[ind]["r"]
    med = rs.median()
    frame.loc[ind, "r"] = med

In [None]:
################################################################################
# Put limits on the number of tracks per event. 
max_tracks = 50
min_tracks = 2
frames = [f for (_, f) in frame.groupby("event_id", sort=False)]
for i, f in enumerate(frames):
    barcodes = f["barcode"].unique()
    if len(barcodes) < min_tracks:
        frames[i] = pd.DataFrame()
    if len(barcodes) > max_tracks:
        length = np.random.randint(min_tracks, max_tracks + 1)
        barcodes = np.random.choice(barcodes, length, replace=False)
        f = f[f["barcode"].isin(barcodes)]
        frames[i] = f
frame = pd.concat(frames)

In [None]:
################################################################################
# Clean up the frame a bit.
frame = frame[["event_id", "barcode", "phi", "r", "z", "momentum", "charge"]]
frame = frame.sort_values(["event_id", "barcode", "r"])
frame.head(15)

# Extraction from multiple files.

In [4]:
################################################################################
def extract(
        clusters_filename  : str, 
        particles_filename : str,
        initial_event_id   : int = 0,
        ) -> pd.DataFrame:
    """ Everything in one function.
        Depending on the size of the file, this function could take a long
        time. Most of the time is spent parsing the csv files within the
        first 4 lines.
    """
    clusters_lines = parse_file(
        clusters_filename,
        ignored_columns=[7],
        initial_event_id=initial_event_id)
    clusters_frame = pd.DataFrame(clusters_lines, columns=clusters_columns)
    
    particles_lines = parse_file(
        particles_filename, 
        initial_event_id=initial_event_id)
    particles_frame = pd.DataFrame(particles_lines, columns=particles_columns)
    
    left  = clusters_frame
    right = particles_frame[["event_id", "barcode", "momentum", "charge"]]
    combined_frame = left.merge(right, on=["event_id", "barcode"])
    
    gx    = combined_frame["gx"]
    gy    = combined_frame["gy"]
    gz    = combined_frame["gz"]
    phi   = np.arctan2(gy, gx)
    r     = np.sqrt(gx**2 + gy**2)
    frame = combined_frame.assign(phi=phi, r=r, z=gz)
    
    frame = frame.sort_values("r")
    frame = frame.drop_duplicates(["event_id", "barcode", "layer_id"])
    
    frame = frame[frame["volume_id"] == 8]
    
    for layer_id in frame["layer_id"].unique():
        ind = frame["layer_id"] == layer_id
        rs  = frame[ind]["r"]
        med = rs.median()
        frame.loc[ind, "r"] = med
    
    cols = ["event_id", "barcode", "phi", "r", "z", "momentum", "charge"]
    frame = frame[cols]
    frame = frame.sort_values(["event_id", "barcode", "r"])
    
    return frame

In [6]:
%%time
################################################################################
frames = []
initial_event_id = 0
base_directory = "/inputdata/ACTS/prod_mu200_pt500_2017_07_26"
for i in range(1, 1 + 100):
    print("Extracting from file {0}. Initial Event ID is {1}".format(i, initial_event_id))
    try:
        clusters_filename  = base_directory + "/clusters_{0}.csv".format(i)
        particles_filename = base_directory + "/particles_{0}.csv".format(i)
        frame = extract(
            clusters_filename=clusters_filename, 
            particles_filename=particles_filename,
            initial_event_id=initial_event_id,)
        initial_event_id = frame["event_id"].max() + 1
        frames.append(frame)
    except FileNotFoundError as error:
        print(error)
frame = pd.concat(frames)

Extracting from file 1. Initial Event ID is 0
Extracting from file 2. Initial Event ID is 101
Extracting from file 3. Initial Event ID is 202
Extracting from file 4. Initial Event ID is 303
Extracting from file 5. Initial Event ID is 404
Extracting from file 6. Initial Event ID is 505
Extracting from file 7. Initial Event ID is 606
Extracting from file 8. Initial Event ID is 707
Extracting from file 9. Initial Event ID is 808
Extracting from file 10. Initial Event ID is 909
Extracting from file 11. Initial Event ID is 1010
Extracting from file 12. Initial Event ID is 1111
Extracting from file 13. Initial Event ID is 1212
Extracting from file 14. Initial Event ID is 1313
Extracting from file 15. Initial Event ID is 1414
Extracting from file 16. Initial Event ID is 1515
Extracting from file 17. Initial Event ID is 1616
Extracting from file 18. Initial Event ID is 1717
Extracting from file 19. Initial Event ID is 1818
Extracting from file 20. Initial Event ID is 1919
Extracting from file 

KeyboardInterrupt: 

In [8]:
print("Number of Hits: {}".format(len(frame)))
print("Number of Events: {}".format(len(frame["event_id"].unique())))
tracks  = [value for (_, value) in frame.groupby(["event_id"])]
lengths = [len(value["barcode"].unique()) for value in tracks]
print("Min Number of Tracks: {}".format(min(lengths)))
print("Max Number of Tracks: {}".format(max(lengths)))

Number of Hits: 30341559
Number of Events: 3800
Min Number of Tracks: 1630
Max Number of Tracks: 3553


In [9]:
filepath = "data/sets/ACTS-MU200-EV3800.gz"
frame.to_csv(filepath, compression="gzip")
print("{0} bytes".format(os.path.getsize(filepath)))

719643198 bytes
