In [1]:
!nvidia-smi

Fri Oct  6 18:12:15 2017       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 375.66                 Driver Version: 375.66                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1080    Off  | 0000:04:00.0     Off |                  N/A |
| 27%   31C    P8     9W / 180W |      0MiB /  8114MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX 1080    Off  | 0000:05:00.0     Off |                  N/A |
| 27%   32C    P8     9W / 180W |      0MiB /  8114MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce GTX 1080    Off  | 0000:06:00.0     Off |                  N/A |
| 27%   

In [2]:
################################################################################
import os
# Expose only the CUDA device with index 7 to this process. The variable is
# set before the remaining imports so it is already in the environment when
# they execute.
# NOTE(review): confirm that a device with index 7 exists on this machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import importlib
import numpy as np
import pandas as pd
import json
import sys
from IPython.display import display
from typing import Iterable, List, Sequence

# NOTE(review): 0 presumably lets pandas decide how many columns to display;
# confirm against the pandas display options documentation.
pd.options.display.max_columns = 0

In [3]:
################################################################################
def flatten(
        iterable: Iterable
        ) -> Iterable:
    """
    Lazily walk a nested iterable depth-first and yield its leaves in order.
    [[3, [4, 5]], 6, [[[7]]]] -> 3, 4, 5, 6, 7

    Arguments:
        iterable
            An Iterable whose elements may themselves be Iterables, nested
            to any depth.

    Yields every element that is not an Iterable. str and bytes objects are
    treated as single elements and are never split into characters.
    """
    for element in iterable:
        # Strings are iterable too, but they count as leaves here.
        is_leaf = (
            isinstance(element, (str, bytes))
            or not isinstance(element, Iterable)
        )
        if is_leaf:
            yield element
            continue
        for leaf in flatten(element):
            yield leaf


def parse_file(
        filename         : str,
        initial_event_id : int      = 0,
        ignored_columns  : Sequence = (),
        ) -> Iterable[Iterable]:
    """ 
    Parses the lines in the file from 'filename' to a format
    appropriate for passing into a pandas DataFrame constructor.
        
    Arguments:
        filename
            The name of the file to parse.
        initial_event_id
            The starting value of the event ID counter. Every line that
            starts with "#" increments the counter by 1, and each data line
            is tagged with the counter's current value. Data lines that
            appear before the first "#" line therefore carry
            'initial_event_id' itself.
        ignored_columns
            The indices of the top-level elements to delete from each line.
            All indices refer to positions in the line as it was parsed,
            before any deletion. Indices are expected to be non-negative;
            repeated indices are deleted once.
    
    For each data line, yields a generator over the flattened elements of
    the line followed by the event ID. Blank lines are skipped.
    """
    # Delete from the highest index down: removing an element shifts every
    # element after it, so any other order would make the remaining indices
    # point at the wrong elements.
    columns_to_drop = sorted(set(ignored_columns), reverse=True)
    event_id = initial_event_id
    with open(filename) as file:
        lines = filter(None, (line.strip() for line in file))
        for line in lines:
            if line.startswith("#"):
                event_id += 1
            else:
                # Wrap the line in brackets so it parses as one JSON array.
                j_list = json.loads("[{0}]".format(line))
                for column in columns_to_drop:
                    del j_list[column]
                j_list.append(event_id)
                yield flatten(j_list)

In [4]:
################################################################################
# Column names for the flattened rows of a clusters file. The trailing
# "event_id" entry is the value parse_file appends to every row.
clusters_columns = [
    "hit_nr",
    "barcode",
    "volume_id",
    "layer_id",
    "lx",
    "ly",
    "elx",
    "ely",
    "gx",
    "gy",
    "gz",
    "phi",
    "theta",
    "ephi",
    "etheta",
    "event_id",
]

# Column names for the flattened rows of a particles file, again ending
# with the "event_id" appended by parse_file.
particles_columns = [
    "barcode",
    "vertex_x",
    "vertex_y",
    "vertex_z",
    "momentum",
    "theta",
    "phi",
    "charge",
    "event_id",
]

# Extraction from a single file.

In [None]:
################################################################################
# Index of the clusters/particles file pair used in the single-file walkthrough.
number = 1
base_directory     = "/inputdata/ACTS/prod_mu10_pt1000_2017_07_29"
clusters_filename  = "{0}/clusters_{1}.csv".format(base_directory, number)
particles_filename = "{0}/particles_{1}.csv".format(base_directory, number)

In [None]:
################################################################################
# Example of how the file looks: show its first five raw lines.
# readline() keeps each line's trailing newline, so print must not add a
# second one (otherwise the preview comes out double-spaced).
with open(clusters_filename, "r") as file:
    for _ in range(5):
        print(file.readline(), end="")

In [None]:
################################################################################
# Quick note: the element at index 7 of each parsed line is dropped.
# That element contains [[fch0, fch1, fchdata]].
clusters_lines = parse_file(
    filename=clusters_filename,
    ignored_columns=[7],
)
clusters_frame = pd.DataFrame(data=clusters_lines, columns=clusters_columns)
clusters_frame.head(n=6)

In [None]:
################################################################################
# Replace the terse file column names with descriptive ones.
clusters_frame = clusters_frame.rename(
    columns={
        "hit_nr" : "hit_number",
        "barcode": "cluster_id",
        "lx"     : "local_x",
        "ly"     : "local_y",
        "elx"    : "local_x_error",
        "ely"    : "local_y_error",
        "gx"     : "x",
        "gy"     : "y",
        "gz"     : "z",
        "ephi"   : "phi_error",
        "etheta" : "theta_error",
    }
)
clusters_frame.head(n=6)

In [None]:
################################################################################
# Number of distinct cluster ids ("tracks") within each event.
events = clusters_frame.groupby(["event_id"])["cluster_id"]
tracks_per_event = [ids.unique().size for (_, ids) in events]

print("Hits:", len(clusters_frame.index))
print("Events:", clusters_frame["event_id"].unique().size)
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

In [None]:
################################################################################
# Parse the particles file; no elements are dropped from its lines.
particles_lines = parse_file(filename=particles_filename)
particles_frame = pd.DataFrame(data=particles_lines, columns=particles_columns)
particles_frame.head(n=6)

In [None]:
################################################################################
# "barcode" becomes "cluster_id" so it matches the clusters frame; the angle
# columns are renamed to "momentum_theta" and "momentum_phi".
particles_frame = particles_frame.rename(
    columns={
        "barcode": "cluster_id",
        "theta"  : "momentum_theta",
        "phi"    : "momentum_phi",
    }
)
particles_frame.head(n=6)

In [None]:
################################################################################
# Number of distinct cluster ids ("tracks") within each event.
events = particles_frame.groupby(["event_id"])["cluster_id"]
tracks_per_event = [len(event.unique()) for (_, event) in events]

# Each row of particles_frame is one particle, not one hit, so the row count
# is labelled accordingly.
print("Particles:", len(particles_frame))
print("Events:", len(particles_frame["event_id"].unique()))
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

In [None]:
################################################################################
# Attach each hit's particle information by matching on the event and the
# cluster id. Rows without a match on both sides are not kept.
combined_frame = pd.merge(
    clusters_frame,
    particles_frame,
    on=["event_id", "cluster_id"],
)
combined_frame.head(n=6)

In [None]:
################################################################################
# Add the column r = sqrt(x^2 + y^2) to a copy of the combined frame.
x = combined_frame["x"]
y = combined_frame["y"]
frame = combined_frame.copy()
frame["r"] = np.sqrt(x**2 + y**2)
frame.head(n=6)

In [None]:
################################################################################
# Eliminate duplicate hits that were caused by imperfections in the detector.
# Rows are sorted by r first, so the hit with the smallest r in each group of
# duplicates is the one that is kept.
# volume_id is part of the key because this step runs before the volume is
# selected: without it, hits that share a layer_id in different volumes would
# be collapsed into one and the surviving hit could belong to another volume.
frame = frame.sort_values("r")
frame = frame.drop_duplicates(
    ["event_id", "cluster_id", "volume_id", "layer_id"])

In [None]:
################################################################################
# Specify the volume to use. Each volume is a different detector configuration.
# .copy() turns the selection into an independent frame, so that assigning
# into it later does not operate on a slice of the unfiltered frame
# (which pandas reports as SettingWithCopyWarning).
frame = frame[frame["volume_id"] == 8].copy()

In [None]:
################################################################################
# Set radiuses to be the same for each layer: every hit's r is replaced by
# the median r of all hits on its layer. The medians are computed in a single
# grouped pass, and assign() returns a new frame instead of writing into a
# frame that may be a slice of another one.
frame = frame.assign(r=frame.groupby("layer_id")["r"].transform("median"))

In [None]:
################################################################################
# Order the rows by event, then by cluster, then by radius.
frame = frame.sort_values(by=["event_id", "cluster_id", "r"])
frame.head(n=6)

In [None]:
################################################################################
# Number of distinct cluster ids ("tracks") within each event.
events = frame.groupby(["event_id"])["cluster_id"]
tracks_per_event = [ids.unique().size for (_, ids) in events]

print("Hits:", len(frame.index))
print("Events:", frame["event_id"].unique().size)
print("Min Tracks:", min(tracks_per_event))
print("Max Tracks:", max(tracks_per_event))

# Extraction from multiple files.

In [5]:
################################################################################
def get_clusters_frame(
        clusters_filename : str,
        initial_event_id  : int,
        ) -> pd.DataFrame:
    """
    Load one clusters file into a DataFrame with descriptive column names.

    Arguments:
        clusters_filename
            The name of the clusters file to parse.
        initial_event_id
            The starting event ID handed to parse_file.

    Returns the parsed frame. The element at index 7 of every line is
    dropped while parsing, and the file's column names are renamed.
    """
    descriptive_names = {
        "hit_nr" : "hit_number",
        "barcode": "cluster_id",
        "lx"     : "local_x",
        "ly"     : "local_y",
        "gx"     : "x",
        "gy"     : "y",
        "gz"     : "z",
        "elx"    : "local_x_error",
        "ely"    : "local_y_error",
        "ephi"   : "phi_error",
        "etheta" : "theta_error",
    }
    rows = parse_file(
        clusters_filename,
        initial_event_id=initial_event_id,
        ignored_columns=[7])
    raw_frame = pd.DataFrame(rows, columns=clusters_columns)
    return raw_frame.rename(columns=descriptive_names)

def get_particles_frame(
        particles_filename : str,
        initial_event_id   : int,
        ) -> pd.DataFrame:
    """
    Load one particles file into a DataFrame with descriptive column names.

    Arguments:
        particles_filename
            The name of the particles file to parse.
        initial_event_id
            The starting event ID handed to parse_file.

    Returns the parsed frame with "barcode", "theta" and "phi" renamed to
    "cluster_id", "momentum_theta" and "momentum_phi".
    """
    rows = parse_file(
        particles_filename,
        initial_event_id=initial_event_id)
    raw_frame = pd.DataFrame(rows, columns=particles_columns)
    return raw_frame.rename(columns={
        "barcode": "cluster_id",
        "theta"  : "momentum_theta",
        "phi"    : "momentum_phi",
    })

def extract(
        clusters_filename  : str, 
        particles_filename : str,
        initial_event_id   : int = 0,
        volume_id          : int = 8,
        ) -> pd.DataFrame:
    """
    Build the hit frame for one clusters/particles file pair.

    Arguments:
        clusters_filename
            The name of the clusters file to parse.
        particles_filename
            The name of the particles file to parse.
        initial_event_id
            The starting event ID handed to the parser for both files.
        volume_id
            Only rows whose "volume_id" equals this value are kept.

    Returns the merged clusters/particles rows of the selected volume with
    an added column r = sqrt(x^2 + y^2), sorted by event, cluster and r, and
    with only the first row (the one with the smallest r) kept for each
    (event_id, cluster_id, layer_id).

    Depending on the size of the files this function can take a long time;
    most of it is spent parsing the two files.
    """
    clusters  = get_clusters_frame(clusters_filename, initial_event_id)
    particles = get_particles_frame(particles_filename, initial_event_id)
    combined  = clusters.merge(particles, on=["event_id", "cluster_id"])
    volume    = combined[combined["volume_id"] == volume_id]

    # Compute r from the selected rows only, rather than computing it for
    # every volume and relying on index alignment to discard the rest.
    radius = np.sqrt(volume["x"]**2 + volume["y"]**2)

    return (
        volume.assign(r=radius)
              .sort_values(["event_id", "cluster_id", "r"])
              .drop_duplicates(["event_id", "cluster_id", "layer_id"])
    )

In [None]:
%%time
################################################################################
# Extract every available file pair, giving each file a range of event IDs
# that starts after the largest ID kept from the previous file.
frames = []
initial_event_id = 0
base_directory = "/inputdata/ACTS/prod_mu10_pt1000_2017_07_29"
for i in range(1, 1 + 100):
    print("Extracting from file {0}. Initial Event ID is {1}".format(i, initial_event_id))
    clusters_filename  = base_directory + "/clusters_{0}.csv".format(i)
    particles_filename = base_directory + "/particles_{0}.csv".format(i)
    try:
        frame = extract(
            clusters_filename=clusters_filename, 
            particles_filename=particles_filename,
            initial_event_id=initial_event_id,)
    except FileNotFoundError as error:
        # A missing file pair is reported and skipped.
        print(error)
        continue
    if frame.empty:
        # max() of an empty column is NaN; using it would turn the event ID
        # of every later file into NaN. Nothing was kept from this file, so
        # the next file can reuse the same starting ID.
        continue
    initial_event_id = int(frame["event_id"].max()) + 1
    frames.append(frame)
print("All done. Concatenating frames.")
frame = pd.concat(frames)
# Replace every hit's r with the median r of its layer, computed over the
# hits of all files in one grouped pass.
frame["r"] = frame.groupby("layer_id")["r"].transform("median")

In [None]:
# Summary of the combined frame: row count, event count, and the smallest and
# largest number of distinct cluster ids found in a single event.
print("Number of Hits: {}".format(len(frame.index)))
print("Number of Events: {}".format(frame["event_id"].unique().size))
tracks  = [group for (_, group) in frame.groupby(["event_id"])]
lengths = [group["cluster_id"].unique().size for group in tracks]
print("Min Number of Tracks: {}".format(min(lengths)))
print("Max Number of Tracks: {}".format(max(lengths)))

In [None]:
# Persist the combined frame as a gzip-compressed CSV.
# NOTE(review): the frame's index is written as well (pandas default);
# confirm that the readers of this file expect that extra leading column.
frame.to_csv(
    path_or_buf="data/sets/ACTS-MU10-PT1000-COMPLETE.gz",
    compression="gzip",
)

In [6]:
# Reload the combined frame from the saved file instead of re-extracting it.
frame = pd.read_csv(
    filepath_or_buffer="data/sets/ACTS-MU10-PT1000-COMPLETE.gz",
)

In [52]:
# Import the project-local extractor module and reload it, so that edits made
# to its source since the first import take effect in this kernel.
from tracker import extractor
importlib.reload(extractor)

<module 'tracker.extractor' from '/home/jovyan/work/Fermi-Internship-2017/Track/tracker/extractor.py'>

In [53]:
# Subset containing only the events whose event_id is below 20.
# NOTE(review): the following cell passes `frame`, not `test`, to
# prepare_frame; confirm which one is intended.
test = frame.loc[frame["event_id"] < 20]

In [62]:
# Run the project's prepare_frame on the full frame, then order the result by
# event, cluster and radius.
prepared = (
    extractor.prepare_frame(frame)
             .sort_values(by=["event_id", "cluster_id", "r"])
)

In [65]:
# Persist the prepared frame as a gzip-compressed CSV.
prepared.to_csv(
    path_or_buf="data/sets/ACTS-MU10-PT1000-PREPARED.gz",
    compression="gzip",
)