In [48]:
import plotly
import numpy as np 
import pandas as pd
import plotly.express as px
from pathlib import Path
import re
from dataclasses import dataclass, fields
from io import StringIO

In [2]:
events_path = Path('truthparticle.txt')

In [3]:
events_txt = events_path.open().read()

In [49]:
event_pattern = r"^TruthEvent(?:.*?\n){4}^id.*?\n(?P<ip>.*?)\n.*?\n(?P<truth_event>(?:[-0-9].*?\n)+)(?P<pileup>(?s).+?)(?:(?:run)|\Z)"
event_prog = re.compile(event_pattern, re.MULTILINE)

In [50]:
pu_pattern = r"^TruthPileupEventContainer i(?:.*?\n){2}(?P<data>(?s).+?)(?:N|\Z)"
pu_prog = re.compile(pu_pattern, re.MULTILINE)

In [51]:
def findall(prog, txt):
    results = []
    i = 0
    while True:
        result = prog.search(txt[i:])
        if not result:
            break
        results.append(result)
        i += result.span(0)[1]
    return results

In [52]:
event_results = findall(event_prog, events_txt)
event_pu_results = [findall(pu_prog, r.group('pileup')) for r in event_results]

In [31]:
'id	barcode	x	y	z	t	numIncomingParticles	numOutgoingParticles'.split('\t')

['id',
 'barcode',
 'x',
 'y',
 'z',
 't',
 'numIncomingParticles',
 'numOutgoingParticles']

In [40]:
@dataclass
class Vertex():
    id: int
    barcode: int
    x: float
    y: float
    z: float
    t: float
    numIncomingParticles: int
    numOutgoingParticles: int

    def __post_init__(self):
        for field in fields(self):
            value = getattr(self, field.name)
            if not isinstance(value, field.type):
                setattr(self, field.name, field.type(value))

In [41]:
v = Vertex('1', '1', '1', '1', '1', '1', '1', '1')

In [88]:
class Event():
    particle_fields = ['iPart', 'barcode', 'status', 'pdgId', 'px', 'py', 'pz', 'isCharged', 'm', 'pt', 'eta', 'phi', 'rapidity', 'hasProdVtx', 'prodVtx_id', 'prodVtx_x', 'prodVtx_y', 'prodVtx_z', 'hasDecayVtx', 'decayVtx_id']

    def __init__(self, event_result, event_pu_result):
        self.ip = Vertex(*event_result.group('ip').split())
        self.primary = pd.read_csv(StringIO(event_result.group('truth_event')), sep='\t', names=self.particle_fields, index_col=False)
        self.pileup = self.read_pileup(event_pu_result)

    def read_pileup(self, event_pu_result):
        pu_dfs = []
        for pu_id, pu_result in enumerate(event_pu_result):
            pu_df = pd.read_csv(StringIO(pu_result.group('data')), sep='\t', names=self.particle_fields, index_col=False)
            pu_df['pileup_id'] = pu_id
            pu_dfs.append(pu_df)
        pileup = pd.concat(pu_dfs, ignore_index=True)
        return pileup

In [89]:
ev = Event(event_results[0], event_pu_results[0])

In [90]:
ev.pileup

Unnamed: 0,iPart,barcode,status,pdgId,px,py,pz,isCharged,m,pt,...,phi,rapidity,hasProdVtx,prodVtx_id,prodVtx_x,prodVtx_y,prodVtx_z,hasDecayVtx,decayVtx_id,pileup_id
0,913,1047,1,-321,-1431.49000,-1144.7700,91539.100,1,91558.80,493.68,...,4.604100,-2.467030,4.569090,1,0,0.017039,0.010330,-18.4265,0,0
1,914,1048,1,211,-1065.15000,-1303.8700,62717.400,1,62740.10,139.57,...,4.311010,-2.255770,4.307580,1,0,0.017039,0.010330,-18.4265,0,0
2,920,1054,1,-211,-756.05200,757.6860,-1114.170,1,1551.31,139.57,...,-0.910014,2.355110,-0.903947,1,0,0.017039,0.010330,-18.4265,0,0
3,921,1055,1,211,178.44400,-529.1880,659957.000,1,659957.00,139.57,...,7.767890,-1.245570,7.739690,1,0,0.017039,0.010330,-18.4265,0,0
4,922,1056,1,-211,34.72340,-50.0730,2795.770,1,2799.91,139.57,...,4.519330,-0.964477,3.603950,1,0,0.017039,0.010330,-18.4265,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12794,183,204,1,211,-177.91100,-113.5870,-163993.000,1,163993.00,139.57,...,-7.348500,-2.573380,-7.187350,1,0,0.002122,-0.005505,-29.5707,0,226
12795,184,205,1,-211,-6.41372,25.5844,-25406.500,1,25406.90,139.57,...,-7.563450,1.816420,-5.880520,1,0,0.002122,-0.005505,-29.5707,0,226
12796,189,210,1,211,143.71800,121.4070,2447.160,1,2458.35,139.57,...,3.260150,0.701441,3.041710,1,0,0.002122,-0.005505,-29.5707,0,226
12797,190,211,1,-211,112.36400,62.7669,8349.110,1,8351.27,139.57,...,4.865580,0.509424,4.476960,1,0,0.002122,-0.005505,-29.5707,0,226


In [None]:
event_pu_results[0][0]

In [84]:
df = pd.DataFrame(columns=['iPart', 'barcode', 'status', 'pdgId', 'px', 'py', 'pz', 'isCharged', 'm', 'pt', 'eta', 'phi', 'rapidity', 'hasProdVtx', 'prodVtx_id', 'prodVtx_x', 'prodVtx_y', 'prodVtx_z', 'hasDecayVtx', 'decayVtx_id'])

In [8]:
data_lists = [[] for _ in columns]
for line in lines[8:-2]:
    for i, element in enumerate(line.split()):
        data_lists[i].append(element)

In [13]:
data = {c: data_lists[i] for i, c in enumerate(columns)}

In [16]:
[len(d) for d in data_lists]

[13899,
 13899,
 13899,
 13671,
 13671,
 13671,
 13671,
 13671,
 13217,
 13217,
 13217,
 13217,
 13217,
 13217,
 13217,
 13217,
 13217,
 13217,
 13217,
 13217]

In [14]:
df = pd.DataFrame(data)

ValueError: arrays must all be same length

In [83]:
lines[-2]

'305\t200066\t1\t211\t-258.708\t-158.251\t504.431\t1\t604.9\t139.57\t303.271\t1.28206\t-2.59261\t1.20083\t1\t1201\t-42.6606\t19.7729\t2893.37\t0\t'