In [20]:
import numpy as np

from tqdm import tqdm
import pandas as pd

import gc

In [21]:
# Location of each input feature file, keyed by dataset name.
# Forward slashes are used as the separator: a backslash inside a normal
# string literal starts an escape sequence ("\e" is an invalid one and
# triggers a warning on current Python versions), and "/" is accepted as a
# path separator on Windows as well as on POSIX systems.
filenames = {
    "herwig": "GAN-data/events_anomalydetection_DelphesHerwig_qcd_features.h5",
    "pythiabg": "GAN-data/events_anomalydetection_DelphesPythia8_v2_qcd_features.h5",
    "pythiasig": "GAN-data/events_anomalydetection_DelphesPythia8_v2_Wprime_features.h5"
}

# Dataset names, in the same order as the keys above.
datatypes = list(filenames)

# Columns selected from the processed frame for the exported table.
train_features = ["mjj", "mj1", "eta1", "phi1", "ej1", "tau21j1", "mj2", "eta2", "phi2", "ej2", "tau21j2"]

In [22]:
def load_data(datatype, stop = None):
    """Read one dataset from HDF5 and append derived two-jet columns.

    Parameters
    ----------
    datatype
        Key into the module-level ``filenames`` mapping; selects the file.
    stop
        Passed through unchanged as the ``stop`` argument of ``pd.read_hdf``.

    Returns
    -------
    pandas.DataFrame
        A copy of the stored table in which, row by row, the "j1" columns
        hold the jet with the larger ``mj`` and the "j2" columns the other
        one, extended (in this order) with ``mjdelta``, ``pj1``, ``pj2``,
        ``ej1``, ``ej2``, ``ejj``, ``pjj``, ``mjj``, ``tau21j1``,
        ``tau32j1``, ``tau21j2``, ``tau32j2``, ``ptj1``, ``phi1``, ``eta1``,
        ``ptj2``, ``phi2`` and ``eta2``.
    """
    raw = pd.read_hdf(filenames[datatype], stop = stop)
    frame = raw.copy()

    # Both masks come from the untouched input, so overwriting mj1/mj2 in
    # the copy below cannot change which rows get swapped. Rows where
    # neither comparison holds (e.g. a NaN mass) get the zero-weighted sum.
    keep = raw["mj1"] >= raw["mj2"]
    swap = raw["mj1"] < raw["mj2"]
    for prefix in ("px", "py", "pz", "m", "tau1", "tau2", "tau3"):
        lead, sub = prefix + "j1", prefix + "j2"
        frame[lead] = keep*raw[lead] + swap*raw[sub]
        frame[sub] = keep*raw[sub] + swap*raw[lead]

    # The input table is no longer needed; release it before adding columns.
    del raw, keep, swap
    gc.collect()

    jets = ("j1", "j2")

    frame["mjdelta"] = frame["mj1"] - frame["mj2"]

    # Momentum magnitude, then energy from mass and momentum, per jet.
    for jet in jets:
        frame["p" + jet] = np.sqrt(frame["px" + jet]**2 + frame["py" + jet]**2 + frame["pz" + jet]**2)
    for jet in jets:
        frame["e" + jet] = np.sqrt(frame["m" + jet]**2 + frame["p" + jet]**2)

    # Combined two-jet energy, momentum magnitude and invariant mass.
    frame["ejj"] = frame["ej1"] + frame["ej2"]
    total_px = frame["pxj1"] + frame["pxj2"]
    total_py = frame["pyj1"] + frame["pyj2"]
    total_pz = frame["pzj1"] + frame["pzj2"]
    frame["pjj"] = np.sqrt(total_px**2 + total_py**2 + total_pz**2)
    frame["mjj"] = np.sqrt(frame["ejj"]**2 - frame["pjj"]**2)

    # Ratios of consecutive tau columns, per jet.
    for jet in jets:
        frame["tau21" + jet] = frame["tau2" + jet] / frame["tau1" + jet]
        frame["tau32" + jet] = frame["tau3" + jet] / frame["tau2" + jet]

    # Transverse momentum, azimuth and arcsinh(pz / pt), per jet.
    for index in ("1", "2"):
        jet = "j" + index
        frame["pt" + jet] = np.sqrt(frame["px" + jet]**2 + frame["py" + jet]**2)
        frame["phi" + index] = np.arctan2(frame["py" + jet], frame["px" + jet])
        frame["eta" + index] = np.arcsinh(frame["pz" + jet] / frame["pt" + jet])

    return frame

In [23]:
# Build the signal-sample features, discard every row that contains a
# missing value, renumber the remaining rows from zero and cast all
# columns to 32-bit floats.
df = (
    load_data("pythiasig")
    .dropna()
    .reset_index(drop = True)
    .astype('float32')
)

In [24]:
# Keep only the training columns, then drop the full frame and ask the
# garbage collector to reclaim it.
output_df = df.loc[:, train_features]
del df
gc.collect()

0

In [25]:
# Export the training table; index=True is the pandas default, spelled out
# here to make it visible that the row index is written as the first column.
output_df.to_csv('pythiasig.csv', index = True)