This notebook imports the `Data_generator` class from `DD_data_extractor_git` and uses it to generate synthetic data for a physics experiment. It then cleans the data by removing outliers and stores the cleaned data in a pickle file.

## Data Generation

`Data_generator` is the main part of this script and is initialized with two parameters: the number of events (`numevents`) and a boolean indicating whether the generated data should be normalized or not. The synthetic data is a representation of the result of physics events involving multiple particles, where each particle is characterized by different properties such as `eta`, `mass`, `phi`, `pt`, `charge`, and `genPartFlav`.

Upon initialization, `Data_generator` configures a series of functions and input variables, which are then utilized in the `generate_fake_data` method to generate synthetic data. 

The `generate_fake_data` method starts by creating a dictionary, `data`, with keys for each variable and empty lists as values. It then generates random data for these variables using relevant statistical distributions which are based on the physical properties being simulated. 

After the data has been generated, `Data_generator` includes an optional step of renaming and reordering keys in the dictionary in a more standard format, before returning the data.

## Data Cleaning

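After generating the data, outliers are removed using the `remove_outliers` function. This function drops every event in which at least one filtered feature lies below its lower limit or above its upper limit. In this notebook the limits are not read from a YAML file (`yaml` is imported but not used in the cells shown); they are the 0.03rd and 99.7th percentiles of each numeric feature, computed by `compute_percentiles_from_pickle` from a previously saved reference pickle file. Features listed in `dontremove_outliers` are never used as filtering criteria.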

## Data Storage

Once the outliers have been removed, the data is then stored as a pickle file at a specified location for later use.



In [13]:
import pickle
import numpy as np
from copy import deepcopy
import sys
sys.path.append('../utils/')
from DD_data_extractor_git import Data_generator
import matplotlib.pyplot as plt
import pandas as pd
import yaml
import os

In [14]:
# Build the generator for two million events with normalization switched on,
# then pull the generated features into `data_dict` (read by the later cells).
num_events = 2_000_000
Data_generator1 = Data_generator(num_events, normalize=True)
data_dict = Data_generator1.getData()

Using 19 workers


Collecting results: 100%|██████████| 19/19 [00:00<00:00, 21.17it/s]
Applying functions: 50it [01:46,  2.14s/it]


In [15]:
# Quick sanity check on one feature: how many values were generated for
# '1_phi', followed by its first ten entries.
print(len(data_dict['1_phi']))
print(data_dict['1_phi'][:10])

2000000
[ 2.86729962  1.69881736  2.71736027  1.30041138 -3.0487028  -2.94071761
 -1.4018133   3.08674221 -1.90163112  2.47294006]


In [16]:
# Consistency check: every feature must hold as many values as the first
# feature in `data_dict`. Nothing is printed when all lengths agree.
reference_length = len(next(iter(data_dict.values())))
for feature_name, values in data_dict.items():
    if len(values) == reference_length:
        continue
    print(f"Feature '{feature_name}' has {len(values)} values, expected {reference_length}.")


In [17]:
def compute_percentiles_from_pickle(filename, exclude=None, lower_q=0.03, upper_q=99.7):
    """Compute per-feature lower/upper percentile limits from a pickled dict.

    The pickle file is expected to contain a mapping from feature name to a
    sequence of values. A feature contributes limits only if it is not
    excluded, is non-empty, and its first element is numeric.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file to read.
    exclude : iterable of str, optional
        Feature names for which no limits are computed. When omitted, the
        module-level ``dontremove_outliers`` list is used, which is what the
        function always did before this parameter existed.
    lower_q : float, optional
        Percentile (0-100) used as the lower limit. Defaults to 0.03.
    upper_q : float, optional
        Percentile (0-100) used as the upper limit. Defaults to 99.7.

    Returns
    -------
    tuple of (dict, dict)
        ``(lower_percentiles, upper_percentiles)``, both keyed by feature name.
    """
    # NOTE(review): the defaults are asymmetric (0.03 below vs. 100 - 99.7 = 0.3
    # above). They are kept as-is to preserve results; confirm this is intended.
    if exclude is None:
        # Backward-compatible fallback: resolved at call time, so the global
        # only has to exist before the function is called.
        exclude = dontremove_outliers
    excluded = set(exclude)

    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only point this function at files from a trusted source.
    with open(filename, 'rb') as f:
        raw_data_dict = pickle.load(f)

    lower_percentiles = {}
    upper_percentiles = {}
    for name, values in raw_data_dict.items():
        if name in excluded:
            continue
        # An empty feature has no first element to inspect and no percentile.
        if len(values) == 0:
            continue
        # Numeric-ness is decided from the first element only.
        if not np.issubdtype(type(values[0]), np.number):
            continue
        # One call yields both limits, so the values are processed only once.
        low, high = np.percentile(values, [lower_q, upper_q])
        lower_percentiles[name] = low
        upper_percentiles[name] = high

    return lower_percentiles, upper_percentiles

def remove_outliers(data, lower_percentiles, upper_percentiles, exclude=None):
    """Drop every row in which at least one filtered feature is out of range.

    A feature is used as a filtering criterion when it is not excluded and has
    an entry in ``lower_percentiles``. A row is removed if, for any such
    feature, its value is strictly below the lower limit or strictly above the
    upper limit. The same rows are then removed from every feature, including
    the ones that were not used as criteria.

    Parameters
    ----------
    data : dict
        Mapping from feature name to an equally long sequence of values.
    lower_percentiles : dict
        Lower limit per feature name.
    upper_percentiles : dict
        Upper limit per feature name; must contain every filtered feature.
    exclude : iterable of str, optional
        Feature names never used as criteria. When omitted, the module-level
        ``dontremove_outliers`` list is used, which is what the function
        always did before this parameter existed.

    Returns
    -------
    dict
        New dictionary with the same keys whose values are Python lists holding
        only the rows that were kept. ``data`` itself is not modified.
    """
    if exclude is None:
        # Backward-compatible fallback, resolved at call time.
        exclude = dontremove_outliers
    excluded = set(exclude)

    # Nothing to filter; previously this case raised StopIteration.
    if not data:
        return {}

    # The row count is taken from the first feature.
    n_rows = len(next(iter(data.values())))
    outlier_mask = np.zeros(n_rows, dtype=bool)

    for feature_name, values in data.items():
        if feature_name in excluded or feature_name not in lower_percentiles:
            continue
        # Convert once per feature instead of once per comparison.
        column = np.asarray(values)
        below = column < lower_percentiles[feature_name]
        above = column > upper_percentiles[feature_name]
        outlier_mask |= below | above

    # Apply the combined mask to every feature so all of them stay row-aligned.
    keep_mask = ~outlier_mask
    return {name: np.asarray(values)[keep_mask].tolist() for name, values in data.items()}

# Features that are never used as outlier criteria. Both
# compute_percentiles_from_pickle and remove_outliers read this list as a
# global, so it must be defined before either of them is called.
dontremove_outliers = [
    'event', 'genWeight', 'MET_phi',
    '1_phi', '1_genPartFlav', '2_phi', '2_genPartFlav', '3_phi', '3_genPartFlav',
    'charge_1', 'charge_2', 'charge_3',
    'pt_1', 'pt_2', 'pt_3', 'pt_MET',
    'eta_1', 'eta_2', 'eta_3',
    'mass_1', 'mass_2', 'mass_3',
]

# The percentile limits come from a previously saved reference pickle that
# lives under the parent of the current working directory.
base_path = os.path.dirname(os.getcwd())
raw_data_pickle_file = os.path.join(
    base_path, 'saved_files', 'extracted_data', 'TEST10_data_Aug3'
)
lower_percentiles, upper_percentiles = compute_percentiles_from_pickle(
    raw_data_pickle_file
)

# Filter a deep copy so the freshly generated `data_dict` is left untouched.
data_dictcopy = deepcopy(data_dict)
data_dict_removed_outliers2 = remove_outliers(
    data_dictcopy, lower_percentiles, upper_percentiles
)

# Show the available features and how many events survived the filtering.
print(data_dict_removed_outliers2.keys())
print(len(data_dict_removed_outliers2['mass_12']))

dict_keys(['event', 'genWeight', 'MET_phi', '1_phi', '1_genPartFlav', '2_phi', '2_genPartFlav', '3_phi', '3_genPartFlav', 'charge_1', 'charge_2', 'charge_3', 'pt_1', 'pt_2', 'pt_3', 'pt_MET', 'eta_1', 'eta_2', 'eta_3', 'mass_1', 'mass_2', 'mass_3', 'deltaphi_12', 'deltaphi_13', 'deltaphi_23', 'deltaphi_1MET', 'deltaphi_2MET', 'deltaphi_3MET', 'deltaphi_1(23)', 'deltaphi_2(13)', 'deltaphi_3(12)', 'deltaphi_MET(12)', 'deltaphi_MET(13)', 'deltaphi_MET(23)', 'deltaphi_1(2MET)', 'deltaphi_1(3MET)', 'deltaphi_2(1MET)', 'deltaphi_2(3MET)', 'deltaphi_3(1MET)', 'deltaphi_3(2MET)', 'deltaeta_12', 'deltaeta_13', 'deltaeta_23', 'deltaeta_1(23)', 'deltaeta_2(13)', 'deltaeta_3(12)', 'deltaR_12', 'deltaR_13', 'deltaR_23', 'deltaR_1(23)', 'deltaR_2(13)', 'deltaR_3(12)', 'pt_123', 'mt_12', 'mt_13', 'mt_23', 'mt_1MET', 'mt_2MET', 'mt_3MET', 'mt_1(23)', 'mt_2(13)', 'mt_3(12)', 'mt_MET(12)', 'mt_MET(13)', 'mt_MET(23)', 'mt_1(2MET)', 'mt_1(3MET)', 'mt_2(1MET)', 'mt_2(3MET)', 'mt_3(1MET)', 'mt_3(2MET)', 'ma

In [18]:
# Preview the first 30 'mass_12' values that remain after outlier removal.
print(data_dict_removed_outliers2['mass_12'][:30])

[83.66990839135985, 186.38373934681758, 149.23723598465253, 104.73620019765548, 79.78720041567763, 115.19047244685707, 103.55794033964735, 187.1008249688647, 268.14500854362166, 98.01004739849058, 190.66351109704678, 90.32116793823182, 143.83902090183196, 224.52883185117742, 112.5204179161392, 190.32785788163682, 34.752393649485875, 141.03692309729942, 77.38751946051595, 95.7951877022242, 87.70633372026211, 126.31470007635605, 47.817641081091445, 56.45543058611864, 132.18374784966355, 71.92480693368006, 109.88823039285298, 179.88182028553385, 232.41017498298265, 64.9818492372619]


In [19]:
# Destination: <parent of cwd>/saved_files/fake_data/Aug10_2mil.pkl
folder = "fake_data"
filename = "Aug10_2mil.pkl"
base_path = os.path.dirname(os.getcwd())
full_folder_path = os.path.join(base_path, "saved_files", folder)
full_file_path = os.path.join(full_folder_path, filename)

# Create the target folder if it is missing, then write the cleaned data.
os.makedirs(full_folder_path, exist_ok=True)
with open(full_file_path, 'wb') as f:
    pickle.dump(data_dict_removed_outliers2, f)

# Report which features were saved and how many events they contain.
print(data_dict_removed_outliers2.keys())
print(len(data_dict_removed_outliers2['mass_12']))

dict_keys(['event', 'genWeight', 'MET_phi', '1_phi', '1_genPartFlav', '2_phi', '2_genPartFlav', '3_phi', '3_genPartFlav', 'charge_1', 'charge_2', 'charge_3', 'pt_1', 'pt_2', 'pt_3', 'pt_MET', 'eta_1', 'eta_2', 'eta_3', 'mass_1', 'mass_2', 'mass_3', 'deltaphi_12', 'deltaphi_13', 'deltaphi_23', 'deltaphi_1MET', 'deltaphi_2MET', 'deltaphi_3MET', 'deltaphi_1(23)', 'deltaphi_2(13)', 'deltaphi_3(12)', 'deltaphi_MET(12)', 'deltaphi_MET(13)', 'deltaphi_MET(23)', 'deltaphi_1(2MET)', 'deltaphi_1(3MET)', 'deltaphi_2(1MET)', 'deltaphi_2(3MET)', 'deltaphi_3(1MET)', 'deltaphi_3(2MET)', 'deltaeta_12', 'deltaeta_13', 'deltaeta_23', 'deltaeta_1(23)', 'deltaeta_2(13)', 'deltaeta_3(12)', 'deltaR_12', 'deltaR_13', 'deltaR_23', 'deltaR_1(23)', 'deltaR_2(13)', 'deltaR_3(12)', 'pt_123', 'mt_12', 'mt_13', 'mt_23', 'mt_1MET', 'mt_2MET', 'mt_3MET', 'mt_1(23)', 'mt_2(13)', 'mt_3(12)', 'mt_MET(12)', 'mt_MET(13)', 'mt_MET(23)', 'mt_1(2MET)', 'mt_1(3MET)', 'mt_2(1MET)', 'mt_2(3MET)', 'mt_3(1MET)', 'mt_3(2MET)', 'ma