In [1]:
import pandas as pd
import numpy as np
import uproot3
import re
from scipy.stats import skew, kurtosis
from coordinates import get_coords
import os
from tqdm.notebook import tqdm
import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
def name_proc(name):
    """Derive the short label used in the output file name from a directory name.

    Parameters
    ----------
    name : str
        Directory string; the fixed character offsets below presumably match
        names such as ``eplus_ekin_sc01_1MeV/...`` and ``eplus_ekin_1MeV/...``
        (verify against the configured ``energy_dirs``).

    Returns
    -------
    str
        ``'0.'`` followed by characters 16..19 of ``name`` when ``name``
        contains ``'sc'``; otherwise characters 11..14 of ``name``.
    """
    has_sc_tag = 'sc' in name
    if not has_sc_tag:
        return name[11:15]
    return '0.' + name[16:20]

In [3]:
# Dataset switch read by the cells below: a truthy value selects the "test"
# sample directories and per-energy output files, a falsy value (0) selects
# the single "train" directory and the ProcessedTrain1M output file.
test = 0

In [4]:
# Choose the remote base path, the per-energy sub-directories and, for each
# sub-directory, the list of sample file names to process.
# `samples[k]` is the file list belonging to `energy_dirs[k]`.
if not test:
    # Training set: one directory holding sample_0.root .. sample_2499.root.
    path = "root://eos.jinr.ru//eos/juno/dirac/juno/user/d/dubna/sysu_data/samples/train/"
    energy_dirs = ["eplus_ekin_0_10MeV/23/root_data/"]
    samples = [[f'sample_{idx}.root' for idx in range(0, 2500)]]
else:
    # Test set: three "sc01" directories (1, 3, 6) followed by nine plain
    # directories (1..9); directory number j uses samples j*100 .. j*100+99.
    path = "root://eos.jinr.ru//eos/juno/dirac/juno/user/d/dubna/sysu_data/samples/test/"
    energy_dirs = (
        [f'eplus_ekin_sc01_{mev}MeV/23/root_data/' for mev in [1, 3, 6]]
        + [f'eplus_ekin_{mev}MeV/23/root_data/' for mev in range(1, 10)]
    )
    samples = [
        [f'sample_{idx}.root' for idx in range(block * 100, (block + 1) * 100)]
        for block in [1, 3, 6] + list(range(1, 10))
    ]

In [5]:
# Offset added to the (min-subtracted) hit times in the 1 / (t + eps) weights
# of the *_cht coordinates computed in the processing loop.
eps = 50
# Percentile levels passed to np.percentile: 2, 5, 10, ..., 95 (20 values).
# This is the 0..95 grid in steps of 5 with the leading 0 replaced by 2.
thr_array = np.array([2.0] + [5.0 * step for step in range(1, 20)])

In [None]:
def _geometry_features(x, y, z, tag):
    """Derive the geometric columns for one set of per-event centre coordinates.

    Parameters
    ----------
    x, y, z : np.ndarray
        Per-event coordinates (one entry per event).
    tag : str
        Column suffix, e.g. ``'cc'`` or ``'cht'``.

    Returns
    -------
    dict
        Column name -> array, in the order R, pho, x, y, z, gamma_z, gamma_y,
        gamma_x, theta, phi, jacob. The gamma ratios divide by a transverse
        norm and may be inf/nan when that norm is zero.
    """
    radius = (x**2 + y**2 + z**2)**0.5
    rho = (x**2 + y**2)**0.5
    theta = np.arctan2(rho, z)
    return {
        f'R_{tag}': radius,
        f'pho_{tag}': rho,
        f'x_{tag}': x,
        f'y_{tag}': y,
        f'z_{tag}': z,
        f'gamma_z_{tag}': z / rho,
        f'gamma_y_{tag}': y / (z**2 + x**2)**0.5,
        f'gamma_x_{tag}': x / (z**2 + y**2)**0.5,
        f'theta_{tag}': theta,
        f'phi_{tag}': np.arctan2(y, x),
        f'jacob_{tag}': radius**2 * np.sin(theta),
    }


def _extract_features(data):
    """Build the per-event feature table from one sample's hit-level frame.

    Parameters
    ----------
    data : pd.DataFrame
        Frame returned by ``tree.pandas.df("*")``; after ``reset_index`` it
        must expose the columns ``entry``, ``edep``, ``edepX``, ``edepY``,
        ``edepZ``, ``npe``, ``pmtID`` and ``hittime``.

    Returns
    -------
    pd.DataFrame
        One row per event, columns in the same order as the output CSV.
    """
    data = data.reset_index()
    # Keep only rows whose deposit position lies within radius 17200
    # (raw tree units; the positions are divided by 1000 below).
    data = data[data.edepX**2 + data.edepY**2 + data.edepZ**2 < 17200**2]

    # Split once by event, preserving first-appearance order. Events stay in
    # plain Python lists of Series: they have different hit counts, and
    # np.array() on ragged input raises on recent NumPy versions.
    events = [hits for _, hits in data.groupby('entry', sort=False)]
    n_events = len(events)

    edep = np.array([hits.edep.iloc[0] for hits in events])
    edepX = np.array([hits.edepX.iloc[0] / 1000. for hits in events])
    edepY = np.array([hits.edepY.iloc[0] / 1000. for hits in events])
    edepZ = np.array([hits.edepZ.iloc[0] / 1000. for hits in events])
    edepR = (edepX ** 2 + edepY ** 2 + edepZ ** 2) ** 0.5

    lpmt_charge = [hits.npe for hits in events]
    pmtIDs = [hits.pmtID for hits in events]
    # Hit times relative to the earliest hit of the same event.
    lpmt_fht = [hits.hittime - hits.hittime.min() for hits in events]

    x_cc, y_cc, z_cc = np.zeros(n_events), np.zeros(n_events), np.zeros(n_events)
    x_cht, y_cht, z_cht = np.zeros(n_events), np.zeros(n_events), np.zeros(n_events)
    accum_charge = np.zeros(n_events)
    nPMTs = np.zeros(n_events)

    for i, (ids, charge, fht) in enumerate(zip(pmtIDs, lpmt_charge, lpmt_fht)):
        lpmt_x, lpmt_y, lpmt_z = get_coords.get_lpmt_coords(ids)

        # Charge-weighted centre.
        total_charge = np.sum(charge)
        x_cc[i] = np.sum(lpmt_x * charge) / total_charge
        y_cc[i] = np.sum(lpmt_y * charge) / total_charge
        z_cc[i] = np.sum(lpmt_z * charge) / total_charge

        # Centre weighted by 1 / (hit time + eps).
        delay = fht + eps
        total_weight = np.sum(1 / delay)
        x_cht[i] = np.sum(lpmt_x / delay) / total_weight
        y_cht[i] = np.sum(lpmt_y / delay) / total_weight
        z_cht[i] = np.sum(lpmt_z / delay) / total_weight

        accum_charge[i] = total_charge
        nPMTs[i] = int(charge.shape[0])

    # Percentiles of hit time / charge at every level in thr_array
    # (rows = levels, columns = events) and hit-time gaps between
    # neighbouring levels.
    ht_ps = np.array([[np.percentile(fht, thr) for fht in lpmt_fht] for thr in thr_array])
    pe_ps = [[np.percentile(charge, thr) for charge in lpmt_charge] for thr in thr_array]
    diff_ht_ps = ht_ps[1:] - ht_ps[:-1]
    levels = [str(int(round(thr))) for thr in thr_array]

    columns = {'AccumCharge': accum_charge, 'nPMTs': nPMTs}
    columns.update(_geometry_features(x_cc, y_cc, z_cc, 'cc'))
    columns.update(_geometry_features(x_cht, y_cht, z_cht, 'cht'))

    # .mean()/.std() are the pandas Series methods (std is the sample
    # standard deviation, ddof=1); skew/kurtosis come from scipy.stats.
    columns['ht_std'] = [fht.std() for fht in lpmt_fht]
    columns['ht_mean'] = [fht.mean() for fht in lpmt_fht]
    columns['ht_skew'] = [skew(fht) for fht in lpmt_fht]
    columns['ht_kurtosis'] = [kurtosis(fht) for fht in lpmt_fht]

    columns['pe_std'] = [charge.std() for charge in lpmt_charge]
    columns['pe_mean'] = [charge.mean() for charge in lpmt_charge]
    columns['pe_skew'] = [skew(charge) for charge in lpmt_charge]
    columns['pe_kurtosis'] = [kurtosis(charge) for charge in lpmt_charge]

    # Column names follow the levels: ht_2p, ht_5p, ..., ht_5-2p, ..., pe_95p.
    for level, row in zip(levels, ht_ps):
        columns[f'ht_{level}p'] = row
    for lower, upper, row in zip(levels[:-1], levels[1:], diff_ht_ps):
        columns[f'ht_{upper}-{lower}p'] = row
    for level, row in zip(levels, pe_ps):
        columns[f'pe_{level}p'] = row

    columns['edep'] = edep
    columns['edepX'] = edepX
    columns['edepY'] = edepY
    columns['edepZ'] = edepZ
    columns['edepR'] = edepR

    return pd.DataFrame(columns)


# For every energy directory: download each sample, extract its per-event
# features, delete the local copy, and write all features of that directory
# to one gzip-compressed CSV.
for energy_dir, dir_samples in tqdm(list(zip(energy_dirs, samples)), "Energies..."):
    # Make sure the local download target exists (no-op when it already does).
    os.makedirs(energy_dir, exist_ok=True)

    # Collect per-sample frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame in a loop is quadratic.
    frames = []
    for sample in tqdm(dir_samples, "Samples...", leave=False):
        local_file = f'{energy_dir}{sample}'
        os.system(f"xrdcp {path}{energy_dir}{sample} {energy_dir}{sample}")
        try:
            tree = uproot3.open(local_file)['data']
            frames.append(_extract_features(tree.pandas.df("*")))
        except Exception as err:
            # Skip the sample but report why; KeyboardInterrupt is not caught.
            print(f'{energy_dir}{sample} is not processed: {err!r}')
        finally:
            # Remove the local copy whether or not processing succeeded.
            if os.path.isfile(local_file):
                os.remove(local_file)

    df = pd.concat(frames) if frames else pd.DataFrame()

    if test:
        df.to_csv(
            f'/mnt/cephfs/ml_data/mc_2021/processed_data_J19/ProcessedTest/{name_proc(energy_dir)}.csv.gz',
            index=False, compression='gzip'
        )
    else:
        df.to_csv(
            '/mnt/cephfs/ml_data/mc_2021/processed_data_J19/ProcessedTrain/ProcessedTrain1M.csv.gz',
            index=False, compression='gzip'
        )

Energies...:   0%|          | 0/1 [00:00<?, ?it/s]

Samples...:   0%|          | 0/2500 [00:00<?, ?it/s]

eplus_ekin_0_10MeV/23/root_data/sample_221.root is not processed
eplus_ekin_0_10MeV/23/root_data/sample_245.root is not processed
eplus_ekin_0_10MeV/23/root_data/sample_509.root is not processed
