In [2]:
import uproot
import pandas as pd
import numpy as np
import json
import dill
from numpy import  log, sqrt, exp, pi, e
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
from source import *
from numba import njit
import matplotlib.pyplot as plt
plt.rcParams['text.usetex'] = True
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pyarrow.parquet as pq
import pyarrow as pa


In [3]:
# Target number of rows drawn from dataset2 by the loading cell.
sample_size = 1_000_000
# NOTE(review): chunk_size is not referenced by any visible cell — confirm
# whether it is still needed.
chunk_size = 100_000

# Parquet datasets that the loading cell scans batch by batch.
# NOTE(review): judging by the paths, dataset1 is presumably the signal MC and
# dataset2 the merged generic MC — confirm.
dataset2 = ds.dataset("Gen_MC/merged.parquet", format="parquet")
dataset1 = ds.dataset("MC_sig_root/Bs_2tau_sig_MC.parquet", format="parquet")



In [4]:


# --- table1: the first record batch of dataset1 ------------------------------
# Only the first batch is materialised; the scanner is asked for batches of up
# to 1_000_000 rows.
first_batch = next(iter(dataset1.scanner(batch_size=1_000_000).to_batches()), None)
if first_batch is None:
    # Fail with a clear message instead of a NameError on `table1` later on.
    raise ValueError("dataset1 yielded no record batches; cannot build table1")
table1 = pa.Table.from_batches([first_batch]).to_pandas()

# --- table2: a random sample of sample_size rows from dataset2 ---------------
scanner = dataset2.scanner(batch_size=1_000_000)

# Collect the per-batch frames in a list and concatenate once at the end, so
# the rows already read are not re-copied on every iteration.
table2_parts = []
rows_read = 0
for batch in scanner.to_batches():
    part = pa.Table.from_batches([batch]).to_pandas()
    table2_parts.append(part)
    rows_read += len(part)

    # Stop as soon as strictly more than sample_size rows are available.
    if rows_read > sample_size:
        break

if not table2_parts:
    raise ValueError("dataset2 yielded no record batches; cannot build table2")

# A fixed seed makes the drawn sample identical after Restart & Run All.
# DataFrame.sample raises ValueError if fewer than sample_size rows were read.
table2 = pd.concat(table2_parts).sample(sample_size, random_state=42)

In [5]:
# Show (rows, columns) of both frames right after loading.
print(table1.shape, table2.shape)


(1000000, 29) (1000000, 20)


In [6]:
# Columns passed to safe_downcast(..., "unsigned") by the clean-up loop.
int8_cols = ["idec0", "idec1", "N_KL", "is0"]
int32_cols = ["__experiment__", "__run__", "__event__"]

# Columns passed to safe_downcast(..., "float") by the clean-up loop.
float32_cols = [
    "missedE",
    "M0",
    "p0",
    "recM2",
    "totalEnergyMC",
    "E_gamma_in_ROE",
    "Bs_lik",
]

# Columns cast to the nullable "boolean" dtype by the clean-up loop.
bool_cols = ["Miss_id_0", "Miss_id_1"]

# Columns removed from table1 by the clean-up loop.
to_drop = [
    "lost_gamma_0",
    "lost_pi_0",
    "lost_K_0",
    "lost_gamma_1",
    "lost_pi_1",
    "lost_K_1",
    "p_td",
    "Miss_id_0",
    "Miss_id_1",
    "N_tracks_in_ROE",
    "lost_nu_0",
    "lost_nu_1",
    "__candidate__",
    "__ncandidates__",
    "__weight__",
]

# Decay-mode index -> value that lost_nu_* is compared against.
decay_mode_to_nu = {
    0: 2,  # e+ nu_e nu_tau
    1: 2,  # mu+ nu_mu nu_tau
    2: 1,  # pi+ nu_tau
    3: 1,  # rho+ (pi+ pi0) nu_tau
    4: 1,  # pi+ pi+ pi- nu_tau
    5: 1,  # rho+ (pi+ gamma) nu_tau
}

# Decay-mode index -> value that lost_gamma_* is compared against.
decay_mode_to_gamma = {
    0: 0,  # e+ nu_e nu_tau
    1: 0,  # mu+ nu_mu nu_tau
    2: 0,  # pi+ nu_tau
    3: 0,  # rho+ (pi+ pi0) nu_tau
    4: 0,  # pi+ pi+ pi- nu_tau
    5: 1,  # rho+ (pi+ gamma) nu_tau
}

def safe_downcast(col, target_type):
    """Best-effort numeric downcast of a single column.

    Args:
        col: Column to convert; its ``name`` attribute is used in the
            failure message.
        target_type: Forwarded as the ``downcast`` argument of
            ``pd.to_numeric``.

    Returns:
        The result of ``pd.to_numeric(col, downcast=target_type)``. If that
        call raises, a message is printed and ``col`` itself is returned
        unchanged.
    """
    try:
        converted = pd.to_numeric(col, downcast=target_type)
    except Exception as e:
        # Report the failure and hand back the original column untouched.
        print(f"Ошибка при преобразовании {col.name} → {target_type}: {e}")
        return col
    else:
        return converted

# Derive per-side (idx = 0, 1) boolean flags on table1.
#
# correct_nu_<idx> / correct_gamma_<idx>: True where lost_nu_<idx> /
# lost_gamma_<idx> equals the value mapped from the decay mode idec<idx>.
# Modes missing from the mapping map to NaN and therefore compare False.
# The loops are ordered so that the columns are created in the order
# correct_nu_0, correct_nu_1, correct_gamma_0, correct_gamma_1, lost_0, lost_1.
for particle, expected_by_mode in (
    ("nu", decay_mode_to_nu),
    ("gamma", decay_mode_to_gamma),
):
    for idx in (0, 1):
        expected = table1[f"idec{idx}"].map(expected_by_mode)
        table1[f"correct_{particle}_{idx}"] = table1[f"lost_{particle}_{idx}"] == expected

# lost_<idx>: True where both lost_K_<idx> and lost_pi_<idx> are zero.
for idx in (0, 1):
    table1[f"lost_{idx}"] = (table1[f"lost_K_{idx}"] == 0) & (table1[f"lost_pi_{idx}"] == 0)

# Downcast target per column. Later entries win when a column appears in more
# than one list, so the integer lists take precedence over the float list.
downcast_target = {
    **dict.fromkeys(float32_cols, "float"),
    **dict.fromkeys(int32_cols, "unsigned"),
    **dict.fromkeys(int8_cols, "unsigned"),
}

# Drop the unwanted columns in a single call, before any casting, so no work
# is spent converting columns that are about to be removed and the frame is
# not modified while its columns are being iterated. A column that also has
# a downcast target is kept.
obsolete_cols = [
    col for col in table1.columns
    if col in to_drop and col not in downcast_target
]
table1 = table1.drop(columns=obsolete_cols)

for col in table1.columns:
    if col in bool_cols:
        table1[col] = table1[col].astype("boolean")
    if col in downcast_target:
        # pd.to_numeric chooses the smallest dtype of the requested kind
        # that can hold the data.
        table1[col] = safe_downcast(table1[col], downcast_target[col])



In [7]:
# Show (rows, columns) of both frames again, after the table1 clean-up cell.
print(table1.shape, table2.shape)


(1000000, 20) (1000000, 20)


In [8]:
# Stack both samples row-wise. ignore_index=True gives the result a fresh
# 0..n-1 index; without it the row labels of table1 and table2 are carried
# over, which leaves duplicate labels in the combined frame.
frames = [table1, table2]
a = pd.concat(frames, ignore_index=True)


In [10]:
# Remove the source_file column. Rebinding with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
a = a.drop(columns=["source_file"], errors="ignore")

In [11]:
# Write the combined frame to data.parquet (path relative to the working directory).
a.to_parquet("data.parquet")