Lanqing, Apr 18 2023

This notebook shows a minimal example of using a matching method to do data selection. In this example, we use:
- Mahalanobis Distance
- Nearest Neighbor Matching
- Minimum Matching Rate Selection

We applied these settings to roughly select Rn220 decay data, based on matching them to a verified full-chain-based WFSim Rn220 dataset.

**NB: to run this notebook, you probably need at least 8 GB RAM! Please also use tag 2022.06.3**

In [None]:
import matching
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cutax
from matplotlib.colors import LogNorm

In [None]:
# Load data: Rn220 calibration runs with minimal cuts only.
# Upper edges of the (cs1, cs2) window used for both the selection and the plot.
cs1_max = 600
cs2_max = 20000

st = cutax.xenonnt_v8(output_folder='/project2/lgrandi/xenonnt/processed')

# Query runs with run_mode 'tpc_radon*' and the '_sr0' tag, then keep only
# those for which this context reports 'event_info' as stored.
rn_run_names = st.select_runs(run_mode='tpc_radon*',
                              available='event_info',
                              include_tags='_sr0').name
is_stored = [st.is_stored(run, 'event_info') for run in rn_run_names]
rn_available = rn_run_names[is_stored]

# Register the cut plugins. The order is kept exactly as in the original
# cell in case a later registration is meant to override an earlier one.
for cut_module in (
    cutax.cuts.fiducial_volume,
    cutax.cuts.junk,
    cutax.cuts.s2_area,
    cutax.cut_lists.basic,
    cutax.cut_lists.rn220,
    cutax.cuts.s2_width,
    cutax.cuts.s1_max_pmt,
    cutax.cuts.s1_aft,
    cutax.cuts.s1_single_scatter,
    cutax.cuts.s1_width,
    cutax.cuts.naive_bayes_classifier,
    cutax.cuts.s2_aft,
    cutax.cuts.s2_recon_pos_diff,
    cutax.cuts.s2_pattern,
    cutax.cuts.s2_single_scatter,
    cutax.cuts.s1_pattern,
    cutax.cuts.shadow,
    cutax.cuts.ambience,
):
    st.register_all(cut_module)

# NOTE(review): `basic_cuts` and `rn_new_cuts` are not defined in any visible
# cell of this notebook, so this line raises NameError on a fresh kernel.
# They are presumably lists of cut data-type names; define them in a cell
# above before running (TODO confirm the intended contents with the author).
# Only the first 50 available runs are loaded.
rn = st.get_df(rn_available[:50], ['event_info', 'cuts_rn220'] + basic_cuts + rn_new_cuts)

# Keep events passing the minimal cuts inside the (cs1, cs2) window.
rn = rn[rn['cuts_minimal'] & (rn['cs1'] < cs1_max) & (rn['cs2'] < cs2_max)]

# `rn` already contains only cuts_minimal events, so no extra mask is needed here.
plt.figure(dpi=150)
plt.hist2d(rn['cs1'],
           rn['cs2'],
           bins=(np.linspace(0, cs1_max, 100), np.linspace(0, cs2_max, 100)),
           norm=LogNorm())
plt.xlabel('CS1 [PE]')
plt.ylabel('CS2 [PE]')
plt.title('Data With Minimal Cuts')

In [None]:
# Load simulation kindly provided by Pavel.
# The simulation is split over several CSV files. Each part is read and
# immediately reduced to the (cs1, cs2) window before the next one is read,
# so that only one unfiltered part is held in memory at a time.
gc.collect()

sim_dir = '/project2/lgrandi/pkavrigin/2023-04-04_Pb/pb212_nest_nocl_20230402/ei_ms'
sim_files = (
    'pb212_nest_nocl_20230402_ei_ms_Merged1177Files_Part0.csv',
    'pb212_nest_nocl_20230402_ei_ms_Merged1177Files_Part1.csv',
    'pb212_nest_nocl_20230402_ei_ms_Merged146Files_Part2.csv',
)

sim_parts = []
for sim_file in sim_files:
    sim_part = pd.read_csv(f'{sim_dir}/{sim_file}')
    sim_parts.append(sim_part[(sim_part['cs2'] < 20000) & (sim_part['cs1'] < 600)])
    # Drop the reference to the unfiltered frame before reading the next file.
    del sim_part
    gc.collect()

# Single concatenation in file order; the per-file row index is kept as-is
# (no ignore_index), matching the result of concatenating part by part.
pavel = pd.concat(sim_parts)
del sim_parts
gc.collect()

# z window applied to the simulation (only z is restricted in this cell).
pavel = pavel[(pavel['z'] > -134.23) & (pavel['z'] < -13.6)]

plt.figure(dpi=150)
plt.hist2d(pavel['cs1'],
           pavel['cs2'],
           bins=(np.linspace(0, 600, 100), np.linspace(0, 20000, 100)),
           norm=LogNorm())
plt.xlabel('CS1 [PE]')
plt.ylabel('CS2 [PE]')
plt.title('Simulation Inside Fiducial Volume')

In [None]:
# Matching-based selection of data events against the simulation.
# Covariates used for the matching are listed below.
# min_match_rate is a parameter you can play around with: per the author's
# note, the higher it goes the better the match, at the cost of a larger
# loss in statistics.
match_covariates = ['z', 's2_range_50p_area', 's1_area_fraction_top', 's2_area']

sel = matching.MinimumMatchingRate(data=rn, simu=pavel, covariates=match_covariates)
mask = sel.select(min_match_rate=1)
rn_selected = rn[mask]

# (cs1, cs2) distribution of the events kept by the matching selection.
cs1_bins = np.linspace(0, 600, 100)
cs2_bins = np.linspace(0, 20000, 100)

fig, ax = plt.subplots(dpi=150)
ax.hist2d(rn_selected['cs1'], rn_selected['cs2'], bins=(cs1_bins, cs2_bins), norm=LogNorm())
ax.set_xlabel('CS1 [PE]')
ax.set_ylabel('CS2 [PE]')
ax.set_title('Matching Based Selection')
plt.show()

In [None]:
# For comparison: the traditional cut-based selection.
# Build the combined cut mask once and reuse it for both axes.
cut_based_mask = (
    rn['cuts_minimal']
    & rn['cut_s2_width']
    & rn['cut_s1_area_fraction_top']
    & rn['cut_rn220_s2_area']
)
rn_cut_based = rn[cut_based_mask]

cs1_bins = np.linspace(0, 600, 100)
cs2_bins = np.linspace(0, 20000, 100)

fig, ax = plt.subplots(dpi=150)
ax.hist2d(rn_cut_based['cs1'], rn_cut_based['cs2'], bins=(cs1_bins, cs2_bins), norm=LogNorm())
ax.set_xlabel('CS1 [PE]')
ax.set_ylabel('CS2 [PE]')
ax.set_title('Data With Minimal Cuts + S2 Width + S2 Area + S1 AFT')

In [None]:
# You can also check the matched 1D distribution in covariates.
def plot_covariate_comparison(samples, column, bins, xlabel, log_y=False):
    """Overlay normalized step histograms of one column for several samples.

    Parameters
    ----------
    samples : dict
        Maps legend label -> DataFrame; histograms are drawn in dict order.
    column : str
        Name of the column to histogram in every DataFrame.
    bins : array-like
        Bin edges shared by all samples.
    xlabel : str
        Label for the x axis.
    log_y : bool, optional
        If True, use a logarithmic y axis. Defaults to False (linear).
    """
    plt.figure(dpi=100)
    for label, frame in samples.items():
        # density=True so samples of very different size can be compared by shape.
        plt.hist(frame[column], bins=bins, density=True, label=label, histtype='step')
    plt.xlabel(xlabel)
    plt.ylabel('Frequency [AU]')
    if log_y:
        plt.yscale('log')
    plt.legend()
    plt.show()


# Full data, simulation, and the matching-selected data, in plotting order.
samples = {'data': rn, 'simu': pavel, 'selected data': rn_selected}

# (column, bin edges, x-axis label) for each matched covariate.
covariate_plots = [
    ('z', np.linspace(-135, -13, 40), 'Z [cm]'),
    ('s2_range_50p_area', np.linspace(0, 2E4, 40), 'S2 Width [ns]'),
    ('s1_area_fraction_top', np.linspace(0, 1, 40), 'S1 Area Fraction Top'),
    ('s2_area', np.linspace(0, 20000, 40), 'S2 Area [PE]'),
]

for column, bins, xlabel in covariate_plots:
    plot_covariate_comparison(samples, column, bins, xlabel)