# Cleaning and preprocessing
This notebook merges the cleaning and preprocessing steps from the thesis into a single pass, so that fewer arrays from intermediate steps have to be stored.

In [2]:
import numpy as np
import awkward1 as ak

import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from scipy.stats import binned_statistic_2d

import gc

In [3]:
import sys
sys.path.append("../helpers/")
from variables import *
from tools import *

Explanation of steps is here, to not blow up the code later

## Cleaning

- flavor does not have to be inferred from different columns, but is already given
- datastructure is either awkward array (but rectangular, high-level) or numpy array (also rectangular, low-level), no need to flatten again
- number of tracks is already implemented in low-level using placeholders for padding

Or, to summarize, one just has to identify the defaults and set them to the designated default value. These can be the ones calculated in `defaults.ipynb`, and later on, during scaling, these can be set to 0 if one wants to do so (zero-padding, otherwise it's minimum-padding).

## Preprocessing

- First, one might want to merge the ak- and np-arrays to reuse the logic when applying the scalers
- To calculate the scalers, merging the arrays is not necessary: each column (ak or np) can be processed individually, so only the feature(s) currently being looked at have to be in memory, not the entire dataset
- To apply the scalers, one would rather use an entire sample with all its columns, split into chunks of e.g. 50k samples, because more global info (bins and sample weights) is stored at the same time - this should not be calculated more than once per jet
- Choose between uniformly-spaced and custom-binning

## Definition of variables
Select first six tracks (in d0Sig-ordering). This defines which indices are relevant.

However, I'll preprocess the entire dataset with all its columns, to have them at hand if one really needs them all - selecting columns later is not problematic.

<hr>

Note: the following has been externalized into `helpers/variables.py`.

In [19]:
n_tracks = 6
# 28 low-level variables, each stored for 33 track slots: variable i starts at flat offset i*33.
# Broadcasting the per-variable offsets against the first n_tracks slots selects the wanted columns.
variable_offsets = 33 * np.arange(28)
low_level_indices = (variable_offsets[:, np.newaxis] + np.arange(n_tracks)).flatten()
low_level_indices

array([  0,   1,   2,   3,   4,   5,  33,  34,  35,  36,  37,  38,  66,
        67,  68,  69,  70,  71,  99, 100, 101, 102, 103, 104, 132, 133,
       134, 135, 136, 137, 165, 166, 167, 168, 169, 170, 198, 199, 200,
       201, 202, 203, 231, 232, 233, 234, 235, 236, 264, 265, 266, 267,
       268, 269, 297, 298, 299, 300, 301, 302, 330, 331, 332, 333, 334,
       335, 363, 364, 365, 366, 367, 368, 396, 397, 398, 399, 400, 401,
       429, 430, 431, 432, 433, 434, 462, 463, 464, 465, 466, 467, 495,
       496, 497, 498, 499, 500, 528, 529, 530, 531, 532, 533, 561, 562,
       563, 564, 565, 566, 594, 595, 596, 597, 598, 599, 627, 628, 629,
       630, 631, 632, 660, 661, 662, 663, 664, 665, 693, 694, 695, 696,
       697, 698, 726, 727, 728, 729, 730, 731, 759, 760, 761, 762, 763,
       764, 792, 793, 794, 795, 796, 797, 825, 826, 827, 828, 829, 830,
       858, 859, 860, 861, 862, 863, 891, 892, 893, 894, 895, 896])

In [17]:
# All high-level columns come first; the low-level indices are shifted behind them.
high_level_columns = np.arange(n_high_level)
shifted_low_level_columns = n_high_level + low_level_indices
relevant_columns_overall = np.concatenate((high_level_columns, shifted_low_level_columns))
relevant_columns_overall

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  50,  51,  52,
        53,  54,  55,  83,  84,  85,  86,  87,  88, 116, 117, 118, 119,
       120, 121, 149, 150, 151, 152, 153, 154, 182, 183, 184, 185, 186,
       187, 215, 216, 217, 218, 219, 220, 248, 249, 250, 251, 252, 253,
       281, 282, 283, 284, 285, 286, 314, 315, 316, 317, 318, 319, 347,
       348, 349, 350, 351, 352, 380, 381, 382, 383, 384, 385, 413, 414,
       415, 416, 417, 418, 446, 447, 448, 449, 450, 451, 479, 480, 481,
       482, 483, 484, 512, 513, 514, 515, 516, 517, 545, 546, 547, 548,
       549, 550, 578, 579, 580, 581, 582, 583, 611, 612, 613, 614, 615,
       616, 644, 645, 646, 647, 648, 649, 677, 678, 679, 680, 681, 682,
       710, 711, 712, 713, 714, 715, 743, 744, 745, 746, 747, 748, 776,
       777, 778, 779, 780, 781, 809, 810, 811, 812, 813, 814, 842, 843,
       844, 845, 846, 847, 875, 876, 877, 878, 879, 880, 908, 90

Index 2 corresponds to flavor = target

In [20]:
# Sanity check: n_high_level high-level columns plus 28 variables x n_tracks low-level columns.
len(relevant_columns_overall)

185

In [4]:
# Input locations: both array flavours live below the same per-user dataset directory.
user = 'um106329'
dataset_path = '/hpcwork/' + user + '/jet_flavor_MLPhysics/dataset/'
akArrays_path = dataset_path + 'akArrays/'
np_arrays_path = dataset_path + 'npArrays/'

In [5]:
# Directory holding the per-feature default values (all_defaults*.npy, custom_default_*.npy, ...).
defaults_path = '/hpcwork/' + user + '/jet_flavor_MLPhysics/dataset/defaults/'

In [6]:
# Directory holding the flavour weight look-up tables and the pt/eta bin edges.
weights_path = '/hpcwork/' + user + '/jet_flavor_MLPhysics/dataset/weights/'

In [7]:
# Output directory for scalers and for the preprocessed inputs / targets / bins / weights.
preprocessed_path = '/hpcwork/' + user + '/jet_flavor_MLPhysics/dataset/preprocessed/'

In [8]:
# externalize this!
# Full-array column indices of the features stored in a4 (3..10) and a5 (11..16).
a4_indices_list = list(range(3, 11))
a5_indices_list = list(range(11, 17))

In [9]:
def clean_any_numpy_array(npArray, defaults=None):
    """Set non-finite entries and entries below the default to the default value.

    Parameters
    ----------
    npArray : numpy.ndarray
        Either a 2-D array (one column per feature) or a 1-D array holding a
        single feature.
    defaults : array-like or scalar, optional
        For 2-D input: one default per column; the first ``len(defaults)``
        columns are cleaned. For 1-D input: the default of that feature.
        If omitted, the defaults are loaded with ``get_all_defaults()`` when the
        function is called.

    Returns
    -------
    numpy.ndarray
        For 2-D input the very same array object, modified in place.
        For 1-D input a new array; the input is left untouched.
    """
    # Resolve the default lazily: a call in the signature would run (and read from
    # disk) once at definition time, before get_all_defaults is even defined below.
    if defaults is None:
        defaults = get_all_defaults()

    # check if single column or more than one column
    if npArray.ndim > 1:
        for i in range(len(defaults)):
            # first remove NaN / +-inf, afterwards the comparison with the default is well defined
            npArray[:,i] = np.where(~np.isfinite(npArray[:,i]), defaults[i], npArray[:,i])
            npArray[:,i] = np.where(npArray[:,i] < defaults[i], defaults[i], npArray[:,i])
    else:
        npArray = np.where(~np.isfinite(npArray), defaults, npArray)
        npArray = np.where(npArray < defaults, defaults, npArray)

    return npArray

In [10]:
def get_default_from_full_index(index): # externalize this!
    """Return the default value stored for column ``index`` of the full (flat) array.

    The defaults are re-read from ``all_defaults.npy`` below ``defaults_path`` on every call.
    """
    all_defaults = np.load(defaults_path+'all_defaults.npy')
    return all_defaults[index]

In [11]:
def get_all_defaults(scaled=False, old=False):
    """Load one of the four stored default arrays from ``defaults_path``.

    Parameters
    ----------
    scaled : bool
        Load the scaled defaults instead of the unscaled ones.
    old : bool
        Load the ``_OLD`` variant of the selected file.

    Returns
    -------
    numpy.ndarray
        Content of the selected ``.npy`` file.
    """
    if scaled and old:
        filename = 'all_scaled_defaults_OLD.npy'
    elif scaled:
        filename = 'all_scaled_defaults.npy'
    elif old:
        filename = 'all_defaults_OLD.npy'
    else:
        # this is the "default default" case
        filename = 'all_defaults.npy'
    return np.load(defaults_path+filename)

In [12]:
def get_track_vertex_index_from_full_index(index):
    """Convert a column index of the full array into an index relative to the low-level block.

    The low-level columns follow the ``n_high_level`` high-level ones, so the offset is subtracted.
    """
    track_vertex_index = index - n_high_level
    return track_vertex_index

In [13]:
def get_var_and_track_from_track_vertex_index(index):
    """Split a low-level index into ``(variable number, track number)``.

    Each variable occupies 33 consecutive slots (one per track), so the quotient
    is the variable and the remainder is the track.
    """
    return index // 33, index % 33

In [None]:
def column_builder(index):
    """Unfinished stub: the cell was never completed nor executed.

    The header was left without colon and body, which is a SyntaxError and stops
    a "Run All". Until the intended behaviour is written down, calling the
    function fails loudly instead.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('column_builder has not been implemented yet')

In [14]:
def flat_samples_builder(split_a,split_b):
    """Build one flat 2-D array (high-level columns first, low-level columns last) for one chunk.

    Parameters
    ----------
    split_a, split_b
        Values inserted into the file names of the chunk, both for the parquet
        file with the high-level fields a1 ... a5 and for the ``.npy`` file with
        the sorted track/vertex information.

    Returns
    -------
    numpy.ndarray
        Columns a1, a2, a3, then those of a4 and a5, followed by all low-level columns.
    """
    akArray = ak.from_parquet(akArrays_path+'split_{0}_{1}.parquet'.format(split_a,split_b))
    a1 = ak.to_numpy(akArray.a1).data
    a2 = ak.to_numpy(akArray.a2).data
    a3 = ak.to_numpy(akArray.a3).data
    a4 = ak.to_numpy(akArray.a4).data
    a5 = ak.to_numpy(akArray.a5).data

    # a1, a2, a3 are single columns: put them side by side as an (n_samples, 3) block
    a1_2_3Array = np.stack([a1, a2, a3], axis=1)
    highlevelArray = np.concatenate((a1_2_3Array, a4, a5), axis=1)

    # drop the intermediate pieces before the (large) low-level array is loaded
    del akArray, a1_2_3Array, a1, a2, a3, a4, a5
    gc.collect()

    lowlevelArray = np.load(np_arrays_path+'sorted_track_vertex_info_split_{0}_{1}.npy'.format(split_a,split_b))

    return np.concatenate((highlevelArray, lowlevelArray), axis=1)

In [181]:
%%time
# Quick check of the builder on the chunk whose files are named split_0_49999.
some_samples = flat_samples_builder(0,49999)
some_samples

17
924
CPU times: user 299 ms, sys: 287 ms, total: 585 ms
Wall time: 585 ms


array([[ 4.78712e+01,  1.89325e+00,  5.00000e+00, ..., -1.00000e+03,
        -1.00000e+03, -1.00000e+03],
       [ 3.49703e+01,  6.09755e-01,  5.00000e+00, ..., -1.00000e+03,
        -1.00000e+03, -1.00000e+03],
       [ 2.65706e+01, -5.30268e-01,  5.00000e+00, ..., -1.00000e+03,
        -1.00000e+03, -1.00000e+03],
       ...,
       [ 2.67027e+01,  3.07903e-01,  5.00000e+00, ..., -1.00000e+03,
        -1.00000e+03, -1.00000e+03],
       [ 3.94304e+01, -1.74927e+00,  5.00000e+00, ..., -1.00000e+03,
        -1.00000e+03, -1.00000e+03],
       [ 2.84135e+01, -2.41514e+00,  5.00000e+00, ..., -1.00000e+03,
        -1.00000e+03, -1.00000e+03]])

In [182]:
# Number of columns per sample in the flat array (high-level and low-level together).
some_samples[0].size

941

In [191]:
# Note: for 2-D input clean_any_numpy_array writes into the array it is given and returns it,
# so some_clean_samples and some_samples refer to the same (now cleaned) data.
some_clean_samples = clean_any_numpy_array(some_samples, get_all_defaults())
some_clean_samples

array([[47.8712  ,  1.89325 ,  5.      , ...,  0.      ,  0.      ,
         0.      ],
       [34.9703  ,  0.609755,  5.      , ...,  0.      ,  0.      ,
         0.      ],
       [26.5706  , -0.530268,  5.      , ...,  0.      ,  0.      ,
         0.      ],
       ...,
       [26.7027  ,  0.307903,  5.      , ...,  0.      ,  0.      ,
         0.      ],
       [39.4304  , -1.74927 ,  5.      , ...,  0.      ,  0.      ,
         0.      ],
       [28.4135  , -2.41514 ,  5.      , ...,  0.      ,  0.      ,
         0.      ]])

In [15]:
def calc_scalers(columns, defaults): # should not store directly, but return a list of scalers (store individual scaler later)
    """Fit one StandardScaler per feature on the training part of ``columns``.

    Parameters
    ----------
    columns : numpy.ndarray
        1-D array (single feature) or 2-D array (one column per feature).
    defaults : scalar or array-like
        Default value(s) of the feature(s); entries equal to the default are
        excluded from the fit.

    Returns
    -------
    list
        The fitted scalers, one per feature (a one-element list for 1-D input).
    """
    # Only the training samples enter the scalers. They are obtained with the same two
    # consecutive splits (test first, then validation) and the same seed as in the preprocessing.
    train_and_val, _ = train_test_split(columns, test_size=0.2, random_state=1)
    trainingsamples, _ = train_test_split(train_and_val, test_size=0.1, random_state=1)

    # single feature: one scaler, defaults is compared directly
    if not trainingsamples.ndim > 1:
        non_default = trainingsamples[trainingsamples != defaults]
        return [StandardScaler().fit(non_default.reshape(-1,1))]

    # several features: do not compute scalers with default values, which were set to minima
    scalers = []
    for i in range(np.size(defaults)):
        feature = trainingsamples[:,i]
        non_default = feature[feature != defaults[i]]
        scalers.append(StandardScaler().fit(non_default.reshape(-1,1)))
    return scalers

In [16]:
def preprocess_apply_scalers(split_a,split_b,chunk_number,defaults_to_zero=False):
    """Clean, split, weight and scale one chunk of samples and store all results on disk.

    Steps:
      1. build the flat array with ``flat_samples_builder`` and clean it with
         ``clean_any_numpy_array``;
      2. split into train / val / test with fixed seeds (same fractions and seeds
         as used when the scalers were computed);
      3. per subset, store the targets, the eta/pt bin numbers (uniform and custom
         binning) and the sample weights looked up from the flavour tables;
      4. scale every non-target column with its scaler and store the inputs.

    Parameters
    ----------
    split_a, split_b
        Values identifying the chunk files (forwarded to ``flat_samples_builder``).
    chunk_number
        Suffix used in all output file names.
    defaults_to_zero : bool
        If True, entries equal to the default of their column are set to 0 instead
        of the scaled default, and ``'default_to_zero'`` is appended to the names
        of the input files.

    Returns
    -------
    None
        Everything is written below ``preprocessed_path``.
    """
    # column of the flat array that holds the flavour, i.e. the target
    target_index = 2

    defaults = get_all_defaults()

    # NOTE(review): the scalers are read from preprocessed_path, whereas a later cell of this
    # notebook stores the re-assembled list under defaults_path+'all_scalers.pt' - confirm that
    # the file read here is the intended one.
    scalers = torch.load(preprocessed_path+'all_scalers.pt')

    # build flat array for this chunk with all variables
    dataset = flat_samples_builder(split_a, split_b)
    # clean all variables
    dataset = clean_any_numpy_array(dataset, defaults)


    # split tr va te
    trainingset, testset = train_test_split(dataset, test_size=0.2, random_state=1)
    trainset, valset = train_test_split(trainingset,test_size=0.1, random_state=1)
    del trainingset
    gc.collect()



    # uniformly spaced binning
    b_weights = np.load(weights_path+'b_weights_pt_eta_flav_balanced.npy')
    c_weights = np.load(weights_path+'c_weights_pt_eta_flav_balanced.npy')
    l_weights = np.load(weights_path+'l_weights_pt_eta_flav_balanced.npy')

    # first axis is the lookup index of the flavour: 0 = b, 1 = c, 2 = l
    flavour_lookuptables = np.array([b_weights,c_weights,l_weights])

    pt_edges = np.load(weights_path+'pt_edges.npy')
    eta_edges = np.load(weights_path+'eta_edges.npy')


    # custom binning
    b_weights_alt = np.load(weights_path+'b_weights_pt_eta_flav_balanced_alt.npy')
    c_weights_alt = np.load(weights_path+'c_weights_pt_eta_flav_balanced_alt.npy')
    l_weights_alt = np.load(weights_path+'l_weights_pt_eta_flav_balanced_alt.npy')

    flavour_lookuptables_alt = np.array([b_weights_alt,c_weights_alt,l_weights_alt])

    pt_edges_alt = np.load(weights_path+'pt_edges_alt.npy')
    eta_edges_alt = np.load(weights_path+'eta_edges_alt.npy')


    def extract_store_sample_info(sample, name):
        """Store targets, eta/pt bin numbers and sample weights of one subset (train/val/test)."""

        # column 1 (eta) and column 0 (pt) are binned; the bin numbers are shifted by -1
        # to be used as indices into the lookup tables
        _,_,_,these_eta_pt_bins = binned_statistic_2d(sample[:,1],sample[:,0],None,'count',bins=(eta_edges,pt_edges),expand_binnumbers=True)
        these_eta_bins = these_eta_pt_bins[0]-1
        these_pt_bins = these_eta_pt_bins[1]-1

        _,_,_,these_eta_pt_bins_alt = binned_statistic_2d(sample[:,1],sample[:,0],None,'count',bins=(eta_edges_alt,pt_edges_alt),expand_binnumbers=True)
        these_eta_bins_alt = these_eta_pt_bins_alt[0]-1
        these_pt_bins_alt = these_eta_pt_bins_alt[1]-1


        # go from flavour to index in lookup table, store 5,4,0 --> 0,1,2
        def flavour_to_lookup_index(flavour):
            """Map flavour 0 -> 2, 4 -> 1, 5 -> 0; other values are left unchanged."""
            flavour = np.where(flavour == 0, 2, flavour)
            flavour = np.where(flavour == 4, 1, flavour)
            flavour = np.where(flavour == 5, 0, flavour)
            return flavour

        these_targets = (torch.Tensor(flavour_to_lookup_index(sample[:,target_index]))).long()
        torch.save(these_targets, preprocessed_path+'{0}_targets_{1}.pt'.format(name, chunk_number))

        these_weights = flavour_lookuptables[these_targets,these_eta_bins,these_pt_bins]

        these_weights_alt = flavour_lookuptables_alt[these_targets,these_eta_bins_alt,these_pt_bins_alt]


        # the stored bin numbers are the ones returned by binned_statistic_2d (not shifted by -1)
        np.save(preprocessed_path+'{0}_eta_pt_bins_{1}.npy'.format(name, chunk_number), these_eta_pt_bins.astype(np.ubyte))
        np.save(preprocessed_path+'{0}_eta_pt_bins_alt_{1}.npy'.format(name, chunk_number), these_eta_pt_bins_alt.astype(np.ubyte))
        np.save(preprocessed_path+'{0}_sample_weights_{1}.npy'.format(name, chunk_number), these_weights)
        np.save(preprocessed_path+'{0}_sample_weights_alt_{1}.npy'.format(name, chunk_number), these_weights_alt)

        del these_weights
        del these_eta_pt_bins
        del these_eta_bins
        del these_pt_bins

        del these_weights_alt
        del these_eta_pt_bins_alt
        del these_eta_bins_alt
        del these_pt_bins_alt

        gc.collect()



    extract_store_sample_info(trainset, 'train')
    extract_store_sample_info(valset, 'val')
    extract_store_sample_info(testset, 'test')


    # every column except the target is an input; the number of columns is taken from the
    # data instead of being hard-coded
    non_target_indices = [k for k in range(trainset.shape[1]) if k != target_index]


    # get inputs independently
    train_inputs = torch.Tensor(trainset[:,non_target_indices])
    del trainset
    gc.collect()
    val_inputs = torch.Tensor(valset[:,non_target_indices])
    del valset
    gc.collect()
    test_inputs = torch.Tensor(testset[:,non_target_indices])
    del testset
    gc.collect()

    norm_train_inputs,norm_val_inputs,norm_test_inputs = train_inputs.clone().detach(),val_inputs.clone().detach(),test_inputs.clone().detach()

    # apply scaler to all input features
    # i counts the input columns, full_index is the matching column of the full array
    # (needed to pick the right scaler and default)
    if defaults_to_zero:
        for (i,full_index) in enumerate(non_target_indices):
            scaler = scalers[full_index]


            # keep the scaled value where the entry is not the default, put 0 otherwise
            norm_train_inputs[:,i] = torch.Tensor(np.where(
                                                           train_inputs[:,i] != defaults[full_index],
                                                           scaler.transform(train_inputs[:,i].reshape(-1,1)).reshape(1,-1),
                                                           0. * norm_train_inputs[:,i]
                                                          )
                                                 )
            norm_val_inputs[:,i]   = torch.Tensor(np.where(
                                                           val_inputs[:,i] != defaults[full_index],
                                                           scaler.transform(val_inputs[:,i].reshape(-1,1)).reshape(1,-1),
                                                           0. * norm_val_inputs[:,i]
                                                          )
                                                 )
            norm_test_inputs[:,i]  = torch.Tensor(np.where(
                                                           test_inputs[:,i] != defaults[full_index],
                                                           scaler.transform(test_inputs[:,i].reshape(-1,1)).reshape(1,-1),
                                                           0. * norm_test_inputs[:,i]
                                                          )
                                                 )
    else:
        for (i,full_index) in enumerate(non_target_indices):
            scaler = scalers[full_index]
            norm_train_inputs[:,i]   = torch.Tensor(scaler.transform(train_inputs[:,i].reshape(-1,1)).reshape(1,-1))
            norm_val_inputs[:,i]       = torch.Tensor(scaler.transform(val_inputs[:,i].reshape(-1,1)).reshape(1,-1))
            norm_test_inputs[:,i]     = torch.Tensor(scaler.transform(test_inputs[:,i].reshape(-1,1)).reshape(1,-1))

    train_inputs = norm_train_inputs.clone().detach()
    val_inputs = norm_val_inputs.clone().detach()
    test_inputs = norm_test_inputs.clone().detach()

    # the suffix is appended directly after the chunk number (no separator)
    if defaults_to_zero:
        text = 'default_to_zero'
    else:
        text = ''

    torch.save(train_inputs, preprocessed_path+'train_inputs_{0}{1}.pt'.format(chunk_number,text))
    torch.save(val_inputs, preprocessed_path+'val_inputs_{0}{1}.pt'.format(chunk_number,text))
    torch.save(test_inputs, preprocessed_path+'test_inputs_{0}{1}.pt'.format(chunk_number,text))

In [17]:
# Chunk boundaries: start index of every 50k-sample chunk, closed off by the total number of samples.
total = 11491971
splits = list(range(0, total, 50000)) + [total]
len(splits)

231

Getting the high-level columns is not problematic at all, just load the full array and select the first five subarrays.

In [29]:
# Read the parquet file of every chunk (chunk i covers splits[i] ... splits[i+1]-1) and join them.
array = ak.concatenate([
    ak.from_parquet(akArrays_path+'split_{0}_{1}.parquet'.format(first, following-1))
    for first, following in zip(splits[:-1], splits[1:])
])

In [30]:
# Field a1 ends up as column 0 of the flat array (see flat_samples_builder).
a1 = array.a1

In [31]:
# Field a2 ends up as column 1 of the flat array.
a2 = array.a2

In [32]:
# Field a3 ends up as column 2 of the flat array.
a3 = array.a3

a3 doesn't need a scaler --> it's the target

In [33]:
# Field a4 holds several columns (full indices in a4_indices_list).
a4 = array.a4

In [34]:
# Field a5 holds several columns (full indices in a5_indices_list).
a5 = array.a5

In [35]:
import gc
# Only the individual fields a1 ... a5 are used from here on: drop the name of the
# full record array and trigger a garbage collection pass.
del array
gc.collect()

2784

Initial tests of functions:

In [52]:
# NOTE(review): get_default_from_full_index_DEPRECATED is not defined in this notebook -
# presumably it comes from `from tools import *`; confirm before re-running.
get_default_from_full_index_DEPRECATED(0)

array(22.9141)

True minima are now handled like other true values in the distribution, and the defaults differ slightly. When cutting, even when there is only one true entry = min, the result will now contain something, and if it's just one sample, then that's better than an empty array with which scalers can't be computed. First version used custom minima, latest version uses custom defaults.

In [197]:
# Current default of column 0, for comparison with the deprecated value.
get_default_from_full_index(0)

0.0

In [53]:
# Same comparison for column 2 (the target); the function is not defined in this notebook.
get_default_from_full_index_DEPRECATED(2)

array(0)

In [198]:
# Current default of column 2.
get_default_from_full_index(2)

-1.0

In [54]:
# Value stored for a3 in the first version (custom minima).
np.load(defaults_path+'custom_minimum_{}.npy'.format('a3'),)

array(0)

In [199]:
# Value stored for a3 in the latest version (custom defaults).
np.load(defaults_path+'custom_default_{}.npy'.format('a3'),)

array(-1)

The first three columns (pt/eta/flavor) are always meaningful and do not have to be cleaned.

In [202]:
# Single column: calc_scalers returns a one-element list; no cleaning is applied beforehand.
a1_sc = calc_scalers(ak.to_numpy(a1).data, get_default_from_full_index(0))

In [203]:
# Same for a2 (full index 1).
a2_sc = calc_scalers(ak.to_numpy(a2).data, get_default_from_full_index(1))

In [204]:
# not really relevant
# (a3 is the target; the scaler is still computed so that the combined scaler list
# has one entry per column of the full array)
a3_sc = calc_scalers(ak.to_numpy(a3).data, get_default_from_full_index(2))

Starting with the high-level variables stored in a4 and a5, the cleaning has to be applied, as there are sometimes unphysical values.

In [208]:
# Pick the defaults of the a4 columns, clean a4 with them, then fit one scaler per column
# (entries equal to the default are left out of the fit inside calc_scalers).
indices = a4_indices_list
defaults = get_all_defaults()[indices]
a4_sc = calc_scalers(clean_any_numpy_array(ak.to_numpy(a4).data, defaults), 
                     defaults)

In [209]:
# Same procedure for the a5 columns; `indices` and `defaults` are overwritten here.
indices = a5_indices_list
defaults = get_all_defaults()[indices]
a5_sc = calc_scalers(clean_any_numpy_array(ak.to_numpy(a5).data, defaults), 
                     defaults)

In [210]:
# Store the scaler list of every high-level group in its own file scalers_<group>.pt
# (one loop instead of five copies of the same cell).
for var_group, group_scalers in (('a1', a1_sc), ('a2', a2_sc), ('a3', a3_sc), ('a4', a4_sc), ('a5', a5_sc)):
    torch.save(group_scalers, preprocessed_path+'scalers_{}.pt'.format(var_group))

Low-level calculation starts here:

In [18]:
def calc_scalers_low_level(var_index):
    """Fit the scalers of one low-level variable for all of its columns.

    Parameters
    ----------
    var_index
        Passed to ``get_track_vertex_indices_all_tracks_one_variable`` to obtain the
        columns of the low-level arrays that are used (presumably one column per
        track of this variable - confirm in helpers/tools.py).

    Returns
    -------
    list
        One fitted scaler per selected column, as returned by ``calc_scalers``.
    """
    vars_ = get_track_vertex_indices_all_tracks_one_variable(var_index)

    # load chunk by chunk and keep only the selected columns of each chunk
    chunks = []
    for i in range(len(splits)-1):
        chunk_file = np_arrays_path+'sorted_track_vertex_info_split_{0}_{1}.npy'.format(splits[i],splits[i+1]-1)
        chunks.append(np.load(chunk_file)[:,vars_])
    columns = np.concatenate(chunks)

    # the defaults are indexed by the position in the full array: shift by the high-level columns
    defaults = get_all_defaults()[np.array(vars_) + n_high_level]

    cleaned_columns = clean_any_numpy_array(columns, defaults)
    return calc_scalers(cleaned_columns, defaults)

Pass the index of columns that have only one sample after cutting on defaults (= all samples placed at the same position).

In our case, this holds true for track no. 33 (index: 32), all others have at least two samples due to construction (which leaves >= 1 samples to derive scalers then). For the problematic track, the scalers won't be meaningful, but at least we store something.

As for previous computations, we pass a variable index that runs from 0 to 27 to account for the 28 different track-vertex features (all low-level), which each are stored for (at most) 33 tracks.

In [218]:
# Compute and store the scalers of all 28 low-level variables, one file scalers_a6_<var_ind>.pt
# per variable (one loop instead of 28 copies of the same cell).
# Each iteration loads the selected columns of the whole dataset, so this takes a while.
for var_ind in range(28):
    a6_k_sc = calc_scalers_low_level(var_ind)

    torch.save(a6_k_sc, preprocessed_path+'scalers_{}.pt'.format('a6_'+str(var_ind)))

In [20]:
# File names in the order of the columns of the full array: a1 ... a5 first, then the 28 low-level variables.
b = ['scalers_a6_{}.pt'.format(var_index) for var_index in range(28)]
sorted_scalers_files = ['scalers_a{}.pt'.format(group) for group in range(1, 6)] + b
sorted_scalers_files

['scalers_a1.pt',
 'scalers_a2.pt',
 'scalers_a3.pt',
 'scalers_a4.pt',
 'scalers_a5.pt',
 'scalers_a6_0.pt',
 'scalers_a6_1.pt',
 'scalers_a6_2.pt',
 'scalers_a6_3.pt',
 'scalers_a6_4.pt',
 'scalers_a6_5.pt',
 'scalers_a6_6.pt',
 'scalers_a6_7.pt',
 'scalers_a6_8.pt',
 'scalers_a6_9.pt',
 'scalers_a6_10.pt',
 'scalers_a6_11.pt',
 'scalers_a6_12.pt',
 'scalers_a6_13.pt',
 'scalers_a6_14.pt',
 'scalers_a6_15.pt',
 'scalers_a6_16.pt',
 'scalers_a6_17.pt',
 'scalers_a6_18.pt',
 'scalers_a6_19.pt',
 'scalers_a6_20.pt',
 'scalers_a6_21.pt',
 'scalers_a6_22.pt',
 'scalers_a6_23.pt',
 'scalers_a6_24.pt',
 'scalers_a6_25.pt',
 'scalers_a6_26.pt',
 'scalers_a6_27.pt']

In [254]:
# Join the per-group scaler lists into one flat list, following the order of sorted_scalers_files.
all_the_scalers = torch.load(preprocessed_path+sorted_scalers_files[0])
for scalers_file in sorted_scalers_files[1:]:
    all_the_scalers = all_the_scalers + torch.load(preprocessed_path+scalers_file)

In [256]:
# There should be one scaler per column of the full array.
len(all_the_scalers)

941

In [259]:
# The container itself is a plain list.
type(all_the_scalers)

list

In [260]:
# Type of the individual entries.
type(all_the_scalers[0])

sklearn.preprocessing._data.StandardScaler

In [261]:
# Count the entries that have the same type as the first one (should equal the list length).
sum(type(entry) == type(all_the_scalers[0]) for entry in all_the_scalers)

941

Now all scalers are part of a single list, which can be stored via torch as .pt again for easy look-up later.

In [262]:
# This file (below preprocessed_path) is the one read by preprocess_apply_scalers.
torch.save(all_the_scalers, preprocessed_path+'all_scalers.pt')

In order to decide also at the level of scaled quantities whether some value is a default, it could help to scale the defaults themselves and store the resulting "scaled default" as well.

In [368]:
# Start from the unscaled defaults and replace every entry by its scaled counterpart.
all_scaled_defaults = get_all_defaults(scaled=False, old=False)

# one scaler per column; the length is taken from the scaler list instead of being hard-coded
for i in range(len(all_the_scalers)):
    # transform() returns an array of shape (1, 1): take the single element explicitly,
    # assigning the array itself to a scalar slot is deprecated in recent NumPy versions
    all_scaled_defaults[i] = all_the_scalers[i].transform(all_scaled_defaults[i].reshape(1, -1))[0, 0]

np.save(defaults_path+'all_scaled_defaults.npy', all_scaled_defaults)
all_scaled_defaults

array([-2.68771825e+00, -2.20531961e+00, -1.43435124e+00, -5.52270751e-01,
       -5.20628201e-01, -2.02641782e-01, -2.47709478e-01, -8.06466477e-01,
       -6.11756335e-01, -1.89240835e+00, -1.96493416e+00, -7.95656559e-01,
       -2.05953082e+00, -1.33896640e+00, -4.65264374e-01, -1.21666085e+00,
       -1.77710507e+00, -2.06165409e+01, -4.23211524e+01, -4.18137157e+01,
       -3.28382430e+01, -2.63348779e+01, -2.24139809e+01, -1.96496869e+01,
       -1.76053132e+01, -1.60689738e+01, -1.50964120e+01, -1.45371519e+01,
       -1.35719891e+01, -1.31965659e+01, -1.32496793e+01, -1.34172481e+01,
       -1.35280753e+01, -1.30844969e+01, -1.26810217e+01, -1.04408429e+01,
       -8.80461743e+00, -9.10968117e+00, -8.79438548e+00, -6.84805061e+00,
       -4.80606126e+00, -4.28630713e+00, -2.69319157e+00, -3.46397803e+00,
       -2.17454450e+00, -1.47284020e+00, -1.24215797e+00, -1.24716454e+00,
       -1.37494567e+00, -1.00000000e-03, -2.96728695e+02, -2.30162743e+02,
       -3.07855080e+02, -

In [369]:
# Same procedure for the old defaults (scaled with the same list of scalers).
all_scaled_defaults_OLD = get_all_defaults(scaled=False, old=True)

# one scaler per column; the length is taken from the scaler list instead of being hard-coded
for i in range(len(all_the_scalers)):
    # transform() returns an array of shape (1, 1): take the single element explicitly,
    # assigning the array itself to a scalar slot is deprecated in recent NumPy versions
    all_scaled_defaults_OLD[i] = all_the_scalers[i].transform(all_scaled_defaults_OLD[i].reshape(1, -1))[0, 0]

np.save(defaults_path+'all_scaled_defaults_OLD.npy', all_scaled_defaults_OLD)
all_scaled_defaults_OLD

array([-2.68771825e+00, -2.20531961e+00, -1.43435124e+00, -5.52270751e-01,
       -5.20628201e-01, -2.02641782e-01, -2.47709478e-01, -8.06466477e-01,
       -6.11756335e-01, -1.89240835e+00, -1.96493416e+00, -7.95656559e-01,
       -2.05953082e+00, -1.33896640e+00, -4.65264374e-01, -1.21666085e+00,
       -1.77710507e+00, -2.06165409e+01, -4.23211524e+01, -4.18137157e+01,
       -3.28382430e+01, -2.63348779e+01, -2.24139809e+01, -1.96496869e+01,
       -1.76053132e+01, -1.60689738e+01, -1.50964120e+01, -1.45371519e+01,
       -1.35719891e+01, -1.31965659e+01, -1.32496793e+01, -1.34172481e+01,
       -1.35280753e+01, -1.30844969e+01, -1.26810217e+01, -1.04408429e+01,
       -8.80461743e+00, -9.10968117e+00, -8.79438548e+00, -6.84805061e+00,
       -4.80606126e+00, -4.28630713e+00, -2.69319157e+00, -3.46397803e+00,
       -2.17454450e+00, -1.47284020e+00, -1.24215797e+00, -1.24716454e+00,
       -1.37494567e+00, -1.00000000e-03, -2.96728695e+02, -2.30162743e+02,
       -3.07855080e+02, -

In [388]:
# Overview table: per feature the fitted mean / variance and the four kinds of defaults.
# https://stackoverflow.com/questions/34734572/tabs-in-print-are-not-consistent-python
a = get_all_defaults(scaled=False, old=False)
b = get_all_defaults(scaled=True, old=False)
c = get_all_defaults(scaled=False, old=True)
d = get_all_defaults(scaled=True, old=True)
column_titles = ('feature', 'mean', 'variance', 'default (NEW)', 'scaled default (NEW)', 'default (OLD)', 'scaled default (OLD)')
print('{:<42s} {:<20s} {:<20s} {:<20s} {:<20s} {:<20s} {:<20s}'.format(*column_titles))
print('-'*44*4)
for i in range(941):
    # https://stackoverflow.com/questions/35944783/how-to-store-scaling-parameters-for-later-use
    row_cells = [full_index_to_name(i), all_the_scalers[i].mean_, all_the_scalers[i].var_, a[i], b[i], c[i], d[i]]
    print('{:<42} {:<20s} {:<20s} {:<20s} {:<20s} {:<20s} {:<20s}'.format(*map(str, row_cells)))

feature                                    mean                 variance             default (NEW)        scaled default (NEW) default (OLD)        scaled default (OLD)
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
jet_pt                                     [33.82514418]        [158.38419179]       0.0                  -2.6877182490335687  0.0                  -2.6877182490335687 
jet_eta                                    [0.00104774]         [1.85183688]         -3.0                 -2.2053196101228676  -3.0                 -2.2053196101228676 
flavor                                     [2.44464758]         [5.76737997]         -1.0                 -1.4343512404828216  -1.0                 -1.4343512404828216 
track_2_d0_significance                    [3.97768268]         [51.87476977]        0.0                  -0.552270750526215   0.0                 

In [21]:
# Re-read the per-group files from disk and join them again into one flat list.
all_the_scalers_NEW = torch.load(preprocessed_path+sorted_scalers_files[0])
for scalers_file in sorted_scalers_files[1:]:
    all_the_scalers_NEW = all_the_scalers_NEW + torch.load(preprocessed_path+scalers_file)

In [374]:
# Again one scaler per column of the full array is expected.
len(all_the_scalers_NEW)

941

In [375]:
# The container is a plain list.
type(all_the_scalers_NEW)

list

In [376]:
# Type of the individual entries.
type(all_the_scalers_NEW[0])

sklearn.preprocessing._data.StandardScaler

In [377]:
# Count the entries that have the same type as the first one (should equal the list length).
sum(type(entry) == type(all_the_scalers_NEW[0]) for entry in all_the_scalers_NEW)

941

In [378]:
# Keep both versions: the list assembled earlier in the session as "_OLD", the freshly re-read one
# under the plain name.
# NOTE(review): both files go to defaults_path, while preprocess_apply_scalers reads
# preprocessed_path+'all_scalers.pt' - confirm which file the preprocessing is meant to use.
torch.save(all_the_scalers, defaults_path+'all_scalers_OLD.pt')
torch.save(all_the_scalers_NEW, defaults_path+'all_scalers.pt')

<hr>

## Finally, apply scalers during preprocessing

When doing the preprocessing, one has to decide how to proceed with defaults (close to minima or rather = 0?) - when one saves only the first option, it should be possible to go to the second option at runtime, e.g. by scaling the defaults itself and checking to which value they are mapped --> this could be used to slice, if one really needs to use zeros instead of "minima".

In [379]:
# Trial run on the first chunk only (chunk number 0), keeping the scaled defaults.
preprocess_apply_scalers(splits[0],splits[1]-1,0,defaults_to_zero=False)

In [383]:
%%time
# First ten chunks (this includes chunk 0 again and overwrites its files) to get a timing estimate.
for i in range(10):
    preprocess_apply_scalers(splits[i],splits[i+1]-1,i,defaults_to_zero=False)

CPU times: user 44.7 s, sys: 17.5 s, total: 1min 2s
Wall time: 1min 4s


In [391]:
%%time
# Remaining chunks; the first ten were already processed in the previous cell.
#for i in range(len(splits)-1):
#for i in range(10):
for i in range(10,len(splits)-1):
    preprocess_apply_scalers(splits[i],splits[i+1]-1,i,defaults_to_zero=False)

CPU times: user 15min 18s, sys: 5min 42s, total: 21min
Wall time: 22min 2s


All files should be there now, let's go training! 🥳