# Data prep and reduction

This notebook loads the full data set and selects the wanted features.

It also makes a reduced version of the dataset, with both a combined signal file and separate per-signal files.

The default size of the reduced dataset is 10% of the full sized set.

### Note:

We did not do much "data exploration" because the dataset is simulated, so the data should already be "cleaned" and distributed.

We do, however, explore the data to some extent in other notebooks.

In [None]:
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.utils import shuffle

In [None]:
# Mount Google Drive at /content/drive so the data files can be read.
# NOTE(review): the paths used later are relative ('drive/...'), so this
# presumably relies on the working directory being /content — confirm.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder holding the source HDF5 files; change to the appropriate path.
# NOTE(review): this path spells the folder 'My Drive' while dst_path below
# uses 'MyDrive' — confirm that both spellings resolve on the mounted drive.
path = 'drive/My Drive/DAT255 Zprime data/'
# Random state shared by the shuffling and sampling steps of this notebook.
seed = 42

# Reduction variables:
#   dst_path          - folder the reduced pickle files are written to
#   percentage        - fraction kept of each background file and of each
#                       individual signal file (0.1 = 10%)
#   signal_percentage - fraction kept of the combined signal file
dst_path = 'drive/MyDrive/Dat255 reduced files/'
percentage = 0.1
signal_percentage = 0.1


# Alternative settings for a 50% reduction; uncomment to use them instead.
#dst_path = 'drive/MyDrive/Dat255 reduced files50/'
#percentage = 0.5
#signal_percentage = 0.5

In [None]:
# Final features picked by the feature selection process made by Dovydas.
features_from_feature_importance = [
    "met_et",
    "lep_1_E",
    "lep_2_E",
    "lep_3_E",
    "lep_1_eta",
    "lep_2_eta",
    "jet_n",
    "lep_1_pt",
    "lep_2_pt",
    "lep_3_pt",
    "lep_4_pt",
    "lep_5_pt",
    "lep_1_phi",
    "lep_2_phi",
    "jet_2_trueflav",
    "jet_1_E",
    "jet_3_E",
    "jet_1_pt",
    "jet_2_pt",
    "jet_3_pt",
    "jet_4_pt",
    "jet_5_pt",
    "jet_6_pt",
    "jet_7_pt",
    "jet_8_pt",
    "jet_9_pt",
    "alljet_n",
    "lep_1_etcone20",
    "jet_2_MV1",
    "jet_1_MV1",
    "jet_1_phi",
    "jet_1_m",
    "jet_2_E",
    "jet_2_jvf",
    "jet_1_SV0",
]

# The columns should be the same in all files, so a single file is read
# just to discover the column names.
feature_df = pd.read_hdf(path + 'mc_110899.ZPrime400.hdf5', 'mini')
feature_list = list(feature_df)

# Weights and scale factors are stored as ordinary columns next to the
# features; they are recognised by name (case-insensitive match on
# "SCALE" or "WEIGHT").
eventWeights = [
    column
    for column in feature_list
    if any(tag in column.upper() for tag in ("SCALE", "WEIGHT"))
]

In [None]:
# Only the column names (feature_list) were needed; drop the name so the
# dataframe can be freed.
del feature_df

In [None]:
## Information needed for calculating the mass of the signal:
## the two leading leptons and the two leading jets.
_lepton_fields = ('pt', 'eta', 'phi', 'type', 'charge', 'E')
_jet_fields = ('pt', 'eta', 'phi')

# Order: all lep_1 fields, all lep_2 fields, then jet_1 and jet_2 fields.
invariant_features = (
    [f'lep_{index}_{field}' for index in (1, 2) for field in _lepton_fields]
    + [f'jet_{index}_{field}' for index in (1, 2) for field in _jet_fields]
)

In [None]:
# Columns to keep from every file: the selected features, the columns
# needed for the mass calculation, and finally the event weights.
# dict.fromkeys removes the duplicates shared by the two feature lists while
# keeping first-seen order. A set would order the names differently from one
# interpreter run to the next (string hashing is randomized per process),
# which made the column order of the saved files non-reproducible.
features_and_weights = list(
    dict.fromkeys(features_from_feature_importance + invariant_features)
)
features_and_weights += eventWeights

In [None]:
def _load_selected(filename, **read_kwargs):
    """Read the 'mini' table of `path + filename`, keeping only the columns in `features_and_weights`.

    Extra keyword arguments (e.g. start/stop) are passed on to pd.read_hdf.
    """
    return pd.read_hdf(path + filename, 'mini', **read_kwargs)[features_and_weights]


# Load the background files into dataframes.
diboson = _load_selected('diboson.hdf5')
DYee = _load_selected('DYee.hdf5')
DYmumu = _load_selected('DYmumu.hdf5')
DYtautau = _load_selected('DYtautau.hdf5')
ttbar_lep = _load_selected('ttbar_lep.hdf5')
# NOTE(review): the doubled '.hdf5' extension is kept exactly as it was;
# confirm that it matches the actual file name on the drive.
Wenu = _load_selected('Wenu.hdf5.hdf5')
Wmunu = _load_selected('Wmunu.hdf5')
Wtaunu = _load_selected('Wtaunu.hdf5')

# Zee has 5625000 rows in total; it is read in two halves because reading
# it in one pass used up all the RAM.
Zee1 = _load_selected('Zee.hdf5', stop=2812500)
Zee2 = _load_selected('Zee.hdf5', start=2812500)

ttbar_had = _load_selected('ttbar_had.hdf5')
Ztautau = _load_selected('Ztautau.hdf5')

In [None]:
# Stitch the two halves of Zee back together; the trailing expression shows
# the resulting shape as a sanity check.
Zee = pd.concat([Zee1,Zee2])
Zee.shape

(5625000, 50)

In [None]:
# The halves are no longer needed once they are merged into Zee; free up RAM.
del Zee1, Zee2

In [None]:
# Shuffle the rows of every background dataframe with the shared seed.
# Each name is rebound to the shuffled result one dataframe at a time.
# NOTE(review): reduce_file draws its subset with DataFrame.sample, which
# already picks rows at random, so this shuffle looks redundant for the
# reduction itself. Removing it would change which rows end up in the
# reduced files, so confirm before touching it.
diboson = shuffle(diboson, random_state = seed)
DYee = shuffle(DYee, random_state = seed)
DYmumu = shuffle(DYmumu, random_state = seed)
DYtautau = shuffle(DYtautau, random_state = seed)
ttbar_lep = shuffle(ttbar_lep, random_state = seed)
Wenu = shuffle(Wenu, random_state = seed)
Wmunu = shuffle(Wmunu, random_state = seed)
Wtaunu = shuffle(Wtaunu, random_state = seed)
Zee = shuffle(Zee, random_state = seed)
ttbar_had = shuffle(ttbar_had, random_state = seed)
Ztautau = shuffle(Ztautau, random_state = seed)


In [None]:
def reduce_file(src_df, name, dst, percentage=0.1):
    """Pickle a random subset of a dataframe.

    src_df: dataframe you want to reduce
    name: name of the file when saved (without extension)
    dst: destination folder; it is prefixed to the name as-is, so it
         should end with a path separator
    percentage: fraction of the rows to keep (0.1 = 10%), passed to
                DataFrame.sample as `frac`

    The subset is drawn without replacement using the notebook-level
    `seed` as random state and written to dst + name + '.pkl'.
    The data is expected to be shuffled before reduction.
    """
    output_file = dst + name + '.pkl'
    subset = src_df.sample(frac=percentage, replace=False, random_state=seed)
    subset.to_pickle(output_file)

In [None]:
# Names used for the reduced output files. The order must match the order
# of the dataframes in background_files, because the two lists are paired
# by position.
background_files_names = [
    "diboson",
    "DYee",
    "DYmumu",
    "DYtautau",
    "ttbar_lep",
    "Wenu",
    "Wmunu",
    "Wtaunu",
    "Zee",
    "ttbar_had",
    "Ztautau",
]

# The background dataframes, in the same order as the names above.
background_files = [
    diboson,
    DYee,
    DYmumu,
    DYtautau,
    ttbar_lep,
    Wenu,
    Wmunu,
    Wtaunu,
    Zee,
    ttbar_had,
    Ztautau,
]



In [None]:
# Write a reduced pickle for every background sample. The dataframe is
# looked up by position instead of being bound to a loop variable, so no
# extra reference to a large dataframe is left behind after the loop.
for position, sample_name in enumerate(background_files_names):
    reduce_file(background_files[position], sample_name, dst_path, percentage=percentage)

In [None]:
# Free up RAM. Deleting the individual names is not enough on its own:
# the background_files list still holds a reference to every dataframe,
# so it has to be deleted as well before the memory can be reclaimed.
del background_files
del diboson, DYee, DYmumu, DYtautau
del ttbar_lep, Wenu, Wmunu, Wtaunu
del Zee, ttbar_had, Ztautau

In [None]:
# (numeric id, ZPrime number) pairs exactly as they appear in the signal
# file names. NOTE(review): the second number is presumably the mass point
# of the sample — confirm.
_signal_ids = (
    (110899, 400),
    (110901, 500),
    (110902, 750),
    (110903, 1000),
    (110905, 1500),
    (110906, 1750),
    (110907, 2000),
    (110908, 2250),
    (110909, 2500),
    (110910, 3000),
)

# File names of the signal samples, e.g. 'mc_110899.ZPrime400.hdf5'.
signal_name_list = [
    f'mc_{number}.ZPrime{label}.hdf5' for number, label in _signal_ids
]

In [None]:
# signal files are a bit smaller so we made this for convenience (uses more ram)
def reduce_file_from_path(src, dst, name, percentage, f_and_w):
    """Load one HDF5 file, keep the requested columns and pickle a random subset.

    src: source folder; it is prefixed to `name` as-is, so it should end
         with a path separator
    dst: destination folder, prefixed as-is as well
    name: name of the file inside `src`; it is reused for the output, which
          is written to dst + name + '.pkl' (the source extension stays
          part of the output name)
    percentage: fraction of the rows to keep, passed to DataFrame.sample
                as `frac`
    f_and_w: features and weights (column names) you want from the src file

    The data is shuffled before reduction. Both the shuffle and the sampling
    use the fixed random state 42 rather than a value passed in by the caller.
    """
    # Read and shuffle in one expression so the unshuffled frame is not
    # kept alive under a separate name.
    shuffled = shuffle(pd.read_hdf(src + name, 'mini')[f_and_w], random_state=42)
    subset = shuffled.sample(frac=percentage, replace=False, random_state=42)
    subset.to_pickle(dst + name + '.pkl')



In [None]:
# Write one reduced pickle per individual signal file.
for signal_name in signal_name_list:
    reduce_file_from_path(
        src=path,
        dst=dst_path,
        name=signal_name,
        percentage=percentage,
        f_and_w=features_and_weights,
    )

In [None]:
# Combine all signal files into a single dataframe.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and it re-copied the accumulated frame on every iteration. Reading every
# file first and concatenating once gives the same rows in the same order
# with a fresh 0..n-1 index.
signal_df = pd.concat(
    [
        pd.read_hdf(path + signal_name, 'mini')[features_and_weights]
        for signal_name in signal_name_list
    ],
    ignore_index=True,
)

In [None]:
# Display the (rows, columns) shape of the combined signal dataframe.
signal_df.shape

(225998, 50)

In [None]:
# Shuffle the combined signal rows with the shared seed before reduction.
signal_df = shuffle(signal_df, random_state=seed)

In [None]:
# Write the reduced combined signal to dst_path as "signal.pkl", keeping
# the fraction given by signal_percentage.
reduce_file(signal_df, "signal", dst_path, percentage=signal_percentage)

In [None]:
# end