In [1]:
import uproot
import pandas as pd
import time
from array import array
from numpy import load, loadtxt, savetxt, savez_compressed
import numpy as np
import sys, re
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
def Trandform(signalArray, bkgArray, rangConst="_n1_1"):
    """Min-max rescale signal and background onto a common target interval.

    The per-feature minimum and maximum are taken over *both* samples
    (along axis 0), so the two outputs share one scale and together span
    the requested interval.

    Parameters
    ----------
    signalArray, bkgArray : array-like
        Samples to rescale; reduced along axis 0 to find the extrema.
    rangConst : str
        Target interval: "_0_1" -> [0, 1], "_n1_1" -> [-1, 1],
        "_0_2pi" -> [0, 2*pi], "_npi_pi" -> [-pi, pi].

    Returns
    -------
    tuple
        (signalConstrain, bkgConstrain), the rescaled inputs.

    Raises
    ------
    ValueError
        If rangConst is not one of the supported interval names.
    """
    # Each interval is an affine map of the unit-scaled value u in [0, 1]:
    #     out = scale * u + offset
    affine_by_range = {
        "_0_1": (1.0, 0.0),
        "_n1_1": (2.0, -1.0),
        "_0_2pi": (2.0 * np.pi, 0.0),
        # Width must be 2*pi; a width of pi would only cover [-pi, 0].
        "_npi_pi": (2.0 * np.pi, -np.pi),
    }
    if rangConst not in affine_by_range:
        raise ValueError(
            "Unknown rangConst %r; expected one of %s"
            % (rangConst, sorted(affine_by_range))
        )
    scale, offset = affine_by_range[rangConst]

    # Find a common minimum and maximum between signal and background
    mini = np.minimum(np.min(signalArray, axis=0), np.min(bkgArray, axis=0))
    maxi = np.maximum(np.max(signalArray, axis=0), np.max(bkgArray, axis=0))

    # A feature that is constant across both samples has zero span; divide
    # by 1 instead so it maps to the lower bound rather than NaN/inf.
    span = maxi - mini
    span = np.where(span == 0, 1, span)

    signalConstrain = scale * (signalArray - mini) / span + offset
    bkgConstrain = scale * (bkgArray - mini) / span + offset

    return signalConstrain, bkgConstrain

def Root_File_to_DataFrame(tree):
    """Copy every branch of a tree-like object into a pandas DataFrame.

    `tree` must provide `arrays()` (a container indexable by branch name)
    and `keys()` (the branch names). One DataFrame column is created per
    name returned by `keys()`, in that order.
    """
    branch_data = tree.arrays()
    branch_names = tree.keys()

    # Look each branch up by name so the column order follows keys().
    return pd.DataFrame({name: branch_data[name] for name in branch_names})

In [3]:
import matplotlib.pyplot as plt
import numpy as np

def preparingData(prossEvent=10, fraction=0.5, dictionary={}, nqubits=5, plot_variable=False, dataType="Classical"):
    """Build train/test arrays from the signal and background ROOT samples.

    Reads the 'BDTtree1' tree from the two files under 'samples/', keeps the
    first `nqubits` variables named by `dictionary`, takes the first
    `prossEvent` rows of each sample, rescales both samples to [-1, 1] on a
    common per-feature scale, labels them (+1 signal, -1 background) and
    splits them with a fixed random seed. The result is written to
    'tmpData/data.npz' and loaded back.

    Parameters
    ----------
    prossEvent : int
        Number of leading rows taken from each of signal and background.
    fraction : float
        Fraction of the selected events used for training. The test set uses
        the same fraction, capped at the events left over after training so
        that values above 0.5 do not request more rows than exist.
    dictionary : dict
        Its keys are the column names to read; only the first `nqubits`
        keys are used as features. It is never modified here.
    nqubits : int
        Number of leading keys of `dictionary` used as features.
    plot_variable : bool
        When True, the scaled arrays for all variables are computed, but the
        plotting call is currently disabled, so nothing is drawn.
    dataType : str
        "Classical" additionally standardises the features with a
        StandardScaler fitted on the training split only; "Quantum" applies
        no further scaling. Any other value also applies none.

    Returns
    -------
    numpy.lib.npyio.NpzFile
        Mapping with keys 'X_train', 'X_test', 'y_train', 'y_test'. It holds
        the file 'tmpData/data.npz' open until the caller closes it.
    """
    seed = 45  # fixed seed so the train/test split is reproducible
    all_variables = [*dictionary.keys()]
    variables = all_variables[:nqubits]

    print("Using the following variables:\n",variables)

    # Context managers close each ROOT file once its tree has been copied
    # into a DataFrame.
    with uproot.open('samples/Results_qqsig_passcut.root') as sigFile:
        signal_dataset = Root_File_to_DataFrame(sigFile['BDTtree1'])
    with uproot.open('samples/Results_qqbkg_passcut.root') as bkgFile:
        bkg_dataset = Root_File_to_DataFrame(bkgFile['BDTtree1'])

    if plot_variable is True:
        # TODO: plotting is disabled; these arrays are prepared for it but
        # are not used until the call below is restored.
        signal_dataset_to_plot, bkg_dataset_to_plot = Trandform(
            signal_dataset[all_variables].to_numpy(),
            bkg_dataset[all_variables].to_numpy(),
            "_n1_1",
        )
        # Plotter.plotVars(dictionary, signal_dataset_to_plot, bkg_dataset_to_plot)

    signal_dataset = signal_dataset[variables].head(prossEvent).to_numpy()
    bkg_dataset = bkg_dataset[variables].head(prossEvent).to_numpy()

    # Transformation applies for both Classical and Quantum
    signal_dataset, bkg_dataset = Trandform(signal_dataset, bkg_dataset, "_n1_1")

    print("number of signal:",len(signal_dataset),"number of background:",len(bkg_dataset))

    n_total = len(signal_dataset) + len(bkg_dataset)
    train_size = int(n_total * fraction)
    # Cap the test set at the rows left after training: train + test larger
    # than the sample makes train_test_split raise. For fraction <= 0.5 this
    # is the same size as before.
    test_size = min(int(n_total * fraction), n_total - train_size)

    # Labels: +1 for signal, -1 for background
    y_signal = np.ones(signal_dataset.shape[0])
    y_background = -1 * np.ones(bkg_dataset.shape[0])

    X = np.concatenate([signal_dataset, bkg_dataset], axis=0)
    y = np.concatenate([y_signal, y_background])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, test_size=test_size, random_state=seed
    )

    # Standardisation is used only in the classical case
    if dataType=="Classical":
        # Fit on the training split only, then apply the same scaling to test
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    elif dataType=="Quantum":
        # No further transformation is needed for the quantum case
        print("*==================================================================================*")

    # Make sure the output directory exists before writing the archive
    os.makedirs('tmpData', exist_ok=True)
    np.savez_compressed('tmpData/data', X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

    return np.load('tmpData/data.npz')