# ML test
### Testing of reading in data and trying an auto encoder
Remember to run `pip3 install keras-tuner` in the current session before using the tuner.

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
import keras_tuner as kt
# `tf.random.set_seed` returns None, so the integer seed is kept in its own
# variable; it is reused as `random_state` for the train/validation splits.
seed = 1
tf.random.set_seed(seed)
# The sub-sampling further down draws from NumPy's global RNG, so seed it too.
np.random.seed(seed)

In [2]:
# Read the two HDF5 samples from shared storage and stack them row-wise
# into a single frame (the second file is the "_signal" sample).
df = pd.concat(
    [
        pd.read_hdf("/storage/shared/data/2lep_df_forML.hdf5"),
        pd.read_hdf("/storage/shared/data/2lep_df_forML_signal.hdf5"),
    ]
)


In [3]:
# Drop the "category" column, split off "isSignal" as the target vector, and
# convert what remains of the frame to a NumPy feature matrix.
# NOTE: `df` is modified in place, so this cell raises KeyError if it is
# re-run without reloading `df`.
df.pop("category")
y = df.pop("isSignal").to_numpy()
X = df.to_numpy()


In [4]:
# Sanity check: X should be 2-D (events x features) and y 1-D (events).
print(X.shape)
print(y.shape)

(109683372, 19)
(109683372,)


### Data handling and preparation
Before we train on the data, we need to scale it and split it into a validation and a training set.

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [6]:
# Standardise every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full sample, i.e. before the
# train/validation split and with the signal events included — confirm that
# this is intended.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [7]:
# Hold out 20% of the events (features and labels together) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

Now we need to separate the signal from the background in the training data.

In [8]:
# Keep only the training rows whose label is 0 (background).
X_back = X_train[y_train == 0]

Then we perform a second split on the background-only sample, so that we get a training set and a validation set for the autoencoder.

In [9]:
# Split the background-only sample 80/20 into the autoencoder's training and
# validation sets.
X_b_train, X_b_val = train_test_split(
    X_back,
    test_size=0.2,
    random_state=seed,
)

In [90]:
# Dimensions of the background-only training / validation samples.
# `data_shape` (number of features) is read by the model-building code below.
number_of_rows, data_shape = np.shape(X_b_train)
n_vali = np.shape(X_b_val)[0]

# Draw random subsets (without replacement) of the background samples; these
# are what the hyperparameter search is run on. The requested sizes are capped
# at the number of available rows so np.random.choice cannot fail on a
# smaller input sample.
random_indices = np.random.choice(
    number_of_rows, size=min(int(1e6), number_of_rows), replace=False)
test_indices = np.random.choice(
    n_vali, size=min(200000, n_vali), replace=False)

smaller_data = X_b_train[random_indices, :]
small_vali = X_b_val[test_indices, :]

# Sub-sample the mixed validation set. The labels are indexed with the same
# rows so that X_val and y_val stay aligned.
# NOTE: X_val / y_val are overwritten, so re-running this cell sub-samples the
# already reduced arrays.
n_val = np.shape(X_val)[0]
test_indices_1 = np.random.choice(
    n_val, size=min(200000, n_val), replace=False)
X_val = X_val[test_indices_1, :]
y_val = y_val[test_indices_1]

### Training
Now we can train on the data

In [24]:
from keras import backend as K
from keras.utils.generic_utils import get_custom_objects
from gridsearch import *



In [66]:

def gridautoencoder(X_b, X_back_test):
    """Run a Hyperband search for the autoencoder and optionally save the best model.

    The search uses ``AE_model_builder`` as hypermodel and minimises ``val_mse``.
    Tuner state is written to ``GridSearches/AE`` and any previous state there
    is overwritten. After the search the best hyperparameters are printed and
    the user is asked interactively whether the best model should be saved.

    Parameters
    ----------
    X_b : array-like
        Training sample; used both as input and as reconstruction target.
    X_back_test : array-like
        Validation sample; used both as input and as target for ``val_mse``.

    Returns
    -------
    keras_tuner.HyperParameters
        The best hyperparameter set found by the search.
    """
    tuner = kt.Hyperband(
        AE_model_builder,
        objective=kt.Objective("val_mse", direction="min"),
        max_epochs=50,
        factor=3,
        directory="GridSearches",
        project_name="AE",
        overwrite=True,
    )

    tuner.search(X_b, X_b, epochs=50, batch_size=4000,
                 validation_data=(X_back_test, X_back_test))
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

    print(
        f"""
    For Encoder: \n 
    First layer has {best_hps.get('num_of_neurons0')} with activation {best_hps.get('0_act')} \n
    Second layer has {best_hps.get('num_of_neurons1')} with activation {best_hps.get('1_act')} \n
    
    Latent layer has {best_hps.get("lat_num")} with activation {best_hps.get('2_act')} \n
    \n
    For Decoder: \n 
    First layer has {best_hps.get('num_of_neurons5')} with activation {best_hps.get('5_act')}\n
    Second layer has {best_hps.get('num_of_neurons6')} with activation {best_hps.get('6_act')}\n
    Third layer has activation {best_hps.get('7_act')}\n
    \n
    with learning rate = {best_hps.get('learning_rate')} and alpha = {best_hps.get('alpha')}
    """
    )

    # Keep asking until the user answers exactly "y" or "n".
    while True:
        answ = input("Do you want to save model? (y/n) ")
        if answ == "y":
            name = input("name: ")
            # Save the best *trained* model from the search.
            # `tuner.hypermodel.build(best_hps)` would instead create a new
            # model with freshly initialised weights, i.e. an untrained one.
            best_model = tuner.get_best_models(num_models=1)[0]
            best_model.save(f"../tf_models/model_{name}.h5")
            print("Model saved")
            break
        if answ == "n":
            print("Model not saved")
            break

    return best_hps


def AE_model_builder(hp):
    """Build and compile the autoencoder for one Keras Tuner trial.

    Layout: input (``data_shape`` features) -> Dense -> Dense -> latent Dense,
    then a decoder of Dense -> Dense -> Dense back to ``data_shape`` outputs.
    Layer widths, per-layer activations, the leaky-ReLU slope ``alpha`` and the
    Adam learning rate are all drawn from ``hp``.

    Reads the module-level ``data_shape`` (number of input features).

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Hyperparameter container supplied by the tuner.

    Returns
    -------
    tf.keras.Model
        The compiled autoencoder (MSE loss, MSE metric, Adam optimizer).
    """
    # NOTE: alpha = 1.0 makes leaky ReLU the identity function, so that choice
    # is equivalent to a linear activation.
    alpha_choice = hp.Choice("alpha", values=[1., 0.5, 0.1, 0.05, 0.01])
    activations = {
        "relu": tf.nn.relu,
        "tanh": tf.nn.tanh,
        "leakyrelu": lambda x: tf.nn.leaky_relu(x, alpha=alpha_choice),
        # tf.nn has no `linear`; the identity activation is provided by
        # tf.keras.activations.
        "linear": tf.keras.activations.linear,
    }
    first_layer_acts = ["relu", "tanh", "leakyrelu"]
    all_acts = ["relu", "tanh", "leakyrelu", "linear"]

    def _dense_layer(units, act_name, choices):
        """Dense layer whose activation is the tuner's choice for `act_name`."""
        return tf.keras.layers.Dense(
            units=units,
            activation=activations[hp.Choice(act_name, choices)],
        )

    # ----- Encoder -----
    inputs = tf.keras.layers.Input(shape=(data_shape,), name="encoder_input")
    x = _dense_layer(
        hp.Int("num_of_neurons0", min_value=13, max_value=17, step=1),
        "0_act", first_layer_acts)(inputs)
    x1 = _dense_layer(
        hp.Int("num_of_neurons1", min_value=7, max_value=12, step=1),
        "1_act", all_acts)(x)
    # Size of the latent space; also the decoder's input width.
    val = hp.Int("lat_num", min_value=1, max_value=6, step=1)
    x2 = _dense_layer(val, "2_act", all_acts)(x1)
    encoder = tf.keras.Model(inputs, x2, name="encoder")

    # ----- Decoder -----
    latent_input = tf.keras.layers.Input(shape=(val,), name="decoder_input")
    x = _dense_layer(
        hp.Int("num_of_neurons5", min_value=7, max_value=12, step=1),
        "5_act", all_acts)(latent_input)
    x1 = _dense_layer(
        hp.Int("num_of_neurons6", min_value=13, max_value=17, step=1),
        "6_act", all_acts)(x)
    output = _dense_layer(data_shape, "7_act", all_acts)(x1)
    decoder = tf.keras.Model(latent_input, output, name="decoder")

    # ----- Full autoencoder -----
    outputs = decoder(encoder(inputs))
    AE_model = tf.keras.Model(inputs, outputs, name="AE_model")

    # NOTE(review): 9e-2 and 9.5e-2 are roughly two orders of magnitude larger
    # than the other two candidates — confirm 9e-4 / 9.5e-4 was not intended.
    hp_learning_rate = hp.Choice("learning_rate", values=[
                                 9e-2, 9.5e-2, 1e-3, 1.5e-3])
    optimizer = tf.keras.optimizers.Adam(hp_learning_rate)
    AE_model.compile(loss="mse", optimizer=optimizer, metrics=["mse"])

    return AE_model





In [67]:
# Report how many physical GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices(device_type="GPU")))

Num GPUs Available:  1


gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Expose the first GPU as a single logical device whose memory is
        # capped at 1024 MB.
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=1024)],
        )
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Raised when the GPUs have already been initialised; virtual devices
        # have to be configured before that happens.
        print(err)

In [68]:
# Make no GPU visible to TensorFlow (empty device list for the "GPU" type).
tf.config.set_visible_devices([], device_type="GPU")


In [70]:
# Run the hyperparameter search on the sub-sampled background data, pinned
# to the CPU.
with tf.device("/CPU:0"):
    gridautoencoder(X_b=smaller_data, X_back_test=small_vali)

       

Trial 90 Complete [00h 04m 22s]
val_mse: 0.10679914057254791

Best val_mse So Far: 0.022179190069437027
Total elapsed time: 01h 47m 12s
INFO:tensorflow:Oracle triggered exit

    For Encoder: 
 
    First layer has 16 with activation leakyrelu 

    Second layer has 10 with activation leakyrelu 

    
    Latent layer has 5 with activation tanh 

    

    For Decoder: 
 
    First layer has 11 with activation tanh

    Second layer has 14 with activation tanh

    Third layer has activation leakyrelu

    

    with learning rate = 0.0015 and alpha = 1.0
    


Do you want to save model? (y/n)  y
name:  prelim_ae_2lep_data


Model saved


In [75]:
# Rebuild by hand the architecture reported by the search:
# data_shape -> 16 -> 10 -> 5 (latent) -> 11 -> 14 -> data_shape.
# (Alternative, currently disabled: load the saved file with
#  tf.keras.models.load_model("../tf_models/model_prelim_ae_2lep_data.h5").)

# --- Encoder ---
inputs = tf.keras.layers.Input(shape=data_shape, name="encoder_input")
x = tf.keras.layers.Dense(
    units=16,
    activation=tf.keras.layers.LeakyReLU(alpha=1.0),
)(inputs)
x1 = tf.keras.layers.Dense(
    units=10,
    activation=tf.keras.layers.LeakyReLU(alpha=1.0),
)(x)
val = 5  # width of the latent layer
x2 = tf.keras.layers.Dense(units=val, activation="tanh")(x1)
encoder = tf.keras.Model(inputs, x2, name="encoder")

# --- Decoder ---
latent_input = tf.keras.layers.Input(shape=val, name="decoder_input")
x = tf.keras.layers.Dense(units=11, activation="tanh")(latent_input)
x1 = tf.keras.layers.Dense(units=14, activation="tanh")(x)
output = tf.keras.layers.Dense(
    data_shape,
    activation=tf.keras.layers.LeakyReLU(alpha=1.0),
)(x1)
decoder = tf.keras.Model(latent_input, output, name="decoder")

# --- Encoder and decoder chained into the full autoencoder ---
outputs = decoder(encoder(inputs))
AE_model = tf.keras.Model(inputs, outputs, name="AE_model")

hp_learning_rate = 0.0015
optimizer = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
AE_model.compile(loss="mse", optimizer=optimizer, metrics=["mse"])

In [86]:
# Train the autoencoder on the background-only training sample (input and
# target are the same array) and validate on the background-only validation
# sample, pinned to the CPU.
with tf.device("/CPU:0"):
    AE_model.fit(
        X_b_train,
        X_b_train,
        epochs=10,
        batch_size=4000,
        validation_data=(X_b_val, X_b_val),
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [91]:
import plot_set


# Reconstruct the background-only validation sample and the mixed
# (background + signal) validation sample with the trained autoencoder.
with tf.device("/CPU:0"):
    pred_back = AE_model.predict(X_b_val)
    print("Background done")
    pred_sig = AE_model.predict(X_val)
    print("Signal done")

# Per-event reconstruction error.
# NOTE(review): the model is trained with an MSE loss but scored with MSLE
# here — confirm this is intended for the standardised inputs.
recon_err_back = tf.keras.losses.msle(pred_back, X_b_val)
recon_err_sig = tf.keras.losses.msle(pred_sig, X_val)

# Scale each error distribution by its own maximum so that it lies in [0, 1].
# NOTE(review): the two samples are normalised independently, so their x-axes
# are not on a common scale — confirm this is intended.
b = recon_err_back/np.max(recon_err_back)
b_s = recon_err_sig/np.max(recon_err_sig)

binsize = 100
plt.figure(num=0, dpi=80, facecolor='w', edgecolor='k')
# Distinct colours plus transparency: with two opaque histograms of the same
# colour the second one hides the first and they cannot be told apart.
n_b, bins_b, patches_b = plt.hist(b, bins=binsize, histtype="stepfilled", facecolor="b",
                                  alpha=0.5, label="Background", density=True)

# Separate result names so the background histogram's values are not clobbered.
n_s, bins_s, patches_s = plt.hist(b_s, bins=binsize, histtype="stepfilled", facecolor="r",
                                  alpha=0.5, label="Background and signal", density=True)

plt.xlabel("Output", fontsize=15)
plt.ylabel("#Events", fontsize=15)
plt.title("Autoencoder output distribution", fontsize=15, fontweight="bold")
plt.legend(fontsize=16, loc="lower right")

plt.savefig("AE_output.pdf", bbox_inches="tight")
plt.show()

KeyboardInterrupt: 

Next, we test the model on the data and stack histograms of the reconstruction error for the given background processes, a signal sample, and ATLAS data.

**TODO:** We might have to use ROOT histograms to stack the histograms.



Here we plot the ROC curves for the model