# ML test
### Testing reading in the data and trying out an autoencoder

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
import keras_tuner as kt
# tf.random.set_seed() returns None, so the integer seed is stored first and
# then handed to each library. `seed` is reused further down as the
# `random_state` of train_test_split, where None would mean "unseeded".
seed = 1
tf.random.set_seed(seed)
# The sub-sampling step draws indices with np.random.choice, which uses
# numpy's global RNG rather than TensorFlow's.
np.random.seed(seed)

In [2]:
# Read both HDF5 files and stack them row-wise into a single frame.
# The frames are built inside the list so no extra module-level reference
# to the individual inputs outlives the concatenation.
# NOTE(review): judging by the file names the second file holds the signal
# sample and the first one the rest — confirm against the data production.
df = pd.concat([
    pd.read_hdf(path)
    for path in (
        "/storage/shared/data/2lep_df_forML.hdf5",
        "/storage/shared/data/2lep_df_forML_signal.hdf5",
    )
])


In [3]:
# Drop the "category" column, split off the "isSignal" column as the label
# vector and turn the remaining columns into the feature matrix.
# Both pops mutate `df` in place, so this cell can only be run once per load.
df.pop("category")
y = df.pop("isSignal").to_numpy()
X = df.to_numpy()


In [4]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows.
print(X.shape)
print(y.shape)

(109683372, 19)
(109683372,)


### Data handling and preparation
Before we train on the data, we need to scale it and split it into a training and a validation set.

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [6]:
# Standardise every feature column; the fitted scaler is kept in `scaler`.
# NOTE(review): the scaler is fitted on the full X before any train/validation
# split, so the validation rows contribute to the fitted statistics —
# confirm this is intended.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

In [7]:
# Hold out 20% of all events (features and labels together) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

Now we need to separate the signal from the background in the training data.

In [8]:
# Keep only the training rows whose label is 0 (a boolean row mask selects
# the same rows, in the same order, as indexing with np.where).
background_mask = y_train == 0
X_back = X_train[background_mask]

Then we perform a second split, so that we get a training and a validation set for the autoencoder.

In [9]:
# Split the label-0 rows once more: 80% to fit the autoencoder, 20% to
# validate its reconstruction.
X_b_train, X_b_val = train_test_split(
    X_back,
    test_size=0.2,
    random_state=seed,
)

In [10]:
# Draw random subsets of the autoencoder training/validation rows so the
# hyperparameter search runs on a manageable amount of data.
data_shape = X_b_train.shape[1]      # number of feature columns; read by AE_model_builder
number_of_rows = X_b_train.shape[0]
n_vali = X_b_val.shape[0]

# np.random.choice(..., replace=False) raises ValueError when asked for more
# samples than there are rows, so cap the requested sizes at what exists.
n_train_sample = min(int(1e6), number_of_rows)
n_vali_sample = min(int(200000), n_vali)

random_indices = np.random.choice(number_of_rows, size=n_train_sample, replace=False)
test_indices = np.random.choice(n_vali, size=n_vali_sample, replace=False)

smaller_data = X_b_train[random_indices, :]
small_vali = X_b_val[test_indices, :]

### Training
Now we can train on the data

In [24]:
from keras import backend as K
from keras.utils.generic_utils import get_custom_objects
from gridsearch import *



In [66]:

def gridautoencoder(X_b, X_back_test):
    """Run a Hyperband search over ``AE_model_builder`` and optionally save the winner.

    The data is used as both input and target (autoencoder reconstruction),
    and trials are ranked by the validation ``mse`` metric.

    Args:
        X_b: Array the autoencoder is fitted on (used as input and target).
        X_back_test: Array used as validation input and target.

    Returns:
        None. The best hyperparameters are printed, and the user is asked
        interactively whether the best model should be written to
        ``../tf_models/model_<name>.h5``.
    """
    tuner = kt.Hyperband(
        AE_model_builder,
        objective=kt.Objective("val_mse", direction="min"),
        max_epochs=50,
        factor=3,
        directory="GridSearches",
        project_name="AE",
        overwrite=True,  # discard results of any earlier search in the same directory
    )

    tuner.search(X_b, X_b, epochs=50, batch_size=4000,
                 validation_data=(X_back_test, X_back_test))
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

    print(
        f"""
    For Encoder: \n 
    First layer has {best_hps.get('num_of_neurons0')} with activation {best_hps.get('0_act')} \n
    Second layer has {best_hps.get('num_of_neurons1')} with activation {best_hps.get('1_act')} \n
    
    Latent layer has {best_hps.get("lat_num")} with activation {best_hps.get('2_act')} \n
    \n
    For Decoder: \n 
    First layer has {best_hps.get('num_of_neurons5')} with activation {best_hps.get('5_act')}\n
    Second layer has {best_hps.get('num_of_neurons6')} with activation {best_hps.get('6_act')}\n
    Third layer has activation {best_hps.get('7_act')}\n
    \n
    with learning rate = {best_hps.get('learning_rate')} and alpha = {best_hps.get('alpha')}
    """
    )

    # Keep asking until the user gives an explicit "y" or "n".
    while True:
        answ = input("Do you want to save model? (y/n) ")
        if answ == "y":
            name = input("name: ")
            # hypermodel.build(best_hps) would return a freshly initialised,
            # untrained network; get_best_models() rebuilds the best trial
            # and restores the weights it reached during the search.
            best_model = tuner.get_best_models(num_models=1)[0]
            best_model.save(f"../tf_models/model_{name}.h5")
            print("Model saved")
            break
        if answ == "n":
            print("Model not saved")
            break


def AE_model_builder(hp):
    """Build and compile a dense autoencoder for one KerasTuner trial.

    The network is an encoder (two hidden Dense layers plus a latent Dense
    layer) chained with a mirrored decoder (two hidden Dense layers plus an
    output Dense layer of width ``data_shape``). Layer widths, activations,
    the leaky-ReLU slope and the Adam learning rate are all tunable.

    Reads the module-level ``data_shape`` (number of input features).

    Args:
        hp: KerasTuner ``HyperParameters`` object used to register and sample
            the search space.

    Returns:
        A compiled ``tf.keras.Model`` named ``"AE_model"`` with ``mse`` as
        both loss and metric.
    """
    activation_names = ["relu", "tanh", "leakyrelu"]

    alpha_choice = hp.Choice("alpha", values=[1., 0.5, 0.1, 0.05, 0.01])
    activations = {
        "relu": tf.nn.relu,
        "tanh": tf.nn.tanh,
        "leakyrelu": lambda x: tf.nn.leaky_relu(x, alpha=alpha_choice),
    }

    def tuned_activation(tag):
        # Register the "<tag>_act" hyperparameter and map its value to a callable.
        return activations.get(hp.Choice(f"{tag}_act", activation_names))

    def tuned_dense(tag, units):
        # `units` is evaluated by the caller first, so a width hyperparameter
        # is always registered before the matching activation hyperparameter.
        return tf.keras.layers.Dense(units=units, activation=tuned_activation(tag))

    # --- encoder ---
    # Keras documents `shape` as a tuple; a bare integer is rejected by newer
    # Keras versions, so wrap the feature count explicitly.
    inputs = tf.keras.layers.Input(shape=(data_shape,), name="encoder_input")
    x = tuned_dense(
        0, hp.Int("num_of_neurons0", min_value=13, max_value=17, step=1))(inputs)
    x1 = tuned_dense(
        1, hp.Int("num_of_neurons1", min_value=7, max_value=12, step=1))(x)
    val = hp.Int("lat_num", min_value=1, max_value=6, step=1)
    x2 = tuned_dense(2, val)(x1)
    encoder = tf.keras.Model(inputs, x2, name="encoder")

    # --- decoder ---
    latent_input = tf.keras.layers.Input(shape=(val,), name="decoder_input")
    x = tuned_dense(
        5, hp.Int("num_of_neurons5", min_value=7, max_value=12, step=1))(latent_input)
    x1 = tuned_dense(
        6, hp.Int("num_of_neurons6", min_value=13, max_value=17, step=1))(x)
    # NOTE(review): relu and tanh restrict the output range (>= 0 and (-1, 1)
    # respectively); if the inputs are standardised this limits what can be
    # reconstructed — confirm the output activation choices are intended.
    output = tuned_dense(7, data_shape)(x1)
    decoder = tf.keras.Model(latent_input, output, name="decoder")

    outputs = decoder(encoder(inputs))
    AE_model = tf.keras.Model(inputs, outputs, name="AE_model")

    # NOTE(review): the candidates span 9e-2..9.5e-2 and 1e-3..1.5e-3; check
    # whether 9e-4 / 9.5e-4 were meant for the first two values.
    hp_learning_rate = hp.Choice("learning_rate", values=[
                                 9e-2, 9.5e-2, 1e-3, 1.5e-3])
    optimizer = tf.keras.optimizers.Adam(hp_learning_rate)
    AE_model.compile(loss="mse", optimizer=optimizer, metrics=["mse"])

    return AE_model





In [67]:
# Report how many physical GPUs TensorFlow can see.
available_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(available_gpus))

Num GPUs Available:  1


# Cap TensorFlow at 1024 MB on the first GPU by replacing it with a single
# logical device carrying that memory limit.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        capped_device = tf.config.LogicalDeviceConfiguration(memory_limit=1024)
        tf.config.set_logical_device_configuration(gpus[0], [capped_device])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be configured before the GPUs are initialised;
        # if that is no longer possible, report the error and carry on.
        print(e)

In [68]:
# Make no GPU visible to TensorFlow, so everything that follows runs on CPU.
tf.config.set_visible_devices(devices=[], device_type='GPU')


In [None]:
# Run the hyperparameter search on the sub-sampled data, pinned to the CPU.
with tf.device("/CPU:0"):
    gridautoencoder(X_b=smaller_data, X_back_test=small_vali)

       

Trial 58 Complete [00h 00m 51s]
val_mse: 0.34281668066978455

Best val_mse So Far: 0.04355898126959801
Total elapsed time: 00h 34m 09s

Search: Running Trial #59

Value             |Best Value So Far |Hyperparameter
0.01              |0.1               |alpha
14                |15                |num_of_neurons0
relu              |relu              |0_act
10                |7                 |num_of_neurons1
tanh              |leakyrelu         |1_act
6                 |4                 |lat_num
tanh              |tanh              |2_act
10                |11                |num_of_neurons5
tanh              |tanh              |5_act
17                |17                |num_of_neurons6
tanh              |tanh              |6_act
tanh              |leakyrelu         |7_act
0.001             |0.095             |learning_rate
6                 |50                |tuner/epochs
0                 |17                |tuner/initial_epoch
2                 |3                 |tuner/bracket
0

In [38]:

# Leftover debugging cell: re-prints the last traceback. The captured output
# below is "NameError: name 'timer' is not defined"; no `timer` appears in
# the visible cells.
%tb

NameError: name 'timer' is not defined

In [None]:
import plot_set
from plot_set import plot_histo

# NOTE(review): `err_val` is not defined in any visible cell of this
# notebook — this cell presumably relies on state from a removed cell.
# The values are plotted as-is; a min-max normalisation
# ((err_val - min) / (max - min)) was applied here previously but is disabled.
b = err_val
print(b)
print(len(b))

plot_histo(b, "AE_histo.pdf")

In [None]:
# NOTE(review): in the visible cells `AE_model` exists only as a local
# variable inside AE_model_builder, so this line depends on a notebook-level
# `AE_model` defined elsewhere — confirm before running on a fresh kernel.
AE_model.save("ex_model.h5")