In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the whole dataset into a DataFrame; the relative path is resolved against
# the notebook's working directory.
num_data = pd.read_csv("data.csv")

In [3]:
# Name of the label column. The backslashes only escape the quotes, so the
# actual string is: Class_b'2'
target = "Class_b\'2\'"
# Mean of the label column; presumably the share of positive rows, assuming the
# column holds 0/1 values — TODO confirm.
pos = num_data[target].mean()
# Complement of `pos`.
neg = 1 - pos

# Train-test split

In [4]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

from keras import models
from keras import layers
from keras import optimizers 
from keras import metrics

In [5]:
def sample(data, cut, target, random_state=None):
    """Draw a stratified subsample that keeps ``1/cut`` of every class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to subsample; must contain the column ``target``.
    cut : int or float
        Positive divisor. Each group keeps the fraction ``1/cut`` of its rows,
        so ``cut=1`` returns every row.
    target : str
        Column whose distinct values define the groups to sample from.
    random_state : optional
        Forwarded to pandas' sampler. ``None`` (the default) keeps the
        previous non-deterministic behaviour; pass a seed for a reproducible
        draw.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with all original columns (including ``target``)
        and their original index labels.
    """
    assert cut > 0, "cut must be positive"
    # GroupBy.sample returns the selected rows of the original frame, so the
    # target column is always present. The earlier GroupBy.apply form relied on
    # apply operating on the grouping column, which pandas 2.2 deprecated.
    return data.groupby(target).sample(frac=1/cut, random_state=random_state)

In [6]:
def get_train_test_split(data, target, cut, random_state = None):
    """Subsample ``data`` and split it 80/20 into feature and label arrays.

    The frame is first reduced with ``sample(data, cut, target)`` and then
    divided by ``train_test_split`` with ``test_size=0.2``.

    Note that ``random_state`` is passed to ``train_test_split`` only; the
    subsampling step is called without a seed.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as NumPy arrays, where the
        ``x`` arrays hold every column except ``target`` and the ``y`` arrays
        hold the ``target`` column.
    """
    subset = sample(data, cut, target)
    train_part, test_part = train_test_split(subset, test_size=0.2, random_state=random_state)

    def _labels(frame):
        # Label column as a plain array.
        return np.array(frame[target])

    def _features(frame):
        # All remaining columns as a plain array.
        return np.array(frame.drop(columns=[target]))

    train_labels, test_labels = _labels(train_part), _labels(test_part)
    train_features, test_features = _features(train_part), _features(test_part)

    return train_features, test_features, train_labels, test_labels

# Creating models

In [7]:
# Unit counts for the Dense layers, in order, as consumed by make_model:
# the first entry is the input-facing layer, the last one the sigmoid output layer.
params = [15, 32, 32, 1]

In [8]:
# Number of feature columns: every column of the frame except one
# (get_train_test_split drops the target column before building the x arrays).
input_size = len(num_data.columns) - 1

In [9]:
input_size

14

In [10]:
def make_model(input_size, parameters):
    """Build and compile a stack of Dense layers.

    Parameters
    ----------
    input_size : int
        Number of input features; used as ``input_shape=(input_size,)`` on
        the first layer.
    parameters : sequence of int
        Unit counts. ``parameters[0]`` is the first ReLU layer, the entries
        between the first and the last become further ReLU layers, and
        ``parameters[-1]`` is the sigmoid output layer.

    Returns
    -------
    A compiled ``Sequential`` model using the Adam optimizer, binary
    cross-entropy loss and the accuracy metric.
    """
    first_units = parameters[0]
    hidden_units = parameters[1:-1]
    output_units = parameters[-1]

    stack = [layers.Dense(first_units, activation='relu', input_shape=(input_size,))]
    stack.extend(layers.Dense(units, activation="relu") for units in hidden_units)
    stack.append(layers.Dense(output_units, activation='sigmoid'))

    model = models.Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(
        optimizer=optimizers.Adam(),
        loss='binary_crossentropy',
        metrics=["accuracy"],
    )
    return model

In [11]:
# Total number of rows in the loaded dataset.
volume = len(num_data)

In [12]:
# Subsampling divisors: for each value, `sample` keeps the fraction 1/cut of every class.
# Longer list, currently disabled:
# cuts = [1, 2, 4, 8, 16, 32, 64, 128, 254, 1024, 2048]
cuts = [1, 2, 4, 8, 16]

In [13]:
# Show the divisors and the (truncated) row count each of them leaves.
print("Cuts are {}".format(cuts))
print("Volume of data is {}".format([int(volume / divisor) for divisor in cuts]))

Cuts are [1, 2, 4, 8, 16]
Volume of data is [14980, 7490, 3745, 1872, 936]


In [14]:
def get_model_directory(series, cut):
    """Return the relative directory that holds the models of one series and cut."""
    return "models/{}_series/{}_cut/".format(series, cut)


def get_model_name(number):
    """Return the file name used for model ``number``."""
    return "m_{}.h5".format(number)

# Logging

In [15]:
import datetime 
def log_preamble(log_file_name):
    """Append a run header to the log file and return the run's start time.

    The header records the start time, the module-level ``tries``, ``epochs``
    and ``series`` settings, and the network configuration ``params``. These
    names are only read, so no ``global`` declaration is needed.

    Parameters
    ----------
    log_file_name : str
        Path of the log file; it is opened in append mode.

    Returns
    -------
    datetime.datetime
        The start time, identical to the one written to the log, meant to be
        passed to ``log_final``.
    """
    # Read the clock once so the logged start time and the returned value agree.
    time_stamp = datetime.datetime.now()

    with open(log_file_name, "a") as log_file:
        log_file.write(f"\n\nExecuted on time is {time_stamp}\n")
        log_file.write(f"Tries: {tries}, epochs: {epochs}, series = {series}\n")
        log_file.write(f"Network configuration is {params}\n")
    return time_stamp

def log_final(log_file_name, time_stamp):
    """Append the closing lines of a run to the log file.

    Writes the completion time and the number of seconds elapsed since
    ``time_stamp`` (the value returned by ``log_preamble``).

    Parameters
    ----------
    log_file_name : str
        Path of the log file; it is opened in append mode.
    time_stamp : datetime.datetime
        Start time of the run.
    """
    finished_at = datetime.datetime.now()
    with open(log_file_name, "a") as handle:
        handle.write(f"Finished successfully at {finished_at}\n")
        elapsed = finished_at - time_stamp
        handle.write(f"Total time = {elapsed.total_seconds()}\n")

In [16]:
def model_name(series_num, cut_val, model_num):
    """Return the full relative path of a saved model: its directory followed by its file name."""
    directory = get_model_directory(series_num, cut_val)
    return directory + get_model_name(model_num)

# Training models

In [17]:
from keras.callbacks import ModelCheckpoint

def model_train(data_split, batch_size, epochs, 
                random_target=False, discard = True, model = None, init_epoch = None):
    """Fit a model on one train/test split and return ``(history, model)``.

    Parameters
    ----------
    data_split : tuple
        ``(x_train, x_test, y_train, y_test)``; the test part is used as
        validation data.
    batch_size : int or None
        Requested batch size. ``None`` means one batch covering the whole
        training set; larger values are capped at ``len(x_train)``.
    epochs : int
        Forwarded to ``model.fit`` as ``epochs``.
    random_target : bool
        When True, both label arrays are replaced by random 0/1 labels of the
        same length.
    discard : bool
        When False, ``EarlyStopping`` on ``val_loss`` (patience 15, best
        weights restored) is attached, and a network is built with
        ``make_model(input_size, params)`` if no ``model`` was supplied.
        When True, a ``model`` must be supplied and no callbacks are used.
        NOTE(review): the flag name does not obviously describe this —
        confirm the intended meaning.
    model : optional
        Model to train. A supplied model is always the one that gets trained;
        it is no longer replaced by a fresh network when ``discard`` is False.
    init_epoch : int or None
        Forwarded to ``model.fit`` as ``initial_epoch``; ``None`` means 0.

    Returns
    -------
    tuple or None
        ``(history, model)``, or ``None`` (after printing a message) when
        ``discard`` is True and no model was supplied.
    """
    if model is None:
        if discard:
            # The message names the flag value that actually triggers this branch.
            print("Must supply a model if discard is set to True")
            return None
        model = make_model(input_size, params)

    if not discard:
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            verbose=1,
            patience=15,
            mode='auto',
            restore_best_weights=True)

        callbacks = [early_stopping]
    else:
        callbacks = []

    x_train, x_test, y_train, y_test = data_split

    if random_target:
        y_train = np.random.randint(2, size = len(y_train))
        y_test = np.random.randint(2, size = len(y_test))

    # Only fall back to full-batch training when no batch size was requested;
    # testing `model` here made the batch_size argument a no-op.
    if batch_size is None:
        batch_size = len(x_train)

    # Keras expects an integer start epoch; treat "not given" as 0.
    if init_epoch is None:
        init_epoch = 0

    history = model.fit(x_train, y_train,
                    epochs=epochs,
                    initial_epoch=init_epoch,
                    batch_size=min(batch_size, len(x_train)),
                    validation_data=(x_test, y_test),
                    callbacks = callbacks)
    return history, model

In [19]:
# Run configuration.
# `series` values end up in the model directory name via model_name(s, cut, number).
series = [10503]
# Number of model_train calls (and saved model files) per cut.
tries = 60 
# Written to the log header by log_preamble; the calls below pass their own
# epoch count (number+1) and a literal None for the batch size.
epochs = None
batch_size = None
# histories[i][cut_number] collects one history object per model_train call.
histories = [[[] for _ in cuts] for _ in series] 

# Write the log header and remember the start time for the closing log entry.
log_file = "model_log.txt"
t = log_preamble(log_file)

# NOTE(review): this single model object is handed to every model_train call for
# all cuts — confirm whether each cut is meant to start from a fresh network.
model = make_model(input_size, params)

for i, s in enumerate(series):
    for cut_number, cut in enumerate(cuts):
        # One subsample/split per cut; no random_state is passed, so the split
        # differs between runs.
        split = get_train_test_split(num_data, target, cut)
        for number in range(tries):
            model_name_current = model_name(s, cut, number)
            # epochs=number+1 together with init_epoch=number: the captured
            # output ("Epoch 2/2", "Epoch 3/3", ...) shows one epoch per call.
            history, model_to_save = model_train(split, None, number+1, discard = False, model = model,
                                                 init_epoch = number) 
            
            # Save the returned model after every call and keep its history.
            model_to_save.save(model_name_current)
            histories[i][cut_number].append(history)

# Record completion time and total duration.
log_final(log_file, t)

Epoch 2/2
Epoch 3/3
Epoch 4/4
Epoch 5/5
Epoch 6/6
Epoch 7/7
Epoch 8/8
Epoch 9/9
Epoch 10/10
Epoch 11/11
Epoch 12/12
Epoch 13/13
Epoch 14/14
Epoch 15/15
Epoch 16/16
Epoch 17/17
Epoch 18/18
Epoch 19/19
Epoch 20/20
Epoch 21/21
Epoch 22/22
Epoch 23/23
Epoch 24/24
Epoch 25/25
Epoch 26/26
Epoch 27/27
Epoch 28/28
Epoch 29/29
Epoch 30/30
Epoch 31/31
Epoch 32/32
Epoch 33/33
Epoch 34/34
Epoch 35/35
Epoch 36/36
Epoch 37/37
Epoch 38/38
Epoch 39/39
Epoch 40/40
Epoch 41/41
Epoch 42/42
Epoch 43/43
Epoch 44/44
Epoch 45/45
Epoch 46/46
Epoch 47/47
Epoch 48/48
Epoch 49/49
Epoch 50/50
Epoch 51/51
Epoch 52/52
Epoch 53/53
Epoch 54/54
Epoch 55/55
Epoch 56/56
Epoch 57/57
Epoch 58/58
Epoch 59/59
Epoch 60/60
Epoch 2/2
Epoch 3/3
Epoch 4/4
Epoch 5/5
Epoch 6/6
Epoch 7/7
Epoch 8/8
Epoch 9/9
Epoch 10/10
Epoch 11/11
Epoch 12/12
Epoch 13/13
Epoch 14/14
Epoch 15/15
Epoch 16/16
Epoch 17/17
Epoch 18/18
Epoch 19/19
Epoch 20/20
Epoch 21/21
Epoch 22/22
Epoch 23/23
Epoch 24/24
Epoch 25/25
Epoch 26/26
Epoch 27/27
Epoch 28/28
