In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
file_name = "data_new_circles.csv"
num_data = pd.read_csv(file_name)

In [8]:
target = "target"

In [9]:
pos = num_data[target].mean()
neg = 1 - pos

In [10]:
weight_for_0 = (1 / neg) / 2
weight_for_1 = (1 / pos) / 2

class_weight = {0 : weight_for_0, 1 : weight_for_1}

print(weight_for_0, weight_for_1)

1.0 1.0


In [11]:
def stratify_sample(data, cut, target):
    assert(cut) >= 0
    return data.groupby(target, group_keys=False).apply(lambda x: x.sample(frac=1/cut))

In [12]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

from keras import models
from keras import layers
from keras import optimizers 
from keras import metrics

In [13]:
def sample(data, cut, target):
    assert(cut > 0)
    return data.groupby(target, group_keys=False).apply(lambda x: x.sample(frac=1/cut))   

In [14]:
def get_train_test_split(data, target, cut, random_state = None):
    num_data_new = sample(data, cut, target)
    
    train, test = train_test_split(num_data_new, test_size=0.2, random_state=random_state) 

    y_train = np.array(train[target])
    y_test = np.array(test[target])

    x_train = np.array(train.drop(columns=[target]))
    x_test = np.array(test.drop(columns=[target]))
    
    return x_train, x_test, y_train, y_test 

In [15]:
input_size = len(num_data.columns) - 1
print(input_size)

2


In [16]:
params = [10, 10, 1]

In [35]:
volume = len(num_data)
data_vol = [volume, volume/2, volume/3, volume / 5, volume/10, 30, 15, 10, 6, 2] 
log_cuts = list(map(lambda x: int(volume / x), data_vol))

In [36]:
print("Volume = {}, \n Cuts are {}".format(volume, log_cuts))
print("Volume of data is {}".format(list(map(lambda x: int(volume / x), log_cuts))))

Volume = 600, 
 Cuts are [1, 2, 3, 5, 10, 20, 40, 60, 100, 300]
Volume of data is [600, 300, 200, 120, 60, 30, 15, 10, 6, 2]


In [19]:
get_model_directory = lambda series, cut: "models/{}_series/{}_cut/".format(series, cut)
get_model_name = lambda number:  "m_{}.h5".format(number)

In [20]:
def make_model(input_size, parameters):
    model = models.Sequential()

    model.add(layers.Dense(parameters[0], activation='relu', input_shape=(input_size,)))
    model.add(layers.Dense(parameters[1], activation='relu'))
    model.add(layers.Dense(parameters[2], activation='sigmoid'))
    
    model.compile(optimizer=optimizers.Adam(), 
              loss='binary_crossentropy', 
              metrics=["accuracy"]
              )
    
    return model

In [40]:
from keras.callbacks import ModelCheckpoint
## this function is task-specific
def model_train(model_name, data_split, epochs, random_target=False):
    model = make_model(input_size, params)

    checkpoint = ModelCheckpoint(model_name, 
        verbose=0, 
        monitor='val_accuracy',
        save_best_only=True, 
        mode='auto'
    )  

    early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', 
    verbose=1,
    patience=10,
    mode='auto',
    restore_best_weights=True)

    x_train, x_test, y_train, y_test = data_split

    if random_target:
        y_train = np.random.randint(2, size = len(y_train))
        y_test = np.random.randint(2, size = len(y_test))  

    # batch_size made bigger to ensure some important cases are included    
    history = model.fit(x_train, y_train,
                    epochs=epochs,
                    batch_size=min(32, len(x_train)),  
                    validation_data=(x_test, y_test), 
                    class_weight=class_weight, 
                    callbacks = [checkpoint, early_stopping])
    return history, model

In [41]:
import datetime 

def log_preamble(log_file_name):
    global tries, epochs, series, file_name
    time_stamp = datetime.datetime.now()

    with open(log_file_name, "a") as log_file:
        log_file.write(f"\n\nExecuted on time is {datetime.datetime.now()}\n")
        log_file.write(f"Tries: {tries}, epochs: {epochs}, series = {series}\n")
        log_file.write(f"Data file is {file_name}\n")
        log_file.write(f"Network configuration is {params}\n")
    return time_stamp

def log_final(log_file_name, time_stamp):
    time_stamp_new = datetime.datetime.now()
    with open(log_file_name, "a") as log_file:
        log_file.write(f"Finished successfully at {time_stamp_new}\n")
        log_file.write(f"Total time = {(time_stamp_new - time_stamp).total_seconds()}\n")

## Train

In [42]:
tries = 15
epochs = 30
series = [12]
histories = [[[] for _ in log_cuts] for _ in series] 
# before compiling
log_file = "model_log.txt"
t = log_preamble(log_file)
for i, s in enumerate(series):
    for cut_number, cut in enumerate(log_cuts):
        model_dir = get_model_directory(s, cut)
        split = get_train_test_split(num_data, target, cut)
        for number in range(tries):
            model_name = model_dir + get_model_name(number)
            history, model = model_train(model_name, split, epochs) 
            
            # models.save_model(model, filepath = model_name, include_optimizer = False)
            histories[i][cut_number].append(history)

log_final(log_file, t)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
 1/15 [=>............................] - ETA: 0s - loss: 0.4728 - accuracy: 0.7500Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
 1/15 [=>............................] - ETA: 0s - loss: 0.5422 - accuracy: 0.6875Restoring model weights from the end of the best epoch: 6.
Epoch 16: early stopping
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30


In [43]:
def display_loss_graph(history):
    history_dict = history.history
    loss_values = history_dict['loss'] 
    val_loss_values = history_dict['val_loss']
    epochs = range(1, len(loss_values)+1)
    plt.plot(epochs, loss_values, 'bo', label='Training loss')
    plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

def display_accuracy_graph(history):
    history_dict = history.history
    acc = history_dict["accuracy"]
    val_acc = history_dict['val_accuracy']
    epochs = range(1, len(acc)+1)
    plt.plot(epochs, acc, 'bo', label='Training accuracy')
    plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [44]:
import pickle
import os

for s in series:
    for i, cut in enumerate(log_cuts):
        for x in range(tries):
            filename = './histories/{}_series/{}_cut/m_{}'.format(s, i, x)

            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, "wb") as file:
                pickle.dump(histories[s][i][x], file)

IndexError: list index out of range

In [None]:
accuracy_score = [[[-1 for _ in range(tries)] for _ in log_cuts] for _ in series]

In [None]:
for s in series:
    for i, cut in enumerate(log_cuts):
        for x in range(tries):
            history_dict = histories[s][i][x].history["val_accuracy"]
            accuracy_score[s][i][x] = np.max(history_dict)

In [None]:
plt.boxplot(accuracy_score[0])

In [None]:
plt.boxplot(accuracy_score[1])

In [None]:
acc_data = {}

In [None]:
for s in series:
    for i in range(len(log_cuts)):
        for x in range(tries):
            acc_data[(s, i, x)] = accuracy_score[s][i][x]

In [None]:
with open("acc_dict", "wb") as file:
    pickle.dump(acc_data, file)