## Setup

In [0]:
import os, sys
import random
import numpy as np
import tensorflow as tf

import pandas as pd
import matplotlib.pyplot as plt
import time

import tqdm
import h5py

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
%load_ext autoreload
%autoreload 2

### Helper functions

In [2]:
import time
from keras.callbacks import Callback
import numpy as np
import pandas as pd
import os

def progress_bar(current_value, max_value):
    """Redraw a 50-character text progress bar on the current output line.

    Args:
        current_value: Zero-based index of the step that has just finished.
        max_value: Total number of steps.

    The percentage is derived from ``current_value + 1`` and is snapped to 100
    as soon as it exceeds 98. The bar is printed with a leading carriage
    return and no trailing newline, so successive calls overwrite each other.
    Nothing is returned.
    """
    percent = ((current_value + 1) / max_value) * 100
    if percent > 98:
        percent = 100

    # One '#' stands for two percentage points.
    filled_cells = int(percent / 2)
    done_part = '#' * filled_cells
    todo_part = ' ' * (50 - filled_cells)
    print('\r[{0}{1}] {2:.1f}%'.format(done_part, todo_part, percent), end='')


def play_bell():
    """Play a short rising five-note chime, e.g. to signal the end of a long run.

    On Windows the notes are produced with ``winsound.Beep``. ``winsound`` is a
    Windows-only standard-library module, so on any other platform (such as the
    Colab runtime this notebook can run on) the function falls back to writing
    five terminal bell characters instead of raising ``ImportError``.

    Returns:
        None.
    """
    duration = 200  # millisecond
    freq = 440  # Hz
    note_count = 5

    try:
        import winsound  # only available on Windows
    except ImportError:
        # Best-effort fallback: ASCII BEL; silently ignored by front-ends
        # that do not render it.
        print('\a' * note_count, end='', flush=True)
        return

    for i in range(note_count):
        # Frequency rises by freq/2 per note: 440, 660, 880, 1100, 1320 Hz.
        winsound.Beep(int(freq*(i/2+1)), duration)
    

class LossHistory(Callback):
    """Keras callback that prints a one-line epoch progress bar with an ETA.

    After every finished epoch the current output line is overwritten with a
    bar of ``BAR_WIDTH`` characters ('#' for completed, '-' for remaining) and
    an estimate of the remaining training time, extrapolated from the average
    duration of the epochs completed so far.

    Args:
        number_of_epochs: Total number of epochs the training will run.
        logging_parameter: Name of a metric. It is stored on the instance but
            not printed at the moment (metric display is disabled).
    """

    BAR_WIDTH = 20  # number of '#'/'-' characters in the bar

    def __init__(self, number_of_epochs, logging_parameter='acc'):
        # Let the Keras base class initialise its own attributes first.
        super().__init__()
        self.number_of_epochs = number_of_epochs
        self.logging_parameter = logging_parameter
        self.current_epoch = 0
        self.initial_time = None

    def on_train_begin(self, logs=None):
        # Reset the state so one instance can be reused for several fit() calls.
        self.current_epoch = 0
        self.initial_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        # Updating at the END of an epoch means the elapsed time covers exactly
        # `current_epoch` complete epochs, so the extrapolation is unbiased and
        # the bar only reaches 100% once the last epoch has really finished.
        self.current_epoch += 1
        progress = self.current_epoch / self.number_of_epochs
        filled = int(np.round(progress * self.BAR_WIDTH))

        elapsed = time.time() - self.initial_time
        eta = elapsed * (self.number_of_epochs / self.current_epoch - 1)
        eta = max(eta, 0)  # time.gmtime rejects negative values on some platforms
        remaining_time = time.strftime("%H hours, %M min, %S sec", time.gmtime(eta))

        print('\r[{}{}]  eta: {}'.format(
            '#' * filled, '-' * (self.BAR_WIDTH - filled), remaining_time), end='')

def add_grid_and_save(grid):
    """Persist the current grid as one row of a pickled pandas DataFrame.

    ``grid`` maps every key to a ``[type_tag, value]`` pair; only the value
    (index 1) of each pair is stored.

    * While fewer folds have finished (``len(grid['fit_outs'][1])``) than
      ``grid['skf_n_splits'][1]``, the row is written on its own to
      ``<root_path>/grids/grid_tmp.pkl``, overwriting any previous content.
    * Once all folds are done, the row is appended to
      ``<root_path>/grids/grid_<version>.pkl`` (created when missing).

    Args:
        grid: Dict with at least the keys ``root_path``, ``version``,
            ``skf_n_splits`` and ``fit_outs``.

    Returns:
        None. The function only writes to disk.
    """
    # ``DataFrame.append`` was removed in pandas 2.0, so the row is built as a
    # one-row frame and combined with ``pd.concat`` where needed.
    row_df = pd.DataFrame([{key: entry[1] for key, entry in grid.items()}])

    grids_dir = '{}/grids'.format(grid['root_path'][1])
    os.makedirs(grids_dir, exist_ok=True)

    # save data in temporary dataframe if cross validation is not over
    if len(grid['fit_outs'][1]) < grid['skf_n_splits'][1]:
        row_df.to_pickle('{}/grid_tmp.pkl'.format(grids_dir))
        return

    grid_file_path = '{}/grid_{}.pkl'.format(grids_dir, grid['version'][1])
    if os.path.isfile(grid_file_path):
        # NOTE: unpickling runs arbitrary code; only load grid files that this
        # notebook produced itself.
        previous_rows = pd.read_pickle(grid_file_path)
        grid_df = pd.concat([previous_rows, row_df], ignore_index=True)
    else:
        grid_df = row_df
    grid_df.to_pickle(grid_file_path)

Using TensorFlow backend.


### Load and preprocess data

In [0]:
from sklearn.model_selection import train_test_split

def load_data(grid):
    """Read the train and dev arrays from their HDF5 files under ``ROOT_PATH``.

    Args:
        grid: Accepted for interface symmetry with the other helpers; it is
            not read by this function.

    Returns:
        ``(X_train, X_dev, y_train, y_dev)``. Both feature arrays are reshaped
        to ``(n_samples, -1, 1)``; the target arrays are returned as stored.

    No normalisation or target scaling is applied here.
    """
    train_data_filename = f'{ROOT_PATH}/data/train_data_sorted.h5'
    with h5py.File(train_data_filename, 'r') as train_file:
        # ``[()]`` reads the whole dataset into memory.
        train_features = train_file['X_train'][()]
        train_targets = train_file['y_train'][()]

    dev_data_filename = f'{ROOT_PATH}/data/dev_data_sorted.h5'
    with h5py.File(dev_data_filename, 'r') as dev_file:
        dev_features = dev_file['X_dev'][()]
        dev_targets = dev_file['y_dev'][()]

    # Keep the sample axis, flatten the rest and append a trailing axis of 1.
    n_train = train_features.shape[0]
    n_dev = dev_features.shape[0]
    return (
        train_features.reshape(n_train, -1, 1),
        dev_features.reshape(n_dev, -1, 1),
        train_targets,
        dev_targets,
    )


def pre_process_data(X, y, grid):
    """Placeholder preprocessing step; currently returns the inputs unchanged.

    The original stub returned two names that were never assigned, so any call
    raised ``NameError``. Until real preprocessing is implemented the data is
    passed through untouched.

    Args:
        X: Feature data.
        y: Target data.
        grid: Grid dict; not used yet.

    Returns:
        ``(X_processed, y_processed)``, at the moment the very same objects
        that were passed in.
    """
    ### TODO: implement the actual preprocessing
    X_processed, y_processed = X, y
    return X_processed, y_processed

### Create model

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten
from keras.layers.convolutional import Conv1D, SeparableConv1D
from keras.layers.convolutional import MaxPooling1D, AveragePooling1D
from keras.layers import Dropout
from keras.layers import CuDNNGRU, LSTM
from keras.initializers import glorot_uniform
from keras import regularizers
from keras import optimizers


def create_model(grid):
    """Assemble the (uncompiled) Keras ``Sequential`` network described by ``grid``.

    Layer order:
      1. optional ``AveragePooling1D`` (``avg_pool`` / ``avg_pool_size``),
      2. ``conv_layers_count`` pairs of ``SeparableConv1D`` (kernel size 3,
         'same' padding, ReLU) and ``MaxPooling1D``, sized by ``conv_<i>`` and
         ``pool_<i>``,
      3. either a ``CuDNNGRU`` of ``gru_size`` units (``gru`` truthy) or a
         ``Flatten`` layer,
      4. a ReLU ``Dense`` layer of ``dense_size`` units followed by a single
         linear output unit.

    Args:
        grid: Dict mapping each key to a ``[type_tag, value]`` pair.

    Returns:
        The ``Sequential`` model; compilation happens elsewhere.
    """
    def setting(key):
        # Every grid entry is a [type_tag, value] pair; only the value matters here.
        return grid[key][1]

    network = Sequential()

    if setting('avg_pool'):
        network.add(AveragePooling1D(pool_size=setting('avg_pool_size')))

    for conv_layer_index in range(setting('conv_layers_count')):
        separable_conv = SeparableConv1D(
            filters=setting(f'conv_{conv_layer_index}'),
            kernel_size=3,
            padding='same',
            activation='relu',
        )
        network.add(separable_conv)
        network.add(MaxPooling1D(pool_size=setting(f'pool_{conv_layer_index}')))

    # Collapse the temporal axis: recurrent summary or plain flattening.
    sequence_head = CuDNNGRU(setting('gru_size')) if setting('gru') else Flatten()
    network.add(sequence_head)

    network.add(Dense(setting('dense_size'), activation='relu', kernel_initializer='normal'))
    network.add(Dense(1))

    return network


### Training parameters

In [0]:
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from scipy.stats import kde
from keras import optimizers


def train_model(X, y, model, grid):
    """Compile ``model`` and train it on the next unfinished cross-validation fold.

    The fold to use is the one whose index equals the number of results already
    stored in ``grid['fit_outs'][1]``.

    Args:
        X: Feature array, indexable with integer index arrays.
        y: Target array, indexable the same way.
        model: Uncompiled Keras model.
        grid: Grid dict of ``[type_tag, value]`` pairs. Reads ``skf_n_splits``,
            ``epochs``, ``batch_size``, ``learning_rate``, ``loss``,
            ``training_metric`` and ``fit_outs``. An optional
            ``skf_random_state`` entry overrides the shuffle seed (default 0).

    Returns:
        The same ``grid`` with the new Keras ``History`` object appended to
        ``grid['fit_outs'][1]``.
    """
    # The splitter is rebuilt on every call. Without a fixed seed each call
    # would shuffle differently, so "fold k" would be drawn from a different
    # partition every time and the validation sets of one cross-validation run
    # could overlap. A stable seed makes all calls index the same partition.
    cv_seed = grid.get('skf_random_state', [int, 0])[1]
    skf = KFold(n_splits=grid['skf_n_splits'][1], shuffle=True, random_state=cv_seed)

    fold_index = len(grid['fit_outs'][1])
    train_index, dev_index = list(skf.split(X, y))[fold_index]

    X_train, X_dev = X[train_index], X[dev_index]
    y_train, y_dev = y[train_index], y[dev_index]

    # Named `epoch_progress` so it does not shadow the `progress_bar` helper.
    epoch_progress = LossHistory(grid['epochs'][1], logging_parameter=grid['training_metric'][1])

    # Checkpointing is intentionally disabled. To re-enable it, add
    #   ModelCheckpoint(grid['best_model_paths'][1][-1],
    #                   monitor=grid['best_model_metric'][1],
    #                   save_best_only=True, mode='auto')
    # to `callbacks`. Use mode='auto' (or 'min'): mode='max' on a loss monitor
    # would keep the worst model.

    optimizer = optimizers.Adam(lr=grid['learning_rate'][1])
    model.compile(loss=grid['loss'][1], optimizer=optimizer, metrics=[grid['training_metric'][1]])
    fit_out = model.fit(X_train, y_train, epochs=grid['epochs'][1], batch_size=grid['batch_size'][1],
                        callbacks=[epoch_progress], validation_data=(X_dev, y_dev), verbose=0)

    # Drop the references held by the History object so that only the
    # `history` dict travels with the grid (presumably to keep the stored
    # grid small and picklable; confirm before relying on it).
    fit_out.model = None
    fit_out.epoch = None
    fit_out.validation_data = None
    grid['fit_outs'][1].append(fit_out)
    return grid

### Run training

In [0]:
import time
import numpy as np

def run_training(X, y, grid, verbose=True):
    """Run every cross-validation fold that has not been trained yet.

    For each remaining fold a fresh model is created, trained with
    ``train_model``, its best value of ``grid['best_model_metric']`` is
    recorded in ``grid['best_model_results']`` and the grid is saved with
    ``add_grid_and_save``.

    Args:
        X: Feature array passed on to ``train_model``.
        y: Target array passed on to ``train_model``.
        grid: Grid dict of ``[type_tag, value]`` pairs; updated in place.
        verbose: When true (default), print the fold header and the total
            training time.

    Returns:
        The updated ``grid``.
    """
    initial_time = time.time()
    n_splits = grid['skf_n_splits'][1]

    # "Best" depends on the metric: accuracy-like metrics are maximised,
    # everything else (losses, errors) is minimised. This is the same rule
    # Keras applies for mode='auto'. Taking the maximum of 'val_loss' would
    # record the worst epoch.
    metric_name = grid['best_model_metric'][1]
    maximise = 'acc' in metric_name or metric_name.startswith('fmeasure')
    pick_best = np.max if maximise else np.min

    # looping on uncompleted CV trainings: folds already present in
    # grid['fit_outs'] are skipped, so an interrupted grid can be resumed.
    for index_split in range(len(grid['fit_outs'][1]), n_splits):
        if verbose:
            print(f"\nCV validation {index_split+1} of {grid['skf_n_splits'][1]}")

        model = create_model(grid)

        best_model_path = '{}/models/{}/best_model_{}_{}.pkl'.format(
            grid['root_path'][1], grid['version'][1], grid['test_index'][1], index_split)
        grid['best_model_paths'][1].append(best_model_path)

        grid = train_model(X, y, model, grid)
        fold_history = grid['fit_outs'][1][-1].history
        grid['best_model_results'][1].append(pick_best(fold_history[metric_name]))
        add_grid_and_save(grid)

    if verbose:
        total_time = time.strftime("%H hours, %M min, %S sec", time.gmtime((time.time() - initial_time)))
        print('  --  Model trained in {}'.format(total_time))
    return grid

## Optimization

### Init

In [7]:
VERSION = '04'

# Pick the project root: Google Drive when running on Colab, otherwise the
# parent directory of the notebook.
try:
    import google.colab
    from google.colab import drive
except ImportError:
    # google.colab cannot be imported here, so this is treated as a local run.
    ROOT_PATH = '..'
    print('Working locally')
else:
    # Only the import is guarded: a failing mount (or an interrupt while
    # authorising) now surfaces instead of silently switching to the local path.
    drive.mount('/content/gdrive', force_remount=True)
    # NOTE(review): relative path; it resolves to the mount point only when
    # the working directory is /content. Confirm before changing directories.
    ROOT_PATH = 'gdrive/My Drive/Colab Notebooks/LANL' # project path in drive
    print('Working on google colab')

# where models and grids are saved
directories = [f'{ROOT_PATH}/models/{VERSION}', f'{ROOT_PATH}/grids/']
for directory in directories:
    os.makedirs(directory, exist_ok=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
Working on google colab


### Grid

In [0]:
def create_grid(params=None):
    """Build the configuration ("grid") for one hyper-parameter test.

    Every entry maps a name to a ``[type_tag, value]`` pair. Defaults are
    defined below and individual values can be overridden through ``params``.
    ``test_index`` is one more than the largest index already stored in
    ``<ROOT_PATH>/grids/grid_<VERSION>.pkl``, or 0 when that file is missing.

    Args:
        params: Optional dict ``{grid_key: value}`` of overrides. Every key
            must already exist in the grid.

    Returns:
        The grid dict. The output lists (``best_model_paths``,
        ``best_model_results``, ``fit_outs``) are fresh for every call.

    Raises:
        KeyError: If ``params`` contains a key that is not part of the grid.
    """
    # Avoid a shared mutable default; an omitted argument still stores {}.
    params = {} if params is None else params

    grid_file_path = '{}/grids/grid_{}.pkl'.format(ROOT_PATH, VERSION)

    if os.path.isfile(grid_file_path):
        # NOTE: unpickling runs arbitrary code; only load self-produced grids.
        grid_df = pd.read_pickle(grid_file_path)
        # Cast to a plain int: the column maximum may be a float or NumPy
        # scalar, which would otherwise leak into the model file names.
        test_index = int(grid_df['test_index'].max()) + 1
    else:
        test_index = 0

    # The builtin `int`/`float` replace the `np.int`/`np.float` aliases, which
    # were plain aliases of the builtins and were removed in NumPy 1.24.
    current_grid = {
        'version'                : ['str'     , VERSION],
        'params'                 : ['O'       , params],
        'test_index'             : [int       , test_index],
        'root_path'              : ['str'     , ROOT_PATH],

        # Model
        'avg_pool'               : ['bool'    , True],
        'avg_pool_size'          : [int       , 2],
        'conv_layers_count'      : [int       , 3],
        'filter_0'               : [int       , 8],

        'conv_0'                 : [int       , 8],
        'pool_0'                 : [int       , 16],
        'conv_1'                 : [int       , 16],
        'pool_1'                 : [int       , 8],
        'conv_2'                 : [int       , 32],
        'pool_2'                 : [int       , 2],

        'dense_size'             : [int       , 50],
        'gru'                    : ['bool'    , True],
        'gru_size'               : [int       , 50],

        # Metrics
        'best_model_metric'      : ['str'     , 'val_loss'],
        'training_metric'        : ['str'     , 'mean_squared_error'],
        'loss'                   : ['str'     , 'mean_squared_error'],

        'learning_rate'          : [float     , 0.001],

        # Training parameters
        'epochs'                 : [int       , 100],
        'batch_size'             : [int       , 32],
        'skf_n_splits'           : [int       , 5],

        # Outputs
        'best_model_paths'       : ['O'       , []],
        'best_model_results'     : ['O'       , []],
        'fit_outs'               : ['O'       , []]}

    # Reject typos up front with a readable message (same exception type as
    # the plain dict lookup would raise).
    unknown_keys = sorted(set(params) - set(current_grid))
    if unknown_keys:
        raise KeyError(f'Unknown grid parameter(s): {unknown_keys}')

    for key, value in params.items():
        current_grid[key][1] = value
    return current_grid

### Run

In [0]:
# load_data returns (X_train, X_dev, y_train, y_dev), so the names X_test and
# y_test below hold the dev split.
X_train, X_test, y_train, y_test = load_data({})

In [0]:
def test_params(next_point_to_probe, X_train, X_test, y_train, y_test):
    """Cross-validate one hyper-parameter point and return its score.

    The start and the end of the run are appended to ``<ROOT_PATH>/log.txt``.
    The score is computed from the per-fold ``val_loss`` histories: they are
    averaged epoch by epoch across folds, and the mean of the 10 smallest
    averaged values is returned.

    Args:
        next_point_to_probe: Dict of grid overrides handed to ``create_grid``.
        X_train: Features used for the cross-validated training.
        X_test: Not used by this function.
        y_train: Targets used for the cross-validated training.
        y_test: Not used by this function.

    Returns:
        The score described above.
    """
    log_file = f'{ROOT_PATH}/log.txt'

    with open(log_file, 'a') as log:
        log.write(f'{time.ctime()}  -  starting {next_point_to_probe}\n')
    print(f'Testing {next_point_to_probe}')

    trained_grid = run_training(X_train, y_train, create_grid(next_point_to_probe), verbose=True)

    # One val_loss curve per fold.
    fold_curves = []
    for fit_out in trained_grid['fit_outs'][1]:
        fold_curves.append(fit_out.history['val_loss'])

    # Average across folds (axis 0), then keep the 10 lowest epoch averages.
    epoch_averages = np.mean(fold_curves, 0)
    epoch_averages.sort()
    metric_result = epoch_averages[:10].mean()

    with open(log_file, 'a') as log:
        log.write(f'{time.ctime()}  -  finished {next_point_to_probe}\n')

    return metric_result

In [0]:
def get_next_point():
    """Draw one random hyper-parameter configuration for the random search.

    Sampled with NumPy's global random state, in this order:
      * ``loss``: 'mean_squared_error' or 'mean_absolute_error',
      * ``learning_rate``: 10 raised to a uniform exponent in [-5, -3),
      * ``avg_pool``: True or False, and only when it is True
        ``avg_pool_size``: integer in [2, 8),
      * ``conv_layers_count``: 2 or 3,
      * ``gru``: True or False,
      * ``gru_size`` and ``dense_size``: integers in [20, 100).

    Returns:
        Dict of the sampled values, keyed by grid parameter name.
    """
    rng = np.random

    point = {
        'loss': rng.choice(['mean_squared_error', 'mean_absolute_error']),
        'learning_rate': np.power(10, rng.uniform(-5, -3)),
        'avg_pool': rng.choice([True, False]),
    }

    # The pool size is only drawn when average pooling is switched on.
    if point['avg_pool']:
        point['avg_pool_size'] = rng.randint(2, 8)

    # Keyword arguments are evaluated left to right, which keeps the sampling
    # order (and the key order of the dict) fixed.
    point.update(
        conv_layers_count=rng.randint(2, 4),
        gru=rng.choice([True, False]),
        gru_size=rng.randint(20, 100),
        dense_size=rng.randint(20, 100),
    )
    return point

    
    
    

In [12]:
# Random search: probe 10 randomly drawn configurations one after another.
# Every (configuration, score) pair is kept, because `target` on its own is
# overwritten at each iteration and the scores would otherwise be lost.
random_search_results = []
for random_search_index in range(10):
    print(f'Starting test {random_search_index}')
    next_point_to_probe = get_next_point()

    target = test_params(next_point_to_probe, X_train, X_test, y_train, y_test)
    random_search_results.append((next_point_to_probe, target))

Starting test 0
Testing {'loss': 'mean_squared_error', 'learning_rate': 4.9624694077469894e-05, 'avg_pool': False, 'conv_layers_count': 3, 'gru': True, 'gru_size': 95, 'dense_size': 29}


W0715 06:40:33.785174 140408954263424 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.




CV validation 1 of 5


W0715 06:40:34.625317 140408954263424 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0715 06:40:34.626812 140408954263424 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0715 06:40:34.631552 140408954263424 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0715 06:40:34.677778 140408954263424 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0715 06:40:36.585258 140408954263424 deprecation_wrapper.py:119] From /us

[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 2 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 3 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 4 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 5 of 5
[####################]  eta: 00 hours, 00 min, 00 sec  --  Model trained in 00 hours, 26 min, 30 sec
Starting test 1
Testing {'loss': 'mean_squared_error', 'learning_rate': 4.802498102756505e-05, 'avg_pool': False, 'conv_layers_count': 2, 'gru': False, 'gru_size': 42, 'dense_size': 82}

CV validation 1 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 2 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 3 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 4 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 5 of 5
[####################]  eta: 00 hours, 00 min, 00 sec  --  Model trained in 00 hours, 20 mi

W0715 08:28:55.235006 140408954263424 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3980: The name tf.nn.avg_pool is deprecated. Please use tf.nn.avg_pool2d instead.



[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 2 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 3 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 4 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 5 of 5
[####################]  eta: 00 hours, 00 min, 00 sec  --  Model trained in 00 hours, 12 min, 25 sec
Starting test 5
Testing {'loss': 'mean_absolute_error', 'learning_rate': 0.00030478683507330866, 'avg_pool': False, 'conv_layers_count': 2, 'gru': True, 'gru_size': 43, 'dense_size': 61}

CV validation 1 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 2 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 3 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 4 of 5
[####################]  eta: 00 hours, 00 min, 00 sec
CV validation 5 of 5
[####################]  eta: 00 hours, 00 min, 00 sec  --  Model trained in 00 hours, 30 m

KeyboardInterrupt: ignored

In [0]:
 +