# Model Training
> To facilitate a more automated training procedure, the model training is moved to a standalone file.  
This keeps Keras much happier in terms of required restarts and memory usage.

In [1]:
%load_ext autoreload
%autoreload 2
from importlib import reload

import numpy as np
import time # !
import json
from matplotlib import pyplot as plt

from keras.utils import to_categorical

import glove_helper
from loadutils import conll2003Data, saveProcessedData
from common import vocabulary, utils


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#print("Tensorflow version:", tf.__version__)
#print("Keras version:", K.__version__)

In [3]:
# CoNLL-2003 English splits; all three live in the same data directory
CONLL_DIR = "../data/conll2003"
TRAIN_FILE = CONLL_DIR + "/eng.train"
DEV_FILE = CONLL_DIR + "/eng.testa"
TEST_FILE = CONLL_DIR + "/eng.testb"

# hyper parameters are handed to the training subprocess through this JSON file
HYPER_PARAM_FILE = "hyper_params.json"

# vocabulary size requested when building the vocab from the training set
VOCAB_SIZE = 20000

## Local helper utils

In [4]:
# local utils

# timeit decorator
def timeit(method):
    """
    Decorator that measures the wall-clock run time of `method`.

    If the call includes a `log_time` keyword argument (a dict), the elapsed
    time in whole milliseconds is stored in it under `log_name` (when given)
    or the upper-cased function name. Otherwise the timing is printed.

    Note: `log_time` / `log_name` are forwarded to `method` unchanged, so the
    wrapped function must accept them if the caller passes them.

    Returns: the wrapped function, which returns whatever `method` returns.
    """
    # local import keeps this cell self-contained
    from functools import wraps

    @wraps(method)  # preserve __name__ / __doc__ of the decorated function
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        elapsed_ms = (time.time() - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print ('%r  %2.2f ms' % (method.__name__, elapsed_ms))
        return result
    return timed

In [14]:
def construct_embedding_matrix(embed_dim, vocab_size):
    """
    construct embedding matrix from GloVe 6Bn word data

    reuse glove_helper code from w266

    NOTE: reads the notebook-level global `vocabData` (its `.vocab`), so the
    data loading cell must have been run first.

    embed_dim  - dimension of the GloVe vectors to load (passed to glove_helper.Hands)
    vocab_size - number of rows in the returned matrix; must be at least
                 vocabData.vocab.size, otherwise the row assignment raises IndexError

    Returns: an embedding matrix directly plugged into keras.layers.Embedding(weights=[embedding_matrix])
    """
    reload(glove_helper)
    hands = glove_helper.Hands(ndim=embed_dim)
    embedding_matrix = np.zeros((vocab_size, embed_dim))

    vocab = vocabData.vocab
    for i in range(vocab.size):
        word = vocab.ids_to_words([i])[0]
        try:
            embedding_vector = hands.get_vector(word)
        except Exception:
            # presumably raised when the word has no GloVe entry (verify against
            # glove_helper); fall back to the "<unk>" vector. Catching Exception
            # rather than a bare except lets KeyboardInterrupt / SystemExit through.
            embedding_vector = hands.get_vector("<unk>")
        embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [6]:
def plot_history( history):
    """
    Show two figures for a training run: accuracy, then loss.

    history - object whose `.history` dict holds the per-epoch series
              'acc', 'val_acc', 'loss' and 'val_loss'

    Each figure plots the training series and the validation series against
    the epoch index. The validation curve is labelled 'test' in the legend.
    """
    # (training key, validation key, figure title, y-axis label)
    figures = (
        ('acc', 'val_acc', 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )
    for train_key, val_key, title, y_label in figures:
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

## Load the Data

In [7]:
# UPDATES!

windowLength = 9
#testNumSents = 20000

def _to_windows( sentences):
    """Format `sentences` with vocabData.formatWindowedData using the shared
    window length; returns the 4-tuple (X, X_pos, X_capitals, Y)."""
    return vocabData.formatWindowedData( sentences,
                                         windowLength=windowLength,
                                         verbose=False)

# The vocabulary is built from the training set only
vocabData = conll2003Data(TRAIN_FILE)
vocabData.buildVocab( vocabSize=VOCAB_SIZE)

# training windows
trainX, trainX_pos, trainX_capitals, trainY = _to_windows( vocabData.train_sentences)

# dev windows
devSents = vocabData.readFile( DEV_FILE)
devX, devX_pos, devX_capitals, devY = _to_windows( devSents)

# test windows
testSents = vocabData.readFile( TEST_FILE)
testX, testX_pos, testX_capitals, testY = _to_windows( testSents)

----------------------------------------------------
reading file from path ../data/conll2003/eng.train
'readFile'  1161.07 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  1066.03 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  1892.75 ms
----------------------------------------------------
reading file from path ../data/conll2003/eng.testa
'readFile'  255.94 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  427.06 ms
----------------------------------------------------
reading file from path ../data/conll2003/eng.testb
'readFile'  235.91 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  528.62 ms


In [8]:
# Get Y

# encoding 1-hot for ner targets; dev/test reuse the class count seen in training
trainY_cat = to_categorical(trainY.astype('float32'))
devY_cat = to_categorical(devY.astype('float32'), num_classes=trainY_cat.shape[1])
testY_cat = to_categorical(testY.astype('float32'), num_classes=trainY_cat.shape[1])

# Drop the first 3 one-hot columns (presumably the reserved special-token ids -
# TODO confirm against loadutils) and store as float64.
# np.float was removed in NumPy 1.24, so the explicit np.float64 is used instead;
# a single slice replaces the per-row map/lambda rebuild.
trainY_cat = trainY_cat[:, 3:].astype(np.float64)
devY_cat = devY_cat[:, 3:].astype(np.float64)
testY_cat = testY_cat[:, 3:].astype(np.float64)

In [9]:
# Get X pos tags

# encoding 1-hot for pos tags; dev/test reuse the class count seen in training
trainX_pos_cat = to_categorical(trainX_pos.astype('float32'))
devX_pos_cat = to_categorical(devX_pos.astype('float32'), num_classes=trainX_pos_cat.shape[2])
testX_pos_cat = to_categorical(testX_pos.astype('float32'), num_classes=trainX_pos_cat.shape[2])

# Drop the first 3 one-hot columns on the last axis (presumably the reserved
# special-token ids - TODO confirm against loadutils) and store as float64.
# np.float was removed in NumPy 1.24, so the explicit np.float64 is used instead;
# a single slice replaces the per-window map/lambda rebuild.
trainX_pos_cat = trainX_pos_cat[:, :, 3:].astype(np.float64)
devX_pos_cat = devX_pos_cat[:, :, 3:].astype(np.float64)
testX_pos_cat = testX_pos_cat[:, :, 3:].astype(np.float64)

In [10]:
# Get X capitalization

# encoding 1-hot for capitalization info  ("allCaps", "upperInitial", "lowercase", "mixedCaps", "noinfo")
# dev/test reuse the class count seen in training
trainX_capitals_cat = to_categorical(trainX_capitals.astype('float32'))
devX_capitals_cat = to_categorical(devX_capitals.astype('float32'), num_classes=trainX_capitals_cat.shape[2])
testX_capitals_cat = to_categorical(testX_capitals.astype('float32'), num_classes=trainX_capitals_cat.shape[2])

# Drop the first 3 one-hot columns on the last axis (presumably the reserved
# special-token ids - TODO confirm against loadutils) and store as float64.
# np.float was removed in NumPy 1.24, so the explicit np.float64 is used instead;
# a single slice replaces the per-window map/lambda rebuild.
trainX_capitals_cat = trainX_capitals_cat[:, :, 3:].astype(np.float64)
devX_capitals_cat = devX_capitals_cat[:, :, 3:].astype(np.float64)
testX_capitals_cat = testX_capitals_cat[:, :, 3:].astype(np.float64)

## Set up model parameters

In [11]:
# Hyper parameters for the CAPSNET model.
# The dict is serialized to JSON and read by the training script, so every
# value must stay JSON-serializable.
hyper_param_caps = dict(
    # input dimensions, read off the prepared data
    max_features=vocabData.vocab.size,  # vocab size (VOCAB_SIZE requested; recorded run shows 17748)
    maxlen=trainX.shape[1],  # window size (9)
    poslen=trainX_pos_cat.shape[2],  # pos classes (45)
    capitallen=trainX_capitals_cat.shape[2],  # capitalization classes (5)
    ner_classes=trainY_cat.shape[1],  # 8
    embed_dim=50,  # word embedding size
    num_routing=3,

    # embedding / input feature switches
    use_glove=True,
    allow_glove_retrain=False,
    use_pos_tags=True,
    use_capitalization_info=True,

    # first convolution
    conv1_filters=256,
    conv1_kernel_size=3,
    conv1_strides=1,
    conv1_padding='valid',

    # primary capsule layer
    use_2D_primarycaps=False,
    primarycaps_dim_capsule=8,
    primarycaps_n_channels=32,
    primarycaps_kernel_size=3,
    primarycaps_strides=1,
    primarycaps_padding='valid',

    # NER capsule layer
    ner_capsule_dim=16,

    num_dynamic_routing_passes=3,

    # decoder is still work in progress
    use_decoder=False,
    decoder_feed_forward_1=100,
    decoder_feed_forward_2=100,

    # training setup
    save_dir='./result',
    batch_size=100,
    debug=2,
    epochs=5,
    stopping_patience=5,  # default to same as epochs, ie don't use
    dropout_p=0.25,
    embed_dropout=0.25,
    lam_recon=0.0005,

    optimizer='Adam',  # or 'SGD'
)

In [12]:
# Hyper parameters for the CNN model.
# The dict is serialized to JSON and read by the training script, so every
# value must stay JSON-serializable.
hyper_param_cnn = dict(
    # input dimensions, read off the prepared data
    max_features=vocabData.vocab.size,  # vocab size (VOCAB_SIZE requested; recorded run shows 17748)
    maxlen=trainX.shape[1],  # window size (9)
    poslen=trainX_pos_cat.shape[2],  # pos classes (45)
    capitallen=trainX_capitals_cat.shape[2],  # capitalization classes (5)
    ner_classes=trainY_cat.shape[1],  # 8
    embed_dim=50,  # word embedding size
    num_routing=3,

    # embedding / input feature switches
    use_glove=True,
    allow_glove_retrain=False,
    use_pos_tags=True,
    use_capitalization_info=True,

    # convolution stack
    conv1_filters=256,
    conv1_kernel_size=3,
    conv1_strides=1,
    conv1_padding='valid',

    conv2_filters=256,
    conv2_kernel_size=3,
    conv2_strides=1,
    conv2_padding='valid',

    conv3_filters=128,
    conv3_kernel_size=3,
    conv3_strides=1,
    conv3_padding='valid',

    # pooling
    max_pooling_size=3,
    max_pooling_strides=1,
    max_pooling_padding='valid',
    maxpool_dropout=0.3,

    # feed-forward head
    feed_forward_1=328,
    ff1_dropout=0.3,
    feed_forward_2=192,
    ff2_dropout=0.3,

    # training setup
    save_dir='./result',
    batch_size=100,
    debug=2,
    epochs=5,
    stopping_patience=5,  # default to same as epochs, ie don't use
    dropout_p=0.25,
    embed_dropout=0.25,  # set to 0 to disable dropout
    lam_recon=0.0005,

    optimizer='Adam',  # or 'SGD'
    # loss_function=margin_loss,  # constructed loss function see margin_loss() in this notebook
)

## Build Glove Embeddings Matrix and Save All Data to Disk

In [13]:
# Build the GloVe embedding matrix: one row per vocabulary id,
# one column per embedding dimension
embedding_matrix = construct_embedding_matrix(
    embed_dim=hyper_param_caps['embed_dim'],
    vocab_size=hyper_param_caps['max_features'],
)

Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.50d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 50))
17748


In [15]:
# Persist the prepared arrays so the training process can load them.
# Arguments are positional - keep this order.
saveProcessedData(
    trainX,
    trainX_capitals_cat,
    trainX_pos_cat,
    devX,
    devX_capitals_cat,
    devX_pos_cat,
    trainY_cat,
    devY_cat,
    embedding_matrix,
)

## Model Training Functions

In [16]:
@timeit
def trainModelSP( testFunc, modelName, hyper_params, embed_matrix=None, verbose=False):
    """
    Train one model in a separate Python process.

    The hyper parameters are written to HYPER_PARAM_FILE as JSON, then
    `testFunc` is run as `python testFunc modelName HYPER_PARAM_FILE`.
    The child's stdout and stderr are echoed line by line as a sign of life.

    testFunc - the name of the python file to run
    modelName - the internal name (ID) of the model to train
    hyper_params - a dict of hyper parameters (must be JSON-serializable)
    embed_matrix - currently unused; kept for interface compatibility
    verbose - currently unused; kept for interface compatibility

    A non-zero exit status of the child is reported with a warning instead of
    an exception, so a failed run does not abort the remaining runs of a sweep.
    """
    # local imports keep this cell self-contained
    import subprocess
    import sys

    # save the hyperparams
    with open(HYPER_PARAM_FILE, mode='w') as fp:
        json.dump( hyper_params, fp)

    # Run the training script with the interpreter that runs this kernel.
    # An argument list (no shell) avoids quoting problems with the names,
    # and "-u" keeps the child's output unbuffered so progress shows up promptly.
    command = [sys.executable, "-u", str(testFunc), str(modelName), HYPER_PARAM_FILE]
    with subprocess.Popen( command,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT,
                           universal_newlines=True) as proc:
        for line in proc.stdout:
            print(line, end='')

    # leaving the `with` block waits for the child, so returncode is set here
    if proc.returncode != 0:
        print("WARNING: training of %r exited with status %d" % (modelName, proc.returncode))


In [17]:
@timeit
def testFeatures( testFunc, modelName, hyper_params):
    """
    builds and trains 4 models for the configuration in hyper_params,
    1 for each input feature configuration: base, pos, caps, pos + caps

    testFunc - the name of the python file to run
    modelName - the model name to use for labeling; a suffix
                ("_base", "_pos", "_caps", "_pos_caps") is appended per run
    hyper_params - a dict of hyper parameters; it is copied, so the caller's
                   dict is not modified

    Both feature flags are set explicitly for every run, including "_base",
    so the baseline is feature-free whatever flags the caller passed in.
    """
    hypers = hyper_params.copy()

    # (model name suffix, use_pos_tags, use_capitalization_info), in run order
    feature_configs = (
        ("_base", False, False),
        ("_pos", True, False),
        ("_caps", False, True),
        ("_pos_caps", True, True),
    )

    for suffix, use_pos, use_caps in feature_configs:
        hypers['use_pos_tags'] = use_pos
        hypers['use_capitalization_info'] = use_caps
        trainModelSP( testFunc, modelName + suffix, hypers )
    

## Still need:
> a function to read in each historylog.csv file, optionally plot, then collect the best scoring epoch (lowest loss? - discuss, add F1?) to determine the best model and how long it trained.

### 1D Primary Caps Layer Training
> I know the output isn't pretty, but we don't really need it since everything is stored in the history log... It is really just to show a sign of life.

In [18]:
# test changing vocab size?...will have to rerun all tests for each size. same for word embedding size. is ok.
# change embedding sizes
# primary caps conv1D kernel size - play with it!!!
# - changing dropout rate will be perhaps in hyperparam tuning

In [None]:
# CONSIDER WRAPPING IN A FUNCTION... PROS AND CONS...

# capsnet training script
testFunc = "trainCapsModel.py"

hypers = hyper_param_caps.copy()
hypers.update({
    'epochs': 50,
    'stopping_patience': 5,
    'use_pos_tags': False,
    'use_capitalization_info': False,
})

# One entry per embedding setup, in run order:
# (printed heading, model name, hyper parameter overrides).
# Overrides accumulate: a key not listed keeps its value from the previous run.
embedding_runs = [
    ("Learn Embeddings", "learn",
     {'use_glove': False, 'embed_dropout': 0.0}),
    ("Learn Embeddings and Dropout", "learn_dropout",
     {'use_glove': False, 'embed_dropout': 0.25}),
    ("Glove Embeddings", "glove_nolearn",
     {'use_glove': True, 'allow_glove_retrain': False, 'embed_dropout': 0.0}),
    ("Glove Embeddings and Dropout", "glove_nolearn_dropout",
     {'use_glove': True, 'allow_glove_retrain': False, 'embed_dropout': 0.25}),
    ("Glove Embeddings with Learning", "glove_learn",
     {'use_glove': True, 'allow_glove_retrain': True, 'embed_dropout': 0.0}),
    ("Glove Embeddings with Learning and Dropout", "glove_learn_dropout",
     {'use_glove': True, 'allow_glove_retrain': True, 'embed_dropout': 0.25}),
]

for heading, runName, overrides in embedding_runs:
    print("\n\n" + heading)
    hypers.update(overrides)
    testFeatures( testFunc, runName, hypers)





Learn Embeddings
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
x (?, 9)
embed (?, 9, 50)
embed (?, 9, 50)
conv1 (?, 7, 256)
primarycaps (?, ?, 8)
ner_caps (?, 8, 16)
out_pred (?, 8)

Training Model: learn_base
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
x (InputLayer)               (None, 9)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 9, 50)             887400    
_________________________________________________________________
conv1 (Conv1D)               (None, 7, 256)            38656     
_________________________________________________________________
primarycap_conv1d (Conv1D)   (None, 5, 256)            196864    
_________________________________________________________________
primarycap_reshape (Reshape) (None, 160, 8)            0         
_________________

Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.01995, saving model to ./result/weights-01.h5
Epoch 2/50

Epoch 00002: val_loss did not improve
Epoch 3/50

Epoch 00003: val_loss did not improve
Epoch 4/50

Epoch 00004: val_loss did not improve
Epoch 5/50

Epoch 00005: val_loss did not improve
Epoch 6/50

Epoch 00006: val_loss did not improve
Epoch 00006: early stopping
'trainModelSP'  775470.15 ms
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
x (?, 9)
x_capital (?, 9, 5)
embed (?, 9, 55)
embed (?, 9, 55)
conv1 (?, 7, 256)
primarycaps (?, ?, 8)
ner_caps (?, 8, 16)
out_pred (?, 8)

Training Model: learn_caps
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x (InputLayer)                  (None, 9)            0                                            
___________________________________

Train on 203621 samples, validate on 51362 samples
2018-04-12 00:26:19.632773: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2018-04-12 00:26:19.705135: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2018-04-12 00:26:19.705541: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 0000:00:04.0
totalMemory: 11.17GiB freeMemory: 11.09GiB
2018-04-12 00:26:19.705585: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7)
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.01691, saving model to

out_pred (Length)               (None, 8)            0           nercaps[0][0]                    
Total params: 1,321,320
Trainable params: 1,321,320
Non-trainable params: 0
__________________________________________________________________________________________________
Train on 203621 samples, validate on 51362 samples
2018-04-12 00:58:28.996657: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2018-04-12 00:58:29.068441: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2018-04-12 00:58:29.068843: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 0000:00:04.0
totalMemory: 11.17GiB freeMemory: 11.09GiB
2018-04-12 00:58


Epoch 00005: val_loss did not improve
Epoch 6/50

Epoch 00006: val_loss did not improve
Epoch 7/50

Epoch 00007: val_loss did not improve
Epoch 00007: early stopping
'trainModelSP'  927465.92 ms
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
x (?, 9)
x_pos (?, 9, 45)
x_capital (?, 9, 5)
embed (?, 9, 100)
embed (?, 9, 100)
conv1 (?, 7, 256)
primarycaps (?, ?, 8)
ner_caps (?, 8, 16)
out_pred (?, 8)

Training Model: learn_dropout_pos_caps
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x (InputLayer)                  (None, 9)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 9, 50)        887400      x[0][0]                          
____________________

Train on 203621 samples, validate on 51362 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.03897, saving model to ./result/weights-01.h5
Epoch 2/50

Epoch 00002: val_loss improved from 0.03897 to 0.03421, saving model to ./result/weights-02.h5
Epoch 3/50

Epoch 00003: val_loss did not improve
Epoch 4/50

Epoch 00004: val_loss improved from 0.03421 to 0.03348, saving model to ./result/weights-04.h5
Epoch 5/50

Epoch 00005: val_loss did not improve
Epoch 6/50

Epoch 00006: val_loss improved from 0.03348 to 0.03279, saving model to ./result/weights-06.h5
Epoch 7/50

Epoch 00007: val_loss did not improve
Epoch 8/50

Epoch 00008: val_loss did not improve
Epoch 9/50

Epoch 00009: val_loss did not improve
Epoch 10/50

Epoch 00010: val_loss did not improve
Epoch 11/50

Epoch 00011: val_loss did not improve
Epoch 00011: early stopping
'trainModelSP'  1395999.14 ms
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-04-12 02:07:59.1297


Epoch 00010: val_loss did not improve
Epoch 11/50

Epoch 00011: val_loss did not improve
Epoch 12/50

Epoch 00012: val_loss did not improve
Epoch 00012: early stopping
'trainModelSP'  1407255.18 ms
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
2018-04-12 02:31:26.386825: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2018-04-12 02:31:26.463062: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2018-04-12 02:31:26.463410: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 0000:00:04.0
totalMemory: 11.17GiB freeMemory: 11.09GiB
2018-04-12 02:31:26.463441: I tensorflow/core/commo


Training Model: glove_nolearn_pos_caps
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x (InputLayer)                  (None, 9)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 9, 50)        887400      x[0][0]                          
__________________________________________________________________________________________________
x_pos (InputLayer)              (None, 9, 45)        0                                            
__________________________________________________________________________________________________
x_capital (InputLayer)          (None, 9, 5)         0                                            
_____________________________________________________________________


Epoch 00003: val_loss improved from 0.03854 to 0.03720, saving model to ./result/weights-03.h5
Epoch 4/50

Epoch 00004: val_loss improved from 0.03720 to 0.03318, saving model to ./result/weights-04.h5
Epoch 5/50

Epoch 00005: val_loss improved from 0.03318 to 0.03211, saving model to ./result/weights-05.h5
Epoch 6/50

Epoch 00006: val_loss improved from 0.03211 to 0.03204, saving model to ./result/weights-06.h5
Epoch 7/50

Epoch 00007: val_loss improved from 0.03204 to 0.03197, saving model to ./result/weights-07.h5
Epoch 8/50

Epoch 00008: val_loss did not improve
Epoch 9/50

Epoch 00009: val_loss did not improve
Epoch 10/50

Epoch 00010: val_loss improved from 0.03197 to 0.02995, saving model to ./result/weights-10.h5
Epoch 11/50

Epoch 00011: val_loss did not improve
Epoch 12/50

Epoch 00012: val_loss did not improve
Epoch 13/50

Epoch 00013: val_loss did not improve
Epoch 14/50

Epoch 00014: val_loss improved from 0.02995 to 0.02989, saving model to ./result/weights-14.h5
Epoch 1

Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.02483, saving model to ./result/weights-01.h5
Epoch 2/50

Epoch 00002: val_loss improved from 0.02483 to 0.02129, saving model to ./result/weights-02.h5
Epoch 3/50

Epoch 00003: val_loss improved from 0.02129 to 0.02084, saving model to ./result/weights-03.h5
Epoch 4/50

Epoch 00004: val_loss improved from 0.02084 to 0.01861, saving model to ./result/weights-04.h5
Epoch 5/50

Epoch 00005: val_loss did not improve
Epoch 6/50

Epoch 00006: val_loss improved from 0.01861 to 0.01835, saving model to ./result/weights-06.h5
Epoch 7/50

Epoch 00007: val_loss did not improve
Epoch 8/50

Epoch 00008: val_loss improved from 0.01835 to 0.01766, saving model to ./result/weights-08.h5
Epoch 9/50

Epoch 00009: val_loss did not improve
Epoch 10/50

Epoch 00010: val_loss did not improve
Epoch 11/50

Epoch 00011: val_loss did not improve
Epoch 12/50

Epoch 00012: val_loss improved from 0.01766 to 0.01694, saving model to ./result/weights-12.h5
Ep

Epoch 3/50

Epoch 00003: val_loss improved from 0.01854 to 0.01802, saving model to ./result/weights-03.h5
Epoch 4/50

Epoch 00004: val_loss improved from 0.01802 to 0.01684, saving model to ./result/weights-04.h5
Epoch 5/50

Epoch 00005: val_loss improved from 0.01684 to 0.01616, saving model to ./result/weights-05.h5
Epoch 6/50

Epoch 00006: val_loss improved from 0.01616 to 0.01559, saving model to ./result/weights-06.h5
Epoch 7/50

Epoch 00007: val_loss did not improve
Epoch 8/50

Epoch 00008: val_loss improved from 0.01559 to 0.01537, saving model to ./result/weights-08.h5
Epoch 9/50

Epoch 00009: val_loss improved from 0.01537 to 0.01465, saving model to ./result/weights-09.h5
Epoch 10/50

Epoch 00010: val_loss did not improve
Epoch 11/50

Epoch 00011: val_loss did not improve
Epoch 12/50

Epoch 00012: val_loss did not improve
Epoch 13/50

Epoch 00013: val_loss did not improve
Epoch 14/50

Epoch 00014: val_loss improved from 0.01465 to 0.01394, saving model to ./result/weights-14


Epoch 00001: val_loss improved from inf to 0.02030, saving model to ./result/weights-01.h5
Epoch 2/50

Epoch 00002: val_loss improved from 0.02030 to 0.01659, saving model to ./result/weights-02.h5
Epoch 3/50

Epoch 00003: val_loss improved from 0.01659 to 0.01645, saving model to ./result/weights-03.h5
Epoch 4/50

Epoch 00004: val_loss improved from 0.01645 to 0.01533, saving model to ./result/weights-04.h5
Epoch 5/50

Epoch 00005: val_loss did not improve
Epoch 6/50

Epoch 00006: val_loss did not improve
Epoch 7/50

Epoch 00007: val_loss improved from 0.01533 to 0.01392, saving model to ./result/weights-07.h5
Epoch 8/50

Epoch 00008: val_loss did not improve
Epoch 9/50

Epoch 00009: val_loss improved from 0.01392 to 0.01389, saving model to ./result/weights-09.h5
Epoch 10/50

Epoch 00010: val_loss did not improve
Epoch 11/50

Epoch 00011: val_loss did not improve
Epoch 12/50

Epoch 00012: val_loss improved from 0.01389 to 0.01368, saving model to ./result/weights-12.h5
Epoch 13/50



I wonder if POS tags could be more useful with another representation (besides 1-hot encodings). I expected them to rock. Are they too sparse? Do they make the input too large for the network? Check the paper for clues...

In [None]:
# Repeat the embedding sweep with SGD instead of Adam. Maybe Adam is too aggressive?
# (use_2D_primarycaps is not touched here, so it keeps its hyper_param_caps value)

# capsnet training script
testFunc = "trainCapsModel.py"

hypers = hyper_param_caps.copy()
hypers['optimizer'] = "SGD"
print("Training with SGD - Nesterov Momentum Optimizer")

hypers.update({
    'epochs': 50,
    'stopping_patience': 5,
    'use_pos_tags': False,
    'use_capitalization_info': False,
})

# One entry per embedding setup, in run order:
# (printed heading, model name, hyper parameter overrides).
# Overrides accumulate: a key not listed keeps its value from the previous run.
embedding_runs = [
    ("Learn Embeddings", "SGD_primcaps_learn",
     {'use_glove': False, 'embed_dropout': 0.0}),
    ("Learn Embeddings and Dropout", "SGD_primcaps_learn_dropout",
     {'use_glove': False, 'embed_dropout': 0.25}),
    ("Glove Embeddings", "SGD_primcaps_glove_nolearn",
     {'use_glove': True, 'allow_glove_retrain': False, 'embed_dropout': 0.0}),
    ("Glove Embeddings and Dropout", "SGD_primcaps_glove_nolearn_dropout",
     {'use_glove': True, 'allow_glove_retrain': False, 'embed_dropout': 0.25}),
    ("Glove Embeddings with Learning", "SGD_primcaps_glove_learn",
     {'use_glove': True, 'allow_glove_retrain': True, 'embed_dropout': 0.0}),
    ("Glove Embeddings with Learning and Dropout", "SGD_primcaps_glove_learn_dropout",
     {'use_glove': True, 'allow_glove_retrain': True, 'embed_dropout': 0.25}),
]

for heading, runName, overrides in embedding_runs:
    print("\n\n" + heading)
    hypers.update(overrides)
    testFeatures( testFunc, runName, hypers)

In [None]:
# 50% embedding dropout sweep, first with SGD and then with Adam.
# NOTE: `testFunc` is not set in this cell; it must already be defined by an
# earlier training cell.
hypers = hyper_param_caps.copy()
hypers.update({
    'epochs': 60,
    'stopping_patience': 10,  # more dropout... let it go longer
    'use_pos_tags': False,
    'use_capitalization_info': False,
})

# Run entries are (printed heading, model name, hyper parameter overrides).
# Overrides accumulate: a key not listed keeps its value from the previous run.
sgd_runs = [
    ("Glove Embeddings and Dropout at 50%", "SGD_primcaps_glove_nolearn_dropout_05",
     {'use_glove': True, 'allow_glove_retrain': False, 'embed_dropout': 0.5}),
    ("Glove Embeddings with Learning and Dropout at 50%", "SGD_primcaps_glove_learn_dropout_05",
     {'use_glove': True, 'allow_glove_retrain': True, 'embed_dropout': 0.5}),
    ("Learn Embeddings and Dropout at 50%", "SGD_primcaps_learn_dropout_05",
     {'use_glove': False, 'embed_dropout': 0.5}),
]
adam_runs = [
    ("Learn Embeddings and Dropout at 50%", "learn_dropout_05",
     {'use_glove': False, 'embed_dropout': 0.5}),
    ("Glove Embeddings and Dropout at 50%", "glove_nolearn_dropout_05",
     {'use_glove': True, 'allow_glove_retrain': False, 'embed_dropout': 0.5}),
    ("Glove Embeddings with Learning and Dropout at 50%", "glove_learn_dropout_05",
     {'use_glove': True, 'allow_glove_retrain': True, 'embed_dropout': 0.5}),
]

# (optimizer name, printed banner, runs) - SGD first, then back to Adam
optimizer_schedule = [
    ("SGD", "Training with SGD - Nesterov Momentum Optimizer", sgd_runs),
    ("Adam", "Training with Adam", adam_runs),
]

for optimizerName, banner, runs in optimizer_schedule:
    hypers['optimizer'] = optimizerName
    print(banner)
    for heading, runName, overrides in runs:
        print("\n\n" + heading)
        hypers.update(overrides)
        testFeatures( testFunc, runName, hypers)

In [123]:
#testModel( draw_capsnet_model, hyper_param_caps)
#testModel( capsmodel, hyper_param_caps)

### 2D Primary Caps Layer
>**NOT YET ATTEMPTED!**  
* try one first, see if it even trains...  
* may need a new set of hypers to get it working/training

In [188]:
# Repeat the embedding sweep with the 2D primary caps layer switched on.
# Only 1 epoch per model here.

# capsnet training script
testFunc = "trainCapsModel.py"

hypers = hyper_param_caps.copy()
hypers.update({
    'use_2D_primarycaps': True,
    'epochs': 1,
    'stopping_patience': 1,
    'use_pos_tags': False,
    'use_capitalization_info': False,
})

# One entry per embedding setup, in run order:
# (printed heading, model name, hyper parameter overrides).
# Overrides accumulate: a key not listed keeps its value from the previous run.
embedding_runs = [
    ("Learn Embeddings", "2D_primcaps_learn",
     {'use_glove': False, 'embed_dropout': 0.0}),
    ("Learn Embeddings and Dropout", "2D_primcaps_learn_dropout",
     {'use_glove': False, 'embed_dropout': 0.25}),
    ("Glove Embeddings", "2D_primcaps_glove_nolearn",
     {'use_glove': True, 'allow_glove_retrain': False, 'embed_dropout': 0.0}),
    ("Glove Embeddings and Dropout", "2D_primcaps_glove_nolearn_dropout",
     {'use_glove': True, 'allow_glove_retrain': False, 'embed_dropout': 0.25}),
    ("Glove Embeddings with Learning", "2D_primcaps_glove_learn",
     {'use_glove': True, 'allow_glove_retrain': True, 'embed_dropout': 0.0}),
    ("Glove Embeddings with Learning and Dropout", "2D_primcaps_glove_learn_dropout",
     {'use_glove': True, 'allow_glove_retrain': True, 'embed_dropout': 0.25}),
]

for heading, runName, overrides in embedding_runs:
    print("\n\n" + heading)
    hypers.update(overrides)
    testFeatures( testFunc, runName, hypers)



x (?, 9)
x_pos (?, 9, 45)
x_capital (?, 9, 5)
embed (?, 9, 100)
embed (?, 9, 100)
conv1 (?, 7, 256, 1)
primarycaps (?, ?, 8)
ner_caps (?, 8, 16)
out_pred (?, 8)


In [28]:
# baseline thoughts... we want the cnn to be the BEST. we want to compare our results to state of the art.