In [1]:
from utils import *
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import itertools


Using TensorFlow backend.


In [2]:
train_file = 'data/train.csv'
test_file = 'data/test.csv'
glove_file = 'wordvectors/glove.6B.300d.txt'
unk_char = '<UNK>'
pad_char = '<PAD>'


In [3]:
def get_train_data(train_file, word_to_index, max_len=None):
    sentence_pair, is_duplicated, sequence_length = read_train_data(train_file)
    sent_pair = filter_unknown(sentence_pair, word_to_index, unk_char)
    if max_len:
        sent_pair = pad(sent_pair, pad_char, max_len)
    
    return word2index(sent_pair, word_to_index), np.array(is_duplicated), np.array(sequence_length)

def get_test_data(test_file, word_to_index):
    sentence_pair, sequence_length = read_test_data(test_file)
    sent_pair = filter_unknown(sentence_pair, word_to_index, unk_char)
    
    return word2index(sent_pair, word_to_index), np.array(sequence_length)


def get_embedding_matrix(glove_file, unk_char, pad_char):
    word_to_index, index_to_word, emb_matrix = read_glove_vecs(glove_file)

    word_to_index[unk_char] = len(word_to_index)
    word_to_index[pad_char] = len(word_to_index)

    index_to_word[len(word_to_index) - 2] = unk_char
    index_to_word[len(word_to_index) - 1] = pad_char

    emb_dim = emb_matrix.shape[1]
    emb_matrix = np.append(emb_matrix, [[0] * emb_dim] * 2, axis=0)

    return emb_matrix, word_to_index, index_to_word

In [4]:
emb_matrix, word_to_index, index_to_word = get_embedding_matrix(glove_file, unk_char, pad_char)
print('emb_matrix shape: {}'.format(emb_matrix.shape))

emb_matrix shape: (400002, 300)


In [5]:
# max_len = 20
# x_train, y_train, train_sequlence_length = get_train_data(train_file, word_to_index, max_len=max_len)
# np.save('data/x_train_pad.npy', x_train)
# np.save('data/y_train.npy', y_train)
# np.save('data/train_seq_len.npy', train_sequlence_length)
# print(len(x_train))

In [6]:
# x_test, test_sequence_length = get_test_data(test_file, word_to_index)
# np.save('data/test.npy', x_test)
# np.save('data/test_seq_len.npy', test_sequence_length)
# print(len(x_test))

In [7]:
import tensorflow as tf
from tensorflow.keras.backend import binary_crossentropy
from tensorflow.contrib.layers import xavier_initializer
from tensorflow.nn import bidirectional_dynamic_rnn, embedding_lookup, dropout
from tensorflow.contrib.rnn import LSTMCell, MultiRNNCell
from tqdm import tqdm_notebook

class QuestionPairDuplicated:
    def __init__(self, emb_matrix, learning_rate, emb_trainable=False):
        tf.reset_default_graph()
        self.learning_rate = learning_rate
                
        # input
        self.input_sentA = tf.placeholder(tf.int32, shape=[None, None])  # (batch_size, time_step)
        self.input_sentB = tf.placeholder(tf.int32, shape=[None, None])  # (batch_size, time_step) 
        self.input_seq_lenA = tf.placeholder(tf.int32, shape=[None, ])  # (batch_size, )
        self.input_seq_lenB = tf.placeholder(tf.int32, shape=[None, ])  # (batch_size, )
        
        # output
        self.target = tf.placeholder(dtype=tf.float32, shape=[None, ])  # (batch_size, )

        # embedding matrix
        self.embedding_matrix = tf.get_variable(shape=emb_matrix.shape, 
                                    initializer=tf.constant_initializer(emb_matrix, dtype=tf.float32),
                                    dtype=tf.float32,
                                    trainable=emb_trainable,
                                    name='embeddings_matrix')
        
        
    def embedding_layer(self, sequence):
        return embedding_lookup(self.embedding_matrix, sequence)  # (batch_size, time_step, emb_dim)
    
    
    def bilstm(self, sequence, sequence_length, lstm_unit, reuse=None):
        with tf.variable_scope('BiLSTM', reuse=reuse, dtype=tf.float32):
            cell_fw = LSTMCell(num_units=lstm_unit, reuse=tf.get_variable_scope().reuse)
            cell_bw = LSTMCell(num_units=lstm_unit, reuse=tf.get_variable_scope().reuse)
            
        ((output_fw, output_bw), _) = bidirectional_dynamic_rnn(cell_fw, cell_bw, sequence, dtype=tf.float32, sequence_length=sequence_length)
        
        return tf.concat([output_fw, output_bw], axis=2)  # (batch_size, num_step, lstm_unit * 2)
    
    
    def lstm(self, sequence, sequence_length, lstm_unit, n_layers=1, reuse=None):
        with tf.variable_scope('LSTM', reuse=reuse, dtype=tf.float32):
            cell = tf.contrib.rnn.LSTMCell(num_units=lstm_unit, activation='tanh', reuse=tf.get_variable_scope().reuse)
#             cell = MultiRNNCell([cell] * n_layers)

        _, state = tf.nn.dynamic_rnn(cell, sequence, dtype=tf.float32, sequence_length=sequence_length)
        return state[1]  # (batch_size, lstm_unit)
    
    
    def manhattan_distance(self, vecA, vecB):
        # exp(-||h1 - h2||)
        diff = tf.reduce_sum(tf.abs(tf.subtract(vecA, vecB)), axis=1)  # (batch_size, )
        return tf.exp(-diff)
    
    
    def loss_function(self, output):
        ## MSE
        diff = tf.subtract(self.target, output) # (batch_size, )
        return tf.reduce_mean(tf.square(diff)) # (1, )
    
    
    def build(self, lstm_unit=256, hidden_unit=16, output_unit=1, encoder='lstm'):
        word_embA = self.embedding_layer(self.input_sentA)  # (batch_size, num_step, emb_dim)
        word_embB = self.embedding_layer(self.input_sentB)  # (batch_size, num_step, emb_dim)
        
        if encoder == 'lstm':
            repA = self.lstm(word_embA, self.input_seq_lenA, lstm_unit, reuse=None)  # (batch_size, lstm_unit)
            repB = self.lstm(word_embB, self.input_seq_lenB, lstm_unit, reuse=True)  # (batch_size, lstm_unit)
            input_dim = lstm_unit * 2
        elif encoder == 'bilstm':
            repA = self.bilstm(word_embA, self.input_seq_lenA, lstm_unit, None)  # (batch_size, num_step, lstm_unit * 2)
            repB = self.bilstm(word_embA, self.input_seq_lenB, lstm_unit, True)  # (batch_size, num_step, lstm_unit * 2)
            repA = tf.reduce_sum(repA, axis=1)  # (batch_size, lstm_unit * 2)
            repB = tf.reduce_sum(repB, axis=1)  # (batch_size, lstm_unit * 2)
            input_dim = lstm_unit * 4
        
        self.output = self.manhattan_distance(repA, repB)  # (batch_size, )
        self.loss = self.loss_function(self.output)  # (1, )
        
#         rep = tf.concat([repA, repB], axis=1)  # lstm: (batch_size, lstm_unit * 2), bilstm: (batch_size, lstm_unit * 4)
#         rep = dropout(rep, keep_prob=0.8)
        
#         hidden = self.dense(rep, hidden_unit, 'hidden')  # (batch_size, hidden_unit)
#         hidden = dropout(hidden, keep_prob=0.8)
        
#         self.output = self.dense(hidden, output_unit, 'output')  # (batch_size, output_unit)
#         self.output = tf.reshape(self.output, (-1, ))

#         self.loss = self.loss_function(self.output)  # ()
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

        
    def fit(self, train_data, val_data=None, epoch_size=1, batch_size=128, word_to_index=None, model_name='model'):
        def learn(X, Y, sequence_length, epoch, mode):
            tn = tqdm_notebook(total=len(X))
            for sentA, sentB, seq_lenA, seq_lenB, target in next_batch_with_pad(X, Y, sequence_length, word_to_index, batch_size):
#             for sentA, sentB, seq_lenA, seq_lenB, target in next_batch(X, Y, sequence_length, batch_size):

                feed_dict = {
                    self.input_sentA: sentA,
                    self.input_sentB: sentB, 
                    self.input_seq_lenA: seq_lenA,
                    self.input_seq_lenB: seq_lenB,
                    self.target: target
                }
                if mode == 'train':
                    fetches = [self.loss, self.output, self.optimizer]
                    loss, output, _ = self.sess.run(fetches, feed_dict)
                    tn.set_description('Epoch: {}/{}'.format(epoch, epoch_size))
                elif mode == 'validate':
                    fetches = [self.loss, self.output]
                    loss, output = self.sess.run(fetches, feed_dict)
                
                tn.set_postfix(loss=loss, accuracy=accuracy(output, target), mode=mode)
                tn.update(n=batch_size)
                
        saver = tf.train.Saver()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
                
        x_train, y_train, train_sequence_length = train_data[0], train_data[1], train_data[2]
        if val_data:
            x_val, y_val, val_sequence_length = val_data[0], val_data[1], val_data[2]
        
        print('Train on {} samples, validate on {} samples'.format(len(x_train), len(x_val) if val_data else 0))
        for epoch in range(1, epoch_size + 1):       
            x_train, y_train, train_sequence_length = shuffle_data(x_train, y_train, train_sequence_length)
            # train
            learn(x_train, y_train, train_sequence_length, epoch, 'train')

            # validate
            if val_data:
                learn(x_val, y_val, val_sequence_length, epoch, 'validate')    


        save_path = saver.save(self.sess, 'models/{}.ckpt'.format(model_name))
        print('Model was saved in {}'.format(save_path))
            
    
    def restore(self, model_path):
        saver = tf.train.Saver()
        self.sess = tf.Session()
        saver.restore(self.sess, model_path)
            
    
    def predict(self, X, sequence_length, word_to_index):
        
        y_empty = np.empty(0)
        batch_size, i = 100, 0
        tn = tqdm_notebook(total=len(X))
        prediction = np.empty((len(X), ))
        for sentA, sentB, seq_lenA, seq_lenB, _ in next_batch_with_pad(X, y_empty, sequence_length, word_to_index, batch_size):
            fetches = [self.output]
            feed_dict = {
                self.input_sentA: sentA,
                self.input_sentB: sentB, 
                self.input_seq_lenA: seq_lenA,
                self.input_seq_lenB: seq_lenB,
            }
            output = self.sess.run(fetches, feed_dict)[0]
            prediction[i * batch_size: i * batch_size + len(output)] = output
            
            tn.set_postfix(mode='predict')
            tn.update(n=batch_size)
            
            i += 1
        
        
        return prediction
        
        


In [8]:
X, Y, sequence_length = np.load('data/x_train.npy'), np.load('data/y_train.npy'), np.load('data/train_seq_len.npy')
x_train, y_train, train_sequlence_length, x_val, y_val, val_sequlence_length = split_train_val_data(X, Y, sequence_length, 0.0)

In [9]:
learning_rate = 0.001
model = QuestionPairDuplicated(emb_matrix, learning_rate, emb_trainable=True)
model.build(lstm_unit=256, encoder='lstm')

In [11]:
epoch_size = 15
batch_size = 128
train_data = [x_train, y_train, train_sequlence_length]
val_data = [x_val, y_val, val_sequlence_length]
model.fit(train_data, None, epoch_size, batch_size, word_to_index, 'model-manhattan')

ResourceExhaustedError: OOM when allocating tensor of shape [400002,300] and type float
	 [[{{node embeddings_matrix/Initializer/Const}} = Const[_class=["loc:@embeddings_matrix/Assign"], dtype=DT_FLOAT, value=Tensor<type: float shape: [400002,300] values: [0.04656 0.21318 -0.0074364...]...>, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]

Caused by op 'embeddings_matrix/Initializer/Const', defined at:
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\asyncio\base_events.py", line 1434, in _run_once
    handle._run()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
    ret = callback()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\gen.py", line 1233, in inner
    self.run()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\ipykernel\kernelbase.py", line 370, in dispatch_queue
    yield self.process_one()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\gen.py", line 346, in wrapper
    runner = Runner(result, future, yielded)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\gen.py", line 1080, in __init__
    self.run()
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\IPython\core\interactiveshell.py", line 2817, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\IPython\core\interactiveshell.py", line 2843, in _run_cell
    return runner(coro)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\IPython\core\interactiveshell.py", line 3018, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\IPython\core\interactiveshell.py", line 3183, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\IPython\core\interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-b821a1a792f3>", line 2, in <module>
    model = QuestionPairDuplicated(emb_matrix, learning_rate, emb_trainable=True)
  File "<ipython-input-7-30f195317781>", line 27, in __init__
    name='embeddings_matrix')
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1484, in get_variable
    aggregation=aggregation)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1234, in get_variable
    aggregation=aggregation)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 538, in get_variable
    aggregation=aggregation)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 492, in _true_getter
    aggregation=aggregation)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 920, in _get_single_variable
    aggregation=aggregation)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variables.py", line 145, in __call__
    return cls._variable_call(*args, **kwargs)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variables.py", line 141, in _variable_call
    aggregation=aggregation)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variables.py", line 120, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2441, in default_variable_creator
    expected_shape=expected_shape, import_scope=import_scope)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variables.py", line 147, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variables.py", line 1104, in __init__
    constraint=constraint)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variables.py", line 1212, in _init_from_args
    initial_value(), name="initial_value", dtype=dtype)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 894, in <lambda>
    shape.as_list(), dtype=dtype, partition_info=partition_info)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\ops\init_ops.py", line 219, in __call__
    self.value, dtype=dtype, shape=shape, verify_shape=verify_shape)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\framework\constant_op.py", line 213, in constant
    name=name).outputs[0]
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\util\deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\framework\ops.py", line 3272, in create_op
    op_def=op_def)
  File "C:\Users\chiamin\Anaconda3\envs\iisr\lib\site-packages\tensorflow\python\framework\ops.py", line 1768, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor of shape [400002,300] and type float
	 [[{{node embeddings_matrix/Initializer/Const}} = Const[_class=["loc:@embeddings_matrix/Assign"], dtype=DT_FLOAT, value=Tensor<type: float shape: [400002,300] values: [0.04656 0.21318 -0.0074364...]...>, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]


In [None]:
model_path = 'models/model-manhattan.ckpt'
model.restore(model_path)

In [None]:
x_test, test_sequence_length = np.load('data/x_test.npy'), np.load('data/test_seq_len.npy')

In [None]:
prediction = model.predict(x_test, test_sequence_length, word_to_index)

In [None]:
prediction2csv(prediction, 'data/submit.csv')

In [15]:
def prediction2csv(prediction, filepath):
    with open(filepath, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['test_id', 'is_duplicate'])
        for i, pred in enumerate(prediction):
            writer.writerow([i, pred])
    

In [10]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, multiply
from keras.layers.merge import concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

In [16]:
vocab_size = len(emb_matrix)
emb_dim = len(emb_matrix[0])
batch_size = 100
max_len = 250
embedding_layer = Embedding(vocab_size, emb_dim, weights=[emb_matrix], trainable=False)


In [21]:
print(len(x_train))

283003


In [23]:
inputA = []
inputB = []
labels = []
for sentA, sentB, _, _, duplicated in next_batch(x_train, y_train, train_sequlence_length, 100):
    inputA.append(sentA)
    inputB.append(sentB)
    labels.append(duplicated)

print(np.array(inputA).shape)
inputA = np.array(inputA).reshape((-1, max_len))
inputB = np.array(inputB).reshape((-1, max_len))
labels = np.array(labels).reshape((-1, 1))
print(inputA.shape)
print(inputB.shape)
print(labels.shape)

(2830, 100)
(1132, 250)
(1132, 250)
(283000, 1)


In [18]:
 
sentence_indicesA = Input(shape=[max_len], dtype='int32')
sentence_indicesB = Input(shape=[max_len], dtype='int32')

embeddingsA = embedding_layer(sentence_indicesA)
embeddingsB = embedding_layer(sentence_indicesB) 
print(embeddingsA)

siamese_lstm = LSTM(256, activation='tanh')


X = concatenate([siamese_lstm(embeddingsA), siamese_lstm(embeddingsB)])

X = Dense(16, activation='sigmoid')(X)
X = Dense(1, activation='sigmoid')(X)

model = Model([sentence_indicesA, sentence_indicesB], X)

model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

Tensor("embedding_1/embedding_lookup/Identity:0", shape=(?, 10, 300), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 300)      120000600   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)           

In [19]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
model.fit([inputA, inputB], labels, epochs=25, batch_size = 32, shuffle=True)

Epoch 1/25
Epoch 2/25
 15168/404224 [>.............................] - ETA: 6:30 - loss: 0.4349 - acc: 0.7960

KeyboardInterrupt: 

In [63]:
# [[0]] [[0.3755557]] [[0.47089315]]
# [[1]] [[1.52865097e-26]] [[16.11809565]]
# [[0]] [[4.60674538e-26]] [[1.00000005e-07]]
# [[1]] [[2.43860639e-28]] [[16.11809565]]

p = 0
q = 0.49983746

In [109]:
p * -np.log(q) + (1 - p) * -np.log(1 - q)

-0.692822153387

In [19]:
indice = np.arange(len(X))
indice.shape

(404290,)

In [43]:

print(len(flatten))

9098031


In [44]:
from collections import Counter
counter = Counter(flatten)
    

In [47]:
counter.items()

dict_items([(102, 324643), (14, 269925), (0, 377688), (1065, 751), (21, 17316), (3372, 304), (4, 205819), (4280, 1687), (6, 197277), (593, 1255), (211, 2455), (474, 29564), (523, 1754), (3, 159882), (180152, 20), (8849, 22), (41, 223286), (16359, 9), (5188, 139), (54, 23648), (1927, 5024), (83, 43822), (792, 10798), (78, 4025), (8396, 31), (137, 4567), (197, 220894), (86, 118053), (686, 4254), (1512, 2098), (192, 70965), (925, 2619), (2540, 459), (110, 4580), (622, 6160), (7, 212205), (88832, 278), (30, 43853), (1041, 216), (20463, 673), (131, 4237), (36758, 34), (738, 84085), (913, 12747), (7278, 278), (191, 3004), (10678, 281), (4441, 1488), (20, 71121), (596, 10572), (6345, 157), (61, 26219), (7460, 3997), (1021, 368), (795, 680), (2204, 325), (42, 43975), (48, 20052), (13151, 68), (430, 3992), (400000, 63400), (3191, 443), (2982, 405), (16904, 35), (5, 133984), (4136, 433), (4861, 33), (16049, 80), (2120, 430), (3981, 737), (29593, 340), (51980, 39), (1662, 1036), (3539, 142), (337