In [42]:
import json

import keras
import numpy as np
from gensim.models import Word2Vec
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout, Input, Concatenate
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer

import data_cleaning as dc
import oov_prep as oov

### Pre-processing data

In [7]:
# Load the cleaned, POS-tagged movie-conversation corpus.
# The encoding is given explicitly so the read does not depend on the
# platform's locale default.
with open('clean_tagged_data.json', 'r', encoding='utf-8') as infile:
    data = json.load(infile)

# data['tags'] holds (sentence, tag-sequence) pairs; transpose them into two
# parallel tuples so sent[i] and tags[i] describe the same utterance.
sent, tags = zip(*data['tags'])

# Sanity check: both tuples must have the same length.
print('Sentences/Tags data length:', len(sent), len(tags))
print(sent[:3])
print(tags[:3])

Sentences/Tags data length: 289401 289401
('<BOS> they do not <EOS>', '<BOS> they do to <EOS>', '<BOS> i hope so <EOS>')
('<start> PPSS DO * <end>', '<start> PPSS DO TO <end>', '<start> NN NN RB <end>')


In [8]:
# Keep only the first 20000 sentence/tag pairs; both tuples are cut at the
# same index so they stay aligned.
sent, tags = sent[:20000], tags[:20000]

In [9]:
# Size of the n-gram window used to build training sequences.
n = 5

# The last token of each n-gram is the prediction target (y), so the model
# input is one token shorter than the window.
max_length = n - 1

In [41]:
# Tokenizing and getting n-gram sequences

# One tokenizer per stream. Filters are disabled so markers such as
# <BOS>/<EOS> and <start>/<end> are kept as whole tokens.
tk_text = Tokenizer(filters=[])
tk_tags = Tokenizer(filters=[])
tk_text.fit_on_texts(sent)
tk_tags.fit_on_texts(tags)

# Integer-encode the sentences and their tag sequences.
dec_sentences = tk_text.texts_to_sequences(sent)
dec_tagged = tk_tags.texts_to_sequences(tags)

# Pair every encoded sentence with its encoded tags. Note that zip() is a
# one-shot iterator: it is consumed by the call below.
tagged_sent = zip(dec_sentences, dec_tagged)

# The helper returns two 4-tuples (words, tags), each unpacked here as
# forward inputs/targets followed by reversed inputs/targets.
text_grams, tag_grams = oov.tagged_n_grams(tagged_sent, n)
(X, y, X_rev, y_rev), (X_tag, y_tag, X_tag_rev, y_tag_rev) = text_grams, tag_grams

# +1 because Tokenizer's word_index starts at 1, leaving index 0 unused by real tokens.
vocab_size = 1 + len(tk_text.word_index)
tags_size = 1 + len(tk_tags.word_index)
print('Vocabulary Size: %d' % vocab_size)

Total Sequences: 779764
Vocabulary Size: 12133


In [65]:
# Word2Vec expects a list of token lists: lower-case each sentence and split
# it on whitespace.
splt_sent = list(map(lambda text: text.lower().split(), sent))
splt_sent[:2]

[['<bos>', 'they', 'do', 'not', '<eos>'],
 ['<bos>', 'they', 'do', 'to', '<eos>']]

In [39]:
# Train word2vec on the tokenised sentences.
#   sg=1        -> skip-gram architecture
#   min_count=0 -> no word is dropped for being rare
embedded = Word2Vec(
    sentences=splt_sent,
    sg=1,
    min_count=0,
)

In [40]:
# Build the lookup table that maps each tokenizer id to its word2vec vector.
# The width is read from the trained vectors rather than hard-coded, so the
# matrix stays consistent if the Word2Vec dimensionality is ever changed.
# Row 0 stays all-zero: tk_text.word_index only assigns ids from 1 upwards.
embedding_matrix = np.zeros((vocab_size, embedded.wv.vector_size))
for word, i in tk_text.word_index.items():
    embedding_matrix[i] = embedded.wv[word]

### Modeling and Training for OOV Prediction

In [66]:
# Four parallel encoders over the n-gram windows: sentences, tags, reversed
# sentences and reversed tags. Sentence branches use the frozen word2vec
# embedding_matrix; tag branches learn a standard Keras embedding. The four
# encodings are concatenated and fed to a softmax layer for the prediction.


def _encoder_branch(input_dim, pretrained=None, embed_dim=100, lstm_units=16):
    """Create one Input -> Embedding -> Bidirectional(LSTM) branch.

    Args:
        input_dim: number of distinct ids the embedding must cover
            (largest id + 1).
        pretrained: optional 2-D weight matrix with input_dim rows. When
            given, the embedding is initialised from it and frozen, and its
            column count is used as the embedding width.
        embed_dim: width of the trainable embedding used when no pretrained
            matrix is supplied.
        lstm_units: number of units per LSTM direction.

    Returns:
        A (input_tensor, encoded_tensor) pair for the branch.
    """
    branch_input = Input((max_length,))
    if pretrained is None:
        embedding = Embedding(input_dim, embed_dim)
    else:
        embedding = Embedding(input_dim, pretrained.shape[1],
                              weights=[pretrained], trainable=False)
    recurrent = Bidirectional(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.1))
    return branch_input, recurrent(embedding(branch_input))


# Forward direction: words + tags.
text_input, model_text = _encoder_branch(vocab_size, pretrained=embedding_matrix)
# Tag ids are produced by tk_tags, so the tag embedding needs tags_size rows;
# sizing it by vocab_size would allocate rows that are never looked up.
# Assumes oov.tagged_n_grams passes tag ids through unchanged - TODO confirm.
tags_input, model_tags = _encoder_branch(tags_size)

concatenate_text_tags = Concatenate()([model_text, model_tags])

# Reversed direction: words + tags.
rev_text_input, rev_model_text = _encoder_branch(vocab_size, pretrained=embedding_matrix)
rev_tags_input, rev_model_tags = _encoder_branch(tags_size)

rev_concatenate_text_tags = Concatenate()([rev_model_text, rev_model_tags])

# Merge both directions into a single feature vector.
concatenate = Concatenate()([concatenate_text_tags, rev_concatenate_text_tags])

# NOTE(review): vocab_size already includes the +1 for the unused index 0, so
# the extra output unit here looks unnecessary. It is kept unchanged because
# the target ids come from oov.tagged_n_grams, which is not visible here -
# confirm before shrinking to Dense(vocab_size).
result = Dense(vocab_size+1, activation='softmax')(concatenate)

model = Model(inputs=[text_input, tags_input, rev_text_input, rev_tags_input], outputs=result)

model.summary()

Model: "model_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_32 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
input_33 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
input_34 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
input_35 (InputLayer)           (None, 4)            0                                            
___________________________________________________________________________________________

In [64]:
# Compile with Adam (step size 0.001) and sparse categorical cross-entropy,
# tracking accuracy during training.
adam = keras.optimizers.Adam(0.001)
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Input order must match the order of `inputs` given to Model():
# words, tags, reversed words, reversed tags.
train_inputs = [X, X_tag, X_rev, X_tag_rev]
model.fit(
    train_inputs,
    y,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
    shuffle=True,
    verbose=1,
)

# Persist the trained model (architecture + weights) in HDF5 format.
model.save('model_full_concat.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 623811 samples, validate on 155953 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
