In [2]:
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout

import json
import keras
import oov_prep as oov

Using TensorFlow backend.


In [3]:
# Load the cleaned, POS-tagged movie-conversation corpus.
# The encoding is pinned to UTF-8 (the standard JSON encoding) instead of
# relying on the platform's locale default, which differs between systems.
with open('clean_tagged_data.json', 'r', encoding='utf-8') as infile:
    data = json.load(infile)

# Each entry of data['tags'] is expected to be a (sentence, tags) pair;
# zip(*...) splits the pairs into two parallel tuples.
sent, tags = zip(*data['tags'])

# Quick sanity check: both tuples should have the same length.
print('Sentences/Tags data length:', len(sent), len(tags))
print(sent[:3])
print(tags[:3])

In [None]:
# Sentence pipeline: fit a word-level tokenizer, encode every sentence as a
# list of integer ids, then build forward and reversed n-gram training pairs.

# Words not seen during fitting are mapped to the '<UNK>' token.
tk = Tokenizer(oov_token='<UNK>')
tk.fit_on_texts(sent)
enc_sentences = tk.texts_to_sequences(sent)

# oov.n_grams returns four values: forward inputs/targets followed by the
# reversed inputs/targets.
(
    X_enc_sent,
    y_enc_sent,
    X_rev_enc_sent,
    y_rev_enc_sent,
) = oov.n_grams(enc_sentences)

# Number of distinct entries in the fitted word index.
vocab_size = len(tk.word_index)
print('Vocabulary Size: %d' % vocab_size)

In [None]:
# Tokenizing and getting n-gram sequences for tags.
# A dedicated name is used for the tag tokenizer so that the sentence
# tokenizer bound to `tk` is not overwritten and stays available afterwards
# (e.g. for mapping predicted ids back to words).
tag_tk = Tokenizer(oov_token='<UNK>')
tag_tk.fit_on_texts(tags)
enc_tagged = tag_tk.texts_to_sequences(tags)

# oov.n_grams returns forward inputs/targets followed by reversed inputs/targets.
X_enc_tags, y_enc_tags, X_rev_enc_tags, y_rev_enc_tags = oov.n_grams(enc_tagged)

# Number of distinct entries in the fitted tag index.
tag_vocab_size = len(tag_tk.word_index)
print('Vocabulary Size: %d' % tag_vocab_size)

In [None]:
# Define sentences forward-sequence bidirectional model.
#
# The Embedding input length is read from the training inputs rather than from
# `max_length`, which is never defined in this notebook and would raise a
# NameError on a fresh kernel.
# NOTE(review): assumes oov.n_grams returns X_enc_sent as a 2-D padded array of
# shape (samples, timesteps) - confirm against oov_prep.

model_sent = Sequential()
# vocab_size+1 embedding rows: with mask_zero=True index 0 is reserved for
# padding, so the input dimension must be vocabulary size + 1.
model_sent.add(Embedding(vocab_size+1, 32, mask_zero=True, input_length=X_enc_sent.shape[1]))
model_sent.add(Bidirectional(LSTM(64)))
model_sent.add(Dropout(0.5))
# One softmax output per vocabulary id (including the reserved index 0).
model_sent.add(Dense(vocab_size+1, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
model_sent.summary()

In [None]:
# Define tags forward-sequence bidirectional model.
#
# The Embedding input length is read from the training inputs rather than from
# `max_length`, which is never defined in this notebook and would raise a
# NameError on a fresh kernel.
# NOTE(review): assumes oov.n_grams returns X_enc_tags as a 2-D padded array of
# shape (samples, timesteps) - confirm against oov_prep.

model_tags = Sequential()
# tag_vocab_size+1 embedding rows: with mask_zero=True index 0 is reserved for
# padding, so the input dimension must be vocabulary size + 1.
model_tags.add(Embedding(tag_vocab_size+1, 32, mask_zero=True, input_length=X_enc_tags.shape[1]))
model_tags.add(Bidirectional(LSTM(64)))
model_tags.add(Dropout(0.5))
# One softmax output per tag id (including the reserved index 0).
model_tags.add(Dense(tag_vocab_size+1, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
model_tags.summary()

In [None]:
# Define reverse-sequence bidirectional model for sentences.
#
# The Embedding input length is read from the reversed training inputs rather
# than from `max_length`, which is never defined in this notebook and would
# raise a NameError on a fresh kernel.
# NOTE(review): assumes oov.n_grams returns X_rev_enc_sent as a 2-D padded
# array of shape (samples, timesteps) - confirm against oov_prep.

rev_model_sent = Sequential()
# vocab_size+1 embedding rows: with mask_zero=True index 0 is reserved for
# padding, so the input dimension must be vocabulary size + 1.
rev_model_sent.add(Embedding(vocab_size+1, 32, mask_zero=True, input_length=X_rev_enc_sent.shape[1]))
rev_model_sent.add(Bidirectional(LSTM(64)))
rev_model_sent.add(Dropout(0.5))
# One softmax output per vocabulary id (including the reserved index 0).
rev_model_sent.add(Dense(vocab_size+1, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
rev_model_sent.summary()

In [None]:
# Define reverse-sequence bidirectional model for tags.
#
# The Embedding input length is read from the reversed training inputs rather
# than from `max_length`, which is never defined in this notebook and would
# raise a NameError on a fresh kernel.
# NOTE(review): assumes oov.n_grams returns X_rev_enc_tags as a 2-D padded
# array of shape (samples, timesteps) - confirm against oov_prep.

rev_model_tags = Sequential()
# tag_vocab_size+1 embedding rows: with mask_zero=True index 0 is reserved for
# padding, so the input dimension must be vocabulary size + 1.
rev_model_tags.add(Embedding(tag_vocab_size+1, 32, mask_zero=True, input_length=X_rev_enc_tags.shape[1]))
rev_model_tags.add(Bidirectional(LSTM(64)))
rev_model_tags.add(Dropout(0.5))
# One softmax output per tag id (including the reserved index 0).
rev_model_tags.add(Dense(tag_vocab_size+1, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
rev_model_tags.summary()

In [None]:
# Compile, train and save the forward-sequence sentence network.
# Sparse categorical cross-entropy is used because there are many classes and
# the targets are not one-hot encoded.

model_sent.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

# The last 20% of the samples are held out for validation.
model_sent.fit(
    X_enc_sent,
    y_enc_sent,
    epochs=20,
    batch_size=128,
    validation_split=0.2,
    shuffle=True,
    verbose=1,
)

# Persist the trained model in HDF5 format.
model_sent.save('model_oov_sent.h5')

In [None]:
# Compile, train and save the reverse-sequence sentence network.

rev_model_sent.compile(
    optimizer=keras.optimizers.Adam(0.002),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# The last 20% of the samples are held out for validation.
rev_model_sent.fit(
    X_rev_enc_sent,
    y_rev_enc_sent,
    epochs=200,
    batch_size=128,
    validation_split=0.2,
    shuffle=True,
    verbose=1,
)

# Persist the trained model in HDF5 format.
rev_model_sent.save('rev_model_oov_sent.h5')

In [None]:
# Compile, train and save the forward-sequence tag network.

model_tags.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

# The last 20% of the samples are held out for validation.
model_tags.fit(
    X_enc_tags,
    y_enc_tags,
    epochs=20,
    batch_size=128,
    validation_split=0.2,
    shuffle=True,
    verbose=1,
)

# Persist the trained model in HDF5 format.
model_tags.save('model_oov_tags.h5')

In [None]:
# Compile, train and save the reverse-sequence tag network.

rev_model_tags.compile(
    optimizer=keras.optimizers.Adam(0.002),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# The last 20% of the samples are held out for validation.
rev_model_tags.fit(
    X_rev_enc_tags,
    y_rev_enc_tags,
    epochs=200,
    batch_size=128,
    validation_split=0.2,
    shuffle=True,
    verbose=1,
)

# Persist the trained model in HDF5 format.
rev_model_tags.save('rev_model_oov_tags.h5')