In [42]:
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout, Input, Concatenate

import json
import keras
import oov_prep as oov
import data_cleaning as dc

In [7]:
# Movie conversation cleaned and POS tagged data
with open('clean_tagged_data.json', 'r') as outfile:
    data = json.load(outfile)
    
sent, tags = zip(*data['tags'])

print('Sentences/Tags data length:', len(sent), len(tags))
print(sent[:3])
print(tags[:3])

Sentences/Tags data length: 289401 289401
('<BOS> they do not <EOS>', '<BOS> they do to <EOS>', '<BOS> i hope so <EOS>')
('<start> PPSS DO * <end>', '<start> PPSS DO TO <end>', '<start> NN NN RB <end>')


In [8]:
sent = sent[:20000]
tags = tags[:20000]

In [9]:
# Set the number of n-grams to build sequences
n = 5

# maximum length of sequence is n-1 since the last word will be the target prediction
max_length = n-1

In [41]:
# Tokenizing and getting n-gram sequences

tk_text = Tokenizer(filters=[])
tk_text.fit_on_texts(sent)
dec_sentences = tk_text.texts_to_sequences(sent)

tk_tags = Tokenizer(filters=[])
tk_tags.fit_on_texts(tags)
dec_tagged = tk_tags.texts_to_sequences(tags)

tagged_sent = zip(dec_sentences, dec_tagged)

text_grams, tag_grams = oov.tagged_n_grams(tagged_sent, n)

X, y, X_rev, y_rev = text_grams
X_tag, y_tag, X_tag_rev, y_tag_rev = tag_grams

vocab_size = len(tk_text.word_index)+1
tags_size = len(tk_tags.word_index)+1
print('Vocabulary Size: %d' % vocab_size)

Total Sequences: 779764
Vocabulary Size: 12133


In [None]:
# Tokenizing and getting n-gram sequences for sentences

tk = Tokenizer()
tk.fit_on_texts(sent)
enc_sentences = tk.texts_to_sequences(sent)

X_enc_sent, y_enc_sent, X_rev_enc_sent, y_rev_enc_sent = oov.n_grams(enc_sentences, n)

vocab_size = len(tk.word_index)+1
print('Vocabulary Size: %d' % vocab_size)

In [None]:
# Tokenizing and getting n-gram sequences for tags

tk = Tokenizer()
tk.fit_on_texts(tags)
enc_tagged = tk.texts_to_sequences(tags)

X_enc_tags, y_enc_tags, X_rev_enc_tags, y_rev_enc_tags = oov.n_grams(enc_tagged, n)

tag_vocab_size = len(tk.word_index)+1
print('Vocabulary Size: %d' % tag_vocab_size)

In [None]:
# Define senteces forward sequence bidirectional model

model_sent = Sequential()
model_sent.add(Embedding(vocab_size, 32, mask_zero=True, input_length=max_length))
model_sent.add(Bidirectional(LSTM(8)))
model_sent.add(Dropout(0.5))
model_sent.add(Dense(vocab_size, activation='softmax'))
print(model_sent.summary())

In [None]:
# Define tags forward sequence bidirectional model

model_tags = Sequential()
model_tags.add(Embedding(tag_vocab_size, 32, mask_zero=True, input_length=max_length))
model_tags.add(Bidirectional(LSTM(8)))
model_tags.add(Dropout(0.5))
model_tags.add(Dense(tag_vocab_size, activation='softmax'))
print(model_tags.summary())

In [None]:
# define reverse model for sentences

rev_model_sent = Sequential()
rev_model_sent.add(Embedding(vocab_size, 32, mask_zero=True, input_length=max_length))
rev_model_sent.add(Bidirectional(LSTM(8)))
rev_model_sent.add(Dropout(0.5))
rev_model_sent.add(Dense(vocab_size, activation='softmax'))
print(rev_model_sent.summary())

In [None]:
# define reverse model for tags

rev_model_tags = Sequential()
rev_model_tags.add(Embedding(tag_vocab_size, 32, mask_zero=True, input_length=max_length))
rev_model_tags.add(Bidirectional(LSTM(8)))
rev_model_tags.add(Dropout(0.5))
rev_model_tags.add(Dense(tag_vocab_size, activation='softmax'))
print(rev_model_tags.summary())

In [None]:
# compile sentence forward sequence network
# loss is set to sparse_cat_cross because of multiple classes and no one-hot encoding

model_sent.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['acc'])

model_sent.fit(X_enc_sent, y_enc_sent, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model_sent.save('model_oov_sent.h5')

In [None]:
# compile reverse sequence network

rev_model_sent.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

rev_model_sent.fit(X_rev_enc_sent, y_rev_enc_sent, batch_size=128, epochs=5, verbose=1, shuffle=True,
                   validation_split=0.2)

rev_model_sent.save('rev_model_oov_sent.h5')

In [None]:
# compile tags forward sequence network

model_tags.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['acc'])

model_tags.fit(X_enc_tags, y_enc_tags, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model_tags.save('model_oov_tags.h5')

In [None]:
# compile reverse sequence network

rev_model_tags.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

rev_model_tags.fit(X_rev_enc_tags, y_rev_enc_tags, batch_size=128, epochs=5, verbose=1, shuffle=True,
                   validation_split=0.2)

rev_model_tags.save('rev_model_oov_tags.h5')

In [14]:
from gensim.models import Word2Vec

In [38]:
splt_sent = [s.lower().split() for s in sent]
splt_sent[:2]

[['<bos>', 'they', 'do', 'not', '<eos>'],
 ['<bos>', 'they', 'do', 'to', '<eos>']]

In [39]:
embedded = Word2Vec(splt_sent, sg=1, min_count=0)

In [40]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tk_text.word_index.items():
    embedding_matrix[i] = embedded.wv[word]

In [None]:
concat_sentag = []
for i in range(len(splt_sent)):
    sentence = []
    for ii in range(len(splt_sent[i])):
        sentence.append(''.join([splt_sent[i][ii].lower(), splt_tags[i][ii].lower()]))
    concat_sentag.append(sentence)

In [None]:
tk = Tokenizer()
tk.fit_on_texts(concat_sentag)
enc_sentences = tk.texts_to_sequences(concat_sentag)

X_enc_sent, y_enc_sent, X_rev_enc_sent, y_rev_enc_sent = oov.n_grams(enc_sentences, n)

vocab_size = len(tk.word_index)+1
print('Vocabulary Size: %d' % vocab_size)

In [None]:
emb_sentag = []

c=0
for sent in concat_sentag:
    embedded_sent = []
    for word in sent:
        embedded_sent.append(list(embedded.wv[word]))
    emb_sentag.append(embedded_sent)

In [None]:
max_length = n-1

In [None]:
X_dim = X.shape[0]
in_dim = X.shape[2]

In [None]:
model_input = Input((3,), dtype='float32')
model_emb = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length,
                      trainable=False)(model_input)
model_emb = Bidirectional(LSTM(8))(model_input)
#model_emb = LSTM(8, input_shape=(100,32))(model_emb)
#model_emb = Dropout(0.5)(model_emb)
model_emb = Dense(vocab_size+1, activation='softmax')(model_emb)
model_emb = Model(inputs = model_input, outputs = model_emb)
model_emb.summary()

In [None]:
model_emb.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

model_emb.fit(X, y, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model_emb.save('model_emb.h5')

In [None]:
in_dim

In [None]:
X.shape

In [None]:

model_sent = Sequential()
model_sent.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length,
                      trainable=False))
model_sent.add(Bidirectional(LSTM(8)))
model_sent.add(Dropout(0.5))
model_sent.add(Dense(vocab_size, activation='softmax'))
print(model_sent.summary())

In [None]:
model_sent.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

model_sent.fit(X_enc_sent, y_enc_sent, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model_sent.save('model_emb.h5')

In [None]:
from keras.utils import to_categorical
text = np.random.randint(5000, size=(442702, 200), dtype='int32')
topic = np.random.randint(2, size=(442702, 227), dtype='int32')
sentiment1 = to_categorical(np.random.randint(5, size=442702), dtype='int32')

from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, Concatenate, Lambda
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from keras.backend import cast
from keras.models import Model

In [None]:
text_input = Input(shape=(max_length,), dtype='int32', name='text')
text_encoded = Embedding(vocab_size, output_dim=100, weights=[embedding_matrix], trainable=False)(text_input)
text_encoded = Dropout(0.1)(text_encoded)
text_encoded = Conv1D(300, 3, padding='valid', activation='relu', strides=1)(text_encoded)
text_encoded = GlobalMaxPool1D()(text_encoded)

tags_input = Input(shape=(max_length,), dtype='int32', name='topic')

tags_float = Lambda(lambda x:cast(x, 'float32'), name='Floatconverter')(topic_input)

concatenated = Concatenate(axis=-1)([text_encoded, topic_float])
sentiment = Dense(5, activation='softmax')(concatenated)

model = Model(inputs=[text_input, topic_input], outputs=sentiment)
# summarize layers
print(model.summary())

In [51]:
model_text = Sequential()
model_text.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length,
                      trainable=False))
model_text.add(Bidirectional(LSTM(8)))
model_text.add(Dropout(0.5))
model_text.add(Dense(vocab_size, activation='softmax'))

model_tags = Sequential()
model_tags.add(Embedding(tags_size, 32, input_length=max_length))
model_tags.add(Bidirectional(LSTM(8)))
model_tags.add(Dropout(0.5))
model_tags.add(Dense(tags_size, activation='softmax'))

model = Concatenate()([model_text, model_tags])
model.add(Dense(vocab_size, activation='softmax'))

print(model.summary())

ValueError: Layer concatenate_8 was called with an input that isn't a symbolic tensor. Received type: <class 'keras.engine.sequential.Sequential'>. Full input: [<keras.engine.sequential.Sequential object at 0x000002109E10D608>, <keras.engine.sequential.Sequential object at 0x000002109E4E8A88>]. All inputs to the layer should be tensors.

In [58]:
text_input = Input((max_length,))
model_text = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(text_input)
model_text = Bidirectional(LSTM(8, dropout=0.3, recurrent_dropout=0.2))(model_text)
#model_text = Dense(vocab_size+1, activation='softmax')(model_text)
#model_text = Model(inputs=text_input, outputs=model_text)

tags_input = Input((max_length,))
model_tags = Embedding(vocab_size, 100)(tags_input)
model_tags = Bidirectional(LSTM(8, dropout=0.3, recurrent_dropout=0.2))(model_tags)
#model_tags = Dense(tags_size+1, activation='softmax')(model_tags)
#model_tags = Model(inputs=tags_input, outputs=model_tags)

concatenate = Concatenate()([model_text, model_tags])
result = Dense(vocab_size+1, activation='softmax')(concatenate)

model = Model(inputs=[text_input, tags_input], outputs=result)

model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_18 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
input_19 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
embedding_30 (Embedding)        (None, 4, 100)       1213300     input_18[0][0]                   
__________________________________________________________________________________________________
embedding_31 (Embedding)        (None, 4, 100)       1213300     input_19[0][0]                   
___________________________________________________________________________________________

In [60]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

model.fit([X, X_tag], y, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model.save('model_concat.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 623811 samples, validate on 155953 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [63]:
text_input = Input((max_length,))
model_text = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(text_input)
model_text = Bidirectional(LSTM(8, dropout=0.2, recurrent_dropout=0.1))(model_text)

tags_input = Input((max_length,))
model_tags = Embedding(vocab_size, 100)(tags_input)
model_tags = Bidirectional(LSTM(8, dropout=0.2, recurrent_dropout=0.1))(model_tags)

concatenate_text_tags = Concatenate()([model_text, model_tags])

rev_text_input = Input((max_length,))
rev_model_text = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(rev_text_input)
rev_model_text = Bidirectional(LSTM(8, dropout=0.2, recurrent_dropout=0.1))(rev_model_text)

rev_tags_input = Input((max_length,))
rev_model_tags = Embedding(vocab_size, 100)(rev_tags_input)
rev_model_tags = Bidirectional(LSTM(8, dropout=0.2, recurrent_dropout=0.1))(rev_model_tags)

rev_concatenate_text_tags = Concatenate()([rev_model_text, rev_model_tags])

concatenate = Concatenate()([concatenate_text_tags, rev_concatenate_text_tags])

result = Dense(vocab_size+1, activation='softmax')(concatenate)

model = Model(inputs=[text_input, tags_input, rev_text_input, rev_tags_input], outputs=result)

model.summary()

Model: "model_14"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
input_29 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
input_30 (InputLayer)           (None, 4)            0                                            
__________________________________________________________________________________________________
input_31 (InputLayer)           (None, 4)            0                                            
___________________________________________________________________________________________

In [64]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

model.fit([X, X_tag, X_rev, X_tag_rev], y, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model.save('model_full_concat.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 623811 samples, validate on 155953 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
