In [326]:
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout, Input, Flatten

import json
import keras
import oov_prep as oov
import data_cleaning as dc

In [348]:
# Movie conversation cleaned and POS tagged data
with open('clean_tagged_data.json', 'r') as outfile:
    data = json.load(outfile)

sent, tags = zip(*data['tags'])

print('Sentences/Tags data length:', len(sent), len(tags))
print(sent[:3])
print(tags[:3])

Sentences/Tags data length: 289401 289401
('<BOS> they do not <EOS>', '<BOS> they do to <EOS>', '<BOS> i hope so <EOS>')
('<start> PPSS DO * <end>', '<start> PPSS DO TO <end>', '<start> NN NN RB <end>')


In [349]:
# Set the number of n-grams to build sequences
n = 5

# maximum length of sequence is n-1 since the last word will be the target prediction
max_length = n-1

In [None]:
# Tokenizing and getting n-gram sequences for sentences

tk = Tokenizer()
tk.fit_on_texts(sent)
enc_sentences = tk.texts_to_sequences(sent)

X_enc_sent, y_enc_sent, X_rev_enc_sent, y_rev_enc_sent = oov.n_grams(enc_sentences, n)

vocab_size = len(tk.word_index)+1
print('Vocabulary Size: %d' % vocab_size)

In [None]:
# Tokenizing and getting n-gram sequences for tags

tk = Tokenizer()
tk.fit_on_texts(tags)
enc_tagged = tk.texts_to_sequences(tags)

X_enc_tags, y_enc_tags, X_rev_enc_tags, y_rev_enc_tags = oov.n_grams(enc_tagged, n)

tag_vocab_size = len(tk.word_index)+1
print('Vocabulary Size: %d' % tag_vocab_size)

In [12]:
# Define senteces forward sequence bidirectional model

model_sent = Sequential()
model_sent.add(Embedding(vocab_size, 32, mask_zero=True, input_length=max_length))
model_sent.add(Bidirectional(LSTM(8)))
model_sent.add(Dropout(0.5))
model_sent.add(Dense(vocab_size, activation='softmax'))
print(model_sent.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 4, 32)             161504    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 16)                2624      
_________________________________________________________________
dropout_5 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 5047)              85799     
Total params: 249,927
Trainable params: 249,927
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# Define tags forward sequence bidirectional model

model_tags = Sequential()
model_tags.add(Embedding(tag_vocab_size, 32, mask_zero=True, input_length=max_length))
model_tags.add(Bidirectional(LSTM(8)))
model_tags.add(Dropout(0.5))
model_tags.add(Dense(tag_vocab_size, activation='softmax'))
print(model_tags.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 4, 32)             2464      
_________________________________________________________________
bidirectional_6 (Bidirection (None, 16)                2624      
_________________________________________________________________
dropout_6 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 77)                1309      
Total params: 6,397
Trainable params: 6,397
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# define reverse model for sentences

rev_model_sent = Sequential()
rev_model_sent.add(Embedding(vocab_size, 32, mask_zero=True, input_length=max_length))
rev_model_sent.add(Bidirectional(LSTM(8)))
rev_model_sent.add(Dropout(0.5))
rev_model_sent.add(Dense(vocab_size, activation='softmax'))
print(rev_model_sent.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 4, 32)             161504    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 16)                2624      
_________________________________________________________________
dropout_7 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 5047)              85799     
Total params: 249,927
Trainable params: 249,927
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# define reverse model for tags

rev_model_tags = Sequential()
rev_model_tags.add(Embedding(tag_vocab_size, 32, mask_zero=True, input_length=max_length))
rev_model_tags.add(Bidirectional(LSTM(8)))
rev_model_tags.add(Dropout(0.5))
rev_model_tags.add(Dense(tag_vocab_size, activation='softmax'))
print(rev_model_tags.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 4, 32)             2464      
_________________________________________________________________
bidirectional_8 (Bidirection (None, 16)                2624      
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 77)                1309      
Total params: 6,397
Trainable params: 6,397
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# compile sentence forward sequence network
# loss is set to sparse_cat_cross because of multiple classes and no one-hot encoding

model_sent.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['acc'])

model_sent.fit(X_enc_sent, y_enc_sent, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model_sent.save('model_oov_sent.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 154923 samples, validate on 38731 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# compile reverse sequence network

rev_model_sent.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

rev_model_sent.fit(X_rev_enc_sent, y_rev_enc_sent, batch_size=128, epochs=5, verbose=1, shuffle=True,
                   validation_split=0.2)

rev_model_sent.save('rev_model_oov_sent.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 154923 samples, validate on 38731 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# compile tags forward sequence network

model_tags.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['acc'])

model_tags.fit(X_enc_tags, y_enc_tags, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model_tags.save('model_oov_tags.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 151324 samples, validate on 37831 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# compile reverse sequence network

rev_model_tags.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

rev_model_tags.fit(X_rev_enc_tags, y_rev_enc_tags, batch_size=128, epochs=5, verbose=1, shuffle=True,
                   validation_split=0.2)

rev_model_tags.save('rev_model_oov_tags.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 151324 samples, validate on 37831 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
from gensim.models import Word2Vec

In [58]:
model_sent.wv['do']

array([ 0.5360404 ,  0.7821148 ,  0.519133  , -0.26177096, -0.5698036 ,
        0.6429731 ,  0.56799567, -0.16616847, -0.29329345,  0.36652222,
       -0.71067363,  0.6698672 , -0.08130988,  0.03239027,  0.2183451 ,
        0.38308454,  0.36483544,  0.47769916, -0.5509241 , -0.8005504 ,
       -0.12362936,  0.08561587, -0.528271  ,  0.4450717 ,  0.11412261,
        0.6839521 ,  0.6146252 ,  0.14193049,  0.9081887 ,  0.1053213 ,
        0.8493558 ,  0.09275218], dtype=float32)

In [330]:
embedded = Word2Vec(concat_sentag, sg=1, min_count=0)

In [345]:
emb_sent = Word2Vec(splt_sent, sg=1, min_count=0)

[['<bos><start>', 'theyppss', 'dodo', 'not*', '<eos><end>'],
 ['<bos><start>', 'theyppss', 'dodo', 'toto', '<eos><end>']]

In [75]:
splt_sent = [s.split() for s in sent]
splt_tags = [t.split() for t in tags]

In [328]:
concat_sentag = []
for i in range(len(splt_sent)):
    sentence = []
    for ii in range(len(splt_sent[i])):
        sentence.append(''.join([splt_sent[i][ii].lower(), splt_tags[i][ii].lower()]))
    concat_sentag.append(sentence)

In [334]:
tk = Tokenizer()
tk.fit_on_texts(concat_sentag)
enc_sentences = tk.texts_to_sequences(concat_sentag)

X_enc_sent, y_enc_sent, X_rev_enc_sent, y_rev_enc_sent = oov.n_grams(enc_sentences, n)

vocab_size = len(tk.word_index)+1
print('Vocabulary Size: %d' % vocab_size)

Total Sequences: 193658
Vocabulary Size: 6273


In [335]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tk.word_index.items():
    embedding_matrix[i] = embedded.wv[word]

In [None]:
emb_sentag = []

c=0
for sent in concat_sentag:
    embedded_sent = []
    for word in sent:
        embedded_sent.append(list(embedded.wv[word]))
    emb_sentag.append(embedded_sent)

In [339]:
max_length = n-1

In [230]:
X_dim = X.shape[0]
in_dim = X.shape[2]

In [340]:
model_input = Input((3,), dtype='float32')
model_emb = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length,
                      trainable=False)(model_input)
model_emb = Bidirectional(LSTM(8))(model_input)
#model_emb = LSTM(8, input_shape=(100,32))(model_emb)
#model_emb = Dropout(0.5)(model_emb)
model_emb = Dense(vocab_size+1, activation='softmax')(model_emb)
model_emb = Model(inputs = model_input, outputs = model_emb)
model_emb.summary()

ValueError: "input_length" is 4, but received input has shape (None, 3)

In [285]:
model_emb.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

model_emb.fit(X, y, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model_emb.save('model_emb.h5')

ValueError: Error when checking input: expected input_49 to have 2 dimensions, but got array with shape (152443, 3, 100)

In [224]:
in_dim

100

In [226]:
X.shape

(152443, 3, 100)

In [341]:

model_sent = Sequential()
model_sent.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length,
                      trainable=False))
model_sent.add(Bidirectional(LSTM(8)))
model_sent.add(Dropout(0.5))
model_sent.add(Dense(vocab_size, activation='softmax'))
print(model_sent.summary())

Model: "sequential_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_90 (Embedding)     (None, 4, 100)            627300    
_________________________________________________________________
bidirectional_68 (Bidirectio (None, 16)                6976      
_________________________________________________________________
dropout_42 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_48 (Dense)             (None, 6273)              106641    
Total params: 740,917
Trainable params: 113,617
Non-trainable params: 627,300
_________________________________________________________________
None


In [343]:
model_sent.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001),
                       metrics=['accuracy'])

model_sent.fit(X_enc_sent, y_enc_sent, batch_size=128, epochs=5, verbose=1, shuffle=True, validation_split=0.2)

model_sent.save('model_emb.h5')

Train on 154926 samples, validate on 38732 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: 

In [None]:
from keras.utils import to_categorical
text = np.random.randint(5000, size=(442702, 200), dtype='int32')
topic = np.random.randint(2, size=(442702, 227), dtype='int32')
sentiment1 = to_categorical(np.random.randint(5, size=442702), dtype='int32')

from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, Concatenate, Lambda
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from keras.backend import cast
from keras.models import Model

In [None]:
text_input = Input(shape=(max_length,), dtype='int32', name='text')
text_encoded = Embedding(vocab_size, output_dim=100, weights=[embedding_matrix], trainable=False)(text_input)
text_encoded = Dropout(0.1)(text_encoded)
text_encoded = Conv1D(300, 3, padding='valid', activation='relu', strides=1)(text_encoded)
text_encoded = GlobalMaxPool1D()(text_encoded)

topic_input = Input(shape=(227,), dtype='int32', name='topic')

topic_float = Lambda(lambda x:cast(x, 'float32'), name='Floatconverter')(topic_input)

concatenated = Concatenate(axis=-1)([text_encoded, topic_float])
sentiment = Dense(5, activation='softmax')(concatenated)

model = Model(inputs=[text_input, topic_input], outputs=sentiment)
# summarize layers
print(model.summary())

In [None]:
model_sent = Sequential()
model_sent.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length,
                      trainable=False))
model_sent.add(Bidirectional(LSTM(8)))
model_sent.add(Dropout(0.5))
model_sent.add(Dense(vocab_size, activation='softmax'))
print(model_sent.summary())