In [50]:
import numpy
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Embedding, RepeatVector, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from keras.models import load_model

import re
import os
import json
import pickle
import keras
import random
import pandas as pd

import oov_prep as oov

In [None]:
# Movie conversation cleaned and POS tagged data
with open('clean_tagged_data.json', 'r') as outfile:
    data = json.load(outfile)

sent, tags = zip(*data['tags'])

In [98]:
len(sent)

289401

In [99]:
data = sent[:10000]

In [100]:
tk.fit_on_texts(data)

enc_sentences = tk.texts_to_sequences(data)

max_length = 6

X, y, rev_X, rev_y = oov.n_grams(enc_sentences, max_length)

vocab_size = len(tk.word_index)
print('Vocabulary Size: %d' % vocab_size)

Total Sequences: 462168
Vocabulary Size: 8887


In [2]:
tk = Tokenizer(oov_token='<UNK>')
tk.fit_on_texts(sent)
tk.fit_on_texts(tags)

enc_sentences = tk.texts_to_sequences(sent)
enc_tagged = tk.texts_to_sequences(tags)

X_enc_sent, y_enc_sent, X_rev_enc_sent, y_rev_enc_sent = oov.n_grams(enc_sentences)
X_enc_tags, y_enc_tags, X_rev_enc_tags, y_rev_enc_tags = oov.n_grams(enc_tagged)

vocab_size = len(tk.word_index)
print('Vocabulary Size: %d' % vocab_size)

X_enc_sent,y_enc_sent

Total Sequences: 10
Vocabulary Size: 15


(array([[0, 0, 2],
        [0, 0, 4],
        [0, 4, 5],
        [4, 5, 6],
        [0, 0, 5],
        [0, 5, 6],
        [5, 6, 7],
        [0, 0, 6],
        [0, 6, 7],
        [0, 0, 7]]),
 array([[3],
        [5],
        [6],
        [7],
        [6],
        [7],
        [8],
        [7],
        [8],
        [8]]))

b'Skipping line 144001: expected 5 fields, saw 6\nSkipping line 144113: expected 5 fields, saw 6\nSkipping line 144283: expected 5 fields, saw 6\nSkipping line 144328: expected 5 fields, saw 6\nSkipping line 144337: expected 5 fields, saw 6\nSkipping line 144400: expected 5 fields, saw 6\nSkipping line 144438: expected 5 fields, saw 6\nSkipping line 225183: expected 5 fields, saw 9\nSkipping line 225288: expected 5 fields, saw 41\nSkipping line 225302: expected 5 fields, saw 6\nSkipping line 225394: expected 5 fields, saw 6\nSkipping line 225625: expected 5 fields, saw 6\n'


In [57]:
random.shuffle(data)
data_train = data[:100000]
data_test = data[100000:150000]

In [105]:
# define forward sequence model
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=max_length-1))
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.5))
#model.add(RepeatVector(1))
#model.add(LSTM(64, return_sequences=True))
#model.add(Bidirectional(LSTM(64)))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential_54"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_51 (Embedding)     (None, 5, 32)             284384    
_________________________________________________________________
bidirectional_25 (Bidirectio (None, 512)               591872    
_________________________________________________________________
dropout_46 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_37 (Dense)             (None, 8887)              4559031   
Total params: 5,435,287
Trainable params: 5,435,287
Non-trainable params: 0
_________________________________________________________________
None


In [104]:
# define reverse model
rev_model = Sequential()
rev_model.add(Embedding(vocab_size, 32, input_length=max_length-1))
#rev_model.add(LSTM(64))
rev_model.add(Bidirectional(LSTM(512)))
rev_model.add(Dense(vocab_size, activation='softmax'))
print(rev_model.summary())

Model: "sequential_53"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_50 (Embedding)     (None, 5, 32)             284384    
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 1024)              2232320   
_________________________________________________________________
dense_36 (Dense)             (None, 8887)              9109175   
Total params: 11,625,879
Trainable params: 11,625,879
Non-trainable params: 0
_________________________________________________________________
None


In [81]:
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=max_length-1))
model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
#model.add(LSTM(16, return_sequences=True, activation='relu'))
#model.add(Flatten())
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential_42"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_39 (Embedding)     (None, 5, 64)             341888    
_________________________________________________________________
conv1d_62 (Conv1D)           (None, 2, 64)             16448     
_________________________________________________________________
max_pooling1d_59 (MaxPooling (None, 1, 64)             0         
_________________________________________________________________
dropout_34 (Dropout)         (None, 1, 64)             0         
_________________________________________________________________
bidirectional_20 (Bidirectio (None, 64)                24832     
_________________________________________________________________
dropout_35 (Dropout)         (None, 64)                0         
_________________________________________________________________
batch_normalization_12 (Batc (None, 64)              

In [None]:
# compile forward sequence network
# loss is set to sparse_cat_cross because of multiple classes and no one-hot encoding
model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['acc'])
# fit network
model.fit(X, y,batch_size=128, epochs=20, verbose=1, shuffle=True, validation_split=0.2)
# save the model to file
model.save('model_oov.h5')
# 0.02 0.05 0.11 0.16 0.17 (128 512) - basic model


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 369734 samples, validate on 92434 samples
Epoch 1/20

In [None]:
# compile reverse sequence network
rev_model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.002), metrics=['accuracy'])
# fit network
rev_model.fit(rev_X, rev_y,batch_size=128, epochs=200, verbose=2, shuffle=True, validation_split=0.2)
# save the model to file
rev_model.save('rev_model_oov.h5')

In [10]:
len(data)

934

In [28]:
data

['<BOS> what would you give him all the lines for he is unintelligible <EOS>',
 '<BOS> she really liked the kitchen <EOS>',
 '<BOS> we do not have to disconnect them we can wheel the stand with the bed <EOS>',
 '<BOS> cannot we see the unfertilized host eggs <EOS>',
 '<BOS> have you been logging on <EOS>',
 '<BOS> oh this is your file i have to send it back to the judge with my evaluation <EOS>',
 '<BOS> i will go with you she is still got some of my stuff <EOS>',
 '<BOS> pinkys records and disc in the shopping center i amma talk to my boss and see if he got a little position for you because you been unemployed for a long time now craig <EOS>',
 '<BOS> speeding <EOS>',
 '<BOS> my car <EOS>',
 '<BOS> big ones little ones teenyweeny onesjust and otherwise <EOS>',
 '<BOS> i am not very hungry thank you <EOS>',
 '<BOS> so we will help lets get cracking <EOS>',
 '<BOS> a southern gentlemen i do not recall his name i suppose it is unkind of me to say it but he had the most disconcerting glas

In [37]:
import random

In [46]:
random.shuffle(data)
data

['<BOS> personally james i am very excited by this new arrangement we have it gives us direct access to literally hundreds of american collages twentieth century history is a growth area over there do not ask me why your readership will be first and second year american collage students <EOS>',
 '<BOS> you are not touching that car <EOS>',
 '<BOS> okay we will see you later then <EOS>',
 '<BOS> no we are almost through <EOS>',
 '<BOS> well alright there is something in me jack i feel it i do not know what it is whether i should be an artist or i do not know a dancer like isadora duncan a wild pagan spirit <EOS>',
 '<BOS> dinner in ten minutes <EOS>',
 '<BOS> a faithful heart makes wishes come true <EOS>',
 '<BOS> whatta we do now <EOS>',
 '<BOS> this mission was a scam from the word go <EOS>',
 '<BOS> look zeddemore it was not my fault you were too stupid to drop that line <EOS>',
 '<BOS> my visual says sunny skies and seventy degrees <EOS>',
 '<BOS> i know you do you and your televisi

In [45]:
data

['<BOS> yeah you know piano bars plays the piano and sings that is how they met <EOS>',
 '<BOS> let me think i need time to think <EOS>',
 '<BOS> bhey guy relax put out you gotta relax shut your mouth baby i would do anything for ya now do not make me smack you in the eye like last timeb <EOS>',
 '<BOS> oh manwhat happened <EOS>',
 '<BOS> we sure picked the wrong night to find a cemetery lets turn back <EOS>',
 '<BOS> we think <EOS>',
 '<BOS> so they tell me not soon enough of course how are you sal you look fantastic it changes your life you know a baby it puts everything in perspective does not it does not it mac you cannot be the center of your own world anymore <EOS>',
 '<BOS> oh please do not let me interrupt finish your phone call <EOS>',
 '<BOS> do you think is was a good idea to launch an attack on a holy day <EOS>',
 '<BOS> how do you know my dimwitted inexperience is not merely a subtle form of manipulation used to lower peoples expectations thereby enhancing my ability to ef

In [56]:
len(data)

266842

In [None]:
from keras import Model
from keras.layers.core import Dense, Activation
from keras.layers import Conv2D, Conv1D, MaxPooling2D, Reshape, Concatenate, Dropout , MaxPooling1D, Flatten
from keras.layers import Dense, Input

In [96]:
model_sent = Sequential()
model_sent.add(Embedding(vocab_size, 64, input_length=max_length-1))
model_sent.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
model_sent.add(MaxPooling1D(pool_size=2))
model_sent.add(Dropout(0.2))
model_sent.add(Flatten())
model_sent.add(Dense(vocab_size, activation='softmax'))
print(model_sent.summary())

Model: "sequential_50"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_47 (Embedding)     (None, 5, 64)             341888    
_________________________________________________________________
conv1d_70 (Conv1D)           (None, 2, 32)             8224      
_________________________________________________________________
max_pooling1d_67 (MaxPooling (None, 1, 32)             0         
_________________________________________________________________
dropout_44 (Dropout)         (None, 1, 32)             0         
_________________________________________________________________
flatten_21 (Flatten)         (None, 32)                0         
_________________________________________________________________
dense_33 (Dense)             (None, 5342)              176286    
Total params: 526,398
Trainable params: 526,398
Non-trainable params: 0
_______________________________________________

In [88]:
model_rev_sent = Sequential()
model_rev_sent.add(Embedding(vocab_size, 64, input_length=max_length-1))
model_rev_sent.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
model_rev_sent.add(MaxPooling1D(pool_size=2))
model_rev_sent.add(Dropout(0.2))
model_rev_sent.add(Flatten())
model_rev_sent.add(Dense(vocab_size, activation='softmax'))
print(model_rev_sent.summary())

Model: "sequential_49"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_46 (Embedding)     (None, 5, 64)             341888    
_________________________________________________________________
conv1d_69 (Conv1D)           (None, 2, 32)             8224      
_________________________________________________________________
max_pooling1d_66 (MaxPooling (None, 1, 32)             0         
_________________________________________________________________
dropout_43 (Dropout)         (None, 1, 32)             0         
_________________________________________________________________
flatten_20 (Flatten)         (None, 32)                0         
_________________________________________________________________
dense_32 (Dense)             (None, 5342)              176286    
Total params: 526,398
Trainable params: 526,398
Non-trainable params: 0
_______________________________________________

In [87]:
model_tags = Sequential()
model_tags.add(Embedding(vocab_size, 64, input_length=max_length-1))
model_tags.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
model_tags.add(MaxPooling1D(pool_size=2))
model_tags.add(Dropout(0.2))
model_tags.add(Flatten())
model_tags.add(Dense(vocab_size, activation='softmax'))
print(model_tags.summary())

Model: "sequential_48"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_45 (Embedding)     (None, 5, 64)             341888    
_________________________________________________________________
conv1d_68 (Conv1D)           (None, 2, 32)             8224      
_________________________________________________________________
max_pooling1d_65 (MaxPooling (None, 1, 32)             0         
_________________________________________________________________
dropout_42 (Dropout)         (None, 1, 32)             0         
_________________________________________________________________
flatten_19 (Flatten)         (None, 32)                0         
_________________________________________________________________
dense_31 (Dense)             (None, 5342)              176286    
Total params: 526,398
Trainable params: 526,398
Non-trainable params: 0
_______________________________________________

In [95]:
model_rev_tags = Sequential()
model_rev_tags.add(Embedding(vocab_size, 64, input_length=max_length-1))
model_rev_tags.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
model_rev_tags.add(MaxPooling1D(pool_size=2))
model_rev_tags.add(Dropout(0.2))
model_rev_tags.add(Flatten())
model_rev_tags.add(Dense(vocab_size, activation='softmax'))
print(model_rev_tags.summary())

AttributeError: 'Tensor' object has no attribute 'add'

In [94]:
merged = Concatenate()([model_sent, model_rev_sent])#, model_tags, model_rev_tags])
output = Dense(vocab_size, activation='softmax')(merged)

#model_final = Model(inputs=[in_1D, in_2D], outputs=[output])

ValueError: Layer concatenate_4 was called with an input that isn't a symbolic tensor. Received type: <class 'keras.engine.sequential.Sequential'>. Full input: [<keras.engine.sequential.Sequential object at 0x00000175CFCC4D88>, <keras.engine.sequential.Sequential object at 0x00000175CFDE7C48>]. All inputs to the layer should be tensors.

In [None]:
# compile forward sequence network
# loss is set to sparse_cat_cross because of multiple classes and no one-hot encoding
model_sent.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['acc'])
# fit network
model_sent.fit(X, y,batch_size=128, epochs=20, verbose=1, shuffle=True, validation_split=0.2)
# save the model to file
model_sent.save('model_oov.h5')