In [2]:
from keras.models import Model, load_model

from keras.layers.recurrent import LSTM
from keras.layers import Dense, Input, Embedding

from keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

from collections import Counter

import nltk
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

np.random.seed(2018)

Using TensorFlow backend.


In [15]:
# set default parameters
BATCH_SIZE = 128
NUM_EPOCHS = 100
HIDDEN_UNITS = 100#256
MAX_INPUT_SEQ_LENGTH = 20
MAX_TARGET_SEQ_LENGTH = 20
MAX_VOCAB_SIZE = 100

input_counter = Counter()
target_counter = Counter()

# read the data

df = pd.read_csv('SARC_DS.csv')


In [16]:
df.head()

Unnamed: 0,Questions,Answer_1,Answer_2,all
0,I've been searching for the answer for this fo...,Religion must have the answer,It's obviously tracks from a giant water tract...,I've been searching for the answer for this fo...
1,"Michael Phelps Apologizes For ""Regrettable"" Be...",Wow...he smoked pot...oh lord hes such a horri...,"Wow, his girlfriend is uhm... Ah fuck it, he's...","Michael Phelps Apologizes For ""Regrettable"" Be..."
2,Utah wants to create a database to track the i...,I think the government should track every morm...,Another idea from the party that wants to get ...,Utah wants to create a database to track the i...
3,The Six Million Dead Jews of World War ONE!,"Oh right, *both* wars were just jewish conspir...","i know this seems strange but, what if he was ...",The Six Million Dead Jews of World War ONE! Oh...
4,WSJ begins the Jeb Bush campaign for 2016,Good luck with that.,time to get that shack in montana.,WSJ begins the Jeb Bush campaign for 2016 Good...


In [17]:
df.columns

Index(['Questions', 'Answer_1', 'Answer_2', 'all'], dtype='object')

In [19]:
df['QandA'] = df['all']+ ' '+ df['Answer_2']

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128540 entries, 0 to 128539
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Questions  128540 non-null  object
 1   Answer_1   128540 non-null  object
 2   Answer_2   128540 non-null  object
 3   all        128540 non-null  object
 4   QandA      128540 non-null  object
dtypes: object(5)
memory usage: 4.9+ MB


In [21]:
df=df.drop('all', axis = 1)

In [22]:
line = df.QandA
input_texts = []
target_texts = []

In [23]:
len(line)

128540

In [24]:
prev_words = []

for line in line:

    next_words = [w.lower() for w in nltk.word_tokenize(line) if w.isalpha()]
    #next_words = nltk.pos_tag(next_words)

    if len(next_words) > MAX_TARGET_SEQ_LENGTH:
        next_words = next_words[0:MAX_TARGET_SEQ_LENGTH]

    if len(prev_words) > 0:
        input_texts.append(prev_words)
        for w in prev_words:
            input_counter[w] += 1
        target_words = next_words[:]
        target_words.insert(0, 'START')
        target_words.append('END')
        for w in target_words:
            target_counter[w] += 1
        target_texts.append(target_words)

    prev_words = next_words

In [25]:
# encode the data
input_word2idx = dict()
for idx, word in enumerate(input_counter.most_common(MAX_VOCAB_SIZE)):
    input_word2idx[word[0]] = idx + 2
    
target_word2idx = dict()
for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)):
    target_word2idx[word[0]] = idx + 1

input_word2idx['PAD'] = 0
input_word2idx['UNK'] = 1
target_word2idx['UNK'] = 0

input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()])
target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])

num_encoder_tokens = len(input_idx2word)
num_decoder_tokens = len(target_idx2word)


encoder_input_data = []

encoder_max_seq_length = 0
decoder_max_seq_length = 0



for input_words, target_words in zip(input_texts, target_texts):
    
    encoder_input_wids = []
    
    for w in input_words:
        w2idx = 1
        if w in input_word2idx:
            w2idx = input_word2idx[w]
        encoder_input_wids.append(w2idx)

    encoder_input_data.append(encoder_input_wids)
    
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_words), decoder_max_seq_length)

context = dict()

context['num_encoder_tokens'] = num_encoder_tokens
context['num_decoder_tokens'] = num_decoder_tokens

context['encoder_max_seq_length'] = encoder_max_seq_length
context['decoder_max_seq_length'] = decoder_max_seq_length

In [26]:
# custom function to generate batches

def generate_batch(input_data, output_text_data):
    num_batches = len(input_data) // BATCH_SIZE
    while True:
        for batchIdx in range(0, num_batches):
            start = batchIdx * BATCH_SIZE
            end = (batchIdx + 1) * BATCH_SIZE
            encoder_input_data_batch = pad_sequences(input_data[start:end], encoder_max_seq_length)
            decoder_target_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            decoder_input_data_batch = np.zeros(shape=(BATCH_SIZE, decoder_max_seq_length, num_decoder_tokens))
            for lineIdx, target_words in enumerate(output_text_data[start:end]):
                for idx, w in enumerate(target_words):
                    w2idx = 0
                    if w in target_word2idx:
                        w2idx = target_word2idx[w]
                    decoder_input_data_batch[lineIdx, idx, w2idx] = 1
                    if idx > 0:
                        decoder_target_data_batch[lineIdx, idx - 1, w2idx] = 1
            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

In [27]:
# Compiling and training

encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=HIDDEN_UNITS,
                              input_length=encoder_max_seq_length, name='encoder_embedding')

encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                 initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(loss='categorical_crossentropy', optimizer='adam')

Instructions for updating:
Colocations handled automatically by placer.


In [28]:
X_train, X_test, y_train, y_test = train_test_split(encoder_input_data, target_texts, test_size=0.2, random_state=42)

train_gen = generate_batch(X_train, y_train)
test_gen = generate_batch(X_test, y_test)

train_num_batches = len(X_train) // BATCH_SIZE
test_num_batches = len(X_test) // BATCH_SIZE

In [47]:
# model.fit_generator(generator=train_gen,
#                     steps_per_epoch=train_num_batches,
#                     epochs=NUM_EPOCHS,
#                     verbose=1,
#                      validation_data=test_gen,
#                      validation_steps=test_num_batches,
#                     ###callbacks = my_callbacks
#                    )

In [30]:
model = load_model('model_QnAcomb_100epochs.h5')

In [31]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)

In [32]:
input_text = input()
input_seq = []
input_wids = []
max_encoder_seq_length = 20
max_decoder_seq_length = 10

for word in nltk.word_tokenize(input_text.lower()):
    idx = 1
    if word in input_word2idx:
        idx = input_word2idx[word]
    input_wids.append(idx)
    
input_seq.append(input_wids)
input_seq = pad_sequences(input_seq, max_encoder_seq_length)
states_value = encoder_model.predict(input_seq)
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, target_word2idx['START']] = 1
target_text = ''
target_text_len = 0
terminated = False

while not terminated:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    sample_token_idx = np.argmax(output_tokens[0, -1, :])
    sample_word = target_idx2word[sample_token_idx]
    target_text_len += 1

    if sample_word != 'START' and sample_word != 'END':
        target_text += ' ' + sample_word

    if sample_word == 'END' or target_text_len >= max_decoder_seq_length:
        terminated = True

    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sample_token_idx] = 1

    states_value = [h, c]
    

target_text.strip().replace('UNK', '')

What are you doing?


'w elder camping camping between chairs ubi swear swear ma'

In [33]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 20, 256)      2560512     encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 10001)  0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 256), (None, 525312      encoder_embedding[0][0]          
____________________________________________________________________________________________