In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer

from keras.models import Model, Sequential
from keras.layers import Input, LSTM, Dense, TimeDistributed, RepeatVector, Bidirectional, Embedding

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from nltk.tokenize import word_tokenize

import spacy

import csv

from nltk import tokenize
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

from keras.preprocessing.text import Tokenizer

from scipy.sparse import coo_matrix, hstack, csr_matrix
import sparse

from tensorflow.sparse import SparseTensor, to_dense

### Load data

In [2]:
# Load the dialogue dataset from a path relative to the notebook's working directory.
dataset = pd.read_csv('comedy_data/dataset.csv')

In [3]:
dataset.head()

Unnamed: 0,show,line_id,text,text_reply,line_text_len,line_reply_len
0,friends,s01_e01_c01_u001,there is nothing to tell! he is just some guy ...,come on you are going out with the guy! there ...,13,17
1,friends,s01_e01_c01_u002,come on you are going out with the guy! there ...,all right joey be nice. so does he have a hump...,17,16
2,friends,s01_e01_c01_u003,all right joey be nice. so does he have a hump...,wait does he eat chalk?,16,5
3,friends,s01_e01_c01_u011,oh yeah. had that dream.,then i look down and i realize there is a phon...,5,12
4,friends,s01_e01_c01_u012,then i look down and i realize there is a phon...,instead of...?,12,2


In [None]:
# # create final dataset with input and output sentences

# final_dataset = dataset.copy()
# final_dataset = final_dataset[['dialogue', 'text_proc']].rename(columns = {'text_proc': 'input'})
# final_dataset['output'] = final_dataset.groupby('dialogue')['input'].shift(-1)
# final_dataset = final_dataset.drop('dialogue', axis = 1).dropna()
# final_dataset['output'] = final_dataset['output'].apply(lambda text: "<GO> " + text)

In [None]:
#CREATE VOCABULARY
# Cap the vocabulary at 5000 minus 4 entries
# (presumably leaving room for the <UNK>/<EOS>/<PAD>/<GO> markers used by later cells — TODO confirm).
vocab_size1 = 5000-4
# NOTE(review): the dataset.head() preview earlier in the notebook lists the columns
# show, line_id, text, text_reply, ... and no `input` column. Confirm the CSV really
# provides `input`; otherwise this attribute access raises AttributeError.
content1 = dataset.input.values
# Token pattern: a run of lowercase letters, a run of digits, a run of ?/!/. that
# directly follows a word character, or an apostrophe (\x27) directly followed by a word character.
regex_pattern = '([a-z]+|[0-9]+|\\b[?!.]+|[\x27]\\b)'
vectorizer1= CountVectorizer(token_pattern=regex_pattern,max_features=vocab_size1)
# Fitting populates vectorizer1.vocabulary_, which rare_words_to_unk reads as its default vocabulary.
vectorizer1.fit(content1)

In [None]:
def tokenize_text(text, regex_pattern = '([a-z]+|[0-9]+|\\b[?!.]+|[\x27]\\b)'):
    """Lower-case ``text`` and split it into tokens with a regular expression.

    Args:
        text: the raw string to tokenise.
        regex_pattern: pattern handed to ``nltk.tokenize.regexp_tokenize``;
            the default matches the pattern used to fit the vocabulary.

    Returns:
        The list of matched tokens, in order of appearance.
    """
    lowered = text.lower()
    tokens = tokenize.regexp_tokenize(lowered, pattern=regex_pattern)
    return tokens

def rare_words_to_unk(tokenlist, vocab=vectorizer1.vocabulary_.keys(), replaceToken='<UNK>'):
    """Replace every token that is not in ``vocab`` with ``replaceToken``.

    Args:
        tokenlist: iterable of string tokens.
        vocab: container of known tokens; the default is the key view of the
            fitted ``vectorizer1`` vocabulary, bound when this function is defined.
        replaceToken: the marker substituted for unknown tokens.

    Returns:
        A new list with the same length and order as ``tokenlist``.
    """
    mapped = []
    for token in tokenlist:
        if token in vocab:
            mapped.append(token)
        else:
            mapped.append(replaceToken)
    return mapped

def add_go_eos_and_padding(tokenlist, max_sentence_length):
    """Append an ``<EOS>`` marker and right-pad with ``<PAD>`` tokens.

    Despite the name, no ``<GO>`` token is added here.

    Args:
        tokenlist: list of tokens; it is not modified.
        max_sentence_length: target number of tokens before the ``<EOS>`` marker.

    Returns:
        A new list: the tokens, one ``<EOS>``, then enough ``<PAD>`` entries to
        reach ``max_sentence_length + 1`` items. If ``tokenlist`` is already
        longer than ``max_sentence_length``, no padding is added.
    """
    missing = max_sentence_length - len(tokenlist)
    padded = tokenlist + ["<EOS>"]
    # range() of a non-positive count is empty, so over-long inputs get no padding
    padded.extend("<PAD>" for _ in range(missing))
    return padded

def tokens_to_text(tokenlist):
    """Join tokens into a single string separated by single spaces."""
    separator = " "
    return separator.join(tokenlist)

def process_tokens(tokenlist, max_sentence_length):
    """Turn a token list into its padded, space-joined training string.

    Steps, in order: unknown tokens are replaced via ``rare_words_to_unk``,
    ``<EOS>`` and ``<PAD>`` markers are appended via ``add_go_eos_and_padding``,
    and the result is joined with ``tokens_to_text``.

    Args:
        tokenlist: list of tokens for one sentence.
        max_sentence_length: padding target passed to ``add_go_eos_and_padding``.

    Returns:
        The processed sentence as one string.
    """
    known_only = rare_words_to_unk(tokenlist)
    padded = add_go_eos_and_padding(known_only, max_sentence_length)
    return tokens_to_text(padded)


# Tokenise every utterance with tokenize_text (same default regex as the vocabulary cell).
dataset['input_tokens'] = dataset.input.apply(tokenize_text)
# The longest utterance, measured in tokens, sets the common padded length.
max_sentence_length = dataset["input_tokens"].apply(len).max()
# Replace out-of-vocabulary tokens, append <EOS>, pad, and join back into one string per row.
dataset['text_proc'] = dataset.input_tokens.apply(lambda tokenlist: process_tokens(tokenlist, max_sentence_length))

In [None]:
dataset.head()

In [None]:
# create final dataset with input and output sentences

final_dataset = dataset.copy()
# Keep only the dialogue key and the processed text, which becomes the model input.
final_dataset = final_dataset[['dialogue', 'text_proc']].rename(columns = {'text_proc': 'input'})
# The output for each row is the next row's input within the same dialogue group.
final_dataset['output'] = final_dataset.groupby('dialogue')['input'].shift(-1)
# The last row of each group has no following row, so its output is NaN and the row is dropped.
final_dataset = final_dataset.drop('dialogue', axis = 1).dropna()
# Prefix each output with the <GO> start marker for the decoder.
final_dataset['output'] = final_dataset['output'].apply(lambda text: "<GO> " + text)


# Keep only the first 50000 pairs.
final_dataset = final_dataset[:50000]

In [None]:
final_dataset.head()

In [None]:
# integer encode the sentences

def integer_encoding(docs):
    """Fit a Keras ``Tokenizer`` on ``docs`` and return the fitted tokenizer.

    The function does not return encoded sequences; callers obtain them with
    ``texts_to_sequences`` on the returned object. As a side effect it prints
    the number of documents seen and the vocabulary size.

    Args:
        docs: iterable of strings whose tokens are separated by single spaces.

    Returns:
        The fitted ``Tokenizer``.
    """
    # filters='' disables punctuation stripping, so tokens are split on spaces only
    tokenizer = Tokenizer(filters = '', split=" ")

    # fit the tokenizer on the sentences
    tokenizer.fit_on_texts(docs)

    # summarize what was learned
    print('documents count: ', tokenizer.document_count)
    print('vocabulary size: ', len(tokenizer.word_counts))

    # The previous version also encoded every document here and threw the
    # result away; that extra pass over the corpus has been removed.
    return tokenizer


# all text sentences
# Inputs and outputs are stacked so that one tokenizer covers both sides.
docs = pd.concat([final_dataset.input, final_dataset.output], ignore_index = True)

# train encoder on complete text
int_encoder = integer_encoding(docs)

# integer encode inputs and outputs
# NOTE(review): a later cell calls texts_to_sequences on the same columns again to build
# encoder_input_data / decoder_target_data; confirm whether these two variables are still needed.
encoded_input = int_encoder.texts_to_sequences(final_dataset.input)
encoded_output = int_encoder.texts_to_sequences(final_dataset.output)

In [None]:
def build_tensor(encoder, encoded_docs):
    """One-hot encode integer sequences into a dense 3-D tensor.

    The previous version overwrote both parameters with the notebook globals
    ``int_encoder`` and ``decoder_target_data``, so the arguments were ignored.
    The function now uses what it is given.

    Args:
        encoder: fitted tokenizer; only ``len(encoder.word_counts)`` is read,
            to size the last axis.
        encoded_docs: rectangular 2-D array-like of integer word indices, one
            row per document. Indices must lie in ``[0, vocabulary size)``,
            i.e. they are expected to be already shifted to start at 0.

    Returns:
        Dense tensor of shape ``[nr_docs, len_sentence, vocab_size]`` holding
        1.0 at ``[doc, position, word_index]`` and 0.0 elsewhere.

    Raises:
        ValueError: if ``encoded_docs`` is not a rectangular 2-D collection.
    """
    docs = np.asarray(encoded_docs, dtype=np.int64)
    if docs.ndim != 2:
        raise ValueError(
            "encoded_docs must be a rectangular 2-D collection, got ndim=%d" % docs.ndim)

    nr_docs, len_sentence = docs.shape

    # vocabulary size
    vocab_size = len(encoder.word_counts)

    # One (document, position, word) triple per token, in row-major order:
    # document ids repeat per position, position ids cycle per document.
    doc_ids = np.repeat(np.arange(nr_docs, dtype=np.int64), len_sentence)
    positions = np.tile(np.arange(len_sentence, dtype=np.int64), nr_docs)
    indices = np.stack([doc_ids, positions, docs.ravel()], axis=1)

    # every listed coordinate carries the value 1.0
    data = np.ones(len_sentence * nr_docs)

    sparse_one_hot = SparseTensor(indices = indices, values = data,
                                  dense_shape = [nr_docs, len_sentence, vocab_size])

    # transform to dense
    return to_dense(sparse_one_hot)

In [None]:
from tensorflow import convert_to_tensor

In [None]:
# Index encode the sentences

# Subtracting 1 shifts the tokenizer's indices so that they start at 0.
encoder_input_data = np.array(int_encoder.texts_to_sequences(final_dataset.input)) - 1
print(encoder_input_data.shape)

decoder_target_data = np.array(int_encoder.texts_to_sequences(final_dataset.output)) - 1

print(decoder_target_data.shape)

# shift the decoder input / target data so that the target predicts the next word after the input

# The input must be sliced first: the next statement rebinds decoder_target_data.
decoder_input_data = decoder_target_data[:,:-1] # remove last word

decoder_target_data = decoder_target_data[:,1:] # remove first word

print(decoder_target_data.shape)

In [None]:
# One-hot encode the shifted decoder targets into a dense tensor
# (documents x positions x vocabulary); this is what model.fit receives as the target.
decoder_target_data_ohe = build_tensor(int_encoder, decoder_target_data)

In [None]:
# Number of distinct tokens known to the tokenizer; used to size embeddings and the output layer.
vocab_size = len(int_encoder.word_index)
print('vocabulary size: ', vocab_size)

embedding_dim = 50

# Sequence lengths are taken from the encoded arrays (second axis).
max_length_in = encoder_input_data.shape[1]
print(max_length_in)
max_length_out = decoder_target_data.shape[1]
print(max_length_out)

# latent_dim = 100

# batch_size = 32
# epochs = 20

In [None]:
# Load Embeddings
embedding_dim = 50

# load the whole embedding into memory as {word: float32 vector}
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse (the
# handle used to be left open). The encoding is pinned to UTF-8 so that
# non-ASCII tokens load the same way regardless of the platform default.
with open('glove6B/glove_6B_50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Build the embedding weight matrix: one row per vocabulary entry.
# Rows for words without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in int_encoder.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    # word_index is shifted by one so rows line up with the 0-based encoded data
    embedding_matrix[i-1] = embedding_vector

#### Embedding Layer

arguments:

- input_dim: integer. size of vocabulary (i.e. maximum integer index + 1)
- output_dim: dimension of the dense embedding  

In [None]:
from keras.layers import Flatten, TimeDistributed

In [None]:
# Encoder and decoder share the single vocabulary built from inputs and outputs together.
num_encoder_tokens = vocab_size
num_decoder_tokens = vocab_size
# The LSTM width is tied to the embedding size here; embedding_matrix has embedding_dim columns.
latent_dim = embedding_dim

batch_size = 32
epochs = 10

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))

# Frozen embedding initialised from the pretrained matrix.
embedder_in = Embedding(num_encoder_tokens, # vocabulary size
                        latent_dim, # embeddings vector size
                        weights=[embedding_matrix],
                        trainable = False)

encoder_inputs_embedded = embedder_in(encoder_inputs)

# Only the final hidden and cell states are kept; the encoder's output sequence is not used.
encoder_lstm = LSTM(latent_dim, return_state=True)
encoded_lstm, state_h, state_c = encoder_lstm(encoder_inputs_embedded)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(max_length_out,))

# embedder_out = Embedding(num_decoder_tokens, latent_dim)
# A second frozen embedding layer, initialised from the same matrix as embedder_in.
embedder_out = Embedding(num_decoder_tokens, # vocabulary size
                        latent_dim, # embeddings vector size
                        weights=[embedding_matrix],
                        trainable = False)
decoder_inputs_embedded = embedder_out(decoder_inputs)

# return_sequences=True yields one output per time step; the returned states are
# discarded here and picked up again by the inference model built in a later cell.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_embedded, initial_state=encoder_states)

# Per-time-step probability distribution over the vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!
# The last 20% of the samples are held out for validation.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data_ohe,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)


In [None]:
# Inference-time encoder: maps an input sequence to the LSTM's final [h, c] states.
encoder_model = Model(encoder_inputs, encoder_states)

# At inference the decoder's initial state is supplied explicitly, so that it
# can be stepped and fed its own previous state.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Variable-length token input, one sequence at a time.
decoder_inputs_ = Input(shape=(None,), batch_size = 1)

# Reuse the embedding layer the decoder was trained with. Creating a fresh
# Embedding here (as before) would give the inference decoder randomly
# initialised vectors that the trained LSTM and Dense layers have never seen.
x = embedder_out(decoder_inputs_)

# Same trained LSTM and Dense layers as the training graph, now also exposing the states.
decoder_outputs_, state_h, state_c = decoder_lstm(x, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]

decoder_outputs_ = decoder_dense(decoder_outputs_)

decoder_model = Model(
    [decoder_inputs_] + decoder_states_inputs,
    [decoder_outputs_] + decoder_states)


In [None]:
int_encoder.word_index['<go>']

In [None]:
# NOTE(review): these assignments come after the model was already built and fitted with
# batch_size = 32, epochs = 10 and latent_dim = embedding_dim, and nothing visible below reads
# them. Re-running the inference-model cell after this one would size its state inputs with
# latent_dim = 256 — confirm whether this cell is a leftover that should be removed.
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

In [None]:
# Greedy decoding of one encoded sentence, one token per step.
max_decoder_seq_length = max_length_out

# `sentence_encoded` is produced by the sentence-preparation cell (the one that
# tokenises and integer-encodes a raw sentence); that cell must be run first.
# texts_to_sequences yields the tokenizer's own indices, whereas the model was
# trained on indices shifted down by one, so apply the same shift here.
input_seq = np.array(sentence_encoded) - 1

# Encode the input as state vectors.
states_value = list(encoder_model.predict(input_seq))

# Start from the <go> marker, fed as a length-1 sequence of shifted indices.
target_seq = np.array([[int_encoder.word_index['<go>'] - 1]])

# Sampling loop for a batch of sequences
# (to simplify, here we assume a batch of size 1).
decoded_tokens = []
stop_condition = False
while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

    # Greedy choice from the distribution produced for the token just fed.
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

    # index_word is keyed by the tokenizer's unshifted indices, hence the + 1.
    sampled_char = int_encoder.index_word[sampled_token_index + 1]
    decoded_tokens.append(sampled_char)

    # Exit condition: either hit max length (counted in tokens, not characters)
    # or find the end-of-sentence marker.
    if sampled_char == '<eos>' or len(decoded_tokens) >= max_decoder_seq_length:
        stop_condition = True

    # Feed back only the sampled token, in the model's shifted index space,
    # together with the states it produced. Re-feeding the whole padded
    # sequence while also carrying the states forward would process the
    # prefix twice.
    target_seq = np.array([[sampled_token_index]])
    states_value = [h, c]

decoded_sentence = " ".join(decoded_tokens)
print(decoded_sentence)

In [None]:
# Example sentence to run through the trained model.
sentence = 'hi how are you'

# Same preprocessing as the training text: tokenise, replace unknown tokens, add <EOS>, pad.
sentence_proc = process_tokens(tokenize_text(sentence), max_sentence_length)
sentence_proc = sentence_proc.lower()

# Integer-encode with the fitted tokenizer; wrapping in a Series gives one row for the one sentence.
# NOTE(review): the decoding cell above reads `sentence_encoded`, so on a fresh kernel this cell
# has to be executed before it — consider moving it up.
sentence_encoded = int_encoder.texts_to_sequences(pd.Series(sentence_proc))

# a, b = decode_sequence(sentence_encoded)

# sentence_matrix = build_tensor(int_encoder, sentence_encoded)