# New playground

## Explore the input file

In [2]:
import numpy as np
from os import path
import string
import csv
from nltk.tokenize import word_tokenize, sent_tokenize
from keras.preprocessing.text import Tokenizer

import matplotlib.pyplot as plt
%matplotlib inline 

Using TensorFlow backend.


In [16]:
# Don't filter . and ?
punctuations_filter='!"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'

def remove_punctuations(sentence):
    return sentence.translate(str.maketrans('', '', punctuations_filter))

def remove_dots(sentence):
    return sentence.replace('...', ' ')

def remove_nonascii(word):
    return ''.join([char if ord(char) < 128 else '' for char in word])

def make_lower(word):
    return word.lower()

def clean_word(word):
    processed = remove_nonascii(word)
    processed = make_lower(processed)
    if not processed.isdigit():
        return processed
    else:
        return ''

# We will just use dialogue by Jerry
input_file_path = path.join("..", "input_files", "complete_seinfeld_scripts.csv")

sentences_by_jerry = []
with open(input_file_path) as input_file:
    input_data = csv.DictReader(input_file)
    for row in input_data:
        if(row['Character'] == 'JERRY'):
            for sentence in sent_tokenize(row['Dialogue']):
                sentence = remove_dots(sentence)
                sentence = remove_punctuations(sentence)
                sentences_by_jerry.append([clean_word(word) for word in word_tokenize(sentence)])
print("There are {} sentences by Jerry in all Seinfeld episodes!\n".format(len(sentences_by_jerry)))
print("Few words by Jerry:")
print(sentences_by_jerry[0:30])

There are 24073 sentences by Jerry in all Seinfeld episodes!

Few words by Jerry:
[['do', 'you', 'know', 'what', 'this', 'is', 'all', 'about', '?'], ['do', 'you', 'know', 'why', 'were', 'here', '?'], ['to', 'be', 'out', 'this', 'is', 'out', 'and', 'out', 'is', 'one', 'of', 'the', 'single', 'most', 'enjoyable', 'experiences', 'of', 'life', '.'], ['people', 'did', 'you', 'ever', 'hear', 'people', 'talking', 'about', 'we', 'should', 'go', 'out', '?'], ['this', 'is', 'what', 'theyre', 'talking', 'about', 'this', 'whole', 'thing', 'were', 'all', 'out', 'now', 'no', 'one', 'is', 'home', '.'], ['not', 'one', 'person', 'here', 'is', 'home', 'were', 'all', 'out'], ['there', 'are', 'people', 'tryin', 'to', 'find', 'us', 'they', 'dont', 'know', 'where', 'we', 'are', '.'], ['on', 'an', 'imaginary', 'phone', 'did', 'you', 'ring', '?', 'i', 'cant', 'find', 'him', '.'], ['where', 'did', 'he', 'go', '?'], ['he', 'didnt', 'tell', 'me', 'where', 'he', 'was', 'going', '.'], ['he', 'must', 'have', 'gone',

In [23]:
punctuations_filter='!"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~0123456789'

def remove_punctuations(sentence):
    return sentence.translate(str.maketrans('', '', punctuations_filter))

remove_punctuations("My name is sdf;ffd 10, 11, 12")

'My name is sdfffd   '

In [17]:
# Create a tokenizer
tokenizer = Tokenizer(filters=punctuations_filter)

# And build the word index
tokenizer.fit_on_texts(sentences_by_jerry)

# This is how we can recover the word index that was computed
word_index = tokenizer.word_index

# Vocabulary size
vocabulary_size = len(word_index) + 1
print("Vocabulary size is: {}.".format(vocabulary_size))

Vocabulary size is: 9928.


In [18]:
sequence_max_len = 3

prefix_word = []
target_word = []
for dialogue in sentences_by_jerry:
    for i in range (len(dialogue) - sequence_max_len):
        prefix_word.append(dialogue[i: i + sequence_max_len])
        target_word.append(dialogue[i + sequence_max_len])

print(prefix_word[0:3])
print(target_word[0:100])

print(len(prefix_word))
print(len(target_word))

[['do', 'you', 'know'], ['you', 'know', 'what'], ['know', 'what', 'this']]
['what', 'this', 'is', 'all', 'about', '?', 'why', 'were', 'here', '?', 'this', 'is', 'out', 'and', 'out', 'is', 'one', 'of', 'the', 'single', 'most', 'enjoyable', 'experiences', 'of', 'life', '.', 'ever', 'hear', 'people', 'talking', 'about', 'we', 'should', 'go', 'out', '?', 'theyre', 'talking', 'about', 'this', 'whole', 'thing', 'were', 'all', 'out', 'now', 'no', 'one', 'is', 'home', '.', 'here', 'is', 'home', 'were', 'all', 'out', 'tryin', 'to', 'find', 'us', 'they', 'dont', 'know', 'where', 'we', 'are', '.', 'phone', 'did', 'you', 'ring', '?', 'i', 'cant', 'find', 'him', '.', 'go', '?', 'me', 'where', 'he', 'was', 'going', '.', 'gone', 'out', '.', 'go', 'out', 'you', 'get', 'ready', 'you', 'pick', 'out', 'the', 'clothes', 'right']
100853
100853


In [19]:
# This turns strings into lists of integer indices.
prefix_sequences = tokenizer.texts_to_sequences(prefix_word)
target = tokenizer.texts_to_sequences(target_word)
print(target[:100])

target_sequences = []
for sequence in target:
    for seq in sequence:
        target_sequences.append(seq)

print(prefix_sequences[0:3])
print(target_sequences[0:3])
print(len(prefix_sequences))
print(len(target_sequences))

[[9], [15], [14], [40], [45], [2], [69], [78], [61], [2], [15], [14], [37], [11], [37], [14], [77], [12], [4], [1176], [445], [4804], [4805], [12], [192], [1], [179], [241], [127], [188], [45], [49], [138], [53], [37], [2], [113], [188], [45], [15], [161], [97], [78], [40], [37], [81], [17], [77], [14], [267], [1], [61], [14], [267], [78], [40], [37], [2111], [7], [256], [190], [51], [19], [16], [112], [49], [41], [1], [137], [66], [3], [1817], [2], [5], [73], [256], [63], [1], [53], [2], [20], [112], [29], [35], [71], [1], [543], [37], [1], [53], [37], [3], [38], [471], [3], [298], [37], [4], [431], [50]]
[[23, 3, 16], [3, 16, 9], [16, 9, 15]]
[9, 15, 14]
100853
100740


In [15]:
from keras.utils import to_categorical

# Normalize
X = np.array(prefix_sequences)
print(X.shape)
y = np.array(target_sequences)
print(y.shape)

X = X / float(vocabulary_size)
y = to_categorical(y, num_classes=vocabulary_size)
print(y[0:3])

NameError: name 'prefix_sequences' is not defined

In [12]:
# Options are 50, 100, 200, 300
embedding_dim = 100

glove_dir = 'glove.6B'

embeddings_index = {}
f = open(path.join("..", glove_dir, 'glove.6B.{}d.txt'.format(embedding_dim)))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [13]:
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < vocabulary_size:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=99)

print("The training set has %d samples." % len(X_train))
print("The validation set has %d samples." % len(X_val))

NameError: name 'X' is not defined

In [28]:
from keras.models import Model
from keras.regularizers import l2
from keras.layers.merge import concatenate
from keras.layers import Input, Embedding, Dropout, LSTM, Dense

regularizer = l2(1e-4)
dropout = 0.2

inputs = Input(shape=(sequence_max_len,))
embedding = Embedding(vocabulary_size, embedding_dim, input_length=sequence_max_len)(inputs)
embedding = Dropout(dropout)(embedding)

LSTM1 = LSTM(embedding_dim,
               return_sequences=True,
               dropout=dropout, recurrent_dropout=dropout,
               kernel_regularizer=regularizer, recurrent_regularizer=regularizer, bias_regularizer=regularizer)(embedding)
LSTM2 = LSTM(embedding_dim,
               return_sequences=True,
               dropout=dropout, recurrent_dropout=dropout,
               kernel_regularizer=regularizer, recurrent_regularizer=regularizer, bias_regularizer=regularizer)(LSTM1)

concat = concatenate([embedding, LSTM1, LSTM2])

LSTM3 = LSTM(embedding_dim,
               return_sequences=False,
               dropout=dropout, recurrent_dropout=dropout,
               kernel_regularizer=regularizer, recurrent_regularizer=regularizer, bias_regularizer=regularizer)(concat)

dense1 = Dense(embedding_dim, activation='relu', kernel_regularizer=regularizer, bias_regularizer=regularizer)(LSTM3)
dense1 = Dropout(dropout)(dense1)

outputs = Dense(vocabulary_size, activation='softmax')(dense1)

model = Model(inputs=[inputs], outputs=[outputs])

model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 3)            0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 3, 100)       1002100     input_10[0][0]                   
__________________________________________________________________________________________________
dropout_20 (Dropout)            (None, 3, 100)       0           embedding_11[0][0]               
__________________________________________________________________________________________________
lstm_22 (LSTM)                  (None, 3, 100)       80400       dropout_20[0][0]                 
__________________________________________________________________________________________________
lstm_23 (L

In [None]:
from keras.optimizers import RMSprop

model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=1E-5), metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

epochs = 20
batch_size = 128

# Stop training when a monitored quantity has stopped improving after 20 epochs
early_stop = EarlyStopping(patience=20, verbose=1)

# Reduce learning rate when a metric has stopped improving
reduce_lr = ReduceLROnPlateau(factor=0.3, patience=3, cooldown=3, verbose=1)

# Save the best model after every epoch
check_point = ModelCheckpoint(filepath='../saved_models/model_weights.hdf5', 
                               verbose=1, save_best_only=True)
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1,
                             validation_data=(X_val, y_val), 
                             callbacks=[check_point, early_stop, reduce_lr])

# Summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc='upper left')
plt.savefig('../plots/model_accuracy.pdf', bbox_inches='tight')
plt.show()

# Summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation'], loc='upper left')
plt.savefig('../plots/model_loss.pdf', bbox_inches='tight')
plt.show()