In [64]:
# Import necessary dependencies
import keras
from keras import backend as K
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import pickle
import re
import numpy as np

In [2]:
# Print the installed Keras version so the environment this notebook ran in is on record.
print(keras.__version__)

2.2.4


In [65]:
# List the GPU devices visible to the TensorFlow backend.
# NOTE(review): `_get_available_gpus` is a private helper (leading underscore), so it
# may not exist in other Keras versions — confirm before upgrading.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [3]:
# Read the pickled training texts and their labels from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that come from a trusted source.
with open("pickle_data/text_train.pickle", "rb") as texts_file:
    text_train = pickle.load(texts_file)

with open("pickle_data/sent_train.pickle", "rb") as labels_file:
    sent_train = pickle.load(labels_file)

In [4]:
# Sanity check: the training texts and their labels should have the same number of entries.
for train_part in (text_train, sent_train):
    print(len(train_part))

25000
25000


In [5]:
# Read the pickled test texts and their labels from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that come from a trusted source.
with open("pickle_data/text_test.pickle", "rb") as texts_file:
    text_test = pickle.load(texts_file)

with open("pickle_data/sent_test.pickle", "rb") as labels_file:
    sent_test = pickle.load(labels_file)

In [6]:
# Sanity check: the test texts and their labels should have the same number of entries.
for test_part in (text_test, sent_test):
    print(len(test_part))

25000
25000


In [8]:
# Normalise every training text into lower-case, space-separated word tokens.
corpus_train = []
for raw_text in text_train:
    # Drop the doubled HTML line breaks, then blank out every non-word character.
    # NOTE(review): a lone "<br />" is not matched by the replace below, so its
    # "br" survives as a token after the punctuation is removed.
    cleaned = re.sub(r'\W', ' ', raw_text.replace("<br /><br />", " "))
    cleaned = cleaned.lower()
    # Applied in order: a single letter surrounded by whitespace, a single letter at
    # the very start of the text, and finally any run of whitespace each become one space.
    for pattern in (r'\s+[a-z]\s+', r'^[a-z]\s+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    corpus_train.append(cleaned)

In [9]:
# Display the first cleaned training text to eyeball the result of the preprocessing
corpus_train[0]

'zero day leads you to think even re think why two boys young men would do what they did commit mutual suicide via slaughtering their classmates it captures what must be beyond bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own mutual world via coupled destruction it is not perfect movie but given what money time the filmmaker and actors had it is remarkable product in terms of explaining the motives and actions of the two young suicide murderers it is better than elephant in terms of being film that gets under our rationalistic skin it is far far better film than almost anything you are likely to see flawed but honest with terrible honesty '

In [10]:
# Confirm that preprocessing produced one cleaned entry per training text
len(corpus_train)

25000

In [11]:
# Normalise every test text into lower-case, space-separated word tokens.
corpus_test = []
for raw_text in text_test:
    # Drop the doubled HTML line breaks, then blank out every non-word character.
    # NOTE(review): a lone "<br />" is not matched by the replace below, so its
    # "br" survives as a token after the punctuation is removed.
    cleaned = re.sub(r'\W', ' ', raw_text.replace("<br /><br />", " "))
    cleaned = cleaned.lower()
    # Applied in order: a single letter surrounded by whitespace, a single letter at
    # the very start of the text, and finally any run of whitespace each become one space.
    for pattern in (r'\s+[a-z]\s+', r'^[a-z]\s+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    corpus_test.append(cleaned)

In [12]:
# Confirm that preprocessing produced one cleaned entry per test text
len(corpus_test)

25000

In [92]:
# Average number of whitespace-separated tokens per cleaned training text.
total_tokens = sum(len(doc.split()) for doc in corpus_train)
avg_len = total_tokens / len(corpus_train)
avg_len

224.15232

In [93]:
# Number of word indices kept per text; used further down as the model input length
# and as the `max_len` argument of sentences_to_indices.
maxLen = 300

In [94]:
def read_glove_vecs(glove_file):
    """
    Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by the components of
    its vector. Blank or whitespace-only lines are skipped.

    Arguments:
    glove_file -- path to the GloVe text file (read as UTF-8)

    Returns:
    word_to_index -- dictionary mapping each word to its index; indices start at 1 and follow the sorted order of the words
    index_to_word -- dictionary mapping each index back to its word
    word_to_vec_map -- dictionary mapping each word to its vector (float64 numpy array)
    words -- set of all words found in the file
    """
    words = set()
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # A blank line (e.g. a trailing newline at the end of the file)
                # carries no word, so there is nothing to index.
                continue
            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    # Number the words in sorted order starting from 1, so index 0 is never
    # assigned to any word.
    word_to_index = {}
    index_to_word = {}
    for i, w in enumerate(sorted(words), start=1):
        word_to_index[w] = i
        index_to_word[i] = w

    return word_to_index, index_to_word, word_to_vec_map, words

In [95]:
# Load the GloVe vectors together with the word <-> index lookup tables built from them
word_to_index, index_to_word, word_to_vec_map, words = read_glove_vecs('glove_data/glove.6B.50d.txt')

In [96]:
# Run a quick check of both lookup directions (word -> index and index -> word).
# NOTE(review): the hard-coded index is only meaningful for the GloVe file loaded
# above — re-check it if a different vocabulary file is used.
word = "cucumber"
index = 289846
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])

the index of cucumber in the vocabulary is 113317
the 289846th word in the vocabulary is potatos


In [97]:
# Check the word vector dimension using the vector of the sample word chosen above
word_to_vec_map[word].shape

(50,)

In [106]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts a sequence of sentences (strings) into an array of word indices.
    The output shape is such that it can be given to `Embedding()`.

    Each sentence is lower-cased and split on whitespace. Words that are not keys
    of `word_to_index` are skipped (they do not take up a position). At most
    `max_len` indices are kept per sentence; shorter sentences are right-padded
    with 0.

    Arguments:
    X -- sequence of m sentences (strings)
    word_to_index -- a dictionary containing each word mapped to its index
    max_len -- number of index slots per sentence; longer sentences are truncated

    Returns:
    X_indices -- float array of shape (m, max_len) holding the word indices of each sentence
    """
    m = len(X)

    X_indices = np.zeros((m, max_len))

    # Loop over the sentences
    for i in range(m):
        # Look words up in the dictionary that was passed in rather than in a
        # notebook-level global, so the function only depends on its arguments.
        known_indices = [
            word_to_index[w]
            for w in X[i].lower().split()
            if w in word_to_index
        ][:max_len]
        if known_indices:
            X_indices[i, :len(known_indices)] = known_indices

    return X_indices

In [107]:
# Run a quick check on three short sentences; with max_len = 5 the shorter ones are
# padded with zeros on the right
X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
X1_indices = sentences_to_indices(X1, word_to_index, max_len = 5)
print("X1 =", X1)
print("X1_indices =", X1_indices)

X1 = ['funny lol' 'lets play baseball' 'food is ready for you']
X1_indices = [[155345. 225122.      0.      0.      0.]
 [220930. 286375.  69714.      0.      0.]
 [151204. 192973. 302254. 151349. 394475.]]


In [108]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in the pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation (all vectors are expected to have the same length)
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- non-trainable Keras Embedding layer whose weights are the pre-trained vectors

    Raises:
    ValueError -- if word_to_vec_map is empty, since the vector dimension cannot be determined
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map must contain at least one word vector")

    vocab_len = len(word_to_index) + 1                # 0 is reserved for padding
    # Read the dimension from an arbitrary vector rather than from a hard-coded
    # word, so vocabularies that lack that particular word still work.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Row `index` holds the vector of the word with that index; row 0 stays all zeros.
    emb_matrix = np.zeros((vocab_len, emb_dim))

    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # The layer has to be built before its weights can be replaced.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [109]:
def LSTM_model(input_shape, word_to_vec_map, word_to_index):
    """
    Build the classifier: pre-trained embedding -> two stacked LSTM layers
    (128 units each, each followed by dropout of 0.5) -> a single sigmoid output.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- a model instance in Keras
    """
    # Integer word indices go in; the embedding layer turns them into vectors.
    token_ids = Input(input_shape, dtype='int32')
    glove_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embedded = glove_layer(token_ids)

    # First LSTM returns its full output sequence so the second LSTM can consume it.
    seq_features = LSTM(128, return_sequences=True)(embedded)
    seq_features = Dropout(0.5)(seq_features)

    # Second LSTM returns only its final output: one vector per input.
    sentence_vector = LSTM(128, return_sequences=False)(seq_features)
    sentence_vector = Dropout(0.5)(sentence_vector)

    # One linear unit followed by a sigmoid activation.
    logit = Dense(1, activation=None)(sentence_vector)
    probability = Activation('sigmoid')(logit)

    return Model(inputs=token_ids, outputs=probability)

In [110]:
# Build the model for inputs of maxLen word indices and print its layer summary
model = LSTM_model((maxLen, ), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 300)               0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 300, 50)           20000050  
_________________________________________________________________
lstm_13 (LSTM)               (None, 300, 128)          91648     
_________________________________________________________________
dropout_13 (Dropout)         (None, 300, 128)          0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_14 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 129       
__________

In [111]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [112]:
# Convert the cleaned training texts into fixed-length rows of word indices
X_train_indices = sentences_to_indices(corpus_train, word_to_index, maxLen)

In [114]:
# Train for 50 epochs in shuffled mini-batches of 32.
# NOTE(review): no validation data is passed, so overfitting cannot be observed
# during training — consider adding validation_split or validation_data.
model.fit(X_train_indices, sent_train, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x4ec205c0>

In [None]:
# Check the accuracy on test set: index the cleaned test texts the same way as the
# training texts, then evaluate. `loss` and `acc` are only assigned here, not displayed.
X_test_indices = sentences_to_indices(corpus_test, word_to_index, maxLen)
loss, acc = model.evaluate(X_test_indices, sent_test)