In [1]:
import sys
import io
import re
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from keras.models import load_model, save_model, Model
from keras.optimizers import RMSprop, Adam
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding

# Directory prefix used below to build the GloVe file, input text and saved-model paths.
DATA_PATH = "./data/"

# When True, the training-curve figure is displayed with plt.show() after fitting.
DISPLAY_GRAPH = True

Using TensorFlow backend.


In [2]:
def text_to_list(text):
    """Tokenize `text` into a flat list of words and punctuation marks.

    Every run of word characters becomes one token. Each period, comma,
    apostrophe, colon, exclamation mark, question mark, semicolon,
    parenthesis, double quote or hyphen becomes a token of its own.
    Anything else (whitespace included) is discarded.
    """
    token_pattern = r"[\w]+|[.,':!?;()\"-]"
    tokens = re.findall(token_pattern, text)
    return tokens

In [3]:
def read_glove_vecs(glove_file, additional_dimensions=0, max_vocabulary_size=None, keep_only_filter=None):
    """
    Load a GloVe word-representation file.

    Each non-empty line of the file is expected to hold a word followed by its
    vector components, separated by whitespace.

    Arguments:
    glove_file -- str, path of the GloVe text file to read (opened as utf8)
    additional_dimensions -- (optional) int, nb of extra dimensions appended to every loaded vector (filled with zeros)
    max_vocabulary_size -- (optional) int, max nb of lines kept. Use it to reduce dimensionality
    keep_only_filter -- (optional) iterable of words; when non-empty, words that are not in it are ignored

    Returns:
    words_to_index -- dict mapping each kept word to its index (1-based, words sorted alphabetically)
    index_to_words -- dict, the inverse mapping (index -> word)
    word_to_vec_map -- dict mapping each kept word to its np.float64 vector
    """
    # The filter is tested once per line of the file: a set makes that test O(1)
    # even when the caller passes a list.
    allowed_words = set(keep_only_filter) if keep_only_filter else None
    # String zeros so that they can be concatenated with the raw text fields.
    padding = ['0.'] * additional_dimensions

    word_to_vec_map = {}
    nb_kept = 0
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            if max_vocabulary_size and nb_kept >= max_vocabulary_size:
                # we have enough words, we stop adding them
                break
            fields = line.strip().split()
            if not fields:
                # blank line (e.g. trailing newline at the end of the file)
                continue
            curr_word = fields[0]
            if allowed_words is not None and curr_word not in allowed_words:
                # word does not appear in the filter, we don't keep it
                continue
            word_to_vec_map[curr_word] = np.array(fields[1:] + padding, dtype=np.float64)
            nb_kept += 1

    # Indices start at 1 and follow the alphabetical order of the kept words.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [4]:
def prepare_text_and_glove(text_file, glove_file, replace_by_token=True, max_vocabulary_size=None, display_analysis=True):
    """ Load text and glove matrix from files. Split input text as a list, considering most punctuation as words.
      Replace unknown words and nouns by corresponding tokens.
      Add those tokens in vocabulary and within glove representation by adding 2 dimensions.

    Arguments:
    text_file -- str, path of the utf-8 text file to read
    glove_file -- str, path of the GloVe file, forwarded to read_glove_vecs
    replace_by_token -- (optional) bool, when True words missing from the vocabulary are replaced in the
                        returned list by "<NOUN>" or "<UKN>"
    max_vocabulary_size -- (optional) int, forwarded to read_glove_vecs
    display_analysis -- (optional) bool, when True prints the number of words and the share of known words

    Returns:
    textl -- list of lower-cased tokens of the text (with tokens substituted if replace_by_token)
    word_to_index, index_to_word -- vocabulary mappings, extended with the two special tokens
    word_to_vec_map -- word -> vector dict, extended with the two special tokens
    """

    # Read the whole file and release the handle right away.
    with io.open(text_file, encoding='utf-8') as text_handle:
        texT = text_handle.read()
    text = texT.lower()

    textl = text_to_list(text)  # without capital letters
    texTl = text_to_list(texT)  # with capital letters (we assume: unknown word has a capital letter then it's a noun)

    textl_set = set(textl)   # unique words of the text, the glove matrix is filtered on them
    # load initial glove representation, we add 2 dimensions for the noun and unknown tokens
    word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(glove_file, additional_dimensions=2,
                                                                    max_vocabulary_size=max_vocabulary_size,
                                                                    keep_only_filter=textl_set)
    representation_dimensionality = word_to_vec_map[index_to_word[1]].shape[0]

    nb_of_words = len(textl)
    vocab_size = len(word_to_index)
    # Sets: these are only used for membership tests, once per out-of-vocabulary occurrence.
    nouns, unknown_words = set(), set()

    # add <NOUN> and <UKN> tokens to vocabulary and glove representation
    nouns_token, unknown_token = "<NOUN>", "<UKN>"
    index_to_word[vocab_size + 1] = nouns_token
    index_to_word[vocab_size + 2] = unknown_token
    word_to_index[nouns_token] = vocab_size + 1
    word_to_index[unknown_token] = vocab_size + 2
    # Each special token gets a one-hot vector living in one of the 2 added dimensions.
    noun_vect = [0.] * representation_dimensionality
    noun_vect[-2] = 1
    ukn_vect = [0.] * representation_dimensionality
    ukn_vect[-1] = 1
    word_to_vec_map[nouns_token] = np.array(noun_vect, dtype=np.float64)
    word_to_vec_map[unknown_token] = np.array(ukn_vect, dtype=np.float64)

    working_words = 0
    for i, w in enumerate(textl):
        if w in word_to_index:
            working_words += 1
            continue

        # A word is classified once, on its first out-of-vocabulary occurrence.
        if w not in unknown_words and w not in nouns:
            if textl[i] == texTl[i]:
                # identical with and without lower-casing: no capital letter -> unknown word
                unknown_words.add(w)
            else:
                # lower-casing changed the token: it has a capital letter -> treated as a noun
                nouns.add(w)

        if replace_by_token:
            textl[i] = unknown_token if w in unknown_words else nouns_token

    if display_analysis:
        print("nb of words:", nb_of_words)
        # Guard against an empty text, which would otherwise divide by zero.
        print("% of known words in text", working_words / nb_of_words if nb_of_words else 0.0)

    return textl, word_to_index, index_to_word, word_to_vec_map

In [5]:
def text_to_indices_simple(txt, word_to_index):
    """Tokenize `txt` and map every token to its vocabulary index.

    Tokens are looked up lower-cased. A token missing from `word_to_index`
    is mapped to the '<NOUN>' index when lower-casing changed it (it had a
    capital letter), otherwise to the '<UKN>' index.

    Returns an int32 array of shape (1, number of tokens).
    """
    lowered_tokens = text_to_list(txt.lower())
    cased_tokens = text_to_list(txt)
    indices = np.zeros((1, len(lowered_tokens)), dtype=np.int32)
    for pos, token in enumerate(lowered_tokens):
        try:
            token_index = word_to_index[token]
        except KeyError:
            # Same spelling in both versions means no capital letter: plain unknown word.
            fallback_token = '<UKN>' if token == cased_tokens[pos] else '<NOUN>'
            token_index = word_to_index[fallback_token]
        indices[0, pos] = token_index
    return indices

In [6]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index, index_to_word):
    """
    Build a frozen Keras Embedding layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation
    word_to_index -- dictionary mapping words to their index in the vocabulary
    index_to_word -- dictionary mapping indices to words; only the entry for index 1 is read,
                     to find out the dimensionality of the vectors

    Returns:
    embedding_layer -- Keras Embedding instance (trainable=False) whose weights are the word vectors
    """
    # One more row than there are words (Keras embedding requirement).
    n_rows = len(word_to_index) + 1
    # Every vector is taken to have the same length as the one of the word at index 1.
    reference_word = index_to_word[1]
    vector_size = word_to_vec_map[reference_word].shape[0]

    # Row `index` of the matrix holds the vector of the word carrying that index;
    # rows that no word points to stay at zero.
    weights = np.zeros((n_rows, vector_size))
    for token, row in word_to_index.items():
        weights[row, :] = word_to_vec_map[token]

    # The layer has to be built before its weights can be assigned.
    layer = Embedding(n_rows, vector_size, trainable=False)
    layer.build((None,))
    layer.set_weights([weights])

    return layer

In [7]:
def create_simple_lstm_model(input_shape, word_to_vec_map, word_to_index, index_to_word):
    """
    Build the next-word prediction model: frozen pre-trained embedding, two stacked
    LSTM layers each followed by dropout, then a softmax over the vocabulary.

    Arguments:
    input_shape -- shape of the input, e.g. (nb_of_context_words,)
    word_to_vec_map -- dictionary mapping every word of the vocabulary to its vector representation
    word_to_index -- dictionary mapping words to their index in the vocabulary
    index_to_word -- dictionary mapping indices to words (forwarded to pretrained_embedding_layer)

    Returns:
    model -- a (non compiled) Keras Model instance
    """
    lstm_units = 64
    dropout_rate = 0.6

    # Input: a batch of sequences of word indices.
    word_indices = Input(input_shape, dtype='int32')

    # Look the indices up in the pre-trained embedding.
    embedding = pretrained_embedding_layer(word_to_vec_map, word_to_index, index_to_word)
    embedded = embedding(word_indices)

    # First LSTM returns the full sequence so that it can feed the second one.
    hidden = LSTM(lstm_units, return_sequences=True)(embedded)
    hidden = Dropout(dropout_rate)(hidden)

    # Second LSTM only returns its last hidden state.
    hidden = LSTM(lstm_units, return_sequences=False)(hidden)
    hidden = Dropout(dropout_rate)(hidden)

    # One output unit per vocabulary entry, turned into probabilities by the softmax.
    scores = Dense(len(word_to_index))(hidden)
    probabilities = Activation(activation='softmax')(scores)

    return Model(inputs=word_indices, outputs=probabilities)

In [8]:
def txtl_to_X_Y(txtl, word_to_index, nb_of_context_words):
    """
    Converts a list of words into training pairs for next-word prediction:
    windows of word indices (inputs) and the one-hot encoded word following each window (targets).

    Arguments:
    txtl -- list of words (strings); every word must be a key of word_to_index
    word_to_index -- a dictionary containing each word mapped to its index
    nb_of_context_words -- number of consecutive words making up one input window

    Returns:
    X_indices -- int32 array of shape (m, nb_of_context_words), the indices of the words of each window
    Y_hot -- boolean array of shape (m, len(word_to_index)); row i is True at column
             (index of the word following window i) - 1

    with m = max(len(txtl) - nb_of_context_words - 1, 0).

    Raises:
    KeyError -- if a word of txtl is missing from word_to_index

    NOTE(review): len(txtl) - nb_of_context_words windows have a following word, so the last
    usable window is not generated. m is kept as it was so that output shapes do not change.
    """
    # A text shorter than the context yields no sample instead of a negative array dimension.
    m = max(len(txtl) - nb_of_context_words - 1, 0)
    vocab_size = len(word_to_index)
    X_indices = np.zeros((m, nb_of_context_words), dtype=np.int32)
    # Builtin bool: the np.bool alias was removed from recent NumPy releases.
    Y_hot = np.zeros((m, vocab_size), dtype=bool)

    txtl_indices = [word_to_index[w] for w in txtl]

    for i in range(m):
        # the word right after the window is the expected answer
        y_index = txtl_indices[i + nb_of_context_words]
        Y_hot[i, y_index - 1] = True  # y is indexed from 0, words from 1
        X_indices[i] = txtl_indices[i: i + nb_of_context_words]

    return X_indices, Y_hot

In [9]:
def split_inputs(x, y, split_ratio, random=True, seed=None, return_indices=False):
    """ Split x and y into two consecutive parts, optionally after shuffling them together.

    Arguments:
    x, y -- inputs and targets with the same number of rows; must be ndarray, DataFrame or Series
            when random is True
    split_ratio -- float, share of the rows going to the first part
    random -- (optional) bool, shuffle the rows (same permutation for x and y) before splitting
    seed -- (optional) int, seeds numpy's global random generator when not None (0 is a valid seed)
    return_indices -- (optional) bool, also return the original row positions of both parts

    Returns:
    reduced_x, cross_val_x, reduced_y, cross_val_y
    reduced_x has size int(split_ratio * x.shape[0]), cross_val_x holds the remaining rows
    involved_indices (only if return_indices) is a list of 2 arrays: original row positions
    of the first and of the second part

    Raises:
    TypeError -- if random is True and x or y is not an ndarray, DataFrame or Series
    """

    def _reorder(data, order):
        # Positional reordering, whatever the container type.
        if isinstance(data, np.ndarray):
            return data[order]
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.iloc[order]
        raise TypeError("input type should be ndarray or DataFrame or Series")

    # `is not None` so that seed=0 is honoured too.
    if seed is not None:
        np.random.seed(seed)
    assert x.shape[0] == y.shape[0], "x and y must have the same number of rows"

    nb_rows = x.shape[0]
    if random:
        # shuffle inputs; x and y are each reordered according to their own type
        p = np.random.permutation(nb_rows)
        x, y = _reorder(x, p), _reorder(y, p)
    else:
        p = np.arange(nb_rows)

    # split our inputs into 2 sets
    n = int(nb_rows * split_ratio)
    reduced_x, cross_val_x = x[:n], x[n:]
    reduced_y, cross_val_y = y[:n], y[n:]

    if return_indices:
        return reduced_x, cross_val_x, reduced_y, cross_val_y, [p[:n], p[n:]]
    return reduced_x, cross_val_x, reduced_y, cross_val_y

In [10]:
def random_prediction(preds, softmax_needed=False, temperature=1.0):
    """Sample an index according to the (non-negative) weights in `preds`.

    Arguments:
    preds -- 1-D array-like of non-negative weights, typically the probabilities output by a model
    softmax_needed -- (optional) bool, when True the weights are first re-scaled as
                      exp(log(preds) / temperature)
    temperature -- (optional) float, only used when softmax_needed is True

    Returns:
    out_index -- int, the sampled index (0-based)

    The weights are always converted to float64 and renormalised before sampling: float32
    model outputs can otherwise sum to slightly more than 1, which np.random.multinomial rejects.
    """
    preds = np.asarray(preds, dtype=np.float64)
    if softmax_needed:
        # log(0) is -inf, which exp() maps back to a zero weight: the warning is harmless.
        with np.errstate(divide='ignore'):
            preds = np.exp(np.log(preds) / temperature)
    preds = preds / np.sum(preds)

    # A single multinomial draw is a one-hot vector; its argmax is the sampled index.
    one_hot = np.random.multinomial(1, preds)
    return int(np.argmax(one_hot))

In [11]:
# --- Data preparation -------------------------------------------------------
np.random.seed(42)
glove_file = DATA_PATH + 'glove.6B.50d.txt'
txt_file = DATA_PATH + 'harry_potter_7.txt'

print("Loading text data and glove matrix...")

nb_of_context_words = 12

textl, word_to_index, index_to_word, word_to_vec_map = prepare_text_and_glove(txt_file, glove_file)

nb_words_loaded = 50000
textl = textl[:nb_words_loaded]  # memory issues, i have to cut the text :(

X_indices, Y_hot = txtl_to_X_Y(textl, word_to_index, nb_of_context_words)
x_train, x_test, y_train, y_test = split_inputs(X_indices, Y_hot, split_ratio=0.95)
print("X_train shape:", x_train.shape)
print("X_val shape:", x_test.shape)
print("Y_train shape:", y_train.shape)
print("Y_val shape:", y_test.shape)


# --- Model: reuse a saved one if possible, otherwise train -------------------
epochs = 2
model_file = DATA_PATH + "simple_lstm_model_" + str(epochs) + "_epochs_" + str(nb_words_loaded) + "_words.h5"
try:
    model = load_model(model_file)
    is_loaded = True
except Exception as load_error:
    # Best effort: any loading problem (typically a missing file) falls back to training,
    # but the reason is reported and KeyboardInterrupt / SystemExit are no longer swallowed.
    print("Could not load", model_file, "- training a new model:", load_error)
    is_loaded = False

    # A decreasing learning rate helps converging: ReduceLROnPlateau multiplies the LR by 0.6
    # (never going below 0.0001) when val_loss has not improved for 5 epochs.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=1, factor=0.6,
                                                min_lr=0.0001)

    # Define the optimizer
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0)

    model = create_simple_lstm_model((nb_of_context_words,), word_to_vec_map, word_to_index, index_to_word)
    # Use the optimizer configured above (it was previously built but never passed to compile).
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    history = model.fit(x=x_train, y=y_train, epochs=epochs, batch_size=1024, validation_data=(x_test, y_test),
                        verbose=2, callbacks=[learning_rate_reduction])

    # Plot the loss and accuracy curves for training and validation.
    # The first 5 epochs are hidden from the loss curves only when there are more than 5 of them,
    # otherwise the plot would be empty.
    skipped_epochs = 5 if epochs > 5 else 0
    # The accuracy key is 'acc' or 'accuracy' depending on the Keras version.
    acc_key = 'acc' if 'acc' in history.history else 'accuracy'

    fig, ax = plt.subplots(2, 1)
    ax[0].plot(history.history['loss'][skipped_epochs:], color='b', label="Training loss")
    ax[0].plot(history.history['val_loss'][skipped_epochs:], color='r', label="validation loss")
    ax[0].legend(loc='best', shadow=True)

    ax[1].plot(history.history[acc_key], color='b', label="Training accuracy")
    ax[1].plot(history.history['val_' + acc_key], color='r', label="Validation accuracy")
    ax[1].legend(loc='best', shadow=True)
    if DISPLAY_GRAPH: plt.show()


# --- Evaluation ---------------------------------------------------------------
model.summary()
loss, acc = model.evaluate(x_test, y_test)
print()
print("Test accuracy = ", acc)

# save model
if not is_loaded:
    save_model(model, model_file)

# --- Text generation ------------------------------------------------------------
np.random.seed(23)
nb_words_to_create = 5
first_input_text = "It's the story of a ghost, which had a"
x = text_to_indices_simple(first_input_text, word_to_index)
if x.shape[1] != nb_of_context_words:
    # The model input has a fixed length: fail early with an explicit message.
    raise ValueError("first_input_text must contain exactly %d tokens, got %d"
                     % (nb_of_context_words, x.shape[1]))
print(x.shape, x)
sys.stdout.write(first_input_text)
for _ in range(nb_words_to_create):

    preds = model.predict(x, verbose=0)[0]
    out_index = random_prediction(preds, softmax_needed=True)  # care, indexed from 0
    out_word = index_to_word[out_index + 1]  # care, indexed from 1
    # slide the context window one word to the left and append the generated word
    x[0, 0:nb_of_context_words-1] = x[0, 1:nb_of_context_words]
    x[0, nb_of_context_words-1] = out_index + 1  # care, indexed from 1
    sys.stdout.write(' ' + out_word)

Loading text data and glove matrix...
nb of words: 257072
% of known words in text 0.9919555610879442
X_train shape: (47487, 12)
X_val shape: (2500, 12)
Y_train shape: (47487, 10592)
Y_val shape: (2500, 10592)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 12)                0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 12, 52)            550836    
_________________________________________________________________
lstm_13 (LSTM)               (None, 12, 64)            29952     
_________________________________________________________________
dropout_13 (Dropout)         (None, 12, 64)            0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 64)                33024     
________________________________________________________________