In [1]:
import numpy as np
import pandas as pd
from preprocess import preprocess  # local file. restart kernel if this changed, it won't be re-imported otherwise
from sklearn.model_selection import train_test_split

## Read the first few rows during crude developing:
train = pd.read_csv('data/train.csv', nrows=1000).fillna(' ')
test = pd.read_csv('data/test.csv', nrows=1000).fillna(' ')

## These lines load all data:
#train = pd.read_csv('data/train.csv').fillna(' ')
#test = pd.read_csv('data/test.csv').fillna(' ')

[train, test, train_text, test_text, all_text, class_names] = preprocess(train, test)

# TODO why does a train dev split break the code?
# train, dev = train_test_split(train, test_size=0.1, random_state=42)

In [2]:
max_len_train = train['comment_text'].str.split().apply(len).max()
max_len_test = test['comment_text'].str.split().apply(len).max()

max_len = max(max_len_train, max_len_test)

# max_len = 200  # that might work to have shorter LSTM cells

In [3]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [5]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [6]:
import re

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = re.sub(r"\W", " ", X[i]).lower().split()

        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            
            if w in word_to_index:
                # Set the (i,j)th entry of X_indices to the index of the correct word.
                X_indices[i, j] = word_to_index[w]
                
            # Increment j to j + 1
            j += 1
            
            if j >= max_len:  
                # adding this clause, you can have sentences longer than max_len and crop them
                break
            
    ### END CODE HERE ###
    
    return X_indices

In [7]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [8]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

In [23]:
def Emojify_V2(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    #embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    X = embedding_layer(sentence_indices)   
    # Or use a trainable embedding layer:
    #embed_size = 128
    #max_features = input_shape[0]
    #X = Embedding(len(word_to_index), embed_size)(sentence_indices)
    
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    #X = LSTM(units = 64, return_sequences = True)(X)
    # Add dropout with a probability of 0.5
    #X = Dropout(rate = 0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    #X = LSTM(units = 64, activation = 'relu',
    #         kernel_initializer = 'glorot_uniform', recurrent_initializer = 'glorot_uniform')(X)
    X = LSTM(units = 64)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(rate = 0.5)(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
    X = Dense(units = 6, activation = 'sigmoid')(X)
    # Add a softmax activation
    #X = Activation("softmax")(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs = sentence_indices, outputs = X)
    
    return model

In [16]:
model = Emojify_V2((max_len,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1052)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 1052, 128)         51200000  
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 390       
Total params: 51,249,798
Trainable params: 51,249,798
Non-trainable params: 0
_________________________________________________________________


In [17]:
Y_train = train[class_names].values

In [18]:
X_train_indices = sentences_to_indices(train['comment_text'], word_to_index, max_len)

In [19]:
# model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
model.fit(X_train_indices, Y_train, epochs = 2, batch_size = 32, shuffle=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fcb0d256748>

In [21]:
pred = model.predict(X_train_indices)

In [22]:
pred

array([[0.08517398, 0.01614716, 0.03653552, 0.01195431, 0.03969808,
        0.01115324],
       [0.08517398, 0.01614716, 0.03653552, 0.01195432, 0.03969808,
        0.01115324],
       [0.08517399, 0.01614716, 0.03653552, 0.01195432, 0.03969808,
        0.01115324],
       ...,
       [0.08517399, 0.01614716, 0.03653552, 0.01195432, 0.03969808,
        0.01115324],
       [0.08517399, 0.01614716, 0.03653552, 0.01195432, 0.03969808,
        0.01115324],
       [0.08517399, 0.01614716, 0.03653552, 0.01195431, 0.03969808,
        0.01115324]], dtype=float32)