In [1]:
import numpy as np
import pandas as pd
from preprocess import preprocess  # local file. restart kernel if this changed, it won't be re-imported otherwise
from sklearn.model_selection import train_test_split

## Fast path for crude development: read only the first few rows.
#train = pd.read_csv('../data/train.csv', nrows=10000).fillna(' ')  # train has 159571 comments
#test = pd.read_csv('../data/test.csv', nrows=1000).fillna(' ')

## Full run: load all data. Missing values are replaced with a single space.
train = pd.read_csv('../data/train.csv').fillna(' ')
test = pd.read_csv('../data/test.csv').fillna(' ')

# preprocess() is the project-local helper imported above; its transformations are not
# visible in this notebook. Its six return values are unpacked in this order.
[train, test, train_text, test_text, all_text, class_names] = preprocess(train, test)

# Optional (disabled): randomly drop about 80% of the training comments that carry no label
# at all, to reduce the class imbalance a bit.
#delete_idx = (train[class_names].sum(axis = 1) == 0) & (np.random.rand(len(train)) > 0.2)
#keep_idx = ~delete_idx  # elem-wise NOT for pd.Series
#train = train[keep_idx]

In [2]:
# Hold out a fixed-size dev set of 5000 comments. An integer test_size asks
# train_test_split for an absolute number of rows, so no fraction has to be derived
# from len(train) (a float fraction is subject to rounding inside the splitter).
dev_size = 5000

# NOTE: this cell rebinds `train`. Re-running it without re-running the load cell
# splits another `dev_size` rows off the already reduced training frame.
train, dev = train_test_split(train, test_size=dev_size, random_state=42)

# The split shuffles the rows; renumber both frames 0..len-1 so that label-based and
# positional access agree in the cells below.
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)

In [3]:
# either:
#max_len_train = train['comment_text'].str.split().apply(len).max()
#max_len_test = test['comment_text'].str.split().apply(len).max()
#max_len = max(max_len_train, max_len_test)

# or:
max_len = 200  # that might work to have shorter LSTM cells

In [4]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, GRU, Activation, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
def read_glove_vecs(glove_file):
    """
    Load GloVe vectors from a whitespace-separated text file.

    Every non-empty line is expected to hold a word followed by its vector components.

    Arguments:
    glove_file -- path to the GloVe text file (read as UTF-8)

    Returns:
    words_to_index -- dict mapping each word to an integer index; indices are assigned in
                      sorted word order and start at 1 (0 is left free, e.g. for padding)
    index_to_words -- inverse mapping of words_to_index
    word_to_vec_map -- dict mapping each word to its vector as a float64 numpy array
    """
    word_to_vec_map = {}

    # GloVe files are UTF-8; stating the encoding avoids decode errors on platforms whose
    # default text encoding is something else.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline) instead of failing on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w

    return words_to_index, index_to_words, word_to_vec_map

In [6]:
# Load the GloVe vectors (glove.6B.50d) and the word <-> index lookups built by
# read_glove_vecs(); word indices start at 1.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../data/glove.6B.50d.txt')

In [7]:
import re

def sentences_to_indices(X, word_to_index, max_len):
    """
    Convert sentences into a left-padded matrix of vocabulary indices, suitable as input
    for an `Embedding()` layer.

    Each sentence is stripped of non-word characters, lower-cased and split on whitespace.
    Sentences with more than max_len words are truncated to their first max_len words.
    Shorter sentences are padded with 0 on the left, so the last word of the sentence always
    sits in the last column. Words missing from word_to_index are also encoded as 0.

    Arguments:
    X -- sequence of sentences (strings): list, numpy array or pandas Series, of length m.
         Elements are read by position, so a Series with an arbitrary index works.
    word_to_index -- dictionary mapping each known word to its (non-zero) index
    max_len -- number of word positions per sentence in the output

    Returns:
    X_indices -- float numpy array of shape (m, max_len) holding the word indices
    """
    # Positional view of the input: avoids label-based lookups such as Series[i], which
    # fail or misalign when the index is not exactly 0..m-1.
    sentences = np.asarray(X).ravel()
    m = sentences.shape[0]

    X_indices = np.zeros((m, max_len))

    for i, sentence in enumerate(sentences):
        # Replace every non-word character by a space, lower-case, split into words and
        # keep only the *beginning* of the comment if it is too long.
        sentence_words = re.sub(r"\W", " ", sentence).lower().split()[:max_len]

        # Left padding: the first word lands at `offset`, the last one at max_len - 1.
        offset = max_len - len(sentence_words)
        for k, w in enumerate(sentence_words):
            if w in word_to_index:
                X_indices[i, offset + k] = word_to_index[w]

    return X_indices

In [8]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Create a frozen Keras Embedding() layer initialised with pre-trained GloVe vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation
    word_to_index -- dictionary mapping words to their row index in the embedding matrix

    Returns:
    embedding_layer -- Keras Embedding instance with trainable=False whose weights are the
                       GloVe vectors; row 0 is all zeros (index 0 is the padding / unknown
                       word index produced by sentences_to_indices)

    Raises:
    ValueError -- if word_to_vec_map is empty, because the vector size cannot be determined
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot determine the embedding dimension")

    # One extra row because word indices start at 1 and index 0 is reserved.
    vocab_len = len(word_to_index) + 1
    # Read the vector size from any entry rather than from one specific word, so the
    # function also works for vocabularies that lack that word.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Row `index` of the matrix holds the vector of the word with that index.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # The pre-trained vectors are kept fixed during training (trainable=False).
    # mask_zero is not enabled, so padded positions are fed on as all-zero vectors.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)

    # The layer has to be built before its weights can be set.
    embedding_layer.build((None,))

    # Load the GloVe matrix into the layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [9]:
def myModel(input_shape, word_to_vec_map, word_to_index, n_classes=6):
    """
    Build the multi-label comment classifier graph:
    frozen GloVe embedding -> bidirectional GRU -> GRU -> sigmoid output layer.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its GloVe vector
    word_to_index -- dictionary mapping words to their indices in the vocabulary
    n_classes -- number of independent output labels (default 6, the value used so far)

    Returns:
    model -- an (uncompiled) Keras Model instance
    """
    # Input: one row of word indices per comment, hence an integer dtype.
    sentence_indices = Input(shape = input_shape, dtype = 'int32')

    # Look the indices up in the embedding layer pre-loaded with GloVe vectors.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    X = embedding_layer(sentence_indices)

    # First recurrent layer: bidirectional GRU with 32 units per direction. It returns the
    # whole sequence so that the next recurrent layer can consume it.
    X = Bidirectional(GRU(units = 32, activation = 'relu', return_sequences = True))(X)

    # Second recurrent layer: GRU with 64 units returning only its final state.
    X = GRU(units = 64, activation = 'relu')(X)

    # One sigmoid unit per label: the labels are not mutually exclusive, so no softmax.
    X = Dense(units = n_classes, activation = 'sigmoid')(X)

    # Model mapping sentence_indices to the per-label probabilities.
    model = Model(inputs = sentence_indices, outputs = X)

    return model

In [10]:
# Build the network for inputs of max_len word indices and print its layer summary.
model = myModel((max_len,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 50)           20000050  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 64)           15936     
_________________________________________________________________
gru_2 (GRU)                  (None, 64)                24768     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 20,041,144
Trainable params: 41,094
Non-trainable params: 20,000,050
_________________________________________________________________


In [11]:
# Encode the comment texts as (n_samples, max_len) index matrices and extract the label
# columns named in class_names as plain arrays, for both the train and the dev split.
X_train_indices = sentences_to_indices(train['comment_text'], word_to_index, max_len)
Y_train = train[class_names].values
X_dev_indices = sentences_to_indices(dev['comment_text'], word_to_index, max_len)
Y_dev = dev[class_names].values

In [12]:
from keras import optimizers

# Adam optimizer configured with gradient clipping (clipnorm=1, clipvalue=0.5).
# NOTE(review): this object is not referenced anywhere else in the visible notebook; the
# compile cell passes the string 'adam' instead. Confirm whether clipping was meant to be used.
adam_gradclip = optimizers.Adam(clipnorm = 1, clipvalue = 0.5)

In [13]:
# model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Binary cross-entropy is used because the model ends in independent sigmoid outputs
# (one per label) rather than a softmax.
# NOTE(review): the optimizer is given as the string 'adam', not the adam_gradclip object
# created in the previous cell. Confirm that training without clipping is intended.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# if you want to save a model's configuration and weights, and train it further tomorrow:

#from keras.models import load_model
#model = load_model("08-model-bidir.h5")

## Loss history:

#### Epoch 1

Epoch 1/1
154571/154571 [==============================] - 936s 6ms/step - loss: 0.0617 - acc: 0.9786
CPU times: user 1h 7min 17s, sys: 15min 40s, total: 1h 22min 57s
Wall time: 15min 37s

#### Epoch 2

Epoch 1/1
154571/154571 [==============================] - 936s 6ms/step - loss: 0.0481 - acc: 0.9822
CPU times: user 1h 6min 41s, sys: 16min 12s, total: 1h 22min 53s
Wall time: 15min 35s

#### Epoch 3

Epoch 1/1
154571/154571 [==============================] - 922s 6ms/step - loss: 0.0449 - acc: 0.9831
CPU times: user 1h 6min 11s, sys: 15min 45s, total: 1h 21min 56s
Wall time: 15min 22s

#### Epoch 4

Epoch 1/1
154571/154571 [==============================] - 923s 6ms/step - loss: 0.0428 - acc: 0.9837
CPU times: user 1h 6min 20s, sys: 15min 39s, total: 1h 21min 59s
Wall time: 15min 23s

#### Epoch 5

Epoch 1/1
154571/154571 [==============================] - 922s 6ms/step - loss: 0.0413 - acc: 0.9842
CPU times: user 1h 6min 26s, sys: 15min 32s, total: 1h 21min 58s
Wall time: 15min 22s

#### Epoch 6

Epoch 1/1
154571/154571 [==============================] - 3854s 25ms/step - loss: 0.0399 - acc: 0.9847
CPU times: user 1h 9min 1s, sys: 3h 23min 54s, total: 4h 32min 55s
Wall time: 1h 4min 13s

In [None]:
%%time 

# Train for a single epoch per execution of this cell, then write the model to disk so
# that a later session can continue from it (see the commented load_model lines above).
# Re-run the cell once per additional epoch.
model.fit(X_train_indices, Y_train, epochs = 1, batch_size = 32, shuffle=True)
model.save("08-model-bidir.h5")

Epoch 1/1
  4064/154571 [..............................] - ETA: 15:33 - loss: 0.2120 - acc: 0.9335

In [42]:
from sklearn import metrics

m = 5000  # score only the first m training rows to keep prediction time short

pred_train = model.predict(X_train_indices[0:m, :])

# Per-class ROC AUC (one value per column of class_names). With average=None,
# roc_auc_score evaluates every label column separately, which replaces the manual
# roc_curve + auc loop. It raises ValueError if a column of the slice contains only
# one class, instead of silently producing NaN.
aucs = metrics.roc_auc_score(Y_train[0:m, :], pred_train, average=None)

np.mean(aucs)

0.9918577265489499

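Full run (mean ROC AUC over all classes on the first 5,000 training samples, after each epoch):

- 1 epoch: 0.976620166821609
- 2 epochs: 0.9845997807085977
- 3 epochs: 0.9878419339250856
- 4 epochs: 0.989974420318395
- 5 epochs: 0.9909152880074429
- 6 epochs: 0.9918577265489499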


# Compute dev set mean AUC

In [43]:
# %%time

# model.evaluate(X_dev_indices, Y_dev, batch_size = len(train))
pred_dev = model.predict(X_dev_indices)

# Per-class ROC AUC on the whole dev set (one value per column of class_names). With
# average=None, roc_auc_score evaluates every label column separately, which replaces
# the manual roc_curve + auc loop. It raises ValueError if a label column contains only
# one class, instead of silently producing NaN.
aucs = metrics.roc_auc_score(Y_dev, pred_dev, average=None)

np.mean(aucs)

0.9860198630049598

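Full run (mean ROC AUC over all classes on the dev set, after each epoch; the trailing comment is the Kaggle score of the corresponding submission, where one was made):

- 1 epoch: 0.9760979734073979
- 2 epochs: 0.9833688361886418  # 0.9726 on Kaggle
- 3 epochs: 0.9849738642393575  # 0.975 on Kaggle
- 4 epochs: 0.9856226244722991  # 0.9755 on Kaggle
- 5 epochs: 0.9869049877287438  # 0.9762 on Kaggle
- 6 epochs: 0.9860198630049598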


# Sanity check train predictions

In [44]:
# Predict the first seven training rows as a sanity check; the 7th comment (in unshuffled train) is a very toxic doggo.
# NOTE(review): `train` was shuffled by train_test_split in the split cell, so rows 0-6 of
# X_train_indices are rows of the shuffled frame. Confirm that the comment referred to is
# really at position 6 here.
pred = model.predict(X_train_indices[0:7, :])

In [45]:
# Display the predicted probabilities: one row per comment, one column per entry of class_names.
pred

array([[1.49887273e-04, 5.32895186e-08, 1.32274026e-05, 2.57904667e-08,
        1.63681962e-05, 6.26262681e-06],
       [1.36357406e-03, 4.06428484e-07, 1.11510715e-04, 2.91054698e-06,
        1.32622808e-04, 7.97560006e-06],
       [1.50668508e-04, 1.46248382e-08, 2.69128286e-05, 1.51216852e-07,
        2.48739761e-05, 3.57397596e-07],
       [1.09278051e-04, 1.09495293e-08, 7.30224065e-06, 1.52667624e-07,
        1.40962302e-05, 1.46720640e-06],
       [2.38061007e-02, 1.52052735e-05, 6.72566378e-03, 6.67584300e-06,
        5.85544249e-03, 5.07789722e-04],
       [3.98359080e-05, 5.48619727e-09, 9.71933423e-06, 1.40894912e-07,
        5.46385991e-06, 6.13036320e-07],
       [9.99693394e-01, 5.03463268e-01, 9.68376160e-01, 7.94901609e-01,
        7.90331244e-01, 5.58436923e-02]], dtype=float32)

# Create submission csv

In [46]:
# Encode the test comments exactly like the training comments and score them.
X_test_indices = sentences_to_indices(test['comment_text'], word_to_index, max_len)
pred = model.predict(X_test_indices)

# Assemble the submission table: the comment id followed by one probability column per class.
preds = pd.DataFrame(pred, columns=class_names)
submit = pd.DataFrame({'id': test['id'].values})
submit = pd.concat([submit, preds], axis=1)

# Write without the row index so the CSV contains only `id` and the class columns.
submit.to_csv('../data/submission_08_bidir.csv', index=False)