In [2]:
3 + 4

7

In [3]:
import numpy as np
import pandas as pd
from preprocess import preprocess  # local file. restart kernel if this changed, it won't be re-imported otherwise
from sklearn.model_selection import train_test_split

## Quick-iteration alternative: read only the first rows while prototyping.
#train = pd.read_csv('../data/train.csv', nrows=50000).fillna(' ')  # the full train set has 159571 comments
#test = pd.read_csv('../data/test.csv', nrows=5000).fillna(' ')

## Load the complete data sets; missing values are replaced by a single space.
train = pd.read_csv('../data/train.csv').fillna(' ')
test = pd.read_csv('../data/test.csv').fillna(' ')

# preprocess() comes from the local preprocess.py module. It returns six values;
# class_names is used further down to select the label columns of `train`.
# NOTE(review): the exact cleaning it applies is not visible here -- see preprocess.py.
[train, test, train_text, test_text, all_text, class_names] = preprocess(train, test)

# Optional: randomly drop about 80% of the training comments that carry no label at all,
# to reduce the class imbalance a bit (disabled).
#delete_idx = (train[class_names].sum(axis = 1) == 0) & (np.random.rand(len(train)) > 0.2)
#keep_idx = ~delete_idx  # element-wise NOT for pd.Series
#train = train[keep_idx]

In [4]:
# Hold out a fixed 5000 comments as the dev (validation) set.
# An integer test_size is an absolute number of rows. The equivalent float
# fraction is rounded *up* by scikit-learn, so floating-point error in
# 5000 / len(train) could silently produce 5001 rows.
dev_size = 5000
test_fraction = dev_size / len(train)  # share of the data held out (informational only)

train, dev = train_test_split(train, test_size=dev_size, random_state=42)
# The split shuffles the rows; renumber both frames 0..len-1 so that
# label-based and positional lookups agree again.
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)

In [5]:
# Option A: derive the sequence length from the longest comment
# (word count after whitespace splitting) across train and test:
#max_len_train = train['comment_text'].str.split().apply(len).max()
#max_len_test = test['comment_text'].str.split().apply(len).max()
#max_len = max(max_len_train, max_len_test)

# Option B: use a fixed cap; sentences_to_indices() truncates longer comments
# to their first max_len words.
max_len = 200  # a smaller cap means fewer time steps for the recurrent layer

## Verify you're using GPU

(also enter `nvidia-smi` in the shell to see a `top`-like GPU usage summary)

In [6]:
# List every device TensorFlow can see; a "/device:GPU:0" entry in the
# printed list means the GPU is visible to this kernel.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

  from ._conv import register_converters as _register_converters


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4946876026808567605
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11288277812
locality {
  bus_id: 1
}
incarnation: 175892099359653336
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0, compute capability: 3.7"
]


In [7]:
import tensorflow as tf
# Smoke test: build a tiny (2x3) @ (3x2) matrix product explicitly placed on the first GPU.
with tf.device('/gpu:0'):
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
    c = tf.matmul(a, b)

# Evaluate the graph; the expected result is [[22, 28], [49, 64]].
with tf.Session() as sess:
    print (sess.run(c))

[[22. 28.]
 [49. 64.]]


In [8]:
# this took ages!
#import tensorflow as tf
#sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

## Proceed

In [9]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, GRU, Activation, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

Using TensorFlow backend.


In [10]:
def read_glove_vecs(glove_file):
    """
    Load word vectors from a GloVe-style text file.

    Every non-empty line is expected to hold a token followed by its vector
    components, all separated by whitespace. Blank lines are skipped.

    Arguments:
    glove_file -- path to the text file; it is decoded as UTF-8

    Returns:
    words_to_index -- dict mapping each word to its 1-based rank in sorted order
                      (index 0 is never assigned, so it stays free for padding)
    index_to_words -- dict with the inverse mapping, index -> word
    word_to_vec_map -- dict mapping each word to its vector as a float64 numpy array
    """
    word_to_vec_map = {}

    # Be explicit about the encoding: the platform default is not UTF-8
    # everywhere, and the vocabulary contains non-ASCII tokens.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # A blank line would otherwise raise IndexError on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Sorting makes the word -> index assignment deterministic across runs.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w

    return words_to_index, index_to_words, word_to_vec_map

In [11]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('../data/glove.6B.100d.txt')

In [12]:
import re

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts a collection of sentences (strings) into a left-padded matrix of word indices
    that can be fed to an `Embedding()` layer.

    Each sentence is lower-cased, every non-word character is replaced by a space, and the
    result is split on whitespace. Sentences longer than max_len keep only their *first*
    max_len words. Shorter sentences are padded with zeros on the left, so the last word
    always lands in the last column. Words missing from word_to_index are left as 0.

    Arguments:
    X -- 1-D collection of m sentences (pandas Series, 1-D numpy array or list of str).
         Items are read in positional order, so a Series with any index works.
    word_to_index -- a dictionary mapping each known word to its index
    max_len -- number of columns of the output, i.e. maximum number of words kept per sentence

    Returns:
    X_indices -- float array of shape (m, max_len) holding the word indices
    """

    m = len(X)
    X_indices = np.zeros((m, max_len))

    # Iterate by position rather than X[i]: indexing a pandas Series with an
    # integer is label-based and fails or picks the wrong row when the index
    # is not 0..m-1 (e.g. right after a shuffle or a filter).
    for i, sentence in enumerate(X):
        # Keep the *beginning* of the comment if it is too long.
        sentence_words = re.sub(r"\W", " ", sentence).lower().split()[:max_len]

        # First column used by this sentence; everything before it stays 0 (padding).
        offset = max_len - len(sentence_words)

        for k, w in enumerate(sentence_words):
            index = word_to_index.get(w)
            if index is not None:
                X_indices[i, offset + k] = index

    return X_indices

In [13]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a frozen Keras Embedding() layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation (numpy arrays
                       that all share the same length)
    word_to_index -- dictionary mapping words to their row index in the embedding matrix;
                     indices are assumed to lie in 1..len(word_to_index)

    Returns:
    embedding_layer -- Keras Embedding instance of shape (len(word_to_index) + 1, emb_dim),
                       not trainable, with the pre-trained vectors loaded as weights

    Raises:
    ValueError -- if word_to_vec_map is empty, since the vector size cannot be inferred
    """

    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    # One extra row: index 0 is not assigned to any word and serves as padding.
    vocab_len = len(word_to_index) + 1
    # Infer the vector size from an arbitrary entry instead of a hard-coded word,
    # so any vocabulary works.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Row `index` of the matrix holds the vector of the word mapped to `index`.
    # Rows that no word maps to (row 0 in particular) remain all-zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # The vectors are kept fixed during training (trainable = False).
    # mask_zero is not enabled, so padded positions are embedded as row 0.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))

    # Load the pre-trained vectors into the layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [14]:
def myModel(input_shape, word_to_vec_map, word_to_index):
    """
    Builds the comment classifier graph:

        word indices -> frozen pre-trained embeddings -> bidirectional GRU (32 units per
        direction, returns the full sequence) -> global max pooling and global average
        pooling, concatenated -> Dense(50, relu) -> Dropout(0.25) -> Dense(6, sigmoid)

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- an uncompiled Keras Model mapping index sequences to 6 sigmoid outputs
    """

    # Integer word indices, one row per comment.
    token_ids = Input(shape = input_shape, dtype = 'int32')

    # Look the indices up in the pre-trained embedding layer.
    glove_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embedded = glove_layer(token_ids)

    # Bidirectional GRU that returns the hidden state of every time step,
    # with dropout on both the inputs and the recurrent connections.
    recurrent_cell = GRU(units = 32, activation = 'relu', return_sequences = True,
                         dropout = 0.25, recurrent_dropout = 0.25)
    sequence_states = Bidirectional(recurrent_cell)(embedded)

    # Summarise the sequence over time in two ways and join both summaries.
    pooled_max = GlobalMaxPooling1D()(sequence_states)
    pooled_avg = GlobalAveragePooling1D()(sequence_states)
    pooled = concatenate([pooled_max, pooled_avg])

    # Small fully connected head with dropout (rate 0.25).
    hidden = Dense(50, activation = 'relu')(pooled)
    hidden = Dropout(rate = 0.25)(hidden)

    # Six independent sigmoid units, so several outputs can be active at once.
    scores = Dense(units = 6, activation = 'sigmoid')(hidden)

    return Model(inputs = token_ids, outputs = scores)

In [15]:
model = myModel((max_len,), word_to_vec_map, word_to_index)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 100)     40000100    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 200, 64)      25536       embedding_1[0][0]                
__________________________________________________________________________________________________
global_max_pooling1d_1 (GlobalM (None, 64)           0           bidirectional_1[0][0]            
__________________________________________________________________________________________________
global_ave

In [16]:
# Encode the comment text as left-padded word-index matrices of shape (n_rows, max_len)
# and pull the label columns out as plain numpy arrays, for both train and dev.
X_train_indices = sentences_to_indices(train['comment_text'], word_to_index, max_len)
Y_train = train[class_names].values
X_dev_indices = sentences_to_indices(dev['comment_text'], word_to_index, max_len)
Y_dev = dev[class_names].values

In [17]:
# If you want gradient clipping, put optimizer=adam_gradclip in the model.compile() call

#from keras import optimizers
#
#adam_gradclip = optimizers.Adam(clipnorm = 1, clipvalue = 0.5)

In [18]:
# model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
# if you want to save a model's configuration and weights, and train it further tomorrow:

#from keras.models import load_model
#model = load_model("11-model.h5")

# Up the batch size to use all GPU cores!

### batch_size = 2048

GPU utilization  85%

53sec/epoch

total wall time (5 epochs) 4min 30sec

### batch_size = 4096

GPU utilization 87-90%

35sec/epoch

total wall time (5 epochs) 2min 56sec

In [26]:
%%time 

model.fit(X_train_indices, Y_train, epochs = 10, batch_size = 4096, shuffle=True)
model.save("11-model.h5")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 5min 46s, sys: 1min 25s, total: 7min 12s
Wall time: 5min 46s


In [27]:
from sklearn import metrics

# Training performance is estimated on the first m rows only, to keep this quick.
m = 5000
pred_train = model.predict(X_train_indices[0:m, :])

# One ROC AUC per label column; roc_curve returns (fpr, tpr, thresholds)
# and auc only needs the first two.
aucs = np.array([
    metrics.auc(*metrics.roc_curve(Y_train[0:m, col], pred_train[0:m, col], pos_label = 1)[:2])
    for col in range(len(class_names))
], dtype = float)

# Mean column-wise AUC (displayed as the cell output).
np.mean(aucs)

0.9804495992549048

GPU Glove 100 w/ dropout (batch size 4096), avg+max pooling concat'ed:

- 5 Epochs: 0.946007807866916
- 10: 0.9647018394257634
- 20: 0.9764435124653218
- 30: 0.9804495992549048

CPU Glove 100 w/ dropout (batch size 512), avg+max pooling concat'ed:

- 5 Epochs: 0.9813304560865536
- 10: 0.9858895978606


# Compute dev set mean AUC

In [28]:
# Predict on the complete dev set and compute the ROC AUC of every label column.
# (`metrics` is sklearn.metrics, imported in the training-AUC cell.)
pred_dev = model.predict(X_dev_indices)

per_label_auc = []
for col in range(len(class_names)):
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(Y_dev[:, col], pred_dev[:, col], pos_label = 1)
    per_label_auc.append(metrics.auc(false_pos_rate, true_pos_rate))
aucs = np.array(per_label_auc, dtype = float)

# Mean column-wise AUC (displayed as the cell output).
np.mean(aucs)

0.9810237145941006

GPU Glove 100 w/ dropout (batch size 4096), avg+max pooling concat'ed:

- 5: 0.938849160860439
- 10: 0.9617743661464152
- 20: 0.9757392806839182
- 30: 0.9810237145941006

CPU Glove 100 w/ dropout (batch size 512), avg+max pooling concat'ed:

- 5: 0.9818096375442297
- 10: 0.9868617031269369  # 0.9765 on kaggle

# Sanity check train predictions

In [29]:
pred = model.predict(X_train_indices[0:7, :])  # the 7th comment (in unshuffled train) is a very toxic doggo

In [30]:
pred

array([[1.12801127e-03, 2.01574039e-06, 3.40407249e-04, 1.68826173e-05,
        1.58078881e-04, 1.05438312e-05],
       [8.59036576e-03, 1.17899262e-05, 1.14056678e-03, 6.66162741e-05,
        1.31641165e-03, 7.25964928e-05],
       [4.65280493e-04, 7.82757752e-08, 5.99020896e-05, 2.78784637e-06,
        2.93973972e-05, 9.68194627e-07],
       [5.89314674e-04, 1.12668644e-07, 6.74578550e-05, 3.63553113e-06,
        5.60625085e-05, 1.08975030e-06],
       [1.50705613e-02, 7.65925870e-06, 2.53840745e-03, 5.40783913e-05,
        2.33510602e-03, 1.02269616e-04],
       [1.06972991e-03, 4.07087924e-07, 1.65902369e-04, 1.60822729e-05,
        9.64629653e-05, 3.51609128e-06],
       [9.79930282e-01, 3.23228002e-01, 8.85371685e-01, 8.63350928e-02,
        7.75630593e-01, 2.05526173e-01]], dtype=float32)