In [2]:
import re, os, gc, time, pandas as pd, numpy as np
import tqdm

np.random.seed(32)
os.environ["OMP_NUM_THREADS"] = "5"
from nltk import tokenize, word_tokenize
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, Add, Flatten, TimeDistributed,CuDNNGRU,CuDNNLSTM
from keras.optimizers import Adam, RMSprop
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
# from keras.engine.topology import Layer
from keras.engine import InputSpec, Layer
from global_variables import TRAIN_FILENAME, TEST_FILENAME, COMMENT, LIST_CLASSES
import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback


"""
I should also try:
https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py
"""


class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC during training.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair; it is unpacked in the constructor, so the
        empty-tuple default cannot actually be used and a pair must be given.
    interval : int
        The score is computed on every epoch whose zero-based index is a
        multiple of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name the subclass in super() so that Callback.__init__ really runs;
        # super(Callback, self) would skip straight past it.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print its ROC-AUC.

        ``logs`` is accepted for API compatibility and is not read; it defaults
        to ``None`` rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based, the printed number is one-based
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))


class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    Input must be rank 3 (checked in ``build``); the weighted sum is taken over
    axis 1 and the output has shape ``(input_shape[0], input_shape[2])``. With
    ``return_attention=True`` the per-timestep weights are returned as a second
    output of shape ``(input_shape[0], input_shape[1])``.
    """

    def __init__(self, return_attention=False, **kwargs):
        """Store the options; ``kwargs`` are forwarded to the base ``Layer``."""
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the single weight ``W`` of shape ``(input_shape[2], 1)``."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        # NOTE(review): add_weight presumably registers W already, so this
        # explicit assignment looks redundant and may fail on Keras versions
        # where trainable_weights is read-only -- confirm before upgrading.
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        When ``mask`` is given it is cast to float and multiplied into the
        unnormalised weights, so masked timesteps contribute nothing.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards the division when every weight has been masked to zero
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that simply delegates to ``compute_output_shape``."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return the output shape(s): axis 1 of the input is reduced away."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Return no mask (``None`` per entry when given a list of masks)."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None


# Sizes shared by the preprocessing code and the model definition.
embed_size = 300  # width of one embedding row
max_features = 150000  # vocabulary cap handed to the Tokenizer and the embedding matrix
max_text_len = 300  # word slots per sentence in the input tensor
max_sent = 5  # sentence slots per comment in the input tensor

# Alternative embedding file (disabled):
# EMBEDDING_FILE = "../input/glove840b300dtxt/glove.840B.300d.txt"
EMBEDDING_FILE = "assets/embedding_models/ft_300d_crawl/crawl-300d-2M.vec"


# Ordered (old, new) substitutions applied by clean_corpus. Order matters: each
# rule runs on the output of the previous one.
_CLEAN_REPLACEMENTS = (
    ('&', ' and '),
    ('0', ' zero '),
    ('1', ' one '),
    ('2', ' two '),
    ('3', ' three '),
    ('4', ' four '),
    ('5', ' five '),
    ('6', ' six '),
    ('7', ' seven '),
    ('8', ' eight '),
    ('9', ' nine '),
    ('\'ve', ' have '),
    ('\'d', ' would '),
    ('\'m', ' am '),
    ('n\'t', ' not '),
    ('\'s', ' is '),
    # Match the full "'re" suffix: matching only "'r" left a stray "e" behind
    # ("you're" -> "you are e").
    ('\'re', ' are '),
)


def clean_corpus(comment):
    """Normalise one raw comment string.

    Spells out '&' and digits, expands a handful of English contractions,
    removes backslashes, runs NLTK ``word_tokenize`` and re-joins the tokens
    with single spaces.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        Lower-cased, stripped, space-separated token string.
    """
    for old, new in _CLEAN_REPLACEMENTS:
        comment = comment.replace(old, new)
    # drop literal backslashes (escaped newlines and the like)
    comment = comment.replace("\\", "")
    tokens = word_tokenize(comment)
    return " ".join(tokens).strip().lower()


tic = time.time()

# Load the raw frames and pull out the label matrix before the text is touched.
train = pd.read_csv(TRAIN_FILENAME)
test = pd.read_csv(TEST_FILENAME)
Y = train[LIST_CLASSES].values

print('cleaning corpus')
# Assign the filled column back instead of calling fillna(inplace=True) on
# train[COMMENT]: the in-place call on a selected column is a chained mutation
# that does not reach the frame under pandas Copy-on-Write.
train[COMMENT] = train[COMMENT].fillna("no comment").apply(clean_corpus)
test[COMMENT] = test[COMMENT].fillna("no comment").apply(clean_corpus)

print('tokenizing')
# One list of sentence strings per comment; consumed by sentenize().
train["sentences"] = train[COMMENT].apply(tokenize.sent_tokenize)
test["sentences"] = test[COMMENT].apply(tokenize.sent_tokenize)
toc = time.time()
print(toc-tic)


from keras.preprocessing.text import Tokenizer, text_to_word_sequence

print('fitting tokenizer')
# The vocabulary is learned from the cleaned training comments only.
raw_text = train[COMMENT]
tk = Tokenizer(
    lower=True,
    num_words=max_features,
)
tk.fit_on_texts(raw_text)

def sentenize(data):
    """Encode every comment as a ``(max_sent, max_text_len)`` grid of word ids.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide ``data["sentences"]``: one iterable of sentence strings
        per comment.

    Returns
    -------
    numpy.ndarray
        int32 array of shape ``(len(comments), max_sent, max_text_len)``,
        zero-padded on the right.

    Notes
    -----
    * Only the first ``max_sent`` sentences and the first ``max_text_len``
      filled slots of each sentence are kept.
    * A word missing from the fitted tokenizer keeps its slot as 0 and still
      advances the position; a known word whose index is ``>= max_features``
      is skipped without using a slot. Both behaviours are kept from the
      original implementation so the produced matrices are unchanged.
    """
    comments = data["sentences"]
    word_index = tk.word_index  # hoisted: looked up once instead of per word
    sent_matrix = np.zeros((len(comments), max_sent, max_text_len), dtype="int32")
    for i, sentences in enumerate(comments):
        for j, sent in enumerate(sentences):
            if j >= max_sent:
                break
            k = 0
            for word in text_to_word_sequence(sent):
                if k >= max_text_len:
                    # sentence row is full; nothing further can be written
                    break
                idx = word_index.get(word)
                if idx is None:
                    # out-of-vocabulary: slot stays 0 but is consumed
                    k += 1
                elif idx < max_features:
                    sent_matrix[i, j, k] = idx
                    k += 1
    return sent_matrix

print('sentenizing')
# Encode both splits into (n_comments, max_sent, max_text_len) id tensors.
X, X_test = sentenize(train), sentenize(test)

# The raw frames are not needed past this point; release them.
del train
del test
gc.collect()

print('loading embeddings')
tic = time.time()


def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the coefficients parsed as float32."""
    return word, np.asarray(arr, dtype="float32")


# Parse the text embedding file: one "<token> <v1> ... <v_embed_size>" row per line.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding="utf-8") as embedding_file:
    for line in embedding_file:
        parts = line.split()
        if len(parts) <= embed_size:
            # header row ("<count> <dim>") or a truncated row: not a vector
            continue
        # The last embed_size fields are the vector; whatever precedes them is
        # the token (which may itself contain spaces in some embedding files).
        token, coefs = get_coefs(" ".join(parts[:-embed_size]), *parts[-embed_size:])
        embeddings_index[token] = coefs

word_index = tk.word_index
# Tokenizer indices start at 1, so len(word_index) + 1 rows are needed to make
# the largest index addressable when the vocabulary is below max_features.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words without a pretrained vector keep an all-zero row
        embedding_matrix[i] = embedding_vector
toc = time.time()
print(toc-tic)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


cleaning corpus
tokenizing
175.55582976341248
fitting tokenizer
sentenizing
loading embeddings
85.90108561515808


In [3]:

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import SpatialDropout1D,GlobalAveragePooling1D,GlobalMaxPooling1D,concatenate

def build_model(rnn_units = 0, lr = 0.0):
    """Build and compile the two-level (word -> sentence) recurrent classifier.

    A sentence encoder (frozen pretrained embedding, spatial dropout,
    bidirectional CuDNNLSTM with summed directions) is applied to every
    sentence slot through ``TimeDistributed``; a second bidirectional
    CuDNNLSTM then reads the sequence of sentence encodings and feeds a
    six-unit sigmoid layer.

    Parameters
    ----------
    rnn_units : int
        Units of both CuDNNLSTM layers.
    lr : float
        Learning rate handed to Adam.

    Returns
    -------
    keras.models.Model
        Compiled with binary cross-entropy loss and the accuracy metric.
    """
    # --- word level: encode one sentence of max_text_len word ids ---
    word_ids = Input(shape=(max_text_len,), dtype="int32")
    word_vectors = Embedding(
        nb_words,
        embed_size,
        weights=[embedding_matrix],
        input_length=max_text_len,
        trainable=False,
    )(word_ids)
    word_vectors = SpatialDropout1D(0.5)(word_vectors)
    sentence_vector = Bidirectional(CuDNNLSTM(rnn_units), merge_mode='sum')(word_vectors)
    sentEncoder = Model(word_ids, sentence_vector)

    # --- sentence level: encode the max_sent sentence vectors of a comment ---
    review_input = Input(shape=(max_sent, max_text_len), dtype="int32")
    sentence_sequence = TimeDistributed(sentEncoder)(review_input)
    sentence_sequence = SpatialDropout1D(0.5)(sentence_sequence)
    comment_vector = Bidirectional(CuDNNLSTM(rnn_units), merge_mode='sum')(sentence_sequence)

    # --- classification head ---
    comment_vector = Dropout(0.2)(comment_vector)
    out = Dense(6, activation="sigmoid")(comment_vector)

    model = Model(review_input, out)
    model.compile(
        loss="binary_crossentropy",
        optimizer=Adam(lr=lr),
        metrics=["accuracy"],
    )
    return model



# Build a 32-unit instance only to print the architecture; the training cells
# below rebuild the model with rnn_units = 64.
model = build_model(rnn_units = 32, lr = 1e-3)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 5, 300)            0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 32)             45085504  
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 5, 32)             0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32)                16896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 198       
Total params: 45,102,598
Trainable params: 102,598
Non-trainable params: 45,000,000
__________________________________________________________

In [3]:
# Manual selection of one validation fold out of ten contiguous slices of X.
fold_id = 9
fold_size = len(X) // 10  # integer division: up to 9 leftover rows fall outside the equal-sized slices
fold_start = fold_size * fold_id
fold_end = fold_start + fold_size

In [6]:
# Display the current end index of the validation slice as the cell output.
fold_end

159571

In [5]:
# The last fold takes every remaining row, so the rows left over by the
# integer division in fold_size still end up in a validation slice.
if fold_id == 9:
    fold_end = len(X)

In [7]:




# Rows [fold_start, fold_end) are held out; everything else is training data.
X_valid, Y_valid = X[fold_start:fold_end], Y[fold_start:fold_end]
X_train = np.concatenate([X[:fold_start], X[fold_end:]])
Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

model = build_model(rnn_units=64, lr=1e-3)
file_path = "HAN_%s_.hdf5" % fold_id
ra_val = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval=1)
# Only the weights with the lowest validation loss are kept on disk.
check_point = ModelCheckpoint(
    file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=15,
    validation_data=(X_valid, Y_valid),
    verbose=1,
    callbacks=[ra_val, check_point],
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.971781

Epoch 00001: val_loss improved from inf to 0.05087, saving model to HAN_9_.hdf5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.978148

Epoch 00002: val_loss improved from 0.05087 to 0.04822, saving model to HAN_9_.hdf5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.979982

Epoch 00003: val_loss did not improve
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.982253

Epoch 00004: val_loss improved from 0.04822 to 0.04624, saving model to HAN_9_.hdf5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.983539

Epoch 00005: val_loss improved from 0.04624 to 0.04468, saving model to HAN_9_.hdf5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.984120

Epoch 00006: val_loss improved from 0.04468 to 0.04423, saving model to HAN_9_.hdf5
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.984087

Epoch 00007: val_loss did not improve
Epoch 8/15
 ROC-AUC - epoch: 8 - score: 0.984326

Epoch 00008: val_loss improved from 0.04423 to 0.04409, s

In [5]:
# Train one model per contiguous validation fold and checkpoint the best epoch
# of each to HAN_<fold_id>_.hdf5.
fold_count = 10
fold_size = len(X) // fold_count
fold_histories = []  # one History object per fold, in fold order
for fold_id in range(fold_count):
    fold_start = fold_size * fold_id
    # the last fold absorbs the rows left over by the integer division
    fold_end = len(X) if fold_id == fold_count - 1 else fold_start + fold_size

    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]
    X_train = np.concatenate([X[:fold_start], X[fold_end:]])
    Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

    # Drop the previous fold's graph before building a new model; otherwise
    # every fold adds another full model to the same backend session.
    K.clear_session()
    model = build_model(rnn_units = 64, lr = 1e-3)
    file_path = "HAN_%s_.hdf5" %fold_id
    ra_val = RocAucEvaluation(validation_data = (X_valid, Y_valid), interval = 1)
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
    history = model.fit(X_train, Y_train, batch_size = 64, epochs = 15, validation_data = (X_valid, Y_valid),
                    verbose = 1, callbacks = [ra_val, check_point])
    fold_histories.append(history)



Train on 143614 samples, validate on 15957 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.975378

Epoch 00001: val_loss improved from inf to 0.05095, saving model to HAN_0_.hdf5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.979589

Epoch 00002: val_loss improved from 0.05095 to 0.04723, saving model to HAN_0_.hdf5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.981205

Epoch 00003: val_loss improved from 0.04723 to 0.04577, saving model to HAN_0_.hdf5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.982592

Epoch 00004: val_loss improved from 0.04577 to 0.04448, saving model to HAN_0_.hdf5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.983621

Epoch 00005: val_loss improved from 0.04448 to 0.04338, saving model to HAN_0_.hdf5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.983867

Epoch 00006: val_loss improved from 0.04338 to 0.04325, saving model to HAN_0_.hdf5
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.985115

Epoch 00007: val_loss improved from 0.04325 to 0.04231, saving model to HAN_0_.hdf5
Epoch 8/15
 ROC

 ROC-AUC - epoch: 4 - score: 0.977234

Epoch 00004: val_loss improved from 0.04859 to 0.04667, saving model to HAN_3_.hdf5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.978960

Epoch 00005: val_loss improved from 0.04667 to 0.04564, saving model to HAN_3_.hdf5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.978599

Epoch 00006: val_loss improved from 0.04564 to 0.04541, saving model to HAN_3_.hdf5
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.980398

Epoch 00007: val_loss improved from 0.04541 to 0.04501, saving model to HAN_3_.hdf5
Epoch 8/15
 ROC-AUC - epoch: 8 - score: 0.981975

Epoch 00008: val_loss did not improve
Epoch 9/15
 ROC-AUC - epoch: 9 - score: 0.981099

Epoch 00009: val_loss improved from 0.04501 to 0.04416, saving model to HAN_3_.hdf5
Epoch 10/15
 ROC-AUC - epoch: 10 - score: 0.980824

Epoch 00010: val_loss did not improve
Epoch 11/15
 ROC-AUC - epoch: 11 - score: 0.981574

Epoch 00011: val_loss did not improve
Epoch 12/15
 ROC-AUC - epoch: 12 - score: 0.982299

Epoch 00012: val_loss

Epoch 14/15
 ROC-AUC - epoch: 14 - score: 0.987029

Epoch 00014: val_loss did not improve
Epoch 15/15
 ROC-AUC - epoch: 15 - score: 0.986979

Epoch 00015: val_loss did not improve
Train on 143614 samples, validate on 15957 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.974295

Epoch 00001: val_loss improved from inf to 0.04844, saving model to HAN_5_.hdf5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.976826

Epoch 00002: val_loss improved from 0.04844 to 0.04800, saving model to HAN_5_.hdf5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.980374

Epoch 00003: val_loss improved from 0.04800 to 0.04414, saving model to HAN_5_.hdf5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.982033

Epoch 00004: val_loss improved from 0.04414 to 0.04272, saving model to HAN_5_.hdf5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.982931

Epoch 00005: val_loss improved from 0.04272 to 0.04212, saving model to HAN_5_.hdf5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.984035

Epoch 00006: val_loss improved from 0.04212 to 0.0411

Epoch 8/15
 ROC-AUC - epoch: 8 - score: 0.984762

Epoch 00008: val_loss did not improve
Epoch 9/15
 ROC-AUC - epoch: 9 - score: 0.984995

Epoch 00009: val_loss improved from 0.04192 to 0.04130, saving model to HAN_6_.hdf5
Epoch 10/15
 ROC-AUC - epoch: 10 - score: 0.984750

Epoch 00010: val_loss did not improve
Epoch 11/15
 ROC-AUC - epoch: 11 - score: 0.984738

Epoch 00011: val_loss did not improve
Epoch 12/15
 ROC-AUC - epoch: 12 - score: 0.985242

Epoch 00012: val_loss did not improve
Epoch 13/15
 ROC-AUC - epoch: 13 - score: 0.985209

Epoch 00013: val_loss did not improve
Epoch 14/15
 ROC-AUC - epoch: 14 - score: 0.985883

Epoch 00014: val_loss did not improve
Epoch 15/15
 ROC-AUC - epoch: 15 - score: 0.983979

Epoch 00015: val_loss did not improve
Train on 143614 samples, validate on 15957 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.973080

Epoch 00001: val_loss improved from inf to 0.05024, saving model to HAN_7_.hdf5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.978006

Epoch 0

Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.983180

Epoch 00003: val_loss improved from 0.04656 to 0.04420, saving model to HAN_8_.hdf5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.984520

Epoch 00004: val_loss improved from 0.04420 to 0.04383, saving model to HAN_8_.hdf5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.984543

Epoch 00005: val_loss improved from 0.04383 to 0.04333, saving model to HAN_8_.hdf5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.985631

Epoch 00006: val_loss improved from 0.04333 to 0.04193, saving model to HAN_8_.hdf5
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.986068

Epoch 00007: val_loss did not improve
Epoch 8/15
 ROC-AUC - epoch: 8 - score: 0.985996

Epoch 00008: val_loss improved from 0.04193 to 0.04109, saving model to HAN_8_.hdf5
Epoch 9/15
 ROC-AUC - epoch: 9 - score: 0.985830

Epoch 00009: val_loss did not improve
Epoch 10/15
 ROC-AUC - epoch: 10 - score: 0.986652

Epoch 00010: val_loss improved from 0.04109 to 0.04084, saving model to HAN_8_.hdf5
Epoch 11/15
 ROC-A

In [4]:
# Reload each fold's best checkpoint and collect its test predictions together
# with its out-of-fold predictions and the matching labels.
list_of_preds = []  # per-fold predictions on X_test
list_of_vals = []   # per-fold predictions on that fold's validation slice
list_of_y = []      # labels of that validation slice
fold_count = 10
fold_size = len(X) // fold_count
for fold_id in range(fold_count):
    fold_start = fold_size * fold_id
    # the last fold absorbs the rows left over by the integer division
    fold_end = len(X) if fold_id == fold_count - 1 else fold_start + fold_size

    # Only the validation slice is needed here; the training split is not
    # rebuilt because nothing in this loop reads it.
    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]

    # Release the previously loaded model's graph before loading the next one.
    K.clear_session()
    file_path = 'HAN_' + str(fold_id) + '_.hdf5'
    model = load_model(file_path) #,custom_objects = {"AttentionWeightedAverage": AttentionWeightedAverage}
    preds = model.predict(X_test, batch_size = 256, verbose = 1)
    list_of_preds.append(preds)
    vals = model.predict(X_valid, batch_size = 256, verbose = 1)
    list_of_vals.append(vals)
    list_of_y.append(Y_valid)



In [5]:
# Average the per-fold test predictions (accumulated from a float64 zero array).
test_predicts = sum(list_of_preds, np.zeros(list_of_preds[0].shape))
test_predicts = test_predicts / len(list_of_preds)

submission = pd.read_csv('assets/raw_data/sample_submission.csv')
submission[LIST_CLASSES] = test_predicts
submission.to_csv('han_l2_test_data.csv', index=False)

# Out-of-fold table: the stacked validation predictions of every fold in the
# 'logits_<class>' columns, followed by the matching labels.
logit_columns = ['logits_' + c for c in LIST_CLASSES]
oof_predictions = pd.DataFrame(np.concatenate(list_of_vals, axis=0), columns=logit_columns)
oof_labels = pd.DataFrame(np.concatenate(list_of_y, axis=0), columns=LIST_CLASSES)
l2_data = pd.concat([oof_predictions, oof_labels], axis=1)
l2_data.to_csv('han_l2_train_data.csv')