In [1]:
import re, os, gc, time, pandas as pd, numpy as np
import tqdm

np.random.seed(32)
#os.environ["OMP_NUM_THREADS"] = "5"
from nltk import tokenize, word_tokenize
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, Add, Flatten, TimeDistributed,CuDNNGRU,CuDNNLSTM
from keras.optimizers import Adam, RMSprop
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
# from keras.engine.topology import Layer
from keras.engine import InputSpec, Layer
from preprocess_utils import preprocess
from global_variables import TRAIN_FILENAME, TEST_FILENAME, COMMENT, LIST_CLASSES, UNKNOWN_WORD
import logging
from collections import Counter
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
from nltk.tokenize import TweetTokenizer
from keras.preprocessing import text, sequence
   
# Dimensionality of each word vector; used below as the column count of
# embedding_matrix and as the Embedding layer's output dimension.
embed_size = 300
# Vocabulary cap: passed to the Keras Tokenizer as num_words and used as the
# Embedding layer's input dimension.
max_features = 150000
# Every comment is padded/truncated to this many token ids (pad_sequences
# maxlen, model Input shape).
max_text_len = 300

# Alternative embedding source (disabled):
# EMBEDDING_FILE = "../input/glove840b300dtxt/glove.840B.300d.txt"
# Text-format embedding file read line by line when the embedding index is built.
EMBEDDING_FILE = "assets/embedding_models/ft_300d_crawl/crawl-300d-2M.vec"

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; it is unpacked in the
            constructor, so the empty default raises ``ValueError``.
        interval: the score is computed on every epoch whose zero-based
            index is a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. The previous
        # super(Callback, self) call started the MRO lookup *after* Callback
        # and therefore skipped Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        ``logs`` is accepted for API compatibility and is not used; it
        defaults to None instead of a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based; report it one-based like Keras' own log.
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

def rm_hyperlinks(words):
    """Return a copy of ``words`` where link-like tokens become ``'url'``.

    A token counts as a link when it starts with ``http``, ``www`` or
    ``en.wikipedia.org/``, or when it ends with ``.com``.
    """
    link_prefixes = ('http', 'www', 'en.wikipedia.org/')
    cleaned = []
    for token in words:
        is_link = token.startswith(link_prefixes) or token.endswith('.com')
        cleaned.append('url' if is_link else token)
    return cleaned

def strip_spaces(words):
    """Return a new list with every space character removed from each token."""
    despaced = []
    for token in words:
        despaced.append(token.replace(' ', ''))
    return despaced

def tokenize_sentences(sentences):
    """Tokenize each sentence with NLTK's TweetTokenizer.

    Items exposing a ``decode`` method (e.g. byte strings) are decoded as
    UTF-8 first. Returns one token list per input sentence, in input order.
    """
    tweet_tok = TweetTokenizer()

    def _as_text(raw):
        # Only objects that can be decoded are touched; str passes through.
        return raw.decode("utf-8") if hasattr(raw, "decode") else raw

    return [tweet_tok.tokenize(_as_text(raw))
            for raw in tqdm.tqdm(sentences, mininterval=5)]

def tokenize_list_of_sentences(list_of_sentences):
    """Tokenize and clean several collections of sentences.

    Each collection is tokenized with ``tokenize_sentences``; every resulting
    token list then has hyperlinks replaced (``rm_hyperlinks``) and spaces
    removed (``strip_spaces``). Returns one cleaned collection per input
    collection, in the same order.
    """
    cleaned_collections = []
    for collection in list_of_sentences:
        # Word-level cleanup applied right after tokenization.
        cleaned = [strip_spaces(rm_hyperlinks(tokens))
                   for tokens in tokenize_sentences(collection)]
        cleaned_collections.append(cleaned)
    return cleaned_collections

def create_word2id(list_of_tokenized_sentences,max_features):
    """Build word<->id lookup tables from tokenized text.

    Args:
        list_of_tokenized_sentences: iterable of collections, each a sequence
            of token lists.
        max_features: keep only this many most frequent tokens.

    Returns:
        ``(word2id, id2word)``. Vocabulary words get ids ``1..N`` in order of
        decreasing frequency. ``UNKNOWN_WORD`` gets the next free id ``N + 1``
        unless it is already part of the vocabulary, in which case it keeps
        its own id.
    """
    word_counter = Counter()
    print('CREATING VOCABULARY')
    for tokenized_sentences in list_of_tokenized_sentences:
        for tokens in tqdm.tqdm(tokenized_sentences):
            word_counter.update(tokens)

    vocab = [word for word, _count in word_counter.most_common(max_features)]
    print('%s words detected, keeping %s words' % (len(word_counter), len(vocab)))
    word2id = {word: ind for ind, word in enumerate(vocab, start=1)}
    # Ids start at 1, so len(word2id) is already taken by the last vocabulary
    # word. Using it for UNKNOWN_WORD made two words share one id and dropped
    # the real word from id2word.
    if UNKNOWN_WORD not in word2id:
        word2id[UNKNOWN_WORD] = len(word2id) + 1
    id2word = {idx: word for word, idx in word2id.items()}
    return word2id, id2word

def tokenized_sentences2seq(tokenized_sentences, words_dict):
    """Map every token of every sentence to its id in ``words_dict``.

    Tokens whose lookup raises ``KeyError`` are replaced by the id stored
    under ``UNKNOWN_WORD``. Returns one id list per sentence.
    """
    print('converting to sequence')

    def _to_id(token):
        try:
            return words_dict[token]
        except KeyError:
            return words_dict[UNKNOWN_WORD]

    return [[_to_id(token) for token in sentence]
            for sentence in tqdm.tqdm(tokenized_sentences, mininterval=5)]

def tokenized_sentences2seq2(tokenized_sentences, words_dict):
    """Membership-test variant of ``tokenized_sentences2seq``.

    Args:
        tokenized_sentences: sequence of token lists. A flat sequence of
            string tokens is still accepted and yields a flat list of ids,
            as before.
        words_dict: token -> id mapping that contains ``UNKNOWN_WORD``.

    Returns:
        One id list per sentence (or one id per token for flat input).
        Tokens missing from ``words_dict`` map to the ``UNKNOWN_WORD`` id.
    """
    print('converting to sequence')

    def _to_id(token):
        return words_dict[token] if token in words_dict else words_dict[UNKNOWN_WORD]

    sequences = []
    for sentence in tqdm.tqdm(tokenized_sentences, mininterval=5):
        if isinstance(sentence, str):
            # Flat input: the element is itself a token.
            sequences.append(_to_id(sentence))
        else:
            # The old code used the whole token list as a dict key, which
            # raises TypeError (unhashable list); map token by token instead.
            sequences.append([_to_id(token) for token in sentence])
    return sequences

def get_coefs(word, *arr):
    """Pair ``word`` with its remaining arguments packed as a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def convert_tokens_to_ids(tokenized_sentences, embedding_word_dict, id2word):
    """Convert word-index sequences to fixed-length embedding-index sequences.

    Args:
        tokenized_sentences: sequences of word indices (keys of ``id2word``).
        embedding_word_dict: word -> embedding row index.
        id2word: word index -> word.

    Returns:
        One list per sentence, truncated or right-padded to ``max_text_len``.
        Words absent from ``embedding_word_dict`` map to
        ``len(embedding_word_dict) - 2``; padding uses
        ``len(embedding_word_dict) - 1``.
    """
    # Loop-invariant fallback ids (the dict is not modified here).
    # NOTE(review): presumably the last two embedding rows are reserved for
    # "unknown" and "padding" - confirm against the code that builds the dict.
    n_embeddings = len(embedding_word_dict)
    unknown_id = n_embeddings - 2
    padding_id = n_embeddings - 1

    words_train = []
    for sentence in tqdm.tqdm(tokenized_sentences):
        current_words = []
        for word_index in sentence:
            # Only the id2word lookup can raise KeyError; an unmapped index
            # is treated as the unknown word.
            try:
                word = id2word[word_index]
            except KeyError:
                word = UNKNOWN_WORD
            current_words.append(embedding_word_dict.get(word, unknown_id))

        # Truncate, then pad; the pad list is empty when already long enough.
        current_words = current_words[:max_text_len]
        current_words += [padding_id] * (max_text_len - len(current_words))
        words_train.append(current_words)
    return words_train

tic = time.time()

# --- Load raw data -----------------------------------------------------------
train_data = pd.read_csv(TRAIN_FILENAME)
test_data = pd.read_csv(TEST_FILENAME)
# Labels are taken from the raw frame before preprocess() runs.
# NOTE(review): assumes preprocess() keeps the row order and count - confirm.
Y = train_data[LIST_CLASSES].values

test_data = preprocess(test_data)
train_data = preprocess(train_data)

train_data = train_data["comment_text"].fillna("fillna").values
test_data = test_data["comment_text"].fillna("fillna").values

# --- Text -> padded id sequences ---------------------------------------------
tokenizer = text.Tokenizer(num_words=max_features)
print('fitting tokenizer')
# The vocabulary is fitted on train and test text together.
tokenizer.fit_on_texts(list(train_data) + list(test_data))
train_data = tokenizer.texts_to_sequences(train_data)
test_data = tokenizer.texts_to_sequences(test_data)
X = sequence.pad_sequences(train_data, maxlen=max_text_len)
X_test = sequence.pad_sequences(test_data, maxlen=max_text_len)

del train_data
del test_data

# --- Pre-trained embeddings ---------------------------------------------------
print('getting embeddings')
embeddings_index = {}
# Context manager closes the file handle; explicit encoding avoids depending on
# the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        word, *coefs = line.rstrip().rsplit(' ')
        # Skip lines that do not carry exactly embed_size values, e.g. the
        # "<count> <dim>" header of .vec files, which would otherwise be stored
        # as a bogus 1-dimensional vector.
        if len(coefs) != embed_size:
            continue
        embeddings_index[word] = np.asarray(coefs, dtype='float32')

word_index = tokenizer.word_index
# Tokenizer ids are 1-based, so a vocabulary of n words needs n + 1 rows.
nb_words = min(max_features, len(word_index) + 1)
# The matrix is handed to Embedding(max_features, ...) as its weights, so it
# must have exactly max_features rows even when the vocabulary is smaller.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep an all-zero row.
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

print('done')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


preprocessing
lowercase
removing breaks
expanding contractions
replacing smileys
replacing ip
removing links
replacing numbers
removing bigrams
isolating punct
preprocessing
lowercase
removing breaks
expanding contractions
replacing smileys
replacing ip
removing links
replacing numbers
removing bigrams
isolating punct
getting embeddings
done


In [2]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import SpatialDropout1D, MaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D, Concatenate, Reshape, Conv2D, MaxPool2D
from keras import regularizers

class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    Input is a 3D tensor (the rank is enforced in ``build``); the average is
    taken over axis 1 and the weight vector has one entry per element of
    axis 2.

    Args:
        return_attention: when True, ``call`` returns
            ``[weighted_average, attention_weights]`` instead of only the
            weighted average.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the attention weight vector of shape ``(channels, 1)``."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # add_weight() already registers W as a trainable weight; the former
        # manual `self.trainable_weights = [self.W]` was redundant and fails
        # on Keras versions where that attribute is read-only.
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every timestep is masked
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that forwards to ``compute_output_shape``."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Output is ``(batch, channels)``, plus ``(batch, timesteps)`` for the weights."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """The time axis is reduced away, so no mask is propagated."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """Include ``return_attention`` so a saved model reloads with the same setting."""
        config = super(AttentionWeightedAverage, self).get_config()
        config['return_attention'] = self.return_attention
        return config

def build_model(lr=0.0, dense_dropout=0.3):
    """Build and compile the multi-kernel CNN + attention classifier.

    Frozen pre-trained embeddings feed four parallel Conv1D branches (kernel
    sizes 1, 2, 3 and 5, 64 filters each). Each branch is max-pooled with a
    window of ``max_text_len - kernel_size + 1``, the pooled branches are
    stacked along axis 1 and combined by ``AttentionWeightedAverage``.

    Args:
        lr: Adam learning rate. The default of 0.0 is kept for backward
            compatibility; pass a positive value to actually train.
        dense_dropout: dropout rate applied just before the output layer.

    Returns:
        A compiled Keras ``Model`` with 6 sigmoid outputs, binary
        cross-entropy loss and an accuracy metric.
    """
    inp = Input(shape=(max_text_len, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable=False)(inp)
    x = SpatialDropout1D(0.4)(x)

    pooled_branches = []
    for kernel_size in (1, 2, 3, 5):
        conv = Conv1D(64, kernel_size=kernel_size, kernel_initializer='normal',activation='elu')(x)
        pooled_branches.append(MaxPool1D(pool_size=(max_text_len - kernel_size + 1))(conv))

    z = Concatenate(axis=1)(pooled_branches)
    z = AttentionWeightedAverage()(z)
    z = Dropout(dense_dropout)(z)

    out = Dense(6, activation="sigmoid")(z)

    model = Model(inp, out)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr), metrics=["accuracy"])
    return model


# Build one throw-away instance just to inspect the architecture.
model = build_model(lr=0.001)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     45000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 300, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 300, 64)      19264       spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
conv1d_2 (

In [43]:
# X is cut into n_splits contiguous validation folds; only the first
# fold_count of them are trained in this cell.
n_splits = 10
fold_count = 1
fold_size = len(X) // n_splits
for fold_id in range(0, fold_count):
    fold_start = fold_size * fold_id
    # The last fold also takes the rows left over by the integer division.
    fold_end = len(X) if fold_id == n_splits - 1 else fold_start + fold_size

    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]
    # Training data is everything outside the validation slice.
    X_train = np.concatenate([X[:fold_start], X[fold_end:]])
    Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

    model = build_model(lr = 0.001)
    file_path = "CNN_test1_%s_.hdf5" %fold_id
    ra_val = RocAucEvaluation(validation_data = (X_valid, Y_valid), interval = 1)
    # Only the epoch with the lowest validation loss is kept on disk.
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
    history = model.fit(X_train, Y_train, batch_size = 256, epochs = 10, validation_data = (X_valid, Y_valid),
                  verbose = 1, callbacks = [ra_val, check_point])

Train on 143614 samples, validate on 15957 samples
Epoch 1/10
 ROC-AUC - epoch: 1 - score: 0.978514

Epoch 00001: val_loss improved from inf to 0.04973, saving model to CNN_test1_0_.hdf5
Epoch 2/10
 ROC-AUC - epoch: 2 - score: 0.984996

Epoch 00002: val_loss improved from 0.04973 to 0.04528, saving model to CNN_test1_0_.hdf5
Epoch 3/10
 ROC-AUC - epoch: 3 - score: 0.985561

Epoch 00003: val_loss improved from 0.04528 to 0.04289, saving model to CNN_test1_0_.hdf5
Epoch 4/10
 ROC-AUC - epoch: 4 - score: 0.987053

Epoch 00004: val_loss did not improve
Epoch 5/10
 ROC-AUC - epoch: 5 - score: 0.987044

Epoch 00005: val_loss improved from 0.04289 to 0.04196, saving model to CNN_test1_0_.hdf5
Epoch 6/10
 ROC-AUC - epoch: 6 - score: 0.986786

Epoch 00006: val_loss improved from 0.04196 to 0.04092, saving model to CNN_test1_0_.hdf5
Epoch 7/10
 ROC-AUC - epoch: 7 - score: 0.987241

Epoch 00007: val_loss did not improve
Epoch 8/10
 ROC-AUC - epoch: 8 - score: 0.987176

Epoch 00008: val_loss did n

In [4]:
def build_model(lr=0.0, dense_dropout=0.4):
    """Build and compile the multi-kernel CNN + attention classifier.

    Frozen pre-trained embeddings feed four parallel Conv1D branches (kernel
    sizes 1, 2, 3 and 5, 64 filters each). Each branch is max-pooled with a
    window of ``max_text_len - kernel_size + 1``, the pooled branches are
    stacked along axis 1 and combined by ``AttentionWeightedAverage``.

    Args:
        lr: Adam learning rate. The default of 0.0 is kept for backward
            compatibility; pass a positive value to actually train.
        dense_dropout: dropout rate applied just before the output layer.

    Returns:
        A compiled Keras ``Model`` with 6 sigmoid outputs, binary
        cross-entropy loss and an accuracy metric.
    """
    inp = Input(shape=(max_text_len, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable=False)(inp)
    x = SpatialDropout1D(0.4)(x)

    pooled_branches = []
    for kernel_size in (1, 2, 3, 5):
        conv = Conv1D(64, kernel_size=kernel_size, kernel_initializer='normal',activation='elu')(x)
        pooled_branches.append(MaxPool1D(pool_size=(max_text_len - kernel_size + 1))(conv))

    z = Concatenate(axis=1)(pooled_branches)
    z = AttentionWeightedAverage()(z)
    z = Dropout(dense_dropout)(z)

    out = Dense(6, activation="sigmoid")(z)

    model = Model(inp, out)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr), metrics=["accuracy"])
    return model


model = build_model(lr=1e-3)
model.summary()

# X is cut into n_splits contiguous validation folds.
n_splits = 10
fold_count = 10
fold_size = len(X) // n_splits
# Only these folds are trained in this cell. Use range(0, fold_count) to train
# every fold.
folds_to_train = [7, 8, 9]
for fold_id in folds_to_train:
    fold_start = fold_size * fold_id
    # The last fold also takes the rows left over by the integer division.
    fold_end = len(X) if fold_id == n_splits - 1 else fold_start + fold_size

    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]
    # Training data is everything outside the validation slice.
    X_train = np.concatenate([X[:fold_start], X[fold_end:]])
    Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

    model = build_model(lr = 0.001)
    file_path = "Inception_ATT3_%s_.hdf5" %fold_id
    ra_val = RocAucEvaluation(validation_data = (X_valid, Y_valid), interval = 1)
    # Only the epoch with the lowest validation loss is kept on disk.
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
    history = model.fit(X_train, Y_train, batch_size = 256, epochs = 10, validation_data = (X_valid, Y_valid),
                  verbose = 1, callbacks = [ra_val, check_point])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 300)     45000000    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 300, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 300, 64)      19264       spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
conv1d_6 (

Epoch 4/10
 ROC-AUC - epoch: 4 - score: 0.984346

Epoch 00004: val_loss did not improve
Epoch 5/10
 ROC-AUC - epoch: 5 - score: 0.985450

Epoch 00005: val_loss improved from 0.04396 to 0.04248, saving model to Inception_ATT3_8_.hdf5
Epoch 6/10
 ROC-AUC - epoch: 6 - score: 0.984942

Epoch 00006: val_loss did not improve
Epoch 7/10
 ROC-AUC - epoch: 7 - score: 0.984397

Epoch 00007: val_loss did not improve
Epoch 8/10
 ROC-AUC - epoch: 8 - score: 0.984919

Epoch 00008: val_loss did not improve
Epoch 9/10
 ROC-AUC - epoch: 9 - score: 0.984514

Epoch 00009: val_loss did not improve
Epoch 10/10
 ROC-AUC - epoch: 10 - score: 0.984691

Epoch 00010: val_loss did not improve
Train on 143613 samples, validate on 15958 samples
Epoch 1/10
 ROC-AUC - epoch: 1 - score: 0.974228

Epoch 00001: val_loss improved from inf to 0.05037, saving model to Inception_ATT3_9_.hdf5
Epoch 2/10
 ROC-AUC - epoch: 2 - score: 0.979520

Epoch 00002: val_loss improved from 0.05037 to 0.04802, saving model to Inception_A

In [5]:
# Per-fold outputs: test-set predictions, out-of-fold predictions and the
# matching out-of-fold labels.
list_of_preds = []
list_of_vals = []
list_of_y = []
n_splits = 10
fold_count = 10
fold_size = len(X) // n_splits
for fold_id in range(0, fold_count):
    fold_start = fold_size * fold_id
    # The last fold also takes the rows left over by the integer division.
    fold_end = len(X) if fold_id == n_splits - 1 else fold_start + fold_size

    # Only the validation slice is needed for inference; the training split is
    # no longer rebuilt here because nothing in this cell reads it.
    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]

    file_path = 'Inception_ATT3_' + str(fold_id) + '_.hdf5'
    # The custom layer must be passed explicitly so the saved model can be rebuilt.
    model = load_model(file_path,custom_objects = {"AttentionWeightedAverage": AttentionWeightedAverage})
    preds = model.predict(X_test, batch_size = 256, verbose = 1)
    list_of_preds.append(preds)
    vals = model.predict(X_valid, batch_size = 256, verbose = 1)
    list_of_vals.append(vals)
    list_of_y.append(Y_valid)

# Test prediction = unweighted mean over the fold models.
test_predicts = np.zeros(list_of_preds[0].shape)
for fold_predict in list_of_preds:
    test_predicts += fold_predict

test_predicts /= len(list_of_preds)
submission = pd.read_csv('assets/raw_data/sample_submission.csv')
submission[LIST_CLASSES] = test_predicts
submission.to_csv('Inception_ATT3_l2_test_data.csv', index=False)

# Level-2 training table: out-of-fold model outputs next to the true labels.
# The folds are contiguous and processed in order, so row order follows X.
# The 'logits_' columns hold the model's sigmoid outputs.
logit_columns = ['logits_' + c for c in LIST_CLASSES]
l2_data = pd.concat(
    [pd.DataFrame(np.concatenate(list_of_vals, axis = 0), columns = logit_columns),
     pd.DataFrame(np.concatenate(list_of_y, axis = 0), columns = LIST_CLASSES)],
    axis = 1)
l2_data.to_csv('Inception_ATT3_l2_train_data.csv')

