In [1]:
import re, os, gc, time, pandas as pd, numpy as np
import tqdm

np.random.seed(32)
#os.environ["OMP_NUM_THREADS"] = "5"
from nltk import tokenize, word_tokenize
import collections
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, Add, Flatten, TimeDistributed,CuDNNGRU,CuDNNLSTM
from keras.optimizers import Adam, RMSprop
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
# from keras.engine.topology import Layer
from keras.engine import InputSpec, Layer

from global_variables import TRAIN_FILENAME, TEST_FILENAME, COMMENT, LIST_CLASSES, UNKNOWN_CHAR, PAD_CHAR
import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback


class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; it is unpacked eagerly, so
            the default empty tuple raises ``ValueError``.
        interval: the score is computed on epochs whose zero-based index is
            a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not its parent) in super() so that
        # Callback.__init__ actually runs; super(Callback, self) skipped it.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set and print its ROC-AUC.

        ``logs`` is accepted for Keras API compatibility and is not used.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based; print it one-based like the Keras progress log.
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))
            
# --- Notebook-wide settings -------------------------------------------------
# Number of characters kept per comment (used as `maxlen` and as the model's
# input length further down).
max_text_len = 700
# Characters must occur more than this many times to get their own index.
min_count_chars = 50
# NOTE(review): not referenced anywhere in the visible cells - confirm it is still needed.
max_features = 150000
# Embedding width; a later cell reassigns this before the model is built.
embed_size = 100

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
def create_char_vocabulary(texts, min_count_chars):
    """Build character <-> index lookup tables from a collection of texts.

    Characters occurring strictly more than ``min_count_chars`` times are
    numbered 1..N in the counter's iteration order. ``UNKNOWN_CHAR`` is
    mapped to 0 and ``PAD_CHAR`` to -1.

    Args:
        texts: iterable of strings to count characters in.
        min_count_chars: frequency threshold (exclusive).

    Returns:
        ``(char2index, index2char)`` - a dict and its inverse.
    """
    char_counts = collections.Counter()
    for text in texts:
        char_counts.update(text)
    print('%s characters found' %len(char_counts))
    print('keepin characters with count > %s' % min_count_chars)

    # Hand out consecutive ids starting at 1; 0 is reserved for unknowns.
    char2index = {}
    for char, count in char_counts.items():
        if count > min_count_chars:
            char2index[char] = len(char2index) + 1
    char2index[UNKNOWN_CHAR] = 0
    char2index[PAD_CHAR] = -1

    index2char = {ind: char for char, ind in char2index.items()}
    # The reported size includes the two special entries added above.
    print('%s remaining characters' % len(char2index))
    return char2index, index2char

def char2seq(texts, char2index, maxlen):
    """Encode texts as a fixed-width matrix of character indices.

    Each text is cut to its first ``maxlen`` characters; shorter texts are
    right-padded with 0 (the same value ``create_char_vocabulary`` assigns to
    ``UNKNOWN_CHAR``). Characters missing from ``char2index`` are encoded as
    ``char2index[UNKNOWN_CHAR]``.

    Args:
        texts: sized sequence of strings.
        char2index: mapping from character to integer index.
        maxlen: number of columns in the result.

    Returns:
        Array of shape ``(len(texts), maxlen)`` with numpy's default float
        dtype, as produced by ``np.zeros``.
    """
    res = np.zeros((len(texts), maxlen))
    # Wrap the sequence itself (not enumerate) so tqdm can read its length
    # and show a total / ETA.
    for k, text in enumerate(tqdm.tqdm(texts)):
        # Only the first maxlen characters can reach the output, so skip
        # encoding the tail of long texts.
        ids = [
            char2index[char] if char in char2index else char2index[UNKNOWN_CHAR]
            for char in text[:maxlen]
        ]
        res[k, :len(ids)] = ids
    return res

In [3]:
# Read the raw train / test CSV files.
train_data = pd.read_csv('assets/raw_data/train.csv')
test_data = pd.read_csv('assets/raw_data/test.csv')

# Comment text as plain arrays; missing comments become the literal "_NAN_".
sentences_train, sentences_test = (
    frame["comment_text"].fillna("_NAN_").values
    for frame in (train_data, test_data)
)

# Target matrix: one column per entry of LIST_CLASSES.
Y = train_data[LIST_CLASSES].values

In [4]:
# The vocabulary is built from the training comments only; both splits are
# then encoded with it and cut / padded to max_text_len columns.
char2index, index2char = create_char_vocabulary(sentences_train, min_count_chars)
X = char2seq(sentences_train, char2index, max_text_len)
X_test = char2seq(sentences_test, char2index, max_text_len)

2410it [00:00, 24050.37it/s]

2335 characters found
keepin characters with count > 50
241 remaining characters


159571it [00:06, 24121.94it/s]
153164it [00:05, 25856.21it/s]


In [21]:
"""
http://soroush.mit.edu/publications/tweet2vec_vvr.pdf
"""

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate,MaxPooling1D

# Overrides the embed_size set in the first cell.
embed_size = 50


def build_model(lr=0.0):
    """Build and compile the character-level conv + bidirectional GRU model.

    Reads ``max_text_len``, ``char2index`` and ``embed_size`` from the
    notebook's global scope.

    Args:
        lr: learning rate handed to Adam. Every call in this notebook passes
            it explicitly; the default of 0.0 is presumably a placeholder,
            since a zero learning rate would not move the weights.

    Returns:
        A compiled Keras ``Model`` mapping ``(max_text_len,)`` int32 inputs
        to 6 sigmoid outputs.
    """
    char_ids = Input(shape=(max_text_len,), dtype="int32")
    embedded = Embedding(
        len(char2index),
        embed_size,
        input_length=max_text_len,
        trainable=True,
    )(char_ids)

    # Convolutional front-end: two width-7 convs, pooling, two width-3 convs.
    features = SpatialDropout1D(0.2)(embedded)
    for _ in range(2):
        features = Conv1D(64, 7, strides=1)(features)
    features = MaxPooling1D(pool_size=3)(features)
    features = Dropout(0.2)(features)
    for _ in range(2):
        features = Conv1D(64, 3, strides=1)(features)

    # Recurrent layer over the conv features, summarised by avg + max pooling.
    recurrent = Bidirectional(CuDNNGRU(64, return_sequences=True))(features)
    pooled = concatenate([
        GlobalAveragePooling1D()(recurrent),
        GlobalMaxPooling1D()(recurrent),
    ])
    pooled = Dropout(0.2)(pooled)
    predictions = Dense(6, activation="sigmoid")(pooled)

    network = Model(char_ids, predictions)
    network.compile(
        loss="binary_crossentropy",
        optimizer=Adam(lr=lr),
        metrics=["accuracy"],
    )
    return network


# Build one instance just to print the layer-by-layer summary; the
# cross-validation cell below builds a fresh model for every fold.
model = build_model(lr=1e-3)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           (None, 700)          0                                            
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 700, 50)      12050       input_17[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_17 (SpatialDr (None, 700, 50)      0           embedding_17[0][0]               
__________________________________________________________________________________________________
conv1d_33 (Conv1D)              (None, 694, 64)      22464       spatial_dropout1d_17[0][0]       
__________________________________________________________________________________________________
conv1d_34 

In [33]:
# K-fold cross-validation over contiguous slices of X / Y.
# A fresh model is trained per fold and its best weights (lowest val_loss)
# are checkpointed to CRNN3_<fold>_.hdf5.
fold_count = 10
# Derive the slice size and the last-fold check from fold_count so that
# changing fold_count cannot silently mis-partition the data.
fold_size = len(X) // fold_count
for fold_id in range(0, fold_count):
    fold_start = fold_size * fold_id
    fold_end = fold_start + fold_size

    # The last fold absorbs the remainder left by the integer division.
    if fold_id == fold_count - 1:
        fold_end = len(X)

    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]
    X_train = np.concatenate([X[:fold_start], X[fold_end:]])
    Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

    model = build_model(lr = 0.001)
    file_path = "CRNN3_%s_.hdf5" %fold_id
    ra_val = RocAucEvaluation(validation_data = (X_valid, Y_valid), interval = 1)
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
    history = model.fit(X_train, Y_train, batch_size = 64, epochs = 15, validation_data = (X_valid, Y_valid),
                    verbose = 1, callbacks = [ra_val, check_point])

Train on 143614 samples, validate on 15957 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.936639

Epoch 00001: val_loss improved from inf to 0.07545, saving model to CRNN3_0_.hdf5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.957210

Epoch 00002: val_loss improved from 0.07545 to 0.06001, saving model to CRNN3_0_.hdf5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.964078

Epoch 00003: val_loss improved from 0.06001 to 0.05728, saving model to CRNN3_0_.hdf5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.969618

Epoch 00004: val_loss improved from 0.05728 to 0.05344, saving model to CRNN3_0_.hdf5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.972727

Epoch 00005: val_loss improved from 0.05344 to 0.05128, saving model to CRNN3_0_.hdf5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.975379

Epoch 00006: val_loss improved from 0.05128 to 0.04942, saving model to CRNN3_0_.hdf5
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.976251

Epoch 00007: val_loss improved from 0.04942 to 0.04824, saving model to CRNN3_0_.hdf5
E

KeyboardInterrupt: 

In [32]:
# Per-fold inference: load each fold's weights, predict on the test set and
# on that fold's held-out slice, then write
#   * the fold-averaged test predictions (submission format), and
#   * the out-of-fold predictions next to their labels (level-2 training data).
list_of_preds = []
list_of_vals = []
list_of_y = []
fold_count = 10
# Must partition X exactly like the training cell; both values are derived
# from fold_count so the two cannot drift apart from hard-coded numbers.
fold_size = len(X) // fold_count
for fold_id in range(0, fold_count):
    fold_start = fold_size * fold_id
    fold_end = fold_start + fold_size

    # The last fold absorbs the remainder left by the integer division.
    if fold_id == fold_count - 1:
        fold_end = len(X)

    # Only the held-out slice is needed here; the training split is not
    # rebuilt, which avoids copying almost all of X and Y on every fold.
    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]

    # NOTE(review): the training cell checkpoints to "CRNN3_<fold>_.hdf5",
    # but this cell loads "VGG_<fold>_.hdf5" - confirm which weights are intended.
    file_path = 'VGG_' + str(fold_id) + '_.hdf5'
    model = build_model(lr = 0.001)
    model.load_weights(file_path)
    #model = load_model(file_path,custom_objects = {"Capsule": Capsule})
    preds = model.predict(X_test, batch_size = 256, verbose = 1)
    list_of_preds.append(preds)
    vals = model.predict(X_valid, batch_size = 256, verbose = 1)
    list_of_vals.append(vals)
    list_of_y.append(Y_valid)

# Average the test predictions over all folds.
test_predicts = np.zeros(list_of_preds[0].shape)
for fold_predict in list_of_preds:
    test_predicts += fold_predict

test_predicts /= len(list_of_preds)
submission = pd.read_csv('assets/raw_data/sample_submission.csv')
submission[LIST_CLASSES] = test_predicts
submission.to_csv('CRNN2_l2_test_data.csv', index=False)

# Out-of-fold predictions, concatenated in fold order (the original row order
# of X), stored under 'logits_<class>' columns beside the true labels.
# The stored values are the model's sigmoid outputs.
l2_data = pd.DataFrame(columns=['logits_' + c for c in LIST_CLASSES]+LIST_CLASSES)
l2_data[['logits_' + c for c in LIST_CLASSES]] = pd.DataFrame(np.concatenate(list_of_vals,axis = 0))
l2_data[LIST_CLASSES] = pd.DataFrame(np.concatenate(list_of_y,axis = 0))
l2_data.to_csv('CRNN2_l2_train_data.csv')

