In [35]:
import re, os, gc, time, pandas as pd, numpy as np
import tqdm

np.random.seed(32)
#os.environ["OMP_NUM_THREADS"] = "5"
from nltk import tokenize, word_tokenize
import collections
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, Add, Flatten, TimeDistributed,CuDNNGRU,CuDNNLSTM
from keras.optimizers import Adam, RMSprop
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
# from keras.engine.topology import Layer
from keras.engine import InputSpec, Layer

from global_variables import TRAIN_FILENAME, TEST_FILENAME, COMMENT, LIST_CLASSES, UNKNOWN_CHAR, PAD_CHAR
import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback


class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is fed to
            ``model.predict`` and ``y_val`` is passed to ``roc_auc_score`` as
            the ground truth. The empty-tuple default cannot be unpacked and
            raises ``ValueError``, so a pair must always be supplied.
        interval: evaluate only on epochs whose zero-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super() must be anchored on this class; anchoring it on Callback
        # starts the MRO lookup *after* Callback and skips Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print the ROC-AUC score.

        ``logs`` is accepted for Keras API compatibility and is not used
        (None instead of a shared mutable ``{}`` default).
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based; print it one-based like the Keras progress log.
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))
            
# Notebook-wide settings read by the cells below.
# NOTE(review): embed_size is not referenced by the visible build_model cells,
# which hard-code their embedding width — confirm whether it should be wired in.
embed_size = 256
# NOTE(review): max_features is not referenced in the visible cells — possibly a leftover.
max_features = 150000
# Fixed number of characters per comment: passed as `maxlen` to char2seq and
# used as the model's input length.
max_text_len = 700
# A character needs strictly more occurrences than this to get its own
# vocabulary index in create_char_vocabulary.
min_count_chars = 50

In [36]:
def create_char_vocabulary(texts, min_count_chars):
    """Build character <-> index lookup tables from a corpus.

    Every character whose total count over ``texts`` is strictly greater than
    ``min_count_chars`` receives an index starting at 1, in the iteration
    order of the counter. ``UNKNOWN_CHAR`` is then mapped to 0 and
    ``PAD_CHAR`` to -1 (overriding any index they may already hold).

    Args:
        texts: iterable of strings.
        min_count_chars: exclusive lower bound on a character's frequency.

    Returns:
        ``(char2index, index2char)`` — a dict and its inverse.
    """
    char_counts = collections.Counter()
    for text in texts:
        char_counts.update(text)
    print('%s characters found' % len(char_counts))
    print('keepin characters with count > %s' % min_count_chars)

    # Frequent characters take consecutive indices 1, 2, 3, ...
    char2index = {}
    for char, count in char_counts.items():
        if count > min_count_chars:
            char2index[char] = len(char2index) + 1

    # Reserved entries are written last so they win over any corpus character.
    char2index[UNKNOWN_CHAR] = 0
    char2index[PAD_CHAR] = -1

    index2char = {}
    for char, index in char2index.items():
        index2char[index] = char

    print('%s remaining characters' % len(char2index))
    return char2index, index2char

def char2seq(texts, char2index, maxlen, dtype=np.float64):
    """Encode texts as a fixed-width matrix of character indices.

    Each text is cut to its first ``maxlen`` characters; shorter texts are
    right-padded with 0. Characters that are not keys of ``char2index`` are
    encoded as ``char2index[UNKNOWN_CHAR]``.

    Args:
        texts: sized sequence of strings.
        char2index: dict mapping a character to its integer index; must
            contain ``UNKNOWN_CHAR``.
        maxlen: number of columns of the returned matrix.
        dtype: dtype of the returned array. Defaults to float64, which is
            what the function has always returned; an integer dtype such as
            ``np.int32`` stores the same indices in less memory.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), maxlen)``.

    Raises:
        KeyError: if ``UNKNOWN_CHAR`` is missing from ``char2index``.
    """
    # Resolve the fallback once instead of on every unknown character.
    unknown_index = char2index[UNKNOWN_CHAR]
    res = np.zeros((len(texts), maxlen), dtype=dtype)
    # total= lets tqdm show a percentage/ETA instead of a bare counter.
    for row, text in tqdm.tqdm(enumerate(texts), total=len(texts)):
        # Only the first maxlen characters can land in the output, so the
        # tail of long texts is never encoded.
        indices = [char2index.get(char, unknown_index) for char in text[:maxlen]]
        res[row, :len(indices)] = indices
    return res

In [37]:
# Load the raw train/test CSVs.
# NOTE(review): TRAIN_FILENAME, TEST_FILENAME and COMMENT are imported from
# global_variables but the paths and the column name are hard-coded here —
# confirm whether those constants should be used instead.
train_data = pd.read_csv('assets/raw_data/train.csv')
test_data = pd.read_csv('assets/raw_data/test.csv')
# Missing comments become the literal placeholder "_NAN_" so that every entry
# is a string before character-level encoding.
sentences_train = train_data["comment_text"].fillna("_NAN_").values
sentences_test = test_data["comment_text"].fillna("_NAN_").values
# Label matrix: one column per entry of LIST_CLASSES.
Y = train_data[LIST_CLASSES].values

In [38]:
# The vocabulary is built from the training comments only; both splits are
# then encoded with it, so characters absent from the vocabulary are mapped to
# the UNKNOWN_CHAR index inside char2seq.
char2index, index2char = create_char_vocabulary(sentences_train, min_count_chars)
X = char2seq(sentences_train, char2index=char2index, maxlen=max_text_len)
X_test = char2seq(sentences_test, char2index=char2index, maxlen=max_text_len)

2384it [00:00, 23830.93it/s]

2335 characters found
keepin characters with count > 50
241 remaining characters


159571it [00:06, 24228.25it/s]
153164it [00:05, 25703.01it/s]


In [100]:
"""
http://soroush.mit.edu/publications/tweet2vec_vvr.pdf
"""

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate,MaxPooling1D


def build_model(lr=0.0, embedding_dim=256, conv_activation=None):
    """Build and compile the character-level CNN + bidirectional LSTM model.

    Layer stack: Embedding -> SpatialDropout1D(0.2) -> 2 x Conv1D(64, 7)
    -> MaxPooling1D(3) -> Dropout(0.2) -> 2 x Conv1D(64, 3)
    -> Bidirectional(CuDNNLSTM(64), summed) -> Dropout(0.5)
    -> Dense(6, sigmoid).

    Reads the notebook-level ``max_text_len`` (input length) and
    ``char2index`` (its size is the embedding vocabulary size).

    Args:
        lr: Adam learning rate. The default of 0.0 produces zero-sized
            updates, so callers should always pass a real value.
        embedding_dim: width of the character embedding (previously the
            hard-coded literal 256).
        conv_activation: activation applied by every Conv1D layer. The
            default None keeps the previous behaviour (linear convolutions).
            NOTE(review): two linear convolutions in a row compose into a
            single linear map; passing e.g. "relu" adds a non-linearity
            between them.

    Returns:
        A compiled ``keras.models.Model`` trained with binary cross-entropy.
    """
    sentence_input = Input(shape=(max_text_len,), dtype="int32")
    embedded_sequences = Embedding(len(char2index), embedding_dim,
                                   input_length=max_text_len, trainable=True)(sentence_input)
    embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)

    conv = embedded_sequences
    conv = Conv1D(64, 7, strides=1, activation=conv_activation)(conv)
    conv = Conv1D(64, 7, strides=1, activation=conv_activation)(conv)
    conv = MaxPooling1D(pool_size=3)(conv)
    conv = Dropout(0.2)(conv)
    conv = Conv1D(64, 3, strides=1, activation=conv_activation)(conv)
    conv = Conv1D(64, 3, strides=1, activation=conv_activation)(conv)

    # merge_mode='sum' adds the forward and backward states, keeping 64 units.
    lstm = Bidirectional(CuDNNLSTM(64), merge_mode='sum')(conv)
    lstm = Dropout(0.5)(lstm)

    # Six independent sigmoid outputs (multi-label).
    out = Dense(6, activation="sigmoid")(lstm)
    model = Model(sentence_input, out)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr), metrics=["accuracy"])
    return model


# Build one throw-away instance just to inspect layer shapes and parameter
# counts; the training loop builds a fresh model for each fold.
model = build_model(lr=1e-3)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_86 (InputLayer)        (None, 700)               0         
_________________________________________________________________
embedding_86 (Embedding)     (None, 700, 256)          61696     
_________________________________________________________________
spatial_dropout1d_53 (Spatia (None, 700, 256)          0         
_________________________________________________________________
conv1d_642 (Conv1D)          (None, 694, 64)           114752    
_________________________________________________________________
conv1d_643 (Conv1D)          (None, 688, 64)           28736     
_________________________________________________________________
max_pooling1d_288 (MaxPoolin (None, 229, 64)           0         
_________________________________________________________________
dropout_73 (Dropout)         (None, 229, 64)           0         
__________

In [101]:
# K-fold training over contiguous, unshuffled slices of X / Y.
fold_count = 10
# Derived from fold_count so the split stays consistent if it is changed
# (the literals 10 and 9 used to be repeated here).
fold_size = len(X) // fold_count
histories = []  # one Keras History per fold; `history` alone only keeps the last
for fold_id in range(fold_count):
    fold_start = fold_size * fold_id
    # The last fold absorbs the remainder left by the integer division.
    fold_end = len(X) if fold_id == fold_count - 1 else fold_start + fold_size

    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]
    X_train = np.concatenate([X[:fold_start], X[fold_end:]])
    Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

    # A fresh model per fold so no weights leak between folds.
    model = build_model(lr = 0.0003)
    file_path = "VGG_%s_.hdf5" %fold_id
    ra_val = RocAucEvaluation(validation_data = (X_valid, Y_valid), interval = 1)
    # Only the epoch with the lowest validation loss is kept on disk.
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
    history = model.fit(X_train, Y_train, batch_size = 128, epochs = 15, validation_data = (X_valid, Y_valid),
                    verbose = 1, callbacks = [ra_val, check_point])
    histories.append(history)

Train on 143614 samples, validate on 15957 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.889004

Epoch 00001: val_loss improved from inf to 0.09333, saving model to VGG_0_.hdf5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.934126

Epoch 00002: val_loss improved from 0.09333 to 0.07308, saving model to VGG_0_.hdf5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.947383

Epoch 00003: val_loss improved from 0.07308 to 0.06530, saving model to VGG_0_.hdf5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.954628

Epoch 00004: val_loss improved from 0.06530 to 0.06520, saving model to VGG_0_.hdf5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.959435

Epoch 00005: val_loss improved from 0.06520 to 0.06115, saving model to VGG_0_.hdf5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.962733

Epoch 00006: val_loss improved from 0.06115 to 0.05686, saving model to VGG_0_.hdf5
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.965750

Epoch 00007: val_loss improved from 0.05686 to 0.05550, saving model to VGG_0_.hdf5
Epoch 8/15

KeyboardInterrupt: 

In [90]:
"""
py_crepe
"""

def build_model(lr=0.0, embedding_dim=128, conv_activation=None):
    """Build and compile the purely convolutional ("py_crepe") character model.

    Layer stack: Embedding -> Conv1D(64, 7) -> MaxPooling1D(3)
    -> Conv1D(64, 7) -> MaxPooling1D(3) -> Dropout(0.5)
    -> 2 x Conv1D(64, 3) -> MaxPooling1D(3) -> Flatten
    -> Dense(128, relu) -> Dropout(0.5) -> Dense(6, sigmoid).

    Reads the notebook-level ``max_text_len`` (input length) and
    ``char2index`` (its size is the embedding vocabulary size).

    Args:
        lr: Adam learning rate. The default of 0.0 produces zero-sized
            updates, so callers should always pass a real value.
        embedding_dim: width of the character embedding (previously the
            hard-coded literal 128).
        conv_activation: activation applied by every Conv1D layer. The
            default None keeps the previous behaviour (linear convolutions).
            NOTE(review): the two back-to-back Conv1D(64, 3) layers compose
            into a single linear map unless an activation such as "relu" is
            passed.

    Returns:
        A compiled ``keras.models.Model`` trained with binary cross-entropy.
    """
    sentence_input = Input(shape=(max_text_len,), dtype="int32")
    embedded_sequences = Embedding(len(char2index), embedding_dim,
                                   input_length=max_text_len, trainable=True)(sentence_input)

    conv = embedded_sequences
    conv = Conv1D(64, 7, strides=1, activation=conv_activation)(conv)
    conv = MaxPooling1D(pool_size=3)(conv)
    conv = Conv1D(64, 7, strides=1, activation=conv_activation)(conv)
    conv = MaxPooling1D(pool_size=3)(conv)
    conv = Dropout(0.5)(conv)
    conv = Conv1D(64, 3, strides=1, activation=conv_activation)(conv)
    conv = Conv1D(64, 3, strides=1, activation=conv_activation)(conv)
    conv = MaxPooling1D(pool_size=3)(conv)

    # Fully connected head on the flattened feature map.
    flat = Flatten()(conv)
    flat = Dense(128, activation="relu")(flat)
    flat = Dropout(0.5)(flat)

    # Six independent sigmoid outputs (multi-label).
    out = Dense(6, activation="sigmoid")(flat)
    model = Model(sentence_input, out)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr), metrics=["accuracy"])
    return model


# Build one throw-away instance just to inspect layer shapes and parameter
# counts; the training loop builds a fresh model for each fold.
model = build_model(lr=1e-3)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_67 (InputLayer)        (None, 700)               0         
_________________________________________________________________
embedding_67 (Embedding)     (None, 700, 128)          30848     
_________________________________________________________________
conv1d_560 (Conv1D)          (None, 694, 64)           57408     
_________________________________________________________________
max_pooling1d_263 (MaxPoolin (None, 231, 64)           0         
_________________________________________________________________
conv1d_561 (Conv1D)          (None, 225, 64)           28736     
_________________________________________________________________
max_pooling1d_264 (MaxPoolin (None, 75, 64)            0         
_________________________________________________________________
dropout_48 (Dropout)         (None, 75, 64)            0         
__________

In [92]:
# Train only the first fold(s) of a 10-way split, as a quick experiment.
n_splits = 10   # number of contiguous, unshuffled slices X / Y are divided into
fold_count = 1  # how many of those slices are actually trained in this run
# The split size depends on n_splits, not on how many folds are run.
fold_size = len(X) // n_splits
histories = []  # one Keras History per fold; `history` alone only keeps the last
for fold_id in range(fold_count):
    fold_start = fold_size * fold_id
    # The last slice of the split absorbs the integer-division remainder.
    fold_end = len(X) if fold_id == n_splits - 1 else fold_start + fold_size

    X_valid = X[fold_start:fold_end]
    Y_valid = Y[fold_start:fold_end]
    X_train = np.concatenate([X[:fold_start], X[fold_end:]])
    Y_train = np.concatenate([Y[:fold_start], Y[fold_end:]])

    # A fresh model per fold so no weights leak between folds.
    model = build_model(lr = 0.0002)
    # NOTE(review): the other training cell in this notebook writes to the
    # same "VGG_<fold>_.hdf5" paths, so running both overwrites checkpoints.
    file_path = "VGG_%s_.hdf5" %fold_id
    ra_val = RocAucEvaluation(validation_data = (X_valid, Y_valid), interval = 1)
    # Only the epoch with the lowest validation loss is kept on disk.
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min", save_best_only = True, verbose = 1)
    history = model.fit(X_train, Y_train, batch_size = 128, epochs = 15, validation_data = (X_valid, Y_valid),
                    verbose = 1, callbacks = [ra_val, check_point])
    histories.append(history)

Train on 143614 samples, validate on 15957 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.899193

Epoch 00001: val_loss improved from inf to 0.09543, saving model to VGG_0_.hdf5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.942262

Epoch 00002: val_loss improved from 0.09543 to 0.07270, saving model to VGG_0_.hdf5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.950165

Epoch 00003: val_loss improved from 0.07270 to 0.07090, saving model to VGG_0_.hdf5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.951687

Epoch 00004: val_loss did not improve
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.962034

Epoch 00005: val_loss improved from 0.07090 to 0.06674, saving model to VGG_0_.hdf5
Epoch 6/15

KeyboardInterrupt: 