In [1]:
import numpy as np
import pandas as pd
import os
from os.path import join
from tqdm import tqdm

from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D

from keras.callbacks import Callback
from keras.preprocessing import text, sequence
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
# --- Configuration ----------------------------------------------------------
max_features=100000   # cap on vocabulary size (passed to the tokenizer as num_words)
maxlen=150            # every comment is padded/truncated to this many tokens
embed_size=300        # dimensionality of the pre-trained word vectors


# Directory holding the pre-trained vectors. It can be overridden with the
# WORDVEC_DIR environment variable so the notebook is not tied to one machine;
# the default is the original location.
path_wordvec = os.environ.get('WORDVEC_DIR', 'E://DM//NLP//WordVec')
EMBEDDING_FILE = join(path_wordvec, 'glove.840B.300d.txt')
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


# --- Load data --------------------------------------------------------------
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# fillna() returns a new Series; the result must be used, otherwise missing
# comments stay NaN, .str.lower() propagates the NaN, and the tokenizer later
# receives floats instead of strings.
X_train = train["comment_text"].fillna("fillna").str.lower()
X_test = test["comment_text"].fillna("fillna").str.lower()
y_train = train[labels]


print('X_train: ' + str(X_train.shape))
print('y_train: ' + str(y_train.shape))
print('X_test: ' + str(X_test.shape))
print('y_train: ' + str(list(y_train.columns)))

In [None]:
# Fit one vocabulary over the train and test comments, keeping at most
# `max_features` word indices.
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Comments -> lists of integer word ids.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Force every sequence to exactly `maxlen` ids.
X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=maxlen)

# Sanity check: raw token counts of the first ten training comments, and the
# shape of the padded training matrix.
length_seq = list(map(len, X_train_seq[:10]))
print('length of X_train_seq:', length_seq, sep='')
print('shape of X_train_pad: ', X_train_pad.shape, sep='')

In [None]:
# Parse the pre-trained vector file into {word: float32 vector of length embed_size}.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in tqdm(f):
        values = line.rstrip().split(' ')
        # A valid line is the token followed by exactly `embed_size` floats,
        # so it needs at least embed_size + 1 fields.
        assert len(values) > embed_size, 'malformed embedding line: %r' % line[:80]
        # Some tokens in this file contain spaces themselves, so the word is
        # everything before the last `embed_size` fields, not just field 0.
        # Taking values[1:] would try to parse part of the token as a float.
        word = ' '.join(values[:-embed_size])
        coefs = np.asarray(values[-embed_size:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Allocate the (still all-zero) embedding matrix: one row per word id that the
# model can see.
word_index = tokenizer.word_index

# Never more rows than `max_features`. The "+ 1" leaves room for row 0
# (presumably because Keras tokenizer ids start at 1 -- confirm).
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

# Report, one per line: pre-trained vocabulary size, corpus vocabulary size,
# the configured cap, and the resulting matrix shape.
print(
    len(embeddings_index),
    len(word_index),
    max_features,
    embedding_matrix.shape,
    sep='\n',
)


In [None]:
# Copy the pre-trained vector of every in-vocabulary word into its row of
# `embedding_matrix`, counting the words that have no pre-trained vector.
num_missed = 0
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # No pre-trained vector: the row keeps its all-zero initial value.
            num_missed += 1
        else:
            embedding_matrix[i] = embedding_vector
print('number of words that are not found in the embedding_vector: ' + str(num_missed))


In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Parameters
    ----------
    validation_data : tuple
        A 2-item ``(X_val, y_val)`` pair. The empty default is only a
        placeholder: it cannot be unpacked, so a real pair must be supplied.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval`` (``1`` means every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would start the lookup *after* Callback in the
        # MRO and therefore skip Callback.__init__ altogether; the zero-arg
        # form runs the base-class initialiser as intended.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set and print its ROC-AUC.

        ``epoch`` is the zero-based index passed by Keras; it is printed as
        ``epoch + 1``. ``logs`` is accepted for interface compatibility and is
        not read (``None`` default avoids a shared mutable default argument).
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [None]:
# A blog about LSTM-CNNs:
# http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
def GetModel():
    """Build and compile the bidirectional-GRU + Conv1D multi-label classifier.

    Reads the notebook globals ``maxlen``, ``embed_size``, ``embedding_matrix``
    and ``labels``.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    bidirectional GRU (128 units, full sequence returned) -> Conv1D
    (64 filters, kernel 3) -> global average and global max pooling,
    concatenated -> one sigmoid unit per label.

    Returns
    -------
    keras.models.Model
        Compiled with binary cross-entropy, Adam (lr=1e-3) and the accuracy
        metric.
    """
    sequence_input = Input(shape=(maxlen, ))
    # The embedding's input_dim must equal the number of rows of the weight
    # matrix. embedding_matrix has min(max_features, vocab + 1) rows, so using
    # max_features directly breaks whenever the vocabulary is smaller than the cap.
    x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix],trainable = False)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
    x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    # Optional extra dense head, currently disabled:
    # x = Dense(128, activation='relu')(x)
    # x = Dropout(0.1)(x)
    # One independent sigmoid per target column (multi-label, not softmax).
    preds = Dense(len(labels), activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])
    return model

model = GetModel()
model.summary()

In [None]:
# Training hyper-parameters.
batch_size = 128
epochs = 4

# Hold out part of the padded training data for validation (90% is kept for
# training); the fixed random_state makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(
    X_train_pad,
    y_train,
    train_size=0.9,
    random_state=233,
)

In [None]:
# File that receives the weights of the best epoch seen so far.
filepath="weights_base.best.hdf5"

# Save weights only when validation accuracy improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# Stop once validation accuracy has not improved for 5 epochs.
# NOTE(review): `epochs` is set to 4 in the previous cell, so with that value
# this patience can never be reached.
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)

# Print ROC-AUC on the validation split after every epoch.
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)

callbacks_list = [ra_val,checkpoint, early]

In [None]:
# Train on the training split, validating on the held-out split each epoch.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks = callbacks_list,
    verbose=1,
)

# Restore the checkpointed weights written during training before predicting,
# rather than using whatever the last epoch left in memory.
model.load_weights(filepath)

print('Predicting....')
y_pred = model.predict(X_test_pad,batch_size=1024,verbose=1)

In [None]:
# Use the sample submission as a template, overwrite its label columns with
# the test-set predictions, and write the result without the index column.
submission = pd.read_csv('data/sample_submission.csv')
submission[labels] = y_pred
submission.to_csv('submission.csv', index=False)