In [1]:
import numpy as np # linear algebra
import pandas as pd 

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
max_features = 20000
maxlen = 100

In [40]:
train = pd.read_csv("data/train_cleaned.csv")
test = pd.read_csv("data/test_cleaned.csv")

In [41]:
list_sentences_train = train["comment_text"].fillna("unk").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("unk").values

In [42]:
list_sentences_train = train["comment_text"].fillna("unk").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("unk").values

In [43]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [44]:
tokenizer.word_counts

OrderedDict([('nonsens', 960),
             ('kiss', 130),
             ('off', 3767),
             ('geek', 68),
             ('what', 20812),
             ('i', 144284),
             ('said', 4672),
             ('is', 108822),
             ('true', 2064),
             ('will', 23384),
             ('have', 54088),
             ('your', 38433),
             ('account', 2949),
             ('termin', 110),
             ('plea', 18053),
             ('do', 40401),
             ('not', 87629),
             ('vandal', 6528),
             ('page', 34760),
             ('a', 176681),
             ('you', 132262),
             ('did', 12170),
             ('with', 36162),
             ('this', 58665),
             ('edit', 25144),
             ('to', 178539),
             ('w', 876),
             ('s', 3955),
             ('merwin', 1),
             ('if', 35353),
             ('continu', 3951),
             ('so', 21883),
             ('be', 58990),
             ('block', 10247),
         

In [45]:
def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(100, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.3)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [46]:
model = get_model()
batch_size = 32
epochs = 3

In [47]:
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

early = EarlyStopping(monitor="val_loss", mode="min", patience=20)


In [48]:
callbacks_list = [checkpoint, early] #early
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

Train on 86265 samples, validate on 9586 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f557c68efd0>

In [38]:
model.load_weights(file_path)

y_test = model.predict(X_te)

In [39]:
sample_submission = pd.read_csv("data/sample_submission.csv")

sample_submission[list_classes] = y_test



sample_submission.to_csv("baseline1.csv", index=False)