In [6]:
%pylab inline
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import GridSearchCV
from keras.models import Model
from keras import losses
from keras import backend as K
from keras.layers import Dense, Embedding, Input
from keras.preprocessing import text, sequence
from keras.layers import LSTM, GRU, Bidirectional, GlobalMaxPool1D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [2]:
def preprocess_1(ftrain, ftest, max_features=20000, maxlen=100):
    """Tokenize and pad the comment text of the train and test CSV files.

    The tokenizer vocabulary is fitted only on the training comments that
    carry at least one positive label; every train and test comment is then
    encoded with that vocabulary.

    Args:
        ftrain: path of the training CSV; it must provide ``comment_text``
            and the six label columns.
        ftest: path of the test CSV; it must provide ``comment_text``.
        max_features: forwarded to the tokenizer as ``num_words``.
        maxlen: forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        A tuple ``(X_train, X_test, y)``: padded training sequences, padded
        test sequences, and the label values of the training frame.
    """
    label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    train_df = pd.read_csv(ftrain)
    test_df = pd.read_csv(ftest)

    # Missing comments are replaced by a single blank so tokenizing never sees NaN.
    train_texts = train_df["comment_text"].fillna(" ").values
    test_texts = test_df["comment_text"].fillna(" ").values

    # Only rows flagged with at least one of the six labels feed the vocabulary.
    flagged = (train_df[label_cols] == 1).any(axis=1)
    vocab_texts = train_df[flagged]["comment_text"].fillna(" ").values

    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(vocab_texts))

    X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
    X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)
    y = train_df[label_cols].values
    return X_train, X_test, y

def preprocess_2(ftrain, ftest, max_features=20000, maxlen=100):
    """Tokenize and pad comments, duplicating the ``threat`` rows up front.

    Rows whose ``threat`` label is 1 are repeated once at the beginning of
    both the vocabulary-fitting corpus and the training set. The tokenizer
    is fitted on the threat rows followed by every row that has at least one
    positive label; the training set is the threat rows followed by the full
    training frame.

    Args:
        ftrain: path of the training CSV; it must provide ``comment_text``
            and the six label columns.
        ftest: path of the test CSV; it must provide ``comment_text``.
        max_features: forwarded to the tokenizer as ``num_words``.
        maxlen: forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        A tuple ``(X_train, X_test, y)``: padded training sequences
        (including the duplicated threat rows), padded test sequences, and
        the label values aligned row-for-row with ``X_train``.
    """
    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    train_orig = pd.read_csv(ftrain)
    test_orig = pd.read_csv(ftest)

    # Rows with at least one positive label.
    has_any_label = (train_orig[list_classes] == 1).any(axis=1)
    train_effective_samples = train_orig[has_any_label]

    # Repeat the threat data at the beginning.
    # DataFrame.append was removed in pandas 2.0; pd.concat yields the same
    # row order and keeps the original index labels.
    # NOTE(review): the duplicates sit at the front, so a validation split
    # taken from the tail of the returned arrays may hold rows whose copies
    # are in the training part -- confirm this is acceptable.
    threat_samples = train_orig[train_orig.threat == 1]
    word_fit = pd.concat([threat_samples, train_effective_samples])
    train_combined = pd.concat([threat_samples, train_orig])

    # Missing comments are replaced by a single blank so tokenizing never sees NaN.
    list_sentences_fit = word_fit["comment_text"].fillna(" ").values
    list_sentences_train = train_combined["comment_text"].fillna(" ").values
    list_sentences_test = test_orig["comment_text"].fillna(" ").values

    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(list_sentences_fit))
    list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

    X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
    y = train_combined[list_classes].values
    return X_train, X_test, y

In [3]:
# Input locations.
train_file = "./input/train.csv"
test_file  = "./input/test.csv"

# Vocabulary size and padded sequence length; get_model() reads both as well.
max_features = 20000
maxlen = 100

X_train, X_test, y = preprocess_2(
    train_file,
    test_file,
    max_features=max_features,
    maxlen=maxlen,
)

In [7]:
# define the Keras model graph
def get_model(embed_size=600, gru_units=50, dropout_rate=0.3, n_classes=6):
    """Build and compile the bidirectional-GRU multi-label classifier.

    The input length and vocabulary size are read from the module-level
    ``maxlen`` and ``max_features`` variables. Calling the function with no
    arguments builds the same network as before.

    Args:
        embed_size: output dimension of the embedding layer.
        gru_units: number of units in each direction of the GRU.
        dropout_rate: dropout rate applied after the global max pooling.
        n_classes: number of sigmoid outputs (one per label).

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size)(inp)
    # return_sequences=True keeps the per-timestep outputs that the pooling
    # layer then reduces over.
    x = Bidirectional(GRU(gru_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    # Independent sigmoid per label: a sample may carry several labels at once.
    x = Dense(n_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [8]:
# Settings for this training run.
batch_size = 32
epochs = 3
file_path="weights_GRU_021718.best.hdf5"

model = get_model()

# Write the weights to file_path whenever the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# NOTE(review): patience (5) is larger than epochs (3), so early stopping
# presumably cannot trigger with these settings -- confirm the intended values.
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks_list = [checkpoint, early]

# Uncomment to start from previously saved weights.
#model.load_weights(file_path)
history = model.fit(
    X_train,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)



Train on 144044 samples, validate on 16005 samples
Epoch 1/3
Epoch 00001: val_loss improved from inf to 0.04678, saving model to weights_GRU_021718.best.hdf5
Epoch 2/3
Epoch 00002: val_loss improved from 0.04678 to 0.04397, saving model to weights_GRU_021718.best.hdf5
Epoch 3/3
Epoch 00003: val_loss did not improve


In [None]:
# Show which metric names were recorded in the object returned by model.fit.
print(history.history.keys())

In [None]:
# Plot the recorded validation loss, one marker per entry; `plot` comes from
# the namespace populated by `%pylab inline`.
plot(history.history['val_loss'],'o')

# Optional: persist the training history to disk for later inspection.
#with open('./trainHistory_021418', 'wb') as file:
#    pickle.dump(history.history, file)

In [8]:
# Rebuild the network, restore saved weights and score the test set.
# NOTE(review): this checkpoint name differs from the one the training cell
# writes ("weights_GRU_021718.best.hdf5") -- confirm which weights are intended.
file_path="weights_submission_021618.best.hdf5"

model = get_model()
model.load_weights(file_path)

# Despite the name, y_test holds the model's predictions for X_test.
y_test = model.predict(X_test)


In [10]:
# Fill the sample submission's label columns with the predictions and save it.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

sample_submission = pd.read_csv("./input/sample_submission.csv")
# Predictions are assigned by position, so the sample file's row order is
# assumed to match the test file -- TODO confirm.
sample_submission[list_classes] = y_test
sample_submission.to_csv("sub_021618.csv", index=False)

In [None]:
# Sanity check: binary cross-entropy of the restored model on a few rows
# taken from the end of the training data.
model = get_model()
file_path="weights_submission_021618.best.hdf5"
model.load_weights(file_path)

# NOTE(review): [-10:-1] selects nine rows and skips the final one; use
# slice(-10, None) if the last ten rows were intended.
check_rows = slice(-10, -1)
y_true = y[check_rows]
y_pred = model.predict(X_train[check_rows])

# The Keras loss functions operate on backend tensors, so the numpy arrays
# are wrapped with K.constant first; with no dtype given it uses the backend
# float type, which also converts the integer labels.
score = K.eval(losses.binary_crossentropy(K.constant(y_true), K.constant(y_pred)))
print(score)